X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/4388f060552cc537e71e957d32f35e9d75a61233..f3c0d7a59d99c2a94c6b8822291f0e42be3773c9:/icuSources/i18n/repattrn.cpp diff --git a/icuSources/i18n/repattrn.cpp b/icuSources/i18n/repattrn.cpp index 1454a093..b8aee1a0 100644 --- a/icuSources/i18n/repattrn.cpp +++ b/icuSources/i18n/repattrn.cpp @@ -1,10 +1,12 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html // // file: repattrn.cpp // /* *************************************************************************** -* Copyright (C) 2002-2012 International Business Machines Corporation * -* and others. All rights reserved. * +* Copyright (C) 2002-2016 International Business Machines Corporation +* and others. All rights reserved. *************************************************************************** */ @@ -14,7 +16,10 @@ #include "unicode/regex.h" #include "unicode/uclean.h" +#include "cmemory.h" +#include "cstr.h" #include "uassert.h" +#include "uhash.h" #include "uvector.h" #include "uvectr32.h" #include "uvectr64.h" @@ -66,25 +71,32 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) { init(); // Copy simple fields - if ( other.fPatternString == NULL ) { + fDeferredStatus = other.fDeferredStatus; + + if (U_FAILURE(fDeferredStatus)) { + return *this; + } + + if (other.fPatternString == NULL) { fPatternString = NULL; - fPattern = utext_clone(fPattern, other.fPattern, FALSE, TRUE, &fDeferredStatus); + fPattern = utext_clone(fPattern, other.fPattern, FALSE, TRUE, &fDeferredStatus); } else { fPatternString = new UnicodeString(*(other.fPatternString)); - UErrorCode status = U_ZERO_ERROR; - fPattern = utext_openConstUnicodeString(NULL, fPatternString, &status); - if (U_FAILURE(status)) { + if (fPatternString == NULL) { fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; - return *this; + } else { + fPattern = utext_openConstUnicodeString(NULL, fPatternString, &fDeferredStatus); } } + if (U_FAILURE(fDeferredStatus)) { + return *this; + } + fFlags = other.fFlags; fLiteralText = other.fLiteralText; - fDeferredStatus = other.fDeferredStatus; fMinMatchLen = other.fMinMatchLen; fFrameSize = other.fFrameSize; fDataSize = other.fDataSize; - fMaxCaptureDigits = other.fMaxCaptureDigits; fStaticSets = other.fStaticSets; fStaticSets8 = other.fStaticSets8; @@ -125,6 +137,21 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) { fSets8[i] = other.fSets8[i]; } + // Copy the named capture group hash map. + int32_t hashPos = UHASH_FIRST; + while (const UHashElement *hashEl = uhash_nextElement(other.fNamedCaptureMap, &hashPos)) { + if (U_FAILURE(fDeferredStatus)) { + break; + } + const UnicodeString *name = (const UnicodeString *)hashEl->key.pointer; + UnicodeString *key = new UnicodeString(*name); + int32_t val = hashEl->value.integer; + if (key == NULL) { + fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; + } else { + uhash_puti(fNamedCaptureMap, key, val, &fDeferredStatus); + } + } return *this; } @@ -146,7 +173,6 @@ void RegexPattern::init() { fFrameSize = 0; fDataSize = 0; fGroupMap = NULL; - fMaxCaptureDigits = 1; fStaticSets = NULL; fStaticSets8 = NULL; fStartType = START_NO_INFO; @@ -156,6 +182,7 @@ void RegexPattern::init() { fInitialChar = 0; fInitialChars8 = NULL; fNeedsAltInput = FALSE; + fNamedCaptureMap = NULL; fPattern = NULL; // will be set later fPatternString = NULL; // may be set later @@ -164,17 +191,24 @@ void RegexPattern::init() { fSets = new UVector(fDeferredStatus); fInitialChars = new UnicodeSet; fInitialChars8 = new Regex8BitSet; + fNamedCaptureMap = uhash_open(uhash_hashUnicodeString, // Key hash function + uhash_compareUnicodeString, // Key comparator function + uhash_compareLong, // Value comparator function + &fDeferredStatus); if (U_FAILURE(fDeferredStatus)) { return; } if (fCompiledPat == NULL || fGroupMap == NULL || fSets == NULL || - fInitialChars == NULL || fInitialChars8 == NULL) { + fInitialChars == NULL || fInitialChars8 == NULL || fNamedCaptureMap == NULL) { fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; return; } // Slot zero of the vector of sets is reserved. Fill it here. fSets->addElement((int32_t)0, fDeferredStatus); + + // fNamedCaptureMap owns its key strings, type (UnicodeString *) + uhash_setKeyDeleter(fNamedCaptureMap, uprv_deleteUObject); } @@ -212,6 +246,8 @@ void RegexPattern::zap() { delete fPatternString; fPatternString = NULL; } + uhash_close(fNamedCaptureMap); + fNamedCaptureMap = NULL; } @@ -275,21 +311,21 @@ RegexPattern::compile(const UnicodeString ®ex, if (U_FAILURE(status)) { return NULL; } - + const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD | UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES | UREGEX_LITERAL; - + if ((flags & ~allFlags) != 0) { status = U_REGEX_INVALID_FLAG; return NULL; } - + if ((flags & UREGEX_CANON_EQ) != 0) { status = U_REGEX_UNIMPLEMENTED; return NULL; } - + RegexPattern *This = new RegexPattern; if (This == NULL) { status = U_MEMORY_ALLOCATION_ERROR; @@ -301,15 +337,15 @@ RegexPattern::compile(const UnicodeString ®ex, return NULL; } This->fFlags = flags; - + RegexCompile compiler(This, status); compiler.compile(regex, pe, status); - + if (U_FAILURE(status)) { delete This; This = NULL; } - + return This; } @@ -355,7 +391,7 @@ RegexPattern::compile(UText *regex, RegexCompile compiler(This, status); compiler.compile(regex, pe, status); - + if (U_FAILURE(status)) { delete This; This = NULL; @@ -538,12 +574,12 @@ UnicodeString RegexPattern::pattern() const { int64_t nativeLen = utext_nativeLength(fPattern); int32_t len16 = utext_extract(fPattern, 0, nativeLen, NULL, 0, &status); // buffer overflow error UnicodeString result; - + status = U_ZERO_ERROR; UChar *resultChars = result.getBuffer(len16); utext_extract(fPattern, 0, nativeLen, resultChars, len16, &status); // unterminated warning result.releaseBuffer(len16); - + return result; } } @@ -569,6 +605,34 @@ UText *RegexPattern::patternText(UErrorCode &status) const { } +//-------------------------------------------------------------------------------- +// +// groupNumberFromName() +// +//-------------------------------------------------------------------------------- +int32_t RegexPattern::groupNumberFromName(const UnicodeString &groupName, UErrorCode &status) const { + if (U_FAILURE(status)) { + return 0; + } + + // No need to explicitly check for syntactically valid names. + // Invalid ones will never be in the map, and the lookup will fail. + + int32_t number = uhash_geti(fNamedCaptureMap, &groupName); + if (number == 0) { + status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; + } + return number; +} + +int32_t RegexPattern::groupNumberFromName(const char *groupName, int32_t nameLength, UErrorCode &status) const { + if (U_FAILURE(status)) { + return 0; + } + UnicodeString name(groupName, nameLength, US_INV); + return groupNumberFromName(name, status); +} + //--------------------------------------------------------------------- // @@ -615,25 +679,25 @@ int32_t RegexPattern::split(UText *input, } - //--------------------------------------------------------------------- // // dump Output the compiled form of the pattern. // Debugging function only. // //--------------------------------------------------------------------- -#if defined(REGEX_DEBUG) void RegexPattern::dumpOp(int32_t index) const { + (void)index; // Suppress warnings in non-debug build. +#if defined(REGEX_DEBUG) static const char * const opNames[] = {URX_OPCODE_NAMES}; int32_t op = fCompiledPat->elementAti(index); int32_t val = URX_VAL(op); int32_t type = URX_TYPE(op); int32_t pinnedType = type; - if ((uint32_t)pinnedType >= sizeof(opNames)/sizeof(char *)) { + if ((uint32_t)pinnedType >= UPRV_LENGTHOF(opNames)) { pinnedType = 0; } - REGEX_DUMP_DEBUG_PRINTF(("%4d %08x %-15s ", index, op, opNames[pinnedType])); + printf("%4d %08x %-15s ", index, op, opNames[pinnedType]); switch (type) { case URX_NOP: case URX_DOTANY: @@ -681,13 +745,20 @@ void RegexPattern::dumpOp(int32_t index) const { case URX_LBN_END: case URX_LOOP_C: case URX_LOOP_DOT_I: + case URX_BACKSLASH_H: + case URX_BACKSLASH_R: + case URX_BACKSLASH_V: // types with an integer operand field. - REGEX_DUMP_DEBUG_PRINTF(("%d", val)); + printf("%d", val); break; case URX_ONECHAR: case URX_ONECHAR_I: - REGEX_DUMP_DEBUG_PRINTF(("%c", val<256?val:'?')); + if (val < 0x20) { + printf("%#x", val); + } else { + printf("'%s'", CStr(UnicodeString(val))()); + } break; case URX_STRING: @@ -696,12 +767,8 @@ void RegexPattern::dumpOp(int32_t index) const { int32_t lengthOp = fCompiledPat->elementAti(index+1); U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN); int32_t length = URX_VAL(lengthOp); - int32_t i; - for (i=val; i= 256) {c = '.';} - REGEX_DUMP_DEBUG_PRINTF(("%c", c)); - } + UnicodeString str(fLiteralText, val, length); + printf("%s", CStr(str)()); } break; @@ -711,9 +778,7 @@ void RegexPattern::dumpOp(int32_t index) const { UnicodeString s; UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val); set->toPattern(s, TRUE); - for (int32_t i=0; itoPattern(s, TRUE); - for (int32_t i=0; ifPattern, 0); - while (c != U_SENTINEL) { - if (c<32 || c>256) { - c = '.'; - } - REGEX_DUMP_DEBUG_PRINTF(("%c", c)); - - c = UTEXT_NEXT32(This->fPattern); - } - REGEX_DUMP_DEBUG_PRINTF(("\n")); - REGEX_DUMP_DEBUG_PRINTF((" Min Match Length: %d\n", This->fMinMatchLen)); - REGEX_DUMP_DEBUG_PRINTF((" Match Start Type: %s\n", START_OF_MATCH_STR(This->fStartType))); - if (This->fStartType == START_STRING) { - REGEX_DUMP_DEBUG_PRINTF((" Initial match string: \"")); - for (i=This->fInitialStringIdx; ifInitialStringIdx+This->fInitialStringLen; i++) { - REGEX_DUMP_DEBUG_PRINTF(("%c", This->fLiteralText[i])); // TODO: non-printables, surrogates. - } - REGEX_DUMP_DEBUG_PRINTF(("\"\n")); - } else if (This->fStartType == START_SET) { - int32_t numSetChars = This->fInitialChars->size(); - if (numSetChars > 20) { - numSetChars = 20; - } - REGEX_DUMP_DEBUG_PRINTF((" Match First Chars : ")); - for (i=0; ifInitialChars->charAt(i); - if (0x20toPattern(s, TRUE); + printf(" Match First Chars: %s\n", CStr(s)()); + + } else if (fStartType == START_CHAR) { + printf(" First char of Match: "); + if (fInitialChar > 0x20) { + printf("'%s'\n", CStr(UnicodeString(fInitialChar))()); } else { - REGEX_DUMP_DEBUG_PRINTF(("%#x ", c)); + printf("%#x\n", fInitialChar); } - } - if (numSetChars < This->fInitialChars->size()) { - REGEX_DUMP_DEBUG_PRINTF((" ...")); - } - REGEX_DUMP_DEBUG_PRINTF(("\n")); + } - } else if (This->fStartType == START_CHAR) { - REGEX_DUMP_DEBUG_PRINTF((" First char of Match : ")); - if (0x20 < This->fInitialChar && This->fInitialChar<0x7e) { - REGEX_DUMP_DEBUG_PRINTF(("%c\n", This->fInitialChar)); - } else { - REGEX_DUMP_DEBUG_PRINTF(("%#x\n", This->fInitialChar)); - } + printf("Named Capture Groups:\n"); + if (uhash_count(fNamedCaptureMap) == 0) { + printf(" None\n"); + } else { + int32_t pos = UHASH_FIRST; + const UHashElement *el = NULL; + while ((el = uhash_nextElement(fNamedCaptureMap, &pos))) { + const UnicodeString *name = (const UnicodeString *)el->key.pointer; + int32_t number = el->value.integer; + printf(" %d\t%s\n", number, CStr(*name)()); + } } - REGEX_DUMP_DEBUG_PRINTF(("\nIndex Binary Type Operand\n" \ - "-------------------------------------------\n")); - for (index = 0; indexfCompiledPat->size(); index++) { - This->dumpOp(index); + printf("\nIndex Binary Type Operand\n" \ + "-------------------------------------------\n"); + for (index = 0; indexsize(); index++) { + dumpOp(index); } - REGEX_DUMP_DEBUG_PRINTF(("\n\n")); -} + printf("\n\n"); #endif +}