X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/729e4ab9bc6618bc3d8a898e575df7f4019e29ca..340931cb2e044a2141d11567dd0f782524e32994:/icuSources/i18n/repattrn.cpp diff --git a/icuSources/i18n/repattrn.cpp b/icuSources/i18n/repattrn.cpp index 4369eb83..bf186959 100644 --- a/icuSources/i18n/repattrn.cpp +++ b/icuSources/i18n/repattrn.cpp @@ -1,10 +1,12 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html // // file: repattrn.cpp // /* *************************************************************************** -* Copyright (C) 2002-2010 International Business Machines Corporation * -* and others. All rights reserved. * +* Copyright (C) 2002-2016 International Business Machines Corporation +* and others. All rights reserved. *************************************************************************** */ @@ -14,7 +16,10 @@ #include "unicode/regex.h" #include "unicode/uclean.h" +#include "cmemory.h" +#include "cstr.h" #include "uassert.h" +#include "uhash.h" #include "uvector.h" #include "uvectr32.h" #include "uvectr64.h" @@ -30,9 +35,6 @@ U_NAMESPACE_BEGIN // //-------------------------------------------------------------------------- RegexPattern::RegexPattern() { - UErrorCode status = U_ZERO_ERROR; - u_init(&status); - // Init all of this instances data. init(); } @@ -69,25 +71,32 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) { init(); // Copy simple fields - if ( other.fPatternString == NULL ) { + fDeferredStatus = other.fDeferredStatus; + + if (U_FAILURE(fDeferredStatus)) { + return *this; + } + + if (other.fPatternString == NULL) { fPatternString = NULL; - fPattern = utext_clone(fPattern, other.fPattern, FALSE, TRUE, &fDeferredStatus); + fPattern = utext_clone(fPattern, other.fPattern, FALSE, TRUE, &fDeferredStatus); } else { fPatternString = new UnicodeString(*(other.fPatternString)); - UErrorCode status = U_ZERO_ERROR; - fPattern = utext_openConstUnicodeString(NULL, fPatternString, &status); - if (U_FAILURE(status)) { + if (fPatternString == NULL) { fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; - return *this; + } else { + fPattern = utext_openConstUnicodeString(NULL, fPatternString, &fDeferredStatus); } } + if (U_FAILURE(fDeferredStatus)) { + return *this; + } + fFlags = other.fFlags; fLiteralText = other.fLiteralText; - fDeferredStatus = other.fDeferredStatus; fMinMatchLen = other.fMinMatchLen; fFrameSize = other.fFrameSize; fDataSize = other.fDataSize; - fMaxCaptureDigits = other.fMaxCaptureDigits; fStaticSets = other.fStaticSets; fStaticSets8 = other.fStaticSets8; @@ -128,6 +137,23 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) { fSets8[i] = other.fSets8[i]; } + // Copy the named capture group hash map. + if (other.fNamedCaptureMap != nullptr && initNamedCaptureMap()) { + int32_t hashPos = UHASH_FIRST; + while (const UHashElement *hashEl = uhash_nextElement(other.fNamedCaptureMap, &hashPos)) { + if (U_FAILURE(fDeferredStatus)) { + break; + } + const UnicodeString *name = (const UnicodeString *)hashEl->key.pointer; + UnicodeString *key = new UnicodeString(*name); + int32_t val = hashEl->value.integer; + if (key == NULL) { + fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; + } else { + uhash_puti(fNamedCaptureMap, key, val, &fDeferredStatus); + } + } + } return *this; } @@ -149,7 +175,6 @@ void RegexPattern::init() { fFrameSize = 0; fDataSize = 0; fGroupMap = NULL; - fMaxCaptureDigits = 1; fStaticSets = NULL; fStaticSets8 = NULL; fStartType = START_NO_INFO; @@ -159,6 +184,7 @@ void RegexPattern::init() { fInitialChar = 0; fInitialChars8 = NULL; fNeedsAltInput = FALSE; + fNamedCaptureMap = NULL; fPattern = NULL; // will be set later fPatternString = NULL; // may be set later @@ -171,7 +197,7 @@ void RegexPattern::init() { return; } if (fCompiledPat == NULL || fGroupMap == NULL || fSets == NULL || - fInitialChars == NULL || fInitialChars8 == NULL) { + fInitialChars == NULL || fInitialChars8 == NULL) { fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; return; } @@ -181,6 +207,24 @@ void RegexPattern::init() { } +bool RegexPattern::initNamedCaptureMap() { + if (fNamedCaptureMap) { + return true; + } + fNamedCaptureMap = uhash_openSize(uhash_hashUnicodeString, // Key hash function + uhash_compareUnicodeString, // Key comparator function + uhash_compareLong, // Value comparator function + 7, // Initial table capacity + &fDeferredStatus); + if (U_FAILURE(fDeferredStatus)) { + return false; + } + + // fNamedCaptureMap owns its key strings, type (UnicodeString *) + uhash_setKeyDeleter(fNamedCaptureMap, uprv_deleteUObject); + return true; +} + //-------------------------------------------------------------------------- // // zap Delete everything owned by this RegexPattern. @@ -215,6 +259,10 @@ void RegexPattern::zap() { delete fPatternString; fPatternString = NULL; } + if (fNamedCaptureMap != NULL) { + uhash_close(fNamedCaptureMap); + fNamedCaptureMap = NULL; + } } @@ -278,21 +326,21 @@ RegexPattern::compile(const UnicodeString ®ex, if (U_FAILURE(status)) { return NULL; } - + const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD | UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES | UREGEX_LITERAL; - + if ((flags & ~allFlags) != 0) { status = U_REGEX_INVALID_FLAG; return NULL; } - - if ((flags & (UREGEX_CANON_EQ | UREGEX_LITERAL)) != 0) { + + if ((flags & UREGEX_CANON_EQ) != 0) { status = U_REGEX_UNIMPLEMENTED; return NULL; } - + RegexPattern *This = new RegexPattern; if (This == NULL) { status = U_MEMORY_ALLOCATION_ERROR; @@ -304,15 +352,15 @@ RegexPattern::compile(const UnicodeString ®ex, return NULL; } This->fFlags = flags; - + RegexCompile compiler(This, status); compiler.compile(regex, pe, status); - + if (U_FAILURE(status)) { delete This; This = NULL; } - + return This; } @@ -339,7 +387,7 @@ RegexPattern::compile(UText *regex, return NULL; } - if ((flags & (UREGEX_CANON_EQ | UREGEX_LITERAL)) != 0) { + if ((flags & UREGEX_CANON_EQ) != 0) { status = U_REGEX_UNIMPLEMENTED; return NULL; } @@ -358,7 +406,7 @@ RegexPattern::compile(UText *regex, RegexCompile compiler(This, status); compiler.compile(regex, pe, status); - + if (U_FAILURE(status)) { delete This; This = NULL; @@ -442,31 +490,6 @@ RegexMatcher *RegexPattern::matcher(const UnicodeString &input, return retMatcher; } -// -// matcher, UText mode -// -RegexMatcher *RegexPattern::matcher(UText *input, - PatternIsUTextFlag /*flag*/, - UErrorCode &status) const { - RegexMatcher *retMatcher = matcher(status); - if (retMatcher != NULL) { - retMatcher->fDeferredStatus = status; - retMatcher->reset(input); - } - return retMatcher; -} - -#if 0 -RegexMatcher *RegexPattern::matcher(const UChar * /*input*/, - UErrorCode &status) const -{ - /* This should never get called. The API with UnicodeString should be called instead. */ - if (U_SUCCESS(status)) { - status = U_UNSUPPORTED_ERROR; - } - return NULL; -} -#endif //--------------------------------------------------------------------- // @@ -531,13 +554,16 @@ UBool U_EXPORT2 RegexPattern::matches(UText *regex, if (U_FAILURE(status)) {return FALSE;} - UBool retVal; + UBool retVal = FALSE; RegexPattern *pat = NULL; RegexMatcher *matcher = NULL; pat = RegexPattern::compile(regex, 0, pe, status); - matcher = pat->matcher(input, PATTERN_IS_UTEXT, status); - retVal = matcher->matches(status); + matcher = pat->matcher(status); + if (U_SUCCESS(status)) { + matcher->reset(input); + retVal = matcher->matches(status); + } delete matcher; delete pat; @@ -563,12 +589,12 @@ UnicodeString RegexPattern::pattern() const { int64_t nativeLen = utext_nativeLength(fPattern); int32_t len16 = utext_extract(fPattern, 0, nativeLen, NULL, 0, &status); // buffer overflow error UnicodeString result; - + status = U_ZERO_ERROR; UChar *resultChars = result.getBuffer(len16); utext_extract(fPattern, 0, nativeLen, resultChars, len16, &status); // unterminated warning result.releaseBuffer(len16); - + return result; } } @@ -594,6 +620,34 @@ UText *RegexPattern::patternText(UErrorCode &status) const { } +//-------------------------------------------------------------------------------- +// +// groupNumberFromName() +// +//-------------------------------------------------------------------------------- +int32_t RegexPattern::groupNumberFromName(const UnicodeString &groupName, UErrorCode &status) const { + if (U_FAILURE(status)) { + return 0; + } + + // No need to explicitly check for syntactically valid names. + // Invalid ones will never be in the map, and the lookup will fail. + + int32_t number = fNamedCaptureMap ? uhash_geti(fNamedCaptureMap, &groupName) : 0; + if (number == 0) { + status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; + } + return number; +} + +int32_t RegexPattern::groupNumberFromName(const char *groupName, int32_t nameLength, UErrorCode &status) const { + if (U_FAILURE(status)) { + return 0; + } + UnicodeString name(groupName, nameLength, US_INV); + return groupNumberFromName(name, status); +} + //--------------------------------------------------------------------- // @@ -607,7 +661,7 @@ int32_t RegexPattern::split(const UnicodeString &input, { if (U_FAILURE(status)) { return 0; - }; + } RegexMatcher m(this); int32_t r = 0; @@ -628,7 +682,7 @@ int32_t RegexPattern::split(UText *input, { if (U_FAILURE(status)) { return 0; - }; + } RegexMatcher m(this); int32_t r = 0; @@ -640,25 +694,25 @@ int32_t RegexPattern::split(UText *input, } - //--------------------------------------------------------------------- // // dump Output the compiled form of the pattern. // Debugging function only. // //--------------------------------------------------------------------- -#if defined(REGEX_DEBUG) void RegexPattern::dumpOp(int32_t index) const { + (void)index; // Suppress warnings in non-debug build. +#if defined(REGEX_DEBUG) static const char * const opNames[] = {URX_OPCODE_NAMES}; int32_t op = fCompiledPat->elementAti(index); int32_t val = URX_VAL(op); int32_t type = URX_TYPE(op); int32_t pinnedType = type; - if ((uint32_t)pinnedType >= sizeof(opNames)/sizeof(char *)) { + if ((uint32_t)pinnedType >= UPRV_LENGTHOF(opNames)) { pinnedType = 0; } - REGEX_DUMP_DEBUG_PRINTF(("%4d %08x %-15s ", index, op, opNames[pinnedType])); + printf("%4d %08x %-15s ", index, op, opNames[pinnedType]); switch (type) { case URX_NOP: case URX_DOTANY: @@ -706,13 +760,20 @@ void RegexPattern::dumpOp(int32_t index) const { case URX_LBN_END: case URX_LOOP_C: case URX_LOOP_DOT_I: + case URX_BACKSLASH_H: + case URX_BACKSLASH_R: + case URX_BACKSLASH_V: // types with an integer operand field. - REGEX_DUMP_DEBUG_PRINTF(("%d", val)); + printf("%d", val); break; case URX_ONECHAR: case URX_ONECHAR_I: - REGEX_DUMP_DEBUG_PRINTF(("%c", val<256?val:'?')); + if (val < 0x20) { + printf("%#x", val); + } else { + printf("'%s'", CStr(UnicodeString(val))()); + } break; case URX_STRING: @@ -721,12 +782,8 @@ void RegexPattern::dumpOp(int32_t index) const { int32_t lengthOp = fCompiledPat->elementAti(index+1); U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN); int32_t length = URX_VAL(lengthOp); - int32_t i; - for (i=val; i= 256) {c = '.';} - REGEX_DUMP_DEBUG_PRINTF(("%c", c)); - } + UnicodeString str(fLiteralText, val, length); + printf("%s", CStr(str)()); } break; @@ -736,9 +793,7 @@ void RegexPattern::dumpOp(int32_t index) const { UnicodeString s; UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val); set->toPattern(s, TRUE); - for (int32_t i=0; itoPattern(s, TRUE); - for (int32_t i=0; ifPattern, 0); - while (c != U_SENTINEL) { - if (c<32 || c>256) { - c = '.'; - } - REGEX_DUMP_DEBUG_PRINTF(("%c", c)); - - c = UTEXT_NEXT32(This->fPattern); - } - REGEX_DUMP_DEBUG_PRINTF(("\n")); - REGEX_DUMP_DEBUG_PRINTF((" Min Match Length: %d\n", This->fMinMatchLen)); - REGEX_DUMP_DEBUG_PRINTF((" Match Start Type: %s\n", START_OF_MATCH_STR(This->fStartType))); - if (This->fStartType == START_STRING) { - REGEX_DUMP_DEBUG_PRINTF((" Initial match string: \"")); - for (i=This->fInitialStringIdx; ifInitialStringIdx+This->fInitialStringLen; i++) { - REGEX_DUMP_DEBUG_PRINTF(("%c", This->fLiteralText[i])); // TODO: non-printables, surrogates. - } - REGEX_DUMP_DEBUG_PRINTF(("\"\n")); - - } else if (This->fStartType == START_SET) { - int32_t numSetChars = This->fInitialChars->size(); - if (numSetChars > 20) { - numSetChars = 20; - } - REGEX_DUMP_DEBUG_PRINTF((" Match First Chars : ")); - for (i=0; ifInitialChars->charAt(i); - if (0x20toPattern(s, TRUE); + printf(" Match First Chars: %s\n", CStr(s)()); + + } else if (fStartType == START_CHAR) { + printf(" First char of Match: "); + if (fInitialChar > 0x20) { + printf("'%s'\n", CStr(UnicodeString(fInitialChar))()); } else { - REGEX_DUMP_DEBUG_PRINTF(("%#x ", c)); + printf("%#x\n", fInitialChar); } - } - if (numSetChars < This->fInitialChars->size()) { - REGEX_DUMP_DEBUG_PRINTF((" ...")); - } - REGEX_DUMP_DEBUG_PRINTF(("\n")); + } - } else if (This->fStartType == START_CHAR) { - REGEX_DUMP_DEBUG_PRINTF((" First char of Match : ")); - if (0x20 < This->fInitialChar && This->fInitialChar<0x7e) { - REGEX_DUMP_DEBUG_PRINTF(("%c\n", This->fInitialChar)); - } else { - REGEX_DUMP_DEBUG_PRINTF(("%#x\n", This->fInitialChar)); - } + printf("Named Capture Groups:\n"); + if (!fNamedCaptureMap || uhash_count(fNamedCaptureMap) == 0) { + printf(" None\n"); + } else { + int32_t pos = UHASH_FIRST; + const UHashElement *el = NULL; + while ((el = uhash_nextElement(fNamedCaptureMap, &pos))) { + const UnicodeString *name = (const UnicodeString *)el->key.pointer; + int32_t number = el->value.integer; + printf(" %d\t%s\n", number, CStr(*name)()); + } } - REGEX_DUMP_DEBUG_PRINTF(("\nIndex Binary Type Operand\n" \ - "-------------------------------------------\n")); - for (index = 0; indexfCompiledPat->size(); index++) { - This->dumpOp(index); + printf("\nIndex Binary Type Operand\n" \ + "-------------------------------------------\n"); + for (index = 0; indexsize(); index++) { + dumpOp(index); } - REGEX_DUMP_DEBUG_PRINTF(("\n\n")); -} + printf("\n\n"); #endif +}