X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/48b980fed3435926e0b3a8d72ecb58be703a1c7a..729e4ab9bc6618bc3d8a898e575df7f4019e29ca:/icuSources/i18n/rematch.cpp diff --git a/icuSources/i18n/rematch.cpp b/icuSources/i18n/rematch.cpp index 52190e26..dda8a1ec 100644 --- a/icuSources/i18n/rematch.cpp +++ b/icuSources/i18n/rematch.cpp @@ -1,6 +1,6 @@ /* ************************************************************************** -* Copyright (C) 2002-2008 International Business Machines Corporation * +* Copyright (C) 2002-2010 International Business Machines Corporation * * and others. All rights reserved. * ************************************************************************** */ @@ -23,11 +23,36 @@ #include "cmemory.h" #include "uvector.h" #include "uvectr32.h" +#include "uvectr64.h" #include "regeximp.h" #include "regexst.h" +#include "regextxt.h" +#include "ucase.h" // #include // Needed for heapcheck testing + +// Find progress callback +// ---------------------- +// Macro to inline test & call to ReportFindProgress(). Eliminates unnecessary function call. +// +#define REGEXFINDPROGRESS_INTERRUPT(pos, status) \ + (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE) + + +// Smart Backtracking +// ------------------ +// When a failure would go back to a LOOP_C instruction, +// strings, characters, and setrefs scan backwards for a valid start +// character themselves, pop the stack, and save state, emulating the +// LOOP_C's effect but assured that the next character of input is a +// possible matching character. +// +// Good idea in theory; unfortunately it only helps out a few specific +// cases and slows the engine down a little in the rest. + +//#define REGEX_SMART_BACKTRACKING 1 + U_NAMESPACE_BEGIN // Default limit for the size of the back track stack, to avoid system @@ -60,7 +85,7 @@ RegexMatcher::RegexMatcher(const RegexPattern *pat) { return; } fPattern = pat; - init2(RegexStaticSets::gStaticSets->fEmptyString, fDeferredStatus); + init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus); } @@ -73,6 +98,29 @@ RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &inp } UParseError pe; fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); + fPattern = fPatternOwned; + + UText inputText = UTEXT_INITIALIZER; + utext_openConstUnicodeString(&inputText, &input, &status); + init2(&inputText, status); + utext_close(&inputText); + + fInputUniStrMaybeMutable = TRUE; +} + + +RegexMatcher::RegexMatcher(UText *regexp, UText *input, + uint32_t flags, UErrorCode &status) { + init(status); + if (U_FAILURE(status)) { + return; + } + UParseError pe; + fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); + if (U_FAILURE(status)) { + return; + } + fPattern = fPatternOwned; init2(input, status); } @@ -86,8 +134,27 @@ RegexMatcher::RegexMatcher(const UnicodeString ®exp, } UParseError pe; fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); + if (U_FAILURE(status)) { + return; + } + fPattern = fPatternOwned; + init2(RegexStaticSets::gStaticSets->fEmptyText, status); +} + +RegexMatcher::RegexMatcher(UText *regexp, + uint32_t flags, UErrorCode &status) { + init(status); + if (U_FAILURE(status)) { + return; + } + UParseError pe; + fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); + if (U_FAILURE(status)) { + return; + } + fPattern = fPatternOwned; - init2(RegexStaticSets::gStaticSets->fEmptyString, status); + init2(RegexStaticSets::gStaticSets->fEmptyText, status); } @@ -104,6 +171,17 @@ RegexMatcher::~RegexMatcher() { fPatternOwned = NULL; fPattern = NULL; } + + if (fInput) { + delete fInput; + } + if (fInputText) { + utext_close(fInputText); + } + if (fAltInputText) { + utext_close(fAltInputText); + } + #if UCONFIG_NO_BREAK_ITERATION==0 delete fWordBreakItr; #endif @@ -119,7 +197,6 @@ RegexMatcher::~RegexMatcher() { void RegexMatcher::init(UErrorCode &status) { fPattern = NULL; fPatternOwned = NULL; - fInput = NULL; fFrameSize = 0; fRegionStart = 0; fRegionLimit = 0; @@ -146,12 +223,20 @@ void RegexMatcher::init(UErrorCode &status) { fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY; fCallbackFn = NULL; fCallbackContext = NULL; + fFindProgressCallbackFn = NULL; + fFindProgressCallbackContext = NULL; fTraceDebug = FALSE; fDeferredStatus = status; fData = fSmallData; fWordBreakItr = NULL; - fStack = new UVector32(status); + fStack = new UVector64(status); + fInputText = NULL; + fAltInputText = NULL; + fInput = NULL; + fInputLength = 0; + fInputUniStrMaybeMutable = FALSE; + if (U_FAILURE(status)) { fDeferredStatus = status; } @@ -161,14 +246,14 @@ void RegexMatcher::init(UErrorCode &status) { // init2() Common initialization for use by RegexMatcher constructors, part 2. // This handles the common setup to be done after the Pattern is available. // -void RegexMatcher::init2(const UnicodeString &input, UErrorCode &status) { +void RegexMatcher::init2(UText *input, UErrorCode &status) { if (U_FAILURE(status)) { fDeferredStatus = status; return; } - if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) { - fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t)); + if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) { + fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t)); if (fData == NULL) { status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; return; @@ -194,6 +279,29 @@ static const UChar DOLLARSIGN = 0x24; RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest, const UnicodeString &replacement, UErrorCode &status) { + UText replacementText = UTEXT_INITIALIZER; + + utext_openConstUnicodeString(&replacementText, &replacement, &status); + if (U_SUCCESS(status)) { + UText resultText = UTEXT_INITIALIZER; + utext_openUnicodeString(&resultText, &dest, &status); + + if (U_SUCCESS(status)) { + appendReplacement(&resultText, &replacementText, status); + utext_close(&resultText); + } + utext_close(&replacementText); + } + + return *this; +} + +// +// appendReplacement, UText mode +// +RegexMatcher &RegexMatcher::appendReplacement(UText *dest, + UText *replacement, + UErrorCode &status) { if (U_FAILURE(status)) { return *this; } @@ -205,97 +313,151 @@ RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest, status = U_REGEX_INVALID_STATE; return *this; } - + // Copy input string from the end of previous match to start of current match - int32_t len = fMatchStart-fAppendPosition; - if (len > 0) { - dest.append(*fInput, fAppendPosition, len); + int64_t destLen = utext_nativeLength(dest); + if (fMatchStart > fAppendPosition) { + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { + destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, + (int32_t)(fMatchStart-fAppendPosition), &status); + } else { + int32_t len16; + if (UTEXT_USES_U16(fInputText)) { + len16 = (int32_t)(fMatchStart-fAppendPosition); + } else { + UErrorCode lengthStatus = U_ZERO_ERROR; + len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus); + } + UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); + if (inputChars == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return *this; + } + utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status); + destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status); + uprv_free(inputChars); + } } fAppendPosition = fMatchEnd; - + // scan the replacement text, looking for substitutions ($n) and \escapes. // TODO: optimize this loop by efficiently scanning for '$' or '\', // move entire ranges not containing substitutions. - int32_t replLen = replacement.length(); - int32_t replIdx = 0; - while (replIdx= replLen) { + c = UTEXT_CURRENT32(replacement); + if (c == U_SENTINEL) { break; } - c = replacement.charAt(replIdx); - + if (c==0x55/*U*/ || c==0x75/*u*/) { // We have a \udddd or \Udddddddd escape sequence. - UChar32 escapedChar = replacement.unescapeAt(replIdx); + int32_t offset = 0; + struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement); + UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context); if (escapedChar != (UChar32)0xFFFFFFFF) { - dest.append(escapedChar); + if (U_IS_BMP(escapedChar)) { + UChar c16 = (UChar)escapedChar; + destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); + } else { + UChar surrogate[2]; + surrogate[0] = U16_LEAD(escapedChar); + surrogate[1] = U16_TRAIL(escapedChar); + if (U_SUCCESS(status)) { + destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); + } + } // TODO: Report errors for mal-formed \u escapes? // As this is, the original sequence is output, which may be OK. - continue; + if (context.lastOffset == offset) { + UTEXT_PREVIOUS32(replacement); + } else if (context.lastOffset != offset-1) { + utext_moveIndex32(replacement, offset - context.lastOffset - 1); + } + } + } else { + UTEXT_NEXT32(replacement); + // Plain backslash escape. Just put out the escaped character. + if (U_IS_BMP(c)) { + UChar c16 = (UChar)c; + destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); + } else { + UChar surrogate[2]; + surrogate[0] = U16_LEAD(c); + surrogate[1] = U16_TRAIL(c); + if (U_SUCCESS(status)) { + destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); + } } } - - // Plain backslash escape. Just put out the escaped character. - dest.append(c); - replIdx++; - continue; - } - - if (c != DOLLARSIGN) { + } else if (c != DOLLARSIGN) { // Normal char, not a $. Copy it out without further checks. - dest.append(c); - continue; - } - - // We've got a $. Pick up a capture group number if one follows. - // Consume at most the number of digits necessary for the largest capture - // number that is valid for this pattern. - - int32_t numDigits = 0; - int32_t groupNum = 0; - UChar32 digitC; - for (;;) { - if (replIdx >= replLen) { - break; + if (U_IS_BMP(c)) { + UChar c16 = (UChar)c; + destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); + } else { + UChar surrogate[2]; + surrogate[0] = U16_LEAD(c); + surrogate[1] = U16_TRAIL(c); + if (U_SUCCESS(status)) { + destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); + } } - digitC = replacement.char32At(replIdx); - if (u_isdigit(digitC) == FALSE) { - break; + } else { + // We've got a $. Pick up a capture group number if one follows. + // Consume at most the number of digits necessary for the largest capture + // number that is valid for this pattern. + + int32_t numDigits = 0; + int32_t groupNum = 0; + UChar32 digitC; + for (;;) { + digitC = UTEXT_CURRENT32(replacement); + if (digitC == U_SENTINEL) { + break; + } + if (u_isdigit(digitC) == FALSE) { + break; + } + UTEXT_NEXT32(replacement); + groupNum=groupNum*10 + u_charDigitValue(digitC); + numDigits++; + if (numDigits >= fPattern->fMaxCaptureDigits) { + break; + } } - replIdx = replacement.moveIndex32(replIdx, 1); - groupNum=groupNum*10 + u_charDigitValue(digitC); - numDigits++; - if (numDigits >= fPattern->fMaxCaptureDigits) { - break; + + + if (numDigits == 0) { + // The $ didn't introduce a group number at all. + // Treat it as just part of the substitution text. + UChar c16 = DOLLARSIGN; + destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); + } else { + // Finally, append the capture group data to the destination. + destLen += appendGroup(groupNum, dest, status); + if (U_FAILURE(status)) { + // Can fail if group number is out of range. + break; + } } } - - - if (numDigits == 0) { - // The $ didn't introduce a group number at all. - // Treat it as just part of the substitution text. - dest.append(DOLLARSIGN); - continue; - } - - // Finally, append the capture group data to the destination. - dest.append(group(groupNum, status)); + if (U_FAILURE(status)) { - // Can fail if group number is out of range. break; + } else { + c = UTEXT_NEXT32(replacement); } - } - + return *this; } @@ -311,9 +473,63 @@ RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest, // //-------------------------------------------------------------------------------- UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) { - int32_t len = fInput->length() - fAppendPosition; - if (len > 0) { - dest.append(*fInput, fAppendPosition, len); + UErrorCode status = U_ZERO_ERROR; + UText resultText = UTEXT_INITIALIZER; + utext_openUnicodeString(&resultText, &dest, &status); + + if (U_SUCCESS(status)) { + appendTail(&resultText, status); + utext_close(&resultText); + } + + return dest; +} + +// +// appendTail, UText mode +// +UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) { + UBool bailOut = FALSE; + if (U_FAILURE(status)) { + bailOut = TRUE; + } + if (U_FAILURE(fDeferredStatus)) { + status = fDeferredStatus; + bailOut = TRUE; + } + + if (bailOut) { + // dest must not be NULL + if (dest) { + utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(dest), NULL, 0, &status); + return dest; + } + } + + if (fInputLength > fAppendPosition) { + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { + int64_t destLen = utext_nativeLength(dest); + utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, + (int32_t)(fInputLength-fAppendPosition), &status); + } else { + int32_t len16; + if (UTEXT_USES_U16(fInputText)) { + len16 = (int32_t)(fInputLength-fAppendPosition); + } else { + len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status); + status = U_ZERO_ERROR; // buffer overflow + } + + UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16)); + if (inputChars == NULL) { + fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; + } else { + utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated + int64_t destLen = utext_nativeLength(dest); + utext_replace(dest, destLen, destLen, inputChars, len16, &status); + uprv_free(inputChars); + } + } } return dest; } @@ -329,9 +545,11 @@ int32_t RegexMatcher::end(UErrorCode &err) const { return end(0, err); } +int64_t RegexMatcher::end64(UErrorCode &err) const { + return end64(0, err); +} - -int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const { +int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const { if (U_FAILURE(err)) { return -1; } @@ -343,7 +561,7 @@ int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const { err = U_INDEX_OUTOFBOUNDS_ERROR; return -1; } - int32_t e = -1; + int64_t e = -1; if (group == 0) { e = fMatchEnd; } else { @@ -354,9 +572,13 @@ int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const { U_ASSERT(groupOffset >= 0); e = fFrame->fExtra[groupOffset + 1]; } - return e; + + return e; } +int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const { + return (int32_t)end64(group, err); +} //-------------------------------------------------------------------------------- @@ -366,13 +588,17 @@ int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const { //-------------------------------------------------------------------------------- UBool RegexMatcher::find() { // Start at the position of the last match end. (Will be zero if the - // matcher has been reset. + // matcher has been reset.) // if (U_FAILURE(fDeferredStatus)) { return FALSE; } + + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { + return findUsingChunk(); + } - int32_t startPos = fMatchEnd; + int64_t startPos = fMatchEnd; if (startPos==0) { startPos = fActiveStart; } @@ -389,7 +615,9 @@ UBool RegexMatcher::find() { fHitEnd = TRUE; return FALSE; } - startPos = fInput->moveIndex32(startPos, 1); + UTEXT_SETNATIVEINDEX(fInputText, startPos); + UTEXT_NEXT32(fInputText); + startPos = UTEXT_GETNATIVEINDEX(fInputText); } } else { if (fLastMatchEnd >= 0) { @@ -406,14 +634,20 @@ UBool RegexMatcher::find() { // the minimum length match would extend past the end of the input. // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. // Be aware of possible overflows if making changes here. - int32_t testLen = fActiveLimit - fPattern->fMinMatchLen; - if (startPos > testLen) { - fMatch = FALSE; - fHitEnd = TRUE; - return FALSE; + int64_t testStartLimit; + if (UTEXT_USES_U16(fInputText)) { + testStartLimit = fActiveLimit - fPattern->fMinMatchLen; + if (startPos > testStartLimit) { + fMatch = FALSE; + fHitEnd = TRUE; + return FALSE; + } + } else { + // For now, let the matcher discover that it can't match on its own + // We don't know how long the match len is in native characters + testStartLimit = fActiveLimit; } - const UChar *inputBuf = fInput->getBuffer(); UChar32 c; U_ASSERT(startPos >= 0); @@ -429,14 +663,18 @@ UBool RegexMatcher::find() { if (fMatch) { return TRUE; } - if (startPos >= testLen) { + if (startPos >= testStartLimit) { fHitEnd = TRUE; return FALSE; } - U16_FWD_1(inputBuf, startPos, fActiveLimit); + UTEXT_SETNATIVEINDEX(fInputText, startPos); + UTEXT_NEXT32(fInputText); + startPos = UTEXT_GETNATIVEINDEX(fInputText); // Note that it's perfectly OK for a pattern to have a zero-length // match at the end of a string, so we must make sure that the loop - // runs with startPos == testLen the last time through. + // runs with startPos == testStartLimit the last time through. + if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) + return FALSE; } U_ASSERT(FALSE); @@ -458,24 +696,33 @@ UBool RegexMatcher::find() { { // Match may start on any char from a pre-computed set. U_ASSERT(fPattern->fMinMatchLen > 0); + int64_t pos; + UTEXT_SETNATIVEINDEX(fInputText, startPos); for (;;) { - int32_t pos = startPos; - U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; - if (c<256 && fPattern->fInitialChars8->contains(c) || - c>=256 && fPattern->fInitialChars->contains(c)) { - MatchAt(pos, FALSE, fDeferredStatus); + c = UTEXT_NEXT32(fInputText); + pos = UTEXT_GETNATIVEINDEX(fInputText); + // c will be -1 (U_SENTINEL) at end of text, in which case we + // skip this next block (so we don't have a negative array index) + // and handle end of text in the following block. + if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) || + (c>=256 && fPattern->fInitialChars->contains(c)))) { + MatchAt(startPos, FALSE, fDeferredStatus); if (U_FAILURE(fDeferredStatus)) { return FALSE; } if (fMatch) { return TRUE; } + UTEXT_SETNATIVEINDEX(fInputText, pos); } - if (pos >= testLen) { + if (startPos >= testStartLimit) { fMatch = FALSE; fHitEnd = TRUE; return FALSE; } + startPos = pos; + if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) + return FALSE; } } U_ASSERT(FALSE); @@ -486,24 +733,30 @@ UBool RegexMatcher::find() { // Match starts on exactly one char. U_ASSERT(fPattern->fMinMatchLen > 0); UChar32 theChar = fPattern->fInitialChar; + int64_t pos; + UTEXT_SETNATIVEINDEX(fInputText, startPos); for (;;) { - int32_t pos = startPos; - U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; + c = UTEXT_NEXT32(fInputText); + pos = UTEXT_GETNATIVEINDEX(fInputText); if (c == theChar) { - MatchAt(pos, FALSE, fDeferredStatus); + MatchAt(startPos, FALSE, fDeferredStatus); if (U_FAILURE(fDeferredStatus)) { return FALSE; } if (fMatch) { return TRUE; } + UTEXT_SETNATIVEINDEX(fInputText, pos); } - if (pos >= testLen) { + if (startPos >= testStartLimit) { fMatch = FALSE; fHitEnd = TRUE; return FALSE; } - } + startPos = pos; + if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) + return FALSE; + } } U_ASSERT(FALSE); @@ -518,12 +771,17 @@ UBool RegexMatcher::find() { if (fMatch) { return TRUE; } - U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; + UTEXT_SETNATIVEINDEX(fInputText, startPos); + c = UTEXT_NEXT32(fInputText); + startPos = UTEXT_GETNATIVEINDEX(fInputText); + } else { + UTEXT_SETNATIVEINDEX(fInputText, startPos); + c = UTEXT_PREVIOUS32(fInputText); + UTEXT_SETNATIVEINDEX(fInputText, startPos); } if (fPattern->fFlags & UREGEX_UNIX_LINES) { - for (;;) { - c = inputBuf[startPos-1]; + for (;;) { if (c == 0x0a) { MatchAt(startPos, FALSE, fDeferredStatus); if (U_FAILURE(fDeferredStatus)) { @@ -532,24 +790,28 @@ UBool RegexMatcher::find() { if (fMatch) { return TRUE; } + UTEXT_SETNATIVEINDEX(fInputText, startPos); } - if (startPos >= testLen) { + if (startPos >= testStartLimit) { fMatch = FALSE; fHitEnd = TRUE; return FALSE; } - U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; + c = UTEXT_NEXT32(fInputText); + startPos = UTEXT_GETNATIVEINDEX(fInputText); // Note that it's perfectly OK for a pattern to have a zero-length // match at the end of a string, so we must make sure that the loop - // runs with startPos == testLen the last time through. + // runs with startPos == testStartLimit the last time through. + if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) + return FALSE; } } else { for (;;) { - c = inputBuf[startPos-1]; if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) { - if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) { - startPos++; + if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { + UTEXT_NEXT32(fInputText); + startPos = UTEXT_GETNATIVEINDEX(fInputText); } MatchAt(startPos, FALSE, fDeferredStatus); if (U_FAILURE(fDeferredStatus)) { @@ -558,16 +820,20 @@ UBool RegexMatcher::find() { if (fMatch) { return TRUE; } + UTEXT_SETNATIVEINDEX(fInputText, startPos); } - if (startPos >= testLen) { + if (startPos >= testStartLimit) { fMatch = FALSE; fHitEnd = TRUE; return FALSE; } - U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; + c = UTEXT_NEXT32(fInputText); + startPos = UTEXT_GETNATIVEINDEX(fInputText); // Note that it's perfectly OK for a pattern to have a zero-length // match at the end of a string, so we must make sure that the loop - // runs with startPos == testLen the last time through. + // runs with startPos == testStartLimit the last time through. + if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) + return FALSE; } } } @@ -582,7 +848,7 @@ UBool RegexMatcher::find() { -UBool RegexMatcher::find(int32_t start, UErrorCode &status) { +UBool RegexMatcher::find(int64_t start, UErrorCode &status) { if (U_FAILURE(status)) { return FALSE; } @@ -592,209 +858,857 @@ UBool RegexMatcher::find(int32_t start, UErrorCode &status) { } this->reset(); // Note: Reset() is specified by Java Matcher documentation. // This will reset the region to be the full input length. - if (start < fActiveStart || start > fActiveLimit) { + if (start < 0) { + status = U_INDEX_OUTOFBOUNDS_ERROR; + return FALSE; + } + + int64_t nativeStart = start; + if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { status = U_INDEX_OUTOFBOUNDS_ERROR; return FALSE; } - fMatchEnd = start; + fMatchEnd = nativeStart; return find(); } - //-------------------------------------------------------------------------------- // -// group() +// findUsingChunk() -- like find(), but with the advance knowledge that the +// entire string is available in the UText's chunk buffer. // //-------------------------------------------------------------------------------- -UnicodeString RegexMatcher::group(UErrorCode &status) const { - return group(0, status); -} - - - -UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const { - int32_t s = start(groupNum, status); - int32_t e = end(groupNum, status); +UBool RegexMatcher::findUsingChunk() { + // Start at the position of the last match end. (Will be zero if the + // matcher has been reset. + // - // Note: calling start() and end() above will do all necessary checking that - // the group number is OK and that a match exists. status will be set. - if (U_FAILURE(status)) { - return UnicodeString(); - } - if (U_FAILURE(fDeferredStatus)) { - status = fDeferredStatus; - return UnicodeString(); + int32_t startPos = (int32_t)fMatchEnd; + if (startPos==0) { + startPos = (int32_t)fActiveStart; } + + const UChar *inputBuf = fInputText->chunkContents; - if (s < 0) { - // A capture group wasn't part of the match - return UnicodeString(); + if (fMatch) { + // Save the position of any previous successful match. + fLastMatchEnd = fMatchEnd; + + if (fMatchStart == fMatchEnd) { + // Previous match had zero length. Move start position up one position + // to avoid sending find() into a loop on zero-length matches. + if (startPos >= fActiveLimit) { + fMatch = FALSE; + fHitEnd = TRUE; + return FALSE; + } + U16_FWD_1(inputBuf, startPos, fInputLength); + } + } else { + if (fLastMatchEnd >= 0) { + // A previous find() failed to match. Don't try again. + // (without this test, a pattern with a zero-length match + // could match again at the end of an input string.) + fHitEnd = TRUE; + return FALSE; + } } - U_ASSERT(s <= e); - return UnicodeString(*fInput, s, e-s); -} - - - - -int32_t RegexMatcher::groupCount() const { - return fPattern->fGroupMap->size(); -} - - - -const UnicodeString &RegexMatcher::input() const { - return *fInput; -} - - -//-------------------------------------------------------------------------------- -// -// hasAnchoringBounds() -// -//-------------------------------------------------------------------------------- -UBool RegexMatcher::hasAnchoringBounds() const { - return fAnchoringBounds; -} - - -//-------------------------------------------------------------------------------- -// -// hasTransparentBounds() -// -//-------------------------------------------------------------------------------- -UBool RegexMatcher::hasTransparentBounds() const { - return fTransparentBounds; -} - - - -//-------------------------------------------------------------------------------- -// -// hitEnd() -// -//-------------------------------------------------------------------------------- -UBool RegexMatcher::hitEnd() const { - return fHitEnd; -} - -//-------------------------------------------------------------------------------- -// -// lookingAt() -// -//-------------------------------------------------------------------------------- -UBool RegexMatcher::lookingAt(UErrorCode &status) { - if (U_FAILURE(status)) { + + + // Compute the position in the input string beyond which a match can not begin, because + // the minimum length match would extend past the end of the input. + // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. + // Be aware of possible overflows if making changes here. + int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen); + if (startPos > testLen) { + fMatch = FALSE; + fHitEnd = TRUE; return FALSE; } - if (U_FAILURE(fDeferredStatus)) { - status = fDeferredStatus; - return FALSE; + + UChar32 c; + U_ASSERT(startPos >= 0); + + switch (fPattern->fStartType) { + case START_NO_INFO: + // No optimization was found. + // Try a match at each input position. + for (;;) { + MatchChunkAt(startPos, FALSE, fDeferredStatus); + if (U_FAILURE(fDeferredStatus)) { + return FALSE; + } + if (fMatch) { + return TRUE; + } + if (startPos >= testLen) { + fHitEnd = TRUE; + return FALSE; + } + U16_FWD_1(inputBuf, startPos, fActiveLimit); + // Note that it's perfectly OK for a pattern to have a zero-length + // match at the end of a string, so we must make sure that the loop + // runs with startPos == testLen the last time through. + if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) + return FALSE; + } + U_ASSERT(FALSE); + + case START_START: + // Matches are only possible at the start of the input string + // (pattern begins with ^ or \A) + if (startPos > fActiveStart) { + fMatch = FALSE; + return FALSE; + } + MatchChunkAt(startPos, FALSE, fDeferredStatus); + if (U_FAILURE(fDeferredStatus)) { + return FALSE; + } + return fMatch; + + + case START_SET: + { + // Match may start on any char from a pre-computed set. + U_ASSERT(fPattern->fMinMatchLen > 0); + for (;;) { + int32_t pos = startPos; + U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; + if ((c<256 && fPattern->fInitialChars8->contains(c)) || + (c>=256 && fPattern->fInitialChars->contains(c))) { + MatchChunkAt(pos, FALSE, fDeferredStatus); + if (U_FAILURE(fDeferredStatus)) { + return FALSE; + } + if (fMatch) { + return TRUE; + } + } + if (pos >= testLen) { + fMatch = FALSE; + fHitEnd = TRUE; + return FALSE; + } + if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) + return FALSE; + } } - resetPreserveRegion(); - MatchAt(fActiveStart, FALSE, status); - return fMatch; + U_ASSERT(FALSE); + + case START_STRING: + case START_CHAR: + { + // Match starts on exactly one char. + U_ASSERT(fPattern->fMinMatchLen > 0); + UChar32 theChar = fPattern->fInitialChar; + for (;;) { + int32_t pos = startPos; + U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; + if (c == theChar) { + MatchChunkAt(pos, FALSE, fDeferredStatus); + if (U_FAILURE(fDeferredStatus)) { + return FALSE; + } + if (fMatch) { + return TRUE; + } + } + if (pos >= testLen) { + fMatch = FALSE; + fHitEnd = TRUE; + return FALSE; + } + if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) + return FALSE; + } + } + U_ASSERT(FALSE); + + case START_LINE: + { + UChar32 c; + if (startPos == fAnchorStart) { + MatchChunkAt(startPos, FALSE, fDeferredStatus); + if (U_FAILURE(fDeferredStatus)) { + return FALSE; + } + if (fMatch) { + return TRUE; + } + U16_FWD_1(inputBuf, startPos, fActiveLimit); + } + + if (fPattern->fFlags & UREGEX_UNIX_LINES) { + for (;;) { + c = inputBuf[startPos-1]; + if (c == 0x0a) { + MatchChunkAt(startPos, FALSE, fDeferredStatus); + if (U_FAILURE(fDeferredStatus)) { + return FALSE; + } + if (fMatch) { + return TRUE; + } + } + if (startPos >= testLen) { + fMatch = FALSE; + fHitEnd = TRUE; + return FALSE; + } + U16_FWD_1(inputBuf, startPos, fActiveLimit); + // Note that it's perfectly OK for a pattern to have a zero-length + // match at the end of a string, so we must make sure that the loop + // runs with startPos == testLen the last time through. + if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) + return FALSE; + } + } else { + for (;;) { + c = inputBuf[startPos-1]; + if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible + ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) { + if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) { + startPos++; + } + MatchChunkAt(startPos, FALSE, fDeferredStatus); + if (U_FAILURE(fDeferredStatus)) { + return FALSE; + } + if (fMatch) { + return TRUE; + } + } + if (startPos >= testLen) { + fMatch = FALSE; + fHitEnd = TRUE; + return FALSE; + } + U16_FWD_1(inputBuf, startPos, fActiveLimit); + // Note that it's perfectly OK for a pattern to have a zero-length + // match at the end of a string, so we must make sure that the loop + // runs with startPos == testLen the last time through. + if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) + return FALSE; + } + } + } + + default: + U_ASSERT(FALSE); + } + + U_ASSERT(FALSE); + return FALSE; +} + + + +//-------------------------------------------------------------------------------- +// +// group() +// +//-------------------------------------------------------------------------------- +UnicodeString RegexMatcher::group(UErrorCode &status) const { + return group(0, status); } +// Return immutable shallow clone +UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const { + return group(0, dest, group_len, status); +} -UBool RegexMatcher::lookingAt(int32_t start, UErrorCode &status) { +// Return immutable shallow clone +UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const { + group_len = 0; + UBool bailOut = FALSE; if (U_FAILURE(status)) { - return FALSE; + return dest; } if (U_FAILURE(fDeferredStatus)) { status = fDeferredStatus; - return FALSE; + bailOut = TRUE; } - reset(); - if (start < fActiveStart || start > fActiveLimit) { + if (fMatch == FALSE) { + status = U_REGEX_INVALID_STATE; + bailOut = TRUE; + } + if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { status = U_INDEX_OUTOFBOUNDS_ERROR; - return FALSE; + bailOut = TRUE; } - MatchAt(start, FALSE, status); - return fMatch; + + if (bailOut) { + return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status); + } + + int64_t s, e; + if (groupNum == 0) { + s = fMatchStart; + e = fMatchEnd; + } else { + int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); + U_ASSERT(groupOffset < fPattern->fFrameSize); + U_ASSERT(groupOffset >= 0); + s = fFrame->fExtra[groupOffset]; + e = fFrame->fExtra[groupOffset+1]; + } + + if (s < 0) { + // A capture group wasn't part of the match + return utext_clone(dest, fInputText, FALSE, TRUE, &status); + } + U_ASSERT(s <= e); + group_len = e - s; + + dest = utext_clone(dest, fInputText, FALSE, TRUE, &status); + if (dest) + UTEXT_SETNATIVEINDEX(dest, s); + return dest; } +UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const { + UnicodeString result; + if (U_FAILURE(status)) { + return result; + } + UText resultText = UTEXT_INITIALIZER; + utext_openUnicodeString(&resultText, &result, &status); + group(groupNum, &resultText, status); + utext_close(&resultText); + return result; +} -//-------------------------------------------------------------------------------- -// -// matches() -// -//-------------------------------------------------------------------------------- -UBool RegexMatcher::matches(UErrorCode &status) { +// Return deep (mutable) clone +// Technology Preview (as an API), but note that the UnicodeString API is implemented +// using this function. +UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const { + UBool bailOut = FALSE; if (U_FAILURE(status)) { - return FALSE; + return dest; } if (U_FAILURE(fDeferredStatus)) { status = fDeferredStatus; - return FALSE; + bailOut = TRUE; } - resetPreserveRegion(); - MatchAt(fActiveStart, TRUE, status); - return fMatch; + + if (fMatch == FALSE) { + status = U_REGEX_INVALID_STATE; + bailOut = TRUE; + } + if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { + status = U_INDEX_OUTOFBOUNDS_ERROR; + bailOut = TRUE; + } + + if (bailOut) { + if (dest) { + utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); + return dest; + } else { + return utext_openUChars(NULL, NULL, 0, &status); + } + } + + int64_t s, e; + if (groupNum == 0) { + s = fMatchStart; + e = fMatchEnd; + } else { + int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); + U_ASSERT(groupOffset < fPattern->fFrameSize); + U_ASSERT(groupOffset >= 0); + s = fFrame->fExtra[groupOffset]; + e = fFrame->fExtra[groupOffset+1]; + } + + if (s < 0) { + // A capture group wasn't part of the match + if (dest) { + utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); + return dest; + } else { + return utext_openUChars(NULL, NULL, 0, &status); + } + } + U_ASSERT(s <= e); + + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { + U_ASSERT(e <= fInputLength); + if (dest) { + utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status); + } else { + UText groupText = UTEXT_INITIALIZER; + utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status); + dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); + utext_close(&groupText); + } + } else { + int32_t len16; + if (UTEXT_USES_U16(fInputText)) { + len16 = (int32_t)(e-s); + } else { + UErrorCode lengthStatus = U_ZERO_ERROR; + len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); + } + UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); + if (groupChars == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return dest; + } + utext_extract(fInputText, s, e, groupChars, len16+1, &status); + + if (dest) { + utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status); + } else { + UText groupText = UTEXT_INITIALIZER; + utext_openUChars(&groupText, groupChars, len16, &status); + dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); + utext_close(&groupText); + } + + uprv_free(groupChars); + } + return dest; } +//-------------------------------------------------------------------------------- +// +// appendGroup() -- currently internal only, appends a group to a UText rather +// than replacing its contents +// +//-------------------------------------------------------------------------------- -UBool RegexMatcher::matches(int32_t start, UErrorCode &status) { +int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const { if (U_FAILURE(status)) { - return FALSE; + return 0; } if (U_FAILURE(fDeferredStatus)) { status = fDeferredStatus; - return FALSE; + return 0; } - reset(); - if (start < fActiveStart || start > fActiveLimit) { + int64_t destLen = utext_nativeLength(dest); + + if (fMatch == FALSE) { + status = U_REGEX_INVALID_STATE; + return utext_replace(dest, destLen, destLen, NULL, 0, &status); + } + if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { status = U_INDEX_OUTOFBOUNDS_ERROR; - return FALSE; + return utext_replace(dest, destLen, destLen, NULL, 0, &status); } - MatchAt(start, TRUE, status); - return fMatch; + + int64_t s, e; + if (groupNum == 0) { + s = fMatchStart; + e = fMatchEnd; + } else { + int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); + U_ASSERT(groupOffset < fPattern->fFrameSize); + U_ASSERT(groupOffset >= 0); + s = fFrame->fExtra[groupOffset]; + e = fFrame->fExtra[groupOffset+1]; + } + + if (s < 0) { + // A capture group wasn't part of the match + return utext_replace(dest, destLen, destLen, NULL, 0, &status); + } + U_ASSERT(s <= e); + + int64_t deltaLen; + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { + U_ASSERT(e <= fInputLength); + deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status); + } else { + int32_t len16; + if (UTEXT_USES_U16(fInputText)) { + len16 = (int32_t)(e-s); + } else { + UErrorCode lengthStatus = U_ZERO_ERROR; + len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); + } + UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); + if (groupChars == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return 0; + } + utext_extract(fInputText, s, e, groupChars, len16+1, &status); + + deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status); + uprv_free(groupChars); + } + return deltaLen; } //-------------------------------------------------------------------------------- // -// pattern +// groupCount() // //-------------------------------------------------------------------------------- -const RegexPattern &RegexMatcher::pattern() const { - return *fPattern; +int32_t RegexMatcher::groupCount() const { + return fPattern->fGroupMap->size(); } //-------------------------------------------------------------------------------- // -// region +// hasAnchoringBounds() +// +//-------------------------------------------------------------------------------- +UBool RegexMatcher::hasAnchoringBounds() const { + return fAnchoringBounds; +} + + +//-------------------------------------------------------------------------------- +// +// hasTransparentBounds() +// +//-------------------------------------------------------------------------------- +UBool RegexMatcher::hasTransparentBounds() const { + return fTransparentBounds; +} + + + +//-------------------------------------------------------------------------------- +// +// hitEnd() +// +//-------------------------------------------------------------------------------- +UBool RegexMatcher::hitEnd() const { + return fHitEnd; +} + + +//-------------------------------------------------------------------------------- +// +// input() +// +//-------------------------------------------------------------------------------- +const UnicodeString &RegexMatcher::input() const { + if (!fInput) { + UErrorCode status = U_ZERO_ERROR; + int32_t len16; + if (UTEXT_USES_U16(fInputText)) { + len16 = (int32_t)fInputLength; + } else { + len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status); + status = U_ZERO_ERROR; // overflow, length status + } + UnicodeString *result = new UnicodeString(len16, 0, 0); + + UChar *inputChars = result->getBuffer(len16); + utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning + result->releaseBuffer(len16); + + (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator= + } + + return *fInput; +} + +//-------------------------------------------------------------------------------- +// +// inputText() +// +//-------------------------------------------------------------------------------- +UText *RegexMatcher::inputText() const { + return fInputText; +} + + +//-------------------------------------------------------------------------------- +// +// getInput() -- like inputText(), but makes a clone or copies into another UText +// +//-------------------------------------------------------------------------------- +UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const { + UBool bailOut = FALSE; + if (U_FAILURE(status)) { + return dest; + } + if (U_FAILURE(fDeferredStatus)) { + status = fDeferredStatus; + bailOut = TRUE; + } + + if (bailOut) { + if (dest) { + utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); + return dest; + } else { + return utext_clone(NULL, fInputText, FALSE, TRUE, &status); + } + } + + if (dest) { + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { + utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status); + } else { + int32_t input16Len; + if (UTEXT_USES_U16(fInputText)) { + input16Len = (int32_t)fInputLength; + } else { + UErrorCode lengthStatus = U_ZERO_ERROR; + input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error + } + UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len)); + if (inputChars == NULL) { + return dest; + } + + status = U_ZERO_ERROR; + utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning + status = U_ZERO_ERROR; + utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status); + + uprv_free(inputChars); + } + return dest; + } else { + return utext_clone(NULL, fInputText, FALSE, TRUE, &status); + } +} + + +static UBool compat_SyncMutableUTextContents(UText *ut); +static UBool compat_SyncMutableUTextContents(UText *ut) { + UBool retVal = FALSE; + + // In the following test, we're really only interested in whether the UText should switch + // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents + // will still point to the correct data. + if (utext_nativeLength(ut) != ut->nativeIndexingLimit) { + UnicodeString *us=(UnicodeString *)ut->context; + + // Update to the latest length. + // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit). + int32_t newLength = us->length(); + + // Update the chunk description. + // The buffer may have switched between stack- and heap-based. + ut->chunkContents = us->getBuffer(); + ut->chunkLength = newLength; + ut->chunkNativeLimit = newLength; + ut->nativeIndexingLimit = newLength; + retVal = TRUE; + } + + return retVal; +} + +//-------------------------------------------------------------------------------- +// +// lookingAt() +// +//-------------------------------------------------------------------------------- +UBool RegexMatcher::lookingAt(UErrorCode &status) { + if (U_FAILURE(status)) { + return FALSE; + } + if (U_FAILURE(fDeferredStatus)) { + status = fDeferredStatus; + return FALSE; + } + + if (fInputUniStrMaybeMutable) { + if (compat_SyncMutableUTextContents(fInputText)) { + fInputLength = utext_nativeLength(fInputText); + reset(); + } + } + else { + resetPreserveRegion(); + } + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { + MatchChunkAt((int32_t)fActiveStart, FALSE, status); + } else { + MatchAt(fActiveStart, FALSE, status); + } + return fMatch; +} + + +UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) { + if (U_FAILURE(status)) { + return FALSE; + } + if (U_FAILURE(fDeferredStatus)) { + status = fDeferredStatus; + return FALSE; + } + reset(); + + if (start < 0) { + status = U_INDEX_OUTOFBOUNDS_ERROR; + return FALSE; + } + + if (fInputUniStrMaybeMutable) { + if (compat_SyncMutableUTextContents(fInputText)) { + fInputLength = utext_nativeLength(fInputText); + reset(); + } + } + + int64_t nativeStart; + nativeStart = start; + if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { + status = U_INDEX_OUTOFBOUNDS_ERROR; + return FALSE; + } + + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { + MatchChunkAt((int32_t)nativeStart, FALSE, status); + } else { + MatchAt(nativeStart, FALSE, status); + } + return fMatch; +} + + + +//-------------------------------------------------------------------------------- +// +// matches() +// +//-------------------------------------------------------------------------------- +UBool RegexMatcher::matches(UErrorCode &status) { + if (U_FAILURE(status)) { + return FALSE; + } + if (U_FAILURE(fDeferredStatus)) { + status = fDeferredStatus; + return FALSE; + } + + if (fInputUniStrMaybeMutable) { + if (compat_SyncMutableUTextContents(fInputText)) { + fInputLength = utext_nativeLength(fInputText); + reset(); + } + } + else { + resetPreserveRegion(); + } + + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { + MatchChunkAt((int32_t)fActiveStart, TRUE, status); + } else { + MatchAt(fActiveStart, TRUE, status); + } + return fMatch; +} + + +UBool RegexMatcher::matches(int64_t start, UErrorCode &status) { + if (U_FAILURE(status)) { + return FALSE; + } + if (U_FAILURE(fDeferredStatus)) { + status = fDeferredStatus; + return FALSE; + } + reset(); + + if (start < 0) { + status = U_INDEX_OUTOFBOUNDS_ERROR; + return FALSE; + } + + if (fInputUniStrMaybeMutable) { + if (compat_SyncMutableUTextContents(fInputText)) { + fInputLength = utext_nativeLength(fInputText); + reset(); + } + } + + int64_t nativeStart; + nativeStart = start; + if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { + status = U_INDEX_OUTOFBOUNDS_ERROR; + return FALSE; + } + + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { + MatchChunkAt((int32_t)nativeStart, TRUE, status); + } else { + MatchAt(nativeStart, TRUE, status); + } + return fMatch; +} + + + +//-------------------------------------------------------------------------------- +// +// pattern +// +//-------------------------------------------------------------------------------- +const RegexPattern &RegexMatcher::pattern() const { + return *fPattern; +} + + + +//-------------------------------------------------------------------------------- +// +// region // //-------------------------------------------------------------------------------- -RegexMatcher &RegexMatcher::region(int32_t start, int32_t limit, UErrorCode &status) { +RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) { if (U_FAILURE(status)) { return *this; } - if (start>limit || start<0 || limit<0 || limit>fInput->length()) { + + if (regionStart>regionLimit || regionStart<0 || regionLimit<0) { status = U_ILLEGAL_ARGUMENT_ERROR; } - this->reset(); - fRegionStart = start; - fRegionLimit = limit; - fActiveStart = start; - fActiveLimit = limit; + + int64_t nativeStart = regionStart; + int64_t nativeLimit = regionLimit; + if (nativeStart > fInputLength || nativeLimit > fInputLength) { + status = U_ILLEGAL_ARGUMENT_ERROR; + } + + if (startIndex == -1) + this->reset(); + else + resetPreserveRegion(); + + fRegionStart = nativeStart; + fRegionLimit = nativeLimit; + fActiveStart = nativeStart; + fActiveLimit = nativeLimit; + + if (startIndex != -1) { + if (startIndex < fActiveStart || startIndex > fActiveLimit) { + status = U_INDEX_OUTOFBOUNDS_ERROR; + } + fMatchEnd = startIndex; + } + if (!fTransparentBounds) { - fLookStart = start; - fLookLimit = limit; + fLookStart = nativeStart; + fLookLimit = nativeLimit; } if (fAnchoringBounds) { - fAnchorStart = start; - fAnchorLimit = limit; + fAnchorStart = nativeStart; + fAnchorLimit = nativeLimit; } return *this; } - +RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) { + return region(start, limit, -1, status); +} //-------------------------------------------------------------------------------- // @@ -802,9 +1716,12 @@ RegexMatcher &RegexMatcher::region(int32_t start, int32_t limit, UErrorCode &sta // //-------------------------------------------------------------------------------- int32_t RegexMatcher::regionEnd() const { - return fRegionLimit; + return (int32_t)fRegionLimit; } +int64_t RegexMatcher::regionEnd64() const { + return fRegionLimit; +} //-------------------------------------------------------------------------------- // @@ -812,6 +1729,10 @@ int32_t RegexMatcher::regionEnd() const { // //-------------------------------------------------------------------------------- int32_t RegexMatcher::regionStart() const { + return (int32_t)fRegionStart; +} + +int64_t RegexMatcher::regionStart64() const { return fRegionStart; } @@ -822,51 +1743,112 @@ int32_t RegexMatcher::regionStart() const { // //-------------------------------------------------------------------------------- UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) { + UText replacementText = UTEXT_INITIALIZER; + UText resultText = UTEXT_INITIALIZER; + UnicodeString resultString; + if (U_FAILURE(status)) { + return resultString; + } + + utext_openConstUnicodeString(&replacementText, &replacement, &status); + utext_openUnicodeString(&resultText, &resultString, &status); + + replaceAll(&replacementText, &resultText, status); + + utext_close(&resultText); + utext_close(&replacementText); + + return resultString; +} + + +// +// replaceAll, UText mode +// +UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) { if (U_FAILURE(status)) { - return *fInput; + return dest; } if (U_FAILURE(fDeferredStatus)) { status = fDeferredStatus; - return *fInput; + return dest; } - UnicodeString destString; - reset(); - while (find()) { - appendReplacement(destString, replacement, status); - if (U_FAILURE(status)) { - break; + + if (dest == NULL) { + UnicodeString emptyString; + UText empty = UTEXT_INITIALIZER; + + utext_openUnicodeString(&empty, &emptyString, &status); + dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); + utext_close(&empty); + } + + if (U_SUCCESS(status)) { + reset(); + while (find()) { + appendReplacement(dest, replacement, status); + if (U_FAILURE(status)) { + break; + } } + appendTail(dest, status); } - appendTail(destString); - return destString; + + return dest; } - - //-------------------------------------------------------------------------------- // // replaceFirst // //-------------------------------------------------------------------------------- UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) { + UText replacementText = UTEXT_INITIALIZER; + UText resultText = UTEXT_INITIALIZER; + UnicodeString resultString; + + utext_openConstUnicodeString(&replacementText, &replacement, &status); + utext_openUnicodeString(&resultText, &resultString, &status); + + replaceFirst(&replacementText, &resultText, status); + + utext_close(&resultText); + utext_close(&replacementText); + + return resultString; +} + +// +// replaceFirst, UText mode +// +UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) { if (U_FAILURE(status)) { - return *fInput; + return dest; } if (U_FAILURE(fDeferredStatus)) { status = fDeferredStatus; - return *fInput; + return dest; } reset(); if (!find()) { - return *fInput; + return getInput(dest, status); } - - UnicodeString destString; - appendReplacement(destString, replacement, status); - appendTail(destString); - return destString; + + if (dest == NULL) { + UnicodeString emptyString; + UText empty = UTEXT_INITIALIZER; + + utext_openUnicodeString(&empty, &emptyString, &status); + dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); + utext_close(&empty); + } + + appendReplacement(dest, replacement, status); + appendTail(dest, status); + + return dest; } @@ -887,13 +1869,13 @@ UBool RegexMatcher::requireEnd() const { //-------------------------------------------------------------------------------- RegexMatcher &RegexMatcher::reset() { fRegionStart = 0; - fRegionLimit = fInput->length(); + fRegionLimit = fInputLength; fActiveStart = 0; - fActiveLimit = fRegionLimit; + fActiveLimit = fInputLength; fAnchorStart = 0; - fAnchorLimit = fRegionLimit; + fAnchorLimit = fInputLength; fLookStart = 0; - fLookLimit = fRegionLimit; + fLookLimit = fInputLength; resetPreserveRegion(); return *this; } @@ -910,33 +1892,69 @@ void RegexMatcher::resetPreserveRegion() { fRequireEnd = FALSE; fTime = 0; fTickCounter = TIMER_INITIAL_VALUE; - resetStack(); + //resetStack(); // more expensive than it looks... } RegexMatcher &RegexMatcher::reset(const UnicodeString &input) { - fInput = &input; + fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus); + if (fPattern->fNeedsAltInput) { + fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); + } + fInputLength = utext_nativeLength(fInputText); + reset(); + delete fInput; + fInput = NULL; + + // Do the following for any UnicodeString. + // This is for compatibility for those clients who modify the input string "live" during regex operations. + fInputUniStrMaybeMutable = TRUE; + if (fWordBreakItr != NULL) { - #if UCONFIG_NO_BREAK_ITERATION==0 - fWordBreakItr->setText(input); - #endif +#if UCONFIG_NO_BREAK_ITERATION==0 + UErrorCode status = U_ZERO_ERROR; + fWordBreakItr->setText(fInputText, status); +#endif } return *this; } -/*RegexMatcher &RegexMatcher::reset(const UChar *) { - fDeferredStatus = U_INTERNAL_PROGRAM_ERROR; - return *this; -}*/ - -RegexMatcher &RegexMatcher::reset(int32_t position, UErrorCode &status) { - if (U_FAILURE(status)) { +RegexMatcher &RegexMatcher::reset(UText *input) { + if (fInputText != input) { + fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus); + if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); + fInputLength = utext_nativeLength(fInputText); + + delete fInput; + fInput = NULL; + + if (fWordBreakItr != NULL) { +#if UCONFIG_NO_BREAK_ITERATION==0 + UErrorCode status = U_ZERO_ERROR; + fWordBreakItr->setText(input, status); +#endif + } + } + reset(); + fInputUniStrMaybeMutable = FALSE; + + return *this; +} + +/*RegexMatcher &RegexMatcher::reset(const UChar *) { + fDeferredStatus = U_INTERNAL_PROGRAM_ERROR; + return *this; +}*/ + +RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) { + if (U_FAILURE(status)) { return *this; } reset(); // Reset also resets the region to be the entire string. - if (position < 0 || position >= fActiveLimit) { + + if (position < 0 || position > fActiveLimit) { status = U_INDEX_OUTOFBOUNDS_ERROR; return *this; } @@ -967,7 +1985,42 @@ void RegexMatcher::setTrace(UBool state) { int32_t RegexMatcher::split(const UnicodeString &input, UnicodeString dest[], int32_t destCapacity, - UErrorCode &status) + UErrorCode &status) +{ + UText inputText = UTEXT_INITIALIZER; + utext_openConstUnicodeString(&inputText, &input, &status); + if (U_FAILURE(status)) { + return 0; + } + + UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity); + if (destText == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return 0; + } + int32_t i; + for (i = 0; i < destCapacity; i++) { + destText[i] = utext_openUnicodeString(NULL, &dest[i], &status); + } + + int32_t fieldCount = split(&inputText, destText, destCapacity, status); + + for (i = 0; i < destCapacity; i++) { + utext_close(destText[i]); + } + + uprv_free(destText); + utext_close(&inputText); + return fieldCount; +} + +// +// split, UText mode +// +int32_t RegexMatcher::split(UText *input, + UText *dest[], + int32_t destCapacity, + UErrorCode &status) { // // Check arguements for validity @@ -985,7 +2038,7 @@ int32_t RegexMatcher::split(const UnicodeString &input, // Reset for the input text // reset(input); - int32_t nextOutputStringStart = 0; + int64_t nextOutputStringStart = 0; if (fActiveLimit == 0) { return 0; } @@ -999,38 +2052,108 @@ int32_t RegexMatcher::split(const UnicodeString &input, if (i>=destCapacity-1) { // There is one or zero output string left. // Fill the last output string with whatever is left from the input, then exit the loop. - // ( i will be == destCapicity if we filled the output array while processing + // ( i will be == destCapacity if we filled the output array while processing // capture groups of the delimiter expression, in which case we will discard the // last capture group saved in favor of the unprocessed remainder of the // input string.) i = destCapacity-1; - int32_t remainingLength = fActiveLimit-nextOutputStringStart; - if (remainingLength > 0) { - dest[i].setTo(input, nextOutputStringStart, remainingLength); + if (fActiveLimit > nextOutputStringStart) { + if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { + if (dest[i]) { + utext_replace(dest[i], 0, utext_nativeLength(dest[i]), + input->chunkContents+nextOutputStringStart, + (int32_t)(fActiveLimit-nextOutputStringStart), &status); + } else { + UText remainingText = UTEXT_INITIALIZER; + utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, + fActiveLimit-nextOutputStringStart, &status); + dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); + utext_close(&remainingText); + } + } else { + UErrorCode lengthStatus = U_ZERO_ERROR; + int32_t remaining16Length = + utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus); + UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); + if (remainingChars == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + break; + } + + utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); + if (dest[i]) { + utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); + } else { + UText remainingText = UTEXT_INITIALIZER; + utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); + dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); + utext_close(&remainingText); + } + + uprv_free(remainingChars); + } } break; } if (find()) { // We found another delimiter. Move everything from where we started looking // up until the start of the delimiter into the next output string. - int32_t fieldLen = fMatchStart - nextOutputStringStart; - dest[i].setTo(input, nextOutputStringStart, fieldLen); + if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { + if (dest[i]) { + utext_replace(dest[i], 0, utext_nativeLength(dest[i]), + input->chunkContents+nextOutputStringStart, + (int32_t)(fMatchStart-nextOutputStringStart), &status); + } else { + UText remainingText = UTEXT_INITIALIZER; + utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, + fMatchStart-nextOutputStringStart, &status); + dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); + utext_close(&remainingText); + } + } else { + UErrorCode lengthStatus = U_ZERO_ERROR; + int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus); + UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); + if (remainingChars == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + break; + } + utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status); + if (dest[i]) { + utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); + } else { + UText remainingText = UTEXT_INITIALIZER; + utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); + dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); + utext_close(&remainingText); + } + + uprv_free(remainingChars); + } nextOutputStringStart = fMatchEnd; // If the delimiter pattern has capturing parentheses, the captured // text goes out into the next n destination strings. int32_t groupNum; + UBool lastGroupWasNullUText = FALSE; for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { if (i==destCapacity-1) { break; } i++; - dest[i] = group(groupNum, status); + lastGroupWasNullUText = (dest[i] == NULL ? TRUE : FALSE); + dest[i] = group(groupNum, dest[i], status); } if (nextOutputStringStart == fActiveLimit) { // The delimiter was at the end of the string. We're done. break; + } else if (i == destCapacity-1) { + // We're out of capture groups, and the rest of the string is more important + if (lastGroupWasNullUText) { + utext_close(dest[i]); + dest[i] = NULL; + } } } @@ -1038,15 +2161,49 @@ int32_t RegexMatcher::split(const UnicodeString &input, { // We ran off the end of the input while looking for the next delimiter. // All the remaining text goes into the current output string. - dest[i].setTo(input, nextOutputStringStart, fActiveLimit-nextOutputStringStart); + if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { + if (dest[i]) { + utext_replace(dest[i], 0, utext_nativeLength(dest[i]), + input->chunkContents+nextOutputStringStart, + (int32_t)(fActiveLimit-nextOutputStringStart), &status); + } else { + UText remainingText = UTEXT_INITIALIZER; + utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, + fActiveLimit-nextOutputStringStart, &status); + dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); + utext_close(&remainingText); + } + } else { + UErrorCode lengthStatus = U_ZERO_ERROR; + int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus); + UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); + if (remainingChars == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + break; + } + + utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); + if (dest[i]) { + utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); + } else { + UText remainingText = UTEXT_INITIALIZER; + utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); + dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); + utext_close(&remainingText); + } + + uprv_free(remainingChars); + } break; } - } + if (U_FAILURE(status)) { + break; + } + } // end of for loop return i+1; } - //-------------------------------------------------------------------------------- // // start @@ -1056,15 +2213,17 @@ int32_t RegexMatcher::start(UErrorCode &status) const { return start(0, status); } - - +int64_t RegexMatcher::start64(UErrorCode &status) const { + return start64(0, status); +} //-------------------------------------------------------------------------------- // // start(int32_t group, UErrorCode &status) // //-------------------------------------------------------------------------------- -int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const { + +int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const { if (U_FAILURE(status)) { return -1; } @@ -1080,7 +2239,7 @@ int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const { status = U_INDEX_OUTOFBOUNDS_ERROR; return -1; } - int32_t s; + int64_t s; if (group == 0) { s = fMatchStart; } else { @@ -1089,10 +2248,14 @@ int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const { U_ASSERT(groupOffset >= 0); s = fFrame->fExtra[groupOffset]; } + return s; } +int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const { + return (int32_t)start64(group, status); +} //-------------------------------------------------------------------------------- // @@ -1101,9 +2264,8 @@ int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const { //-------------------------------------------------------------------------------- RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) { fAnchoringBounds = b; - UErrorCode status = U_ZERO_ERROR; - region(fRegionStart, fRegionLimit, status); - U_ASSERT(U_SUCCESS(status)); + fAnchorStart = (fAnchoringBounds ? fRegionStart : 0); + fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength); return *this; } @@ -1115,9 +2277,8 @@ RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) { //-------------------------------------------------------------------------------- RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) { fTransparentBounds = b; - UErrorCode status = U_ZERO_ERROR; - region(fRegionStart, fRegionLimit, status); - U_ASSERT(U_SUCCESS(status)); + fLookStart = (fTransparentBounds ? 0 : fRegionStart); + fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit); return *this; } @@ -1210,11 +2371,11 @@ int32_t RegexMatcher::getStackLimit() const { void RegexMatcher::setMatchCallback(URegexMatchCallback *callback, const void *context, UErrorCode &status) { - if (U_FAILURE(status)) { - return; - } - fCallbackFn = callback; - fCallbackContext = context; + if (U_FAILURE(status)) { + return; + } + fCallbackFn = callback; + fCallbackContext = context; } @@ -1234,6 +2395,38 @@ void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback, } +//-------------------------------------------------------------------------------- +// +// setMatchCallback +// +//-------------------------------------------------------------------------------- +void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback, + const void *context, + UErrorCode &status) { + if (U_FAILURE(status)) { + return; + } + fFindProgressCallbackFn = callback; + fFindProgressCallbackContext = context; +} + + +//-------------------------------------------------------------------------------- +// +// getMatchCallback +// +//-------------------------------------------------------------------------------- +void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback, + const void *&context, + UErrorCode &status) { + if (U_FAILURE(status)) { + return; + } + callback = fFindProgressCallbackFn; + context = fFindProgressCallbackContext; +} + + //================================================================================ // // Code following this point in this file is the internal @@ -1251,252 +2444,2034 @@ void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback, //-------------------------------------------------------------------------------- REStackFrame *RegexMatcher::resetStack() { // Discard any previous contents of the state save stack, and initialize a - // new stack frame to all -1. The -1s are needed for capture group limits, where - // they indicate that a group has not yet matched anything. + // new stack frame with all -1 data. The -1s are needed for capture group limits, + // where they indicate that a group has not yet matched anything. fStack->removeAllElements(); - int32_t *iFrame = fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus); - int32_t i; - for (i=0; ifFrameSize; i++) { - iFrame[i] = -1; - } - return (REStackFrame *)iFrame; -} + REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus); + int32_t i; + for (i=0; ifFrameSize-RESTACKFRAME_HDRCOUNT; i++) { + iFrame->fExtra[i] = -1; + } + return iFrame; +} + + + +//-------------------------------------------------------------------------------- +// +// isWordBoundary +// in perl, "xab..cd..", \b is true at positions 0,3,5,7 +// For us, +// If the current char is a combining mark, +// \b is FALSE. +// Else Scan backwards to the first non-combining char. +// We are at a boundary if the this char and the original chars are +// opposite in membership in \w set +// +// parameters: pos - the current position in the input buffer +// +// TODO: double-check edge cases at region boundaries. +// +//-------------------------------------------------------------------------------- +UBool RegexMatcher::isWordBoundary(int64_t pos) { + UBool isBoundary = FALSE; + UBool cIsWord = FALSE; + + if (pos >= fLookLimit) { + fHitEnd = TRUE; + } else { + // Determine whether char c at current position is a member of the word set of chars. + // If we're off the end of the string, behave as though we're not at a word char. + UTEXT_SETNATIVEINDEX(fInputText, pos); + UChar32 c = UTEXT_CURRENT32(fInputText); + if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { + // Current char is a combining one. Not a boundary. + return FALSE; + } + cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); + } + + // Back up until we come to a non-combining char, determine whether + // that char is a word char. + UBool prevCIsWord = FALSE; + for (;;) { + if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) { + break; + } + UChar32 prevChar = UTEXT_PREVIOUS32(fInputText); + if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) + || u_charType(prevChar) == U_FORMAT_CHAR)) { + prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); + break; + } + } + isBoundary = cIsWord ^ prevCIsWord; + return isBoundary; +} + +UBool RegexMatcher::isChunkWordBoundary(int32_t pos) { + UBool isBoundary = FALSE; + UBool cIsWord = FALSE; + + const UChar *inputBuf = fInputText->chunkContents; + + if (pos >= fLookLimit) { + fHitEnd = TRUE; + } else { + // Determine whether char c at current position is a member of the word set of chars. + // If we're off the end of the string, behave as though we're not at a word char. + UChar32 c; + U16_GET(inputBuf, fLookStart, pos, fLookLimit, c); + if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { + // Current char is a combining one. Not a boundary. + return FALSE; + } + cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); + } + + // Back up until we come to a non-combining char, determine whether + // that char is a word char. + UBool prevCIsWord = FALSE; + for (;;) { + if (pos <= fLookStart) { + break; + } + UChar32 prevChar; + U16_PREV(inputBuf, fLookStart, pos, prevChar); + if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) + || u_charType(prevChar) == U_FORMAT_CHAR)) { + prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); + break; + } + } + isBoundary = cIsWord ^ prevCIsWord; + return isBoundary; +} + +//-------------------------------------------------------------------------------- +// +// isUWordBoundary +// +// Test for a word boundary using RBBI word break. +// +// parameters: pos - the current position in the input buffer +// +//-------------------------------------------------------------------------------- +UBool RegexMatcher::isUWordBoundary(int64_t pos) { + UBool returnVal = FALSE; +#if UCONFIG_NO_BREAK_ITERATION==0 + + // If we haven't yet created a break iterator for this matcher, do it now. + if (fWordBreakItr == NULL) { + fWordBreakItr = + (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus); + if (U_FAILURE(fDeferredStatus)) { + return FALSE; + } + fWordBreakItr->setText(fInputText, fDeferredStatus); + } + + if (pos >= fLookLimit) { + fHitEnd = TRUE; + returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real" + // words are not boundaries. All non-word chars stand by themselves, + // with word boundaries on both sides. + } else { + if (!UTEXT_USES_U16(fInputText)) { + // !!!: Would like a better way to do this! + UErrorCode status = U_ZERO_ERROR; + pos = utext_extract(fInputText, 0, pos, NULL, 0, &status); + } + returnVal = fWordBreakItr->isBoundary((int32_t)pos); + } +#endif + return returnVal; +} + +//-------------------------------------------------------------------------------- +// +// IncrementTime This function is called once each TIMER_INITIAL_VALUE state +// saves. Increment the "time" counter, and call the +// user callback function if there is one installed. +// +// If the match operation needs to be aborted, either for a time-out +// or because the user callback asked for it, just set an error status. +// The engine will pick that up and stop in its outer loop. +// +//-------------------------------------------------------------------------------- +void RegexMatcher::IncrementTime(UErrorCode &status) { + fTickCounter = TIMER_INITIAL_VALUE; + fTime++; + if (fCallbackFn != NULL) { + if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) { + status = U_REGEX_STOPPED_BY_CALLER; + return; + } + } + if (fTimeLimit > 0 && fTime >= fTimeLimit) { + status = U_REGEX_TIME_OUT; + } +} + +//-------------------------------------------------------------------------------- +// +// ReportFindProgress This function is called once for each advance in the target +// string from the find() function, and calls the user progress callback +// function if there is one installed. +// +// NOTE: +// +// If the match operation needs to be aborted because the user +// callback asked for it, just set an error status. +// The engine will pick that up and stop in its outer loop. +// +//-------------------------------------------------------------------------------- +UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) { + if (fFindProgressCallbackFn != NULL) { + if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) { + status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/; + return FALSE; + } + } + return TRUE; +} + +//-------------------------------------------------------------------------------- +// +// StateSave +// Make a new stack frame, initialized as a copy of the current stack frame. +// Set the pattern index in the original stack frame from the operand value +// in the opcode. Execution of the engine continues with the state in +// the newly created stack frame +// +// Note that reserveBlock() may grow the stack, resulting in the +// whole thing being relocated in memory. +// +// Parameters: +// fp The top frame pointer when called. At return, a new +// fame will be present +// savePatIdx An index into the compiled pattern. Goes into the original +// (not new) frame. If execution ever back-tracks out of the +// new frame, this will be where we continue from in the pattern. +// Return +// The new frame pointer. +// +//-------------------------------------------------------------------------------- +inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) { + // push storage for a new frame. + int64_t *newFP = fStack->reserveBlock(fFrameSize, status); + if (newFP == NULL) { + // Failure on attempted stack expansion. + // Stack function set some other error code, change it to a more + // specific one for regular expressions. + status = U_REGEX_STACK_OVERFLOW; + // We need to return a writable stack frame, so just return the + // previous frame. The match operation will stop quickly + // because of the error status, after which the frame will never + // be looked at again. + return fp; + } + fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack. + + // New stack frame = copy of old top frame. + int64_t *source = (int64_t *)fp; + int64_t *dest = newFP; + for (;;) { + *dest++ = *source++; + if (source == newFP) { + break; + } + } + + fTickCounter--; + if (fTickCounter <= 0) { + IncrementTime(status); // Re-initializes fTickCounter + } + fp->fPatIdx = savePatIdx; + return (REStackFrame *)newFP; +} + + +//-------------------------------------------------------------------------------- +// +// MatchAt This is the actual matching engine. +// +// startIdx: begin matching a this index. +// toEnd: if true, match must extend to end of the input region +// +//-------------------------------------------------------------------------------- +void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { + UBool isMatch = FALSE; // True if the we have a match. + + int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards + + int32_t op; // Operation from the compiled pattern, split into + int32_t opType; // the opcode + int32_t opValue; // and the operand value. + + #ifdef REGEX_RUN_DEBUG + if (fTraceDebug) + { + printf("MatchAt(startIdx=%ld)\n", startIdx); + printf("Original Pattern: "); + UChar32 c = utext_next32From(fPattern->fPattern, 0); + while (c != U_SENTINEL) { + if (c<32 || c>256) { + c = '.'; + } + REGEX_DUMP_DEBUG_PRINTF(("%c", c)); + + c = UTEXT_NEXT32(fPattern->fPattern); + } + printf("\n"); + printf("Input String: "); + c = utext_next32From(fInputText, 0); + while (c != U_SENTINEL) { + if (c<32 || c>256) { + c = '.'; + } + printf("%c", c); + + c = UTEXT_NEXT32(fInputText); + } + printf("\n"); + printf("\n"); + } + #endif + + if (U_FAILURE(status)) { + return; + } + + // Cache frequently referenced items from the compiled pattern + // + int64_t *pat = fPattern->fCompiledPat->getBuffer(); + + const UChar *litText = fPattern->fLiteralText.getBuffer(); + UVector *sets = fPattern->fSets; + + fFrameSize = fPattern->fFrameSize; + REStackFrame *fp = resetStack(); + + fp->fPatIdx = 0; + fp->fInputIdx = startIdx; + + // Zero out the pattern's static data + int32_t i; + for (i = 0; ifDataSize; i++) { + fData[i] = 0; + } + + // + // Main loop for interpreting the compiled pattern. + // One iteration of the loop per pattern operation performed. + // + for (;;) { +#if 0 + if (_heapchk() != _HEAPOK) { + fprintf(stderr, "Heap Trouble\n"); + } +#endif + + op = (int32_t)pat[fp->fPatIdx]; + opType = URX_TYPE(op); + opValue = URX_VAL(op); + #ifdef REGEX_RUN_DEBUG + if (fTraceDebug) { + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx, + UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); + fPattern->dumpOp(fp->fPatIdx); + } + #endif + fp->fPatIdx++; + + switch (opType) { + + + case URX_NOP: + break; + + + case URX_BACKTRACK: + // Force a backtrack. In some circumstances, the pattern compiler + // will notice that the pattern can't possibly match anything, and will + // emit one of these at that point. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + + + case URX_ONECHAR: + if (fp->fInputIdx < fActiveLimit) { + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + UChar32 c = UTEXT_NEXT32(fInputText); + if (c == opValue) { + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + break; + } + } else { + fHitEnd = TRUE; + } + + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + UBool success = FALSE; + UChar32 c = UTEXT_PREVIOUS32(fInputText); + while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) { + if (c == opValue) { + success = TRUE; + break; + } else if (c == U_SENTINEL) { + break; + } + c = UTEXT_PREVIOUS32(fInputText); + } + if (success) { + fHitEnd = FALSE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + + + case URX_STRING: + { + // Test input against a literal string. + // Strings require two slots in the compiled pattern, one for the + // offset to the string text, and one for the length. + int32_t stringStartIdx = opValue; + int32_t stringLen; + + op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand + fp->fPatIdx++; + opType = URX_TYPE(op); + stringLen = URX_VAL(op); + U_ASSERT(opType == URX_STRING_LEN); + U_ASSERT(stringLen >= 2); + + const UChar *patternChars = litText+stringStartIdx; + const UChar *patternEnd = patternChars+stringLen; + + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + UChar32 c; + UBool success = TRUE; + + while (patternChars < patternEnd && success) { + c = UTEXT_NEXT32(fInputText); + + if (c != U_SENTINEL && UTEXT_GETNATIVEINDEX(fInputText) <= fActiveLimit) { + if (U_IS_BMP(c)) { + success = (*patternChars == c); + patternChars += 1; + } else if (patternChars+1 < patternEnd) { + success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c)); + patternChars += 2; + } + } else { + success = FALSE; + fHitEnd = TRUE; // TODO: See ticket 6074 + } + } + + if (success) { + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + } else { + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size()) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + // Reset to last start point + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + patternChars = litText+stringStartIdx; + + // Search backwards for a possible start + do { + c = UTEXT_PREVIOUS32(fInputText); + if (c == U_SENTINEL) { + break; + } else if ((U_IS_BMP(c) && *patternChars == c) || + (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) { + success = TRUE; + break; + } + } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex); + + // And try again + if (success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + + case URX_STATE_SAVE: + fp = StateSave(fp, opValue, status); + break; + + + case URX_END: + // The match loop will exit via this path on a successful match, + // when we reach the end of the pattern. + if (toEnd && fp->fInputIdx != fActiveLimit) { + // The pattern matched, but not to the end of input. Try some more. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + isMatch = TRUE; + goto breakFromLoop; + + // Start and End Capture stack frame variables are laid out out like this: + // fp->fExtra[opValue] - The start of a completed capture group + // opValue+1 - The end of a completed capture group + // opValue+2 - the start of a capture group whose end + // has not yet been reached (and might not ever be). + case URX_START_CAPTURE: + U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); + fp->fExtra[opValue+2] = fp->fInputIdx; + break; + + + case URX_END_CAPTURE: + U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); + U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. + fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. + fp->fExtra[opValue+1] = fp->fInputIdx; // End position + U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); + break; + + + case URX_DOLLAR: // $, test for End of line + // or for position before new line at end of input + { + if (fp->fInputIdx >= fAnchorLimit) { + // We really are at the end of input. Success. + fHitEnd = TRUE; + fRequireEnd = TRUE; + break; + } + + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + + // If we are positioned just before a new-line that is located at the + // end of input, succeed. + UChar32 c = UTEXT_NEXT32(fInputText); + if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { + if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { + // If not in the middle of a CR/LF sequence + if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && (UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) { + // At new-line at end of input. Success + fHitEnd = TRUE; + fRequireEnd = TRUE; + + break; + } + } + } else { + UChar32 nextC = UTEXT_NEXT32(fInputText); + if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { + fHitEnd = TRUE; + fRequireEnd = TRUE; + break; // At CR/LF at end of input. Success + } + } + + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + break; + + + case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. + if (fp->fInputIdx >= fAnchorLimit) { + // Off the end of input. Success. + fHitEnd = TRUE; + fRequireEnd = TRUE; + break; + } else { + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + UChar32 c = UTEXT_NEXT32(fInputText); + // Either at the last character of input, or off the end. + if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) { + fHitEnd = TRUE; + fRequireEnd = TRUE; + break; + } + } + + // Not at end of input. Back-track out. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + + + case URX_DOLLAR_M: // $, test for End of line in multi-line mode + { + if (fp->fInputIdx >= fAnchorLimit) { + // We really are at the end of input. Success. + fHitEnd = TRUE; + fRequireEnd = TRUE; + break; + } + // If we are positioned just before a new-line, succeed. + // It makes no difference where the new-line is within the input. + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + UChar32 c = UTEXT_CURRENT32(fInputText); + if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { + // At a line end, except for the odd chance of being in the middle of a CR/LF sequence + // In multi-line mode, hitting a new-line just before the end of input does not + // set the hitEnd or requireEnd flags + if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) { + break; + } + } + // not at a new line. Fail. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + break; + + + case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode + { + if (fp->fInputIdx >= fAnchorLimit) { + // We really are at the end of input. Success. + fHitEnd = TRUE; + fRequireEnd = TRUE; // Java set requireEnd in this case, even though + break; // adding a new-line would not lose the match. + } + // If we are not positioned just before a new-line, the test fails; backtrack out. + // It makes no difference where the new-line is within the input. + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + if (UTEXT_CURRENT32(fInputText) != 0x0a) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + + case URX_CARET: // ^, test for start of line + if (fp->fInputIdx != fAnchorStart) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + break; + + + case URX_CARET_M: // ^, test for start of line in mulit-line mode + { + if (fp->fInputIdx == fAnchorStart) { + // We are at the start input. Success. + break; + } + // Check whether character just before the current pos is a new-line + // unless we are at the end of input + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + UChar32 c = UTEXT_PREVIOUS32(fInputText); + if ((fp->fInputIdx < fAnchorLimit) && + ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { + // It's a new-line. ^ is true. Success. + // TODO: what should be done with positions between a CR and LF? + break; + } + // Not at the start of a line. Fail. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + break; + + + case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode + { + U_ASSERT(fp->fInputIdx >= fAnchorStart); + if (fp->fInputIdx <= fAnchorStart) { + // We are at the start input. Success. + break; + } + // Check whether character just before the current pos is a new-line + U_ASSERT(fp->fInputIdx <= fAnchorLimit); + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + UChar32 c = UTEXT_PREVIOUS32(fInputText); + if (c != 0x0a) { + // Not at the start of a line. Back-track out. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + case URX_BACKSLASH_B: // Test for word boundaries + { + UBool success = isWordBoundary(fp->fInputIdx); + success ^= (opValue != 0); // flip sense for \B + if (!success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + + case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style + { + UBool success = isUWordBoundary(fp->fInputIdx); + success ^= (opValue != 0); // flip sense for \B + if (!success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + + case URX_BACKSLASH_D: // Test for decimal digit + { + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + + UChar32 c = UTEXT_NEXT32(fInputText); + int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. + UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); + success ^= (opValue != 0); // flip sense for \D + if (success) { + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + } else { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + + case URX_BACKSLASH_G: // Test for position at end of previous match + if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + break; + + + case URX_BACKSLASH_X: + // Match a Grapheme, as defined by Unicode TR 29. + // Differs slightly from Perl, which consumes combining marks independently + // of context. + { + + // Fail if at end of input + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + + // Examine (and consume) the current char. + // Dispatch into a little state machine, based on the char. + UChar32 c; + c = UTEXT_NEXT32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + UnicodeSet **sets = fPattern->fStaticSets; + if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; + if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; + if (sets[URX_GC_L]->contains(c)) goto GC_L; + if (sets[URX_GC_LV]->contains(c)) goto GC_V; + if (sets[URX_GC_LVT]->contains(c)) goto GC_T; + if (sets[URX_GC_V]->contains(c)) goto GC_V; + if (sets[URX_GC_T]->contains(c)) goto GC_T; + goto GC_Extend; + + + +GC_L: + if (fp->fInputIdx >= fActiveLimit) goto GC_Done; + c = UTEXT_NEXT32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + if (sets[URX_GC_L]->contains(c)) goto GC_L; + if (sets[URX_GC_LV]->contains(c)) goto GC_V; + if (sets[URX_GC_LVT]->contains(c)) goto GC_T; + if (sets[URX_GC_V]->contains(c)) goto GC_V; + UTEXT_PREVIOUS32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + goto GC_Extend; + +GC_V: + if (fp->fInputIdx >= fActiveLimit) goto GC_Done; + c = UTEXT_NEXT32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + if (sets[URX_GC_V]->contains(c)) goto GC_V; + if (sets[URX_GC_T]->contains(c)) goto GC_T; + UTEXT_PREVIOUS32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + goto GC_Extend; + +GC_T: + if (fp->fInputIdx >= fActiveLimit) goto GC_Done; + c = UTEXT_NEXT32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + if (sets[URX_GC_T]->contains(c)) goto GC_T; + UTEXT_PREVIOUS32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + goto GC_Extend; + +GC_Extend: + // Combining characters are consumed here + for (;;) { + if (fp->fInputIdx >= fActiveLimit) { + break; + } + c = UTEXT_CURRENT32(fInputText); + if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { + break; + } + UTEXT_NEXT32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + } + goto GC_Done; + +GC_Control: + // Most control chars stand alone (don't combine with combining chars), + // except for that CR/LF sequence is a single grapheme cluster. + if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { + c = UTEXT_NEXT32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + } + +GC_Done: + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + } + break; + } + + + + + case URX_BACKSLASH_Z: // Test for end of Input + if (fp->fInputIdx < fAnchorLimit) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } else { + fHitEnd = TRUE; + fRequireEnd = TRUE; + } + break; + + + + case URX_STATIC_SETREF: + { + // Test input character against one of the predefined sets + // (Word Characters, for example) + // The high bit of the op value is a flag for the match polarity. + // 0: success if input char is in set. + // 1: success if input char is not in set. + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + + UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); + opValue &= ~URX_NEG_SET; + U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); + + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + UChar32 c = UTEXT_NEXT32(fInputText); + if (c < 256) { + Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; + if (s8->contains(c)) { + success = !success; + } + } else { + const UnicodeSet *s = fPattern->fStaticSets[opValue]; + if (s->contains(c)) { + success = !success; + } + } + if (success) { + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + } else { + // the character wasn't in the set. + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + // Try to find it, backwards + UTEXT_PREVIOUS32(fInputText); // skip the first character we tried + success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset + do { + c = UTEXT_PREVIOUS32(fInputText); + if (c == U_SENTINEL) { + break; + } else if (c < 256) { + Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; + if (s8->contains(c)) { + success = !success; + } + } else { + const UnicodeSet *s = fPattern->fStaticSets[opValue]; + if (s->contains(c)) { + success = !success; + } + } + } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex && !success); + + if (success && c != U_SENTINEL) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + + case URX_STAT_SETREF_N: + { + // Test input character for NOT being a member of one of + // the predefined sets (Word Characters, for example) + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + + U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); + + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + + UChar32 c = UTEXT_NEXT32(fInputText); + if (c < 256) { + Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; + if (s8->contains(c) == FALSE) { + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + break; + } + } else { + const UnicodeSet *s = fPattern->fStaticSets[opValue]; + if (s->contains(c) == FALSE) { + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + break; + } + } + // the character wasn't in the set. + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + // Try to find it, backwards + UTEXT_PREVIOUS32(fInputText); // skip the first character we tried + UBool success = FALSE; + do { + c = UTEXT_PREVIOUS32(fInputText); + if (c == U_SENTINEL) { + break; + } else if (c < 256) { + Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; + if (s8->contains(c) == FALSE) { + success = TRUE; + break; + } + } else { + const UnicodeSet *s = fPattern->fStaticSets[opValue]; + if (s->contains(c) == FALSE) { + success = TRUE; + break; + } + } + } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex); + + if (success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + break; + + + case URX_SETREF: + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } else { + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + + // There is input left. Pick up one char and test it for set membership. + UChar32 c = UTEXT_NEXT32(fInputText); + U_ASSERT(opValue > 0 && opValue < sets->size()); + if (c<256) { + Regex8BitSet *s8 = &fPattern->fSets8[opValue]; + if (s8->contains(c)) { + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + break; + } + } else { + UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); + if (s->contains(c)) { + // The character is in the set. A Match. + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + break; + } + } + + // the character wasn't in the set. + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + // Try to find it, backwards + UTEXT_PREVIOUS32(fInputText); // skip the first character we tried + UBool success = FALSE; + do { + c = UTEXT_PREVIOUS32(fInputText); + if (c == U_SENTINEL) { + break; + } else if (c < 256) { + Regex8BitSet *s8 = &fPattern->fSets8[opValue]; + if (s8->contains(c)) { + success = TRUE; + break; + } + } else { + UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); + if (s->contains(c)) { + success = TRUE; + break; + } + } + } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex); + + if (success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + break; + + + case URX_DOTANY: + { + // . matches anything, but stops at end-of-line. + if (fp->fInputIdx >= fActiveLimit) { + // At end of input. Match failed. Backtrack out. + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + + // There is input left. Advance over one char, unless we've hit end-of-line + UChar32 c = UTEXT_NEXT32(fInputText); + if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible + ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { + // End of line in normal mode. . does not match. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + } + break; + + + case URX_DOTANY_ALL: + { + // ., in dot-matches-all (including new lines) mode + if (fp->fInputIdx >= fActiveLimit) { + // At end of input. Match failed. Backtrack out. + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + + // There is input left. Advance over one char, except if we are + // at a cr/lf, advance over both of them. + UChar32 c; + c = UTEXT_NEXT32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + if (c==0x0d && fp->fInputIdx < fActiveLimit) { + // In the case of a CR/LF, we need to advance over both. + UChar32 nextc = UTEXT_CURRENT32(fInputText); + if (nextc == 0x0a) { + UTEXT_NEXT32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + } + } + } + break; + + + case URX_DOTANY_UNIX: + { + // '.' operator, matches all, but stops at end-of-line. + // UNIX_LINES mode, so 0x0a is the only recognized line ending. + if (fp->fInputIdx >= fActiveLimit) { + // At end of input. Match failed. Backtrack out. + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + + // There is input left. Advance over one char, unless we've hit end-of-line + UChar32 c = UTEXT_NEXT32(fInputText); + if (c == 0x0a) { + // End of line in normal mode. '.' does not match the \n + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } else { + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + } + } + break; + + + case URX_JMP: + fp->fPatIdx = opValue; + break; + + case URX_FAIL: + isMatch = FALSE; + goto breakFromLoop; + + case URX_JMP_SAV: + U_ASSERT(opValue < fPattern->fCompiledPat->size()); + fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current + fp->fPatIdx = opValue; // Then JMP. + break; + + case URX_JMP_SAV_X: + // This opcode is used with (x)+, when x can match a zero length string. + // Same as JMP_SAV, except conditional on the match having made forward progress. + // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the + // data address of the input position at the start of the loop. + { + U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()); + int32_t stoOp = (int32_t)pat[opValue-1]; + U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); + int32_t frameLoc = URX_VAL(stoOp); + U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); + int64_t prevInputIdx = fp->fExtra[frameLoc]; + U_ASSERT(prevInputIdx <= fp->fInputIdx); + if (prevInputIdx < fp->fInputIdx) { + // The match did make progress. Repeat the loop. + fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current + fp->fPatIdx = opValue; + fp->fExtra[frameLoc] = fp->fInputIdx; + } + // If the input position did not advance, we do nothing here, + // execution will fall out of the loop. + } + break; + + case URX_CTR_INIT: + { + U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); + fp->fExtra[opValue] = 0; // Set the loop counter variable to zero + + // Pick up the three extra operands that CTR_INIT has, and + // skip the pattern location counter past + int32_t instrOperandLoc = (int32_t)fp->fPatIdx; + fp->fPatIdx += 3; + int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); + int32_t minCount = (int32_t)pat[instrOperandLoc+1]; + int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; + U_ASSERT(minCount>=0); + U_ASSERT(maxCount>=minCount || maxCount==-1); + U_ASSERT(loopLoc>fp->fPatIdx); + + if (minCount == 0) { + fp = StateSave(fp, loopLoc+1, status); + } + if (maxCount == 0) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + case URX_CTR_LOOP: + { + U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); + int32_t initOp = (int32_t)pat[opValue]; + U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); + int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; + int32_t minCount = (int32_t)pat[opValue+2]; + int32_t maxCount = (int32_t)pat[opValue+3]; + // Increment the counter. Note: we DIDN'T worry about counter + // overflow, since the data comes from UnicodeStrings, which + // stores its length in an int32_t. Do we have to think about + // this now that we're using UText? Probably not, since the length + // in UChar32s is still an int32_t. + (*pCounter)++; + U_ASSERT(*pCounter > 0); + if ((uint64_t)*pCounter >= (uint32_t)maxCount) { + U_ASSERT(*pCounter == maxCount || maxCount == -1); + break; + } + if (*pCounter >= minCount) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx = opValue + 4; // Loop back. + } + break; + + case URX_CTR_INIT_NG: + { + // Initialize a non-greedy loop + U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); + fp->fExtra[opValue] = 0; // Set the loop counter variable to zero + + // Pick up the three extra operands that CTR_INIT has, and + // skip the pattern location counter past + int32_t instrOperandLoc = (int32_t)fp->fPatIdx; + fp->fPatIdx += 3; + int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); + int32_t minCount = (int32_t)pat[instrOperandLoc+1]; + int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; + U_ASSERT(minCount>=0); + U_ASSERT(maxCount>=minCount || maxCount==-1); + U_ASSERT(loopLoc>fp->fPatIdx); + + if (minCount == 0) { + if (maxCount != 0) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block + } + } + break; + + case URX_CTR_LOOP_NG: + { + // Non-greedy {min, max} loops + U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); + int32_t initOp = (int32_t)pat[opValue]; + U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); + int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; + int32_t minCount = (int32_t)pat[opValue+2]; + int32_t maxCount = (int32_t)pat[opValue+3]; + // Increment the counter. Note: we DIDN'T worry about counter + // overflow, since the data comes from UnicodeStrings, which + // stores its length in an int32_t. Do we have to think about + // this now that we're using UText? Probably not, since the length + // in UChar32s is still an int32_t. + (*pCounter)++; + U_ASSERT(*pCounter > 0); + + if ((uint64_t)*pCounter >= (uint32_t)maxCount) { + // The loop has matched the maximum permitted number of times. + // Break out of here with no action. Matching will + // continue with the following pattern. + U_ASSERT(*pCounter == maxCount || maxCount == -1); + break; + } + + if (*pCounter < minCount) { + // We haven't met the minimum number of matches yet. + // Loop back for another one. + fp->fPatIdx = opValue + 4; // Loop back. + } else { + // We do have the minimum number of matches. + // Fall into the following pattern, but first do + // a state save to the top of the loop, so that a failure + // in the following pattern will try another iteration of the loop. + fp = StateSave(fp, opValue + 4, status); + } + } + break; + + case URX_STO_SP: + U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); + fData[opValue] = fStack->size(); + break; + + case URX_LD_SP: + { + U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); + int32_t newStackSize = (int32_t)fData[opValue]; + U_ASSERT(newStackSize <= fStack->size()); + int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; + if (newFP == (int64_t *)fp) { + break; + } + int32_t i; + for (i=0; isetSize(newStackSize); + } + break; + + case URX_BACKREF: + case URX_BACKREF_I: + { + U_ASSERT(opValue < fFrameSize); + int64_t groupStartIdx = fp->fExtra[opValue]; + int64_t groupEndIdx = fp->fExtra[opValue+1]; + U_ASSERT(groupStartIdx <= groupEndIdx); + if (groupStartIdx < 0) { + // This capture group has not participated in the match thus far, + fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. + } + + if (groupEndIdx == groupStartIdx) { + // The capture group match was of an empty string. + // Verified by testing: Perl matches succeed in this case, so + // we do too. + break; + } + + UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx); + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + + UBool haveMatch = (opType == URX_BACKREF ? + (0 == utext_compareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1)) : + (0 == utext_caseCompareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1, U_FOLD_CASE_DEFAULT, &status))); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + + if (fp->fInputIdx > fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. + } else if (!haveMatch) { + if (fp->fInputIdx == fActiveLimit) { + fHitEnd = TRUE; + } + fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. + } + } + break; + + case URX_STO_INP_LOC: + { + U_ASSERT(opValue >= 0 && opValue < fFrameSize); + fp->fExtra[opValue] = fp->fInputIdx; + } + break; + + case URX_JMPX: + { + int32_t instrOperandLoc = (int32_t)fp->fPatIdx; + fp->fPatIdx += 1; + int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); + U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); + int64_t savedInputIdx = fp->fExtra[dataLoc]; + U_ASSERT(savedInputIdx <= fp->fInputIdx); + if (savedInputIdx < fp->fInputIdx) { + fp->fPatIdx = opValue; // JMP + } else { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop. + } + } + break; + + case URX_LA_START: + { + // Entering a lookahead block. + // Save Stack Ptr, Input Pos. + U_ASSERT(opValue>=0 && opValue+1fDataSize); + fData[opValue] = fStack->size(); + fData[opValue+1] = fp->fInputIdx; + fActiveStart = fLookStart; // Set the match region change for + fActiveLimit = fLookLimit; // transparent bounds. + } + break; + + case URX_LA_END: + { + // Leaving a look-ahead block. + // restore Stack Ptr, Input Pos to positions they had on entry to block. + U_ASSERT(opValue>=0 && opValue+1fDataSize); + int32_t stackSize = fStack->size(); + int32_t newStackSize =(int32_t)fData[opValue]; + U_ASSERT(stackSize >= newStackSize); + if (stackSize > newStackSize) { + // Copy the current top frame back to the new (cut back) top frame. + // This makes the capture groups from within the look-ahead + // expression available. + int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; + int32_t i; + for (i=0; isetSize(newStackSize); + } + fp->fInputIdx = fData[opValue+1]; + + // Restore the active region bounds in the input string; they may have + // been changed because of transparent bounds on a Region. + fActiveStart = fRegionStart; + fActiveLimit = fRegionLimit; + } + break; + + case URX_ONECHAR_I: + if (fp->fInputIdx < fActiveLimit) { + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + + UChar32 c = UTEXT_NEXT32(fInputText); + if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + break; + } + } else { + fHitEnd = TRUE; + } + + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + UBool success = FALSE; + UChar32 c = UTEXT_PREVIOUS32(fInputText); + while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) { + if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { + success = TRUE; + break; + } else if (c == U_SENTINEL) { + break; + } + c = UTEXT_PREVIOUS32(fInputText); + } + if (success) { + fHitEnd = FALSE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + + case URX_STRING_I: + { + // Test input against a literal string. + // Strings require two slots in the compiled pattern, one for the + // offset to the string text, and one for the length. + const UCaseProps *csp = ucase_getSingleton(); + { + int32_t stringStartIdx, stringLen; + stringStartIdx = opValue; + + op = (int32_t)pat[fp->fPatIdx]; + fp->fPatIdx++; + opType = URX_TYPE(op); + opValue = URX_VAL(op); + U_ASSERT(opType == URX_STRING_LEN); + stringLen = opValue; + + const UChar *patternChars = litText+stringStartIdx; + const UChar *patternEnd = patternChars+stringLen; + + const UChar *foldChars = NULL; + int32_t foldOffset, foldLength; + UChar32 c; + + foldOffset = foldLength = 0; + UBool success = TRUE; + + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + while (patternChars < patternEnd && success) { + if(foldOffset < foldLength) { + U16_NEXT_UNSAFE(foldChars, foldOffset, c); + } else { + c = UTEXT_NEXT32(fInputText); + if (c != U_SENTINEL) { + foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT); + if(foldLength >= 0) { + if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings + foldOffset = 0; + U16_NEXT_UNSAFE(foldChars, foldOffset, c); + } else { + c = foldLength; + foldLength = foldOffset; // to avoid reading chars from the folding buffer + } + } + } + + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + } + + success = FALSE; + if (c != U_SENTINEL && (fp->fInputIdx <= fActiveLimit)) { + if (U_IS_BMP(c)) { + success = (*patternChars == c); + patternChars += 1; + } else if (patternChars+1 < patternEnd) { + success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c)); + patternChars += 2; + } + } else { + fHitEnd = TRUE; // TODO: See ticket 6074 + } + } + + if (!success) { + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size()) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + // Reset to last start point + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + patternChars = litText+stringStartIdx; + + // Search backwards for a possible start + do { + c = UTEXT_PREVIOUS32(fInputText); + if (c == U_SENTINEL) { + break; + } else { + foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT); + if(foldLength >= 0) { + if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings + foldOffset = 0; + U16_NEXT_UNSAFE(foldChars, foldOffset, c); + } else { + c = foldLength; + foldLength = foldOffset; // to avoid reading chars from the folding buffer + } + } + + if ((U_IS_BMP(c) && *patternChars == c) || + (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) { + success = TRUE; + break; + } + } + } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex); + + // And try again + if (success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + } + break; + + case URX_LB_START: + { + // Entering a look-behind block. + // Save Stack Ptr, Input Pos. + // TODO: implement transparent bounds. Ticket #6067 + U_ASSERT(opValue>=0 && opValue+1fDataSize); + fData[opValue] = fStack->size(); + fData[opValue+1] = fp->fInputIdx; + // Init the variable containing the start index for attempted matches. + fData[opValue+2] = -1; + // Save input string length, then reset to pin any matches to end at + // the current position. + fData[opValue+3] = fActiveLimit; + fActiveLimit = fp->fInputIdx; + } + break; + + + case URX_LB_CONT: + { + // Positive Look-Behind, at top of loop checking for matches of LB expression + // at all possible input starting positions. + + // Fetch the min and max possible match lengths. They are the operands + // of this op in the pattern. + int32_t minML = (int32_t)pat[fp->fPatIdx++]; + int32_t maxML = (int32_t)pat[fp->fPatIdx++]; + U_ASSERT(minML <= maxML); + U_ASSERT(minML >= 0); + + // Fetch (from data) the last input index where a match was attempted. + U_ASSERT(opValue>=0 && opValue+1fDataSize); + int64_t *lbStartIdx = &fData[opValue+2]; + if (*lbStartIdx < 0) { + // First time through loop. + *lbStartIdx = fp->fInputIdx - minML; + } else { + // 2nd through nth time through the loop. + // Back up start position for match by one. + if (*lbStartIdx == 0) { + (*lbStartIdx)--; + } else { + UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx); + UTEXT_PREVIOUS32(fInputText); + *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); + } + } + + if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { + // We have tried all potential match starting points without + // getting a match. Backtrack out, and out of the + // Look Behind altogether. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + int64_t restoreInputLen = fData[opValue+3]; + U_ASSERT(restoreInputLen >= fActiveLimit); + U_ASSERT(restoreInputLen <= fInputLength); + fActiveLimit = restoreInputLen; + break; + } + + // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. + // (successful match will fall off the end of the loop.) + fp = StateSave(fp, fp->fPatIdx-3, status); + fp->fInputIdx = *lbStartIdx; + } + break; + + case URX_LB_END: + // End of a look-behind block, after a successful match. + { + U_ASSERT(opValue>=0 && opValue+1fDataSize); + if (fp->fInputIdx != fActiveLimit) { + // The look-behind expression matched, but the match did not + // extend all the way to the point that we are looking behind from. + // FAIL out of here, which will take us back to the LB_CONT, which + // will retry the match starting at another position or fail + // the look-behind altogether, whichever is appropriate. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + + // Look-behind match is good. Restore the orignal input string length, + // which had been truncated to pin the end of the lookbehind match to the + // position being looked-behind. + int64_t originalInputLen = fData[opValue+3]; + U_ASSERT(originalInputLen >= fActiveLimit); + U_ASSERT(originalInputLen <= fInputLength); + fActiveLimit = originalInputLen; + } + break; + + + case URX_LBN_CONT: + { + // Negative Look-Behind, at top of loop checking for matches of LB expression + // at all possible input starting positions. + + // Fetch the extra parameters of this op. + int32_t minML = (int32_t)pat[fp->fPatIdx++]; + int32_t maxML = (int32_t)pat[fp->fPatIdx++]; + int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; + continueLoc = URX_VAL(continueLoc); + U_ASSERT(minML <= maxML); + U_ASSERT(minML >= 0); + U_ASSERT(continueLoc > fp->fPatIdx); + + // Fetch (from data) the last input index where a match was attempted. + U_ASSERT(opValue>=0 && opValue+1fDataSize); + int64_t *lbStartIdx = &fData[opValue+2]; + if (*lbStartIdx < 0) { + // First time through loop. + *lbStartIdx = fp->fInputIdx - minML; + } else { + // 2nd through nth time through the loop. + // Back up start position for match by one. + if (*lbStartIdx == 0) { + (*lbStartIdx)--; + } else { + UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx); + UTEXT_PREVIOUS32(fInputText); + *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); + } + } + + if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { + // We have tried all potential match starting points without + // getting a match, which means that the negative lookbehind as + // a whole has succeeded. Jump forward to the continue location + int64_t restoreInputLen = fData[opValue+3]; + U_ASSERT(restoreInputLen >= fActiveLimit); + U_ASSERT(restoreInputLen <= fInputLength); + fActiveLimit = restoreInputLen; + fp->fPatIdx = continueLoc; + break; + } + + // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. + // (successful match will cause a FAIL out of the loop altogether.) + fp = StateSave(fp, fp->fPatIdx-4, status); + fp->fInputIdx = *lbStartIdx; + } + break; + + case URX_LBN_END: + // End of a negative look-behind block, after a successful match. + { + U_ASSERT(opValue>=0 && opValue+1fDataSize); + if (fp->fInputIdx != fActiveLimit) { + // The look-behind expression matched, but the match did not + // extend all the way to the point that we are looking behind from. + // FAIL out of here, which will take us back to the LB_CONT, which + // will retry the match starting at another position or succeed + // the look-behind altogether, whichever is appropriate. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + + // Look-behind expression matched, which means look-behind test as + // a whole Fails + + // Restore the orignal input string length, which had been truncated + // inorder to pin the end of the lookbehind match + // to the position being looked-behind. + int64_t originalInputLen = fData[opValue+3]; + U_ASSERT(originalInputLen >= fActiveLimit); + U_ASSERT(originalInputLen <= fInputLength); + fActiveLimit = originalInputLen; + + // Restore original stack position, discarding any state saved + // by the successful pattern match. + U_ASSERT(opValue>=0 && opValue+1fDataSize); + int32_t newStackSize = (int32_t)fData[opValue]; + U_ASSERT(fStack->size() > newStackSize); + fStack->setSize(newStackSize); + + // FAIL, which will take control back to someplace + // prior to entering the look-behind test. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + break; + + + case URX_LOOP_SR_I: + // Loop Initialization for the optimized implementation of + // [some character set]* + // This op scans through all matching input. + // The following LOOP_C op emulates stack unwinding if the following pattern fails. + { + U_ASSERT(opValue > 0 && opValue < sets->size()); + Regex8BitSet *s8 = &fPattern->fSets8[opValue]; + UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); + + // Loop through input, until either the input is exhausted or + // we reach a character that is not a member of the set. + int64_t ix = fp->fInputIdx; + UTEXT_SETNATIVEINDEX(fInputText, ix); + for (;;) { + if (ix >= fActiveLimit) { + fHitEnd = TRUE; + break; + } + UChar32 c = UTEXT_NEXT32(fInputText); + if (c<256) { + if (s8->contains(c) == FALSE) { + break; + } + } else { + if (s->contains(c) == FALSE) { + break; + } + } + ix = UTEXT_GETNATIVEINDEX(fInputText); + } + + // If there were no matching characters, skip over the loop altogether. + // The loop doesn't run at all, a * op always succeeds. + if (ix == fp->fInputIdx) { + fp->fPatIdx++; // skip the URX_LOOP_C op. + break; + } + + // Peek ahead in the compiled pattern, to the URX_LOOP_C that + // must follow. It's operand is the stack location + // that holds the starting input index for the match of this [set]* + int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; + U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); + int32_t stackLoc = URX_VAL(loopcOp); + U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); + fp->fExtra[stackLoc] = fp->fInputIdx; + #ifdef REGEX_SMART_BACKTRACKING + backSearchIndex = fp->fInputIdx; + #endif + fp->fInputIdx = ix; + + // Save State to the URX_LOOP_C op that follows this one, + // so that match failures in the following code will return to there. + // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. + fp = StateSave(fp, fp->fPatIdx, status); + fp->fPatIdx++; + } + break; + + + case URX_LOOP_DOT_I: + // Loop Initialization for the optimized implementation of .* + // This op scans through all remaining input. + // The following LOOP_C op emulates stack unwinding if the following pattern fails. + { + // Loop through input until the input is exhausted (we reach an end-of-line) + // In DOTALL mode, we can just go straight to the end of the input. + int64_t ix; + if ((opValue & 1) == 1) { + // Dot-matches-All mode. Jump straight to the end of the string. + ix = fActiveLimit; + fHitEnd = TRUE; + } else { + // NOT DOT ALL mode. Line endings do not match '.' + // Scan forward until a line ending or end of input. + ix = fp->fInputIdx; + UTEXT_SETNATIVEINDEX(fInputText, ix); + for (;;) { + if (ix >= fActiveLimit) { + fHitEnd = TRUE; + break; + } + UChar32 c = UTEXT_NEXT32(fInputText); + if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s + if ((c == 0x0a) || // 0x0a is newline in both modes. + (((opValue & 2) == 0) && // IF not UNIX_LINES mode + (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) { + // char is a line ending. Exit the scanning loop. + break; + } + } + ix = UTEXT_GETNATIVEINDEX(fInputText); + } + } + + // If there were no matching characters, skip over the loop altogether. + // The loop doesn't run at all, a * op always succeeds. + if (ix == fp->fInputIdx) { + fp->fPatIdx++; // skip the URX_LOOP_C op. + break; + } + + // Peek ahead in the compiled pattern, to the URX_LOOP_C that + // must follow. It's operand is the stack location + // that holds the starting input index for the match of this .* + int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; + U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); + int32_t stackLoc = URX_VAL(loopcOp); + U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); + fp->fExtra[stackLoc] = fp->fInputIdx; + #ifdef REGEX_SMART_BACKTRACKING + backSearchIndex = fp->fInputIdx; + #endif + fp->fInputIdx = ix; + + // Save State to the URX_LOOP_C op that follows this one, + // so that match failures in the following code will return to there. + // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. + fp = StateSave(fp, fp->fPatIdx, status); + fp->fPatIdx++; + } + break; + + + case URX_LOOP_C: + { + U_ASSERT(opValue>=0 && opValuefExtra[opValue]; + U_ASSERT(backSearchIndex <= fp->fInputIdx); + if (backSearchIndex == fp->fInputIdx) { + // We've backed up the input idx to the point that the loop started. + // The loop is done. Leave here without saving state. + // Subsequent failures won't come back here. + break; + } + // Set up for the next iteration of the loop, with input index + // backed up by one from the last time through, + // and a state save to this instruction in case the following code fails again. + // (We're going backwards because this loop emulates stack unwinding, not + // the initial scan forward.) + U_ASSERT(fp->fInputIdx > 0); + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + UChar32 prevC = UTEXT_PREVIOUS32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + + UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText); + if (prevC == 0x0a && + fp->fInputIdx > backSearchIndex && + twoPrevC == 0x0d) { + int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; + if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { + // .*, stepping back over CRLF pair. + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + } + } + + fp = StateSave(fp, fp->fPatIdx-1, status); + } + break; -//-------------------------------------------------------------------------------- -// -// isWordBoundary -// in perl, "xab..cd..", \b is true at positions 0,3,5,7 -// For us, -// If the current char is a combining mark, -// \b is FALSE. -// Else Scan backwards to the first non-combining char. -// We are at a boundary if the this char and the original chars are -// opposite in membership in \w set -// -// parameters: pos - the current position in the input buffer -// -// TODO: double-check edge cases at region boundaries. -// -//-------------------------------------------------------------------------------- -UBool RegexMatcher::isWordBoundary(int32_t pos) { - UBool isBoundary = FALSE; - UBool cIsWord = FALSE; - - if (pos >= fLookLimit) { - fHitEnd = TRUE; - } else { - // Determine whether char c at current position is a member of the word set of chars. - // If we're off the end of the string, behave as though we're not at a word char. - UChar32 c = fInput->char32At(pos); - if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { - // Current char is a combining one. Not a boundary. - return FALSE; - } - cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); - } - - // Back up until we come to a non-combining char, determine whether - // that char is a word char. - UBool prevCIsWord = FALSE; - int32_t prevPos = pos; - for (;;) { - if (prevPos <= fLookStart) { - break; + + default: + // Trouble. The compiled pattern contains an entry with an + // unrecognized type tag. + U_ASSERT(FALSE); } - prevPos = fInput->moveIndex32(prevPos, -1); - UChar32 prevChar = fInput->char32At(prevPos); - if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) - || u_charType(prevChar) == U_FORMAT_CHAR)) { - prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); + + if (U_FAILURE(status)) { + isMatch = FALSE; break; } } - isBoundary = cIsWord ^ prevCIsWord; - return isBoundary; -} - -//-------------------------------------------------------------------------------- -// -// isUWordBoundary -// -// Test for a word boundary using RBBI word break. -// -// parameters: pos - the current position in the input buffer -// -//-------------------------------------------------------------------------------- -UBool RegexMatcher::isUWordBoundary(int32_t pos) { - UBool returnVal = FALSE; -#if UCONFIG_NO_BREAK_ITERATION==0 - // If we haven't yet created a break iterator for this matcher, do it now. - if (fWordBreakItr == NULL) { - fWordBreakItr = - (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus); - if (U_FAILURE(fDeferredStatus)) { - return FALSE; +breakFromLoop: + fMatch = isMatch; + if (isMatch) { + fLastMatchEnd = fMatchEnd; + fMatchStart = startIdx; + fMatchEnd = fp->fInputIdx; + if (fTraceDebug) { + REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd)); } - fWordBreakItr->setText(*fInput); - } - - if (pos >= fLookLimit) { - fHitEnd = TRUE; - returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real" - // words are not boundaries. All non-word chars stand by themselves, - // with word boundaries on both sides. - } else { - returnVal = fWordBreakItr->isBoundary(pos); } -#endif - return returnVal; -} - -//-------------------------------------------------------------------------------- -// -// IncrementTime This function is called once each TIMER_INITIAL_VALUE state -// saves. Increment the "time" counter, and call the -// user callback function if there is one installed. -// -// If the match operation needs to be aborted, either for a time-out -// or because the user callback asked for it, just set an error status. -// The engine will pick that up and stop in its outer loop. -// -//-------------------------------------------------------------------------------- -void RegexMatcher::IncrementTime(UErrorCode &status) { - fTickCounter = TIMER_INITIAL_VALUE; - fTime++; - if (fCallbackFn != NULL) { - if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) { - status = U_REGEX_STOPPED_BY_CALLER; - return; + else + { + if (fTraceDebug) { + REGEX_RUN_DEBUG_PRINTF(("No match\n\n")); } } - if (fTimeLimit > 0 && fTime >= fTimeLimit) { - status = U_REGEX_TIME_OUT; - } -} -//-------------------------------------------------------------------------------- -// -// StateSave -// Make a new stack frame, initialized as a copy of the current stack frame. -// Set the pattern index in the original stack frame from the operand value -// in the opcode. Execution of the engine continues with the state in -// the newly created stack frame -// -// Note that reserveBlock() may grow the stack, resulting in the -// whole thing being relocated in memory. -// -// Parameters: -// fp The top frame pointer when called. At return, a new -// fame will be present -// savePatIdx An index into the compiled pattern. Goes into the original -// (not new) frame. If execution ever back-tracks out of the -// new frame, this will be where we continue from in the pattern. -// Return -// The new frame pointer. -// -//-------------------------------------------------------------------------------- -inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int32_t savePatIdx, UErrorCode &status) { - // push storage for a new frame. - int32_t *newFP = fStack->reserveBlock(fFrameSize, status); - if (newFP == NULL) { - // Failure on attempted stack expansion. - // Stack function set some other error code, change it to a more - // specific one for regular expressions. - status = U_REGEX_STACK_OVERFLOW; - // We need to return a writable stack frame, so just return the - // previous frame. The match operation will stop quickly - // because of the error status, after which the frame will never - // be looked at again. - return fp; - } - fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack. - - // New stack frame = copy of old top frame. - int32_t *source = (int32_t *)fp; - int32_t *dest = newFP; - for (;;) { - *dest++ = *source++; - if (source == newFP) { - break; - } - } - - fTickCounter--; - if (fTickCounter <= 0) { - IncrementTime(status); // Re-initializes fTickCounter - } - fp->fPatIdx = savePatIdx; - return (REStackFrame *)newFP; + fFrame = fp; // The active stack frame when the engine stopped. + // Contains the capture group results that we need to + // access later. + return; } //-------------------------------------------------------------------------------- // -// MatchAt This is the actual matching engine. +// MatchChunkAt This is the actual matching engine. Like MatchAt, but with the +// assumption that the entire string is available in the UText's +// chunk buffer. For now, that means we can use int32_t indexes, +// except for anything that needs to be saved (like group starts +// and ends). // // startIdx: begin matching a this index. // toEnd: if true, match must extend to end of the input region // //-------------------------------------------------------------------------------- -void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { +void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { UBool isMatch = FALSE; // True if the we have a match. + + int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards int32_t op; // Operation from the compiled pattern, split into int32_t opType; // the opcode int32_t opValue; // and the operand value. - - #ifdef REGEX_RUN_DEBUG + +#ifdef REGEX_RUN_DEBUG if (fTraceDebug) { - printf("MatchAt(startIdx=%d)\n", startIdx); + printf("MatchAt(startIdx=%ld)\n", startIdx); printf("Original Pattern: "); - int32_t i; - for (i=0; ifPattern.length(); i++) { - printf("%c", fPattern->fPattern.charAt(i)); + UChar32 c = utext_next32From(fPattern->fPattern, 0); + while (c != U_SENTINEL) { + if (c<32 || c>256) { + c = '.'; + } + REGEX_DUMP_DEBUG_PRINTF(("%c", c)); + + c = UTEXT_NEXT32(fPattern->fPattern); } printf("\n"); printf("Input String: "); - for (i=0; ilength(); i++) { - UChar c = fInput->charAt(i); + c = utext_next32From(fInputText, 0); + while (c != U_SENTINEL) { if (c<32 || c>256) { c = '.'; } printf("%c", c); + + c = UTEXT_NEXT32(fInputText); } printf("\n"); printf("\n"); } - #endif - +#endif + if (U_FAILURE(status)) { return; } - + // Cache frequently referenced items from the compiled pattern // - int32_t *pat = fPattern->fCompiledPat->getBuffer(); - + int64_t *pat = fPattern->fCompiledPat->getBuffer(); + const UChar *litText = fPattern->fLiteralText.getBuffer(); UVector *sets = fPattern->fSets; - - const UChar *inputBuf = fInput->getBuffer(); - + + const UChar *inputBuf = fInputText->chunkContents; + fFrameSize = fPattern->fFrameSize; REStackFrame *fp = resetStack(); - + fp->fPatIdx = 0; fp->fInputIdx = startIdx; - + // Zero out the pattern's static data int32_t i; for (i = 0; ifDataSize; i++) { fData[i] = 0; } - + // // Main loop for interpreting the compiled pattern. // One iteration of the loop per pattern operation performed. @@ -1507,36 +4482,38 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { fprintf(stderr, "Heap Trouble\n"); } #endif - op = pat[fp->fPatIdx]; + + op = (int32_t)pat[fp->fPatIdx]; opType = URX_TYPE(op); opValue = URX_VAL(op); - #ifdef REGEX_RUN_DEBUG +#ifdef REGEX_RUN_DEBUG if (fTraceDebug) { - printf("inputIdx=%d inputChar=%c sp=%3d ", fp->fInputIdx, - fInput->char32At(fp->fInputIdx), (int32_t *)fp-fStack->getBuffer()); + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx, + UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); fPattern->dumpOp(fp->fPatIdx); } - #endif +#endif fp->fPatIdx++; - + switch (opType) { - - + + case URX_NOP: break; - - + + case URX_BACKTRACK: // Force a backtrack. In some circumstances, the pattern compiler // will notice that the pattern can't possibly match anything, and will // emit one of these at that point. fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; - - + + case URX_ONECHAR: if (fp->fInputIdx < fActiveLimit) { - UChar32 c; + UChar32 c; U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); if (c == opValue) { break; @@ -1544,10 +4521,37 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { } else { fHitEnd = TRUE; } - fp = (REStackFrame *)fStack->popFrame(fFrameSize); - break; + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + int64_t reverseIndex = fp->fInputIdx; + UChar32 c; + do { + U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); + if (c == opValue) { + break; + } + } while (reverseIndex > backSearchIndex); + if (c == opValue) { + fHitEnd = FALSE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = reverseIndex; + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + + case URX_STRING: { // Test input against a literal string. @@ -1555,49 +4559,86 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { // offset to the string text, and one for the length. int32_t stringStartIdx = opValue; int32_t stringLen; - - op = pat[fp->fPatIdx]; // Fetch the second operand + + op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand fp->fPatIdx++; opType = URX_TYPE(op); stringLen = URX_VAL(op); U_ASSERT(opType == URX_STRING_LEN); U_ASSERT(stringLen >= 2); - + if (fp->fInputIdx + stringLen > fActiveLimit) { // No match. String is longer than the remaining input text. fHitEnd = TRUE; // TODO: See ticket 6074 fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } - + const UChar * pInp = inputBuf + fp->fInputIdx; const UChar * pPat = litText+stringStartIdx; const UChar * pEnd = pInp + stringLen; + UBool success = FALSE; for(;;) { if (*pInp == *pPat) { pInp++; pPat++; if (pInp == pEnd) { // Successful Match. - fp->fInputIdx += stringLen; + success = TRUE; break; } } else { // Match failed. - fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } } + + if (success) { + fp->fInputIdx += stringLen; + } else { + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size()) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + // Reset to last start point + int64_t reverseIndex = fp->fInputIdx; + UChar32 c; + pPat = litText+stringStartIdx; + + // Search backwards for a possible start + do { + U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); + if ((U_IS_BMP(c) && *pPat == c) || + (*pPat == U16_LEAD(c) && *(pPat+1) == U16_TRAIL(c))) { + success = TRUE; + break; + } + } while (reverseIndex > backSearchIndex); + + // And try again + if (success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = reverseIndex; + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } } - break; - - - + break; + + case URX_STATE_SAVE: fp = StateSave(fp, opValue, status); break; - - + + case URX_END: // The match loop will exit via this path on a successful match, // when we reach the end of the pattern. @@ -1608,8 +4649,8 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { } isMatch = TRUE; goto breakFromLoop; - - // Start and End Capture stack frame variables are layout out like this: + + // Start and End Capture stack frame variables are laid out out like this: // fp->fExtra[opValue] - The start of a completed capture group // opValue+1 - The end of a completed capture group // opValue+2 - the start of a capture group whose end @@ -1618,8 +4659,8 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); fp->fExtra[opValue+2] = fp->fInputIdx; break; - - + + case URX_END_CAPTURE: U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. @@ -1627,10 +4668,10 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { fp->fExtra[opValue+1] = fp->fInputIdx; // End position U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); break; - - + + case URX_DOLLAR: // $, test for End of line - // or for position before new line at end of input + // or for position before new line at end of input if (fp->fInputIdx < fAnchorLimit-2) { // We are no where near the end of input. Fail. // This is the common case. Keep it first. @@ -1643,12 +4684,14 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { fRequireEnd = TRUE; break; } + // If we are positioned just before a new-line that is located at the // end of input, succeed. if (fp->fInputIdx == fAnchorLimit-1) { - UChar32 c = fInput->char32At(fp->fInputIdx); + UChar32 c; + U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c); + if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { - // If not in the middle of a CR/LF sequence if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { // At new-line at end of input. Success fHitEnd = TRUE; @@ -1656,26 +4699,24 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { break; } } - } - - if (fp->fInputIdx == fAnchorLimit-2 && - fInput->char32At(fp->fInputIdx) == 0x0d && fInput->char32At(fp->fInputIdx+1) == 0x0a) { + } else if (fp->fInputIdx == fAnchorLimit-2 && + inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) { fHitEnd = TRUE; fRequireEnd = TRUE; break; // At CR/LF at end of input. Success } - + fp = (REStackFrame *)fStack->popFrame(fFrameSize); - + break; - - - case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. + + + case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. if (fp->fInputIdx >= fAnchorLimit-1) { // Either at the last character of input, or off the end. if (fp->fInputIdx == fAnchorLimit-1) { // At last char of input. Success if it's a new line. - if (fInput->char32At(fp->fInputIdx) == 0x0a) { + if (inputBuf[fp->fInputIdx] == 0x0a) { fHitEnd = TRUE; fRequireEnd = TRUE; break; @@ -1687,110 +4728,110 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { break; } } - + // Not at end of input. Back-track out. fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; - - - case URX_DOLLAR_M: // $, test for End of line in multi-line mode - { - if (fp->fInputIdx >= fAnchorLimit) { - // We really are at the end of input. Success. - fHitEnd = TRUE; - fRequireEnd = TRUE; - break; - } - // If we are positioned just before a new-line, succeed. - // It makes no difference where the new-line is within the input. - UChar32 c = inputBuf[fp->fInputIdx]; - if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { - // At a line end, except for the odd chance of being in the middle of a CR/LF sequence - // In multi-line mode, hitting a new-line just before the end of input does not - // set the hitEnd or requireEnd flags - if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { + + + case URX_DOLLAR_M: // $, test for End of line in multi-line mode + { + if (fp->fInputIdx >= fAnchorLimit) { + // We really are at the end of input. Success. + fHitEnd = TRUE; + fRequireEnd = TRUE; + break; + } + // If we are positioned just before a new-line, succeed. + // It makes no difference where the new-line is within the input. + UChar32 c = inputBuf[fp->fInputIdx]; + if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { + // At a line end, except for the odd chance of being in the middle of a CR/LF sequence + // In multi-line mode, hitting a new-line just before the end of input does not + // set the hitEnd or requireEnd flags + if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { break; - } - } - // not at a new line. Fail. - fp = (REStackFrame *)fStack->popFrame(fFrameSize); - } - break; - - - case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode - { - if (fp->fInputIdx >= fAnchorLimit) { - // We really are at the end of input. Success. - fHitEnd = TRUE; - fRequireEnd = TRUE; // Java set requireEnd in this case, even though - break; // adding a new-line would not lose the match. - } - // If we are not positioned just before a new-line, the test fails; backtrack out. - // It makes no difference where the new-line is within the input. - if (inputBuf[fp->fInputIdx] != 0x0a) { - fp = (REStackFrame *)fStack->popFrame(fFrameSize); - } - } - break; - - - case URX_CARET: // ^, test for start of line + } + } + // not at a new line. Fail. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + break; + + + case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode + { + if (fp->fInputIdx >= fAnchorLimit) { + // We really are at the end of input. Success. + fHitEnd = TRUE; + fRequireEnd = TRUE; // Java set requireEnd in this case, even though + break; // adding a new-line would not lose the match. + } + // If we are not positioned just before a new-line, the test fails; backtrack out. + // It makes no difference where the new-line is within the input. + if (inputBuf[fp->fInputIdx] != 0x0a) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + + case URX_CARET: // ^, test for start of line if (fp->fInputIdx != fAnchorStart) { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; - - - case URX_CARET_M: // ^, test for start of line in mulit-line mode - { - if (fp->fInputIdx == fAnchorStart) { - // We are at the start input. Success. - break; - } - // Check whether character just before the current pos is a new-line - // unless we are at the end of input - UChar c = inputBuf[fp->fInputIdx - 1]; - if ((fp->fInputIdx < fAnchorLimit) && - ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { - // It's a new-line. ^ is true. Success. - // TODO: what should be done with positions between a CR and LF? - break; - } - // Not at the start of a line. Fail. - fp = (REStackFrame *)fStack->popFrame(fFrameSize); - } - break; - - - case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode - { - U_ASSERT(fp->fInputIdx >= fAnchorStart); - if (fp->fInputIdx <= fAnchorStart) { - // We are at the start input. Success. - break; - } - // Check whether character just before the current pos is a new-line - U_ASSERT(fp->fInputIdx <= fAnchorLimit); - UChar c = inputBuf[fp->fInputIdx - 1]; - if (c != 0x0a) { - // Not at the start of a line. Back-track out. - fp = (REStackFrame *)fStack->popFrame(fFrameSize); - } - } - break; - + + + case URX_CARET_M: // ^, test for start of line in mulit-line mode + { + if (fp->fInputIdx == fAnchorStart) { + // We are at the start input. Success. + break; + } + // Check whether character just before the current pos is a new-line + // unless we are at the end of input + UChar c = inputBuf[fp->fInputIdx - 1]; + if ((fp->fInputIdx < fAnchorLimit) && + ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { + // It's a new-line. ^ is true. Success. + // TODO: what should be done with positions between a CR and LF? + break; + } + // Not at the start of a line. Fail. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + break; + + + case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode + { + U_ASSERT(fp->fInputIdx >= fAnchorStart); + if (fp->fInputIdx <= fAnchorStart) { + // We are at the start input. Success. + break; + } + // Check whether character just before the current pos is a new-line + U_ASSERT(fp->fInputIdx <= fAnchorLimit); + UChar c = inputBuf[fp->fInputIdx - 1]; + if (c != 0x0a) { + // Not at the start of a line. Back-track out. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + case URX_BACKSLASH_B: // Test for word boundaries { - UBool success = isWordBoundary(fp->fInputIdx); + UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx); success ^= (opValue != 0); // flip sense for \B if (!success) { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; - - + + case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style { UBool success = isUWordBoundary(fp->fInputIdx); @@ -1800,8 +4841,8 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { } } break; - - + + case URX_BACKSLASH_D: // Test for decimal digit { if (fp->fInputIdx >= fActiveLimit) { @@ -1809,112 +4850,111 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } - - UChar32 c = fInput->char32At(fp->fInputIdx); + + UChar32 c; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); success ^= (opValue != 0); // flip sense for \D - if (success) { - fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1); - } else { + if (!success) { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; - - + + case URX_BACKSLASH_G: // Test for position at end of previous match - if (!((fMatch && fp->fInputIdx==fMatchEnd) || fMatch==FALSE && fp->fInputIdx==fActiveStart)) { + if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; - - + + case URX_BACKSLASH_X: - // Match a Grapheme, as defined by Unicode TR 29. - // Differs slightly from Perl, which consumes combining marks independently - // of context. - { + // Match a Grapheme, as defined by Unicode TR 29. + // Differs slightly from Perl, which consumes combining marks independently + // of context. + { - // Fail if at end of input - if (fp->fInputIdx >= fActiveLimit) { - fHitEnd = TRUE; - fp = (REStackFrame *)fStack->popFrame(fFrameSize); - break; - } + // Fail if at end of input + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } - // Examine (and consume) the current char. - // Dispatch into a little state machine, based on the char. - UChar32 c; - U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); - UnicodeSet **sets = fPattern->fStaticSets; - if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; - if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; - if (sets[URX_GC_L]->contains(c)) goto GC_L; - if (sets[URX_GC_LV]->contains(c)) goto GC_V; - if (sets[URX_GC_LVT]->contains(c)) goto GC_T; - if (sets[URX_GC_V]->contains(c)) goto GC_V; - if (sets[URX_GC_T]->contains(c)) goto GC_T; - goto GC_Extend; + // Examine (and consume) the current char. + // Dispatch into a little state machine, based on the char. + UChar32 c; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + UnicodeSet **sets = fPattern->fStaticSets; + if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; + if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; + if (sets[URX_GC_L]->contains(c)) goto GC_L; + if (sets[URX_GC_LV]->contains(c)) goto GC_V; + if (sets[URX_GC_LVT]->contains(c)) goto GC_T; + if (sets[URX_GC_V]->contains(c)) goto GC_V; + if (sets[URX_GC_T]->contains(c)) goto GC_T; + goto GC_Extend; GC_L: - if (fp->fInputIdx >= fActiveLimit) goto GC_Done; - U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); - if (sets[URX_GC_L]->contains(c)) goto GC_L; - if (sets[URX_GC_LV]->contains(c)) goto GC_V; - if (sets[URX_GC_LVT]->contains(c)) goto GC_T; - if (sets[URX_GC_V]->contains(c)) goto GC_V; - U16_PREV(inputBuf, 0, fp->fInputIdx, c); - goto GC_Extend; + if (fp->fInputIdx >= fActiveLimit) goto GC_Done; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + if (sets[URX_GC_L]->contains(c)) goto GC_L; + if (sets[URX_GC_LV]->contains(c)) goto GC_V; + if (sets[URX_GC_LVT]->contains(c)) goto GC_T; + if (sets[URX_GC_V]->contains(c)) goto GC_V; + U16_PREV(inputBuf, 0, fp->fInputIdx, c); + goto GC_Extend; GC_V: - if (fp->fInputIdx >= fActiveLimit) goto GC_Done; - U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); - if (sets[URX_GC_V]->contains(c)) goto GC_V; - if (sets[URX_GC_T]->contains(c)) goto GC_T; - U16_PREV(inputBuf, 0, fp->fInputIdx, c); - goto GC_Extend; + if (fp->fInputIdx >= fActiveLimit) goto GC_Done; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + if (sets[URX_GC_V]->contains(c)) goto GC_V; + if (sets[URX_GC_T]->contains(c)) goto GC_T; + U16_PREV(inputBuf, 0, fp->fInputIdx, c); + goto GC_Extend; GC_T: - if (fp->fInputIdx >= fActiveLimit) goto GC_Done; - U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); - if (sets[URX_GC_T]->contains(c)) goto GC_T; - U16_PREV(inputBuf, 0, fp->fInputIdx, c); - goto GC_Extend; + if (fp->fInputIdx >= fActiveLimit) goto GC_Done; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + if (sets[URX_GC_T]->contains(c)) goto GC_T; + U16_PREV(inputBuf, 0, fp->fInputIdx, c); + goto GC_Extend; GC_Extend: - // Combining characters are consumed here - for (;;) { - if (fp->fInputIdx >= fActiveLimit) { - break; - } - U16_GET(inputBuf, 0, fp->fInputIdx, fActiveLimit, c); - if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { - break; - } - U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit); + // Combining characters are consumed here + for (;;) { + if (fp->fInputIdx >= fActiveLimit) { + break; } - goto GC_Done; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { + U16_BACK_1(inputBuf, 0, fp->fInputIdx); + break; + } + } + goto GC_Done; GC_Control: - // Most control chars stand alone (don't combine with combining chars), - // except for that CR/LF sequence is a single grapheme cluster. - if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) { - fp->fInputIdx++; - } + // Most control chars stand alone (don't combine with combining chars), + // except for that CR/LF sequence is a single grapheme cluster. + if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) { + fp->fInputIdx++; + } GC_Done: - if (fp->fInputIdx >= fActiveLimit) { - fHitEnd = TRUE; - } - break; + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; } + break; + } + + + - - - case URX_BACKSLASH_Z: // Test for end of Input if (fp->fInputIdx < fAnchorLimit) { fp = (REStackFrame *)fStack->popFrame(fFrameSize); @@ -1923,9 +4963,9 @@ GC_Done: fRequireEnd = TRUE; } break; - - - + + + case URX_STATIC_SETREF: { // Test input character against one of the predefined sets @@ -1938,11 +4978,12 @@ GC_Done: fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } - + UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); opValue &= ~URX_NEG_SET; U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); - UChar32 c; + + UChar32 c; U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); if (c < 256) { Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; @@ -1956,12 +4997,47 @@ GC_Done: } } if (!success) { + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + // Try to find it, backwards + int64_t reverseIndex = fp->fInputIdx; + U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried + success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset + do { + U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); + if (c < 256) { + Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; + if (s8->contains(c)) { + success = !success; + } + } else { + const UnicodeSet *s = fPattern->fStaticSets[opValue]; + if (s->contains(c)) { + success = !success; + } + } + } while (reverseIndex > backSearchIndex && !success); + + if (success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = reverseIndex; + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; - + case URX_STAT_SETREF_N: { // Test input character for NOT being a member of one of @@ -1971,8 +5047,9 @@ GC_Done: fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } - + U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); + UChar32 c; U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); if (c < 256) { @@ -1987,38 +5064,118 @@ GC_Done: } } + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + // Try to find it, backwards + int64_t reverseIndex = fp->fInputIdx; + U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried + UBool success = FALSE; + do { + U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); + if (c < 256) { + Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; + if (s8->contains(c) == FALSE) { + success = TRUE; + break; + } + } else { + const UnicodeSet *s = fPattern->fStaticSets[opValue]; + if (s->contains(c) == FALSE) { + success = TRUE; + break; + } + } + } while (reverseIndex > backSearchIndex); + + if (success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = reverseIndex; + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; - + case URX_SETREF: - if (fp->fInputIdx >= fActiveLimit) { - fHitEnd = TRUE; - fp = (REStackFrame *)fStack->popFrame(fFrameSize); - break; - } - // There is input left. Pick up one char and test it for set membership. - UChar32 c; - U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); - U_ASSERT(opValue > 0 && opValue < sets->size()); - if (c<256) { - Regex8BitSet *s8 = &fPattern->fSets8[opValue]; - if (s8->contains(c)) { + { + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } - } else { - UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); - if (s->contains(c)) { - // The character is in the set. A Match. - break; + + U_ASSERT(opValue > 0 && opValue < sets->size()); + + // There is input left. Pick up one char and test it for set membership. + UChar32 c; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + if (c<256) { + Regex8BitSet *s8 = &fPattern->fSets8[opValue]; + if (s8->contains(c)) { + // The character is in the set. A Match. + break; + } + } else { + UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); + if (s->contains(c)) { + // The character is in the set. A Match. + break; + } + } + + // the character wasn't in the set. + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + // Try to find it, backwards + int64_t reverseIndex = fp->fInputIdx; + U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried + UBool success = FALSE; + do { + U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); + if (c < 256) { + Regex8BitSet *s8 = &fPattern->fSets8[opValue]; + if (s8->contains(c)) { + success = TRUE; + break; + } + } else { + UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); + if (s->contains(c)) { + success = TRUE; + break; + } + } + } while (reverseIndex > backSearchIndex); + + if (success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = reverseIndex; + if (fp->fInputIdx > reverseIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); } - // the character wasn't in the set. Back track out. - fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; - - + + case URX_DOTANY: { // . matches anything, but stops at end-of-line. @@ -2028,43 +5185,44 @@ GC_Done: fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } + // There is input left. Advance over one char, unless we've hit end-of-line - UChar32 c; + UChar32 c; U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { // End of line in normal mode. . does not match. - fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } } break; - - + + case URX_DOTANY_ALL: { - // ., in dot-matches-all (including new lines) mode + // . in dot-matches-all (including new lines) mode if (fp->fInputIdx >= fActiveLimit) { // At end of input. Match failed. Backtrack out. fHitEnd = TRUE; fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } + // There is input left. Advance over one char, except if we are // at a cr/lf, advance over both of them. UChar32 c; U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); if (c==0x0d && fp->fInputIdx < fActiveLimit) { // In the case of a CR/LF, we need to advance over both. - UChar nextc = inputBuf[fp->fInputIdx]; - if (nextc == 0x0a) { - fp->fInputIdx++; + if (inputBuf[fp->fInputIdx] == 0x0a) { + U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit); } } } break; - - + + case URX_DOTANY_UNIX: { // '.' operator, matches all, but stops at end-of-line. @@ -2075,8 +5233,9 @@ GC_Done: fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } + // There is input left. Advance over one char, unless we've hit end-of-line - UChar32 c; + UChar32 c; U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); if (c == 0x0a) { // End of line in normal mode. '.' does not match the \n @@ -2084,22 +5243,22 @@ GC_Done: } } break; - - + + case URX_JMP: fp->fPatIdx = opValue; break; - + case URX_FAIL: isMatch = FALSE; goto breakFromLoop; - + case URX_JMP_SAV: U_ASSERT(opValue < fPattern->fCompiledPat->size()); fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current fp->fPatIdx = opValue; // Then JMP. break; - + case URX_JMP_SAV_X: // This opcode is used with (x)+, when x can match a zero length string. // Same as JMP_SAV, except conditional on the match having made forward progress. @@ -2107,11 +5266,11 @@ GC_Done: // data address of the input position at the start of the loop. { U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()); - int32_t stoOp = pat[opValue-1]; + int32_t stoOp = (int32_t)pat[opValue-1]; U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); int32_t frameLoc = URX_VAL(stoOp); U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); - int32_t prevInputIdx = fp->fExtra[frameLoc]; + int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc]; U_ASSERT(prevInputIdx <= fp->fInputIdx); if (prevInputIdx < fp->fInputIdx) { // The match did make progress. Repeat the loop. @@ -2123,23 +5282,23 @@ GC_Done: // execution will fall out of the loop. } break; - + case URX_CTR_INIT: { U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); fp->fExtra[opValue] = 0; // Set the loop counter variable to zero - + // Pick up the three extra operands that CTR_INIT has, and // skip the pattern location counter past - int32_t instrOperandLoc = fp->fPatIdx; + int32_t instrOperandLoc = (int32_t)fp->fPatIdx; fp->fPatIdx += 3; int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); - int32_t minCount = pat[instrOperandLoc+1]; - int32_t maxCount = pat[instrOperandLoc+2]; + int32_t minCount = (int32_t)pat[instrOperandLoc+1]; + int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; U_ASSERT(minCount>=0); U_ASSERT(maxCount>=minCount || maxCount==-1); U_ASSERT(loopLoc>fp->fPatIdx); - + if (minCount == 0) { fp = StateSave(fp, loopLoc+1, status); } @@ -2148,21 +5307,23 @@ GC_Done: } } break; - + case URX_CTR_LOOP: { U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); - int32_t initOp = pat[opValue]; + int32_t initOp = (int32_t)pat[opValue]; U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); - int32_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; - int32_t minCount = pat[opValue+2]; - int32_t maxCount = pat[opValue+3]; - // Increment the counter. Note: we're not worrying about counter + int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; + int32_t minCount = (int32_t)pat[opValue+2]; + int32_t maxCount = (int32_t)pat[opValue+3]; + // Increment the counter. Note: we DIDN'T worry about counter // overflow, since the data comes from UnicodeStrings, which - // stores its length in an int32_t. + // stores its length in an int32_t. Do we have to think about + // this now that we're using UText? Probably not, since the length + // in UChar32s is still an int32_t. (*pCounter)++; U_ASSERT(*pCounter > 0); - if ((uint32_t)*pCounter >= (uint32_t)maxCount) { + if ((uint64_t)*pCounter >= (uint32_t)maxCount) { U_ASSERT(*pCounter == maxCount || maxCount == -1); break; } @@ -2172,24 +5333,24 @@ GC_Done: fp->fPatIdx = opValue + 4; // Loop back. } break; - + case URX_CTR_INIT_NG: { // Initialize a non-greedy loop U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); fp->fExtra[opValue] = 0; // Set the loop counter variable to zero - + // Pick up the three extra operands that CTR_INIT has, and // skip the pattern location counter past - int32_t instrOperandLoc = fp->fPatIdx; + int32_t instrOperandLoc = (int32_t)fp->fPatIdx; fp->fPatIdx += 3; int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); - int32_t minCount = pat[instrOperandLoc+1]; - int32_t maxCount = pat[instrOperandLoc+2]; + int32_t minCount = (int32_t)pat[instrOperandLoc+1]; + int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; U_ASSERT(minCount>=0); U_ASSERT(maxCount>=minCount || maxCount==-1); U_ASSERT(loopLoc>fp->fPatIdx); - + if (minCount == 0) { if (maxCount != 0) { fp = StateSave(fp, fp->fPatIdx, status); @@ -2198,30 +5359,32 @@ GC_Done: } } break; - + case URX_CTR_LOOP_NG: { // Non-greedy {min, max} loops U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); - int32_t initOp = pat[opValue]; + int32_t initOp = (int32_t)pat[opValue]; U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); - int32_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; - int32_t minCount = pat[opValue+2]; - int32_t maxCount = pat[opValue+3]; - // Increment the counter. Note: we're not worrying about counter + int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; + int32_t minCount = (int32_t)pat[opValue+2]; + int32_t maxCount = (int32_t)pat[opValue+3]; + // Increment the counter. Note: we DIDN'T worry about counter // overflow, since the data comes from UnicodeStrings, which - // stores its length in an int32_t. + // stores its length in an int32_t. Do we have to think about + // this now that we're using UText? Probably not, since the length + // in UChar32s is still an int32_t. (*pCounter)++; U_ASSERT(*pCounter > 0); - - if ((uint32_t)*pCounter >= (uint32_t)maxCount) { + + if ((uint64_t)*pCounter >= (uint32_t)maxCount) { // The loop has matched the maximum permitted number of times. // Break out of here with no action. Matching will // continue with the following pattern. U_ASSERT(*pCounter == maxCount || maxCount == -1); break; } - + if (*pCounter < minCount) { // We haven't met the minimum number of matches yet. // Loop back for another one. @@ -2235,38 +5398,38 @@ GC_Done: } } break; - + case URX_STO_SP: U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); fData[opValue] = fStack->size(); break; - + case URX_LD_SP: { U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); - int32_t newStackSize = fData[opValue]; + int32_t newStackSize = (int32_t)fData[opValue]; U_ASSERT(newStackSize <= fStack->size()); - int32_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; - if (newFP == (int32_t *)fp) { + int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; + if (newFP == (int64_t *)fp) { break; } int32_t i; for (i=0; isetSize(newStackSize); } break; - + case URX_BACKREF: case URX_BACKREF_I: { U_ASSERT(opValue < fFrameSize); - int32_t groupStartIdx = fp->fExtra[opValue]; - int32_t groupEndIdx = fp->fExtra[opValue+1]; + int64_t groupStartIdx = fp->fExtra[opValue]; + int64_t groupEndIdx = fp->fExtra[opValue+1]; U_ASSERT(groupStartIdx <= groupEndIdx); - int32_t len = groupEndIdx-groupStartIdx; + int64_t len = groupEndIdx-groupStartIdx; if (groupStartIdx < 0) { // This capture group has not participated in the match thus far, fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. @@ -2282,12 +5445,12 @@ GC_Done: UBool haveMatch = FALSE; if (fp->fInputIdx + len <= fActiveLimit) { if (opType == URX_BACKREF) { - if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, len) == 0) { + if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, (int32_t)len) == 0) { haveMatch = TRUE; } } else { if (u_strncasecmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, - len, U_FOLD_CASE_DEFAULT) == 0) { + (int32_t)len, U_FOLD_CASE_DEFAULT) == 0) { haveMatch = TRUE; } } @@ -2303,30 +5466,30 @@ GC_Done: } } break; - + case URX_STO_INP_LOC: { U_ASSERT(opValue >= 0 && opValue < fFrameSize); fp->fExtra[opValue] = fp->fInputIdx; } break; - + case URX_JMPX: { - int32_t instrOperandLoc = fp->fPatIdx; + int32_t instrOperandLoc = (int32_t)fp->fPatIdx; fp->fPatIdx += 1; int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); - int32_t savedInputIdx = fp->fExtra[dataLoc]; + int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc]; U_ASSERT(savedInputIdx <= fp->fInputIdx); if (savedInputIdx < fp->fInputIdx) { fp->fPatIdx = opValue; // JMP } else { - fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop. } } break; - + case URX_LA_START: { // Entering a lookahead block. @@ -2338,39 +5501,39 @@ GC_Done: fActiveLimit = fLookLimit; // transparent bounds. } break; - + case URX_LA_END: { // Leaving a look-ahead block. // restore Stack Ptr, Input Pos to positions they had on entry to block. U_ASSERT(opValue>=0 && opValue+1fDataSize); int32_t stackSize = fStack->size(); - int32_t newStackSize = fData[opValue]; + int32_t newStackSize = (int32_t)fData[opValue]; U_ASSERT(stackSize >= newStackSize); if (stackSize > newStackSize) { // Copy the current top frame back to the new (cut back) top frame. // This makes the capture groups from within the look-ahead // expression available. - int32_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; + int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; int32_t i; for (i=0; isetSize(newStackSize); } fp->fInputIdx = fData[opValue+1]; - + // Restore the active region bounds in the input string; they may have // been changed because of transparent bounds on a Region. fActiveStart = fRegionStart; fActiveLimit = fRegionLimit; } break; - + case URX_ONECHAR_I: if (fp->fInputIdx < fActiveLimit) { - UChar32 c; + UChar32 c; U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { break; @@ -2378,41 +5541,151 @@ GC_Done: } else { fHitEnd = TRUE; } + + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + UBool success = FALSE; + int64_t reverseIndex = fp->fInputIdx; + UChar32 c; + while (reverseIndex > backSearchIndex) { + U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); + if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { + success = TRUE; + break; + } else if (c == U_SENTINEL) { + break; + } + } + if (success) { + fHitEnd = FALSE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = reverseIndex; + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; - + case URX_STRING_I: { // Test input against a literal string. // Strings require two slots in the compiled pattern, one for the // offset to the string text, and one for the length. - int32_t stringStartIdx, stringLen; - stringStartIdx = opValue; - - op = pat[fp->fPatIdx]; - fp->fPatIdx++; - opType = URX_TYPE(op); - opValue = URX_VAL(op); - U_ASSERT(opType == URX_STRING_LEN); - stringLen = opValue; - - int32_t stringEndIndex = fp->fInputIdx + stringLen; - if (stringEndIndex <= fActiveLimit) { - if (u_strncasecmp(inputBuf+fp->fInputIdx, litText+stringStartIdx, - stringLen, U_FOLD_CASE_DEFAULT) == 0) { - // Success. Advance the current input position. - fp->fInputIdx = stringEndIndex; - break; + const UCaseProps *csp = ucase_getSingleton(); + { + int32_t stringStartIdx, stringLen; + stringStartIdx = opValue; + + op = (int32_t)pat[fp->fPatIdx]; + fp->fPatIdx++; + opType = URX_TYPE(op); + opValue = URX_VAL(op); + U_ASSERT(opType == URX_STRING_LEN); + stringLen = opValue; + + const UChar *patternChars = litText+stringStartIdx; + const UChar *patternEnd = patternChars+stringLen; + + const UChar *foldChars = NULL; + int32_t foldOffset, foldLength; + UChar32 c; + + #ifdef REGEX_SMART_BACKTRACKING + int32_t originalInputIdx = fp->fInputIdx; + #endif + UBool success = TRUE; + + foldOffset = foldLength = 0; + + while (patternChars < patternEnd && success) { + if(foldOffset < foldLength) { + U16_NEXT_UNSAFE(foldChars, foldOffset, c); + } else { + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT); + if(foldLength >= 0) { + if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings + foldOffset = 0; + U16_NEXT_UNSAFE(foldChars, foldOffset, c); + } else { + c = foldLength; + foldLength = foldOffset; // to avoid reading chars from the folding buffer + } + } + } + + if (fp->fInputIdx <= fActiveLimit) { + if (U_IS_BMP(c)) { + success = (*patternChars == c); + patternChars += 1; + } else if (patternChars+1 < patternEnd) { + success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c)); + patternChars += 2; + } + } else { + success = FALSE; + fHitEnd = TRUE; // TODO: See ticket 6074 + } + } + + if (!success) { + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size()) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + // Reset to last start point + int64_t reverseIndex = originalInputIdx; + patternChars = litText+stringStartIdx; + + // Search backwards for a possible start + do { + U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); + foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT); + if(foldLength >= 0) { + if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings + foldOffset = 0; + U16_NEXT_UNSAFE(foldChars, foldOffset, c); + } else { + c = foldLength; + foldLength = foldOffset; // to avoid reading chars from the folding buffer + } + } + + if ((U_IS_BMP(c) && *patternChars == c) || + (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) { + success = TRUE; + break; + } + } while (reverseIndex > backSearchIndex); + + // And try again + if (success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = reverseIndex; + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); } - } else { - // Insufficent input left for a match. - fHitEnd = TRUE; // See ticket 6074 } - // No match. Back up matching to a saved state - fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; - + case URX_LB_START: { // Entering a look-behind block. @@ -2429,23 +5702,23 @@ GC_Done: fActiveLimit = fp->fInputIdx; } break; - - + + case URX_LB_CONT: { // Positive Look-Behind, at top of loop checking for matches of LB expression // at all possible input starting positions. - + // Fetch the min and max possible match lengths. They are the operands // of this op in the pattern. - int32_t minML = pat[fp->fPatIdx++]; - int32_t maxML = pat[fp->fPatIdx++]; + int32_t minML = (int32_t)pat[fp->fPatIdx++]; + int32_t maxML = (int32_t)pat[fp->fPatIdx++]; U_ASSERT(minML <= maxML); U_ASSERT(minML >= 0); - + // Fetch (from data) the last input index where a match was attempted. U_ASSERT(opValue>=0 && opValue+1fDataSize); - int32_t *lbStartIdx = &fData[opValue+2]; + int64_t *lbStartIdx = &fData[opValue+2]; if (*lbStartIdx < 0) { // First time through loop. *lbStartIdx = fp->fInputIdx - minML; @@ -2453,31 +5726,31 @@ GC_Done: // 2nd through nth time through the loop. // Back up start position for match by one. if (*lbStartIdx == 0) { - (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0. + (*lbStartIdx)--; } else { U16_BACK_1(inputBuf, 0, *lbStartIdx); } } - + if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { // We have tried all potential match starting points without // getting a match. Backtrack out, and out of the // Look Behind altogether. fp = (REStackFrame *)fStack->popFrame(fFrameSize); - int32_t restoreInputLen = fData[opValue+3]; + int64_t restoreInputLen = fData[opValue+3]; U_ASSERT(restoreInputLen >= fActiveLimit); - U_ASSERT(restoreInputLen <= fInput->length()); + U_ASSERT(restoreInputLen <= fInputLength); fActiveLimit = restoreInputLen; break; } - + // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. // (successful match will fall off the end of the loop.) fp = StateSave(fp, fp->fPatIdx-3, status); fp->fInputIdx = *lbStartIdx; } break; - + case URX_LB_END: // End of a look-behind block, after a successful match. { @@ -2491,35 +5764,35 @@ GC_Done: fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } - + // Look-behind match is good. Restore the orignal input string length, // which had been truncated to pin the end of the lookbehind match to the // position being looked-behind. - int32_t originalInputLen = fData[opValue+3]; + int64_t originalInputLen = fData[opValue+3]; U_ASSERT(originalInputLen >= fActiveLimit); - U_ASSERT(originalInputLen <= fInput->length()); + U_ASSERT(originalInputLen <= fInputLength); fActiveLimit = originalInputLen; } break; - - + + case URX_LBN_CONT: { // Negative Look-Behind, at top of loop checking for matches of LB expression // at all possible input starting positions. - + // Fetch the extra parameters of this op. - int32_t minML = pat[fp->fPatIdx++]; - int32_t maxML = pat[fp->fPatIdx++]; - int32_t continueLoc = pat[fp->fPatIdx++]; - continueLoc = URX_VAL(continueLoc); + int32_t minML = (int32_t)pat[fp->fPatIdx++]; + int32_t maxML = (int32_t)pat[fp->fPatIdx++]; + int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; + continueLoc = URX_VAL(continueLoc); U_ASSERT(minML <= maxML); U_ASSERT(minML >= 0); U_ASSERT(continueLoc > fp->fPatIdx); - + // Fetch (from data) the last input index where a match was attempted. U_ASSERT(opValue>=0 && opValue+1fDataSize); - int32_t *lbStartIdx = &fData[opValue+2]; + int64_t *lbStartIdx = &fData[opValue+2]; if (*lbStartIdx < 0) { // First time through loop. *lbStartIdx = fp->fInputIdx - minML; @@ -2532,26 +5805,26 @@ GC_Done: U16_BACK_1(inputBuf, 0, *lbStartIdx); } } - + if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { // We have tried all potential match starting points without // getting a match, which means that the negative lookbehind as // a whole has succeeded. Jump forward to the continue location - int32_t restoreInputLen = fData[opValue+3]; + int64_t restoreInputLen = fData[opValue+3]; U_ASSERT(restoreInputLen >= fActiveLimit); - U_ASSERT(restoreInputLen <= fInput->length()); + U_ASSERT(restoreInputLen <= fInputLength); fActiveLimit = restoreInputLen; fp->fPatIdx = continueLoc; break; } - + // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. // (successful match will cause a FAIL out of the loop altogether.) fp = StateSave(fp, fp->fPatIdx-4, status); fp->fInputIdx = *lbStartIdx; } break; - + case URX_LBN_END: // End of a negative look-behind block, after a successful match. { @@ -2565,22 +5838,22 @@ GC_Done: fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } - + // Look-behind expression matched, which means look-behind test as // a whole Fails // Restore the orignal input string length, which had been truncated // inorder to pin the end of the lookbehind match // to the position being looked-behind. - int32_t originalInputLen = fData[opValue+3]; + int64_t originalInputLen = fData[opValue+3]; U_ASSERT(originalInputLen >= fActiveLimit); - U_ASSERT(originalInputLen <= fInput->length()); + U_ASSERT(originalInputLen <= fInputLength); fActiveLimit = originalInputLen; - + // Restore original stack position, discarding any state saved // by the successful pattern match. U_ASSERT(opValue>=0 && opValue+1fDataSize); - int32_t newStackSize = fData[opValue]; + int32_t newStackSize = (int32_t)fData[opValue]; U_ASSERT(fStack->size() > newStackSize); fStack->setSize(newStackSize); @@ -2589,8 +5862,8 @@ GC_Done: fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; - - + + case URX_LOOP_SR_I: // Loop Initialization for the optimized implementation of // [some character set]* @@ -2600,10 +5873,10 @@ GC_Done: U_ASSERT(opValue > 0 && opValue < sets->size()); Regex8BitSet *s8 = &fPattern->fSets8[opValue]; UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); - + // Loop through input, until either the input is exhausted or // we reach a character that is not a member of the set. - int32_t ix = fp->fInputIdx; + int32_t ix = (int32_t)fp->fInputIdx; for (;;) { if (ix >= fActiveLimit) { fHitEnd = TRUE; @@ -2623,24 +5896,27 @@ GC_Done: } } } - + // If there were no matching characters, skip over the loop altogether. // The loop doesn't run at all, a * op always succeeds. if (ix == fp->fInputIdx) { fp->fPatIdx++; // skip the URX_LOOP_C op. break; } - + // Peek ahead in the compiled pattern, to the URX_LOOP_C that // must follow. It's operand is the stack location // that holds the starting input index for the match of this [set]* - int32_t loopcOp = pat[fp->fPatIdx]; + int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); int32_t stackLoc = URX_VAL(loopcOp); U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); fp->fExtra[stackLoc] = fp->fInputIdx; + #ifdef REGEX_SMART_BACKTRACKING + backSearchIndex = fp->fInputIdx; + #endif fp->fInputIdx = ix; - + // Save State to the URX_LOOP_C op that follows this one, // so that match failures in the following code will return to there. // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. @@ -2648,8 +5924,8 @@ GC_Done: fp->fPatIdx++; } break; - - + + case URX_LOOP_DOT_I: // Loop Initialization for the optimized implementation of .* // This op scans through all remaining input. @@ -2660,24 +5936,23 @@ GC_Done: int32_t ix; if ((opValue & 1) == 1) { // Dot-matches-All mode. Jump straight to the end of the string. - ix = fActiveLimit; + ix = (int32_t)fActiveLimit; fHitEnd = TRUE; } else { // NOT DOT ALL mode. Line endings do not match '.' // Scan forward until a line ending or end of input. - ix = fp->fInputIdx; + ix = (int32_t)fp->fInputIdx; for (;;) { if (ix >= fActiveLimit) { fHitEnd = TRUE; - ix = fActiveLimit; break; } UChar32 c; U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++] - if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s - if ((c == 0x0a) || // 0x0a is newline in both modes. - ((opValue & 2) == 0) && // IF not UNIX_LINES mode - (c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029) { + if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s + if ((c == 0x0a) || // 0x0a is newline in both modes. + (((opValue & 2) == 0) && // IF not UNIX_LINES mode + ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) { // char is a line ending. Put the input pos back to the // line ending char, and exit the scanning loop. U16_BACK_1(inputBuf, 0, ix); @@ -2686,24 +5961,27 @@ GC_Done: } } } - + // If there were no matching characters, skip over the loop altogether. // The loop doesn't run at all, a * op always succeeds. if (ix == fp->fInputIdx) { fp->fPatIdx++; // skip the URX_LOOP_C op. break; } - + // Peek ahead in the compiled pattern, to the URX_LOOP_C that // must follow. It's operand is the stack location // that holds the starting input index for the match of this .* - int32_t loopcOp = pat[fp->fPatIdx]; + int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); int32_t stackLoc = URX_VAL(loopcOp); U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); fp->fExtra[stackLoc] = fp->fInputIdx; + #ifdef REGEX_SMART_BACKTRACKING + backSearchIndex = fp->fInputIdx; + #endif fp->fInputIdx = ix; - + // Save State to the URX_LOOP_C op that follows this one, // so that match failures in the following code will return to there. // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. @@ -2711,14 +5989,14 @@ GC_Done: fp->fPatIdx++; } break; - - + + case URX_LOOP_C: { U_ASSERT(opValue>=0 && opValuefExtra[opValue]; - U_ASSERT(terminalIdx <= fp->fInputIdx); - if (terminalIdx == fp->fInputIdx) { + backSearchIndex = (int32_t)fp->fExtra[opValue]; + U_ASSERT(backSearchIndex <= fp->fInputIdx); + if (backSearchIndex == fp->fInputIdx) { // We've backed up the input idx to the point that the loop started. // The loop is done. Leave here without saving state. // Subsequent failures won't come back here. @@ -2730,30 +6008,32 @@ GC_Done: // (We're going backwards because this loop emulates stack unwinding, not // the initial scan forward.) U_ASSERT(fp->fInputIdx > 0); - U16_BACK_1(inputBuf, 0, fp->fInputIdx); - if (inputBuf[fp->fInputIdx] == 0x0a && - fp->fInputIdx > terminalIdx && + UChar32 prevC; + U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit? + + if (prevC == 0x0a && + fp->fInputIdx > backSearchIndex && inputBuf[fp->fInputIdx-1] == 0x0d) { - int32_t prevOp = pat[fp->fPatIdx-2]; + int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { // .*, stepping back over CRLF pair. - fp->fInputIdx--; + U16_BACK_1(inputBuf, 0, fp->fInputIdx); } } - - + + fp = StateSave(fp, fp->fPatIdx-1, status); } break; - - - + + + default: // Trouble. The compiled pattern contains an entry with an // unrecognized type tag. U_ASSERT(FALSE); } - + if (U_FAILURE(status)) { isMatch = FALSE; break; @@ -2776,16 +6056,15 @@ breakFromLoop: REGEX_RUN_DEBUG_PRINTF(("No match\n\n")); } } - + fFrame = fp; // The active stack frame when the engine stopped. - // Contains the capture group results that we need to - // access later. + // Contains the capture group results that we need to + // access later. return; } - UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher) U_NAMESPACE_END