X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/b75a7d8f3b4adbae880cab104ce2c6a50eee4db2..9953cfe31e2dce81abce5bae29a7d5001cb9c635:/icuSources/i18n/rematch.cpp diff --git a/icuSources/i18n/rematch.cpp b/icuSources/i18n/rematch.cpp index 7fc42a2b..dda8a1ec 100644 --- a/icuSources/i18n/rematch.cpp +++ b/icuSources/i18n/rematch.cpp @@ -1,15 +1,15 @@ -// -// file: rematch.cpp -// -// Contains the implementation of class RegexMatcher, -// which is one of the main API classes for the ICU regular expression package. -// /* ************************************************************************** -* Copyright (C) 2002-2003 International Business Machines Corporation * +* Copyright (C) 2002-2010 International Business Machines Corporation * * and others. All rights reserved. * ************************************************************************** */ +// +// file: rematch.cpp +// +// Contains the implementation of class RegexMatcher, +// which is one of the main API classes for the ICU regular expression package. +// #include "unicode/utypes.h" #if !UCONFIG_NO_REGULAR_EXPRESSIONS @@ -18,96 +18,152 @@ #include "unicode/uniset.h" #include "unicode/uchar.h" #include "unicode/ustring.h" +#include "unicode/rbbi.h" #include "uassert.h" #include "cmemory.h" #include "uvector.h" #include "uvectr32.h" +#include "uvectr64.h" #include "regeximp.h" #include "regexst.h" +#include "regextxt.h" +#include "ucase.h" // #include // Needed for heapcheck testing + +// Find progress callback +// ---------------------- +// Macro to inline test & call to ReportFindProgress(). Eliminates unnecessary function call. +// +#define REGEXFINDPROGRESS_INTERRUPT(pos, status) \ + (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE) + + +// Smart Backtracking +// ------------------ +// When a failure would go back to a LOOP_C instruction, +// strings, characters, and setrefs scan backwards for a valid start +// character themselves, pop the stack, and save state, emulating the +// LOOP_C's effect but assured that the next character of input is a +// possible matching character. +// +// Good idea in theory; unfortunately it only helps out a few specific +// cases and slows the engine down a little in the rest. + +//#define REGEX_SMART_BACKTRACKING 1 + U_NAMESPACE_BEGIN +// Default limit for the size of the back track stack, to avoid system +// failures causedby heap exhaustion. Units are in 32 bit words, not bytes. +// This value puts ICU's limits higher than most other regexp implementations, +// which use recursion rather than the heap, and take more storage per +// backtrack point. +// +static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000; + +// Time limit counter constant. +// Time limits for expression evaluation are in terms of quanta of work by +// the engine, each of which is 10,000 state saves. +// This constant determines that state saves per tick number. +static const int32_t TIMER_INITIAL_VALUE = 10000; + //----------------------------------------------------------------------------- // // Constructor and Destructor // //----------------------------------------------------------------------------- RegexMatcher::RegexMatcher(const RegexPattern *pat) { - fPattern = pat; - fPatternOwned = NULL; - fInput = NULL; - fTraceDebug = FALSE; - fDeferredStatus = U_ZERO_ERROR; - fStack = new UVector32(fDeferredStatus); - fData = fSmallData; + fDeferredStatus = U_ZERO_ERROR; + init(fDeferredStatus); + if (U_FAILURE(fDeferredStatus)) { + return; + } if (pat==NULL) { fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR; return; } - if (pat->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) { - fData = (int32_t *)uprv_malloc(pat->fDataSize * sizeof(int32_t)); - } - if (fStack == NULL || fData == NULL) { - fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; - } - - reset(*RegexStaticSets::gStaticSets->fEmptyString); + fPattern = pat; + init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus); } RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &input, uint32_t flags, UErrorCode &status) { + init(status); + if (U_FAILURE(status)) { + return; + } UParseError pe; fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); fPattern = fPatternOwned; - fTraceDebug = FALSE; - fDeferredStatus = U_ZERO_ERROR; - fStack = new UVector32(status); - fData = fSmallData; + + UText inputText = UTEXT_INITIALIZER; + utext_openConstUnicodeString(&inputText, &input, &status); + init2(&inputText, status); + utext_close(&inputText); + + fInputUniStrMaybeMutable = TRUE; +} + + +RegexMatcher::RegexMatcher(UText *regexp, UText *input, + uint32_t flags, UErrorCode &status) { + init(status); if (U_FAILURE(status)) { return; } - if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) { - fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t)); - } - if (fStack == NULL || fData == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; + UParseError pe; + fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); + if (U_FAILURE(status)) { + return; } - reset(input); + + fPattern = fPatternOwned; + init2(input, status); } RegexMatcher::RegexMatcher(const UnicodeString ®exp, uint32_t flags, UErrorCode &status) { + init(status); + if (U_FAILURE(status)) { + return; + } UParseError pe; - fTraceDebug = FALSE; - fDeferredStatus = U_ZERO_ERROR; - fStack = new UVector32(status); - fData = fSmallData; fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); - fPattern = fPatternOwned; if (U_FAILURE(status)) { return; } + fPattern = fPatternOwned; + init2(RegexStaticSets::gStaticSets->fEmptyText, status); +} - if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) { - fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t)); +RegexMatcher::RegexMatcher(UText *regexp, + uint32_t flags, UErrorCode &status) { + init(status); + if (U_FAILURE(status)) { + return; } - if (fStack == NULL || fData == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; + UParseError pe; + fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); + if (U_FAILURE(status)) { + return; } - reset(*RegexStaticSets::gStaticSets->fEmptyString); + + fPattern = fPatternOwned; + init2(RegexStaticSets::gStaticSets->fEmptyText, status); } + RegexMatcher::~RegexMatcher() { delete fStack; if (fData != fSmallData) { - delete fData; + uprv_free(fData); fData = NULL; } if (fPatternOwned) { @@ -115,8 +171,102 @@ RegexMatcher::~RegexMatcher() { fPatternOwned = NULL; fPattern = NULL; } + + if (fInput) { + delete fInput; + } + if (fInputText) { + utext_close(fInputText); + } + if (fAltInputText) { + utext_close(fAltInputText); + } + + #if UCONFIG_NO_BREAK_ITERATION==0 + delete fWordBreakItr; + #endif +} + +// +// init() common initialization for use by all constructors. +// Initialize all fields, get the object into a consistent state. +// This must be done even when the initial status shows an error, +// so that the object is initialized sufficiently well for the destructor +// to run safely. +// +void RegexMatcher::init(UErrorCode &status) { + fPattern = NULL; + fPatternOwned = NULL; + fFrameSize = 0; + fRegionStart = 0; + fRegionLimit = 0; + fAnchorStart = 0; + fAnchorLimit = 0; + fLookStart = 0; + fLookLimit = 0; + fActiveStart = 0; + fActiveLimit = 0; + fTransparentBounds = FALSE; + fAnchoringBounds = TRUE; + fMatch = FALSE; + fMatchStart = 0; + fMatchEnd = 0; + fLastMatchEnd = -1; + fAppendPosition = 0; + fHitEnd = FALSE; + fRequireEnd = FALSE; + fStack = NULL; + fFrame = NULL; + fTimeLimit = 0; + fTime = 0; + fTickCounter = 0; + fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY; + fCallbackFn = NULL; + fCallbackContext = NULL; + fFindProgressCallbackFn = NULL; + fFindProgressCallbackContext = NULL; + fTraceDebug = FALSE; + fDeferredStatus = status; + fData = fSmallData; + fWordBreakItr = NULL; + + fStack = new UVector64(status); + fInputText = NULL; + fAltInputText = NULL; + fInput = NULL; + fInputLength = 0; + fInputUniStrMaybeMutable = FALSE; + + if (U_FAILURE(status)) { + fDeferredStatus = status; + } } +// +// init2() Common initialization for use by RegexMatcher constructors, part 2. +// This handles the common setup to be done after the Pattern is available. +// +void RegexMatcher::init2(UText *input, UErrorCode &status) { + if (U_FAILURE(status)) { + fDeferredStatus = status; + return; + } + + if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) { + fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t)); + if (fData == NULL) { + status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; + return; + } + } + + reset(input); + setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status); + if (U_FAILURE(status)) { + fDeferredStatus = status; + return; + } +} static const UChar BACKSLASH = 0x5c; @@ -129,6 +279,29 @@ static const UChar DOLLARSIGN = 0x24; RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest, const UnicodeString &replacement, UErrorCode &status) { + UText replacementText = UTEXT_INITIALIZER; + + utext_openConstUnicodeString(&replacementText, &replacement, &status); + if (U_SUCCESS(status)) { + UText resultText = UTEXT_INITIALIZER; + utext_openUnicodeString(&resultText, &dest, &status); + + if (U_SUCCESS(status)) { + appendReplacement(&resultText, &replacementText, status); + utext_close(&resultText); + } + utext_close(&replacementText); + } + + return *this; +} + +// +// appendReplacement, UText mode +// +RegexMatcher &RegexMatcher::appendReplacement(UText *dest, + UText *replacement, + UErrorCode &status) { if (U_FAILURE(status)) { return *this; } @@ -140,97 +313,151 @@ RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest, status = U_REGEX_INVALID_STATE; return *this; } - + // Copy input string from the end of previous match to start of current match - int32_t len = fMatchStart-fLastMatchEnd; - if (len > 0) { - dest.append(*fInput, fLastMatchEnd, len); + int64_t destLen = utext_nativeLength(dest); + if (fMatchStart > fAppendPosition) { + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { + destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, + (int32_t)(fMatchStart-fAppendPosition), &status); + } else { + int32_t len16; + if (UTEXT_USES_U16(fInputText)) { + len16 = (int32_t)(fMatchStart-fAppendPosition); + } else { + UErrorCode lengthStatus = U_ZERO_ERROR; + len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus); + } + UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); + if (inputChars == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return *this; + } + utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status); + destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status); + uprv_free(inputChars); + } } + fAppendPosition = fMatchEnd; + - // scan the replacement text, looking for substitutions ($n) and \escapes. // TODO: optimize this loop by efficiently scanning for '$' or '\', // move entire ranges not containing substitutions. - int32_t replLen = replacement.length(); - int32_t replIdx = 0; - while (replIdx= replLen) { + c = UTEXT_CURRENT32(replacement); + if (c == U_SENTINEL) { break; } - c = replacement.charAt(replIdx); - + if (c==0x55/*U*/ || c==0x75/*u*/) { // We have a \udddd or \Udddddddd escape sequence. - UChar32 escapedChar = replacement.unescapeAt(replIdx); + int32_t offset = 0; + struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement); + UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context); if (escapedChar != (UChar32)0xFFFFFFFF) { - dest.append(escapedChar); - replIdx += (c==0x55? 9: 5); + if (U_IS_BMP(escapedChar)) { + UChar c16 = (UChar)escapedChar; + destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); + } else { + UChar surrogate[2]; + surrogate[0] = U16_LEAD(escapedChar); + surrogate[1] = U16_TRAIL(escapedChar); + if (U_SUCCESS(status)) { + destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); + } + } // TODO: Report errors for mal-formed \u escapes? // As this is, the original sequence is output, which may be OK. - continue; + if (context.lastOffset == offset) { + UTEXT_PREVIOUS32(replacement); + } else if (context.lastOffset != offset-1) { + utext_moveIndex32(replacement, offset - context.lastOffset - 1); + } + } + } else { + UTEXT_NEXT32(replacement); + // Plain backslash escape. Just put out the escaped character. + if (U_IS_BMP(c)) { + UChar c16 = (UChar)c; + destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); + } else { + UChar surrogate[2]; + surrogate[0] = U16_LEAD(c); + surrogate[1] = U16_TRAIL(c); + if (U_SUCCESS(status)) { + destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); + } } } - - // Plain backslash escape. Just put out the escaped character. - dest.append(c); - replIdx++; - continue; - } - - if (c != DOLLARSIGN) { + } else if (c != DOLLARSIGN) { // Normal char, not a $. Copy it out without further checks. - dest.append(c); - continue; - } - - // We've got a $. Pick up a capture group number if one follows. - // Consume at most the number of digits necessary for the largest capture - // number that is valid for this pattern. - - int32_t numDigits = 0; - int32_t groupNum = 0; - UChar32 digitC; - for (;;) { - if (replIdx >= replLen) { - break; + if (U_IS_BMP(c)) { + UChar c16 = (UChar)c; + destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); + } else { + UChar surrogate[2]; + surrogate[0] = U16_LEAD(c); + surrogate[1] = U16_TRAIL(c); + if (U_SUCCESS(status)) { + destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); + } } - digitC = replacement.char32At(replIdx); - if (u_isdigit(digitC) == FALSE) { - break; + } else { + // We've got a $. Pick up a capture group number if one follows. + // Consume at most the number of digits necessary for the largest capture + // number that is valid for this pattern. + + int32_t numDigits = 0; + int32_t groupNum = 0; + UChar32 digitC; + for (;;) { + digitC = UTEXT_CURRENT32(replacement); + if (digitC == U_SENTINEL) { + break; + } + if (u_isdigit(digitC) == FALSE) { + break; + } + UTEXT_NEXT32(replacement); + groupNum=groupNum*10 + u_charDigitValue(digitC); + numDigits++; + if (numDigits >= fPattern->fMaxCaptureDigits) { + break; + } } - replIdx = replacement.moveIndex32(replIdx, 1); - groupNum=groupNum*10 + u_charDigitValue(digitC); - numDigits++; - if (numDigits >= fPattern->fMaxCaptureDigits) { - break; + + + if (numDigits == 0) { + // The $ didn't introduce a group number at all. + // Treat it as just part of the substitution text. + UChar c16 = DOLLARSIGN; + destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); + } else { + // Finally, append the capture group data to the destination. + destLen += appendGroup(groupNum, dest, status); + if (U_FAILURE(status)) { + // Can fail if group number is out of range. + break; + } } } - - - if (numDigits == 0) { - // The $ didn't introduce a group number at all. - // Treat it as just part of the substitution text. - dest.append(DOLLARSIGN); - continue; - } - - // Finally, append the capture group data to the destination. - dest.append(group(groupNum, status)); + if (U_FAILURE(status)) { - // Can fail if group number is out of range. break; + } else { + c = UTEXT_NEXT32(replacement); } - } - + return *this; } @@ -242,11 +469,67 @@ RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest, // To the destination string, append everything following // the last match position from the input string. // +// Note: Match ranges do not affect appendTail or appendReplacement +// //-------------------------------------------------------------------------------- UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) { - int32_t len = fInput->length()-fMatchEnd; - if (len > 0) { - dest.append(*fInput, fMatchEnd, len); + UErrorCode status = U_ZERO_ERROR; + UText resultText = UTEXT_INITIALIZER; + utext_openUnicodeString(&resultText, &dest, &status); + + if (U_SUCCESS(status)) { + appendTail(&resultText, status); + utext_close(&resultText); + } + + return dest; +} + +// +// appendTail, UText mode +// +UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) { + UBool bailOut = FALSE; + if (U_FAILURE(status)) { + bailOut = TRUE; + } + if (U_FAILURE(fDeferredStatus)) { + status = fDeferredStatus; + bailOut = TRUE; + } + + if (bailOut) { + // dest must not be NULL + if (dest) { + utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(dest), NULL, 0, &status); + return dest; + } + } + + if (fInputLength > fAppendPosition) { + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { + int64_t destLen = utext_nativeLength(dest); + utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, + (int32_t)(fInputLength-fAppendPosition), &status); + } else { + int32_t len16; + if (UTEXT_USES_U16(fInputText)) { + len16 = (int32_t)(fInputLength-fAppendPosition); + } else { + len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status); + status = U_ZERO_ERROR; // buffer overflow + } + + UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16)); + if (inputChars == NULL) { + fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; + } else { + utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated + int64_t destLen = utext_nativeLength(dest); + utext_replace(dest, destLen, destLen, inputChars, len16, &status); + uprv_free(inputChars); + } + } } return dest; } @@ -262,9 +545,11 @@ int32_t RegexMatcher::end(UErrorCode &err) const { return end(0, err); } +int64_t RegexMatcher::end64(UErrorCode &err) const { + return end64(0, err); +} - -int32_t RegexMatcher::end(int group, UErrorCode &err) const { +int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const { if (U_FAILURE(err)) { return -1; } @@ -276,7 +561,7 @@ int32_t RegexMatcher::end(int group, UErrorCode &err) const { err = U_INDEX_OUTOFBOUNDS_ERROR; return -1; } - int32_t e = -1; + int64_t e = -1; if (group == 0) { e = fMatchEnd; } else { @@ -287,9 +572,13 @@ int32_t RegexMatcher::end(int group, UErrorCode &err) const { U_ASSERT(groupOffset >= 0); e = fFrame->fExtra[groupOffset + 1]; } - return e; + + return e; } +int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const { + return (int32_t)end64(group, err); +} //-------------------------------------------------------------------------------- @@ -299,20 +588,66 @@ int32_t RegexMatcher::end(int group, UErrorCode &err) const { //-------------------------------------------------------------------------------- UBool RegexMatcher::find() { // Start at the position of the last match end. (Will be zero if the - // matcher has been reset. + // matcher has been reset.) // if (U_FAILURE(fDeferredStatus)) { return FALSE; } + + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { + return findUsingChunk(); + } - int32_t startPos = fMatchEnd; - int32_t inputLen = fInput->length(); - int32_t testLen = inputLen - fPattern->fMinMatchLen; - if (startPos > testLen) { - return FALSE; + int64_t startPos = fMatchEnd; + if (startPos==0) { + startPos = fActiveStart; + } + + if (fMatch) { + // Save the position of any previous successful match. + fLastMatchEnd = fMatchEnd; + + if (fMatchStart == fMatchEnd) { + // Previous match had zero length. Move start position up one position + // to avoid sending find() into a loop on zero-length matches. + if (startPos >= fActiveLimit) { + fMatch = FALSE; + fHitEnd = TRUE; + return FALSE; + } + UTEXT_SETNATIVEINDEX(fInputText, startPos); + UTEXT_NEXT32(fInputText); + startPos = UTEXT_GETNATIVEINDEX(fInputText); + } + } else { + if (fLastMatchEnd >= 0) { + // A previous find() failed to match. Don't try again. + // (without this test, a pattern with a zero-length match + // could match again at the end of an input string.) + fHitEnd = TRUE; + return FALSE; + } + } + + + // Compute the position in the input string beyond which a match can not begin, because + // the minimum length match would extend past the end of the input. + // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. + // Be aware of possible overflows if making changes here. + int64_t testStartLimit; + if (UTEXT_USES_U16(fInputText)) { + testStartLimit = fActiveLimit - fPattern->fMinMatchLen; + if (startPos > testStartLimit) { + fMatch = FALSE; + fHitEnd = TRUE; + return FALSE; + } + } else { + // For now, let the matcher discover that it can't match on its own + // We don't know how long the match len is in native characters + testStartLimit = fActiveLimit; } - const UChar *inputBuf = fInput->getBuffer(); UChar32 c; U_ASSERT(startPos >= 0); @@ -321,30 +656,36 @@ UBool RegexMatcher::find() { // No optimization was found. // Try a match at each input position. for (;;) { - MatchAt(startPos, fDeferredStatus); + MatchAt(startPos, FALSE, fDeferredStatus); if (U_FAILURE(fDeferredStatus)) { return FALSE; } if (fMatch) { return TRUE; } - if (startPos >= testLen) { + if (startPos >= testStartLimit) { + fHitEnd = TRUE; return FALSE; } - U16_FWD_1(inputBuf, startPos, inputLen); + UTEXT_SETNATIVEINDEX(fInputText, startPos); + UTEXT_NEXT32(fInputText); + startPos = UTEXT_GETNATIVEINDEX(fInputText); // Note that it's perfectly OK for a pattern to have a zero-length // match at the end of a string, so we must make sure that the loop - // runs with startPos == testLen the last time through. + // runs with startPos == testStartLimit the last time through. + if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) + return FALSE; } U_ASSERT(FALSE); case START_START: // Matches are only possible at the start of the input string // (pattern begins with ^ or \A) - if (startPos > 0) { + if (startPos > fActiveStart) { + fMatch = FALSE; return FALSE; } - MatchAt(startPos, fDeferredStatus); + MatchAt(startPos, FALSE, fDeferredStatus); if (U_FAILURE(fDeferredStatus)) { return FALSE; } @@ -355,22 +696,33 @@ UBool RegexMatcher::find() { { // Match may start on any char from a pre-computed set. U_ASSERT(fPattern->fMinMatchLen > 0); + int64_t pos; + UTEXT_SETNATIVEINDEX(fInputText, startPos); for (;;) { - int32_t pos = startPos; - U16_NEXT(inputBuf, startPos, inputLen, c); // like c = inputBuf[startPos++]; - if (c<256 && fPattern->fInitialChars8->contains(c) || - c>=256 && fPattern->fInitialChars->contains(c)) { - MatchAt(pos, fDeferredStatus); + c = UTEXT_NEXT32(fInputText); + pos = UTEXT_GETNATIVEINDEX(fInputText); + // c will be -1 (U_SENTINEL) at end of text, in which case we + // skip this next block (so we don't have a negative array index) + // and handle end of text in the following block. + if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) || + (c>=256 && fPattern->fInitialChars->contains(c)))) { + MatchAt(startPos, FALSE, fDeferredStatus); if (U_FAILURE(fDeferredStatus)) { return FALSE; } if (fMatch) { return TRUE; } + UTEXT_SETNATIVEINDEX(fInputText, pos); } - if (pos >= testLen) { + if (startPos >= testStartLimit) { + fMatch = FALSE; + fHitEnd = TRUE; return FALSE; } + startPos = pos; + if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) + return FALSE; } } U_ASSERT(FALSE); @@ -381,59 +733,108 @@ UBool RegexMatcher::find() { // Match starts on exactly one char. U_ASSERT(fPattern->fMinMatchLen > 0); UChar32 theChar = fPattern->fInitialChar; + int64_t pos; + UTEXT_SETNATIVEINDEX(fInputText, startPos); for (;;) { - int32_t pos = startPos; - U16_NEXT(inputBuf, startPos, inputLen, c); // like c = inputBuf[startPos++]; + c = UTEXT_NEXT32(fInputText); + pos = UTEXT_GETNATIVEINDEX(fInputText); if (c == theChar) { - MatchAt(pos, fDeferredStatus); + MatchAt(startPos, FALSE, fDeferredStatus); if (U_FAILURE(fDeferredStatus)) { return FALSE; } if (fMatch) { return TRUE; } + UTEXT_SETNATIVEINDEX(fInputText, pos); } - if (pos >= testLen) { + if (startPos >= testStartLimit) { + fMatch = FALSE; + fHitEnd = TRUE; return FALSE; } - } + startPos = pos; + if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) + return FALSE; + } } U_ASSERT(FALSE); case START_LINE: { UChar32 c; - if (startPos == 0) { - MatchAt(startPos, fDeferredStatus); + if (startPos == fAnchorStart) { + MatchAt(startPos, FALSE, fDeferredStatus); if (U_FAILURE(fDeferredStatus)) { return FALSE; } if (fMatch) { return TRUE; } - U16_NEXT(inputBuf, startPos, inputLen, c); // like c = inputBuf[startPos++]; + UTEXT_SETNATIVEINDEX(fInputText, startPos); + c = UTEXT_NEXT32(fInputText); + startPos = UTEXT_GETNATIVEINDEX(fInputText); + } else { + UTEXT_SETNATIVEINDEX(fInputText, startPos); + c = UTEXT_PREVIOUS32(fInputText); + UTEXT_SETNATIVEINDEX(fInputText, startPos); } - for (;;) { - UChar32 c = inputBuf[startPos-1]; - if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible - (c == 0x0a || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029 || - c == 0x0d && startPos+1 < inputLen && inputBuf[startPos+1] != 0x0a)) { - MatchAt(startPos, fDeferredStatus); - if (U_FAILURE(fDeferredStatus)) { - return FALSE; + if (fPattern->fFlags & UREGEX_UNIX_LINES) { + for (;;) { + if (c == 0x0a) { + MatchAt(startPos, FALSE, fDeferredStatus); + if (U_FAILURE(fDeferredStatus)) { + return FALSE; + } + if (fMatch) { + return TRUE; + } + UTEXT_SETNATIVEINDEX(fInputText, startPos); } - if (fMatch) { - return TRUE; + if (startPos >= testStartLimit) { + fMatch = FALSE; + fHitEnd = TRUE; + return FALSE; } + c = UTEXT_NEXT32(fInputText); + startPos = UTEXT_GETNATIVEINDEX(fInputText); + // Note that it's perfectly OK for a pattern to have a zero-length + // match at the end of a string, so we must make sure that the loop + // runs with startPos == testStartLimit the last time through. + if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) + return FALSE; } - if (startPos >= testLen) { - return FALSE; + } else { + for (;;) { + if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible + ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) { + if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { + UTEXT_NEXT32(fInputText); + startPos = UTEXT_GETNATIVEINDEX(fInputText); + } + MatchAt(startPos, FALSE, fDeferredStatus); + if (U_FAILURE(fDeferredStatus)) { + return FALSE; + } + if (fMatch) { + return TRUE; + } + UTEXT_SETNATIVEINDEX(fInputText, startPos); + } + if (startPos >= testStartLimit) { + fMatch = FALSE; + fHitEnd = TRUE; + return FALSE; + } + c = UTEXT_NEXT32(fInputText); + startPos = UTEXT_GETNATIVEINDEX(fInputText); + // Note that it's perfectly OK for a pattern to have a zero-length + // match at the end of a string, so we must make sure that the loop + // runs with startPos == testStartLimit the last time through. + if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) + return FALSE; } - U16_NEXT(inputBuf, startPos, inputLen, c); // like c = inputBuf[startPos++]; - // Note that it's perfectly OK for a pattern to have a zero-length - // match at the end of a string, so we must make sure that the loop - // runs with startPos == testLen the last time through. } } @@ -447,7 +848,7 @@ UBool RegexMatcher::find() { -UBool RegexMatcher::find(int32_t start, UErrorCode &status) { +UBool RegexMatcher::find(int64_t start, UErrorCode &status) { if (U_FAILURE(status)) { return FALSE; } @@ -455,489 +856,3622 @@ UBool RegexMatcher::find(int32_t start, UErrorCode &status) { status = fDeferredStatus; return FALSE; } - int32_t inputLen = fInput->length(); - if (start < 0 || start >= inputLen) { + this->reset(); // Note: Reset() is specified by Java Matcher documentation. + // This will reset the region to be the full input length. + if (start < 0) { status = U_INDEX_OUTOFBOUNDS_ERROR; return FALSE; } - this->reset(); - fMatchEnd = start; + + int64_t nativeStart = start; + if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { + status = U_INDEX_OUTOFBOUNDS_ERROR; + return FALSE; + } + fMatchEnd = nativeStart; return find(); } - //-------------------------------------------------------------------------------- // -// group() +// findUsingChunk() -- like find(), but with the advance knowledge that the +// entire string is available in the UText's chunk buffer. // //-------------------------------------------------------------------------------- -UnicodeString RegexMatcher::group(UErrorCode &status) const { - return group(0, status); -} - - - -UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const { - int32_t s = start(groupNum, status); - int32_t e = end(groupNum, status); +UBool RegexMatcher::findUsingChunk() { + // Start at the position of the last match end. (Will be zero if the + // matcher has been reset. + // - // Note: calling start() and end() above will do all necessary checking that - // the group number is OK and that a match exists. status will be set. - if (U_FAILURE(status)) { - return UnicodeString(); - } - if (U_FAILURE(fDeferredStatus)) { - status = fDeferredStatus; - return UnicodeString(); + int32_t startPos = (int32_t)fMatchEnd; + if (startPos==0) { + startPos = (int32_t)fActiveStart; } + + const UChar *inputBuf = fInputText->chunkContents; - if (s < 0) { - // A capture group wasn't part of the match - return UnicodeString(); - } - U_ASSERT(s <= e); - return UnicodeString(*fInput, s, e-s); + if (fMatch) { + // Save the position of any previous successful match. + fLastMatchEnd = fMatchEnd; + + if (fMatchStart == fMatchEnd) { + // Previous match had zero length. Move start position up one position + // to avoid sending find() into a loop on zero-length matches. + if (startPos >= fActiveLimit) { + fMatch = FALSE; + fHitEnd = TRUE; + return FALSE; + } + U16_FWD_1(inputBuf, startPos, fInputLength); + } + } else { + if (fLastMatchEnd >= 0) { + // A previous find() failed to match. Don't try again. + // (without this test, a pattern with a zero-length match + // could match again at the end of an input string.) + fHitEnd = TRUE; + return FALSE; + } + } + + + // Compute the position in the input string beyond which a match can not begin, because + // the minimum length match would extend past the end of the input. + // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. + // Be aware of possible overflows if making changes here. + int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen); + if (startPos > testLen) { + fMatch = FALSE; + fHitEnd = TRUE; + return FALSE; + } + + UChar32 c; + U_ASSERT(startPos >= 0); + + switch (fPattern->fStartType) { + case START_NO_INFO: + // No optimization was found. + // Try a match at each input position. + for (;;) { + MatchChunkAt(startPos, FALSE, fDeferredStatus); + if (U_FAILURE(fDeferredStatus)) { + return FALSE; + } + if (fMatch) { + return TRUE; + } + if (startPos >= testLen) { + fHitEnd = TRUE; + return FALSE; + } + U16_FWD_1(inputBuf, startPos, fActiveLimit); + // Note that it's perfectly OK for a pattern to have a zero-length + // match at the end of a string, so we must make sure that the loop + // runs with startPos == testLen the last time through. + if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) + return FALSE; + } + U_ASSERT(FALSE); + + case START_START: + // Matches are only possible at the start of the input string + // (pattern begins with ^ or \A) + if (startPos > fActiveStart) { + fMatch = FALSE; + return FALSE; + } + MatchChunkAt(startPos, FALSE, fDeferredStatus); + if (U_FAILURE(fDeferredStatus)) { + return FALSE; + } + return fMatch; + + + case START_SET: + { + // Match may start on any char from a pre-computed set. + U_ASSERT(fPattern->fMinMatchLen > 0); + for (;;) { + int32_t pos = startPos; + U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; + if ((c<256 && fPattern->fInitialChars8->contains(c)) || + (c>=256 && fPattern->fInitialChars->contains(c))) { + MatchChunkAt(pos, FALSE, fDeferredStatus); + if (U_FAILURE(fDeferredStatus)) { + return FALSE; + } + if (fMatch) { + return TRUE; + } + } + if (pos >= testLen) { + fMatch = FALSE; + fHitEnd = TRUE; + return FALSE; + } + if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) + return FALSE; + } + } + U_ASSERT(FALSE); + + case START_STRING: + case START_CHAR: + { + // Match starts on exactly one char. + U_ASSERT(fPattern->fMinMatchLen > 0); + UChar32 theChar = fPattern->fInitialChar; + for (;;) { + int32_t pos = startPos; + U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; + if (c == theChar) { + MatchChunkAt(pos, FALSE, fDeferredStatus); + if (U_FAILURE(fDeferredStatus)) { + return FALSE; + } + if (fMatch) { + return TRUE; + } + } + if (pos >= testLen) { + fMatch = FALSE; + fHitEnd = TRUE; + return FALSE; + } + if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) + return FALSE; + } + } + U_ASSERT(FALSE); + + case START_LINE: + { + UChar32 c; + if (startPos == fAnchorStart) { + MatchChunkAt(startPos, FALSE, fDeferredStatus); + if (U_FAILURE(fDeferredStatus)) { + return FALSE; + } + if (fMatch) { + return TRUE; + } + U16_FWD_1(inputBuf, startPos, fActiveLimit); + } + + if (fPattern->fFlags & UREGEX_UNIX_LINES) { + for (;;) { + c = inputBuf[startPos-1]; + if (c == 0x0a) { + MatchChunkAt(startPos, FALSE, fDeferredStatus); + if (U_FAILURE(fDeferredStatus)) { + return FALSE; + } + if (fMatch) { + return TRUE; + } + } + if (startPos >= testLen) { + fMatch = FALSE; + fHitEnd = TRUE; + return FALSE; + } + U16_FWD_1(inputBuf, startPos, fActiveLimit); + // Note that it's perfectly OK for a pattern to have a zero-length + // match at the end of a string, so we must make sure that the loop + // runs with startPos == testLen the last time through. + if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) + return FALSE; + } + } else { + for (;;) { + c = inputBuf[startPos-1]; + if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible + ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) { + if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) { + startPos++; + } + MatchChunkAt(startPos, FALSE, fDeferredStatus); + if (U_FAILURE(fDeferredStatus)) { + return FALSE; + } + if (fMatch) { + return TRUE; + } + } + if (startPos >= testLen) { + fMatch = FALSE; + fHitEnd = TRUE; + return FALSE; + } + U16_FWD_1(inputBuf, startPos, fActiveLimit); + // Note that it's perfectly OK for a pattern to have a zero-length + // match at the end of a string, so we must make sure that the loop + // runs with startPos == testLen the last time through. + if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) + return FALSE; + } + } + } + + default: + U_ASSERT(FALSE); + } + + U_ASSERT(FALSE); + return FALSE; } - -int32_t RegexMatcher::groupCount() const { - return fPattern->fGroupMap->size(); +//-------------------------------------------------------------------------------- +// +// group() +// +//-------------------------------------------------------------------------------- +UnicodeString RegexMatcher::group(UErrorCode &status) const { + return group(0, status); } - - -const UnicodeString &RegexMatcher::input() const { - return *fInput; +// Return immutable shallow clone +UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const { + return group(0, dest, group_len, status); } - - - -UBool RegexMatcher::lookingAt(UErrorCode &status) { +// Return immutable shallow clone +UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const { + group_len = 0; + UBool bailOut = FALSE; if (U_FAILURE(status)) { - return FALSE; + return dest; } if (U_FAILURE(fDeferredStatus)) { status = fDeferredStatus; - return FALSE; + bailOut = TRUE; } - reset(); - MatchAt(0, status); - return fMatch; + if (fMatch == FALSE) { + status = U_REGEX_INVALID_STATE; + bailOut = TRUE; + } + if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { + status = U_INDEX_OUTOFBOUNDS_ERROR; + bailOut = TRUE; + } + + if (bailOut) { + return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status); + } + + int64_t s, e; + if (groupNum == 0) { + s = fMatchStart; + e = fMatchEnd; + } else { + int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); + U_ASSERT(groupOffset < fPattern->fFrameSize); + U_ASSERT(groupOffset >= 0); + s = fFrame->fExtra[groupOffset]; + e = fFrame->fExtra[groupOffset+1]; + } + + if (s < 0) { + // A capture group wasn't part of the match + return utext_clone(dest, fInputText, FALSE, TRUE, &status); + } + U_ASSERT(s <= e); + group_len = e - s; + + dest = utext_clone(dest, fInputText, FALSE, TRUE, &status); + if (dest) + UTEXT_SETNATIVEINDEX(dest, s); + return dest; } +UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const { + UnicodeString result; + if (U_FAILURE(status)) { + return result; + } + UText resultText = UTEXT_INITIALIZER; + utext_openUnicodeString(&resultText, &result, &status); + group(groupNum, &resultText, status); + utext_close(&resultText); + return result; +} -UBool RegexMatcher::matches(UErrorCode &status) { +// Return deep (mutable) clone +// Technology Preview (as an API), but note that the UnicodeString API is implemented +// using this function. +UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const { + UBool bailOut = FALSE; if (U_FAILURE(status)) { - return FALSE; + return dest; } if (U_FAILURE(fDeferredStatus)) { status = fDeferredStatus; - return FALSE; + bailOut = TRUE; } - reset(); - MatchAt(0, status); - UBool success = (fMatch && fMatchEnd==fInput->length()); - return success; -} - - - - -const RegexPattern &RegexMatcher::pattern() const { - return *fPattern; + + if (fMatch == FALSE) { + status = U_REGEX_INVALID_STATE; + bailOut = TRUE; + } + if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { + status = U_INDEX_OUTOFBOUNDS_ERROR; + bailOut = TRUE; + } + + if (bailOut) { + if (dest) { + utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); + return dest; + } else { + return utext_openUChars(NULL, NULL, 0, &status); + } + } + + int64_t s, e; + if (groupNum == 0) { + s = fMatchStart; + e = fMatchEnd; + } else { + int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); + U_ASSERT(groupOffset < fPattern->fFrameSize); + U_ASSERT(groupOffset >= 0); + s = fFrame->fExtra[groupOffset]; + e = fFrame->fExtra[groupOffset+1]; + } + + if (s < 0) { + // A capture group wasn't part of the match + if (dest) { + utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); + return dest; + } else { + return utext_openUChars(NULL, NULL, 0, &status); + } + } + U_ASSERT(s <= e); + + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { + U_ASSERT(e <= fInputLength); + if (dest) { + utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status); + } else { + UText groupText = UTEXT_INITIALIZER; + utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status); + dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); + utext_close(&groupText); + } + } else { + int32_t len16; + if (UTEXT_USES_U16(fInputText)) { + len16 = (int32_t)(e-s); + } else { + UErrorCode lengthStatus = U_ZERO_ERROR; + len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); + } + UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); + if (groupChars == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return dest; + } + utext_extract(fInputText, s, e, groupChars, len16+1, &status); + + if (dest) { + utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status); + } else { + UText groupText = UTEXT_INITIALIZER; + utext_openUChars(&groupText, groupChars, len16, &status); + dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); + utext_close(&groupText); + } + + uprv_free(groupChars); + } + return dest; } - - //-------------------------------------------------------------------------------- // -// replaceAll +// appendGroup() -- currently internal only, appends a group to a UText rather +// than replacing its contents // //-------------------------------------------------------------------------------- -UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) { + +int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const { if (U_FAILURE(status)) { - return *fInput; + return 0; } if (U_FAILURE(fDeferredStatus)) { status = fDeferredStatus; - return *fInput; + return 0; } - UnicodeString destString; - for (reset(); find(); ) { - appendReplacement(destString, replacement, status); - if (U_FAILURE(status)) { - break; + int64_t destLen = utext_nativeLength(dest); + + if (fMatch == FALSE) { + status = U_REGEX_INVALID_STATE; + return utext_replace(dest, destLen, destLen, NULL, 0, &status); + } + if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { + status = U_INDEX_OUTOFBOUNDS_ERROR; + return utext_replace(dest, destLen, destLen, NULL, 0, &status); + } + + int64_t s, e; + if (groupNum == 0) { + s = fMatchStart; + e = fMatchEnd; + } else { + int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); + U_ASSERT(groupOffset < fPattern->fFrameSize); + U_ASSERT(groupOffset >= 0); + s = fFrame->fExtra[groupOffset]; + e = fFrame->fExtra[groupOffset+1]; + } + + if (s < 0) { + // A capture group wasn't part of the match + return utext_replace(dest, destLen, destLen, NULL, 0, &status); + } + U_ASSERT(s <= e); + + int64_t deltaLen; + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { + U_ASSERT(e <= fInputLength); + deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status); + } else { + int32_t len16; + if (UTEXT_USES_U16(fInputText)) { + len16 = (int32_t)(e-s); + } else { + UErrorCode lengthStatus = U_ZERO_ERROR; + len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); + } + UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); + if (groupChars == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return 0; } + utext_extract(fInputText, s, e, groupChars, len16+1, &status); + + deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status); + uprv_free(groupChars); } - appendTail(destString); - return destString; + return deltaLen; } - //-------------------------------------------------------------------------------- // -// replaceFirst +// groupCount() // //-------------------------------------------------------------------------------- -UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) { - if (U_FAILURE(status)) { - return *fInput; - } - if (U_FAILURE(fDeferredStatus)) { - status = fDeferredStatus; - return *fInput; - } - - reset(); - if (!find()) { - return *fInput; - } - - UnicodeString destString; - appendReplacement(destString, replacement, status); - appendTail(destString); - return destString; +int32_t RegexMatcher::groupCount() const { + return fPattern->fGroupMap->size(); } //-------------------------------------------------------------------------------- // -// reset +// hasAnchoringBounds() // //-------------------------------------------------------------------------------- -RegexMatcher &RegexMatcher::reset() { - fMatchStart = 0; - fMatchEnd = 0; - fLastMatchEnd = 0; - fMatch = FALSE; - resetStack(); - return *this; +UBool RegexMatcher::hasAnchoringBounds() const { + return fAnchoringBounds; } - -RegexMatcher &RegexMatcher::reset(const UnicodeString &input) { - fInput = &input; - reset(); - return *this; +//-------------------------------------------------------------------------------- +// +// hasTransparentBounds() +// +//-------------------------------------------------------------------------------- +UBool RegexMatcher::hasTransparentBounds() const { + return fTransparentBounds; } -REStackFrame *RegexMatcher::resetStack() { - // Discard any previous contents of the state save stack, and initialize a - // new stack frame to all -1. The -1s are needed for capture group limits, where - // they indicate that a group has not yet matched anything. - fStack->removeAllElements(); - - int32_t *iFrame = fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus); - int i; - for (i=0; ifFrameSize; i++) { - iFrame[i] = -1; - } - return (REStackFrame *)iFrame; +//-------------------------------------------------------------------------------- +// +// hitEnd() +// +//-------------------------------------------------------------------------------- +UBool RegexMatcher::hitEnd() const { + return fHitEnd; } - //-------------------------------------------------------------------------------- // -// setTrace +// input() // //-------------------------------------------------------------------------------- -void RegexMatcher::setTrace(UBool state) { - fTraceDebug = state; +const UnicodeString &RegexMatcher::input() const { + if (!fInput) { + UErrorCode status = U_ZERO_ERROR; + int32_t len16; + if (UTEXT_USES_U16(fInputText)) { + len16 = (int32_t)fInputLength; + } else { + len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status); + status = U_ZERO_ERROR; // overflow, length status + } + UnicodeString *result = new UnicodeString(len16, 0, 0); + + UChar *inputChars = result->getBuffer(len16); + utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning + result->releaseBuffer(len16); + + (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator= + } + + return *fInput; } +//-------------------------------------------------------------------------------- +// +// inputText() +// +//-------------------------------------------------------------------------------- +UText *RegexMatcher::inputText() const { + return fInputText; +} -//--------------------------------------------------------------------- +//-------------------------------------------------------------------------------- // -// split +// getInput() -- like inputText(), but makes a clone or copies into another UText // -//--------------------------------------------------------------------- -int32_t RegexMatcher::split(const UnicodeString &input, - UnicodeString dest[], - int32_t destCapacity, - UErrorCode &status) -{ - // - // Check arguements for validity - // +//-------------------------------------------------------------------------------- +UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const { + UBool bailOut = FALSE; if (U_FAILURE(status)) { - return 0; - }; - - if (destCapacity < 1) { - status = U_ILLEGAL_ARGUMENT_ERROR; - return 0; + return dest; } - - - // - // Reset for the input text - // - reset(input); - int32_t inputLen = input.length(); - int32_t nextOutputStringStart = 0; - if (inputLen == 0) { - return 0; + if (U_FAILURE(fDeferredStatus)) { + status = fDeferredStatus; + bailOut = TRUE; } - - - // - // Loop through the input text, searching for the delimiter pattern - // - int i; - int32_t numCaptureGroups = fPattern->fGroupMap->size(); - for (i=0; ; i++) { - if (i>=destCapacity-1) { - // There is one or zero output string left. - // Fill the last output string with whatever is left from the input, then exit the loop. - // ( i will be == destCapicity if we filled the output array while processing - // capture groups of the delimiter expression, in which case we will discard the - // last capture group saved in favor of the unprocessed remainder of the - // input string.) - i = destCapacity-1; - int32_t remainingLength = inputLen-nextOutputStringStart; - if (remainingLength > 0) { - dest[i].setTo(input, nextOutputStringStart, remainingLength); - } - break; + + if (bailOut) { + if (dest) { + utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); + return dest; + } else { + return utext_clone(NULL, fInputText, FALSE, TRUE, &status); } - if (find()) { - // We found another delimiter. Move everything from where we started looking - // up until the start of the delimiter into the next output string. - int32_t fieldLen = fMatchStart - nextOutputStringStart; - dest[i].setTo(input, nextOutputStringStart, fieldLen); - nextOutputStringStart = fMatchEnd; - - // If the delimiter pattern has capturing parentheses, the captured - // text goes out into the next n destination strings. - int32_t groupNum; - for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { - if (i==destCapacity-1) { - break; - } - i++; - dest[i] = group(groupNum, status); + } + + if (dest) { + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { + utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status); + } else { + int32_t input16Len; + if (UTEXT_USES_U16(fInputText)) { + input16Len = (int32_t)fInputLength; + } else { + UErrorCode lengthStatus = U_ZERO_ERROR; + input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error } - - if (nextOutputStringStart == inputLen) { - // The delimiter was at the end of the string. We're done. - break; + UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len)); + if (inputChars == NULL) { + return dest; } - - } - else - { - // We ran off the end of the input while looking for the next delimiter. - // All the remaining text goes into the current output string. - dest[i].setTo(input, nextOutputStringStart, inputLen-nextOutputStringStart); - break; + + status = U_ZERO_ERROR; + utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning + status = U_ZERO_ERROR; + utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status); + + uprv_free(inputChars); } + return dest; + } else { + return utext_clone(NULL, fInputText, FALSE, TRUE, &status); } - return i+1; } +static UBool compat_SyncMutableUTextContents(UText *ut); +static UBool compat_SyncMutableUTextContents(UText *ut) { + UBool retVal = FALSE; + + // In the following test, we're really only interested in whether the UText should switch + // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents + // will still point to the correct data. + if (utext_nativeLength(ut) != ut->nativeIndexingLimit) { + UnicodeString *us=(UnicodeString *)ut->context; + + // Update to the latest length. + // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit). + int32_t newLength = us->length(); + + // Update the chunk description. + // The buffer may have switched between stack- and heap-based. + ut->chunkContents = us->getBuffer(); + ut->chunkLength = newLength; + ut->chunkNativeLimit = newLength; + ut->nativeIndexingLimit = newLength; + retVal = TRUE; + } + + return retVal; +} //-------------------------------------------------------------------------------- // -// start +// lookingAt() // //-------------------------------------------------------------------------------- -int32_t RegexMatcher::start(UErrorCode &status) const { - return start(0, status); +UBool RegexMatcher::lookingAt(UErrorCode &status) { + if (U_FAILURE(status)) { + return FALSE; + } + if (U_FAILURE(fDeferredStatus)) { + status = fDeferredStatus; + return FALSE; + } + + if (fInputUniStrMaybeMutable) { + if (compat_SyncMutableUTextContents(fInputText)) { + fInputLength = utext_nativeLength(fInputText); + reset(); + } + } + else { + resetPreserveRegion(); + } + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { + MatchChunkAt((int32_t)fActiveStart, FALSE, status); + } else { + MatchAt(fActiveStart, FALSE, status); + } + return fMatch; } - - -int32_t RegexMatcher::start(int group, UErrorCode &status) const { +UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) { if (U_FAILURE(status)) { - return -1; + return FALSE; } if (U_FAILURE(fDeferredStatus)) { status = fDeferredStatus; - return -1; + return FALSE; } - if (fMatch == FALSE) { - status = U_REGEX_INVALID_STATE; - return -1; + reset(); + + if (start < 0) { + status = U_INDEX_OUTOFBOUNDS_ERROR; + return FALSE; } - if (group < 0 || group > fPattern->fGroupMap->size()) { + + if (fInputUniStrMaybeMutable) { + if (compat_SyncMutableUTextContents(fInputText)) { + fInputLength = utext_nativeLength(fInputText); + reset(); + } + } + + int64_t nativeStart; + nativeStart = start; + if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { status = U_INDEX_OUTOFBOUNDS_ERROR; - return -1; + return FALSE; } - int32_t s; - if (group == 0) { - s = fMatchStart; + + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { + MatchChunkAt((int32_t)nativeStart, FALSE, status); } else { - int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); - U_ASSERT(groupOffset < fPattern->fFrameSize); - U_ASSERT(groupOffset >= 0); - s = fFrame->fExtra[groupOffset]; + MatchAt(nativeStart, FALSE, status); } - return s; + return fMatch; } //-------------------------------------------------------------------------------- // -// isWordBoundary -// in perl, "xab..cd..", \b is true at positions 0,3,5,7 -// For us, -// If the current char is a combining mark, -// \b is FALSE. -// Else Scan backwards to the first non-combining char. -// We are at a boundary if the this char and the original chars are -// opposite in membership in \w set -// -// parameters: pos - the current position in the input buffer -// start - the position where the match operation started. -// don't backup before this position when looking back -// for a preceding base char. +// matches() // //-------------------------------------------------------------------------------- -UBool RegexMatcher::isWordBoundary(int32_t pos) { - UBool isBoundary = FALSE; - UBool cIsWord = FALSE; - - // Determine whether char c at current position is a member of the word set of chars. - // If we're off the end of the string, behave as though we're not at a word char. - if (pos < fInput->length()) { - UChar32 c = fInput->char32At(pos); - int8_t ctype = u_charType(c); - if (ctype==U_NON_SPACING_MARK || ctype==U_ENCLOSING_MARK) { - // Current char is a combining one. Not a boundary. - return FALSE; +UBool RegexMatcher::matches(UErrorCode &status) { + if (U_FAILURE(status)) { + return FALSE; + } + if (U_FAILURE(fDeferredStatus)) { + status = fDeferredStatus; + return FALSE; + } + + if (fInputUniStrMaybeMutable) { + if (compat_SyncMutableUTextContents(fInputText)) { + fInputLength = utext_nativeLength(fInputText); + reset(); } - cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); } + else { + resetPreserveRegion(); + } + + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { + MatchChunkAt((int32_t)fActiveStart, TRUE, status); + } else { + MatchAt(fActiveStart, TRUE, status); + } + return fMatch; +} + + +UBool RegexMatcher::matches(int64_t start, UErrorCode &status) { + if (U_FAILURE(status)) { + return FALSE; + } + if (U_FAILURE(fDeferredStatus)) { + status = fDeferredStatus; + return FALSE; + } + reset(); - // Back up until we come to a non-combining char, determine whether - // that char is a word char. - UBool prevCIsWord = FALSE; - int32_t prevPos = pos; - for (;;) { - if (prevPos == 0) { - break; - } - prevPos = fInput->moveIndex32(prevPos, -1); - UChar32 prevChar = fInput->char32At(prevPos); - int8_t prevCType = u_charType(prevChar); - if (!(prevCType==U_NON_SPACING_MARK || prevCType==U_ENCLOSING_MARK)) { - prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); - break; + if (start < 0) { + status = U_INDEX_OUTOFBOUNDS_ERROR; + return FALSE; + } + + if (fInputUniStrMaybeMutable) { + if (compat_SyncMutableUTextContents(fInputText)) { + fInputLength = utext_nativeLength(fInputText); + reset(); } } - isBoundary = cIsWord ^ prevCIsWord; - return isBoundary; + + int64_t nativeStart; + nativeStart = start; + if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { + status = U_INDEX_OUTOFBOUNDS_ERROR; + return FALSE; + } + + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { + MatchChunkAt((int32_t)nativeStart, TRUE, status); + } else { + MatchAt(nativeStart, TRUE, status); + } + return fMatch; } + + //-------------------------------------------------------------------------------- // -// StateSave -// Make a new stack frame, initialized as a copy of the current stack frame. -// Set the pattern index in the original stack frame from the operand value -// in the opcode. Execution of the engine continues with the state in -// the newly created stack frame +// pattern // -// Note that reserveBlock() may grow the stack, resulting in the -// whole thing being relocated in memory. +//-------------------------------------------------------------------------------- +const RegexPattern &RegexMatcher::pattern() const { + return *fPattern; +} + + + +//-------------------------------------------------------------------------------- +// +// region // //-------------------------------------------------------------------------------- -inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int32_t savePatIdx, int32_t frameSize, UErrorCode &status) { - // push storage for a new frame. - int32_t *newFP = fStack->reserveBlock(frameSize, status); - fp = (REStackFrame *)(newFP - frameSize); // in case of realloc of stack. +RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) { + if (U_FAILURE(status)) { + return *this; + } - // New stack frame = copy of old top frame. - int32_t *source = (int32_t *)fp; - int32_t *dest = newFP; - for (;;) { - *dest++ = *source++; - if (source == newFP) { + if (regionStart>regionLimit || regionStart<0 || regionLimit<0) { + status = U_ILLEGAL_ARGUMENT_ERROR; + } + + int64_t nativeStart = regionStart; + int64_t nativeLimit = regionLimit; + if (nativeStart > fInputLength || nativeLimit > fInputLength) { + status = U_ILLEGAL_ARGUMENT_ERROR; + } + + if (startIndex == -1) + this->reset(); + else + resetPreserveRegion(); + + fRegionStart = nativeStart; + fRegionLimit = nativeLimit; + fActiveStart = nativeStart; + fActiveLimit = nativeLimit; + + if (startIndex != -1) { + if (startIndex < fActiveStart || startIndex > fActiveLimit) { + status = U_INDEX_OUTOFBOUNDS_ERROR; + } + fMatchEnd = startIndex; + } + + if (!fTransparentBounds) { + fLookStart = nativeStart; + fLookLimit = nativeLimit; + } + if (fAnchoringBounds) { + fAnchorStart = nativeStart; + fAnchorLimit = nativeLimit; + } + return *this; +} + +RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) { + return region(start, limit, -1, status); +} + +//-------------------------------------------------------------------------------- +// +// regionEnd +// +//-------------------------------------------------------------------------------- +int32_t RegexMatcher::regionEnd() const { + return (int32_t)fRegionLimit; +} + +int64_t RegexMatcher::regionEnd64() const { + return fRegionLimit; +} + +//-------------------------------------------------------------------------------- +// +// regionStart +// +//-------------------------------------------------------------------------------- +int32_t RegexMatcher::regionStart() const { + return (int32_t)fRegionStart; +} + +int64_t RegexMatcher::regionStart64() const { + return fRegionStart; +} + + +//-------------------------------------------------------------------------------- +// +// replaceAll +// +//-------------------------------------------------------------------------------- +UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) { + UText replacementText = UTEXT_INITIALIZER; + UText resultText = UTEXT_INITIALIZER; + UnicodeString resultString; + if (U_FAILURE(status)) { + return resultString; + } + + utext_openConstUnicodeString(&replacementText, &replacement, &status); + utext_openUnicodeString(&resultText, &resultString, &status); + + replaceAll(&replacementText, &resultText, status); + + utext_close(&resultText); + utext_close(&replacementText); + + return resultString; +} + + +// +// replaceAll, UText mode +// +UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) { + if (U_FAILURE(status)) { + return dest; + } + if (U_FAILURE(fDeferredStatus)) { + status = fDeferredStatus; + return dest; + } + + if (dest == NULL) { + UnicodeString emptyString; + UText empty = UTEXT_INITIALIZER; + + utext_openUnicodeString(&empty, &emptyString, &status); + dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); + utext_close(&empty); + } + + if (U_SUCCESS(status)) { + reset(); + while (find()) { + appendReplacement(dest, replacement, status); + if (U_FAILURE(status)) { + break; + } + } + appendTail(dest, status); + } + + return dest; +} + + +//-------------------------------------------------------------------------------- +// +// replaceFirst +// +//-------------------------------------------------------------------------------- +UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) { + UText replacementText = UTEXT_INITIALIZER; + UText resultText = UTEXT_INITIALIZER; + UnicodeString resultString; + + utext_openConstUnicodeString(&replacementText, &replacement, &status); + utext_openUnicodeString(&resultText, &resultString, &status); + + replaceFirst(&replacementText, &resultText, status); + + utext_close(&resultText); + utext_close(&replacementText); + + return resultString; +} + +// +// replaceFirst, UText mode +// +UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) { + if (U_FAILURE(status)) { + return dest; + } + if (U_FAILURE(fDeferredStatus)) { + status = fDeferredStatus; + return dest; + } + + reset(); + if (!find()) { + return getInput(dest, status); + } + + if (dest == NULL) { + UnicodeString emptyString; + UText empty = UTEXT_INITIALIZER; + + utext_openUnicodeString(&empty, &emptyString, &status); + dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); + utext_close(&empty); + } + + appendReplacement(dest, replacement, status); + appendTail(dest, status); + + return dest; +} + + +//-------------------------------------------------------------------------------- +// +// requireEnd +// +//-------------------------------------------------------------------------------- +UBool RegexMatcher::requireEnd() const { + return fRequireEnd; +} + + +//-------------------------------------------------------------------------------- +// +// reset +// +//-------------------------------------------------------------------------------- +RegexMatcher &RegexMatcher::reset() { + fRegionStart = 0; + fRegionLimit = fInputLength; + fActiveStart = 0; + fActiveLimit = fInputLength; + fAnchorStart = 0; + fAnchorLimit = fInputLength; + fLookStart = 0; + fLookLimit = fInputLength; + resetPreserveRegion(); + return *this; +} + + + +void RegexMatcher::resetPreserveRegion() { + fMatchStart = 0; + fMatchEnd = 0; + fLastMatchEnd = -1; + fAppendPosition = 0; + fMatch = FALSE; + fHitEnd = FALSE; + fRequireEnd = FALSE; + fTime = 0; + fTickCounter = TIMER_INITIAL_VALUE; + //resetStack(); // more expensive than it looks... +} + + +RegexMatcher &RegexMatcher::reset(const UnicodeString &input) { + fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus); + if (fPattern->fNeedsAltInput) { + fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); + } + fInputLength = utext_nativeLength(fInputText); + + reset(); + delete fInput; + fInput = NULL; + + // Do the following for any UnicodeString. + // This is for compatibility for those clients who modify the input string "live" during regex operations. + fInputUniStrMaybeMutable = TRUE; + + if (fWordBreakItr != NULL) { +#if UCONFIG_NO_BREAK_ITERATION==0 + UErrorCode status = U_ZERO_ERROR; + fWordBreakItr->setText(fInputText, status); +#endif + } + return *this; +} + + +RegexMatcher &RegexMatcher::reset(UText *input) { + if (fInputText != input) { + fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus); + if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); + fInputLength = utext_nativeLength(fInputText); + + delete fInput; + fInput = NULL; + + if (fWordBreakItr != NULL) { +#if UCONFIG_NO_BREAK_ITERATION==0 + UErrorCode status = U_ZERO_ERROR; + fWordBreakItr->setText(input, status); +#endif + } + } + reset(); + fInputUniStrMaybeMutable = FALSE; + + return *this; +} + +/*RegexMatcher &RegexMatcher::reset(const UChar *) { + fDeferredStatus = U_INTERNAL_PROGRAM_ERROR; + return *this; +}*/ + +RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) { + if (U_FAILURE(status)) { + return *this; + } + reset(); // Reset also resets the region to be the entire string. + + if (position < 0 || position > fActiveLimit) { + status = U_INDEX_OUTOFBOUNDS_ERROR; + return *this; + } + fMatchEnd = position; + return *this; +} + + + + + +//-------------------------------------------------------------------------------- +// +// setTrace +// +//-------------------------------------------------------------------------------- +void RegexMatcher::setTrace(UBool state) { + fTraceDebug = state; +} + + + +//--------------------------------------------------------------------- +// +// split +// +//--------------------------------------------------------------------- +int32_t RegexMatcher::split(const UnicodeString &input, + UnicodeString dest[], + int32_t destCapacity, + UErrorCode &status) +{ + UText inputText = UTEXT_INITIALIZER; + utext_openConstUnicodeString(&inputText, &input, &status); + if (U_FAILURE(status)) { + return 0; + } + + UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity); + if (destText == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return 0; + } + int32_t i; + for (i = 0; i < destCapacity; i++) { + destText[i] = utext_openUnicodeString(NULL, &dest[i], &status); + } + + int32_t fieldCount = split(&inputText, destText, destCapacity, status); + + for (i = 0; i < destCapacity; i++) { + utext_close(destText[i]); + } + + uprv_free(destText); + utext_close(&inputText); + return fieldCount; +} + +// +// split, UText mode +// +int32_t RegexMatcher::split(UText *input, + UText *dest[], + int32_t destCapacity, + UErrorCode &status) +{ + // + // Check arguements for validity + // + if (U_FAILURE(status)) { + return 0; + }; + + if (destCapacity < 1) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + // + // Reset for the input text + // + reset(input); + int64_t nextOutputStringStart = 0; + if (fActiveLimit == 0) { + return 0; + } + + // + // Loop through the input text, searching for the delimiter pattern + // + int32_t i; + int32_t numCaptureGroups = fPattern->fGroupMap->size(); + for (i=0; ; i++) { + if (i>=destCapacity-1) { + // There is one or zero output string left. + // Fill the last output string with whatever is left from the input, then exit the loop. + // ( i will be == destCapacity if we filled the output array while processing + // capture groups of the delimiter expression, in which case we will discard the + // last capture group saved in favor of the unprocessed remainder of the + // input string.) + i = destCapacity-1; + if (fActiveLimit > nextOutputStringStart) { + if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { + if (dest[i]) { + utext_replace(dest[i], 0, utext_nativeLength(dest[i]), + input->chunkContents+nextOutputStringStart, + (int32_t)(fActiveLimit-nextOutputStringStart), &status); + } else { + UText remainingText = UTEXT_INITIALIZER; + utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, + fActiveLimit-nextOutputStringStart, &status); + dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); + utext_close(&remainingText); + } + } else { + UErrorCode lengthStatus = U_ZERO_ERROR; + int32_t remaining16Length = + utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus); + UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); + if (remainingChars == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + break; + } + + utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); + if (dest[i]) { + utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); + } else { + UText remainingText = UTEXT_INITIALIZER; + utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); + dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); + utext_close(&remainingText); + } + + uprv_free(remainingChars); + } + } + break; + } + if (find()) { + // We found another delimiter. Move everything from where we started looking + // up until the start of the delimiter into the next output string. + if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { + if (dest[i]) { + utext_replace(dest[i], 0, utext_nativeLength(dest[i]), + input->chunkContents+nextOutputStringStart, + (int32_t)(fMatchStart-nextOutputStringStart), &status); + } else { + UText remainingText = UTEXT_INITIALIZER; + utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, + fMatchStart-nextOutputStringStart, &status); + dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); + utext_close(&remainingText); + } + } else { + UErrorCode lengthStatus = U_ZERO_ERROR; + int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus); + UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); + if (remainingChars == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + break; + } + utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status); + if (dest[i]) { + utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); + } else { + UText remainingText = UTEXT_INITIALIZER; + utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); + dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); + utext_close(&remainingText); + } + + uprv_free(remainingChars); + } + nextOutputStringStart = fMatchEnd; + + // If the delimiter pattern has capturing parentheses, the captured + // text goes out into the next n destination strings. + int32_t groupNum; + UBool lastGroupWasNullUText = FALSE; + for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { + if (i==destCapacity-1) { + break; + } + i++; + lastGroupWasNullUText = (dest[i] == NULL ? TRUE : FALSE); + dest[i] = group(groupNum, dest[i], status); + } + + if (nextOutputStringStart == fActiveLimit) { + // The delimiter was at the end of the string. We're done. + break; + } else if (i == destCapacity-1) { + // We're out of capture groups, and the rest of the string is more important + if (lastGroupWasNullUText) { + utext_close(dest[i]); + dest[i] = NULL; + } + } + + } + else + { + // We ran off the end of the input while looking for the next delimiter. + // All the remaining text goes into the current output string. + if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { + if (dest[i]) { + utext_replace(dest[i], 0, utext_nativeLength(dest[i]), + input->chunkContents+nextOutputStringStart, + (int32_t)(fActiveLimit-nextOutputStringStart), &status); + } else { + UText remainingText = UTEXT_INITIALIZER; + utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, + fActiveLimit-nextOutputStringStart, &status); + dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); + utext_close(&remainingText); + } + } else { + UErrorCode lengthStatus = U_ZERO_ERROR; + int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus); + UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); + if (remainingChars == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + break; + } + + utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); + if (dest[i]) { + utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); + } else { + UText remainingText = UTEXT_INITIALIZER; + utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); + dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); + utext_close(&remainingText); + } + + uprv_free(remainingChars); + } + break; + } + if (U_FAILURE(status)) { + break; + } + } // end of for loop + return i+1; +} + + +//-------------------------------------------------------------------------------- +// +// start +// +//-------------------------------------------------------------------------------- +int32_t RegexMatcher::start(UErrorCode &status) const { + return start(0, status); +} + +int64_t RegexMatcher::start64(UErrorCode &status) const { + return start64(0, status); +} + +//-------------------------------------------------------------------------------- +// +// start(int32_t group, UErrorCode &status) +// +//-------------------------------------------------------------------------------- + +int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const { + if (U_FAILURE(status)) { + return -1; + } + if (U_FAILURE(fDeferredStatus)) { + status = fDeferredStatus; + return -1; + } + if (fMatch == FALSE) { + status = U_REGEX_INVALID_STATE; + return -1; + } + if (group < 0 || group > fPattern->fGroupMap->size()) { + status = U_INDEX_OUTOFBOUNDS_ERROR; + return -1; + } + int64_t s; + if (group == 0) { + s = fMatchStart; + } else { + int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); + U_ASSERT(groupOffset < fPattern->fFrameSize); + U_ASSERT(groupOffset >= 0); + s = fFrame->fExtra[groupOffset]; + } + + return s; +} + + +int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const { + return (int32_t)start64(group, status); +} + +//-------------------------------------------------------------------------------- +// +// useAnchoringBounds +// +//-------------------------------------------------------------------------------- +RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) { + fAnchoringBounds = b; + fAnchorStart = (fAnchoringBounds ? fRegionStart : 0); + fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength); + return *this; +} + + +//-------------------------------------------------------------------------------- +// +// useTransparentBounds +// +//-------------------------------------------------------------------------------- +RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) { + fTransparentBounds = b; + fLookStart = (fTransparentBounds ? 0 : fRegionStart); + fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit); + return *this; +} + +//-------------------------------------------------------------------------------- +// +// setTimeLimit +// +//-------------------------------------------------------------------------------- +void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) { + if (U_FAILURE(status)) { + return; + } + if (U_FAILURE(fDeferredStatus)) { + status = fDeferredStatus; + return; + } + if (limit < 0) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + fTimeLimit = limit; +} + + +//-------------------------------------------------------------------------------- +// +// getTimeLimit +// +//-------------------------------------------------------------------------------- +int32_t RegexMatcher::getTimeLimit() const { + return fTimeLimit; +} + + +//-------------------------------------------------------------------------------- +// +// setStackLimit +// +//-------------------------------------------------------------------------------- +void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) { + if (U_FAILURE(status)) { + return; + } + if (U_FAILURE(fDeferredStatus)) { + status = fDeferredStatus; + return; + } + if (limit < 0) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + + // Reset the matcher. This is needed here in case there is a current match + // whose final stack frame (containing the match results, pointed to by fFrame) + // would be lost by resizing to a smaller stack size. + reset(); + + if (limit == 0) { + // Unlimited stack expansion + fStack->setMaxCapacity(0); + } else { + // Change the units of the limit from bytes to ints, and bump the size up + // to be big enough to hold at least one stack frame for the pattern, + // if it isn't there already. + int32_t adjustedLimit = limit / sizeof(int32_t); + if (adjustedLimit < fPattern->fFrameSize) { + adjustedLimit = fPattern->fFrameSize; + } + fStack->setMaxCapacity(adjustedLimit); + } + fStackLimit = limit; +} + + +//-------------------------------------------------------------------------------- +// +// getStackLimit +// +//-------------------------------------------------------------------------------- +int32_t RegexMatcher::getStackLimit() const { + return fStackLimit; +} + + +//-------------------------------------------------------------------------------- +// +// setMatchCallback +// +//-------------------------------------------------------------------------------- +void RegexMatcher::setMatchCallback(URegexMatchCallback *callback, + const void *context, + UErrorCode &status) { + if (U_FAILURE(status)) { + return; + } + fCallbackFn = callback; + fCallbackContext = context; +} + + +//-------------------------------------------------------------------------------- +// +// getMatchCallback +// +//-------------------------------------------------------------------------------- +void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback, + const void *&context, + UErrorCode &status) { + if (U_FAILURE(status)) { + return; + } + callback = fCallbackFn; + context = fCallbackContext; +} + + +//-------------------------------------------------------------------------------- +// +// setMatchCallback +// +//-------------------------------------------------------------------------------- +void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback, + const void *context, + UErrorCode &status) { + if (U_FAILURE(status)) { + return; + } + fFindProgressCallbackFn = callback; + fFindProgressCallbackContext = context; +} + + +//-------------------------------------------------------------------------------- +// +// getMatchCallback +// +//-------------------------------------------------------------------------------- +void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback, + const void *&context, + UErrorCode &status) { + if (U_FAILURE(status)) { + return; + } + callback = fFindProgressCallbackFn; + context = fFindProgressCallbackContext; +} + + +//================================================================================ +// +// Code following this point in this file is the internal +// Match Engine Implementation. +// +//================================================================================ + + +//-------------------------------------------------------------------------------- +// +// resetStack +// Discard any previous contents of the state save stack, and initialize a +// new stack frame to all -1. The -1s are needed for capture group limits, +// where they indicate that a group has not yet matched anything. +//-------------------------------------------------------------------------------- +REStackFrame *RegexMatcher::resetStack() { + // Discard any previous contents of the state save stack, and initialize a + // new stack frame with all -1 data. The -1s are needed for capture group limits, + // where they indicate that a group has not yet matched anything. + fStack->removeAllElements(); + + REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus); + int32_t i; + for (i=0; ifFrameSize-RESTACKFRAME_HDRCOUNT; i++) { + iFrame->fExtra[i] = -1; + } + return iFrame; +} + + + +//-------------------------------------------------------------------------------- +// +// isWordBoundary +// in perl, "xab..cd..", \b is true at positions 0,3,5,7 +// For us, +// If the current char is a combining mark, +// \b is FALSE. +// Else Scan backwards to the first non-combining char. +// We are at a boundary if the this char and the original chars are +// opposite in membership in \w set +// +// parameters: pos - the current position in the input buffer +// +// TODO: double-check edge cases at region boundaries. +// +//-------------------------------------------------------------------------------- +UBool RegexMatcher::isWordBoundary(int64_t pos) { + UBool isBoundary = FALSE; + UBool cIsWord = FALSE; + + if (pos >= fLookLimit) { + fHitEnd = TRUE; + } else { + // Determine whether char c at current position is a member of the word set of chars. + // If we're off the end of the string, behave as though we're not at a word char. + UTEXT_SETNATIVEINDEX(fInputText, pos); + UChar32 c = UTEXT_CURRENT32(fInputText); + if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { + // Current char is a combining one. Not a boundary. + return FALSE; + } + cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); + } + + // Back up until we come to a non-combining char, determine whether + // that char is a word char. + UBool prevCIsWord = FALSE; + for (;;) { + if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) { + break; + } + UChar32 prevChar = UTEXT_PREVIOUS32(fInputText); + if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) + || u_charType(prevChar) == U_FORMAT_CHAR)) { + prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); + break; + } + } + isBoundary = cIsWord ^ prevCIsWord; + return isBoundary; +} + +UBool RegexMatcher::isChunkWordBoundary(int32_t pos) { + UBool isBoundary = FALSE; + UBool cIsWord = FALSE; + + const UChar *inputBuf = fInputText->chunkContents; + + if (pos >= fLookLimit) { + fHitEnd = TRUE; + } else { + // Determine whether char c at current position is a member of the word set of chars. + // If we're off the end of the string, behave as though we're not at a word char. + UChar32 c; + U16_GET(inputBuf, fLookStart, pos, fLookLimit, c); + if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { + // Current char is a combining one. Not a boundary. + return FALSE; + } + cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); + } + + // Back up until we come to a non-combining char, determine whether + // that char is a word char. + UBool prevCIsWord = FALSE; + for (;;) { + if (pos <= fLookStart) { + break; + } + UChar32 prevChar; + U16_PREV(inputBuf, fLookStart, pos, prevChar); + if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) + || u_charType(prevChar) == U_FORMAT_CHAR)) { + prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); + break; + } + } + isBoundary = cIsWord ^ prevCIsWord; + return isBoundary; +} + +//-------------------------------------------------------------------------------- +// +// isUWordBoundary +// +// Test for a word boundary using RBBI word break. +// +// parameters: pos - the current position in the input buffer +// +//-------------------------------------------------------------------------------- +UBool RegexMatcher::isUWordBoundary(int64_t pos) { + UBool returnVal = FALSE; +#if UCONFIG_NO_BREAK_ITERATION==0 + + // If we haven't yet created a break iterator for this matcher, do it now. + if (fWordBreakItr == NULL) { + fWordBreakItr = + (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus); + if (U_FAILURE(fDeferredStatus)) { + return FALSE; + } + fWordBreakItr->setText(fInputText, fDeferredStatus); + } + + if (pos >= fLookLimit) { + fHitEnd = TRUE; + returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real" + // words are not boundaries. All non-word chars stand by themselves, + // with word boundaries on both sides. + } else { + if (!UTEXT_USES_U16(fInputText)) { + // !!!: Would like a better way to do this! + UErrorCode status = U_ZERO_ERROR; + pos = utext_extract(fInputText, 0, pos, NULL, 0, &status); + } + returnVal = fWordBreakItr->isBoundary((int32_t)pos); + } +#endif + return returnVal; +} + +//-------------------------------------------------------------------------------- +// +// IncrementTime This function is called once each TIMER_INITIAL_VALUE state +// saves. Increment the "time" counter, and call the +// user callback function if there is one installed. +// +// If the match operation needs to be aborted, either for a time-out +// or because the user callback asked for it, just set an error status. +// The engine will pick that up and stop in its outer loop. +// +//-------------------------------------------------------------------------------- +void RegexMatcher::IncrementTime(UErrorCode &status) { + fTickCounter = TIMER_INITIAL_VALUE; + fTime++; + if (fCallbackFn != NULL) { + if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) { + status = U_REGEX_STOPPED_BY_CALLER; + return; + } + } + if (fTimeLimit > 0 && fTime >= fTimeLimit) { + status = U_REGEX_TIME_OUT; + } +} + +//-------------------------------------------------------------------------------- +// +// ReportFindProgress This function is called once for each advance in the target +// string from the find() function, and calls the user progress callback +// function if there is one installed. +// +// NOTE: +// +// If the match operation needs to be aborted because the user +// callback asked for it, just set an error status. +// The engine will pick that up and stop in its outer loop. +// +//-------------------------------------------------------------------------------- +UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) { + if (fFindProgressCallbackFn != NULL) { + if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) { + status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/; + return FALSE; + } + } + return TRUE; +} + +//-------------------------------------------------------------------------------- +// +// StateSave +// Make a new stack frame, initialized as a copy of the current stack frame. +// Set the pattern index in the original stack frame from the operand value +// in the opcode. Execution of the engine continues with the state in +// the newly created stack frame +// +// Note that reserveBlock() may grow the stack, resulting in the +// whole thing being relocated in memory. +// +// Parameters: +// fp The top frame pointer when called. At return, a new +// fame will be present +// savePatIdx An index into the compiled pattern. Goes into the original +// (not new) frame. If execution ever back-tracks out of the +// new frame, this will be where we continue from in the pattern. +// Return +// The new frame pointer. +// +//-------------------------------------------------------------------------------- +inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) { + // push storage for a new frame. + int64_t *newFP = fStack->reserveBlock(fFrameSize, status); + if (newFP == NULL) { + // Failure on attempted stack expansion. + // Stack function set some other error code, change it to a more + // specific one for regular expressions. + status = U_REGEX_STACK_OVERFLOW; + // We need to return a writable stack frame, so just return the + // previous frame. The match operation will stop quickly + // because of the error status, after which the frame will never + // be looked at again. + return fp; + } + fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack. + + // New stack frame = copy of old top frame. + int64_t *source = (int64_t *)fp; + int64_t *dest = newFP; + for (;;) { + *dest++ = *source++; + if (source == newFP) { + break; + } + } + + fTickCounter--; + if (fTickCounter <= 0) { + IncrementTime(status); // Re-initializes fTickCounter + } + fp->fPatIdx = savePatIdx; + return (REStackFrame *)newFP; +} + + +//-------------------------------------------------------------------------------- +// +// MatchAt This is the actual matching engine. +// +// startIdx: begin matching a this index. +// toEnd: if true, match must extend to end of the input region +// +//-------------------------------------------------------------------------------- +void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { + UBool isMatch = FALSE; // True if the we have a match. + + int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards + + int32_t op; // Operation from the compiled pattern, split into + int32_t opType; // the opcode + int32_t opValue; // and the operand value. + + #ifdef REGEX_RUN_DEBUG + if (fTraceDebug) + { + printf("MatchAt(startIdx=%ld)\n", startIdx); + printf("Original Pattern: "); + UChar32 c = utext_next32From(fPattern->fPattern, 0); + while (c != U_SENTINEL) { + if (c<32 || c>256) { + c = '.'; + } + REGEX_DUMP_DEBUG_PRINTF(("%c", c)); + + c = UTEXT_NEXT32(fPattern->fPattern); + } + printf("\n"); + printf("Input String: "); + c = utext_next32From(fInputText, 0); + while (c != U_SENTINEL) { + if (c<32 || c>256) { + c = '.'; + } + printf("%c", c); + + c = UTEXT_NEXT32(fInputText); + } + printf("\n"); + printf("\n"); + } + #endif + + if (U_FAILURE(status)) { + return; + } + + // Cache frequently referenced items from the compiled pattern + // + int64_t *pat = fPattern->fCompiledPat->getBuffer(); + + const UChar *litText = fPattern->fLiteralText.getBuffer(); + UVector *sets = fPattern->fSets; + + fFrameSize = fPattern->fFrameSize; + REStackFrame *fp = resetStack(); + + fp->fPatIdx = 0; + fp->fInputIdx = startIdx; + + // Zero out the pattern's static data + int32_t i; + for (i = 0; ifDataSize; i++) { + fData[i] = 0; + } + + // + // Main loop for interpreting the compiled pattern. + // One iteration of the loop per pattern operation performed. + // + for (;;) { +#if 0 + if (_heapchk() != _HEAPOK) { + fprintf(stderr, "Heap Trouble\n"); + } +#endif + + op = (int32_t)pat[fp->fPatIdx]; + opType = URX_TYPE(op); + opValue = URX_VAL(op); + #ifdef REGEX_RUN_DEBUG + if (fTraceDebug) { + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx, + UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); + fPattern->dumpOp(fp->fPatIdx); + } + #endif + fp->fPatIdx++; + + switch (opType) { + + + case URX_NOP: + break; + + + case URX_BACKTRACK: + // Force a backtrack. In some circumstances, the pattern compiler + // will notice that the pattern can't possibly match anything, and will + // emit one of these at that point. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + + + case URX_ONECHAR: + if (fp->fInputIdx < fActiveLimit) { + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + UChar32 c = UTEXT_NEXT32(fInputText); + if (c == opValue) { + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + break; + } + } else { + fHitEnd = TRUE; + } + + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + UBool success = FALSE; + UChar32 c = UTEXT_PREVIOUS32(fInputText); + while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) { + if (c == opValue) { + success = TRUE; + break; + } else if (c == U_SENTINEL) { + break; + } + c = UTEXT_PREVIOUS32(fInputText); + } + if (success) { + fHitEnd = FALSE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + + + case URX_STRING: + { + // Test input against a literal string. + // Strings require two slots in the compiled pattern, one for the + // offset to the string text, and one for the length. + int32_t stringStartIdx = opValue; + int32_t stringLen; + + op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand + fp->fPatIdx++; + opType = URX_TYPE(op); + stringLen = URX_VAL(op); + U_ASSERT(opType == URX_STRING_LEN); + U_ASSERT(stringLen >= 2); + + const UChar *patternChars = litText+stringStartIdx; + const UChar *patternEnd = patternChars+stringLen; + + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + UChar32 c; + UBool success = TRUE; + + while (patternChars < patternEnd && success) { + c = UTEXT_NEXT32(fInputText); + + if (c != U_SENTINEL && UTEXT_GETNATIVEINDEX(fInputText) <= fActiveLimit) { + if (U_IS_BMP(c)) { + success = (*patternChars == c); + patternChars += 1; + } else if (patternChars+1 < patternEnd) { + success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c)); + patternChars += 2; + } + } else { + success = FALSE; + fHitEnd = TRUE; // TODO: See ticket 6074 + } + } + + if (success) { + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + } else { + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size()) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + // Reset to last start point + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + patternChars = litText+stringStartIdx; + + // Search backwards for a possible start + do { + c = UTEXT_PREVIOUS32(fInputText); + if (c == U_SENTINEL) { + break; + } else if ((U_IS_BMP(c) && *patternChars == c) || + (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) { + success = TRUE; + break; + } + } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex); + + // And try again + if (success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + + case URX_STATE_SAVE: + fp = StateSave(fp, opValue, status); + break; + + + case URX_END: + // The match loop will exit via this path on a successful match, + // when we reach the end of the pattern. + if (toEnd && fp->fInputIdx != fActiveLimit) { + // The pattern matched, but not to the end of input. Try some more. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + isMatch = TRUE; + goto breakFromLoop; + + // Start and End Capture stack frame variables are laid out out like this: + // fp->fExtra[opValue] - The start of a completed capture group + // opValue+1 - The end of a completed capture group + // opValue+2 - the start of a capture group whose end + // has not yet been reached (and might not ever be). + case URX_START_CAPTURE: + U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); + fp->fExtra[opValue+2] = fp->fInputIdx; + break; + + + case URX_END_CAPTURE: + U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); + U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. + fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. + fp->fExtra[opValue+1] = fp->fInputIdx; // End position + U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); + break; + + + case URX_DOLLAR: // $, test for End of line + // or for position before new line at end of input + { + if (fp->fInputIdx >= fAnchorLimit) { + // We really are at the end of input. Success. + fHitEnd = TRUE; + fRequireEnd = TRUE; + break; + } + + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + + // If we are positioned just before a new-line that is located at the + // end of input, succeed. + UChar32 c = UTEXT_NEXT32(fInputText); + if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { + if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { + // If not in the middle of a CR/LF sequence + if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && (UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) { + // At new-line at end of input. Success + fHitEnd = TRUE; + fRequireEnd = TRUE; + + break; + } + } + } else { + UChar32 nextC = UTEXT_NEXT32(fInputText); + if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { + fHitEnd = TRUE; + fRequireEnd = TRUE; + break; // At CR/LF at end of input. Success + } + } + + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + break; + + + case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. + if (fp->fInputIdx >= fAnchorLimit) { + // Off the end of input. Success. + fHitEnd = TRUE; + fRequireEnd = TRUE; + break; + } else { + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + UChar32 c = UTEXT_NEXT32(fInputText); + // Either at the last character of input, or off the end. + if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) { + fHitEnd = TRUE; + fRequireEnd = TRUE; + break; + } + } + + // Not at end of input. Back-track out. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + + + case URX_DOLLAR_M: // $, test for End of line in multi-line mode + { + if (fp->fInputIdx >= fAnchorLimit) { + // We really are at the end of input. Success. + fHitEnd = TRUE; + fRequireEnd = TRUE; + break; + } + // If we are positioned just before a new-line, succeed. + // It makes no difference where the new-line is within the input. + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + UChar32 c = UTEXT_CURRENT32(fInputText); + if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { + // At a line end, except for the odd chance of being in the middle of a CR/LF sequence + // In multi-line mode, hitting a new-line just before the end of input does not + // set the hitEnd or requireEnd flags + if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) { + break; + } + } + // not at a new line. Fail. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + break; + + + case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode + { + if (fp->fInputIdx >= fAnchorLimit) { + // We really are at the end of input. Success. + fHitEnd = TRUE; + fRequireEnd = TRUE; // Java set requireEnd in this case, even though + break; // adding a new-line would not lose the match. + } + // If we are not positioned just before a new-line, the test fails; backtrack out. + // It makes no difference where the new-line is within the input. + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + if (UTEXT_CURRENT32(fInputText) != 0x0a) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + + case URX_CARET: // ^, test for start of line + if (fp->fInputIdx != fAnchorStart) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + break; + + + case URX_CARET_M: // ^, test for start of line in mulit-line mode + { + if (fp->fInputIdx == fAnchorStart) { + // We are at the start input. Success. + break; + } + // Check whether character just before the current pos is a new-line + // unless we are at the end of input + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + UChar32 c = UTEXT_PREVIOUS32(fInputText); + if ((fp->fInputIdx < fAnchorLimit) && + ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { + // It's a new-line. ^ is true. Success. + // TODO: what should be done with positions between a CR and LF? + break; + } + // Not at the start of a line. Fail. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + break; + + + case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode + { + U_ASSERT(fp->fInputIdx >= fAnchorStart); + if (fp->fInputIdx <= fAnchorStart) { + // We are at the start input. Success. + break; + } + // Check whether character just before the current pos is a new-line + U_ASSERT(fp->fInputIdx <= fAnchorLimit); + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + UChar32 c = UTEXT_PREVIOUS32(fInputText); + if (c != 0x0a) { + // Not at the start of a line. Back-track out. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + case URX_BACKSLASH_B: // Test for word boundaries + { + UBool success = isWordBoundary(fp->fInputIdx); + success ^= (opValue != 0); // flip sense for \B + if (!success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + + case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style + { + UBool success = isUWordBoundary(fp->fInputIdx); + success ^= (opValue != 0); // flip sense for \B + if (!success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + + case URX_BACKSLASH_D: // Test for decimal digit + { + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + + UChar32 c = UTEXT_NEXT32(fInputText); + int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. + UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); + success ^= (opValue != 0); // flip sense for \D + if (success) { + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + } else { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + + case URX_BACKSLASH_G: // Test for position at end of previous match + if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + break; + + + case URX_BACKSLASH_X: + // Match a Grapheme, as defined by Unicode TR 29. + // Differs slightly from Perl, which consumes combining marks independently + // of context. + { + + // Fail if at end of input + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + + // Examine (and consume) the current char. + // Dispatch into a little state machine, based on the char. + UChar32 c; + c = UTEXT_NEXT32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + UnicodeSet **sets = fPattern->fStaticSets; + if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; + if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; + if (sets[URX_GC_L]->contains(c)) goto GC_L; + if (sets[URX_GC_LV]->contains(c)) goto GC_V; + if (sets[URX_GC_LVT]->contains(c)) goto GC_T; + if (sets[URX_GC_V]->contains(c)) goto GC_V; + if (sets[URX_GC_T]->contains(c)) goto GC_T; + goto GC_Extend; + + + +GC_L: + if (fp->fInputIdx >= fActiveLimit) goto GC_Done; + c = UTEXT_NEXT32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + if (sets[URX_GC_L]->contains(c)) goto GC_L; + if (sets[URX_GC_LV]->contains(c)) goto GC_V; + if (sets[URX_GC_LVT]->contains(c)) goto GC_T; + if (sets[URX_GC_V]->contains(c)) goto GC_V; + UTEXT_PREVIOUS32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + goto GC_Extend; + +GC_V: + if (fp->fInputIdx >= fActiveLimit) goto GC_Done; + c = UTEXT_NEXT32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + if (sets[URX_GC_V]->contains(c)) goto GC_V; + if (sets[URX_GC_T]->contains(c)) goto GC_T; + UTEXT_PREVIOUS32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + goto GC_Extend; + +GC_T: + if (fp->fInputIdx >= fActiveLimit) goto GC_Done; + c = UTEXT_NEXT32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + if (sets[URX_GC_T]->contains(c)) goto GC_T; + UTEXT_PREVIOUS32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + goto GC_Extend; + +GC_Extend: + // Combining characters are consumed here + for (;;) { + if (fp->fInputIdx >= fActiveLimit) { + break; + } + c = UTEXT_CURRENT32(fInputText); + if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { + break; + } + UTEXT_NEXT32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + } + goto GC_Done; + +GC_Control: + // Most control chars stand alone (don't combine with combining chars), + // except for that CR/LF sequence is a single grapheme cluster. + if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { + c = UTEXT_NEXT32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + } + +GC_Done: + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + } + break; + } + + + + + case URX_BACKSLASH_Z: // Test for end of Input + if (fp->fInputIdx < fAnchorLimit) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } else { + fHitEnd = TRUE; + fRequireEnd = TRUE; + } + break; + + + + case URX_STATIC_SETREF: + { + // Test input character against one of the predefined sets + // (Word Characters, for example) + // The high bit of the op value is a flag for the match polarity. + // 0: success if input char is in set. + // 1: success if input char is not in set. + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + + UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); + opValue &= ~URX_NEG_SET; + U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); + + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + UChar32 c = UTEXT_NEXT32(fInputText); + if (c < 256) { + Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; + if (s8->contains(c)) { + success = !success; + } + } else { + const UnicodeSet *s = fPattern->fStaticSets[opValue]; + if (s->contains(c)) { + success = !success; + } + } + if (success) { + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + } else { + // the character wasn't in the set. + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + // Try to find it, backwards + UTEXT_PREVIOUS32(fInputText); // skip the first character we tried + success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset + do { + c = UTEXT_PREVIOUS32(fInputText); + if (c == U_SENTINEL) { + break; + } else if (c < 256) { + Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; + if (s8->contains(c)) { + success = !success; + } + } else { + const UnicodeSet *s = fPattern->fStaticSets[opValue]; + if (s->contains(c)) { + success = !success; + } + } + } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex && !success); + + if (success && c != U_SENTINEL) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + + case URX_STAT_SETREF_N: + { + // Test input character for NOT being a member of one of + // the predefined sets (Word Characters, for example) + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + + U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); + + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + + UChar32 c = UTEXT_NEXT32(fInputText); + if (c < 256) { + Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; + if (s8->contains(c) == FALSE) { + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + break; + } + } else { + const UnicodeSet *s = fPattern->fStaticSets[opValue]; + if (s->contains(c) == FALSE) { + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + break; + } + } + // the character wasn't in the set. + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + // Try to find it, backwards + UTEXT_PREVIOUS32(fInputText); // skip the first character we tried + UBool success = FALSE; + do { + c = UTEXT_PREVIOUS32(fInputText); + if (c == U_SENTINEL) { + break; + } else if (c < 256) { + Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; + if (s8->contains(c) == FALSE) { + success = TRUE; + break; + } + } else { + const UnicodeSet *s = fPattern->fStaticSets[opValue]; + if (s->contains(c) == FALSE) { + success = TRUE; + break; + } + } + } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex); + + if (success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + break; + + + case URX_SETREF: + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } else { + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + + // There is input left. Pick up one char and test it for set membership. + UChar32 c = UTEXT_NEXT32(fInputText); + U_ASSERT(opValue > 0 && opValue < sets->size()); + if (c<256) { + Regex8BitSet *s8 = &fPattern->fSets8[opValue]; + if (s8->contains(c)) { + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + break; + } + } else { + UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); + if (s->contains(c)) { + // The character is in the set. A Match. + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + break; + } + } + + // the character wasn't in the set. + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + // Try to find it, backwards + UTEXT_PREVIOUS32(fInputText); // skip the first character we tried + UBool success = FALSE; + do { + c = UTEXT_PREVIOUS32(fInputText); + if (c == U_SENTINEL) { + break; + } else if (c < 256) { + Regex8BitSet *s8 = &fPattern->fSets8[opValue]; + if (s8->contains(c)) { + success = TRUE; + break; + } + } else { + UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); + if (s->contains(c)) { + success = TRUE; + break; + } + } + } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex); + + if (success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + break; + + + case URX_DOTANY: + { + // . matches anything, but stops at end-of-line. + if (fp->fInputIdx >= fActiveLimit) { + // At end of input. Match failed. Backtrack out. + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + + // There is input left. Advance over one char, unless we've hit end-of-line + UChar32 c = UTEXT_NEXT32(fInputText); + if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible + ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { + // End of line in normal mode. . does not match. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + } + break; + + + case URX_DOTANY_ALL: + { + // ., in dot-matches-all (including new lines) mode + if (fp->fInputIdx >= fActiveLimit) { + // At end of input. Match failed. Backtrack out. + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + + // There is input left. Advance over one char, except if we are + // at a cr/lf, advance over both of them. + UChar32 c; + c = UTEXT_NEXT32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + if (c==0x0d && fp->fInputIdx < fActiveLimit) { + // In the case of a CR/LF, we need to advance over both. + UChar32 nextc = UTEXT_CURRENT32(fInputText); + if (nextc == 0x0a) { + UTEXT_NEXT32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + } + } + } + break; + + + case URX_DOTANY_UNIX: + { + // '.' operator, matches all, but stops at end-of-line. + // UNIX_LINES mode, so 0x0a is the only recognized line ending. + if (fp->fInputIdx >= fActiveLimit) { + // At end of input. Match failed. Backtrack out. + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + + // There is input left. Advance over one char, unless we've hit end-of-line + UChar32 c = UTEXT_NEXT32(fInputText); + if (c == 0x0a) { + // End of line in normal mode. '.' does not match the \n + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } else { + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + } + } + break; + + + case URX_JMP: + fp->fPatIdx = opValue; + break; + + case URX_FAIL: + isMatch = FALSE; + goto breakFromLoop; + + case URX_JMP_SAV: + U_ASSERT(opValue < fPattern->fCompiledPat->size()); + fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current + fp->fPatIdx = opValue; // Then JMP. + break; + + case URX_JMP_SAV_X: + // This opcode is used with (x)+, when x can match a zero length string. + // Same as JMP_SAV, except conditional on the match having made forward progress. + // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the + // data address of the input position at the start of the loop. + { + U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()); + int32_t stoOp = (int32_t)pat[opValue-1]; + U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); + int32_t frameLoc = URX_VAL(stoOp); + U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); + int64_t prevInputIdx = fp->fExtra[frameLoc]; + U_ASSERT(prevInputIdx <= fp->fInputIdx); + if (prevInputIdx < fp->fInputIdx) { + // The match did make progress. Repeat the loop. + fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current + fp->fPatIdx = opValue; + fp->fExtra[frameLoc] = fp->fInputIdx; + } + // If the input position did not advance, we do nothing here, + // execution will fall out of the loop. + } + break; + + case URX_CTR_INIT: + { + U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); + fp->fExtra[opValue] = 0; // Set the loop counter variable to zero + + // Pick up the three extra operands that CTR_INIT has, and + // skip the pattern location counter past + int32_t instrOperandLoc = (int32_t)fp->fPatIdx; + fp->fPatIdx += 3; + int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); + int32_t minCount = (int32_t)pat[instrOperandLoc+1]; + int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; + U_ASSERT(minCount>=0); + U_ASSERT(maxCount>=minCount || maxCount==-1); + U_ASSERT(loopLoc>fp->fPatIdx); + + if (minCount == 0) { + fp = StateSave(fp, loopLoc+1, status); + } + if (maxCount == 0) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + case URX_CTR_LOOP: + { + U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); + int32_t initOp = (int32_t)pat[opValue]; + U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); + int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; + int32_t minCount = (int32_t)pat[opValue+2]; + int32_t maxCount = (int32_t)pat[opValue+3]; + // Increment the counter. Note: we DIDN'T worry about counter + // overflow, since the data comes from UnicodeStrings, which + // stores its length in an int32_t. Do we have to think about + // this now that we're using UText? Probably not, since the length + // in UChar32s is still an int32_t. + (*pCounter)++; + U_ASSERT(*pCounter > 0); + if ((uint64_t)*pCounter >= (uint32_t)maxCount) { + U_ASSERT(*pCounter == maxCount || maxCount == -1); + break; + } + if (*pCounter >= minCount) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx = opValue + 4; // Loop back. + } + break; + + case URX_CTR_INIT_NG: + { + // Initialize a non-greedy loop + U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); + fp->fExtra[opValue] = 0; // Set the loop counter variable to zero + + // Pick up the three extra operands that CTR_INIT has, and + // skip the pattern location counter past + int32_t instrOperandLoc = (int32_t)fp->fPatIdx; + fp->fPatIdx += 3; + int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); + int32_t minCount = (int32_t)pat[instrOperandLoc+1]; + int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; + U_ASSERT(minCount>=0); + U_ASSERT(maxCount>=minCount || maxCount==-1); + U_ASSERT(loopLoc>fp->fPatIdx); + + if (minCount == 0) { + if (maxCount != 0) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block + } + } + break; + + case URX_CTR_LOOP_NG: + { + // Non-greedy {min, max} loops + U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); + int32_t initOp = (int32_t)pat[opValue]; + U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); + int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; + int32_t minCount = (int32_t)pat[opValue+2]; + int32_t maxCount = (int32_t)pat[opValue+3]; + // Increment the counter. Note: we DIDN'T worry about counter + // overflow, since the data comes from UnicodeStrings, which + // stores its length in an int32_t. Do we have to think about + // this now that we're using UText? Probably not, since the length + // in UChar32s is still an int32_t. + (*pCounter)++; + U_ASSERT(*pCounter > 0); + + if ((uint64_t)*pCounter >= (uint32_t)maxCount) { + // The loop has matched the maximum permitted number of times. + // Break out of here with no action. Matching will + // continue with the following pattern. + U_ASSERT(*pCounter == maxCount || maxCount == -1); + break; + } + + if (*pCounter < minCount) { + // We haven't met the minimum number of matches yet. + // Loop back for another one. + fp->fPatIdx = opValue + 4; // Loop back. + } else { + // We do have the minimum number of matches. + // Fall into the following pattern, but first do + // a state save to the top of the loop, so that a failure + // in the following pattern will try another iteration of the loop. + fp = StateSave(fp, opValue + 4, status); + } + } + break; + + case URX_STO_SP: + U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); + fData[opValue] = fStack->size(); + break; + + case URX_LD_SP: + { + U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); + int32_t newStackSize = (int32_t)fData[opValue]; + U_ASSERT(newStackSize <= fStack->size()); + int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; + if (newFP == (int64_t *)fp) { + break; + } + int32_t i; + for (i=0; isetSize(newStackSize); + } + break; + + case URX_BACKREF: + case URX_BACKREF_I: + { + U_ASSERT(opValue < fFrameSize); + int64_t groupStartIdx = fp->fExtra[opValue]; + int64_t groupEndIdx = fp->fExtra[opValue+1]; + U_ASSERT(groupStartIdx <= groupEndIdx); + if (groupStartIdx < 0) { + // This capture group has not participated in the match thus far, + fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. + } + + if (groupEndIdx == groupStartIdx) { + // The capture group match was of an empty string. + // Verified by testing: Perl matches succeed in this case, so + // we do too. + break; + } + + UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx); + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + + UBool haveMatch = (opType == URX_BACKREF ? + (0 == utext_compareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1)) : + (0 == utext_caseCompareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1, U_FOLD_CASE_DEFAULT, &status))); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + + if (fp->fInputIdx > fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. + } else if (!haveMatch) { + if (fp->fInputIdx == fActiveLimit) { + fHitEnd = TRUE; + } + fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. + } + } + break; + + case URX_STO_INP_LOC: + { + U_ASSERT(opValue >= 0 && opValue < fFrameSize); + fp->fExtra[opValue] = fp->fInputIdx; + } + break; + + case URX_JMPX: + { + int32_t instrOperandLoc = (int32_t)fp->fPatIdx; + fp->fPatIdx += 1; + int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); + U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); + int64_t savedInputIdx = fp->fExtra[dataLoc]; + U_ASSERT(savedInputIdx <= fp->fInputIdx); + if (savedInputIdx < fp->fInputIdx) { + fp->fPatIdx = opValue; // JMP + } else { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop. + } + } + break; + + case URX_LA_START: + { + // Entering a lookahead block. + // Save Stack Ptr, Input Pos. + U_ASSERT(opValue>=0 && opValue+1fDataSize); + fData[opValue] = fStack->size(); + fData[opValue+1] = fp->fInputIdx; + fActiveStart = fLookStart; // Set the match region change for + fActiveLimit = fLookLimit; // transparent bounds. + } + break; + + case URX_LA_END: + { + // Leaving a look-ahead block. + // restore Stack Ptr, Input Pos to positions they had on entry to block. + U_ASSERT(opValue>=0 && opValue+1fDataSize); + int32_t stackSize = fStack->size(); + int32_t newStackSize =(int32_t)fData[opValue]; + U_ASSERT(stackSize >= newStackSize); + if (stackSize > newStackSize) { + // Copy the current top frame back to the new (cut back) top frame. + // This makes the capture groups from within the look-ahead + // expression available. + int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; + int32_t i; + for (i=0; isetSize(newStackSize); + } + fp->fInputIdx = fData[opValue+1]; + + // Restore the active region bounds in the input string; they may have + // been changed because of transparent bounds on a Region. + fActiveStart = fRegionStart; + fActiveLimit = fRegionLimit; + } + break; + + case URX_ONECHAR_I: + if (fp->fInputIdx < fActiveLimit) { + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + + UChar32 c = UTEXT_NEXT32(fInputText); + if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + break; + } + } else { + fHitEnd = TRUE; + } + + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + UBool success = FALSE; + UChar32 c = UTEXT_PREVIOUS32(fInputText); + while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) { + if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { + success = TRUE; + break; + } else if (c == U_SENTINEL) { + break; + } + c = UTEXT_PREVIOUS32(fInputText); + } + if (success) { + fHitEnd = FALSE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + + case URX_STRING_I: + { + // Test input against a literal string. + // Strings require two slots in the compiled pattern, one for the + // offset to the string text, and one for the length. + const UCaseProps *csp = ucase_getSingleton(); + { + int32_t stringStartIdx, stringLen; + stringStartIdx = opValue; + + op = (int32_t)pat[fp->fPatIdx]; + fp->fPatIdx++; + opType = URX_TYPE(op); + opValue = URX_VAL(op); + U_ASSERT(opType == URX_STRING_LEN); + stringLen = opValue; + + const UChar *patternChars = litText+stringStartIdx; + const UChar *patternEnd = patternChars+stringLen; + + const UChar *foldChars = NULL; + int32_t foldOffset, foldLength; + UChar32 c; + + foldOffset = foldLength = 0; + UBool success = TRUE; + + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + while (patternChars < patternEnd && success) { + if(foldOffset < foldLength) { + U16_NEXT_UNSAFE(foldChars, foldOffset, c); + } else { + c = UTEXT_NEXT32(fInputText); + if (c != U_SENTINEL) { + foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT); + if(foldLength >= 0) { + if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings + foldOffset = 0; + U16_NEXT_UNSAFE(foldChars, foldOffset, c); + } else { + c = foldLength; + foldLength = foldOffset; // to avoid reading chars from the folding buffer + } + } + } + + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + } + + success = FALSE; + if (c != U_SENTINEL && (fp->fInputIdx <= fActiveLimit)) { + if (U_IS_BMP(c)) { + success = (*patternChars == c); + patternChars += 1; + } else if (patternChars+1 < patternEnd) { + success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c)); + patternChars += 2; + } + } else { + fHitEnd = TRUE; // TODO: See ticket 6074 + } + } + + if (!success) { + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size()) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + // Reset to last start point + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + patternChars = litText+stringStartIdx; + + // Search backwards for a possible start + do { + c = UTEXT_PREVIOUS32(fInputText); + if (c == U_SENTINEL) { + break; + } else { + foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT); + if(foldLength >= 0) { + if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings + foldOffset = 0; + U16_NEXT_UNSAFE(foldChars, foldOffset, c); + } else { + c = foldLength; + foldLength = foldOffset; // to avoid reading chars from the folding buffer + } + } + + if ((U_IS_BMP(c) && *patternChars == c) || + (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) { + success = TRUE; + break; + } + } + } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex); + + // And try again + if (success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + } + break; + + case URX_LB_START: + { + // Entering a look-behind block. + // Save Stack Ptr, Input Pos. + // TODO: implement transparent bounds. Ticket #6067 + U_ASSERT(opValue>=0 && opValue+1fDataSize); + fData[opValue] = fStack->size(); + fData[opValue+1] = fp->fInputIdx; + // Init the variable containing the start index for attempted matches. + fData[opValue+2] = -1; + // Save input string length, then reset to pin any matches to end at + // the current position. + fData[opValue+3] = fActiveLimit; + fActiveLimit = fp->fInputIdx; + } + break; + + + case URX_LB_CONT: + { + // Positive Look-Behind, at top of loop checking for matches of LB expression + // at all possible input starting positions. + + // Fetch the min and max possible match lengths. They are the operands + // of this op in the pattern. + int32_t minML = (int32_t)pat[fp->fPatIdx++]; + int32_t maxML = (int32_t)pat[fp->fPatIdx++]; + U_ASSERT(minML <= maxML); + U_ASSERT(minML >= 0); + + // Fetch (from data) the last input index where a match was attempted. + U_ASSERT(opValue>=0 && opValue+1fDataSize); + int64_t *lbStartIdx = &fData[opValue+2]; + if (*lbStartIdx < 0) { + // First time through loop. + *lbStartIdx = fp->fInputIdx - minML; + } else { + // 2nd through nth time through the loop. + // Back up start position for match by one. + if (*lbStartIdx == 0) { + (*lbStartIdx)--; + } else { + UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx); + UTEXT_PREVIOUS32(fInputText); + *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); + } + } + + if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { + // We have tried all potential match starting points without + // getting a match. Backtrack out, and out of the + // Look Behind altogether. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + int64_t restoreInputLen = fData[opValue+3]; + U_ASSERT(restoreInputLen >= fActiveLimit); + U_ASSERT(restoreInputLen <= fInputLength); + fActiveLimit = restoreInputLen; + break; + } + + // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. + // (successful match will fall off the end of the loop.) + fp = StateSave(fp, fp->fPatIdx-3, status); + fp->fInputIdx = *lbStartIdx; + } + break; + + case URX_LB_END: + // End of a look-behind block, after a successful match. + { + U_ASSERT(opValue>=0 && opValue+1fDataSize); + if (fp->fInputIdx != fActiveLimit) { + // The look-behind expression matched, but the match did not + // extend all the way to the point that we are looking behind from. + // FAIL out of here, which will take us back to the LB_CONT, which + // will retry the match starting at another position or fail + // the look-behind altogether, whichever is appropriate. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + + // Look-behind match is good. Restore the orignal input string length, + // which had been truncated to pin the end of the lookbehind match to the + // position being looked-behind. + int64_t originalInputLen = fData[opValue+3]; + U_ASSERT(originalInputLen >= fActiveLimit); + U_ASSERT(originalInputLen <= fInputLength); + fActiveLimit = originalInputLen; + } + break; + + + case URX_LBN_CONT: + { + // Negative Look-Behind, at top of loop checking for matches of LB expression + // at all possible input starting positions. + + // Fetch the extra parameters of this op. + int32_t minML = (int32_t)pat[fp->fPatIdx++]; + int32_t maxML = (int32_t)pat[fp->fPatIdx++]; + int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; + continueLoc = URX_VAL(continueLoc); + U_ASSERT(minML <= maxML); + U_ASSERT(minML >= 0); + U_ASSERT(continueLoc > fp->fPatIdx); + + // Fetch (from data) the last input index where a match was attempted. + U_ASSERT(opValue>=0 && opValue+1fDataSize); + int64_t *lbStartIdx = &fData[opValue+2]; + if (*lbStartIdx < 0) { + // First time through loop. + *lbStartIdx = fp->fInputIdx - minML; + } else { + // 2nd through nth time through the loop. + // Back up start position for match by one. + if (*lbStartIdx == 0) { + (*lbStartIdx)--; + } else { + UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx); + UTEXT_PREVIOUS32(fInputText); + *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); + } + } + + if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { + // We have tried all potential match starting points without + // getting a match, which means that the negative lookbehind as + // a whole has succeeded. Jump forward to the continue location + int64_t restoreInputLen = fData[opValue+3]; + U_ASSERT(restoreInputLen >= fActiveLimit); + U_ASSERT(restoreInputLen <= fInputLength); + fActiveLimit = restoreInputLen; + fp->fPatIdx = continueLoc; + break; + } + + // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. + // (successful match will cause a FAIL out of the loop altogether.) + fp = StateSave(fp, fp->fPatIdx-4, status); + fp->fInputIdx = *lbStartIdx; + } + break; + + case URX_LBN_END: + // End of a negative look-behind block, after a successful match. + { + U_ASSERT(opValue>=0 && opValue+1fDataSize); + if (fp->fInputIdx != fActiveLimit) { + // The look-behind expression matched, but the match did not + // extend all the way to the point that we are looking behind from. + // FAIL out of here, which will take us back to the LB_CONT, which + // will retry the match starting at another position or succeed + // the look-behind altogether, whichever is appropriate. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + + // Look-behind expression matched, which means look-behind test as + // a whole Fails + + // Restore the orignal input string length, which had been truncated + // inorder to pin the end of the lookbehind match + // to the position being looked-behind. + int64_t originalInputLen = fData[opValue+3]; + U_ASSERT(originalInputLen >= fActiveLimit); + U_ASSERT(originalInputLen <= fInputLength); + fActiveLimit = originalInputLen; + + // Restore original stack position, discarding any state saved + // by the successful pattern match. + U_ASSERT(opValue>=0 && opValue+1fDataSize); + int32_t newStackSize = (int32_t)fData[opValue]; + U_ASSERT(fStack->size() > newStackSize); + fStack->setSize(newStackSize); + + // FAIL, which will take control back to someplace + // prior to entering the look-behind test. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + break; + + + case URX_LOOP_SR_I: + // Loop Initialization for the optimized implementation of + // [some character set]* + // This op scans through all matching input. + // The following LOOP_C op emulates stack unwinding if the following pattern fails. + { + U_ASSERT(opValue > 0 && opValue < sets->size()); + Regex8BitSet *s8 = &fPattern->fSets8[opValue]; + UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); + + // Loop through input, until either the input is exhausted or + // we reach a character that is not a member of the set. + int64_t ix = fp->fInputIdx; + UTEXT_SETNATIVEINDEX(fInputText, ix); + for (;;) { + if (ix >= fActiveLimit) { + fHitEnd = TRUE; + break; + } + UChar32 c = UTEXT_NEXT32(fInputText); + if (c<256) { + if (s8->contains(c) == FALSE) { + break; + } + } else { + if (s->contains(c) == FALSE) { + break; + } + } + ix = UTEXT_GETNATIVEINDEX(fInputText); + } + + // If there were no matching characters, skip over the loop altogether. + // The loop doesn't run at all, a * op always succeeds. + if (ix == fp->fInputIdx) { + fp->fPatIdx++; // skip the URX_LOOP_C op. + break; + } + + // Peek ahead in the compiled pattern, to the URX_LOOP_C that + // must follow. It's operand is the stack location + // that holds the starting input index for the match of this [set]* + int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; + U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); + int32_t stackLoc = URX_VAL(loopcOp); + U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); + fp->fExtra[stackLoc] = fp->fInputIdx; + #ifdef REGEX_SMART_BACKTRACKING + backSearchIndex = fp->fInputIdx; + #endif + fp->fInputIdx = ix; + + // Save State to the URX_LOOP_C op that follows this one, + // so that match failures in the following code will return to there. + // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. + fp = StateSave(fp, fp->fPatIdx, status); + fp->fPatIdx++; + } + break; + + + case URX_LOOP_DOT_I: + // Loop Initialization for the optimized implementation of .* + // This op scans through all remaining input. + // The following LOOP_C op emulates stack unwinding if the following pattern fails. + { + // Loop through input until the input is exhausted (we reach an end-of-line) + // In DOTALL mode, we can just go straight to the end of the input. + int64_t ix; + if ((opValue & 1) == 1) { + // Dot-matches-All mode. Jump straight to the end of the string. + ix = fActiveLimit; + fHitEnd = TRUE; + } else { + // NOT DOT ALL mode. Line endings do not match '.' + // Scan forward until a line ending or end of input. + ix = fp->fInputIdx; + UTEXT_SETNATIVEINDEX(fInputText, ix); + for (;;) { + if (ix >= fActiveLimit) { + fHitEnd = TRUE; + break; + } + UChar32 c = UTEXT_NEXT32(fInputText); + if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s + if ((c == 0x0a) || // 0x0a is newline in both modes. + (((opValue & 2) == 0) && // IF not UNIX_LINES mode + (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) { + // char is a line ending. Exit the scanning loop. + break; + } + } + ix = UTEXT_GETNATIVEINDEX(fInputText); + } + } + + // If there were no matching characters, skip over the loop altogether. + // The loop doesn't run at all, a * op always succeeds. + if (ix == fp->fInputIdx) { + fp->fPatIdx++; // skip the URX_LOOP_C op. + break; + } + + // Peek ahead in the compiled pattern, to the URX_LOOP_C that + // must follow. It's operand is the stack location + // that holds the starting input index for the match of this .* + int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; + U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); + int32_t stackLoc = URX_VAL(loopcOp); + U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); + fp->fExtra[stackLoc] = fp->fInputIdx; + #ifdef REGEX_SMART_BACKTRACKING + backSearchIndex = fp->fInputIdx; + #endif + fp->fInputIdx = ix; + + // Save State to the URX_LOOP_C op that follows this one, + // so that match failures in the following code will return to there. + // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. + fp = StateSave(fp, fp->fPatIdx, status); + fp->fPatIdx++; + } + break; + + + case URX_LOOP_C: + { + U_ASSERT(opValue>=0 && opValuefExtra[opValue]; + U_ASSERT(backSearchIndex <= fp->fInputIdx); + if (backSearchIndex == fp->fInputIdx) { + // We've backed up the input idx to the point that the loop started. + // The loop is done. Leave here without saving state. + // Subsequent failures won't come back here. + break; + } + // Set up for the next iteration of the loop, with input index + // backed up by one from the last time through, + // and a state save to this instruction in case the following code fails again. + // (We're going backwards because this loop emulates stack unwinding, not + // the initial scan forward.) + U_ASSERT(fp->fInputIdx > 0); + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + UChar32 prevC = UTEXT_PREVIOUS32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + + UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText); + if (prevC == 0x0a && + fp->fInputIdx > backSearchIndex && + twoPrevC == 0x0d) { + int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; + if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { + // .*, stepping back over CRLF pair. + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + } + } + + + fp = StateSave(fp, fp->fPatIdx-1, status); + } + break; + + + + default: + // Trouble. The compiled pattern contains an entry with an + // unrecognized type tag. + U_ASSERT(FALSE); + } + + if (U_FAILURE(status)) { + isMatch = FALSE; break; } } - fp->fPatIdx = savePatIdx; - return (REStackFrame *)newFP; +breakFromLoop: + fMatch = isMatch; + if (isMatch) { + fLastMatchEnd = fMatchEnd; + fMatchStart = startIdx; + fMatchEnd = fp->fInputIdx; + if (fTraceDebug) { + REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd)); + } + } + else + { + if (fTraceDebug) { + REGEX_RUN_DEBUG_PRINTF(("No match\n\n")); + } + } + + fFrame = fp; // The active stack frame when the engine stopped. + // Contains the capture group results that we need to + // access later. + return; } - - + + //-------------------------------------------------------------------------------- // -// MatchAt This is the actual matching engine. +// MatchChunkAt This is the actual matching engine. Like MatchAt, but with the +// assumption that the entire string is available in the UText's +// chunk buffer. For now, that means we can use int32_t indexes, +// except for anything that needs to be saved (like group starts +// and ends). +// +// startIdx: begin matching a this index. +// toEnd: if true, match must extend to end of the input region // //-------------------------------------------------------------------------------- -void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) { +void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { UBool isMatch = FALSE; // True if the we have a match. + + int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards int32_t op; // Operation from the compiled pattern, split into int32_t opType; // the opcode int32_t opValue; // and the operand value. - - #ifdef REGEX_RUN_DEBUG + +#ifdef REGEX_RUN_DEBUG if (fTraceDebug) { - printf("MatchAt(startIdx=%d)\n", startIdx); + printf("MatchAt(startIdx=%ld)\n", startIdx); printf("Original Pattern: "); - int i; - for (i=0; ifPattern.length(); i++) { - printf("%c", fPattern->fPattern.charAt(i)); + UChar32 c = utext_next32From(fPattern->fPattern, 0); + while (c != U_SENTINEL) { + if (c<32 || c>256) { + c = '.'; + } + REGEX_DUMP_DEBUG_PRINTF(("%c", c)); + + c = UTEXT_NEXT32(fPattern->fPattern); } printf("\n"); printf("Input String: "); - for (i=0; ilength(); i++) { - UChar c = fInput->charAt(i); + c = utext_next32From(fInputText, 0); + while (c != U_SENTINEL) { if (c<32 || c>256) { c = '.'; } printf("%c", c); + + c = UTEXT_NEXT32(fInputText); } printf("\n"); printf("\n"); } - #endif - +#endif + if (U_FAILURE(status)) { return; } - + // Cache frequently referenced items from the compiled pattern - // in local variables. // - int32_t *pat = fPattern->fCompiledPat->getBuffer(); - + int64_t *pat = fPattern->fCompiledPat->getBuffer(); + const UChar *litText = fPattern->fLiteralText.getBuffer(); UVector *sets = fPattern->fSets; - int32_t inputLen = fInput->length(); - const UChar *inputBuf = fInput->getBuffer(); - + + const UChar *inputBuf = fInputText->chunkContents; + + fFrameSize = fPattern->fFrameSize; REStackFrame *fp = resetStack(); - int32_t frameSize = fPattern->fFrameSize; - + fp->fPatIdx = 0; fp->fInputIdx = startIdx; - + // Zero out the pattern's static data int32_t i; for (i = 0; ifDataSize; i++) { fData[i] = 0; } - + // // Main loop for interpreting the compiled pattern. // One iteration of the loop per pattern operation performed. @@ -948,45 +4482,76 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) { fprintf(stderr, "Heap Trouble\n"); } #endif - op = pat[fp->fPatIdx]; + + op = (int32_t)pat[fp->fPatIdx]; opType = URX_TYPE(op); opValue = URX_VAL(op); - #ifdef REGEX_RUN_DEBUG +#ifdef REGEX_RUN_DEBUG if (fTraceDebug) { - printf("inputIdx=%d inputChar=%c sp=%3d ", fp->fInputIdx, - fInput->char32At(fp->fInputIdx), (int32_t *)fp-fStack->getBuffer()); + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx, + UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); fPattern->dumpOp(fp->fPatIdx); } - #endif +#endif fp->fPatIdx++; - + switch (opType) { - - + + case URX_NOP: break; - - + + case URX_BACKTRACK: // Force a backtrack. In some circumstances, the pattern compiler // will notice that the pattern can't possibly match anything, and will // emit one of these at that point. - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; - - + + case URX_ONECHAR: - if (fp->fInputIdx < inputLen) { - UChar32 c; - U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c); - if (c == opValue) { + if (fp->fInputIdx < fActiveLimit) { + UChar32 c; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + if (c == opValue) { break; } + } else { + fHitEnd = TRUE; } - fp = (REStackFrame *)fStack->popFrame(frameSize); - break; + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + int64_t reverseIndex = fp->fInputIdx; + UChar32 c; + do { + U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); + if (c == opValue) { + break; + } + } while (reverseIndex > backSearchIndex); + if (c == opValue) { + fHitEnd = FALSE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = reverseIndex; + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + + case URX_STRING: { // Test input against a literal string. @@ -994,281 +4559,413 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) { // offset to the string text, and one for the length. int32_t stringStartIdx = opValue; int32_t stringLen; - - op = pat[fp->fPatIdx]; // Fetch the second operand + + op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand fp->fPatIdx++; opType = URX_TYPE(op); stringLen = URX_VAL(op); U_ASSERT(opType == URX_STRING_LEN); U_ASSERT(stringLen >= 2); - - if (fp->fInputIdx + stringLen > inputLen) { + + if (fp->fInputIdx + stringLen > fActiveLimit) { // No match. String is longer than the remaining input text. - fp = (REStackFrame *)fStack->popFrame(frameSize); + fHitEnd = TRUE; // TODO: See ticket 6074 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } - + const UChar * pInp = inputBuf + fp->fInputIdx; const UChar * pPat = litText+stringStartIdx; const UChar * pEnd = pInp + stringLen; + UBool success = FALSE; for(;;) { if (*pInp == *pPat) { pInp++; pPat++; if (pInp == pEnd) { // Successful Match. - fp->fInputIdx += stringLen; + success = TRUE; break; } } else { // Match failed. - fp = (REStackFrame *)fStack->popFrame(frameSize); break; } } - break; - + + if (success) { + fp->fInputIdx += stringLen; + } else { + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size()) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + // Reset to last start point + int64_t reverseIndex = fp->fInputIdx; + UChar32 c; + pPat = litText+stringStartIdx; + + // Search backwards for a possible start + do { + U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); + if ((U_IS_BMP(c) && *pPat == c) || + (*pPat == U16_LEAD(c) && *(pPat+1) == U16_TRAIL(c))) { + success = TRUE; + break; + } + } while (reverseIndex > backSearchIndex); + + // And try again + if (success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = reverseIndex; + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } } break; - - - + + case URX_STATE_SAVE: - fp = StateSave(fp, opValue, frameSize, status); + fp = StateSave(fp, opValue, status); break; - - + + case URX_END: // The match loop will exit via this path on a successful match, // when we reach the end of the pattern. + if (toEnd && fp->fInputIdx != fActiveLimit) { + // The pattern matched, but not to the end of input. Try some more. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } isMatch = TRUE; goto breakFromLoop; - - // Start and End Capture stack frame variables are layout out like this: + + // Start and End Capture stack frame variables are laid out out like this: // fp->fExtra[opValue] - The start of a completed capture group // opValue+1 - The end of a completed capture group // opValue+2 - the start of a capture group whose end // has not yet been reached (and might not ever be). case URX_START_CAPTURE: - U_ASSERT(opValue >= 0 && opValue < frameSize-3); + U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); fp->fExtra[opValue+2] = fp->fInputIdx; break; - - + + case URX_END_CAPTURE: - U_ASSERT(opValue >= 0 && opValue < frameSize-3); + U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. fp->fExtra[opValue+1] = fp->fInputIdx; // End position U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); break; - + case URX_DOLLAR: // $, test for End of line - // or for position before new line at end of input - if (fp->fInputIdx < inputLen-2) { + // or for position before new line at end of input + if (fp->fInputIdx < fAnchorLimit-2) { // We are no where near the end of input. Fail. - fp = (REStackFrame *)fStack->popFrame(frameSize); + // This is the common case. Keep it first. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } - if (fp->fInputIdx >= inputLen) { + if (fp->fInputIdx >= fAnchorLimit) { // We really are at the end of input. Success. + fHitEnd = TRUE; + fRequireEnd = TRUE; break; } + // If we are positioned just before a new-line that is located at the // end of input, succeed. - if (fp->fInputIdx == inputLen-1) { - UChar32 c = fInput->char32At(fp->fInputIdx); - if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) { - break; // At new-line at end of input. Success + if (fp->fInputIdx == fAnchorLimit-1) { + UChar32 c; + U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c); + + if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { + if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { + // At new-line at end of input. Success + fHitEnd = TRUE; + fRequireEnd = TRUE; + break; + } } - } - - if (fp->fInputIdx == inputLen-2) { - if (fInput->char32At(fp->fInputIdx) == 0x0d && fInput->char32At(fp->fInputIdx+1) == 0x0a) { + } else if (fp->fInputIdx == fAnchorLimit-2 && + inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) { + fHitEnd = TRUE; + fRequireEnd = TRUE; break; // At CR/LF at end of input. Success + } + + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + + break; + + + case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. + if (fp->fInputIdx >= fAnchorLimit-1) { + // Either at the last character of input, or off the end. + if (fp->fInputIdx == fAnchorLimit-1) { + // At last char of input. Success if it's a new line. + if (inputBuf[fp->fInputIdx] == 0x0a) { + fHitEnd = TRUE; + fRequireEnd = TRUE; + break; + } + } else { + // Off the end of input. Success. + fHitEnd = TRUE; + fRequireEnd = TRUE; + break; } } - - fp = (REStackFrame *)fStack->popFrame(frameSize); - + + // Not at end of input. Back-track out. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; - - - case URX_DOLLAR_M: // $, test for End of line in multi-line mode - { - if (fp->fInputIdx >= inputLen) { - // We really are at the end of input. Success. - break; - } - // If we are positioned just before a new-line , succeed. - // It makes no difference where the new-line is within the input. - UChar32 c = inputBuf[fp->fInputIdx]; - if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) { - break; // At new-line at end of input. Success - } - // not at a new line. Fail. - fp = (REStackFrame *)fStack->popFrame(frameSize); - } - break; - - - case URX_CARET: // ^, test for start of line - if (fp->fInputIdx != 0) { - fp = (REStackFrame *)fStack->popFrame(frameSize); - } + + + case URX_DOLLAR_M: // $, test for End of line in multi-line mode + { + if (fp->fInputIdx >= fAnchorLimit) { + // We really are at the end of input. Success. + fHitEnd = TRUE; + fRequireEnd = TRUE; + break; + } + // If we are positioned just before a new-line, succeed. + // It makes no difference where the new-line is within the input. + UChar32 c = inputBuf[fp->fInputIdx]; + if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { + // At a line end, except for the odd chance of being in the middle of a CR/LF sequence + // In multi-line mode, hitting a new-line just before the end of input does not + // set the hitEnd or requireEnd flags + if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { + break; + } + } + // not at a new line. Fail. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } break; - - - case URX_CARET_M: // ^, test for start of line in mulit-line mode - { - if (fp->fInputIdx == 0) { - // We are at the start input. Success. - break; - } - // Check whether character just before the current pos is a new-line - // unless we are at the end of input - UChar c = inputBuf[fp->fInputIdx - 1]; - if ((fp->fInputIdx < inputLen) && - (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029)) { - // It's a new-line. ^ is true. Success. - break; - } - // Not at the start of a line. Fail. - fp = (REStackFrame *)fStack->popFrame(frameSize); - } - break; - - + + + case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode + { + if (fp->fInputIdx >= fAnchorLimit) { + // We really are at the end of input. Success. + fHitEnd = TRUE; + fRequireEnd = TRUE; // Java set requireEnd in this case, even though + break; // adding a new-line would not lose the match. + } + // If we are not positioned just before a new-line, the test fails; backtrack out. + // It makes no difference where the new-line is within the input. + if (inputBuf[fp->fInputIdx] != 0x0a) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + + case URX_CARET: // ^, test for start of line + if (fp->fInputIdx != fAnchorStart) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + break; + + + case URX_CARET_M: // ^, test for start of line in mulit-line mode + { + if (fp->fInputIdx == fAnchorStart) { + // We are at the start input. Success. + break; + } + // Check whether character just before the current pos is a new-line + // unless we are at the end of input + UChar c = inputBuf[fp->fInputIdx - 1]; + if ((fp->fInputIdx < fAnchorLimit) && + ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { + // It's a new-line. ^ is true. Success. + // TODO: what should be done with positions between a CR and LF? + break; + } + // Not at the start of a line. Fail. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + break; + + + case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode + { + U_ASSERT(fp->fInputIdx >= fAnchorStart); + if (fp->fInputIdx <= fAnchorStart) { + // We are at the start input. Success. + break; + } + // Check whether character just before the current pos is a new-line + U_ASSERT(fp->fInputIdx <= fAnchorLimit); + UChar c = inputBuf[fp->fInputIdx - 1]; + if (c != 0x0a) { + // Not at the start of a line. Back-track out. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + case URX_BACKSLASH_B: // Test for word boundaries { - UBool success = isWordBoundary(fp->fInputIdx); + UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx); success ^= (opValue != 0); // flip sense for \B if (!success) { - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; - - + + + case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style + { + UBool success = isUWordBoundary(fp->fInputIdx); + success ^= (opValue != 0); // flip sense for \B + if (!success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + case URX_BACKSLASH_D: // Test for decimal digit { - if (fp->fInputIdx >= inputLen) { - fp = (REStackFrame *)fStack->popFrame(frameSize); + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } - - UChar32 c = fInput->char32At(fp->fInputIdx); - int8_t ctype = u_charType(c); + + UChar32 c; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); success ^= (opValue != 0); // flip sense for \D - if (success) { - fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1); - } else { - fp = (REStackFrame *)fStack->popFrame(frameSize); + if (!success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; - - - - + + case URX_BACKSLASH_G: // Test for position at end of previous match - if (!((fMatch && fp->fInputIdx==fMatchEnd) || fMatch==FALSE && fp->fInputIdx==0)) { - fp = (REStackFrame *)fStack->popFrame(frameSize); + if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; - - + + case URX_BACKSLASH_X: - // Match a Grapheme, as defined by Unicode TR 29. - // Differs slightly from Perl, which consumes combining marks independently - // of context. - { + // Match a Grapheme, as defined by Unicode TR 29. + // Differs slightly from Perl, which consumes combining marks independently + // of context. + { - // Fail if at end of input - if (fp->fInputIdx >= inputLen) { - fp = (REStackFrame *)fStack->popFrame(frameSize); - break; - } + // Fail if at end of input + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } - // Examine (and consume) the current char. - // Dispatch into a little state machine, based on the char. - UChar32 c; - U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c); - UnicodeSet **sets = fPattern->fStaticSets; - if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; - if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; - if (sets[URX_GC_L]->contains(c)) goto GC_L; - if (sets[URX_GC_LV]->contains(c)) goto GC_V; - if (sets[URX_GC_LVT]->contains(c)) goto GC_T; - if (sets[URX_GC_V]->contains(c)) goto GC_V; - if (sets[URX_GC_T]->contains(c)) goto GC_T; - goto GC_Extend; + // Examine (and consume) the current char. + // Dispatch into a little state machine, based on the char. + UChar32 c; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + UnicodeSet **sets = fPattern->fStaticSets; + if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; + if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; + if (sets[URX_GC_L]->contains(c)) goto GC_L; + if (sets[URX_GC_LV]->contains(c)) goto GC_V; + if (sets[URX_GC_LVT]->contains(c)) goto GC_T; + if (sets[URX_GC_V]->contains(c)) goto GC_V; + if (sets[URX_GC_T]->contains(c)) goto GC_T; + goto GC_Extend; GC_L: - if (fp->fInputIdx >= inputLen) goto GC_Done; - U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c); - if (sets[URX_GC_L]->contains(c)) goto GC_L; - if (sets[URX_GC_LV]->contains(c)) goto GC_V; - if (sets[URX_GC_LVT]->contains(c)) goto GC_T; - if (sets[URX_GC_V]->contains(c)) goto GC_V; - U16_PREV(inputBuf, 0, fp->fInputIdx, c); - goto GC_Extend; + if (fp->fInputIdx >= fActiveLimit) goto GC_Done; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + if (sets[URX_GC_L]->contains(c)) goto GC_L; + if (sets[URX_GC_LV]->contains(c)) goto GC_V; + if (sets[URX_GC_LVT]->contains(c)) goto GC_T; + if (sets[URX_GC_V]->contains(c)) goto GC_V; + U16_PREV(inputBuf, 0, fp->fInputIdx, c); + goto GC_Extend; GC_V: - if (fp->fInputIdx >= inputLen) goto GC_Done; - U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c); - if (sets[URX_GC_V]->contains(c)) goto GC_V; - if (sets[URX_GC_T]->contains(c)) goto GC_T; - U16_PREV(inputBuf, 0, fp->fInputIdx, c); - goto GC_Extend; + if (fp->fInputIdx >= fActiveLimit) goto GC_Done; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + if (sets[URX_GC_V]->contains(c)) goto GC_V; + if (sets[URX_GC_T]->contains(c)) goto GC_T; + U16_PREV(inputBuf, 0, fp->fInputIdx, c); + goto GC_Extend; GC_T: - if (fp->fInputIdx >= inputLen) goto GC_Done; - U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c); - if (sets[URX_GC_T]->contains(c)) goto GC_T; - U16_PREV(inputBuf, 0, fp->fInputIdx, c); - goto GC_Extend; + if (fp->fInputIdx >= fActiveLimit) goto GC_Done; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + if (sets[URX_GC_T]->contains(c)) goto GC_T; + U16_PREV(inputBuf, 0, fp->fInputIdx, c); + goto GC_Extend; -GC_Extend: - // Combining characters are consumed here - for (;;) { - if (fp->fInputIdx >= inputLen) { - break; - } - U16_GET(inputBuf, 0, fp->fInputIdx, inputLen, c); - if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { - break; - } - U16_FWD_1(inputBuf, fp->fInputIdx, inputLen); +GC_Extend: + // Combining characters are consumed here + for (;;) { + if (fp->fInputIdx >= fActiveLimit) { + break; } - goto GC_Done; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { + U16_BACK_1(inputBuf, 0, fp->fInputIdx); + break; + } + } + goto GC_Done; GC_Control: - // Most control chars stand alone (don't combine with combining chars), - // except for that CR/LF sequence is a single grapheme cluster. - if (c == 0x0d && fp->fInputIdx < inputLen && inputBuf[fp->fInputIdx] == 0x0a) { - fp->fInputIdx++; - } + // Most control chars stand alone (don't combine with combining chars), + // except for that CR/LF sequence is a single grapheme cluster. + if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) { + fp->fInputIdx++; + } GC_Done: - break; + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; } + break; + } - - - - case URX_BACKSLASH_Z: // Test for end of line - if (fp->fInputIdx < inputLen) { - fp = (REStackFrame *)fStack->popFrame(frameSize); + + + + case URX_BACKSLASH_Z: // Test for end of Input + if (fp->fInputIdx < fAnchorLimit) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } else { + fHitEnd = TRUE; + fRequireEnd = TRUE; } break; - - - + + + case URX_STATIC_SETREF: { // Test input character against one of the predefined sets @@ -1276,16 +4973,18 @@ GC_Done: // The high bit of the op value is a flag for the match polarity. // 0: success if input char is in set. // 1: success if input char is not in set. - if (fp->fInputIdx >= inputLen) { - fp = (REStackFrame *)fStack->popFrame(frameSize); + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } - + UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); opValue &= ~URX_NEG_SET; U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); - UChar32 c; - U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c); + + UChar32 c; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); if (c < 256) { Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; if (s8->contains(c)) { @@ -1298,24 +4997,61 @@ GC_Done: } } if (!success) { - fp = (REStackFrame *)fStack->popFrame(frameSize); + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + // Try to find it, backwards + int64_t reverseIndex = fp->fInputIdx; + U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried + success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset + do { + U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); + if (c < 256) { + Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; + if (s8->contains(c)) { + success = !success; + } + } else { + const UnicodeSet *s = fPattern->fStaticSets[opValue]; + if (s->contains(c)) { + success = !success; + } + } + } while (reverseIndex > backSearchIndex && !success); + + if (success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = reverseIndex; + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; - + case URX_STAT_SETREF_N: { // Test input character for NOT being a member of one of // the predefined sets (Word Characters, for example) - if (fp->fInputIdx >= inputLen) { - fp = (REStackFrame *)fStack->popFrame(frameSize); + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } - + U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); + UChar32 c; - U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c); + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); if (c < 256) { Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; if (s8->contains(c) == FALSE) { @@ -1328,52 +5064,135 @@ GC_Done: } } - fp = (REStackFrame *)fStack->popFrame(frameSize); + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + // Try to find it, backwards + int64_t reverseIndex = fp->fInputIdx; + U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried + UBool success = FALSE; + do { + U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); + if (c < 256) { + Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; + if (s8->contains(c) == FALSE) { + success = TRUE; + break; + } + } else { + const UnicodeSet *s = fPattern->fStaticSets[opValue]; + if (s->contains(c) == FALSE) { + success = TRUE; + break; + } + } + } while (reverseIndex > backSearchIndex); + + if (success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = reverseIndex; + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; - + case URX_SETREF: - if (fp->fInputIdx < inputLen) { - // There is input left. Pick up one char and test it for set membership. - UChar32 c; - U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c); + { + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + U_ASSERT(opValue > 0 && opValue < sets->size()); + + // There is input left. Pick up one char and test it for set membership. + UChar32 c; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); if (c<256) { Regex8BitSet *s8 = &fPattern->fSets8[opValue]; if (s8->contains(c)) { + // The character is in the set. A Match. break; } } else { - UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); if (s->contains(c)) { // The character is in the set. A Match. break; } } + + // the character wasn't in the set. + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + // Try to find it, backwards + int64_t reverseIndex = fp->fInputIdx; + U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried + UBool success = FALSE; + do { + U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); + if (c < 256) { + Regex8BitSet *s8 = &fPattern->fSets8[opValue]; + if (s8->contains(c)) { + success = TRUE; + break; + } + } else { + UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); + if (s->contains(c)) { + success = TRUE; + break; + } + } + } while (reverseIndex > backSearchIndex); + + if (success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = reverseIndex; + if (fp->fInputIdx > reverseIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); } - // Either at end of input, or the character wasn't in the set. - // Either way, we need to back track out. - fp = (REStackFrame *)fStack->popFrame(frameSize); break; - + case URX_DOTANY: { // . matches anything, but stops at end-of-line. - if (fp->fInputIdx >= inputLen) { + if (fp->fInputIdx >= fActiveLimit) { // At end of input. Match failed. Backtrack out. - fp = (REStackFrame *)fStack->popFrame(frameSize); + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } + // There is input left. Advance over one char, unless we've hit end-of-line - UChar32 c; - U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c); + UChar32 c; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible - (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029)) { + ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { // End of line in normal mode. . does not match. - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } } @@ -1382,85 +5201,64 @@ GC_Done: case URX_DOTANY_ALL: { - // ., in dot-matches-all (including new lines) mode - if (fp->fInputIdx >= inputLen) { + // . in dot-matches-all (including new lines) mode + if (fp->fInputIdx >= fActiveLimit) { // At end of input. Match failed. Backtrack out. - fp = (REStackFrame *)fStack->popFrame(frameSize); + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } + // There is input left. Advance over one char, except if we are // at a cr/lf, advance over both of them. UChar32 c; - U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c); - if (c==0x0d) { + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + if (c==0x0d && fp->fInputIdx < fActiveLimit) { // In the case of a CR/LF, we need to advance over both. - UChar nextc = inputBuf[fp->fInputIdx]; - if (nextc == 0x0a) { - fp->fInputIdx++; + if (inputBuf[fp->fInputIdx] == 0x0a) { + U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit); } } } break; - - case URX_DOTANY_PL: - // Match all up to and end-of-line or end-of-input. + + + case URX_DOTANY_UNIX: { - // Fail if input already exhausted. - if (fp->fInputIdx >= inputLen) { - fp = (REStackFrame *)fStack->popFrame(frameSize); - break; - } - - // There is input left. Fail if we are at the end of a line. - UChar32 c; - U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c); - if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible - (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029)) { - // End of line in normal mode. . does not match. - fp = (REStackFrame *)fStack->popFrame(frameSize); + // '.' operator, matches all, but stops at end-of-line. + // UNIX_LINES mode, so 0x0a is the only recognized line ending. + if (fp->fInputIdx >= fActiveLimit) { + // At end of input. Match failed. Backtrack out. + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } - // There was input left. Consume it until we hit the end of a line, - // or until it's exhausted. - while (fp->fInputIdx < inputLen) { - U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c); - if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible - (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029)) { - U16_BACK_1(inputBuf, 0, fp->fInputIdx) - // Scan has reached a line-end. We are done. - break; - } - } - } - break; - - case URX_DOTANY_ALL_PL: - { - // Match up to end of input. Fail if already at end of input. - if (fp->fInputIdx >= inputLen) { - fp = (REStackFrame *)fStack->popFrame(frameSize); - } else { - fp->fInputIdx = inputLen; + // There is input left. Advance over one char, unless we've hit end-of-line + UChar32 c; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + if (c == 0x0a) { + // End of line in normal mode. '.' does not match the \n + fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; - - + + case URX_JMP: fp->fPatIdx = opValue; break; - + case URX_FAIL: isMatch = FALSE; goto breakFromLoop; - + case URX_JMP_SAV: U_ASSERT(opValue < fPattern->fCompiledPat->size()); - fp = StateSave(fp, fp->fPatIdx, frameSize, status); // State save to loc following current - fp->fPatIdx = opValue; // Then JMP. + fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current + fp->fPatIdx = opValue; // Then JMP. break; - + case URX_JMP_SAV_X: // This opcode is used with (x)+, when x can match a zero length string. // Same as JMP_SAV, except conditional on the match having made forward progress. @@ -1468,15 +5266,15 @@ GC_Done: // data address of the input position at the start of the loop. { U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()); - int32_t stoOp = pat[opValue-1]; + int32_t stoOp = (int32_t)pat[opValue-1]; U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); int32_t frameLoc = URX_VAL(stoOp); - U_ASSERT(frameLoc >= 0 && frameLoc < frameSize); - int32_t prevInputIdx = fp->fExtra[frameLoc]; + U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); + int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc]; U_ASSERT(prevInputIdx <= fp->fInputIdx); if (prevInputIdx < fp->fInputIdx) { // The match did make progress. Repeat the loop. - fp = StateSave(fp, fp->fPatIdx, frameSize, status); // State save to loc following current + fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current fp->fPatIdx = opValue; fp->fExtra[frameLoc] = fp->fInputIdx; } @@ -1484,103 +5282,109 @@ GC_Done: // execution will fall out of the loop. } break; - + case URX_CTR_INIT: { - U_ASSERT(opValue >= 0 && opValue < frameSize-2); + U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); fp->fExtra[opValue] = 0; // Set the loop counter variable to zero - + // Pick up the three extra operands that CTR_INIT has, and // skip the pattern location counter past - int32_t instrOperandLoc = fp->fPatIdx; + int32_t instrOperandLoc = (int32_t)fp->fPatIdx; fp->fPatIdx += 3; int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); - int32_t minCount = pat[instrOperandLoc+1]; - int32_t maxCount = pat[instrOperandLoc+2]; + int32_t minCount = (int32_t)pat[instrOperandLoc+1]; + int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; U_ASSERT(minCount>=0); U_ASSERT(maxCount>=minCount || maxCount==-1); U_ASSERT(loopLoc>fp->fPatIdx); - + if (minCount == 0) { - fp = StateSave(fp, loopLoc+1, frameSize, status); + fp = StateSave(fp, loopLoc+1, status); } if (maxCount == 0) { - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; - + case URX_CTR_LOOP: { U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); - int32_t initOp = pat[opValue]; + int32_t initOp = (int32_t)pat[opValue]; U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); - int32_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; - int32_t minCount = pat[opValue+2]; - int32_t maxCount = pat[opValue+3]; - // Increment the counter. Note: we're not worrying about counter + int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; + int32_t minCount = (int32_t)pat[opValue+2]; + int32_t maxCount = (int32_t)pat[opValue+3]; + // Increment the counter. Note: we DIDN'T worry about counter // overflow, since the data comes from UnicodeStrings, which - // stores its length in an int32_t. + // stores its length in an int32_t. Do we have to think about + // this now that we're using UText? Probably not, since the length + // in UChar32s is still an int32_t. (*pCounter)++; U_ASSERT(*pCounter > 0); - if ((uint32_t)*pCounter >= (uint32_t)maxCount) { + if ((uint64_t)*pCounter >= (uint32_t)maxCount) { U_ASSERT(*pCounter == maxCount || maxCount == -1); break; } if (*pCounter >= minCount) { - fp = StateSave(fp, fp->fPatIdx, frameSize, status); + fp = StateSave(fp, fp->fPatIdx, status); } fp->fPatIdx = opValue + 4; // Loop back. } break; - + case URX_CTR_INIT_NG: { - U_ASSERT(opValue >= 0 && opValue < frameSize-2); + // Initialize a non-greedy loop + U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); fp->fExtra[opValue] = 0; // Set the loop counter variable to zero - + // Pick up the three extra operands that CTR_INIT has, and // skip the pattern location counter past - int32_t instrOperandLoc = fp->fPatIdx; + int32_t instrOperandLoc = (int32_t)fp->fPatIdx; fp->fPatIdx += 3; int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); - int32_t minCount = pat[instrOperandLoc+1]; - int32_t maxCount = pat[instrOperandLoc+2]; + int32_t minCount = (int32_t)pat[instrOperandLoc+1]; + int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; U_ASSERT(minCount>=0); U_ASSERT(maxCount>=minCount || maxCount==-1); U_ASSERT(loopLoc>fp->fPatIdx); - + if (minCount == 0) { if (maxCount != 0) { - fp = StateSave(fp, fp->fPatIdx, frameSize, status); + fp = StateSave(fp, fp->fPatIdx, status); } fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block } } break; - + case URX_CTR_LOOP_NG: { + // Non-greedy {min, max} loops U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); - int32_t initOp = pat[opValue]; + int32_t initOp = (int32_t)pat[opValue]; U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); - int32_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; - int32_t minCount = pat[opValue+2]; - int32_t maxCount = pat[opValue+3]; - // Increment the counter. Note: we're not worrying about counter + int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; + int32_t minCount = (int32_t)pat[opValue+2]; + int32_t maxCount = (int32_t)pat[opValue+3]; + // Increment the counter. Note: we DIDN'T worry about counter // overflow, since the data comes from UnicodeStrings, which - // stores its length in an int32_t. + // stores its length in an int32_t. Do we have to think about + // this now that we're using UText? Probably not, since the length + // in UChar32s is still an int32_t. (*pCounter)++; U_ASSERT(*pCounter > 0); - - if ((uint32_t)*pCounter >= (uint32_t)maxCount) { + + if ((uint64_t)*pCounter >= (uint32_t)maxCount) { // The loop has matched the maximum permitted number of times. // Break out of here with no action. Matching will // continue with the following pattern. U_ASSERT(*pCounter == maxCount || maxCount == -1); break; } - + if (*pCounter < minCount) { // We haven't met the minimum number of matches yet. // Loop back for another one. @@ -1590,47 +5394,45 @@ GC_Done: // Fall into the following pattern, but first do // a state save to the top of the loop, so that a failure // in the following pattern will try another iteration of the loop. - fp = StateSave(fp, opValue + 4, frameSize, status); + fp = StateSave(fp, opValue + 4, status); } } break; - - // TODO: Possessive flavor of loop ops, or take them out if no longer needed. - + case URX_STO_SP: U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); fData[opValue] = fStack->size(); break; - + case URX_LD_SP: { U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); - int32_t newStackSize = fData[opValue]; + int32_t newStackSize = (int32_t)fData[opValue]; U_ASSERT(newStackSize <= fStack->size()); - int32_t *newFP = fStack->getBuffer() + newStackSize - frameSize; - if (newFP == (int32_t *)fp) { + int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; + if (newFP == (int64_t *)fp) { break; } int32_t i; - for (i=0; isetSize(newStackSize); } break; - + case URX_BACKREF: case URX_BACKREF_I: { - U_ASSERT(opValue < frameSize); - int32_t groupStartIdx = fp->fExtra[opValue]; - int32_t groupEndIdx = fp->fExtra[opValue+1]; + U_ASSERT(opValue < fFrameSize); + int64_t groupStartIdx = fp->fExtra[opValue]; + int64_t groupEndIdx = fp->fExtra[opValue+1]; U_ASSERT(groupStartIdx <= groupEndIdx); - int32_t len = groupEndIdx-groupStartIdx; + int64_t len = groupEndIdx-groupStartIdx; if (groupStartIdx < 0) { // This capture group has not participated in the match thus far, - fp = (REStackFrame *)fStack->popFrame(frameSize); // FAIL, no match. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. } if (len == 0) { @@ -1639,58 +5441,55 @@ GC_Done: // we do too. break; } - /* - if ((fp->fInputIdx + len > inputLen) || - u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, len) != 0) { - fp = (REStackFrame *)fStack->popFrame(frameSize); // FAIL, no match. - } else { - fp->fInputIdx += len; // Match. Advance current input position. - } - */ + UBool haveMatch = FALSE; - if (fp->fInputIdx + len <= inputLen) { + if (fp->fInputIdx + len <= fActiveLimit) { if (opType == URX_BACKREF) { - if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, len) == 0) { + if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, (int32_t)len) == 0) { haveMatch = TRUE; } } else { if (u_strncasecmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, - len, U_FOLD_CASE_DEFAULT) == 0) { + (int32_t)len, U_FOLD_CASE_DEFAULT) == 0) { haveMatch = TRUE; } } + } else { + // TODO: probably need to do a partial string comparison, and only + // set HitEnd if the available input matched. Ticket #6074 + fHitEnd = TRUE; } if (haveMatch) { fp->fInputIdx += len; // Match. Advance current input position. } else { - fp = (REStackFrame *)fStack->popFrame(frameSize); // FAIL, no match. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. } } break; - + case URX_STO_INP_LOC: { - U_ASSERT(opValue >= 0 && opValue < frameSize); + U_ASSERT(opValue >= 0 && opValue < fFrameSize); fp->fExtra[opValue] = fp->fInputIdx; } break; - + case URX_JMPX: { - int32_t instrOperandLoc = fp->fPatIdx; + int32_t instrOperandLoc = (int32_t)fp->fPatIdx; fp->fPatIdx += 1; int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); - U_ASSERT(dataLoc >= 0 && dataLoc < frameSize); - int32_t savedInputIdx = fp->fExtra[dataLoc]; + U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); + int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc]; U_ASSERT(savedInputIdx <= fp->fInputIdx); if (savedInputIdx < fp->fInputIdx) { fp->fPatIdx = opValue; // JMP } else { - fp = (REStackFrame *)fStack->popFrame(frameSize); // FAIL, no progress in loop. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop. } } break; - + case URX_LA_START: { // Entering a lookahead block. @@ -1698,73 +5497,200 @@ GC_Done: U_ASSERT(opValue>=0 && opValue+1fDataSize); fData[opValue] = fStack->size(); fData[opValue+1] = fp->fInputIdx; + fActiveStart = fLookStart; // Set the match region change for + fActiveLimit = fLookLimit; // transparent bounds. } break; - + case URX_LA_END: { // Leaving a look-ahead block. // restore Stack Ptr, Input Pos to positions they had on entry to block. U_ASSERT(opValue>=0 && opValue+1fDataSize); int32_t stackSize = fStack->size(); - int32_t newStackSize = fData[opValue]; + int32_t newStackSize = (int32_t)fData[opValue]; U_ASSERT(stackSize >= newStackSize); if (stackSize > newStackSize) { - int32_t *newFP = fStack->getBuffer() + newStackSize - frameSize; + // Copy the current top frame back to the new (cut back) top frame. + // This makes the capture groups from within the look-ahead + // expression available. + int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; int32_t i; - for (i=0; isetSize(newStackSize); } fp->fInputIdx = fData[opValue+1]; + + // Restore the active region bounds in the input string; they may have + // been changed because of transparent bounds on a Region. + fActiveStart = fRegionStart; + fActiveLimit = fRegionLimit; } break; - + case URX_ONECHAR_I: - if (fp->fInputIdx < inputLen) { - UChar32 c; - U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c); - if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { + if (fp->fInputIdx < fActiveLimit) { + UChar32 c; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { break; } + } else { + fHitEnd = TRUE; } - fp = (REStackFrame *)fStack->popFrame(frameSize); - break; + + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + UBool success = FALSE; + int64_t reverseIndex = fp->fInputIdx; + UChar32 c; + while (reverseIndex > backSearchIndex) { + U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); + if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { + success = TRUE; + break; + } else if (c == U_SENTINEL) { + break; + } + } + if (success) { + fHitEnd = FALSE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = reverseIndex; + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + case URX_STRING_I: { // Test input against a literal string. // Strings require two slots in the compiled pattern, one for the // offset to the string text, and one for the length. - int32_t stringStartIdx, stringLen; - stringStartIdx = opValue; - - op = pat[fp->fPatIdx]; - fp->fPatIdx++; - opType = URX_TYPE(op); - opValue = URX_VAL(op); - U_ASSERT(opType == URX_STRING_LEN); - stringLen = opValue; - - int32_t stringEndIndex = fp->fInputIdx + stringLen; - if (stringEndIndex <= inputLen && - u_strncasecmp(inputBuf+fp->fInputIdx, litText+stringStartIdx, - stringLen, U_FOLD_CASE_DEFAULT) == 0) { - // Success. Advance the current input position. - fp->fInputIdx = stringEndIndex; - } else { - // No match. Back up matching to a saved state - fp = (REStackFrame *)fStack->popFrame(frameSize); + const UCaseProps *csp = ucase_getSingleton(); + { + int32_t stringStartIdx, stringLen; + stringStartIdx = opValue; + + op = (int32_t)pat[fp->fPatIdx]; + fp->fPatIdx++; + opType = URX_TYPE(op); + opValue = URX_VAL(op); + U_ASSERT(opType == URX_STRING_LEN); + stringLen = opValue; + + const UChar *patternChars = litText+stringStartIdx; + const UChar *patternEnd = patternChars+stringLen; + + const UChar *foldChars = NULL; + int32_t foldOffset, foldLength; + UChar32 c; + + #ifdef REGEX_SMART_BACKTRACKING + int32_t originalInputIdx = fp->fInputIdx; + #endif + UBool success = TRUE; + + foldOffset = foldLength = 0; + + while (patternChars < patternEnd && success) { + if(foldOffset < foldLength) { + U16_NEXT_UNSAFE(foldChars, foldOffset, c); + } else { + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT); + if(foldLength >= 0) { + if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings + foldOffset = 0; + U16_NEXT_UNSAFE(foldChars, foldOffset, c); + } else { + c = foldLength; + foldLength = foldOffset; // to avoid reading chars from the folding buffer + } + } + } + + if (fp->fInputIdx <= fActiveLimit) { + if (U_IS_BMP(c)) { + success = (*patternChars == c); + patternChars += 1; + } else if (patternChars+1 < patternEnd) { + success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c)); + patternChars += 2; + } + } else { + success = FALSE; + fHitEnd = TRUE; // TODO: See ticket 6074 + } + } + + if (!success) { + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size()) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + // Reset to last start point + int64_t reverseIndex = originalInputIdx; + patternChars = litText+stringStartIdx; + + // Search backwards for a possible start + do { + U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); + foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT); + if(foldLength >= 0) { + if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings + foldOffset = 0; + U16_NEXT_UNSAFE(foldChars, foldOffset, c); + } else { + c = foldLength; + foldLength = foldOffset; // to avoid reading chars from the folding buffer + } + } + + if ((U_IS_BMP(c) && *patternChars == c) || + (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) { + success = TRUE; + break; + } + } while (reverseIndex > backSearchIndex); + + // And try again + if (success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = reverseIndex; + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } } } break; - + case URX_LB_START: { // Entering a look-behind block. // Save Stack Ptr, Input Pos. + // TODO: implement transparent bounds. Ticket #6067 U_ASSERT(opValue>=0 && opValue+1fDataSize); fData[opValue] = fStack->size(); fData[opValue+1] = fp->fInputIdx; @@ -1772,27 +5698,27 @@ GC_Done: fData[opValue+2] = -1; // Save input string length, then reset to pin any matches to end at // the current position. - fData[opValue+3] = inputLen; - inputLen = fp->fInputIdx; + fData[opValue+3] = fActiveLimit; + fActiveLimit = fp->fInputIdx; } break; - - + + case URX_LB_CONT: { // Positive Look-Behind, at top of loop checking for matches of LB expression // at all possible input starting positions. - + // Fetch the min and max possible match lengths. They are the operands // of this op in the pattern. - int32_t minML = pat[fp->fPatIdx++]; - int32_t maxML = pat[fp->fPatIdx++]; + int32_t minML = (int32_t)pat[fp->fPatIdx++]; + int32_t maxML = (int32_t)pat[fp->fPatIdx++]; U_ASSERT(minML <= maxML); U_ASSERT(minML >= 0); - + // Fetch (from data) the last input index where a match was attempted. U_ASSERT(opValue>=0 && opValue+1fDataSize); - int32_t *lbStartIdx = &fData[opValue+2]; + int64_t *lbStartIdx = &fData[opValue+2]; if (*lbStartIdx < 0) { // First time through loop. *lbStartIdx = fp->fInputIdx - minML; @@ -1800,73 +5726,73 @@ GC_Done: // 2nd through nth time through the loop. // Back up start position for match by one. if (*lbStartIdx == 0) { - (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0. + (*lbStartIdx)--; } else { U16_BACK_1(inputBuf, 0, *lbStartIdx); } } - + if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { // We have tried all potential match starting points without // getting a match. Backtrack out, and out of the // Look Behind altogether. - fp = (REStackFrame *)fStack->popFrame(frameSize); - int32_t restoreInputLen = fData[opValue+3]; - U_ASSERT(restoreInputLen >= inputLen); - U_ASSERT(restoreInputLen <= fInput->length()); - inputLen = restoreInputLen; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + int64_t restoreInputLen = fData[opValue+3]; + U_ASSERT(restoreInputLen >= fActiveLimit); + U_ASSERT(restoreInputLen <= fInputLength); + fActiveLimit = restoreInputLen; break; } - + // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. // (successful match will fall off the end of the loop.) - fp = StateSave(fp, fp->fPatIdx-3, frameSize, status); + fp = StateSave(fp, fp->fPatIdx-3, status); fp->fInputIdx = *lbStartIdx; } break; - + case URX_LB_END: // End of a look-behind block, after a successful match. { U_ASSERT(opValue>=0 && opValue+1fDataSize); - if (fp->fInputIdx != inputLen) { + if (fp->fInputIdx != fActiveLimit) { // The look-behind expression matched, but the match did not // extend all the way to the point that we are looking behind from. // FAIL out of here, which will take us back to the LB_CONT, which // will retry the match starting at another position or fail // the look-behind altogether, whichever is appropriate. - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } - + // Look-behind match is good. Restore the orignal input string length, // which had been truncated to pin the end of the lookbehind match to the // position being looked-behind. - int32_t originalInputLen = fData[opValue+3]; - U_ASSERT(originalInputLen >= inputLen); - U_ASSERT(originalInputLen <= fInput->length()); - inputLen = originalInputLen; + int64_t originalInputLen = fData[opValue+3]; + U_ASSERT(originalInputLen >= fActiveLimit); + U_ASSERT(originalInputLen <= fInputLength); + fActiveLimit = originalInputLen; } break; - - + + case URX_LBN_CONT: { // Negative Look-Behind, at top of loop checking for matches of LB expression // at all possible input starting positions. - + // Fetch the extra parameters of this op. - int32_t minML = pat[fp->fPatIdx++]; - int32_t maxML = pat[fp->fPatIdx++]; - int32_t continueLoc = pat[fp->fPatIdx++]; - continueLoc = URX_VAL(continueLoc); + int32_t minML = (int32_t)pat[fp->fPatIdx++]; + int32_t maxML = (int32_t)pat[fp->fPatIdx++]; + int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; + continueLoc = URX_VAL(continueLoc); U_ASSERT(minML <= maxML); U_ASSERT(minML >= 0); U_ASSERT(continueLoc > fp->fPatIdx); - + // Fetch (from data) the last input index where a match was attempted. U_ASSERT(opValue>=0 && opValue+1fDataSize); - int32_t *lbStartIdx = &fData[opValue+2]; + int64_t *lbStartIdx = &fData[opValue+2]; if (*lbStartIdx < 0) { // First time through loop. *lbStartIdx = fp->fInputIdx - minML; @@ -1879,65 +5805,65 @@ GC_Done: U16_BACK_1(inputBuf, 0, *lbStartIdx); } } - + if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { // We have tried all potential match starting points without // getting a match, which means that the negative lookbehind as // a whole has succeeded. Jump forward to the continue location - int32_t restoreInputLen = fData[opValue+3]; - U_ASSERT(restoreInputLen >= inputLen); - U_ASSERT(restoreInputLen <= fInput->length()); - inputLen = restoreInputLen; + int64_t restoreInputLen = fData[opValue+3]; + U_ASSERT(restoreInputLen >= fActiveLimit); + U_ASSERT(restoreInputLen <= fInputLength); + fActiveLimit = restoreInputLen; fp->fPatIdx = continueLoc; break; } - + // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. // (successful match will cause a FAIL out of the loop altogether.) - fp = StateSave(fp, fp->fPatIdx-4, frameSize, status); + fp = StateSave(fp, fp->fPatIdx-4, status); fp->fInputIdx = *lbStartIdx; } break; - + case URX_LBN_END: // End of a negative look-behind block, after a successful match. { U_ASSERT(opValue>=0 && opValue+1fDataSize); - if (fp->fInputIdx != inputLen) { + if (fp->fInputIdx != fActiveLimit) { // The look-behind expression matched, but the match did not // extend all the way to the point that we are looking behind from. // FAIL out of here, which will take us back to the LB_CONT, which // will retry the match starting at another position or succeed // the look-behind altogether, whichever is appropriate. - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } - + // Look-behind expression matched, which means look-behind test as // a whole Fails // Restore the orignal input string length, which had been truncated // inorder to pin the end of the lookbehind match // to the position being looked-behind. - int32_t originalInputLen = fData[opValue+3]; - U_ASSERT(originalInputLen >= inputLen); - U_ASSERT(originalInputLen <= fInput->length()); - inputLen = originalInputLen; - + int64_t originalInputLen = fData[opValue+3]; + U_ASSERT(originalInputLen >= fActiveLimit); + U_ASSERT(originalInputLen <= fInputLength); + fActiveLimit = originalInputLen; + // Restore original stack position, discarding any state saved // by the successful pattern match. U_ASSERT(opValue>=0 && opValue+1fDataSize); - int32_t newStackSize = fData[opValue]; + int32_t newStackSize = (int32_t)fData[opValue]; U_ASSERT(fStack->size() > newStackSize); fStack->setSize(newStackSize); // FAIL, which will take control back to someplace // prior to entering the look-behind test. - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; - - + + case URX_LOOP_SR_I: // Loop Initialization for the optimized implementation of // [some character set]* @@ -1947,16 +5873,17 @@ GC_Done: U_ASSERT(opValue > 0 && opValue < sets->size()); Regex8BitSet *s8 = &fPattern->fSets8[opValue]; UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); - + // Loop through input, until either the input is exhausted or // we reach a character that is not a member of the set. - int32_t ix = fp->fInputIdx; + int32_t ix = (int32_t)fp->fInputIdx; for (;;) { - if (ix >= inputLen) { + if (ix >= fActiveLimit) { + fHitEnd = TRUE; break; } UChar32 c; - U16_NEXT(inputBuf, ix, inputLen, c); + U16_NEXT(inputBuf, ix, fActiveLimit, c); if (c<256) { if (s8->contains(c) == FALSE) { U16_BACK_1(inputBuf, 0, ix); @@ -1969,57 +5896,68 @@ GC_Done: } } } - + // If there were no matching characters, skip over the loop altogether. // The loop doesn't run at all, a * op always succeeds. if (ix == fp->fInputIdx) { fp->fPatIdx++; // skip the URX_LOOP_C op. break; } - + // Peek ahead in the compiled pattern, to the URX_LOOP_C that // must follow. It's operand is the stack location // that holds the starting input index for the match of this [set]* - int32_t loopcOp = pat[fp->fPatIdx]; + int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); int32_t stackLoc = URX_VAL(loopcOp); - U_ASSERT(stackLoc >= 0 && stackLoc < frameSize); + U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); fp->fExtra[stackLoc] = fp->fInputIdx; + #ifdef REGEX_SMART_BACKTRACKING + backSearchIndex = fp->fInputIdx; + #endif fp->fInputIdx = ix; - + // Save State to the URX_LOOP_C op that follows this one, // so that match failures in the following code will return to there. // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. - fp = StateSave(fp, fp->fPatIdx, frameSize, status); + fp = StateSave(fp, fp->fPatIdx, status); fp->fPatIdx++; } break; - - + + case URX_LOOP_DOT_I: // Loop Initialization for the optimized implementation of .* // This op scans through all remaining input. // The following LOOP_C op emulates stack unwinding if the following pattern fails. { // Loop through input until the input is exhausted (we reach an end-of-line) - // In multi-line mode, we can just go straight to the end of the input. - int32_t ix = inputLen; - if (opValue == 0) { - // NOT multi-line mode. Line endings do not match '.' + // In DOTALL mode, we can just go straight to the end of the input. + int32_t ix; + if ((opValue & 1) == 1) { + // Dot-matches-All mode. Jump straight to the end of the string. + ix = (int32_t)fActiveLimit; + fHitEnd = TRUE; + } else { + // NOT DOT ALL mode. Line endings do not match '.' // Scan forward until a line ending or end of input. - ix = fp->fInputIdx; + ix = (int32_t)fp->fInputIdx; for (;;) { - if (ix >= inputLen) { + if (ix >= fActiveLimit) { + fHitEnd = TRUE; break; } UChar32 c; - U16_NEXT(inputBuf, ix, inputLen, c); // c = inputBuf[ix++] - if (((c & 0x7f) <= 0x29) && - (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029)) { - // char is a line ending. Put the input pos back to the - // line ending char, and exit the scanning loop. - U16_BACK_1(inputBuf, 0, ix); - break; + U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++] + if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s + if ((c == 0x0a) || // 0x0a is newline in both modes. + (((opValue & 2) == 0) && // IF not UNIX_LINES mode + ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) { + // char is a line ending. Put the input pos back to the + // line ending char, and exit the scanning loop. + U16_BACK_1(inputBuf, 0, ix); + break; + } } } } @@ -2030,32 +5968,35 @@ GC_Done: fp->fPatIdx++; // skip the URX_LOOP_C op. break; } - + // Peek ahead in the compiled pattern, to the URX_LOOP_C that // must follow. It's operand is the stack location - // that holds the starting input index for the match of this [set]* - int32_t loopcOp = pat[fp->fPatIdx]; + // that holds the starting input index for the match of this .* + int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); int32_t stackLoc = URX_VAL(loopcOp); - U_ASSERT(stackLoc >= 0 && stackLoc < frameSize); + U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); fp->fExtra[stackLoc] = fp->fInputIdx; + #ifdef REGEX_SMART_BACKTRACKING + backSearchIndex = fp->fInputIdx; + #endif fp->fInputIdx = ix; - + // Save State to the URX_LOOP_C op that follows this one, // so that match failures in the following code will return to there. // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. - fp = StateSave(fp, fp->fPatIdx, frameSize, status); + fp = StateSave(fp, fp->fPatIdx, status); fp->fPatIdx++; } break; - - + + case URX_LOOP_C: { - U_ASSERT(opValue>=0 && opValuefExtra[opValue]; - U_ASSERT(terminalIdx <= fp->fInputIdx); - if (terminalIdx == fp->fInputIdx) { + U_ASSERT(opValue>=0 && opValuefExtra[opValue]; + U_ASSERT(backSearchIndex <= fp->fInputIdx); + if (backSearchIndex == fp->fInputIdx) { // We've backed up the input idx to the point that the loop started. // The loop is done. Leave here without saving state. // Subsequent failures won't come back here. @@ -2067,31 +6008,34 @@ GC_Done: // (We're going backwards because this loop emulates stack unwinding, not // the initial scan forward.) U_ASSERT(fp->fInputIdx > 0); - U16_BACK_1(inputBuf, 0, fp->fInputIdx); - if (inputBuf[fp->fInputIdx] == 0x0a && - fp->fInputIdx > terminalIdx && + UChar32 prevC; + U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit? + + if (prevC == 0x0a && + fp->fInputIdx > backSearchIndex && inputBuf[fp->fInputIdx-1] == 0x0d) { - int32_t prevOp = pat[fp->fPatIdx-2]; + int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { // .*, stepping back over CRLF pair. - fp->fInputIdx--; + U16_BACK_1(inputBuf, 0, fp->fInputIdx); } } - - - fp = StateSave(fp, fp->fPatIdx-1, frameSize, status); + + + fp = StateSave(fp, fp->fPatIdx-1, status); } break; - - - + + + default: // Trouble. The compiled pattern contains an entry with an // unrecognized type tag. U_ASSERT(FALSE); } - + if (U_FAILURE(status)) { + isMatch = FALSE; break; } } @@ -2103,26 +6047,25 @@ breakFromLoop: fMatchStart = startIdx; fMatchEnd = fp->fInputIdx; if (fTraceDebug) { - REGEX_RUN_DEBUG_PRINTF("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd); + REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd)); } } else { if (fTraceDebug) { - REGEX_RUN_DEBUG_PRINTF("No match\n\n"); + REGEX_RUN_DEBUG_PRINTF(("No match\n\n")); } } - + fFrame = fp; // The active stack frame when the engine stopped. - // Contains the capture group results that we need to - // access later. + // Contains the capture group results that we need to + // access later. return; } - -const char RegexMatcher::fgClassID = 0; +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher) U_NAMESPACE_END