/*
**************************************************************************
-* Copyright (C) 2002-2010 International Business Machines Corporation *
+* Copyright (C) 2002-2012 International Business Machines Corporation *
* and others. All rights reserved. *
**************************************************************************
*/
#include "unicode/uchar.h"
#include "unicode/ustring.h"
#include "unicode/rbbi.h"
+#include "unicode/utf.h"
+#include "unicode/utf16.h"
#include "uassert.h"
#include "cmemory.h"
#include "uvector.h"
// Good idea in theory; unfortunately it only helps out a few specific
// cases and slows the engine down a little in the rest.
-//#define REGEX_SMART_BACKTRACKING 1
-
U_NAMESPACE_BEGIN
// Default limit for the size of the back track stack, to avoid system
fData = fSmallData;
fWordBreakItr = NULL;
- fStack = new UVector64(status);
+ fStack = NULL;
fInputText = NULL;
fAltInputText = NULL;
fInput = NULL;
}
}
+ fStack = new UVector64(status);
+ if (fStack == NULL) {
+ status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
+
reset(input);
setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
if (U_FAILURE(status)) {
// TODO: Report errors for mal-formed \u escapes?
// As this is, the original sequence is output, which may be OK.
if (context.lastOffset == offset) {
- UTEXT_PREVIOUS32(replacement);
+ (void)UTEXT_PREVIOUS32(replacement);
} else if (context.lastOffset != offset-1) {
utext_moveIndex32(replacement, offset - context.lastOffset - 1);
}
}
} else {
- UTEXT_NEXT32(replacement);
+ (void)UTEXT_NEXT32(replacement);
// Plain backslash escape. Just put out the escaped character.
if (U_IS_BMP(c)) {
UChar c16 = (UChar)c;
if (u_isdigit(digitC) == FALSE) {
break;
}
- UTEXT_NEXT32(replacement);
+ (void)UTEXT_NEXT32(replacement);
groupNum=groupNum*10 + u_charDigitValue(digitC);
numDigits++;
if (numDigits >= fPattern->fMaxCaptureDigits) {
return FALSE;
}
UTEXT_SETNATIVEINDEX(fInputText, startPos);
- UTEXT_NEXT32(fInputText);
+ (void)UTEXT_NEXT32(fInputText);
startPos = UTEXT_GETNATIVEINDEX(fInputText);
}
} else {
return FALSE;
}
UTEXT_SETNATIVEINDEX(fInputText, startPos);
- UTEXT_NEXT32(fInputText);
+ (void)UTEXT_NEXT32(fInputText);
startPos = UTEXT_GETNATIVEINDEX(fInputText);
// Note that it's perfectly OK for a pattern to have a zero-length
// match at the end of a string, so we must make sure that the loop
if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
- UTEXT_NEXT32(fInputText);
+ (void)UTEXT_NEXT32(fInputText);
startPos = UTEXT_GETNATIVEINDEX(fInputText);
}
MatchAt(startPos, FALSE, fDeferredStatus);
}
+//--------------------------------------------------------------------------------
+//   refreshInputText   Re-point this matcher at a replacement UText, `input`, whose
+// refresh              native length must equal that of the current input text. Each
+//                      internal UText is re-cloned from `input`, keeping its old index.
+//--------------------------------------------------------------------------------
+RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {   // Returns *this on every path; failures are reported through status.
+ if (U_FAILURE(status)) {   // The caller's status already holds an error: do nothing.
+ return *this;
+ }
+ if (input == NULL) {   // A NULL replacement text is rejected.
+ status = U_ILLEGAL_ARGUMENT_ERROR;
+ return *this;
+ }
+ if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {   // The new text must have the same native length as the current input.
+ status = U_ILLEGAL_ARGUMENT_ERROR;
+ return *this;
+ }
+ int64_t pos = utext_getNativeIndex(fInputText);   // Remember the current native index so it can be put back after the clone.
+ // Shallow read-only clone of the new UText into the existing input UText
+ fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
+ if (U_FAILURE(status)) {   // Clone failed: return without repositioning.
+ return *this;
+ }
+ utext_setNativeIndex(fInputText, pos);   // Restore the saved index on the re-cloned text.
+
+ if (fAltInputText != NULL) {   // Apply the same re-clone to the alternate input UText, when one exists.
+ pos = utext_getNativeIndex(fAltInputText);   // pos is reused, now holding fAltInputText's own index.
+ fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
+ if (U_FAILURE(status)) {
+ return *this;
+ }
+ utext_setNativeIndex(fAltInputText, pos);
+ }
+ return *this;
+}
// If the delimiter pattern has capturing parentheses, the captured
// text goes out into the next n destination strings.
int32_t groupNum;
- UBool lastGroupWasNullUText = FALSE;
for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
- if (i==destCapacity-1) {
+ if (i >= destCapacity-2) {
+ // Never fill the last available output string with capture group text.
+ // It will be filled with the last field, the remainder of the
+ // unsplit input text.
break;
}
i++;
- lastGroupWasNullUText = (dest[i] == NULL ? TRUE : FALSE);
dest[i] = group(groupNum, dest[i], status);
}
if (nextOutputStringStart == fActiveLimit) {
- // The delimiter was at the end of the string. We're done.
- break;
- } else if (i == destCapacity-1) {
- // We're out of capture groups, and the rest of the string is more important
- if (lastGroupWasNullUText) {
- utext_close(dest[i]);
- dest[i] = NULL;
+ // The delimiter was at the end of the string. We're done, but first
+ // we output one last empty string, for the empty field following
+ // the delimiter at the end of input.
+ if (i+1 < destCapacity) {
+ ++i;
+ if (dest[i] == NULL) {
+ dest[i] = utext_openUChars(NULL, NULL, 0, &status);
+ } else {
+ static UChar emptyString[] = {(UChar)0};
+ utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
+ }
}
- }
-
+ break;
+
+ }
}
else
{
} else {
fHitEnd = TRUE;
}
-
- #ifdef REGEX_SMART_BACKTRACKING
- if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
- REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
- if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
- UBool success = FALSE;
- UChar32 c = UTEXT_PREVIOUS32(fInputText);
- while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) {
- if (c == opValue) {
- success = TRUE;
- break;
- } else if (c == U_SENTINEL) {
- break;
- }
- c = UTEXT_PREVIOUS32(fInputText);
- }
- if (success) {
- fHitEnd = FALSE;
- fp = (REStackFrame *)fStack->popFrame(fFrameSize);
- fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
- if (fp->fInputIdx > backSearchIndex) {
- fp = StateSave(fp, fp->fPatIdx, status);
- }
- fp->fPatIdx++; // Skip the LOOP_C, we just did that
- break;
- }
- }
- }
- #endif
-
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
// Test input against a literal string.
// Strings require two slots in the compiled pattern, one for the
// offset to the string text, and one for the length.
- int32_t stringStartIdx = opValue;
- int32_t stringLen;
+ int32_t stringStartIdx = opValue;
op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
fp->fPatIdx++;
opType = URX_TYPE(op);
- stringLen = URX_VAL(op);
+ int32_t stringLen = URX_VAL(op);
U_ASSERT(opType == URX_STRING_LEN);
U_ASSERT(stringLen >= 2);
- const UChar *patternChars = litText+stringStartIdx;
- const UChar *patternEnd = patternChars+stringLen;
-
+ const UChar *patternString = litText+stringStartIdx;
+ int32_t patternStringIndex = 0;
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
- UChar32 c;
+ UChar32 inputChar;
+ UChar32 patternChar;
UBool success = TRUE;
-
- while (patternChars < patternEnd && success) {
- c = UTEXT_NEXT32(fInputText);
-
- if (c != U_SENTINEL && UTEXT_GETNATIVEINDEX(fInputText) <= fActiveLimit) {
- if (U_IS_BMP(c)) {
- success = (*patternChars == c);
- patternChars += 1;
- } else if (patternChars+1 < patternEnd) {
- success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
- patternChars += 2;
- }
- } else {
+ while (patternStringIndex < stringLen) {
+ if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
success = FALSE;
- fHitEnd = TRUE; // TODO: See ticket 6074
+ fHitEnd = TRUE;
+ break;
+ }
+ inputChar = UTEXT_NEXT32(fInputText);
+ U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
+ if (patternChar != inputChar) {
+ success = FALSE;
+ break;
}
}
if (success) {
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
} else {
- #ifdef REGEX_SMART_BACKTRACKING
- if (fp->fInputIdx > backSearchIndex && fStack->size()) {
- REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
- if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
- // Reset to last start point
- UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
- patternChars = litText+stringStartIdx;
-
- // Search backwards for a possible start
- do {
- c = UTEXT_PREVIOUS32(fInputText);
- if (c == U_SENTINEL) {
- break;
- } else if ((U_IS_BMP(c) && *patternChars == c) ||
- (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
- success = TRUE;
- break;
- }
- } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
-
- // And try again
- if (success) {
- fp = (REStackFrame *)fStack->popFrame(fFrameSize);
- fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
- if (fp->fInputIdx > backSearchIndex) {
- fp = StateSave(fp, fp->fPatIdx, status);
- }
- fp->fPatIdx++; // Skip the LOOP_C, we just did that
- break;
- }
- }
- }
- #endif
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
// If not in the middle of a CR/LF sequence
- if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && (UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
+ if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
// At new-line at end of input. Success
fHitEnd = TRUE;
fRequireEnd = TRUE;
case URX_BACKSLASH_B: // Test for word boundaries
{
UBool success = isWordBoundary(fp->fInputIdx);
- success ^= (opValue != 0); // flip sense for \B
+ success ^= (UBool)(opValue != 0); // flip sense for \B
if (!success) {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
{
UBool success = isUWordBoundary(fp->fInputIdx);
- success ^= (opValue != 0); // flip sense for \B
+ success ^= (UBool)(opValue != 0); // flip sense for \B
if (!success) {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
UChar32 c = UTEXT_NEXT32(fInputText);
int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
- success ^= (opValue != 0); // flip sense for \D
+ success ^= (UBool)(opValue != 0); // flip sense for \D
if (success) {
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
} else {
if (sets[URX_GC_LV]->contains(c)) goto GC_V;
if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
if (sets[URX_GC_V]->contains(c)) goto GC_V;
- UTEXT_PREVIOUS32(fInputText);
+ (void)UTEXT_PREVIOUS32(fInputText);
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
goto GC_Extend;
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
if (sets[URX_GC_V]->contains(c)) goto GC_V;
if (sets[URX_GC_T]->contains(c)) goto GC_T;
- UTEXT_PREVIOUS32(fInputText);
+ (void)UTEXT_PREVIOUS32(fInputText);
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
goto GC_Extend;
c = UTEXT_NEXT32(fInputText);
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
if (sets[URX_GC_T]->contains(c)) goto GC_T;
- UTEXT_PREVIOUS32(fInputText);
+ (void)UTEXT_PREVIOUS32(fInputText);
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
goto GC_Extend;
if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
break;
}
- UTEXT_NEXT32(fInputText);
+ (void)UTEXT_NEXT32(fInputText);
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
}
goto GC_Done;
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
} else {
// the character wasn't in the set.
- #ifdef REGEX_SMART_BACKTRACKING
- if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
- REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
- if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
- // Try to find it, backwards
- UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
- success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset
- do {
- c = UTEXT_PREVIOUS32(fInputText);
- if (c == U_SENTINEL) {
- break;
- } else if (c < 256) {
- Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
- if (s8->contains(c)) {
- success = !success;
- }
- } else {
- const UnicodeSet *s = fPattern->fStaticSets[opValue];
- if (s->contains(c)) {
- success = !success;
- }
- }
- } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex && !success);
-
- if (success && c != U_SENTINEL) {
- fp = (REStackFrame *)fStack->popFrame(fFrameSize);
- fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
- if (fp->fInputIdx > backSearchIndex) {
- fp = StateSave(fp, fp->fPatIdx, status);
- }
- fp->fPatIdx++; // Skip the LOOP_C, we just did that
- break;
- }
- }
- }
- #endif
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
}
}
// the character wasn't in the set.
- #ifdef REGEX_SMART_BACKTRACKING
- if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
- REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
- if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
- // Try to find it, backwards
- UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
- UBool success = FALSE;
- do {
- c = UTEXT_PREVIOUS32(fInputText);
- if (c == U_SENTINEL) {
- break;
- } else if (c < 256) {
- Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
- if (s8->contains(c) == FALSE) {
- success = TRUE;
- break;
- }
- } else {
- const UnicodeSet *s = fPattern->fStaticSets[opValue];
- if (s->contains(c) == FALSE) {
- success = TRUE;
- break;
- }
- }
- } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
-
- if (success) {
- fp = (REStackFrame *)fStack->popFrame(fFrameSize);
- fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
- if (fp->fInputIdx > backSearchIndex) {
- fp = StateSave(fp, fp->fPatIdx, status);
- }
- fp->fPatIdx++; // Skip the LOOP_C, we just did that
- break;
- }
- }
- }
- #endif
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
break;
}
// the character wasn't in the set.
- #ifdef REGEX_SMART_BACKTRACKING
- if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
- REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
- if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
- // Try to find it, backwards
- UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
- UBool success = FALSE;
- do {
- c = UTEXT_PREVIOUS32(fInputText);
- if (c == U_SENTINEL) {
- break;
- } else if (c < 256) {
- Regex8BitSet *s8 = &fPattern->fSets8[opValue];
- if (s8->contains(c)) {
- success = TRUE;
- break;
- }
- } else {
- UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
- if (s->contains(c)) {
- success = TRUE;
- break;
- }
- }
- } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
-
- if (success) {
- fp = (REStackFrame *)fStack->popFrame(fFrameSize);
- fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
- if (fp->fInputIdx > backSearchIndex) {
- fp = StateSave(fp, fp->fPatIdx, status);
- }
- fp->fPatIdx++; // Skip the LOOP_C, we just did that
- break;
- }
- }
- }
- #endif
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
break;
// In the case of a CR/LF, we need to advance over both.
UChar32 nextc = UTEXT_CURRENT32(fInputText);
if (nextc == 0x0a) {
- UTEXT_NEXT32(fInputText);
+ (void)UTEXT_NEXT32(fInputText);
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
}
}
break;
case URX_BACKREF:
- case URX_BACKREF_I:
{
U_ASSERT(opValue < fFrameSize);
int64_t groupStartIdx = fp->fExtra[opValue];
if (groupStartIdx < 0) {
// This capture group has not participated in the match thus far,
fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
- }
-
- if (groupEndIdx == groupStartIdx) {
- // The capture group match was of an empty string.
- // Verified by testing: Perl matches succeed in this case, so
- // we do too.
break;
}
-
UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
+
+ // Note: if the capture group match was of an empty string the backref
+ // match succeeds. Verified by testing: Perl matches succeed
+ // in this case, so we do too.
- UBool haveMatch = (opType == URX_BACKREF ?
- (0 == utext_compareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1)) :
- (0 == utext_caseCompareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1, U_FOLD_CASE_DEFAULT, &status)));
- fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
-
- if (fp->fInputIdx > fActiveLimit) {
- fHitEnd = TRUE;
- fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
- } else if (!haveMatch) {
- if (fp->fInputIdx == fActiveLimit) {
+ UBool success = TRUE;
+ for (;;) {
+ if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
+ success = TRUE;
+ break;
+ }
+ if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
+ success = FALSE;
fHitEnd = TRUE;
+ break;
+ }
+ UChar32 captureGroupChar = utext_next32(fAltInputText);
+ UChar32 inputChar = utext_next32(fInputText);
+ if (inputChar != captureGroupChar) {
+ success = FALSE;
+ break;
}
+ }
+
+ if (success) {
+ fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
+ } else {
+ fp = (REStackFrame *)fStack->popFrame(fFrameSize);
+ }
+ }
+ break;
+
+
+
+ case URX_BACKREF_I:
+ {
+ U_ASSERT(opValue < fFrameSize);
+ int64_t groupStartIdx = fp->fExtra[opValue];
+ int64_t groupEndIdx = fp->fExtra[opValue+1];
+ U_ASSERT(groupStartIdx <= groupEndIdx);
+ if (groupStartIdx < 0) {
+ // This capture group has not participated in the match thus far,
fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
+ break;
+ }
+ utext_setNativeIndex(fAltInputText, groupStartIdx);
+ utext_setNativeIndex(fInputText, fp->fInputIdx);
+ CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
+ CaseFoldingUTextIterator inputItr(*fInputText);
+
+ // Note: if the capture group match was of an empty string the backref
+ // match succeeds. Verified by testing: Perl matches succeed
+ // in this case, so we do too.
+
+ UBool success = TRUE;
+ for (;;) {
+ if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
+ success = TRUE;
+ break;
+ }
+ if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
+ success = FALSE;
+ fHitEnd = TRUE;
+ break;
+ }
+ UChar32 captureGroupChar = captureGroupItr.next();
+ UChar32 inputChar = inputItr.next();
+ if (inputChar != captureGroupChar) {
+ success = FALSE;
+ break;
+ }
+ }
+
+ if (success && inputItr.inExpansion()) {
+ // We obtained a match by consuming part of a string obtained from
+ // case-folding a single code point of the input text.
+ // This does not count as an overall match.
+ success = FALSE;
+ }
+
+ if (success) {
+ fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
+ } else {
+ fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
+
}
break;
break;
case URX_ONECHAR_I:
+ // Case insensitive one char. The char from the pattern is already case folded.
+ // Input text is not, but case folding the input can not reduce two or more code
+ // points to one.
if (fp->fInputIdx < fActiveLimit) {
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
fHitEnd = TRUE;
}
- #ifdef REGEX_SMART_BACKTRACKING
- if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
- REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
- if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
- UBool success = FALSE;
- UChar32 c = UTEXT_PREVIOUS32(fInputText);
- while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) {
- if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
- success = TRUE;
- break;
- } else if (c == U_SENTINEL) {
- break;
- }
- c = UTEXT_PREVIOUS32(fInputText);
- }
- if (success) {
- fHitEnd = FALSE;
- fp = (REStackFrame *)fStack->popFrame(fFrameSize);
- fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
- if (fp->fInputIdx > backSearchIndex) {
- fp = StateSave(fp, fp->fPatIdx, status);
- }
- fp->fPatIdx++; // Skip the LOOP_C, we just did that
- break;
- }
- }
- }
- #endif
-
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
case URX_STRING_I:
{
- // Test input against a literal string.
+ // Case-insensitive test input against a literal string.
// Strings require two slots in the compiled pattern, one for the
// offset to the string text, and one for the length.
- const UCaseProps *csp = ucase_getSingleton();
+ // The compiled string has already been case folded.
{
- int32_t stringStartIdx, stringLen;
- stringStartIdx = opValue;
+ const UChar *patternString = litText + opValue;
+ int32_t patternStringIdx = 0;
op = (int32_t)pat[fp->fPatIdx];
fp->fPatIdx++;
opType = URX_TYPE(op);
opValue = URX_VAL(op);
U_ASSERT(opType == URX_STRING_LEN);
- stringLen = opValue;
+ int32_t patternStringLen = opValue; // Length of the string from the pattern.
- const UChar *patternChars = litText+stringStartIdx;
- const UChar *patternEnd = patternChars+stringLen;
-
- const UChar *foldChars = NULL;
- int32_t foldOffset, foldLength;
- UChar32 c;
-
- foldOffset = foldLength = 0;
- UBool success = TRUE;
+ UChar32 cPattern;
+ UChar32 cText;
+ UBool success = TRUE;
+
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
- while (patternChars < patternEnd && success) {
- if(foldOffset < foldLength) {
- U16_NEXT_UNSAFE(foldChars, foldOffset, c);
- } else {
- c = UTEXT_NEXT32(fInputText);
- if (c != U_SENTINEL) {
- foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
- if(foldLength >= 0) {
- if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings
- foldOffset = 0;
- U16_NEXT_UNSAFE(foldChars, foldOffset, c);
- } else {
- c = foldLength;
- foldLength = foldOffset; // to avoid reading chars from the folding buffer
- }
- }
- }
-
- fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
+ CaseFoldingUTextIterator inputIterator(*fInputText);
+ while (patternStringIdx < patternStringLen) {
+ if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
+ success = FALSE;
+ fHitEnd = TRUE;
+ break;
}
-
- success = FALSE;
- if (c != U_SENTINEL && (fp->fInputIdx <= fActiveLimit)) {
- if (U_IS_BMP(c)) {
- success = (*patternChars == c);
- patternChars += 1;
- } else if (patternChars+1 < patternEnd) {
- success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
- patternChars += 2;
- }
- } else {
- fHitEnd = TRUE; // TODO: See ticket 6074
+ U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
+ cText = inputIterator.next();
+ if (cText != cPattern) {
+ success = FALSE;
+ break;
}
}
-
- if (!success) {
- #ifdef REGEX_SMART_BACKTRACKING
- if (fp->fInputIdx > backSearchIndex && fStack->size()) {
- REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
- if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
- // Reset to last start point
- UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
- patternChars = litText+stringStartIdx;
-
- // Search backwards for a possible start
- do {
- c = UTEXT_PREVIOUS32(fInputText);
- if (c == U_SENTINEL) {
- break;
- } else {
- foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
- if(foldLength >= 0) {
- if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings
- foldOffset = 0;
- U16_NEXT_UNSAFE(foldChars, foldOffset, c);
- } else {
- c = foldLength;
- foldLength = foldOffset; // to avoid reading chars from the folding buffer
- }
- }
-
- if ((U_IS_BMP(c) && *patternChars == c) ||
- (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
- success = TRUE;
- break;
- }
- }
- } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
-
- // And try again
- if (success) {
- fp = (REStackFrame *)fStack->popFrame(fFrameSize);
- fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
- if (fp->fInputIdx > backSearchIndex) {
- fp = StateSave(fp, fp->fPatIdx, status);
- }
- fp->fPatIdx++; // Skip the LOOP_C, we just did that
- break;
- }
- }
- }
- #endif
+ if (inputIterator.inExpansion()) {
+ success = FALSE;
+ }
+
+ if (success) {
+ fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
+ } else {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
(*lbStartIdx)--;
} else {
UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
- UTEXT_PREVIOUS32(fInputText);
+ (void)UTEXT_PREVIOUS32(fInputText);
*lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
}
}
(*lbStartIdx)--;
} else {
UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
- UTEXT_PREVIOUS32(fInputText);
+ (void)UTEXT_PREVIOUS32(fInputText);
*lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
}
}
int32_t stackLoc = URX_VAL(loopcOp);
U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
fp->fExtra[stackLoc] = fp->fInputIdx;
- #ifdef REGEX_SMART_BACKTRACKING
- backSearchIndex = fp->fInputIdx;
- #endif
fp->fInputIdx = ix;
// Save State to the URX_LOOP_C op that follows this one,
int32_t stackLoc = URX_VAL(loopcOp);
U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
fp->fExtra[stackLoc] = fp->fInputIdx;
- #ifdef REGEX_SMART_BACKTRACKING
- backSearchIndex = fp->fInputIdx;
- #endif
fp->fInputIdx = ix;
// Save State to the URX_LOOP_C op that follows this one,
} else {
fHitEnd = TRUE;
}
-
- #ifdef REGEX_SMART_BACKTRACKING
- if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
- REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
- if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
- int64_t reverseIndex = fp->fInputIdx;
- UChar32 c;
- do {
- U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
- if (c == opValue) {
- break;
- }
- } while (reverseIndex > backSearchIndex);
- if (c == opValue) {
- fHitEnd = FALSE;
- fp = (REStackFrame *)fStack->popFrame(fFrameSize);
- fp->fInputIdx = reverseIndex;
- if (fp->fInputIdx > backSearchIndex) {
- fp = StateSave(fp, fp->fPatIdx, status);
- }
- fp->fPatIdx++; // Skip the LOOP_C, we just did that
- break;
- }
- }
- }
- #endif
-
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
U_ASSERT(opType == URX_STRING_LEN);
U_ASSERT(stringLen >= 2);
- if (fp->fInputIdx + stringLen > fActiveLimit) {
- // No match. String is longer than the remaining input text.
- fHitEnd = TRUE; // TODO: See ticket 6074
- fp = (REStackFrame *)fStack->popFrame(fFrameSize);
- break;
- }
-
const UChar * pInp = inputBuf + fp->fInputIdx;
+ const UChar * pInpLimit = inputBuf + fActiveLimit;
const UChar * pPat = litText+stringStartIdx;
const UChar * pEnd = pInp + stringLen;
- UBool success = FALSE;
- for(;;) {
- if (*pInp == *pPat) {
- pInp++;
- pPat++;
- if (pInp == pEnd) {
- // Successful Match.
- success = TRUE;
- break;
- }
- } else {
- // Match failed.
+ UBool success = TRUE;
+ while (pInp < pEnd) {
+ if (pInp >= pInpLimit) {
+ fHitEnd = TRUE;
+ success = FALSE;
+ break;
+ }
+ if (*pInp++ != *pPat++) {
+ success = FALSE;
break;
}
}
if (success) {
fp->fInputIdx += stringLen;
} else {
- #ifdef REGEX_SMART_BACKTRACKING
- if (fp->fInputIdx > backSearchIndex && fStack->size()) {
- REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
- if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
- // Reset to last start point
- int64_t reverseIndex = fp->fInputIdx;
- UChar32 c;
- pPat = litText+stringStartIdx;
-
- // Search backwards for a possible start
- do {
- U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
- if ((U_IS_BMP(c) && *pPat == c) ||
- (*pPat == U16_LEAD(c) && *(pPat+1) == U16_TRAIL(c))) {
- success = TRUE;
- break;
- }
- } while (reverseIndex > backSearchIndex);
-
- // And try again
- if (success) {
- fp = (REStackFrame *)fStack->popFrame(fFrameSize);
- fp->fInputIdx = reverseIndex;
- if (fp->fInputIdx > backSearchIndex) {
- fp = StateSave(fp, fp->fPatIdx, status);
- }
- fp->fPatIdx++; // Skip the LOOP_C, we just did that
- break;
- }
- }
- }
- #endif
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
case URX_BACKSLASH_B: // Test for word boundaries
{
UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
- success ^= (opValue != 0); // flip sense for \B
+ success ^= (UBool)(opValue != 0); // flip sense for \B
if (!success) {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
{
UBool success = isUWordBoundary(fp->fInputIdx);
- success ^= (opValue != 0); // flip sense for \B
+ success ^= (UBool)(opValue != 0); // flip sense for \B
if (!success) {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
- success ^= (opValue != 0); // flip sense for \D
+ success ^= (UBool)(opValue != 0); // flip sense for \D
if (!success) {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
}
if (!success) {
- #ifdef REGEX_SMART_BACKTRACKING
- if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
- REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
- if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
- // Try to find it, backwards
- int64_t reverseIndex = fp->fInputIdx;
- U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
- success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset
- do {
- U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
- if (c < 256) {
- Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
- if (s8->contains(c)) {
- success = !success;
- }
- } else {
- const UnicodeSet *s = fPattern->fStaticSets[opValue];
- if (s->contains(c)) {
- success = !success;
- }
- }
- } while (reverseIndex > backSearchIndex && !success);
-
- if (success) {
- fp = (REStackFrame *)fStack->popFrame(fFrameSize);
- fp->fInputIdx = reverseIndex;
- if (fp->fInputIdx > backSearchIndex) {
- fp = StateSave(fp, fp->fPatIdx, status);
- }
- fp->fPatIdx++; // Skip the LOOP_C, we just did that
- break;
- }
- }
- }
- #endif
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
}
}
-
- #ifdef REGEX_SMART_BACKTRACKING
- if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
- REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
- if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
- // Try to find it, backwards
- int64_t reverseIndex = fp->fInputIdx;
- U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
- UBool success = FALSE;
- do {
- U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
- if (c < 256) {
- Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
- if (s8->contains(c) == FALSE) {
- success = TRUE;
- break;
- }
- } else {
- const UnicodeSet *s = fPattern->fStaticSets[opValue];
- if (s->contains(c) == FALSE) {
- success = TRUE;
- break;
- }
- }
- } while (reverseIndex > backSearchIndex);
-
- if (success) {
- fp = (REStackFrame *)fStack->popFrame(fFrameSize);
- fp->fInputIdx = reverseIndex;
- if (fp->fInputIdx > backSearchIndex) {
- fp = StateSave(fp, fp->fPatIdx, status);
- }
- fp->fPatIdx++; // Skip the LOOP_C, we just did that
- break;
- }
- }
- }
- #endif
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
break;
}
// the character wasn't in the set.
- #ifdef REGEX_SMART_BACKTRACKING
- if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
- REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
- if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
- // Try to find it, backwards
- int64_t reverseIndex = fp->fInputIdx;
- U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
- UBool success = FALSE;
- do {
- U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
- if (c < 256) {
- Regex8BitSet *s8 = &fPattern->fSets8[opValue];
- if (s8->contains(c)) {
- success = TRUE;
- break;
- }
- } else {
- UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
- if (s->contains(c)) {
- success = TRUE;
- break;
- }
- }
- } while (reverseIndex > backSearchIndex);
-
- if (success) {
- fp = (REStackFrame *)fStack->popFrame(fFrameSize);
- fp->fInputIdx = reverseIndex;
- if (fp->fInputIdx > reverseIndex) {
- fp = StateSave(fp, fp->fPatIdx, status);
- }
- fp->fPatIdx++; // Skip the LOOP_C, we just did that
- break;
- }
- }
- }
- #endif
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
break;
break;
case URX_BACKREF:
+ {
+ U_ASSERT(opValue < fFrameSize);
+ int64_t groupStartIdx = fp->fExtra[opValue];
+ int64_t groupEndIdx = fp->fExtra[opValue+1];
+ U_ASSERT(groupStartIdx <= groupEndIdx);
+ int64_t inputIndex = fp->fInputIdx;
+ if (groupStartIdx < 0) {
+ // This capture group has not participated in the match thus far,
+ fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
+ break;
+ }
+ UBool success = TRUE;
+ for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
+ if (inputIndex >= fActiveLimit) {
+ success = FALSE;
+ fHitEnd = TRUE;
+ break;
+ }
+ if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
+ success = FALSE;
+ break;
+ }
+ }
+ if (success) {
+ fp->fInputIdx = inputIndex;
+ } else {
+ fp = (REStackFrame *)fStack->popFrame(fFrameSize);
+ }
+ }
+ break;
+
case URX_BACKREF_I:
{
U_ASSERT(opValue < fFrameSize);
int64_t groupStartIdx = fp->fExtra[opValue];
int64_t groupEndIdx = fp->fExtra[opValue+1];
U_ASSERT(groupStartIdx <= groupEndIdx);
- int64_t len = groupEndIdx-groupStartIdx;
if (groupStartIdx < 0) {
// This capture group has not participated in the match thus far,
fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
+ break;
}
+ CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
+ CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
- if (len == 0) {
- // The capture group match was of an empty string.
- // Verified by testing: Perl matches succeed in this case, so
- // we do too.
+ // Note: if the capture group match was of an empty string the backref
+ // match succeeds. Verified by testing: Perl matches succeed
+ // in this case, so we do too.
+
+ UBool success = TRUE;
+ for (;;) {
+ UChar32 captureGroupChar = captureGroupItr.next();
+ if (captureGroupChar == U_SENTINEL) {
+ success = TRUE;
break;
}
-
- UBool haveMatch = FALSE;
- if (fp->fInputIdx + len <= fActiveLimit) {
- if (opType == URX_BACKREF) {
- if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, (int32_t)len) == 0) {
- haveMatch = TRUE;
- }
- } else {
- if (u_strncasecmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx,
- (int32_t)len, U_FOLD_CASE_DEFAULT) == 0) {
- haveMatch = TRUE;
- }
+ UChar32 inputChar = inputItr.next();
+ if (inputChar == U_SENTINEL) {
+ success = FALSE;
+ fHitEnd = TRUE;
+ break;
}
- } else {
- // TODO: probably need to do a partial string comparison, and only
- // set HitEnd if the available input matched. Ticket #6074
- fHitEnd = TRUE;
+ if (inputChar != captureGroupChar) {
+ success = FALSE;
+ break;
+ }
+ }
+
+ if (success && inputItr.inExpansion()) {
+ // We obtained a match by consuming part of a string obtained from
+ // case-folding a single code point of the input text.
+ // This does not count as an overall match.
+ success = FALSE;
}
- if (haveMatch) {
- fp->fInputIdx += len; // Match. Advance current input position.
+
+ if (success) {
+ fp->fInputIdx = inputItr.getIndex();
} else {
- fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
+ fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
-
+
case URX_STO_INP_LOC:
{
U_ASSERT(opValue >= 0 && opValue < fFrameSize);
} else {
fHitEnd = TRUE;
}
-
- #ifdef REGEX_SMART_BACKTRACKING
- if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
- REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
- if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
- UBool success = FALSE;
- int64_t reverseIndex = fp->fInputIdx;
- UChar32 c;
- while (reverseIndex > backSearchIndex) {
- U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
- if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
- success = TRUE;
- break;
- } else if (c == U_SENTINEL) {
- break;
- }
- }
- if (success) {
- fHitEnd = FALSE;
- fp = (REStackFrame *)fStack->popFrame(fFrameSize);
- fp->fInputIdx = reverseIndex;
- if (fp->fInputIdx > backSearchIndex) {
- fp = StateSave(fp, fp->fPatIdx, status);
- }
- fp->fPatIdx++; // Skip the LOOP_C, we just did that
- break;
- }
- }
- }
- #endif
-
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
case URX_STRING_I:
+ // Case-insensitive test input against a literal string.
+ // Strings require two slots in the compiled pattern, one for the
+ // offset to the string text, and one for the length.
+ // The compiled string has already been case folded.
{
- // Test input against a literal string.
- // Strings require two slots in the compiled pattern, one for the
- // offset to the string text, and one for the length.
- const UCaseProps *csp = ucase_getSingleton();
- {
- int32_t stringStartIdx, stringLen;
- stringStartIdx = opValue;
-
- op = (int32_t)pat[fp->fPatIdx];
- fp->fPatIdx++;
- opType = URX_TYPE(op);
- opValue = URX_VAL(op);
- U_ASSERT(opType == URX_STRING_LEN);
- stringLen = opValue;
-
- const UChar *patternChars = litText+stringStartIdx;
- const UChar *patternEnd = patternChars+stringLen;
-
- const UChar *foldChars = NULL;
- int32_t foldOffset, foldLength;
- UChar32 c;
-
- #ifdef REGEX_SMART_BACKTRACKING
- int32_t originalInputIdx = fp->fInputIdx;
- #endif
- UBool success = TRUE;
-
- foldOffset = foldLength = 0;
-
- while (patternChars < patternEnd && success) {
- if(foldOffset < foldLength) {
- U16_NEXT_UNSAFE(foldChars, foldOffset, c);
- } else {
- U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
- foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
- if(foldLength >= 0) {
- if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings
- foldOffset = 0;
- U16_NEXT_UNSAFE(foldChars, foldOffset, c);
- } else {
- c = foldLength;
- foldLength = foldOffset; // to avoid reading chars from the folding buffer
- }
- }
- }
-
- if (fp->fInputIdx <= fActiveLimit) {
- if (U_IS_BMP(c)) {
- success = (*patternChars == c);
- patternChars += 1;
- } else if (patternChars+1 < patternEnd) {
- success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
- patternChars += 2;
- }
- } else {
- success = FALSE;
- fHitEnd = TRUE; // TODO: See ticket 6074
- }
- }
-
- if (!success) {
- #ifdef REGEX_SMART_BACKTRACKING
- if (fp->fInputIdx > backSearchIndex && fStack->size()) {
- REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
- if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
- // Reset to last start point
- int64_t reverseIndex = originalInputIdx;
- patternChars = litText+stringStartIdx;
-
- // Search backwards for a possible start
- do {
- U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
- foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
- if(foldLength >= 0) {
- if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings
- foldOffset = 0;
- U16_NEXT_UNSAFE(foldChars, foldOffset, c);
- } else {
- c = foldLength;
- foldLength = foldOffset; // to avoid reading chars from the folding buffer
- }
- }
-
- if ((U_IS_BMP(c) && *patternChars == c) ||
- (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
- success = TRUE;
- break;
- }
- } while (reverseIndex > backSearchIndex);
-
- // And try again
- if (success) {
- fp = (REStackFrame *)fStack->popFrame(fFrameSize);
- fp->fInputIdx = reverseIndex;
- if (fp->fInputIdx > backSearchIndex) {
- fp = StateSave(fp, fp->fPatIdx, status);
- }
- fp->fPatIdx++; // Skip the LOOP_C, we just did that
- break;
- }
- }
+ const UChar *patternString = litText + opValue;
+
+ op = (int32_t)pat[fp->fPatIdx];
+ fp->fPatIdx++;
+ opType = URX_TYPE(op);
+ opValue = URX_VAL(op);
+ U_ASSERT(opType == URX_STRING_LEN);
+ int32_t patternStringLen = opValue; // Length of the string from the pattern.
+
+ UChar32 cText;
+ UChar32 cPattern;
+ UBool success = TRUE;
+ int32_t patternStringIdx = 0;
+ CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
+ while (patternStringIdx < patternStringLen) {
+ U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
+ cText = inputIterator.next();
+ if (cText != cPattern) {
+ success = FALSE;
+ if (cText == U_SENTINEL) {
+ fHitEnd = TRUE;
}
- #endif
- fp = (REStackFrame *)fStack->popFrame(fFrameSize);
+ break;
}
}
+ if (inputIterator.inExpansion()) {
+ success = FALSE;
+ }
+
+ if (success) {
+ fp->fInputIdx = inputIterator.getIndex();
+ } else {
+ fp = (REStackFrame *)fStack->popFrame(fFrameSize);
+ }
}
break;
-
+
case URX_LB_START:
{
// Entering a look-behind block.
int32_t stackLoc = URX_VAL(loopcOp);
U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
fp->fExtra[stackLoc] = fp->fInputIdx;
- #ifdef REGEX_SMART_BACKTRACKING
- backSearchIndex = fp->fInputIdx;
- #endif
fp->fInputIdx = ix;
// Save State to the URX_LOOP_C op that follows this one,
int32_t stackLoc = URX_VAL(loopcOp);
U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
fp->fExtra[stackLoc] = fp->fInputIdx;
- #ifdef REGEX_SMART_BACKTRACKING
- backSearchIndex = fp->fInputIdx;
- #endif
fp->fInputIdx = ix;
// Save State to the URX_LOOP_C op that follows this one,
U_NAMESPACE_END
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
-