//
/*
**************************************************************************
-* Copyright (C) 2002-2003 International Business Machines Corporation *
+* Copyright (C) 2002-2004 International Business Machines Corporation *
* and others. All rights reserved. *
**************************************************************************
*/
#include "unicode/uniset.h"
#include "unicode/uchar.h"
#include "unicode/ustring.h"
+#include "unicode/rbbi.h"
#include "uassert.h"
#include "cmemory.h"
#include "uvector.h"
fDeferredStatus = U_ZERO_ERROR;
fStack = new UVector32(fDeferredStatus);
fData = fSmallData;
+ fWordBreakItr = NULL;
if (pat==NULL) {
fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
return;
fDeferredStatus = U_ZERO_ERROR;
fStack = new UVector32(status);
fData = fSmallData;
+ fWordBreakItr = NULL;
if (U_FAILURE(status)) {
return;
}
fData = fSmallData;
fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
fPattern = fPatternOwned;
+ fWordBreakItr = NULL;
if (U_FAILURE(status)) {
return;
}
RegexMatcher::~RegexMatcher() {
delete fStack;
if (fData != fSmallData) {
- delete fData;
+ uprv_free(fData);
fData = NULL;
}
if (fPatternOwned) {
fPatternOwned = NULL;
fPattern = NULL;
}
+ #if UCONFIG_NO_BREAK_ITERATION==0
+ delete fWordBreakItr;
+ #endif
}
}
// Copy input string from the end of previous match to start of current match
- int32_t len = fMatchStart-fLastMatchEnd;
+ int32_t len = fMatchStart-fLastReplaceEnd;
if (len > 0) {
- dest.append(*fInput, fLastMatchEnd, len);
+ dest.append(*fInput, fLastReplaceEnd, len);
}
+ fLastReplaceEnd = fMatchEnd;
// scan the replacement text, looking for substitutions ($n) and \escapes.
UChar32 escapedChar = replacement.unescapeAt(replIdx);
if (escapedChar != (UChar32)0xFFFFFFFF) {
dest.append(escapedChar);
- replIdx += (c==0x55? 9: 5);
// TODO: Report errors for mal-formed \u escapes?
// As this is, the original sequence is output, which may be OK.
continue;
}
int32_t startPos = fMatchEnd;
+
+ if (fMatch) {
+ // Save the position of any previous successful match.
+ fLastMatchEnd = fMatchEnd;
+
+ if (fMatchStart == fMatchEnd) {
+ // Previous match had zero length. Move start position up one position
+ // to avoid sending find() into a loop on zero-length matches.
+ if (startPos == fInput->length()) {
+ fMatch = FALSE;
+ return FALSE;
+ }
+ startPos = fInput->moveIndex32(startPos, 1);
+ }
+ } else {
+ if (fLastMatchEnd >= 0) {
+ // A previous find() failed to match. Don't try again.
+ // (without this test, a pattern with a zero-length match
+ // could match again at the end of an input string.)
+ return FALSE;
+ }
+ }
+
int32_t inputLen = fInput->length();
+
+ // Compute the position in the input string beyond which a match can not begin, because
+ // the minimum length match would extend past the end of the input.
int32_t testLen = inputLen - fPattern->fMinMatchLen;
if (startPos > testLen) {
+ fMatch = FALSE;
return FALSE;
}
// Matches are only possible at the start of the input string
// (pattern begins with ^ or \A)
if (startPos > 0) {
+ fMatch = FALSE;
return FALSE;
}
MatchAt(startPos, fDeferredStatus);
}
}
if (pos >= testLen) {
+ fMatch = FALSE;
return FALSE;
}
}
}
}
if (pos >= testLen) {
+ fMatch = FALSE;
return FALSE;
}
}
}
for (;;) {
- UChar32 c = inputBuf[startPos-1];
+ c = inputBuf[startPos-1];
if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
- (c == 0x0a || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029 ||
- c == 0x0d && startPos+1 < inputLen && inputBuf[startPos+1] != 0x0a)) {
- MatchAt(startPos, fDeferredStatus);
- if (U_FAILURE(fDeferredStatus)) {
- return FALSE;
- }
- if (fMatch) {
- return TRUE;
- }
+ (c == 0x0a || c == 0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029 )) {
+ if (c == 0x0d && startPos < inputLen && inputBuf[startPos] == 0x0a) {
+ startPos++;
+ }
+ MatchAt(startPos, fDeferredStatus);
+ if (U_FAILURE(fDeferredStatus)) {
+ return FALSE;
+ }
+ if (fMatch) {
+ return TRUE;
+ }
}
if (startPos >= testLen) {
+ fMatch = FALSE;
return FALSE;
}
U16_NEXT(inputBuf, startPos, inputLen, c); // like c = inputBuf[startPos++];
return FALSE;
}
int32_t inputLen = fInput->length();
- if (start < 0 || start >= inputLen) {
+ if (start < 0 || start > inputLen) {
status = U_INDEX_OUTOFBOUNDS_ERROR;
return FALSE;
}
+//--------------------------------------------------------------------------------
+//
+// lookingAt()
+//
+//--------------------------------------------------------------------------------
UBool RegexMatcher::lookingAt(UErrorCode &status) {
if (U_FAILURE(status)) {
return FALSE;
}
+UBool RegexMatcher::lookingAt(int32_t start, UErrorCode &status) {
+ if (U_FAILURE(status)) {
+ return FALSE;
+ }
+ if (U_FAILURE(fDeferredStatus)) {
+ status = fDeferredStatus;
+ return FALSE;
+ }
+ if (start < 0 || start > fInput->length()) {
+ status = U_INDEX_OUTOFBOUNDS_ERROR;
+ return FALSE;
+ }
+ reset();
+ MatchAt(start, status);
+ return fMatch;
+}
+
+
+//--------------------------------------------------------------------------------
+//
+// matches()
+//
+//--------------------------------------------------------------------------------
UBool RegexMatcher::matches(UErrorCode &status) {
if (U_FAILURE(status)) {
return FALSE;
}
+UBool RegexMatcher::matches(int32_t start, UErrorCode &status) {
+ if (U_FAILURE(status)) {
+ return FALSE;
+ }
+ if (U_FAILURE(fDeferredStatus)) {
+ status = fDeferredStatus;
+ return FALSE;
+ }
+ if (start < 0 || start > fInput->length()) {
+ status = U_INDEX_OUTOFBOUNDS_ERROR;
+ return FALSE;
+ }
+ reset();
+ MatchAt(start, status);
+ UBool success = (fMatch && fMatchEnd==fInput->length());
+ return success;
+}
+
const RegexPattern &RegexMatcher::pattern() const {
//
//--------------------------------------------------------------------------------
RegexMatcher &RegexMatcher::reset() {
- fMatchStart = 0;
- fMatchEnd = 0;
- fLastMatchEnd = 0;
- fMatch = FALSE;
+ fMatchStart = 0;
+ fMatchEnd = 0;
+ fLastMatchEnd = -1;
+ fLastReplaceEnd = 0;
+ fMatch = FALSE;
resetStack();
return *this;
}
RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
fInput = &input;
reset();
+ if (fWordBreakItr != NULL) {
+ #if UCONFIG_NO_BREAK_ITERATION==0
+ fWordBreakItr->setText(input);
+ #endif
+ }
return *this;
}
+RegexMatcher &RegexMatcher::reset(const UChar *) {
+ fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
+ return *this;
+}
-REStackFrame *RegexMatcher::resetStack() {
- // Discard any previous contents of the state save stack, and initialize a
- // new stack frame to all -1. The -1s are needed for capture group limits, where
- // they indicate that a group has not yet matched anything.
- fStack->removeAllElements();
-
- int32_t *iFrame = fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
- int i;
- for (i=0; i<fPattern->fFrameSize; i++) {
- iFrame[i] = -1;
+RegexMatcher &RegexMatcher::reset(int32_t position, UErrorCode &status) {
+ if (U_FAILURE(status)) {
+ return *this;
}
- return (REStackFrame *)iFrame;
+ reset();
+ if (position < 0 || position >= fInput->length()) {
+ status = U_INDEX_OUTOFBOUNDS_ERROR;
+ return *this;
+ }
+ fMatchEnd = position;
+ return *this;
}
+
+
//--------------------------------------------------------------------------------
//
// setTrace
+//================================================================================
+//
+// Code following this point in this file is the internal
+// Match Engine Implementation.
+//
+//================================================================================
+
+
+//--------------------------------------------------------------------------------
+//
+// resetStack
+// Discard any previous contents of the state save stack, and initialize a
+// new stack frame to all -1. The -1s are needed for capture group limits,
+// where they indicate that a group has not yet matched anything.
+//--------------------------------------------------------------------------------
+REStackFrame *RegexMatcher::resetStack() {
+ // Discard any previous contents of the state save stack, and initialize a
+ // new stack frame to all -1. The -1s are needed for capture group limits, where
+ // they indicate that a group has not yet matched anything.
+ fStack->removeAllElements();
+
+ int32_t *iFrame = fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
+ int i;
+ for (i=0; i<fPattern->fFrameSize; i++) {
+ iFrame[i] = -1;
+ }
+ return (REStackFrame *)iFrame;
+}
+
+
+
//--------------------------------------------------------------------------------
//
// isWordBoundary
// opposite in membership in \w set
//
// parameters: pos - the current position in the input buffer
-// start - the position where the match operation started.
-// don't backup before this position when looking back
-// for a preceding base char.
//
//--------------------------------------------------------------------------------
UBool RegexMatcher::isWordBoundary(int32_t pos) {
return isBoundary;
}
+//--------------------------------------------------------------------------------
+//
+// isUWordBoundary
+//
+// Test for a word boundary using RBBI word break.
+//
+// parameters: pos - the current position in the input buffer
+//
+//--------------------------------------------------------------------------------
+UBool RegexMatcher::isUWordBoundary(int32_t pos) {
+ UBool returnVal = FALSE;
+#if UCONFIG_NO_BREAK_ITERATION==0
+ UErrorCode status = U_ZERO_ERROR;
+
+ // If we haven't yet created a break iterator for this matcher, do it now.
+ if (fWordBreakItr == NULL) {
+ fWordBreakItr =
+ (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
+ if (U_FAILURE(status)) {
+ // TODO: reliable error reporting for BI failures.
+ return FALSE;
+ }
+ fWordBreakItr->setText(*fInput);
+ }
+
+ returnVal = fWordBreakItr->isBoundary(pos);
+#endif
+ return returnVal;
+}
+
//--------------------------------------------------------------------------------
//
// StateSave
if (fp->fInputIdx == inputLen-1) {
UChar32 c = fInput->char32At(fp->fInputIdx);
if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
- break; // At new-line at end of input. Success
+ // If not in the middle of a CR/LF sequence
+ if ( !(c==0x0a && fp->fInputIdx>0 && inputBuf[fp->fInputIdx-1]==0x0d)) {
+ break;
+ // At new-line at end of input. Success
+ }
}
}
// We really are at the end of input. Success.
break;
}
- // If we are positioned just before a new-line , succeed.
+ // If we are positioned just before a new-line, succeed.
// It makes no difference where the new-line is within the input.
UChar32 c = inputBuf[fp->fInputIdx];
if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
- break; // At new-line at end of input. Success
+ // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
+ if ( !(c==0x0a && fp->fInputIdx>0 && inputBuf[fp->fInputIdx-1]==0x0d)) {
+ break; // At new-line at end of input. Success
+ }
}
+
// not at a new line. Fail.
fp = (REStackFrame *)fStack->popFrame(frameSize);
}
break;
+ case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
+ {
+ UBool success = isUWordBoundary(fp->fInputIdx);
+ success ^= (opValue != 0); // flip sense for \B
+ if (!success) {
+ fp = (REStackFrame *)fStack->popFrame(frameSize);
+ }
+ }
+ break;
+
+
case URX_BACKSLASH_D: // Test for decimal digit
{
if (fp->fInputIdx >= inputLen) {
break;
}
}
- }
+ }
// Either at end of input, or the character wasn't in the set.
// Either way, we need to back track out.
fp = (REStackFrame *)fStack->popFrame(frameSize);
// we do too.
break;
}
- /*
- if ((fp->fInputIdx + len > inputLen) ||
- u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, len) != 0) {
- fp = (REStackFrame *)fStack->popFrame(frameSize); // FAIL, no match.
- } else {
- fp->fInputIdx += len; // Match. Advance current input position.
- }
- */
+
UBool haveMatch = FALSE;
if (fp->fInputIdx + len <= inputLen) {
if (opType == URX_BACKREF) {
if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
break;
}
- }
+ }
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
opValue = URX_VAL(op);
U_ASSERT(opType == URX_STRING_LEN);
stringLen = opValue;
-
+
int32_t stringEndIndex = fp->fInputIdx + stringLen;
- if (stringEndIndex <= inputLen &&
- u_strncasecmp(inputBuf+fp->fInputIdx, litText+stringStartIdx,
- stringLen, U_FOLD_CASE_DEFAULT) == 0) {
- // Success. Advance the current input position.
- fp->fInputIdx = stringEndIndex;
- } else {
- // No match. Back up matching to a saved state
- fp = (REStackFrame *)fStack->popFrame(frameSize);
- }
+ if (stringEndIndex <= inputLen) {
+ if (u_strncasecmp(inputBuf+fp->fInputIdx, litText+stringStartIdx,
+ stringLen, U_FOLD_CASE_DEFAULT) == 0) {
+ // Success. Advance the current input position.
+ fp->fInputIdx = stringEndIndex;
+ break;
+ }
+ }
+ // No match. Back up matching to a saved state
+ fp = (REStackFrame *)fStack->popFrame(frameSize);
}
break;
{
// Loop through input until the input is exhausted (we reach an end-of-line)
// In multi-line mode, we can just go straight to the end of the input.
- int32_t ix = inputLen;
- if (opValue == 0) {
+ int32_t ix;
+ if (opValue == 1) {
+ // Multi-line mode.
+ ix = inputLen;
+ } else {
// NOT multi-line mode. Line endings do not match '.'
// Scan forward until a line ending or end of input.
ix = fp->fInputIdx;
for (;;) {
if (ix >= inputLen) {
+ ix = inputLen;
break;
}
UChar32 c;
fMatchStart = startIdx;
fMatchEnd = fp->fInputIdx;
if (fTraceDebug) {
- REGEX_RUN_DEBUG_PRINTF("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd);
+ REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd));
}
}
else
{
if (fTraceDebug) {
- REGEX_RUN_DEBUG_PRINTF("No match\n\n");
+ REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
}
}
-const char RegexMatcher::fgClassID = 0;
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
U_NAMESPACE_END