ICU-6.2.14.tar.gz

[apple/icu.git] / icuSources / i18n / rematch.cpp
diff --git a/icuSources/i18n/rematch.cpp b/icuSources/i18n/rematch.cpp

index 7fc42a2be8d76c17b4930ef8a1c459e69a0cdf2a..aabbadf9c650a79bf4f8de8dce56a998a5e6fc0f 100644 (file)
--- a/icuSources/i18n/rematch.cpp
+++ b/icuSources/i18n/rematch.cpp
@@ -6,7 +6,7 @@
  //
  /*
  **************************************************************************
-*   Copyright (C) 2002-2003 International Business Machines Corporation  *
+*   Copyright (C) 2002-2004 International Business Machines Corporation  *
  *   and others. All rights reserved.                                     *
  **************************************************************************
  */
@@ -18,6 +18,7 @@
  #include "unicode/uniset.h"
  #include "unicode/uchar.h"
  #include "unicode/ustring.h"
+#include "unicode/rbbi.h"
  #include "uassert.h"
  #include "cmemory.h"
  #include "uvector.h"
@@ -42,6 +43,7 @@ RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
      fDeferredStatus    = U_ZERO_ERROR;
      fStack             = new UVector32(fDeferredStatus); 
      fData              = fSmallData;
+    fWordBreakItr      = NULL;
      if (pat==NULL) {
          fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
          return;
@@ -67,6 +69,7 @@ RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &inp
      fDeferredStatus    = U_ZERO_ERROR;
      fStack             = new UVector32(status); 
      fData              = fSmallData;
+    fWordBreakItr      = NULL;
      if (U_FAILURE(status)) {
          return;
      }
@@ -89,6 +92,7 @@ RegexMatcher::RegexMatcher(const UnicodeString &regexp,
      fData              = fSmallData;
      fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
      fPattern           = fPatternOwned;
+    fWordBreakItr      = NULL;
      if (U_FAILURE(status)) {
          return;
      }
@@ -107,7 +111,7 @@ RegexMatcher::RegexMatcher(const UnicodeString &regexp,
  RegexMatcher::~RegexMatcher() {
      delete fStack;
      if (fData != fSmallData) {
-        delete fData;
+        uprv_free(fData);
          fData = NULL;
      }
      if (fPatternOwned) {
@@ -115,6 +119,9 @@ RegexMatcher::~RegexMatcher() {
          fPatternOwned = NULL;
          fPattern = NULL;
      }
+    #if UCONFIG_NO_BREAK_ITERATION==0
+    delete fWordBreakItr;
+    #endif
  }
  
  
@@ -142,10 +149,11 @@ RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
      }
  
      // Copy input string from the end of previous match to start of current match
-    int32_t  len = fMatchStart-fLastMatchEnd;
+    int32_t  len = fMatchStart-fLastReplaceEnd;
      if (len > 0) {
-        dest.append(*fInput, fLastMatchEnd, len);
+        dest.append(*fInput, fLastReplaceEnd, len);
      }
+    fLastReplaceEnd = fMatchEnd;
      
  
      // scan the replacement text, looking for substitutions ($n) and \escapes.
@@ -172,7 +180,6 @@ RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
                  UChar32 escapedChar = replacement.unescapeAt(replIdx);
                  if (escapedChar != (UChar32)0xFFFFFFFF) {
                      dest.append(escapedChar);
-                    replIdx += (c==0x55? 9: 5); 
                      // TODO:  Report errors for mal-formed \u escapes?
                      //        As this is, the original sequence is output, which may be OK.
                      continue;
@@ -306,9 +313,36 @@ UBool RegexMatcher::find() {
      }
  
      int32_t startPos = fMatchEnd;
+
+    if (fMatch) {
+        // Save the position of any previous successful match.
+        fLastMatchEnd = fMatchEnd;
+
+        if (fMatchStart == fMatchEnd) {
+            // Previous match had zero length.  Move start position up one position
+            //  to avoid sending find() into a loop on zero-length matches.
+            if (startPos == fInput->length()) {
+                fMatch = FALSE;
+                return FALSE;
+            }
+            startPos = fInput->moveIndex32(startPos, 1);
+        }
+    } else {
+        if (fLastMatchEnd >= 0) {
+            // A previous find() failed to match.  Don't try again.
+            //   (without this test, a pattern with a zero-length match
+            //    could match again at the end of an input string.)
+            return FALSE;
+        }
+    }
+
      int32_t inputLen = fInput->length();
+
+    // Compute the position in the input string beyond which a match can not begin, because
+    //   the minimum length match would extend past the end of the input.
      int32_t testLen  = inputLen - fPattern->fMinMatchLen;
      if (startPos > testLen) {
+        fMatch = FALSE;
          return FALSE;
      }
  
@@ -342,6 +376,7 @@ UBool RegexMatcher::find() {
          // Matches are only possible at the start of the input string
          //   (pattern begins with ^ or \A)
          if (startPos > 0) {
+            fMatch = FALSE;
              return FALSE;
          }
          MatchAt(startPos, fDeferredStatus);
@@ -369,6 +404,7 @@ UBool RegexMatcher::find() {
                      }
                  }
                  if (pos >= testLen) {
+                    fMatch = FALSE;
                      return FALSE;
                  }
              }
@@ -394,6 +430,7 @@ UBool RegexMatcher::find() {
                      }
                  }
                  if (pos >= testLen) {
+                    fMatch = FALSE;
                      return FALSE;
                  }
              }
@@ -415,19 +452,22 @@ UBool RegexMatcher::find() {
              }
  
              for (;;) {
-                UChar32 c = inputBuf[startPos-1];
+                c = inputBuf[startPos-1];
                  if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
-                    (c == 0x0a ||  c==0x0c || c==0x85 ||c==0x2028 || c==0x2029 || 
-                    c == 0x0d && startPos+1 < inputLen && inputBuf[startPos+1] != 0x0a)) {
-                    MatchAt(startPos, fDeferredStatus);
-                    if (U_FAILURE(fDeferredStatus)) {
-                        return FALSE;
-                    }
-                    if (fMatch) {
-                        return TRUE;
-                    }
+                    (c == 0x0a || c == 0x0d ||  c==0x0c || c==0x85 ||c==0x2028 || c==0x2029 )) {
+                        if (c == 0x0d && startPos < inputLen && inputBuf[startPos] == 0x0a) {
+                            startPos++;
+                        }
+                        MatchAt(startPos, fDeferredStatus);
+                        if (U_FAILURE(fDeferredStatus)) {
+                            return FALSE;
+                        }
+                        if (fMatch) {
+                            return TRUE;
+                        }
                  }
                  if (startPos >= testLen) {
+                    fMatch = FALSE;
                      return FALSE;
                  }
                  U16_NEXT(inputBuf, startPos, inputLen, c);  // like c = inputBuf[startPos++];
@@ -456,7 +496,7 @@ UBool RegexMatcher::find(int32_t start, UErrorCode &status) {
          return FALSE;
      }
      int32_t inputLen = fInput->length();
-    if (start < 0 || start >= inputLen) {
+    if (start < 0 || start > inputLen) {
          status = U_INDEX_OUTOFBOUNDS_ERROR;
          return FALSE;
      }
@@ -516,6 +556,11 @@ const UnicodeString &RegexMatcher::input() const {
  
  
  
+//--------------------------------------------------------------------------------
+//
+//  lookingAt()
+//
+//--------------------------------------------------------------------------------
  UBool RegexMatcher::lookingAt(UErrorCode &status) {
      if (U_FAILURE(status)) {
          return FALSE;
@@ -530,7 +575,30 @@ UBool RegexMatcher::lookingAt(UErrorCode &status) {
  }
  
  
+UBool RegexMatcher::lookingAt(int32_t start, UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return FALSE;
+    }
+    if (U_FAILURE(fDeferredStatus)) {
+        status = fDeferredStatus;
+        return FALSE;
+    }
+    if (start < 0 || start > fInput->length()) {
+        status = U_INDEX_OUTOFBOUNDS_ERROR;
+        return FALSE;
+    }
+    reset();
+    MatchAt(start, status);
+    return fMatch;
+}
+
+
  
+//--------------------------------------------------------------------------------
+//
+//  matches()
+//
+//--------------------------------------------------------------------------------
  UBool RegexMatcher::matches(UErrorCode &status) {
      if (U_FAILURE(status)) {
          return FALSE;
@@ -546,6 +614,24 @@ UBool RegexMatcher::matches(UErrorCode &status) {
  }
  
  
+UBool RegexMatcher::matches(int32_t start, UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return FALSE;
+    }
+    if (U_FAILURE(fDeferredStatus)) {
+        status = fDeferredStatus;
+        return FALSE;
+    }
+    if (start < 0 || start > fInput->length()) {
+        status = U_INDEX_OUTOFBOUNDS_ERROR;
+        return FALSE;
+    }
+    reset();
+    MatchAt(start, status);
+    UBool   success  = (fMatch && fMatchEnd==fInput->length());
+    return success;
+}
+
  
  
  const RegexPattern &RegexMatcher::pattern() const {
@@ -614,10 +700,11 @@ UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErro
  //
  //--------------------------------------------------------------------------------
  RegexMatcher &RegexMatcher::reset() {
-    fMatchStart   = 0;
-    fMatchEnd     = 0;
-    fLastMatchEnd = 0;
-    fMatch        = FALSE;
+    fMatchStart     = 0;
+    fMatchEnd       = 0;
+    fLastMatchEnd   = -1;
+    fLastReplaceEnd = 0;
+    fMatch          = FALSE;
      resetStack();
      return *this;
  }
@@ -627,27 +714,37 @@ RegexMatcher &RegexMatcher::reset() {
  RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
      fInput          = &input;
      reset();
+    if (fWordBreakItr != NULL) {
+        #if UCONFIG_NO_BREAK_ITERATION==0
+        fWordBreakItr->setText(input);
+        #endif
+    }
      return *this;
  }
  
+RegexMatcher &RegexMatcher::reset(const UChar *) {
+    fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
+    return *this;
+}
  
  
-REStackFrame *RegexMatcher::resetStack() {
-    // Discard any previous contents of the state save stack, and initialize a
-    //  new stack frame to all -1.  The -1s are needed for capture group limits, where
-    //  they indicate that a group has not yet matched anything.
-    fStack->removeAllElements();
-
-    int32_t *iFrame = fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
-    int i;
-    for (i=0; i<fPattern->fFrameSize; i++) {
-        iFrame[i] = -1;
+RegexMatcher &RegexMatcher::reset(int32_t position, UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return *this;
      }
-    return (REStackFrame *)iFrame;
+    reset();
+    if (position < 0 || position >= fInput->length()) {
+        status = U_INDEX_OUTOFBOUNDS_ERROR;
+        return *this;
+    }
+    fMatchEnd = position;
+    return *this;
  }
  
  
  
+
+
  //--------------------------------------------------------------------------------
  //
  //    setTrace
@@ -792,6 +889,37 @@ int32_t RegexMatcher::start(int group, UErrorCode &status) const {
  
  
  
+//================================================================================
+//
+//    Code following this point in this file is the internal
+//    Match Engine Implementation.
+//
+//================================================================================
+
+
+//--------------------------------------------------------------------------------
+//
+//   resetStack
+//           Discard any previous contents of the state save stack, and initialize a
+//           new stack frame to all -1.  The -1s are needed for capture group limits, 
+//           where they indicate that a group has not yet matched anything.
+//--------------------------------------------------------------------------------
+REStackFrame *RegexMatcher::resetStack() {
+    // Discard any previous contents of the state save stack, and initialize a
+    //  new stack frame to all -1.  The -1s are needed for capture group limits, where
+    //  they indicate that a group has not yet matched anything.
+    fStack->removeAllElements();
+
+    int32_t *iFrame = fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
+    int i;
+    for (i=0; i<fPattern->fFrameSize; i++) {
+        iFrame[i] = -1;
+    }
+    return (REStackFrame *)iFrame;
+}
+
+
+
  //--------------------------------------------------------------------------------
  //
  //   isWordBoundary 
@@ -804,9 +932,6 @@ int32_t RegexMatcher::start(int group, UErrorCode &status) const {
  //                               opposite in membership in \w set
  //
  //          parameters:   pos   - the current position in the input buffer
-//                        start - the position where the match operation started.
-//                                don't backup before this position when looking back
-//                                for a preceding base char.
  //
  //--------------------------------------------------------------------------------
  UBool RegexMatcher::isWordBoundary(int32_t pos) {
@@ -845,6 +970,36 @@ UBool RegexMatcher::isWordBoundary(int32_t pos) {
      return isBoundary;
  }
  
+//--------------------------------------------------------------------------------
+//
+//   isUWordBoundary 
+//
+//         Test for a word boundary using RBBI word break.
+//
+//          parameters:   pos   - the current position in the input buffer
+//
+//--------------------------------------------------------------------------------
+UBool RegexMatcher::isUWordBoundary(int32_t pos) {
+    UBool       returnVal = FALSE;
+#if UCONFIG_NO_BREAK_ITERATION==0
+    UErrorCode  status    = U_ZERO_ERROR;  
+    
+    // If we haven't yet created a break iterator for this matcher, do it now.
+    if (fWordBreakItr == NULL) {
+        fWordBreakItr = 
+            (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
+        if (U_FAILURE(status)) {
+            // TODO:  reliable error reporting for BI failures.
+            return FALSE;
+        }
+        fWordBreakItr->setText(*fInput);
+    }
+
+    returnVal = fWordBreakItr->isBoundary(pos);
+#endif
+    return   returnVal;
+}
+
  //--------------------------------------------------------------------------------
  //
  //   StateSave      
@@ -1080,7 +1235,11 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
              if (fp->fInputIdx == inputLen-1) {
                  UChar32 c = fInput->char32At(fp->fInputIdx);
                  if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
-                    break;                         // At new-line at end of input. Success
+                    // If not in the middle of a CR/LF sequence
+                    if ( !(c==0x0a && fp->fInputIdx>0 && inputBuf[fp->fInputIdx-1]==0x0d)) {
+                        break;
+                        // At new-line at end of input. Success
+                    }
                  }
              }
  
@@ -1101,12 +1260,16 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
                       // We really are at the end of input.  Success.
                       break;
                   }
-                 // If we are positioned just before a new-line , succeed.
+                 // If we are positioned just before a new-line, succeed.
                   // It makes no difference where the new-line is within the input.
                   UChar32 c = inputBuf[fp->fInputIdx];
                   if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
-                     break;                         // At new-line at end of input. Success
+                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
+                     if ( !(c==0x0a && fp->fInputIdx>0 && inputBuf[fp->fInputIdx-1]==0x0d)) {
+                        break;                         // At new-line at end of input. Success
+                     }
                   }
+                 
                   // not at a new line.  Fail.
                   fp = (REStackFrame *)fStack->popFrame(frameSize);
               }
@@ -1151,6 +1314,17 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
              break;
  
  
+        case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
+            {
+                UBool success = isUWordBoundary(fp->fInputIdx);
+                success ^= (opValue != 0);     // flip sense for \B
+                if (!success) {
+                    fp = (REStackFrame *)fStack->popFrame(frameSize);
+                }
+            }
+            break;
+
+
          case URX_BACKSLASH_D:            // Test for decimal digit
              {
                  if (fp->fInputIdx >= inputLen) {
@@ -1352,7 +1526,7 @@ GC_Done:
                          break;
                      }
                  }
-            }
+            } 
              // Either at end of input, or the character wasn't in the set.
              // Either way, we need to back track out.
              fp = (REStackFrame *)fStack->popFrame(frameSize);
@@ -1639,14 +1813,7 @@ GC_Done:
                          //   we do too.
                          break;
                      }
-                /*
-                if ((fp->fInputIdx + len > inputLen) || 
-                    u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, len) != 0) {
-                    fp = (REStackFrame *)fStack->popFrame(frameSize);   // FAIL, no match.
-                } else {
-                    fp->fInputIdx += len;     // Match.  Advance current input position.
-                }
-                */
+
                  UBool  haveMatch = FALSE;
                  if (fp->fInputIdx + len <= inputLen) {
                      if (opType == URX_BACKREF) {
@@ -1729,7 +1896,7 @@ GC_Done:
                  if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {           
                      break;
                  }
-            }
+            } 
              fp = (REStackFrame *)fStack->popFrame(frameSize);
              break;
  
@@ -1747,17 +1914,18 @@ GC_Done:
                  opValue = URX_VAL(op);
                  U_ASSERT(opType == URX_STRING_LEN);
                  stringLen = opValue;
-
+                
                  int32_t stringEndIndex = fp->fInputIdx + stringLen;
-                if (stringEndIndex <= inputLen &&
-                    u_strncasecmp(inputBuf+fp->fInputIdx, litText+stringStartIdx,
-                                  stringLen, U_FOLD_CASE_DEFAULT) == 0) {
-                    // Success.  Advance the current input position.
-                    fp->fInputIdx = stringEndIndex;
-                } else {
-                    // No match.  Back up matching to a saved state
-                    fp = (REStackFrame *)fStack->popFrame(frameSize);
-                }
+                if (stringEndIndex <= inputLen) {
+                    if (u_strncasecmp(inputBuf+fp->fInputIdx, litText+stringStartIdx,
+                        stringLen, U_FOLD_CASE_DEFAULT) == 0) {
+                        // Success.  Advance the current input position.
+                        fp->fInputIdx = stringEndIndex;
+                        break;
+                    }
+                }                 
+                // No match.  Back up matching to a saved state
+                fp = (REStackFrame *)fStack->popFrame(frameSize);
              }
              break;
  
@@ -2003,13 +2171,17 @@ GC_Done:
              {
                  // Loop through input until the input is exhausted (we reach an end-of-line)
                  // In multi-line mode, we can just go straight to the end of the input.
-                int32_t ix = inputLen;
-                if (opValue == 0) {
+                int32_t ix;
+                if (opValue == 1) {
+                    // Multi-line mode.
+                    ix = inputLen;
+                } else {
                      // NOT multi-line mode.  Line endings do not match '.'
                      // Scan forward until a line ending or end of input.
                      ix = fp->fInputIdx;
                      for (;;) {
                          if (ix >= inputLen) {
+                            ix = inputLen;
                              break;
                          }
                          UChar32   c;
@@ -2103,13 +2275,13 @@ breakFromLoop:
          fMatchStart   = startIdx;
          fMatchEnd     = fp->fInputIdx;
          if (fTraceDebug) {
-            REGEX_RUN_DEBUG_PRINTF("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd);
+            REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
          }
      }
      else
      {
          if (fTraceDebug) {
-            REGEX_RUN_DEBUG_PRINTF("No match\n\n");
+            REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
          }
      }
  
@@ -2122,7 +2294,7 @@ breakFromLoop:
  
  
  
-const char RegexMatcher::fgClassID = 0;
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
  
  U_NAMESPACE_END