git.saurik.com Git - apple/icu.git/blobdiff - icuSources/i18n/rematch.cpp
ICU-64252.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / rematch.cpp
index 341d29f13a8dd9d15224d371e961ccc27971b4a3..968fc47da375ae9d6e59711a1ba9d7da8ffb3b98 100644 (file)
@@ -1,7 +1,9 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
 /*
 **************************************************************************
 /*
 **************************************************************************
-*   Copyright (C) 2002-2015 International Business Machines Corporation  *
-*   and others. All rights reserved.                                     *
+*   Copyright (C) 2002-2016 International Business Machines Corporation
+*   and others. All rights reserved.
 **************************************************************************
 */
 //
 **************************************************************************
 */
 //
@@ -23,6 +25,7 @@
 #include "unicode/utf16.h"
 #include "uassert.h"
 #include "cmemory.h"
 #include "unicode/utf16.h"
 #include "uassert.h"
 #include "cmemory.h"
+#include "cstr.h"
 #include "uvector.h"
 #include "uvectr32.h"
 #include "uvectr64.h"
 #include "uvector.h"
 #include "uvectr32.h"
 #include "uvectr64.h"
@@ -33,6 +36,7 @@
 
 // #include <malloc.h>        // Needed for heapcheck testing
 
 
 // #include <malloc.h>        // Needed for heapcheck testing
 
+
 U_NAMESPACE_BEGIN
 
 // Default limit for the size of the back track stack, to avoid system
 U_NAMESPACE_BEGIN
 
 // Default limit for the size of the back track stack, to avoid system
@@ -225,10 +229,6 @@ void RegexMatcher::init(UErrorCode &status) {
     fInput             = NULL;
     fInputLength       = 0;
     fInputUniStrMaybeMutable = FALSE;
     fInput             = NULL;
     fInputLength       = 0;
     fInputUniStrMaybeMutable = FALSE;
-
-    if (U_FAILURE(status)) {
-        fDeferredStatus = status;
-    }
 }
 
 //
 }
 
 //
@@ -241,7 +241,7 @@ void RegexMatcher::init2(UText *input, UErrorCode &status) {
         return;
     }
 
         return;
     }
 
-    if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
+    if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) {
         fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
         if (fData == NULL) {
             status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
         fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
         if (fData == NULL) {
             status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
@@ -438,7 +438,7 @@ RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
                         status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
                     }
                 }
                         status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
                     }
                 }
-                        
+
             } else if (u_isdigit(nextChar)) {
                 // $n    Scan for a capture group number
                 int32_t numCaptureGroups = fPattern->fGroupMap->size();
             } else if (u_isdigit(nextChar)) {
                 // $n    Scan for a capture group number
                 int32_t numCaptureGroups = fPattern->fGroupMap->size();
@@ -459,7 +459,7 @@ RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
                         break;
                     }
                     (void)UTEXT_NEXT32(replacement);
                         break;
                     }
                     (void)UTEXT_NEXT32(replacement);
-                    groupNum=groupNum*10 + nextDigitVal; 
+                    groupNum=groupNum*10 + nextDigitVal;
                     ++numDigits;
                 }
             } else {
                     ++numDigits;
                 }
             } else {
@@ -717,7 +717,7 @@ UBool RegexMatcher::find(UErrorCode &status) {
             if  (findProgressInterrupt(startPos, status))
                 return FALSE;
         }
             if  (findProgressInterrupt(startPos, status))
                 return FALSE;
         }
-        U_ASSERT(FALSE);
+        UPRV_UNREACHABLE;
 
     case START_START:
         // Matches are only possible at the start of the input string
 
     case START_START:
         // Matches are only possible at the start of the input string
@@ -765,7 +765,7 @@ UBool RegexMatcher::find(UErrorCode &status) {
                     return FALSE;
             }
         }
                     return FALSE;
             }
         }
-        U_ASSERT(FALSE);
+        UPRV_UNREACHABLE;
 
     case START_STRING:
     case START_CHAR:
 
     case START_STRING:
     case START_CHAR:
@@ -786,7 +786,7 @@ UBool RegexMatcher::find(UErrorCode &status) {
                     if (fMatch) {
                         return TRUE;
                     }
                     if (fMatch) {
                         return TRUE;
                     }
-                    UTEXT_SETNATIVEINDEX(fInputText, pos);
+                    UTEXT_SETNATIVEINDEX(fInputText, startPos);
                 }
                 if (startPos > testStartLimit) {
                     fMatch = FALSE;
                 }
                 if (startPos > testStartLimit) {
                     fMatch = FALSE;
@@ -797,11 +797,11 @@ UBool RegexMatcher::find(UErrorCode &status) {
                     return FALSE;
            }
         }
                     return FALSE;
            }
         }
-        U_ASSERT(FALSE);
+        UPRV_UNREACHABLE;
 
     case START_LINE:
         {
 
     case START_LINE:
         {
-            UChar32  c;
+            UChar32 ch;
             if (startPos == fAnchorStart) {
                 MatchAt(startPos, FALSE, status);
                 if (U_FAILURE(status)) {
             if (startPos == fAnchorStart) {
                 MatchAt(startPos, FALSE, status);
                 if (U_FAILURE(status)) {
@@ -811,17 +811,17 @@ UBool RegexMatcher::find(UErrorCode &status) {
                     return TRUE;
                 }
                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
                     return TRUE;
                 }
                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
-                c = UTEXT_NEXT32(fInputText);
+                ch = UTEXT_NEXT32(fInputText);
                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
             } else {
                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
             } else {
                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
-                c = UTEXT_PREVIOUS32(fInputText);
+                ch = UTEXT_PREVIOUS32(fInputText);
                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
             }
 
             if (fPattern->fFlags & UREGEX_UNIX_LINES) {
                 for (;;) {
                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
             }
 
             if (fPattern->fFlags & UREGEX_UNIX_LINES) {
                 for (;;) {
-                    if (c == 0x0a) {
+                    if (ch == 0x0a) {
                             MatchAt(startPos, FALSE, status);
                             if (U_FAILURE(status)) {
                                 return FALSE;
                             MatchAt(startPos, FALSE, status);
                             if (U_FAILURE(status)) {
                                 return FALSE;
@@ -836,7 +836,7 @@ UBool RegexMatcher::find(UErrorCode &status) {
                         fHitEnd = TRUE;
                         return FALSE;
                     }
                         fHitEnd = TRUE;
                         return FALSE;
                     }
-                    c = UTEXT_NEXT32(fInputText);
+                    ch = UTEXT_NEXT32(fInputText);
                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
                     // Note that it's perfectly OK for a pattern to have a zero-length
                     //   match at the end of a string, so we must make sure that the loop
                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
                     // Note that it's perfectly OK for a pattern to have a zero-length
                     //   match at the end of a string, so we must make sure that the loop
@@ -846,8 +846,8 @@ UBool RegexMatcher::find(UErrorCode &status) {
                 }
             } else {
                 for (;;) {
                 }
             } else {
                 for (;;) {
-                    if (isLineTerminator(c)) {
-                        if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
+                    if (isLineTerminator(ch)) {
+                        if (ch == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
                             (void)UTEXT_NEXT32(fInputText);
                             startPos = UTEXT_GETNATIVEINDEX(fInputText);
                         }
                             (void)UTEXT_NEXT32(fInputText);
                             startPos = UTEXT_GETNATIVEINDEX(fInputText);
                         }
@@ -865,7 +865,7 @@ UBool RegexMatcher::find(UErrorCode &status) {
                         fHitEnd = TRUE;
                         return FALSE;
                     }
                         fHitEnd = TRUE;
                         return FALSE;
                     }
-                    c = UTEXT_NEXT32(fInputText);
+                    ch = UTEXT_NEXT32(fInputText);
                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
                     // Note that it's perfectly OK for a pattern to have a zero-length
                     //   match at the end of a string, so we must make sure that the loop
                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
                     // Note that it's perfectly OK for a pattern to have a zero-length
                     //   match at the end of a string, so we must make sure that the loop
@@ -877,11 +877,10 @@ UBool RegexMatcher::find(UErrorCode &status) {
         }
 
     default:
         }
 
     default:
-        U_ASSERT(FALSE);
+        UPRV_UNREACHABLE;
     }
 
     }
 
-    U_ASSERT(FALSE);
-    return FALSE;
+    UPRV_UNREACHABLE;
 }
 
 
 }
 
 
@@ -992,7 +991,7 @@ UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
             if  (findProgressInterrupt(startPos, status))
                 return FALSE;
         }
             if  (findProgressInterrupt(startPos, status))
                 return FALSE;
         }
-        U_ASSERT(FALSE);
+        UPRV_UNREACHABLE;
 
     case START_START:
         // Matches are only possible at the start of the input string
 
     case START_START:
         // Matches are only possible at the start of the input string
@@ -1034,7 +1033,7 @@ UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
                 return FALSE;
         }
     }
                 return FALSE;
         }
     }
-        U_ASSERT(FALSE);
+    UPRV_UNREACHABLE;
 
     case START_STRING:
     case START_CHAR:
 
     case START_STRING:
     case START_CHAR:
@@ -1063,11 +1062,11 @@ UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
                 return FALSE;
         }
     }
                 return FALSE;
         }
     }
-    U_ASSERT(FALSE);
+    UPRV_UNREACHABLE;
 
     case START_LINE:
     {
 
     case START_LINE:
     {
-        UChar32  c;
+        UChar32 ch;
         if (startPos == fAnchorStart) {
             MatchChunkAt(startPos, FALSE, status);
             if (U_FAILURE(status)) {
         if (startPos == fAnchorStart) {
             MatchChunkAt(startPos, FALSE, status);
             if (U_FAILURE(status)) {
@@ -1076,13 +1075,22 @@ UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
             if (fMatch) {
                 return TRUE;
             }
             if (fMatch) {
                 return TRUE;
             }
+            // In bug 31063104, which has a zero-length text buffer, we get here with
+            // inputBuf=NULL and startPos=fActiveLimit=0 (and fMatch FALSE), which violates
+            // the requirement of U16_FWD_1 (utf16.h) that startPos < fActiveLimit. Having
+            // inputBuf=NULL (chunkContents NULL) is probably due to an error in the
+            // CFStringUText functions. Nevertheless, to be defensive, add the test below.
+            if (startPos >= testLen) {
+                fHitEnd = TRUE;
+                return FALSE;
+            }
             U16_FWD_1(inputBuf, startPos, fActiveLimit);
         }
 
         if (fPattern->fFlags & UREGEX_UNIX_LINES) {
             for (;;) {
             U16_FWD_1(inputBuf, startPos, fActiveLimit);
         }
 
         if (fPattern->fFlags & UREGEX_UNIX_LINES) {
             for (;;) {
-                c = inputBuf[startPos-1];
-                if (c == 0x0a) {
+                ch = inputBuf[startPos-1];
+                if (ch == 0x0a) {
                     MatchChunkAt(startPos, FALSE, status);
                     if (U_FAILURE(status)) {
                         return FALSE;
                     MatchChunkAt(startPos, FALSE, status);
                     if (U_FAILURE(status)) {
                         return FALSE;
@@ -1105,9 +1113,9 @@ UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
             }
         } else {
             for (;;) {
             }
         } else {
             for (;;) {
-                c = inputBuf[startPos-1];
-                if (isLineTerminator(c)) {
-                    if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
+                ch = inputBuf[startPos-1];
+                if (isLineTerminator(ch)) {
+                    if (ch == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
                         startPos++;
                     }
                     MatchChunkAt(startPos, FALSE, status);
                         startPos++;
                     }
                     MatchChunkAt(startPos, FALSE, status);
@@ -1134,11 +1142,10 @@ UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
     }
 
     default:
     }
 
     default:
-        U_ASSERT(FALSE);
+        UPRV_UNREACHABLE;
     }
 
     }
 
-    U_ASSERT(FALSE);
-    return FALSE;
+    UPRV_UNREACHABLE;
 }
 
 
 }
 
 
@@ -2187,7 +2194,7 @@ int32_t  RegexMatcher::split(UText *input,
                     break;
                 }
                 i++;
                     break;
                 }
                 i++;
-                dest[i] = utext_extract_replace(fInputText, dest[i], 
+                dest[i] = utext_extract_replace(fInputText, dest[i],
                                                start64(groupNum, status), end64(groupNum, status), &status);
             }
 
                                                start64(groupNum, status), end64(groupNum, status), &status);
             }
 
@@ -2200,7 +2207,7 @@ int32_t  RegexMatcher::split(UText *input,
                     if (dest[i] == NULL) {
                         dest[i] = utext_openUChars(NULL, NULL, 0, &status);
                     } else {
                     if (dest[i] == NULL) {
                         dest[i] = utext_openUChars(NULL, NULL, 0, &status);
                     } else {
-                        static UChar emptyString[] = {(UChar)0};
+                        static const UChar emptyString[] = {(UChar)0};
                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
                     }
                 }
                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
                     }
                 }
@@ -2500,6 +2507,10 @@ REStackFrame *RegexMatcher::resetStack() {
     fStack->removeAllElements();
 
     REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
     fStack->removeAllElements();
 
     REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
+    if(U_FAILURE(fDeferredStatus)) {
+        return NULL;
+    }
+
     int32_t i;
     for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
         iFrame->fExtra[i] = -1;
     int32_t i;
     for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
         iFrame->fExtra[i] = -1;
@@ -2687,9 +2698,12 @@ void RegexMatcher::IncrementTime(UErrorCode &status) {
 //
 //--------------------------------------------------------------------------------
 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
 //
 //--------------------------------------------------------------------------------
 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return fp;
+    }
     // push storage for a new frame.
     int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
     // push storage for a new frame.
     int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
-    if (newFP == NULL) {
+    if (U_FAILURE(status)) {
         // Failure on attempted stack expansion.
         //   Stack function set some other error code, change it to a more
         //   specific one for regular expressions.
         // Failure on attempted stack expansion.
         //   Stack function set some other error code, change it to a more
         //   specific one for regular expressions.
@@ -2720,6 +2734,18 @@ inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatId
     return (REStackFrame *)newFP;
 }
 
     return (REStackFrame *)newFP;
 }
 
+#if defined(REGEX_DEBUG)
+namespace {
+UnicodeString StringFromUText(UText *ut) {
+    UnicodeString result;
+    for (UChar32 c = utext_next32From(ut, 0); c != U_SENTINEL; c = UTEXT_NEXT32(ut)) {
+        result.append(c);
+    }
+    return result;
+}
+}
+#endif // REGEX_DEBUG
+
 
 //--------------------------------------------------------------------------------
 //
 
 //--------------------------------------------------------------------------------
 //
@@ -2739,32 +2765,10 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
     int32_t     opValue;               //    and the operand value.
 
 #ifdef REGEX_RUN_DEBUG
     int32_t     opValue;               //    and the operand value.
 
 #ifdef REGEX_RUN_DEBUG
-    if (fTraceDebug)
-    {
+    if (fTraceDebug) {
         printf("MatchAt(startIdx=%ld)\n", startIdx);
         printf("MatchAt(startIdx=%ld)\n", startIdx);
-        printf("Original Pattern: ");
-        UChar32 c = utext_next32From(fPattern->fPattern, 0);
-        while (c != U_SENTINEL) {
-            if (c<32 || c>256) {
-                c = '.';
-            }
-            printf("%c", c);
-
-            c = UTEXT_NEXT32(fPattern->fPattern);
-        }
-        printf("\n");
-        printf("Input String: ");
-        c = utext_next32From(fInputText, 0);
-        while (c != U_SENTINEL) {
-            if (c<32 || c>256) {
-                c = '.';
-            }
-            printf("%c", c);
-
-            c = UTEXT_NEXT32(fInputText);
-        }
-        printf("\n");
-        printf("\n");
+        printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
+        printf("Input String:     \"%s\"\n\n", CStr(StringFromUText(fInputText))());
     }
 #endif
 
     }
 #endif
 
@@ -2777,10 +2781,14 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
 
     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
 
     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
-    UVector             *sets          = fPattern->fSets;
+    UVector             *fSets         = fPattern->fSets;
 
     fFrameSize = fPattern->fFrameSize;
     REStackFrame        *fp            = resetStack();
 
     fFrameSize = fPattern->fFrameSize;
     REStackFrame        *fp            = resetStack();
+    if (U_FAILURE(fDeferredStatus)) {
+        status = fDeferredStatus;
+        return;
+    }
 
     fp->fPatIdx   = 0;
     fp->fInputIdx = startIdx;
 
     fp->fPatIdx   = 0;
     fp->fInputIdx = startIdx;
@@ -3375,7 +3383,7 @@ GC_Done:
 
                 // There is input left.  Pick up one char and test it for set membership.
                 UChar32 c = UTEXT_NEXT32(fInputText);
 
                 // There is input left.  Pick up one char and test it for set membership.
                 UChar32 c = UTEXT_NEXT32(fInputText);
-                U_ASSERT(opValue > 0 && opValue < sets->size());
+                U_ASSERT(opValue > 0 && opValue < fSets->size());
                 if (c<256) {
                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
                     if (s8->contains(c)) {
                 if (c<256) {
                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
                     if (s8->contains(c)) {
@@ -3383,7 +3391,7 @@ GC_Done:
                         break;
                     }
                 } else {
                         break;
                     }
                 } else {
-                    UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
+                    UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
                     if (s->contains(c)) {
                         // The character is in the set.  A Match.
                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
                     if (s->contains(c)) {
                         // The character is in the set.  A Match.
                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
@@ -3565,7 +3573,14 @@ GC_Done:
                         }
                     }
                     fp = StateSave(fp, fp->fPatIdx, status);
                         }
                     }
                     fp = StateSave(fp, fp->fPatIdx, status);
+                } else {
+                    // Increment time-out counter. (StateSave() does it if count >= minCount)
+                    fTickCounter--;
+                    if (fTickCounter <= 0) {
+                        IncrementTime(status);    // Re-initializes fTickCounter
+                    }
                 }
                 }
+
                 fp->fPatIdx = opValue + 4;    // Loop back.
             }
             break;
                 fp->fPatIdx = opValue + 4;    // Loop back.
             }
             break;
@@ -3622,6 +3637,11 @@ GC_Done:
                     // We haven't met the minimum number of matches yet.
                     //   Loop back for another one.
                     fp->fPatIdx = opValue + 4;    // Loop back.
                     // We haven't met the minimum number of matches yet.
                     //   Loop back for another one.
                     fp->fPatIdx = opValue + 4;    // Loop back.
+                    // Increment time-out counter. (StateSave() does it if count >= minCount)
+                    fTickCounter--;
+                    if (fTickCounter <= 0) {
+                        IncrementTime(status);    // Re-initializes fTickCounter
+                    }
                 } else {
                     // We do have the minimum number of matches.
 
                 } else {
                     // We do have the minimum number of matches.
 
@@ -3658,9 +3678,9 @@ GC_Done:
                 if (newFP == (int64_t *)fp) {
                     break;
                 }
                 if (newFP == (int64_t *)fp) {
                     break;
                 }
-                int32_t i;
-                for (i=0; i<fFrameSize; i++) {
-                    newFP[i] = ((int64_t *)fp)[i];
+                int32_t j;
+                for (j=0; j<fFrameSize; j++) {
+                    newFP[j] = ((int64_t *)fp)[j];
                 }
                 fp = (REStackFrame *)newFP;
                 fStack->setSize(newStackSize);
                 }
                 fp = (REStackFrame *)newFP;
                 fStack->setSize(newStackSize);
@@ -3817,9 +3837,9 @@ GC_Done:
                     //   This makes the capture groups from within the look-ahead
                     //   expression available.
                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
                     //   This makes the capture groups from within the look-ahead
                     //   expression available.
                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
-                    int32_t i;
-                    for (i=0; i<fFrameSize; i++) {
-                        newFP[i] = ((int64_t *)fp)[i];
+                    int32_t j;
+                    for (j=0; j<fFrameSize; j++) {
+                        newFP[j] = ((int64_t *)fp)[j];
                     }
                     fp = (REStackFrame *)newFP;
                     fStack->setSize(newStackSize);
                     }
                     fp = (REStackFrame *)newFP;
                     fStack->setSize(newStackSize);
@@ -3929,28 +3949,38 @@ GC_Done:
                 //   of this op in the pattern.
                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
                 //   of this op in the pattern.
                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
+                if (!UTEXT_USES_U16(fInputText)) {
+                    // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
+                    // The max length need not be exact; it just needs to be >= actual maximum.
+                    maxML *= 3;
+                }
                 U_ASSERT(minML <= maxML);
                 U_ASSERT(minML >= 0);
 
                 // Fetch (from data) the last input index where a match was attempted.
                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
                 U_ASSERT(minML <= maxML);
                 U_ASSERT(minML >= 0);
 
                 // Fetch (from data) the last input index where a match was attempted.
                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
-                int64_t  *lbStartIdx = &fData[opValue+2];
-                if (*lbStartIdx < 0) {
+                int64_t  &lbStartIdx = fData[opValue+2];
+                if (lbStartIdx < 0) {
                     // First time through loop.
                     // First time through loop.
-                    *lbStartIdx = fp->fInputIdx - minML;
+                    lbStartIdx = fp->fInputIdx - minML;
+                    if (lbStartIdx > 0) {
+                        // move index to a code point boundary, if it's not on one already.
+                        UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
+                        lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
+                    }
                 } else {
                     // 2nd through nth time through the loop.
                     // Back up start position for match by one.
                 } else {
                     // 2nd through nth time through the loop.
                     // Back up start position for match by one.
-                    if (*lbStartIdx == 0) {
-                        (*lbStartIdx)--;
+                    if (lbStartIdx == 0) {
+                        (lbStartIdx)--;
                     } else {
                     } else {
-                        UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
+                        UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
                         (void)UTEXT_PREVIOUS32(fInputText);
                         (void)UTEXT_PREVIOUS32(fInputText);
-                        *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
+                        lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
                     }
                 }
 
                     }
                 }
 
-                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
+                if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
                     // We have tried all potential match starting points without
                     //  getting a match.  Backtrack out, and out of the
                     //   Look Behind altogether.
                     // We have tried all potential match starting points without
                     //  getting a match.  Backtrack out, and out of the
                     //   Look Behind altogether.
@@ -3965,7 +3995,7 @@ GC_Done:
                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
                 //      (successful match will fall off the end of the loop.)
                 fp = StateSave(fp, fp->fPatIdx-3, status);
                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
                 //      (successful match will fall off the end of the loop.)
                 fp = StateSave(fp, fp->fPatIdx-3, status);
-                fp->fInputIdx = *lbStartIdx;
+                fp->fInputIdx = lbStartIdx;
             }
             break;
 
             }
             break;
 
@@ -4002,6 +4032,11 @@ GC_Done:
                 // Fetch the extra parameters of this op.
                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
                 // Fetch the extra parameters of this op.
                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
+                if (!UTEXT_USES_U16(fInputText)) {
+                    // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
+                    // The max length need not be exact; it just needs to be >= actual maximum.
+                    maxML *= 3;
+                }
                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
                         continueLoc = URX_VAL(continueLoc);
                 U_ASSERT(minML <= maxML);
                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
                         continueLoc = URX_VAL(continueLoc);
                 U_ASSERT(minML <= maxML);
@@ -4010,23 +4045,28 @@ GC_Done:
 
                 // Fetch (from data) the last input index where a match was attempted.
                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
 
                 // Fetch (from data) the last input index where a match was attempted.
                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
-                int64_t  *lbStartIdx = &fData[opValue+2];
-                if (*lbStartIdx < 0) {
+                int64_t  &lbStartIdx = fData[opValue+2];
+                if (lbStartIdx < 0) {
                     // First time through loop.
                     // First time through loop.
-                    *lbStartIdx = fp->fInputIdx - minML;
+                    lbStartIdx = fp->fInputIdx - minML;
+                    if (lbStartIdx > 0) {
+                        // move index to a code point boundary, if it's not on one already.
+                        UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
+                        lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
+                    }
                 } else {
                     // 2nd through nth time through the loop.
                     // Back up start position for match by one.
                 } else {
                     // 2nd through nth time through the loop.
                     // Back up start position for match by one.
-                    if (*lbStartIdx == 0) {
-                        (*lbStartIdx)--;
+                    if (lbStartIdx == 0) {
+                        (lbStartIdx)--;
                     } else {
                     } else {
-                        UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
+                        UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
                         (void)UTEXT_PREVIOUS32(fInputText);
                         (void)UTEXT_PREVIOUS32(fInputText);
-                        *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
+                        lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
                     }
                 }
 
                     }
                 }
 
-                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
+                if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
                     // We have tried all potential match starting points without
                     //  getting a match, which means that the negative lookbehind as
                     //  a whole has succeeded.  Jump forward to the continue location
                     // We have tried all potential match starting points without
                     //  getting a match, which means that the negative lookbehind as
                     //  a whole has succeeded.  Jump forward to the continue location
@@ -4041,7 +4081,7 @@ GC_Done:
                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
                 //      (successful match will cause a FAIL out of the loop altogether.)
                 fp = StateSave(fp, fp->fPatIdx-4, status);
                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
                 //      (successful match will cause a FAIL out of the loop altogether.)
                 fp = StateSave(fp, fp->fPatIdx-4, status);
-                fp->fInputIdx = *lbStartIdx;
+                fp->fInputIdx = lbStartIdx;
             }
             break;
 
             }
             break;
 
@@ -4090,9 +4130,9 @@ GC_Done:
             //   This op scans through all matching input.
             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
             {
             //   This op scans through all matching input.
             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
             {
-                U_ASSERT(opValue > 0 && opValue < sets->size());
+                U_ASSERT(opValue > 0 && opValue < fSets->size());
                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
-                UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
+                UnicodeSet   *s  = (UnicodeSet *)fSets->elementAt(opValue);
 
                 // Loop through input, until either the input is exhausted or
                 //   we reach a character that is not a member of the set.
 
                 // Loop through input, until either the input is exhausted or
                 //   we reach a character that is not a member of the set.
@@ -4245,7 +4285,7 @@ GC_Done:
         default:
             // Trouble.  The compiled pattern contains an entry with an
             //           unrecognized type tag.
         default:
             // Trouble.  The compiled pattern contains an entry with an
             //           unrecognized type tag.
-            U_ASSERT(FALSE);
+            UPRV_UNREACHABLE;
         }
 
         if (U_FAILURE(status)) {
         }
 
         if (U_FAILURE(status)) {
@@ -4303,29 +4343,8 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
 #ifdef REGEX_RUN_DEBUG
     if (fTraceDebug) {
         printf("MatchAt(startIdx=%d)\n", startIdx);
 #ifdef REGEX_RUN_DEBUG
     if (fTraceDebug) {
         printf("MatchAt(startIdx=%d)\n", startIdx);
-        printf("Original Pattern: ");
-        UChar32 c = utext_next32From(fPattern->fPattern, 0);
-        while (c != U_SENTINEL) {
-            if (c<32 || c>256) {
-                c = '.';
-            }
-            printf("%c", c);
-
-            c = UTEXT_NEXT32(fPattern->fPattern);
-        }
-        printf("\n");
-        printf("Input String: ");
-        c = utext_next32From(fInputText, 0);
-        while (c != U_SENTINEL) {
-            if (c<32 || c>256) {
-                c = '.';
-            }
-            printf("%c", c);
-
-            c = UTEXT_NEXT32(fInputText);
-        }
-        printf("\n");
-        printf("\n");
+        printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
+        printf("Input String:     \"%s\"\n\n", CStr(StringFromUText(fInputText))());
     }
 #endif
 
     }
 #endif
 
@@ -4338,12 +4357,16 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
 
     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
 
     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
-    UVector             *sets          = fPattern->fSets;
+    UVector             *fSets         = fPattern->fSets;
 
     const UChar         *inputBuf      = fInputText->chunkContents;
 
     fFrameSize = fPattern->fFrameSize;
     REStackFrame        *fp            = resetStack();
 
     const UChar         *inputBuf      = fInputText->chunkContents;
 
     fFrameSize = fPattern->fFrameSize;
     REStackFrame        *fp            = resetStack();
+    if (U_FAILURE(fDeferredStatus)) {
+        status = fDeferredStatus;
+        return;
+    }
 
     fp->fPatIdx   = 0;
     fp->fInputIdx = startIdx;
 
     fp->fPatIdx   = 0;
     fp->fInputIdx = startIdx;
@@ -4912,7 +4935,7 @@ GC_Done:
                     break;
                 }
 
                     break;
                 }
 
-                U_ASSERT(opValue > 0 && opValue < sets->size());
+                U_ASSERT(opValue > 0 && opValue < fSets->size());
 
                 // There is input left.  Pick up one char and test it for set membership.
                 UChar32  c;
 
                 // There is input left.  Pick up one char and test it for set membership.
                 UChar32  c;
@@ -4924,7 +4947,7 @@ GC_Done:
                         break;
                     }
                 } else {
                         break;
                     }
                 } else {
-                    UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
+                    UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
                     if (s->contains(c)) {
                         // The character is in the set.  A Match.
                         break;
                     if (s->contains(c)) {
                         // The character is in the set.  A Match.
                         break;
@@ -5095,6 +5118,12 @@ GC_Done:
                         }
                     }
                     fp = StateSave(fp, fp->fPatIdx, status);
                         }
                     }
                     fp = StateSave(fp, fp->fPatIdx, status);
+                } else {
+                    // Increment time-out counter. (StateSave() does it if count >= minCount)
+                    fTickCounter--;
+                    if (fTickCounter <= 0) {
+                        IncrementTime(status);    // Re-initializes fTickCounter
+                    }
                 }
                 fp->fPatIdx = opValue + 4;    // Loop back.
             }
                 }
                 fp->fPatIdx = opValue + 4;    // Loop back.
             }
@@ -5152,6 +5181,10 @@ GC_Done:
                     // We haven't met the minimum number of matches yet.
                     //   Loop back for another one.
                     fp->fPatIdx = opValue + 4;    // Loop back.
                     // We haven't met the minimum number of matches yet.
                     //   Loop back for another one.
                     fp->fPatIdx = opValue + 4;    // Loop back.
+                    fTickCounter--;
+                    if (fTickCounter <= 0) {
+                        IncrementTime(status);    // Re-initializes fTickCounter
+                    }
                 } else {
                     // We do have the minimum number of matches.
 
                 } else {
                     // We do have the minimum number of matches.
 
@@ -5188,9 +5221,9 @@ GC_Done:
                 if (newFP == (int64_t *)fp) {
                     break;
                 }
                 if (newFP == (int64_t *)fp) {
                     break;
                 }
-                int32_t i;
-                for (i=0; i<fFrameSize; i++) {
-                    newFP[i] = ((int64_t *)fp)[i];
+                int32_t j;
+                for (j=0; j<fFrameSize; j++) {
+                    newFP[j] = ((int64_t *)fp)[j];
                 }
                 fp = (REStackFrame *)newFP;
                 fStack->setSize(newStackSize);
                 }
                 fp = (REStackFrame *)newFP;
                 fStack->setSize(newStackSize);
@@ -5221,6 +5254,12 @@ GC_Done:
                         break;
                     }
                 }
                         break;
                     }
                 }
+                if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) &&
+                        inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) {
+                    // Capture group ended with an unpaired lead surrogate.
+                    // Back reference is not permitted to match lead only of a surrogate pair.
+                    success = FALSE;
+                }
                 if (success) {
                     fp->fInputIdx = inputIndex;
                 } else {
                 if (success) {
                     fp->fInputIdx = inputIndex;
                 } else {
@@ -5329,9 +5368,9 @@ GC_Done:
                     //   This makes the capture groups from within the look-ahead
                     //   expression available.
                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
                     //   This makes the capture groups from within the look-ahead
                     //   expression available.
                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
-                    int32_t i;
-                    for (i=0; i<fFrameSize; i++) {
-                        newFP[i] = ((int64_t *)fp)[i];
+                    int32_t j;
+                    for (j=0; j<fFrameSize; j++) {
+                        newFP[j] = ((int64_t *)fp)[j];
                     }
                     fp = (REStackFrame *)newFP;
                     fStack->setSize(newStackSize);
                     }
                     fp = (REStackFrame *)newFP;
                     fStack->setSize(newStackSize);
@@ -5433,21 +5472,24 @@ GC_Done:
 
                 // Fetch (from data) the last input index where a match was attempted.
                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
 
                 // Fetch (from data) the last input index where a match was attempted.
                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
-                int64_t  *lbStartIdx = &fData[opValue+2];
-                if (*lbStartIdx < 0) {
+                int64_t  &lbStartIdx = fData[opValue+2];
+                if (lbStartIdx < 0) {
                     // First time through loop.
                     // First time through loop.
-                    *lbStartIdx = fp->fInputIdx - minML;
+                    lbStartIdx = fp->fInputIdx - minML;
+                    if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
+                        U16_SET_CP_START(inputBuf, 0, lbStartIdx);
+                    }
                 } else {
                     // 2nd through nth time through the loop.
                     // Back up start position for match by one.
                 } else {
                     // 2nd through nth time through the loop.
                     // Back up start position for match by one.
-                    if (*lbStartIdx == 0) {
-                        (*lbStartIdx)--;
+                    if (lbStartIdx == 0) {
+                        lbStartIdx--;
                     } else {
                     } else {
-                        U16_BACK_1(inputBuf, 0, *lbStartIdx);
+                        U16_BACK_1(inputBuf, 0, lbStartIdx);
                     }
                 }
 
                     }
                 }
 
-                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
+                if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
                     // We have tried all potential match starting points without
                     //  getting a match.  Backtrack out, and out of the
                     //   Look Behind altogether.
                     // We have tried all potential match starting points without
                     //  getting a match.  Backtrack out, and out of the
                     //   Look Behind altogether.
@@ -5462,7 +5504,7 @@ GC_Done:
                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
                 //      (successful match will fall off the end of the loop.)
                 fp = StateSave(fp, fp->fPatIdx-3, status);
                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
                 //      (successful match will fall off the end of the loop.)
                 fp = StateSave(fp, fp->fPatIdx-3, status);
-                fp->fInputIdx =  *lbStartIdx;
+                fp->fInputIdx =  lbStartIdx;
             }
             break;
 
             }
             break;
 
@@ -5507,21 +5549,24 @@ GC_Done:
 
                 // Fetch (from data) the last input index where a match was attempted.
                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
 
                 // Fetch (from data) the last input index where a match was attempted.
                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
-                int64_t  *lbStartIdx = &fData[opValue+2];
-                if (*lbStartIdx < 0) {
+                int64_t  &lbStartIdx = fData[opValue+2];
+                if (lbStartIdx < 0) {
                     // First time through loop.
                     // First time through loop.
-                    *lbStartIdx = fp->fInputIdx - minML;
+                    lbStartIdx = fp->fInputIdx - minML;
+                    if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
+                        U16_SET_CP_START(inputBuf, 0, lbStartIdx);
+                    }
                 } else {
                     // 2nd through nth time through the loop.
                     // Back up start position for match by one.
                 } else {
                     // 2nd through nth time through the loop.
                     // Back up start position for match by one.
-                    if (*lbStartIdx == 0) {
-                        (*lbStartIdx)--;   // Because U16_BACK is unsafe starting at 0.
+                    if (lbStartIdx == 0) {
+                        lbStartIdx--;   // Because U16_BACK is unsafe starting at 0.
                     } else {
                     } else {
-                        U16_BACK_1(inputBuf, 0, *lbStartIdx);
+                        U16_BACK_1(inputBuf, 0, lbStartIdx);
                     }
                 }
 
                     }
                 }
 
-                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
+                if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
                     // We have tried all potential match starting points without
                     //  getting a match, which means that the negative lookbehind as
                     //  a whole has succeeded.  Jump forward to the continue location
                     // We have tried all potential match starting points without
                     //  getting a match, which means that the negative lookbehind as
                     //  a whole has succeeded.  Jump forward to the continue location
@@ -5536,7 +5581,7 @@ GC_Done:
                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
                 //      (successful match will cause a FAIL out of the loop altogether.)
                 fp = StateSave(fp, fp->fPatIdx-4, status);
                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
                 //      (successful match will cause a FAIL out of the loop altogether.)
                 fp = StateSave(fp, fp->fPatIdx-4, status);
-                fp->fInputIdx =  *lbStartIdx;
+                fp->fInputIdx =  lbStartIdx;
             }
             break;
 
             }
             break;
 
@@ -5585,9 +5630,9 @@ GC_Done:
             //   This op scans through all matching input.
             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
             {
             //   This op scans through all matching input.
             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
             {
-                U_ASSERT(opValue > 0 && opValue < sets->size());
+                U_ASSERT(opValue > 0 && opValue < fSets->size());
                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
-                UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
+                UnicodeSet   *s  = (UnicodeSet *)fSets->elementAt(opValue);
 
                 // Loop through input, until either the input is exhausted or
                 //   we reach a character that is not a member of the set.
 
                 // Loop through input, until either the input is exhausted or
                 //   we reach a character that is not a member of the set.
@@ -5740,7 +5785,7 @@ GC_Done:
         default:
             // Trouble.  The compiled pattern contains an entry with an
             //           unrecognized type tag.
         default:
             // Trouble.  The compiled pattern contains an entry with an
             //           unrecognized type tag.
-            U_ASSERT(FALSE);
+            UPRV_UNREACHABLE;
         }
 
         if (U_FAILURE(status)) {
         }
 
         if (U_FAILURE(status)) {
@@ -5780,3 +5825,4 @@ UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
 U_NAMESPACE_END
 
 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
 U_NAMESPACE_END
 
 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
+