]> git.saurik.com Git - apple/icu.git/blobdiff - icuSources/i18n/rematch.cpp
ICU-66108.tar.gz
[apple/icu.git] / icuSources / i18n / rematch.cpp
index a570b74ff332993db76a78ea70c2039d99117336..780200a66d021af840bb18da359fd0ce5afd01d3 100644 (file)
@@ -429,7 +429,7 @@ RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
                                (nextChar >= 0x31 && nextChar <= 0x39)) {       // 0..9
                         groupName.append(nextChar);
                     } else if (nextChar == RIGHTBRACKET) {
-                        groupNum = uhash_geti(fPattern->fNamedCaptureMap, &groupName);
+                        groupNum = fPattern->fNamedCaptureMap ? uhash_geti(fPattern->fNamedCaptureMap, &groupName) : 0;
                         if (groupNum == 0) {
                             status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
                         }
@@ -438,7 +438,7 @@ RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
                         status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
                     }
                 }
-                        
+
             } else if (u_isdigit(nextChar)) {
                 // $n    Scan for a capture group number
                 int32_t numCaptureGroups = fPattern->fGroupMap->size();
@@ -459,7 +459,7 @@ RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
                         break;
                     }
                     (void)UTEXT_NEXT32(replacement);
-                    groupNum=groupNum*10 + nextDigitVal; 
+                    groupNum=groupNum*10 + nextDigitVal;
                     ++numDigits;
                 }
             } else {
@@ -717,7 +717,7 @@ UBool RegexMatcher::find(UErrorCode &status) {
             if  (findProgressInterrupt(startPos, status))
                 return FALSE;
         }
-        U_ASSERT(FALSE);
+        UPRV_UNREACHABLE;
 
     case START_START:
         // Matches are only possible at the start of the input string
@@ -765,7 +765,7 @@ UBool RegexMatcher::find(UErrorCode &status) {
                     return FALSE;
             }
         }
-        U_ASSERT(FALSE);
+        UPRV_UNREACHABLE;
 
     case START_STRING:
     case START_CHAR:
@@ -797,11 +797,11 @@ UBool RegexMatcher::find(UErrorCode &status) {
                     return FALSE;
            }
         }
-        U_ASSERT(FALSE);
+        UPRV_UNREACHABLE;
 
     case START_LINE:
         {
-            UChar32  c;
+            UChar32 ch;
             if (startPos == fAnchorStart) {
                 MatchAt(startPos, FALSE, status);
                 if (U_FAILURE(status)) {
@@ -811,17 +811,17 @@ UBool RegexMatcher::find(UErrorCode &status) {
                     return TRUE;
                 }
                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
-                c = UTEXT_NEXT32(fInputText);
+                ch = UTEXT_NEXT32(fInputText);
                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
             } else {
                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
-                c = UTEXT_PREVIOUS32(fInputText);
+                ch = UTEXT_PREVIOUS32(fInputText);
                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
             }
 
             if (fPattern->fFlags & UREGEX_UNIX_LINES) {
                 for (;;) {
-                    if (c == 0x0a) {
+                    if (ch == 0x0a) {
                             MatchAt(startPos, FALSE, status);
                             if (U_FAILURE(status)) {
                                 return FALSE;
@@ -836,7 +836,7 @@ UBool RegexMatcher::find(UErrorCode &status) {
                         fHitEnd = TRUE;
                         return FALSE;
                     }
-                    c = UTEXT_NEXT32(fInputText);
+                    ch = UTEXT_NEXT32(fInputText);
                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
                     // Note that it's perfectly OK for a pattern to have a zero-length
                     //   match at the end of a string, so we must make sure that the loop
@@ -846,8 +846,8 @@ UBool RegexMatcher::find(UErrorCode &status) {
                 }
             } else {
                 for (;;) {
-                    if (isLineTerminator(c)) {
-                        if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
+                    if (isLineTerminator(ch)) {
+                        if (ch == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
                             (void)UTEXT_NEXT32(fInputText);
                             startPos = UTEXT_GETNATIVEINDEX(fInputText);
                         }
@@ -865,7 +865,7 @@ UBool RegexMatcher::find(UErrorCode &status) {
                         fHitEnd = TRUE;
                         return FALSE;
                     }
-                    c = UTEXT_NEXT32(fInputText);
+                    ch = UTEXT_NEXT32(fInputText);
                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
                     // Note that it's perfectly OK for a pattern to have a zero-length
                     //   match at the end of a string, so we must make sure that the loop
@@ -877,11 +877,10 @@ UBool RegexMatcher::find(UErrorCode &status) {
         }
 
     default:
-        U_ASSERT(FALSE);
+        UPRV_UNREACHABLE;
     }
 
-    U_ASSERT(FALSE);
-    return FALSE;
+    UPRV_UNREACHABLE;
 }
 
 
@@ -992,7 +991,7 @@ UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
             if  (findProgressInterrupt(startPos, status))
                 return FALSE;
         }
-        U_ASSERT(FALSE);
+        UPRV_UNREACHABLE;
 
     case START_START:
         // Matches are only possible at the start of the input string
@@ -1034,7 +1033,7 @@ UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
                 return FALSE;
         }
     }
-        U_ASSERT(FALSE);
+    UPRV_UNREACHABLE;
 
     case START_STRING:
     case START_CHAR:
@@ -1063,11 +1062,11 @@ UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
                 return FALSE;
         }
     }
-    U_ASSERT(FALSE);
+    UPRV_UNREACHABLE;
 
     case START_LINE:
     {
-        UChar32  c;
+        UChar32 ch;
         if (startPos == fAnchorStart) {
             MatchChunkAt(startPos, FALSE, status);
             if (U_FAILURE(status)) {
@@ -1090,8 +1089,8 @@ UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
 
         if (fPattern->fFlags & UREGEX_UNIX_LINES) {
             for (;;) {
-                c = inputBuf[startPos-1];
-                if (c == 0x0a) {
+                ch = inputBuf[startPos-1];
+                if (ch == 0x0a) {
                     MatchChunkAt(startPos, FALSE, status);
                     if (U_FAILURE(status)) {
                         return FALSE;
@@ -1114,9 +1113,9 @@ UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
             }
         } else {
             for (;;) {
-                c = inputBuf[startPos-1];
-                if (isLineTerminator(c)) {
-                    if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
+                ch = inputBuf[startPos-1];
+                if (isLineTerminator(ch)) {
+                    if (ch == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
                         startPos++;
                     }
                     MatchChunkAt(startPos, FALSE, status);
@@ -1143,11 +1142,10 @@ UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
     }
 
     default:
-        U_ASSERT(FALSE);
+        UPRV_UNREACHABLE;
     }
 
-    U_ASSERT(FALSE);
-    return FALSE;
+    UPRV_UNREACHABLE;
 }
 
 
@@ -2080,7 +2078,7 @@ int32_t  RegexMatcher::split(UText *input,
     //
     if (U_FAILURE(status)) {
         return 0;
-    };
+    }
 
     if (destCapacity < 1) {
         status = U_ILLEGAL_ARGUMENT_ERROR;
@@ -2196,7 +2194,7 @@ int32_t  RegexMatcher::split(UText *input,
                     break;
                 }
                 i++;
-                dest[i] = utext_extract_replace(fInputText, dest[i], 
+                dest[i] = utext_extract_replace(fInputText, dest[i],
                                                start64(groupNum, status), end64(groupNum, status), &status);
             }
 
@@ -2209,7 +2207,7 @@ int32_t  RegexMatcher::split(UText *input,
                     if (dest[i] == NULL) {
                         dest[i] = utext_openUChars(NULL, NULL, 0, &status);
                     } else {
-                        static UChar emptyString[] = {(UChar)0};
+                        static const UChar emptyString[] = {(UChar)0};
                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
                     }
                 }
@@ -2783,7 +2781,7 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
 
     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
-    UVector             *sets          = fPattern->fSets;
+    UVector             *fSets         = fPattern->fSets;
 
     fFrameSize = fPattern->fFrameSize;
     REStackFrame        *fp            = resetStack();
@@ -3385,7 +3383,7 @@ GC_Done:
 
                 // There is input left.  Pick up one char and test it for set membership.
                 UChar32 c = UTEXT_NEXT32(fInputText);
-                U_ASSERT(opValue > 0 && opValue < sets->size());
+                U_ASSERT(opValue > 0 && opValue < fSets->size());
                 if (c<256) {
                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
                     if (s8->contains(c)) {
@@ -3393,7 +3391,7 @@ GC_Done:
                         break;
                     }
                 } else {
-                    UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
+                    UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
                     if (s->contains(c)) {
                         // The character is in the set.  A Match.
                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
@@ -3680,9 +3678,9 @@ GC_Done:
                 if (newFP == (int64_t *)fp) {
                     break;
                 }
-                int32_t i;
-                for (i=0; i<fFrameSize; i++) {
-                    newFP[i] = ((int64_t *)fp)[i];
+                int32_t j;
+                for (j=0; j<fFrameSize; j++) {
+                    newFP[j] = ((int64_t *)fp)[j];
                 }
                 fp = (REStackFrame *)newFP;
                 fStack->setSize(newStackSize);
@@ -3816,11 +3814,13 @@ GC_Done:
 
         case URX_LA_START:
             {
-                // Entering a lookahead block.
+                // Entering a look around block.
                 // Save Stack Ptr, Input Pos.
-                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
+                U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
                 fData[opValue]   = fStack->size();
                 fData[opValue+1] = fp->fInputIdx;
+                fData[opValue+2] = fActiveStart;
+                fData[opValue+3] = fActiveLimit;
                 fActiveStart     = fLookStart;          // Set the match region change for
                 fActiveLimit     = fLookLimit;          //   transparent bounds.
             }
@@ -3830,7 +3830,7 @@ GC_Done:
             {
                 // Leaving a look-ahead block.
                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
-                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
+                U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
                 int32_t stackSize = fStack->size();
                 int32_t newStackSize =(int32_t)fData[opValue];
                 U_ASSERT(stackSize >= newStackSize);
@@ -3839,9 +3839,9 @@ GC_Done:
                     //   This makes the capture groups from within the look-ahead
                     //   expression available.
                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
-                    int32_t i;
-                    for (i=0; i<fFrameSize; i++) {
-                        newFP[i] = ((int64_t *)fp)[i];
+                    int32_t j;
+                    for (j=0; j<fFrameSize; j++) {
+                        newFP[j] = ((int64_t *)fp)[j];
                     }
                     fp = (REStackFrame *)newFP;
                     fStack->setSize(newStackSize);
@@ -3850,8 +3850,10 @@ GC_Done:
 
                 // Restore the active region bounds in the input string; they may have
                 //    been changed because of transparent bounds on a Region.
-                fActiveStart = fRegionStart;
-                fActiveLimit = fRegionLimit;
+                fActiveStart = fData[opValue+2];
+                fActiveLimit = fData[opValue+3];
+                U_ASSERT(fActiveStart >= 0);
+                U_ASSERT(fActiveLimit <= fInputLength);
             }
             break;
 
@@ -3927,17 +3929,19 @@ GC_Done:
         case URX_LB_START:
             {
                 // Entering a look-behind block.
-                // Save Stack Ptr, Input Pos.
+                // Save Stack Ptr, Input Pos and active input region.
                 //   TODO:  implement transparent bounds.  Ticket #6067
-                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
+                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
                 fData[opValue]   = fStack->size();
                 fData[opValue+1] = fp->fInputIdx;
-                // Init the variable containing the start index for attempted matches.
-                fData[opValue+2] = -1;
                 // Save input string length, then reset to pin any matches to end at
                 //   the current position.
+                fData[opValue+2] = fActiveStart;
                 fData[opValue+3] = fActiveLimit;
+                fActiveStart     = fRegionStart;
                 fActiveLimit     = fp->fInputIdx;
+                // Init the variable containing the start index for attempted matches.
+                fData[opValue+4] = -1;
             }
             break;
 
@@ -3960,8 +3964,8 @@ GC_Done:
                 U_ASSERT(minML >= 0);
 
                 // Fetch (from data) the last input index where a match was attempted.
-                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
-                int64_t  &lbStartIdx = fData[opValue+2];
+                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
+                int64_t  &lbStartIdx = fData[opValue+4];
                 if (lbStartIdx < 0) {
                     // First time through loop.
                     lbStartIdx = fp->fInputIdx - minML;
@@ -3987,10 +3991,10 @@ GC_Done:
                     //  getting a match.  Backtrack out, and out of the
                     //   Look Behind altogether.
                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
-                    int64_t restoreInputLen = fData[opValue+3];
-                    U_ASSERT(restoreInputLen >= fActiveLimit);
-                    U_ASSERT(restoreInputLen <= fInputLength);
-                    fActiveLimit = restoreInputLen;
+                    fActiveStart = fData[opValue+2];
+                    fActiveLimit = fData[opValue+3];
+                    U_ASSERT(fActiveStart >= 0);
+                    U_ASSERT(fActiveLimit <= fInputLength);
                     break;
                 }
 
@@ -4004,7 +4008,7 @@ GC_Done:
         case URX_LB_END:
             // End of a look-behind block, after a successful match.
             {
-                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
+                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
                 if (fp->fInputIdx != fActiveLimit) {
                     //  The look-behind expression matched, but the match did not
                     //    extend all the way to the point that we are looking behind from.
@@ -4015,13 +4019,13 @@ GC_Done:
                     break;
                 }
 
-                // Look-behind match is good.  Restore the orignal input string length,
+                // Look-behind match is good.  Restore the original input string region,
                 //   which had been truncated to pin the end of the lookbehind match to the
                 //   position being looked-behind.
-                int64_t originalInputLen = fData[opValue+3];
-                U_ASSERT(originalInputLen >= fActiveLimit);
-                U_ASSERT(originalInputLen <= fInputLength);
-                fActiveLimit = originalInputLen;
+                fActiveStart = fData[opValue+2];
+                fActiveLimit = fData[opValue+3];
+                U_ASSERT(fActiveStart >= 0);
+                U_ASSERT(fActiveLimit <= fInputLength);
             }
             break;
 
@@ -4046,8 +4050,8 @@ GC_Done:
                 U_ASSERT(continueLoc > fp->fPatIdx);
 
                 // Fetch (from data) the last input index where a match was attempted.
-                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
-                int64_t  &lbStartIdx = fData[opValue+2];
+                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
+                int64_t  &lbStartIdx = fData[opValue+4];
                 if (lbStartIdx < 0) {
                     // First time through loop.
                     lbStartIdx = fp->fInputIdx - minML;
@@ -4072,10 +4076,10 @@ GC_Done:
                     // We have tried all potential match starting points without
                     //  getting a match, which means that the negative lookbehind as
                     //  a whole has succeeded.  Jump forward to the continue location
-                    int64_t restoreInputLen = fData[opValue+3];
-                    U_ASSERT(restoreInputLen >= fActiveLimit);
-                    U_ASSERT(restoreInputLen <= fInputLength);
-                    fActiveLimit = restoreInputLen;
+                    fActiveStart = fData[opValue+2];
+                    fActiveLimit = fData[opValue+3];
+                    U_ASSERT(fActiveStart >= 0);
+                    U_ASSERT(fActiveLimit <= fInputLength);
                     fp->fPatIdx = continueLoc;
                     break;
                 }
@@ -4090,7 +4094,7 @@ GC_Done:
         case URX_LBN_END:
             // End of a negative look-behind block, after a successful match.
             {
-                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
+                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
                 if (fp->fInputIdx != fActiveLimit) {
                     //  The look-behind expression matched, but the match did not
                     //    extend all the way to the point that we are looking behind from.
@@ -4107,10 +4111,10 @@ GC_Done:
                 //   Restore the orignal input string length, which had been truncated
                 //   inorder to pin the end of the lookbehind match
                 //   to the position being looked-behind.
-                int64_t originalInputLen = fData[opValue+3];
-                U_ASSERT(originalInputLen >= fActiveLimit);
-                U_ASSERT(originalInputLen <= fInputLength);
-                fActiveLimit = originalInputLen;
+                fActiveStart = fData[opValue+2];
+                fActiveLimit = fData[opValue+3];
+                U_ASSERT(fActiveStart >= 0);
+                U_ASSERT(fActiveLimit <= fInputLength);
 
                 // Restore original stack position, discarding any state saved
                 //   by the successful pattern match.
@@ -4132,9 +4136,9 @@ GC_Done:
             //   This op scans through all matching input.
             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
             {
-                U_ASSERT(opValue > 0 && opValue < sets->size());
+                U_ASSERT(opValue > 0 && opValue < fSets->size());
                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
-                UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
+                UnicodeSet   *s  = (UnicodeSet *)fSets->elementAt(opValue);
 
                 // Loop through input, until either the input is exhausted or
                 //   we reach a character that is not a member of the set.
@@ -4287,7 +4291,7 @@ GC_Done:
         default:
             // Trouble.  The compiled pattern contains an entry with an
             //           unrecognized type tag.
-            U_ASSERT(FALSE);
+            UPRV_UNREACHABLE;
         }
 
         if (U_FAILURE(status)) {
@@ -4359,7 +4363,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
 
     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
-    UVector             *sets          = fPattern->fSets;
+    UVector             *fSets         = fPattern->fSets;
 
     const UChar         *inputBuf      = fInputText->chunkContents;
 
@@ -4937,7 +4941,7 @@ GC_Done:
                     break;
                 }
 
-                U_ASSERT(opValue > 0 && opValue < sets->size());
+                U_ASSERT(opValue > 0 && opValue < fSets->size());
 
                 // There is input left.  Pick up one char and test it for set membership.
                 UChar32  c;
@@ -4949,7 +4953,7 @@ GC_Done:
                         break;
                     }
                 } else {
-                    UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
+                    UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
                     if (s->contains(c)) {
                         // The character is in the set.  A Match.
                         break;
@@ -5223,9 +5227,9 @@ GC_Done:
                 if (newFP == (int64_t *)fp) {
                     break;
                 }
-                int32_t i;
-                for (i=0; i<fFrameSize; i++) {
-                    newFP[i] = ((int64_t *)fp)[i];
+                int32_t j;
+                for (j=0; j<fFrameSize; j++) {
+                    newFP[j] = ((int64_t *)fp)[j];
                 }
                 fp = (REStackFrame *)newFP;
                 fStack->setSize(newStackSize);
@@ -5347,11 +5351,13 @@ GC_Done:
 
         case URX_LA_START:
             {
-                // Entering a lookahead block.
+                // Entering a look around block.
                 // Save Stack Ptr, Input Pos.
-                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
+                U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
                 fData[opValue]   = fStack->size();
                 fData[opValue+1] = fp->fInputIdx;
+                fData[opValue+2] = fActiveStart;
+                fData[opValue+3] = fActiveLimit;
                 fActiveStart     = fLookStart;          // Set the match region change for
                 fActiveLimit     = fLookLimit;          //   transparent bounds.
             }
@@ -5359,9 +5365,9 @@ GC_Done:
 
         case URX_LA_END:
             {
-                // Leaving a look-ahead block.
+                // Leaving a look around block.
                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
-                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
+                U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
                 int32_t stackSize = fStack->size();
                 int32_t newStackSize = (int32_t)fData[opValue];
                 U_ASSERT(stackSize >= newStackSize);
@@ -5370,9 +5376,9 @@ GC_Done:
                     //   This makes the capture groups from within the look-ahead
                     //   expression available.
                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
-                    int32_t i;
-                    for (i=0; i<fFrameSize; i++) {
-                        newFP[i] = ((int64_t *)fp)[i];
+                    int32_t j;
+                    for (j=0; j<fFrameSize; j++) {
+                        newFP[j] = ((int64_t *)fp)[j];
                     }
                     fp = (REStackFrame *)newFP;
                     fStack->setSize(newStackSize);
@@ -5381,8 +5387,10 @@ GC_Done:
 
                 // Restore the active region bounds in the input string; they may have
                 //    been changed because of transparent bounds on a Region.
-                fActiveStart = fRegionStart;
-                fActiveLimit = fRegionLimit;
+                fActiveStart = fData[opValue+2];
+                fActiveLimit = fData[opValue+3];
+                U_ASSERT(fActiveStart >= 0);
+                U_ASSERT(fActiveLimit <= fInputLength);
             }
             break;
 
@@ -5445,17 +5453,19 @@ GC_Done:
         case URX_LB_START:
             {
                 // Entering a look-behind block.
-                // Save Stack Ptr, Input Pos.
+                // Save Stack Ptr, Input Pos and active input region.
                 //   TODO:  implement transparent bounds.  Ticket #6067
-                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
+                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
                 fData[opValue]   = fStack->size();
                 fData[opValue+1] = fp->fInputIdx;
-                // Init the variable containing the start index for attempted matches.
-                fData[opValue+2] = -1;
                 // Save input string length, then reset to pin any matches to end at
                 //   the current position.
+                fData[opValue+2] = fActiveStart;
                 fData[opValue+3] = fActiveLimit;
+                fActiveStart     = fRegionStart;
                 fActiveLimit     = fp->fInputIdx;
+                // Init the variable containing the start index for attempted matches.
+                fData[opValue+4] = -1;
             }
             break;
 
@@ -5473,12 +5483,12 @@ GC_Done:
                 U_ASSERT(minML >= 0);
 
                 // Fetch (from data) the last input index where a match was attempted.
-                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
-                int64_t  &lbStartIdx = fData[opValue+2];
+                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
+                int64_t  &lbStartIdx = fData[opValue+4];
                 if (lbStartIdx < 0) {
                     // First time through loop.
                     lbStartIdx = fp->fInputIdx - minML;
-                    if (lbStartIdx > 0) {
+                    if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
                         U16_SET_CP_START(inputBuf, 0, lbStartIdx);
                     }
                 } else {
@@ -5496,10 +5506,10 @@ GC_Done:
                     //  getting a match.  Backtrack out, and out of the
                     //   Look Behind altogether.
                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
-                    int64_t restoreInputLen = fData[opValue+3];
-                    U_ASSERT(restoreInputLen >= fActiveLimit);
-                    U_ASSERT(restoreInputLen <= fInputLength);
-                    fActiveLimit = restoreInputLen;
+                    fActiveStart = fData[opValue+2];
+                    fActiveLimit = fData[opValue+3];
+                    U_ASSERT(fActiveStart >= 0);
+                    U_ASSERT(fActiveLimit <= fInputLength);
                     break;
                 }
 
@@ -5513,7 +5523,7 @@ GC_Done:
         case URX_LB_END:
             // End of a look-behind block, after a successful match.
             {
-                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
+                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
                 if (fp->fInputIdx != fActiveLimit) {
                     //  The look-behind expression matched, but the match did not
                     //    extend all the way to the point that we are looking behind from.
@@ -5524,13 +5534,13 @@ GC_Done:
                     break;
                 }
 
-                // Look-behind match is good.  Restore the orignal input string length,
+                // Look-behind match is good.  Restore the original input string region,
                 //   which had been truncated to pin the end of the lookbehind match to the
                 //   position being looked-behind.
-                int64_t originalInputLen = fData[opValue+3];
-                U_ASSERT(originalInputLen >= fActiveLimit);
-                U_ASSERT(originalInputLen <= fInputLength);
-                fActiveLimit = originalInputLen;
+                fActiveStart = fData[opValue+2];
+                fActiveLimit = fData[opValue+3];
+                U_ASSERT(fActiveStart >= 0);
+                U_ASSERT(fActiveLimit <= fInputLength);
             }
             break;
 
@@ -5550,12 +5560,12 @@ GC_Done:
                 U_ASSERT(continueLoc > fp->fPatIdx);
 
                 // Fetch (from data) the last input index where a match was attempted.
-                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
-                int64_t  &lbStartIdx = fData[opValue+2];
+                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
+                int64_t  &lbStartIdx = fData[opValue+4];
                 if (lbStartIdx < 0) {
                     // First time through loop.
                     lbStartIdx = fp->fInputIdx - minML;
-                    if (lbStartIdx > 0) {
+                    if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
                         U16_SET_CP_START(inputBuf, 0, lbStartIdx);
                     }
                 } else {
@@ -5572,10 +5582,10 @@ GC_Done:
                     // We have tried all potential match starting points without
                     //  getting a match, which means that the negative lookbehind as
                     //  a whole has succeeded.  Jump forward to the continue location
-                    int64_t restoreInputLen = fData[opValue+3];
-                    U_ASSERT(restoreInputLen >= fActiveLimit);
-                    U_ASSERT(restoreInputLen <= fInputLength);
-                    fActiveLimit = restoreInputLen;
+                    fActiveStart = fData[opValue+2];
+                    fActiveLimit = fData[opValue+3];
+                    U_ASSERT(fActiveStart >= 0);
+                    U_ASSERT(fActiveLimit <= fInputLength);
                     fp->fPatIdx = continueLoc;
                     break;
                 }
@@ -5590,7 +5600,7 @@ GC_Done:
         case URX_LBN_END:
             // End of a negative look-behind block, after a successful match.
             {
-                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
+                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
                 if (fp->fInputIdx != fActiveLimit) {
                     //  The look-behind expression matched, but the match did not
                     //    extend all the way to the point that we are looking behind from.
@@ -5607,10 +5617,10 @@ GC_Done:
                 //   Restore the orignal input string length, which had been truncated
                 //   inorder to pin the end of the lookbehind match
                 //   to the position being looked-behind.
-                int64_t originalInputLen = fData[opValue+3];
-                U_ASSERT(originalInputLen >= fActiveLimit);
-                U_ASSERT(originalInputLen <= fInputLength);
-                fActiveLimit = originalInputLen;
+                fActiveStart = fData[opValue+2];
+                fActiveLimit = fData[opValue+3];
+                U_ASSERT(fActiveStart >= 0);
+                U_ASSERT(fActiveLimit <= fInputLength);
 
                 // Restore original stack position, discarding any state saved
                 //   by the successful pattern match.
@@ -5632,9 +5642,9 @@ GC_Done:
             //   This op scans through all matching input.
             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
             {
-                U_ASSERT(opValue > 0 && opValue < sets->size());
+                U_ASSERT(opValue > 0 && opValue < fSets->size());
                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
-                UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
+                UnicodeSet   *s  = (UnicodeSet *)fSets->elementAt(opValue);
 
                 // Loop through input, until either the input is exhausted or
                 //   we reach a character that is not a member of the set.
@@ -5787,7 +5797,7 @@ GC_Done:
         default:
             // Trouble.  The compiled pattern contains an entry with an
             //           unrecognized type tag.
-            U_ASSERT(FALSE);
+            UPRV_UNREACHABLE;
         }
 
         if (U_FAILURE(status)) {
@@ -5827,3 +5837,4 @@ UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
 U_NAMESPACE_END
 
 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
+