X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/57a6839dcb3bba09e8228b822b290604668416fe..4f1e1a09ce4daed860e35d359ce2fceccb0764e8:/icuSources/i18n/rematch.cpp

diff --git a/icuSources/i18n/rematch.cpp b/icuSources/i18n/rematch.cpp
index 6ffe6105..38e5fb29 100644
--- a/icuSources/i18n/rematch.cpp
+++ b/icuSources/i18n/rematch.cpp
@@ -1,7 +1,9 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
 /*
 **************************************************************************
-*   Copyright (C) 2002-2013 International Business Machines Corporation  *
-*   and others. All rights reserved.                                     *
+*   Copyright (C) 2002-2016 International Business Machines Corporation
+*   and others. All rights reserved.
 **************************************************************************
 */
 //
@@ -23,6 +25,7 @@
 #include "unicode/utf16.h"
 #include "uassert.h"
 #include "cmemory.h"
+#include "cstr.h"
 #include "uvector.h"
 #include "uvectr32.h"
 #include "uvectr64.h"
@@ -34,25 +37,6 @@
 // #include <malloc.h>        // Needed for heapcheck testing
 
 
-// Find progress callback
-// ----------------------
-// Macro to inline test & call to ReportFindProgress().  Eliminates unnecessary function call.
-//
-#define REGEXFINDPROGRESS_INTERRUPT(pos, status)     \
-    (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE)
-
-
-// Smart Backtracking
-// ------------------
-// When a failure would go back to a LOOP_C instruction,
-// strings, characters, and setrefs scan backwards for a valid start
-// character themselves, pop the stack, and save state, emulating the
-// LOOP_C's effect but assured that the next character of input is a
-// possible matching character.
-//
-// Good idea in theory; unfortunately it only helps out a few specific
-// cases and slows the engine down a little in the rest.
-
 U_NAMESPACE_BEGIN
 
 // Default limit for the size of the back track stack, to avoid system
@@ -69,6 +53,15 @@ static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
 //   This constant determines that state saves per tick number.
 static const int32_t TIMER_INITIAL_VALUE = 10000;
 
+
+// Report whether c is a line terminator: U+000A..U+000D, U+0085, U+2028 or U+2029.
+static inline UBool isLineTerminator(UChar32 c) {
+    // Union of all bits used by any terminator; a bit set outside it rules c out cheaply.
+    const UChar32 terminatorBits = 0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029;
+    const UBool candidate = (c & ~terminatorBits) == 0;
+    return candidate && ((0x0a <= c && c <= 0x0d) || c == 0x85 || c == 0x2028 || c == 0x2029);
+}
+
 //-----------------------------------------------------------------------------
 //
 //   Constructor and Destructor
@@ -236,10 +229,6 @@ void RegexMatcher::init(UErrorCode &status) {
     fInput             = NULL;
     fInputLength       = 0;
     fInputUniStrMaybeMutable = FALSE;
-
-    if (U_FAILURE(status)) {
-        fDeferredStatus = status;
-    }
 }
 
 //
@@ -252,7 +241,7 @@ void RegexMatcher::init2(UText *input, UErrorCode &status) {
         return;
     }
 
-    if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
+    if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) {
         fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
         if (fData == NULL) {
             status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
@@ -277,6 +266,9 @@ void RegexMatcher::init2(UText *input, UErrorCode &status) {
 
 static const UChar BACKSLASH  = 0x5c;
 static const UChar DOLLARSIGN = 0x24;
+static const UChar LEFTBRACKET = 0x7b;
+static const UChar RIGHTBRACKET = 0x7d;
+
 //--------------------------------------------------------------------------------
 //
 //    appendReplacement
@@ -351,8 +343,7 @@ RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
     //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
     //         move entire ranges not containing substitutions.
     UTEXT_SETNATIVEINDEX(replacement, 0);
-    UChar32 c = UTEXT_NEXT32(replacement);
-    while (c != U_SENTINEL) {
+    for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL;  c = UTEXT_NEXT32(replacement)) {
         if (c == BACKSLASH) {
             // Backslash Escape.  Copy the following char out without further checks.
             //                    Note:  Surrogate pairs don't need any special handling
@@ -418,51 +409,69 @@ RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
                 }
             }
         } else {
-            // We've got a $.  Pick up a capture group number if one follows.
-            // Consume at most the number of digits necessary for the largest capture
-            // number that is valid for this pattern.
+            // We've got a $.  Pick up a capture group name or number if one follows.
+            // Consume digits so long as the resulting group number <= the number of
+            // capture groups in the pattern.
 
-            int32_t numDigits = 0;
             int32_t groupNum  = 0;
-            UChar32 digitC;
-            for (;;) {
-                digitC = UTEXT_CURRENT32(replacement);
-                if (digitC == U_SENTINEL) {
-                    break;
-                }
-                if (u_isdigit(digitC) == FALSE) {
-                    break;
+            int32_t numDigits = 0;
+            UChar32 nextChar = utext_current32(replacement);
+            if (nextChar == LEFTBRACKET) {
+                // Scan for a Named Capture Group, ${name}.
+                UnicodeString groupName;
+                utext_next32(replacement);
+                while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {   // read the name up to the closing '}'
+                    nextChar = utext_next32(replacement);
+                    if (nextChar == U_SENTINEL) {                          // replacement text ended before '}'
+                        status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
+                    } else if ((nextChar >= 0x41 && nextChar <= 0x5a) ||       // A..Z
+                               (nextChar >= 0x61 && nextChar <= 0x7a) ||       // a..z
+                               (nextChar >= 0x30 && nextChar <= 0x39)) {       // 0..9 (0x30 is '0'; 0x31 wrongly excluded it)
+                        groupName.append(nextChar);
+                    } else if (nextChar == RIGHTBRACKET) {
+                        groupNum = uhash_geti(fPattern->fNamedCaptureMap, &groupName);
+                        if (groupNum == 0) {                               // lookup returned 0: treated as an unknown name
+                            status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
+                        }
+                    } else {
+                        // Character was something other than a name char or a closing '}'
+                        status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
+                    }
                 }
-                (void)UTEXT_NEXT32(replacement);
-                groupNum=groupNum*10 + u_charDigitValue(digitC);
-                numDigits++;
-                if (numDigits >= fPattern->fMaxCaptureDigits) {
-                    break;
+
+            } else if (u_isdigit(nextChar)) {
+                // $n    Scan for a capture group number
+                int32_t numCaptureGroups = fPattern->fGroupMap->size();
+                for (;;) {
+                    nextChar = UTEXT_CURRENT32(replacement);
+                    if (nextChar == U_SENTINEL) {
+                        break;
+                    }
+                    if (u_isdigit(nextChar) == FALSE) {
+                        break;
+                    }
+                    int32_t nextDigitVal = u_charDigitValue(nextChar);
+                    if (groupNum*10 + nextDigitVal > numCaptureGroups) {
+                        // Don't consume the next digit if it makes the capture group number too big.
+                        if (numDigits == 0) {
+                            status = U_INDEX_OUTOFBOUNDS_ERROR;
+                        }
+                        break;
+                    }
+                    (void)UTEXT_NEXT32(replacement);
+                    groupNum=groupNum*10 + nextDigitVal;
+                    ++numDigits;
                 }
+            } else {
+                // $ not followed by capture group name or number.
+                status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
             }
 
-
-            if (numDigits == 0) {
-                // The $ didn't introduce a group number at all.
-                // Treat it as just part of the substitution text.
-                UChar c16 = DOLLARSIGN;
-                destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
-            } else {
-                // Finally, append the capture group data to the destination.
+            if (U_SUCCESS(status)) {
                 destLen += appendGroup(groupNum, dest, status);
-                if (U_FAILURE(status)) {
-                    // Can fail if group number is out of range.
-                    break;
-                }
             }
-        }
-
-        if (U_FAILURE(status)) {
-            break;
-        } else {
-            c = UTEXT_NEXT32(replacement);
-        }
-    }
+        }  // End of $ capture group handling
+    }  // End of per-character loop through the replacement string.
 
     return *this;
 }
@@ -577,6 +586,23 @@ int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
     return (int32_t)end64(group, err);
 }
 
+//--------------------------------------------------------------------------------
+//
+//   findProgressInterrupt  This function is called once for each advance in the target
+//                          string from the find() function, and calls the user progress callback
+//                          function if there is one installed.
+//
+//         Return:  TRUE if the find operation is to be terminated.
+//                  FALSE if the find operation is to continue running.
+//
+//--------------------------------------------------------------------------------
+UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) {
+    if (fFindProgressCallbackFn == NULL || (*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) {
+        return FALSE;    // no callback installed, or the callback said to keep going
+    }
+    status = U_REGEX_STOPPED_BY_CALLER;    // the callback asked for the find to stop
+    return TRUE;
+}
 
 //--------------------------------------------------------------------------------
 //
@@ -584,15 +610,33 @@ int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
 //
 //--------------------------------------------------------------------------------
 UBool RegexMatcher::find() {
+    // This overload has no status parameter, so a deferred error simply reads as "no match".
+    if (U_FAILURE(fDeferredStatus)) { return FALSE; }
+    // Delegate to find(UErrorCode &) with a local status; only the match result is
+    // returned, and the local status is not examined afterwards.
+    UErrorCode localStatus = U_ZERO_ERROR;
+    return find(localStatus);
+}
+
+//--------------------------------------------------------------------------------
+//
+//   find()
+//
+//--------------------------------------------------------------------------------
+UBool RegexMatcher::find(UErrorCode &status) {
     // Start at the position of the last match end.  (Will be zero if the
     //   matcher has been reset.)
     //
+    if (U_FAILURE(status)) {
+        return FALSE;
+    }
     if (U_FAILURE(fDeferredStatus)) {
+        status = fDeferredStatus;
         return FALSE;
     }
 
     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
-        return findUsingChunk();
+        return findUsingChunk(status);
     }
 
     int64_t startPos = fMatchEnd;
@@ -640,9 +684,9 @@ UBool RegexMatcher::find() {
             return FALSE;
         }
     } else {
-        // For now, let the matcher discover that it can't match on its own
-        // We don't know how long the match len is in native characters
-        testStartLimit = fActiveLimit;
+        // We don't know exactly how long the minimum match length is in native characters.
+        // Treat anything > 0 as 1.
+        testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
     }
 
     UChar32  c;
@@ -653,8 +697,8 @@ UBool RegexMatcher::find() {
         // No optimization was found.
         //  Try a match at each input position.
         for (;;) {
-            MatchAt(startPos, FALSE, fDeferredStatus);
-            if (U_FAILURE(fDeferredStatus)) {
+            MatchAt(startPos, FALSE, status);
+            if (U_FAILURE(status)) {
                 return FALSE;
             }
             if (fMatch) {
@@ -670,7 +714,7 @@ UBool RegexMatcher::find() {
             // Note that it's perfectly OK for a pattern to have a zero-length
             //   match at the end of a string, so we must make sure that the loop
             //   runs with startPos == testStartLimit the last time through.
-            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
+            if  (findProgressInterrupt(startPos, status))
                 return FALSE;
         }
         U_ASSERT(FALSE);
@@ -682,8 +726,8 @@ UBool RegexMatcher::find() {
             fMatch = FALSE;
             return FALSE;
         }
-        MatchAt(startPos, FALSE, fDeferredStatus);
-        if (U_FAILURE(fDeferredStatus)) {
+        MatchAt(startPos, FALSE, status);
+        if (U_FAILURE(status)) {
             return FALSE;
         }
         return fMatch;
@@ -693,18 +737,18 @@ UBool RegexMatcher::find() {
         {
             // Match may start on any char from a pre-computed set.
             U_ASSERT(fPattern->fMinMatchLen > 0);
-            int64_t pos;
             UTEXT_SETNATIVEINDEX(fInputText, startPos);
             for (;;) {
+                int64_t pos = startPos;
                 c = UTEXT_NEXT32(fInputText);
-                pos = UTEXT_GETNATIVEINDEX(fInputText);
+                startPos = UTEXT_GETNATIVEINDEX(fInputText);
                 // c will be -1 (U_SENTINEL) at end of text, in which case we
                 // skip this next block (so we don't have a negative array index)
                 // and handle end of text in the following block.
                 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
                               (c>=256 && fPattern->fInitialChars->contains(c)))) {
-                    MatchAt(startPos, FALSE, fDeferredStatus);
-                    if (U_FAILURE(fDeferredStatus)) {
+                    MatchAt(pos, FALSE, status);
+                    if (U_FAILURE(status)) {
                         return FALSE;
                     }
                     if (fMatch) {
@@ -712,13 +756,12 @@ UBool RegexMatcher::find() {
                     }
                     UTEXT_SETNATIVEINDEX(fInputText, pos);
                 }
-                if (startPos >= testStartLimit) {
+                if (startPos > testStartLimit) {
                     fMatch = FALSE;
                     fHitEnd = TRUE;
                     return FALSE;
                 }
-                startPos = pos;
-	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
+                if  (findProgressInterrupt(startPos, status))
                     return FALSE;
             }
         }
@@ -730,28 +773,27 @@ UBool RegexMatcher::find() {
             // Match starts on exactly one char.
             U_ASSERT(fPattern->fMinMatchLen > 0);
             UChar32 theChar = fPattern->fInitialChar;
-            int64_t pos;
             UTEXT_SETNATIVEINDEX(fInputText, startPos);
             for (;;) {
+                int64_t pos = startPos;
                 c = UTEXT_NEXT32(fInputText);
-                pos = UTEXT_GETNATIVEINDEX(fInputText);
+                startPos = UTEXT_GETNATIVEINDEX(fInputText);
                 if (c == theChar) {
-                    MatchAt(startPos, FALSE, fDeferredStatus);
-                    if (U_FAILURE(fDeferredStatus)) {
+                    MatchAt(pos, FALSE, status);
+                    if (U_FAILURE(status)) {
                         return FALSE;
                     }
                     if (fMatch) {
                         return TRUE;
                     }
-                    UTEXT_SETNATIVEINDEX(fInputText, pos);
+                    UTEXT_SETNATIVEINDEX(fInputText, startPos);
                 }
-                if (startPos >= testStartLimit) {
+                if (startPos > testStartLimit) {
                     fMatch = FALSE;
                     fHitEnd = TRUE;
                     return FALSE;
                 }
-                startPos = pos;
-	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
+                if  (findProgressInterrupt(startPos, status))
                     return FALSE;
            }
         }
@@ -761,8 +803,8 @@ UBool RegexMatcher::find() {
         {
             UChar32  c;
             if (startPos == fAnchorStart) {
-                MatchAt(startPos, FALSE, fDeferredStatus);
-                if (U_FAILURE(fDeferredStatus)) {
+                MatchAt(startPos, FALSE, status);
+                if (U_FAILURE(status)) {
                     return FALSE;
                 }
                 if (fMatch) {
@@ -780,8 +822,8 @@ UBool RegexMatcher::find() {
             if (fPattern->fFlags & UREGEX_UNIX_LINES) {
                 for (;;) {
                     if (c == 0x0a) {
-                            MatchAt(startPos, FALSE, fDeferredStatus);
-                            if (U_FAILURE(fDeferredStatus)) {
+                            MatchAt(startPos, FALSE, status);
+                            if (U_FAILURE(status)) {
                                 return FALSE;
                             }
                             if (fMatch) {
@@ -799,25 +841,24 @@ UBool RegexMatcher::find() {
                     // Note that it's perfectly OK for a pattern to have a zero-length
                     //   match at the end of a string, so we must make sure that the loop
                     //   runs with startPos == testStartLimit the last time through.
-		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
+                    if  (findProgressInterrupt(startPos, status))
                         return FALSE;
                 }
             } else {
                 for (;;) {
-                    if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
-                        ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
-                            if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
-                                (void)UTEXT_NEXT32(fInputText);
-                                startPos = UTEXT_GETNATIVEINDEX(fInputText);
-                            }
-                            MatchAt(startPos, FALSE, fDeferredStatus);
-                            if (U_FAILURE(fDeferredStatus)) {
-                                return FALSE;
-                            }
-                            if (fMatch) {
-                                return TRUE;
-                            }
-                            UTEXT_SETNATIVEINDEX(fInputText, startPos);
+                    if (isLineTerminator(c)) {
+                        if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
+                            (void)UTEXT_NEXT32(fInputText);
+                            startPos = UTEXT_GETNATIVEINDEX(fInputText);
+                        }
+                        MatchAt(startPos, FALSE, status);
+                        if (U_FAILURE(status)) {
+                            return FALSE;
+                        }
+                        if (fMatch) {
+                            return TRUE;
+                        }
+                        UTEXT_SETNATIVEINDEX(fInputText, startPos);
                     }
                     if (startPos >= testStartLimit) {
                         fMatch = FALSE;
@@ -829,7 +870,7 @@ UBool RegexMatcher::find() {
                     // Note that it's perfectly OK for a pattern to have a zero-length
                     //   match at the end of a string, so we must make sure that the loop
                     //   runs with startPos == testStartLimit the last time through.
-		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
+                    if  (findProgressInterrupt(startPos, status))
                         return FALSE;
                 }
             }
@@ -866,7 +907,7 @@ UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
         return FALSE;
     }
     fMatchEnd = nativeStart;
-    return find();
+    return find(status);
 }
 
 
@@ -876,7 +917,7 @@ UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
 //                       entire string is available in the UText's chunk buffer.
 //
 //--------------------------------------------------------------------------------
-UBool RegexMatcher::findUsingChunk() {
+UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
     // Start at the position of the last match end.  (Will be zero if the
     //   matcher has been reset.
     //
@@ -917,6 +958,7 @@ UBool RegexMatcher::findUsingChunk() {
     //   the minimum length match would extend past the end of the input.
     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
     //          Be aware of possible overflows if making changes here.
+    //   Note:  a match can begin at inputBuf + testLen; it is an inclusive limit.
     int32_t testLen  = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
     if (startPos > testLen) {
         fMatch = FALSE;
@@ -932,8 +974,8 @@ UBool RegexMatcher::findUsingChunk() {
         // No optimization was found.
         //  Try a match at each input position.
         for (;;) {
-            MatchChunkAt(startPos, FALSE, fDeferredStatus);
-            if (U_FAILURE(fDeferredStatus)) {
+            MatchChunkAt(startPos, FALSE, status);
+            if (U_FAILURE(status)) {
                 return FALSE;
             }
             if (fMatch) {
@@ -947,7 +989,7 @@ UBool RegexMatcher::findUsingChunk() {
             // Note that it's perfectly OK for a pattern to have a zero-length
             //   match at the end of a string, so we must make sure that the loop
             //   runs with startPos == testLen the last time through.
-            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
+            if  (findProgressInterrupt(startPos, status))
                 return FALSE;
         }
         U_ASSERT(FALSE);
@@ -959,8 +1001,8 @@ UBool RegexMatcher::findUsingChunk() {
             fMatch = FALSE;
             return FALSE;
         }
-        MatchChunkAt(startPos, FALSE, fDeferredStatus);
-        if (U_FAILURE(fDeferredStatus)) {
+        MatchChunkAt(startPos, FALSE, status);
+        if (U_FAILURE(status)) {
             return FALSE;
         }
         return fMatch;
@@ -975,20 +1017,20 @@ UBool RegexMatcher::findUsingChunk() {
             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
             if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
                 (c>=256 && fPattern->fInitialChars->contains(c))) {
-                MatchChunkAt(pos, FALSE, fDeferredStatus);
-                if (U_FAILURE(fDeferredStatus)) {
+                MatchChunkAt(pos, FALSE, status);
+                if (U_FAILURE(status)) {
                     return FALSE;
                 }
                 if (fMatch) {
                     return TRUE;
                 }
             }
-            if (pos >= testLen) {
+            if (startPos > testLen) {
                 fMatch = FALSE;
                 fHitEnd = TRUE;
                 return FALSE;
             }
-            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
+            if  (findProgressInterrupt(startPos, status))
                 return FALSE;
         }
     }
@@ -1004,36 +1046,45 @@ UBool RegexMatcher::findUsingChunk() {
             int32_t pos = startPos;
             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
             if (c == theChar) {
-                MatchChunkAt(pos, FALSE, fDeferredStatus);
-                if (U_FAILURE(fDeferredStatus)) {
+                MatchChunkAt(pos, FALSE, status);
+                if (U_FAILURE(status)) {
                     return FALSE;
                 }
                 if (fMatch) {
                     return TRUE;
                 }
             }
-            if (pos >= testLen) {
+            if (startPos > testLen) {
                 fMatch = FALSE;
                 fHitEnd = TRUE;
                 return FALSE;
             }
-            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
+            if  (findProgressInterrupt(startPos, status))
                 return FALSE;
         }
     }
-        U_ASSERT(FALSE);
+    U_ASSERT(FALSE);
 
     case START_LINE:
     {
         UChar32  c;
         if (startPos == fAnchorStart) {
-            MatchChunkAt(startPos, FALSE, fDeferredStatus);
-            if (U_FAILURE(fDeferredStatus)) {
+            MatchChunkAt(startPos, FALSE, status);
+            if (U_FAILURE(status)) {
                 return FALSE;
             }
             if (fMatch) {
                 return TRUE;
             }
+            // In bug 31063104 which has a zero-length text buffer we get here with
+            // inputBuf=NULL, startPos=fActiveLimit=0 (and fMatch F) which violates the
+            // requirement for U16_FWD_1 (utf16.h) that startPos < fActiveLimit. Having
+            // inputBuf=NULL (chunkContents NULL) is probably due to an error in the
+            // CFStringUText functions. Nevertheless, to be defensive, add test below.
+            if (startPos >= testLen) {
+                fHitEnd = TRUE;
+                return FALSE;
+            }
             U16_FWD_1(inputBuf, startPos, fActiveLimit);
         }
 
@@ -1041,8 +1092,8 @@ UBool RegexMatcher::findUsingChunk() {
             for (;;) {
                 c = inputBuf[startPos-1];
                 if (c == 0x0a) {
-                    MatchChunkAt(startPos, FALSE, fDeferredStatus);
-                    if (U_FAILURE(fDeferredStatus)) {
+                    MatchChunkAt(startPos, FALSE, status);
+                    if (U_FAILURE(status)) {
                         return FALSE;
                     }
                     if (fMatch) {
@@ -1058,19 +1109,18 @@ UBool RegexMatcher::findUsingChunk() {
                 // Note that it's perfectly OK for a pattern to have a zero-length
                 //   match at the end of a string, so we must make sure that the loop
                 //   runs with startPos == testLen the last time through.
-	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
+                if  (findProgressInterrupt(startPos, status))
                     return FALSE;
             }
         } else {
             for (;;) {
                 c = inputBuf[startPos-1];
-                if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
-                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
+                if (isLineTerminator(c)) {
                     if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
                         startPos++;
                     }
-                    MatchChunkAt(startPos, FALSE, fDeferredStatus);
-                    if (U_FAILURE(fDeferredStatus)) {
+                    MatchChunkAt(startPos, FALSE, status);
+                    if (U_FAILURE(status)) {
                         return FALSE;
                     }
                     if (fMatch) {
@@ -1086,7 +1136,7 @@ UBool RegexMatcher::findUsingChunk() {
                 // Note that it's perfectly OK for a pattern to have a zero-length
                 //   match at the end of a string, so we must make sure that the loop
                 //   runs with startPos == testLen the last time through.
-	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
+                if  (findProgressInterrupt(startPos, status))
                     return FALSE;
             }
         }
@@ -1161,98 +1211,32 @@ UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UE
 
 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
     UnicodeString result;
-    if (U_FAILURE(status)) {
+    int64_t groupStart = start64(groupNum, status);   // status is checked below, after both lookups
+    int64_t groupEnd = end64(groupNum, status);
+    if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) {   // error, start of -1 (presumably an unmatched group), or empty group: return the empty string
         return result;
     }
-    UText resultText = UTEXT_INITIALIZER;
-    utext_openUnicodeString(&resultText, &result, &status);
-    group(groupNum, &resultText, status);
-    utext_close(&resultText);
-    return result;
-}
-
-
-//  Return deep (mutable) clone
-//		Technology Preview (as an API), but note that the UnicodeString API is implemented
-//		using this function.
-UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {
-    if (U_FAILURE(status)) {
-        return dest;
-    }
-
-    if (U_FAILURE(fDeferredStatus)) {
-        status = fDeferredStatus;
-    } else if (fMatch == FALSE) {
-        status = U_REGEX_INVALID_STATE;
-    } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
-        status = U_INDEX_OUTOFBOUNDS_ERROR;
-    }
-    if (U_FAILURE(status)) {
-        return dest;
-    }
 
-    int64_t s, e;
-    if (groupNum == 0) {
-        s = fMatchStart;
-        e = fMatchEnd;
-    } else {
-        int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
-        U_ASSERT(groupOffset < fPattern->fFrameSize);
-        U_ASSERT(groupOffset >= 0);
-        s = fFrame->fExtra[groupOffset];
-        e = fFrame->fExtra[groupOffset+1];
-    }
-
-    if (s < 0) {
-        // A capture group wasn't part of the match
-        if (dest) {
-            utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
-            return dest;
-        } else {
-            return utext_openUChars(NULL, NULL, 0, &status);
-        }
+    // Get the group length using a utext_extract preflight.
+    //    UText is actually pretty efficient at this when underlying encoding is UTF-16.
+    int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &status);   // NULL buffer, zero capacity: length query only
+    if (status != U_BUFFER_OVERFLOW_ERROR) {   // the preflight is expected to report overflow; anything else returns the empty string
+        return result;
     }
-    U_ASSERT(s <= e);
 
-    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
-        U_ASSERT(e <= fInputLength);
-        if (dest) {
-            utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status);
-        } else {
-            UText groupText = UTEXT_INITIALIZER;
-            utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status);
-            dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
-            utext_close(&groupText);
-        }
+    status = U_ZERO_ERROR;   // clear the expected preflight overflow before the real extraction
+    UChar *buf = result.getBuffer(length);   // writable storage inside result for the extracted text
+    if (buf == NULL) {
+        status = U_MEMORY_ALLOCATION_ERROR;
     } else {
-        int32_t len16;
-        if (UTEXT_USES_U16(fInputText)) {
-            len16 = (int32_t)(e-s);
-        } else {
-            UErrorCode lengthStatus = U_ZERO_ERROR;
-            len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
-        }
-        UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
-        if (groupChars == NULL) {
-            status = U_MEMORY_ALLOCATION_ERROR;
-            return dest;
-        }
-        utext_extract(fInputText, s, e, groupChars, len16+1, &status);
-
-        if (dest) {
-            utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);
-        } else {
-            UText groupText = UTEXT_INITIALIZER;
-            utext_openUChars(&groupText, groupChars, len16, &status);
-            dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
-            utext_close(&groupText);
-        }
-
-        uprv_free(groupChars);
+        int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status);
+        result.releaseBuffer(extractLength);   // hand the buffer back with the extracted length
+        U_ASSERT(length == extractLength);
     }
-    return dest;
+    return result;
 }
 
+
 //--------------------------------------------------------------------------------
 //
 //  appendGroup() -- currently internal only, appends a group to a UText rather
@@ -1333,8 +1317,6 @@ int32_t RegexMatcher::groupCount() const {
     return fPattern->fGroupMap->size();
 }
 
-
-
 //--------------------------------------------------------------------------------
 //
 //  hasAnchoringBounds()
@@ -1870,6 +1852,9 @@ RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
     if (fPattern->fNeedsAltInput) {
         fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
     }
+    if (U_FAILURE(fDeferredStatus)) {
+        return *this;
+    }
     fInputLength = utext_nativeLength(fInputText);
 
     reset();
@@ -1894,6 +1879,9 @@ RegexMatcher &RegexMatcher::reset(UText *input) {
     if (fInputText != input) {
         fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
         if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
+        if (U_FAILURE(fDeferredStatus)) {
+            return *this;
+        }
         fInputLength = utext_nativeLength(fInputText);
 
         delete fInput;
@@ -1981,6 +1969,67 @@ void RegexMatcher::setTrace(UBool state) {
 
 
 
+/**
+  *  Replace the entire contents of the destination UText with a substring of the source UText.
+  *
+  *     @param src    The source UText.
+  *     @param dest   The destination UText. Must be writable. May be NULL, in which case a new UText is opened.
+  *     @param start  Start index of source substring.
+  *     @param limit  Limit index of source substring.
+  *     @param status An error code. If it is already a failure on entry, dest is returned unchanged.
+  *     @return       dest if it was non-NULL; otherwise a newly opened UText, or NULL when an error is detected.
+  */
+static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) {
+    if (U_FAILURE(*status)) {
+        return dest;
+    }
+    if (start == limit) {   // Empty substring: no extraction buffer is needed.
+        if (dest) {
+            utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status);   // Empty out dest.
+            return dest;
+        } else {
+            return utext_openUChars(NULL, NULL, 0, status);   // New, empty UText.
+        }
+    }
+    int32_t length = utext_extract(src, start, limit, NULL, 0, status);   // Preflight (NULL buffer, capacity 0) to get the length.
+    if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {   // Overflow is expected from the preflight; anything else is real.
+        return dest;
+    }
+    *status = U_ZERO_ERROR;   // Discard the preflight status.
+    MaybeStackArray<UChar, 40> buffer;
+    if (length >= buffer.getCapacity()) {
+        UChar *newBuf = buffer.resize(length+1);   // Leave space for terminating Nul.
+        if (newBuf == NULL) {
+            *status = U_MEMORY_ALLOCATION_ERROR;   // NOTE(review): no early return; presumably the calls below no-op on failure - confirm.
+        }
+    }
+    utext_extract(src, start, limit, buffer.getAlias(), length+1, status);
+    if (dest) {
+        utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status);
+        return dest;
+    }
+
+    // Caller did not provide a preexisting UText.
+    // Open a new one, and have it adopt the text buffer storage.
+    if (U_FAILURE(*status)) {
+        return NULL;
+    }
+    int32_t ownedLength = 0;
+    UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);   // Obtain a buffer that is no longer managed by 'buffer'.
+    if (ownedBuf == NULL) {
+        *status = U_MEMORY_ALLOCATION_ERROR;
+        return NULL;
+    }
+    UText *result = utext_openUChars(NULL, ownedBuf, length, status);
+    if (U_FAILURE(*status)) {
+        uprv_free(ownedBuf);   // Open failed, so nothing adopted the buffer; release it here.
+        return NULL;
+    }
+    result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT);   // Flag the new UText as owning ownedBuf.
+    return result;
+}
+
+
 //---------------------------------------------------------------------
 //
 //   split
@@ -2147,7 +2196,8 @@ int32_t  RegexMatcher::split(UText *input,
                     break;
                 }
                 i++;
-                dest[i] = group(groupNum, dest[i], status);
+                dest[i] = utext_extract_replace(fInputText, dest[i],
+                                               start64(groupNum, status), end64(groupNum, status), &status);
             }
 
             if (nextOutputStringStart == fActiveLimit) {
@@ -2159,7 +2209,7 @@ int32_t  RegexMatcher::split(UText *input,
                     if (dest[i] == NULL) {
                         dest[i] = utext_openUChars(NULL, NULL, 0, &status);
                     } else {
-                        static UChar emptyString[] = {(UChar)0};
+                        static const UChar emptyString[] = {(UChar)0};
                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
                     }
                 }
@@ -2459,6 +2509,10 @@ REStackFrame *RegexMatcher::resetStack() {
     fStack->removeAllElements();
 
     REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
+    if(U_FAILURE(fDeferredStatus)) {
+        return NULL;
+    }
+
     int32_t i;
     for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
         iFrame->fExtra[i] = -1;
@@ -2624,29 +2678,6 @@ void RegexMatcher::IncrementTime(UErrorCode &status) {
     }
 }
 
-//--------------------------------------------------------------------------------
-//
-//   ReportFindProgress     This function is called once for each advance in the target
-//                          string from the find() function, and calls the user progress callback
-//                          function if there is one installed.
-//
-//                          NOTE:
-//
-//                          If the match operation needs to be aborted because the user
-//                          callback asked for it, just set an error status.
-//                          The engine will pick that up and stop in its outer loop.
-//
-//--------------------------------------------------------------------------------
-UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) {
-    if (fFindProgressCallbackFn != NULL) {
-        if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) {
-            status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/;
-            return FALSE;
-        }
-    }
-    return TRUE;
-}
-
 //--------------------------------------------------------------------------------
 //
 //   StateSave
@@ -2669,9 +2700,12 @@ UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) {
 //
 //--------------------------------------------------------------------------------
 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return fp;
+    }
     // push storage for a new frame.
     int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
-    if (newFP == NULL) {
+    if (U_FAILURE(status)) {
         // Failure on attempted stack expansion.
         //   Stack function set some other error code, change it to a more
         //   specific one for regular expressions.
@@ -2702,6 +2736,18 @@ inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatId
     return (REStackFrame *)newFP;
 }
 
+#if defined(REGEX_DEBUG)
+namespace {   // NOTE(review): the callers in this file sit under REGEX_RUN_DEBUG - confirm it implies REGEX_DEBUG.
+UnicodeString StringFromUText(UText *ut) {   // Debug helper: copy all code points of ut into a UnicodeString.
+    UnicodeString result;
+    for (UChar32 c = utext_next32From(ut, 0); c != U_SENTINEL; c = UTEXT_NEXT32(ut)) {   // From index 0 until U_SENTINEL.
+        result.append(c);
+    }
+    return result;
+}
+}   // namespace
+#endif // REGEX_DEBUG
+
 
 //--------------------------------------------------------------------------------
 //
@@ -2721,32 +2767,10 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
     int32_t     opValue;               //    and the operand value.
 
 #ifdef REGEX_RUN_DEBUG
-    if (fTraceDebug)
-    {
+    if (fTraceDebug) {
         printf("MatchAt(startIdx=%ld)\n", startIdx);
-        printf("Original Pattern: ");
-        UChar32 c = utext_next32From(fPattern->fPattern, 0);
-        while (c != U_SENTINEL) {
-            if (c<32 || c>256) {
-                c = '.';
-            }
-            printf("%c", c);
-
-            c = UTEXT_NEXT32(fPattern->fPattern);
-        }
-        printf("\n");
-        printf("Input String: ");
-        c = utext_next32From(fInputText, 0);
-        while (c != U_SENTINEL) {
-            if (c<32 || c>256) {
-                c = '.';
-            }
-            printf("%c", c);
-
-            c = UTEXT_NEXT32(fInputText);
-        }
-        printf("\n");
-        printf("\n");
+        printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
+        printf("Input String:     \"%s\"\n\n", CStr(StringFromUText(fInputText))());
     }
 #endif
 
@@ -2763,6 +2787,10 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
 
     fFrameSize = fPattern->fFrameSize;
     REStackFrame        *fp            = resetStack();
+    if (U_FAILURE(fDeferredStatus)) {
+        status = fDeferredStatus;
+        return;
+    }
 
     fp->fPatIdx   = 0;
     fp->fInputIdx = startIdx;
@@ -2916,9 +2944,9 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
                 //   end of input, succeed.
                 UChar32 c = UTEXT_NEXT32(fInputText);
                 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
-                    if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
+                    if (isLineTerminator(c)) {
                         // If not in the middle of a CR/LF sequence
-                      if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
+                        if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
                             // At new-line at end of input. Success
                             fHitEnd = TRUE;
                             fRequireEnd = TRUE;
@@ -2974,7 +3002,7 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
                  // It makes no difference where the new-line is within the input.
                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
                  UChar32 c = UTEXT_CURRENT32(fInputText);
-                 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
+                 if (isLineTerminator(c)) {
                      // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
                      //  In multi-line mode, hitting a new-line just before the end of input does not
                      //   set the hitEnd or requireEnd flags
@@ -3023,8 +3051,7 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
                //   unless we are at the end of input
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
                UChar32  c = UTEXT_PREVIOUS32(fInputText);
-               if ((fp->fInputIdx < fAnchorLimit) &&
-                   ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
+               if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
                    //  It's a new-line.  ^ is true.  Success.
                    //  TODO:  what should be done with positions between a CR and LF?
                    break;
@@ -3105,6 +3132,68 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
             break;
 
 
+        case URX_BACKSLASH_H:            // Test for \h, horizontal white space.
+            {
+                if (fp->fInputIdx >= fActiveLimit) {
+                    fHitEnd = TRUE;
+                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
+                    break;
+                }
+                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
+                UChar32 c = UTEXT_NEXT32(fInputText);
+                int8_t ctype = u_charType(c);
+                UBool success = (ctype == U_SPACE_SEPARATOR || c == 9);  // SPACE_SEPARATOR || TAB
+                success ^= (UBool)(opValue != 0);        // flip sense for \H
+                if (success) {
+                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
+                } else {
+                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
+                }
+            }
+            break;
+
+
+        case URX_BACKSLASH_R:            // Test for \R, any line break sequence.
+            {
+                if (fp->fInputIdx >= fActiveLimit) {
+                    fHitEnd = TRUE;
+                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
+                    break;
+                }
+                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
+                UChar32 c = UTEXT_NEXT32(fInputText);
+                if (isLineTerminator(c)) {
+                    if (c == 0x0d && utext_current32(fInputText) == 0x0a) {
+                        utext_next32(fInputText);
+                    }
+                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
+                } else {
+                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
+                }
+            }
+            break;
+
+
+        case URX_BACKSLASH_V:            // \v, any single line ending character.
+            {
+                if (fp->fInputIdx >= fActiveLimit) {
+                    fHitEnd = TRUE;
+                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
+                    break;
+                }
+                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
+                UChar32 c = UTEXT_NEXT32(fInputText);
+                UBool success = isLineTerminator(c);
+                success ^= (UBool)(opValue != 0);        // flip sense for \V
+                if (success) {
+                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
+                } else {
+                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
+                }
+            }
+            break;
+
+
         case URX_BACKSLASH_X:
             //  Match a Grapheme, as defined by Unicode TR 29.
             //  Differs slightly from Perl, which consumes combining marks independently
@@ -3332,8 +3421,7 @@ GC_Done:
 
                 // There is input left.  Advance over one char, unless we've hit end-of-line
                 UChar32 c = UTEXT_NEXT32(fInputText);
-                if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
-                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
+                if (isLineTerminator(c)) {
                     // End of line in normal mode.   . does not match.
                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
                     break;
@@ -3487,7 +3575,14 @@ GC_Done:
                         }
                     }
                     fp = StateSave(fp, fp->fPatIdx, status);
+                } else {
+                    // Increment time-out counter. (StateSave() does it if count >= minCount)
+                    fTickCounter--;
+                    if (fTickCounter <= 0) {
+                        IncrementTime(status);    // Re-initializes fTickCounter
+                    }
                 }
+
                 fp->fPatIdx = opValue + 4;    // Loop back.
             }
             break;
@@ -3544,6 +3639,11 @@ GC_Done:
                     // We haven't met the minimum number of matches yet.
                     //   Loop back for another one.
                     fp->fPatIdx = opValue + 4;    // Loop back.
+                    // Increment time-out counter. (StateSave() does it if count >= minCount)
+                    fTickCounter--;
+                    if (fTickCounter <= 0) {
+                        IncrementTime(status);    // Re-initializes fTickCounter
+                    }
                 } else {
                     // We do have the minimum number of matches.
 
@@ -3851,28 +3951,38 @@ GC_Done:
                 //   of this op in the pattern.
                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
+                if (!UTEXT_USES_U16(fInputText)) {
+                    // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
+                    // The max length need not be exact; it just needs to be >= actual maximum.
+                    maxML *= 3;
+                }
                 U_ASSERT(minML <= maxML);
                 U_ASSERT(minML >= 0);
 
                 // Fetch (from data) the last input index where a match was attempted.
                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
-                int64_t  *lbStartIdx = &fData[opValue+2];
-                if (*lbStartIdx < 0) {
+                int64_t  &lbStartIdx = fData[opValue+2];
+                if (lbStartIdx < 0) {
                     // First time through loop.
-                    *lbStartIdx = fp->fInputIdx - minML;
+                    lbStartIdx = fp->fInputIdx - minML;
+                    if (lbStartIdx > 0) {
+                        // move index to a code point boundary, if it's not on one already.
+                        UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
+                        lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
+                    }
                 } else {
                     // 2nd through nth time through the loop.
                     // Back up start position for match by one.
-                    if (*lbStartIdx == 0) {
-                        (*lbStartIdx)--;
+                    if (lbStartIdx == 0) {
+                        (lbStartIdx)--;
                     } else {
-                        UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
+                        UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
                         (void)UTEXT_PREVIOUS32(fInputText);
-                        *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
+                        lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
                     }
                 }
 
-                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
+                if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
                     // We have tried all potential match starting points without
                     //  getting a match.  Backtrack out, and out of the
                     //   Look Behind altogether.
@@ -3887,7 +3997,7 @@ GC_Done:
                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
                 //      (successful match will fall off the end of the loop.)
                 fp = StateSave(fp, fp->fPatIdx-3, status);
-                fp->fInputIdx = *lbStartIdx;
+                fp->fInputIdx = lbStartIdx;
             }
             break;
 
@@ -3924,6 +4034,11 @@ GC_Done:
                 // Fetch the extra parameters of this op.
                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
+                if (!UTEXT_USES_U16(fInputText)) {
+                    // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
+                    // The max length need not be exact; it just needs to be >= actual maximum.
+                    maxML *= 3;
+                }
                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
                         continueLoc = URX_VAL(continueLoc);
                 U_ASSERT(minML <= maxML);
@@ -3932,23 +4047,28 @@ GC_Done:
 
                 // Fetch (from data) the last input index where a match was attempted.
                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
-                int64_t  *lbStartIdx = &fData[opValue+2];
-                if (*lbStartIdx < 0) {
+                int64_t  &lbStartIdx = fData[opValue+2];
+                if (lbStartIdx < 0) {
                     // First time through loop.
-                    *lbStartIdx = fp->fInputIdx - minML;
+                    lbStartIdx = fp->fInputIdx - minML;
+                    if (lbStartIdx > 0) {
+                        // move index to a code point boundary, if it's not on one already.
+                        UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
+                        lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
+                    }
                 } else {
                     // 2nd through nth time through the loop.
                     // Back up start position for match by one.
-                    if (*lbStartIdx == 0) {
-                        (*lbStartIdx)--;
+                    if (lbStartIdx == 0) {
+                        (lbStartIdx)--;
                     } else {
-                        UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
+                        UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
                         (void)UTEXT_PREVIOUS32(fInputText);
-                        *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
+                        lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
                     }
                 }
 
-                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
+                if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
                     // We have tried all potential match starting points without
                     //  getting a match, which means that the negative lookbehind as
                     //  a whole has succeeded.  Jump forward to the continue location
@@ -3963,7 +4083,7 @@ GC_Done:
                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
                 //      (successful match will cause a FAIL out of the loop altogether.)
                 fp = StateSave(fp, fp->fPatIdx-4, status);
-                fp->fInputIdx = *lbStartIdx;
+                fp->fInputIdx = lbStartIdx;
             }
             break;
 
@@ -4090,7 +4210,7 @@ GC_Done:
                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
                                (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
-                                    (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) {
+                                    isLineTerminator(c))) {
                                 //  char is a line ending.  Exit the scanning loop.
                                 break;
                             }
@@ -4225,29 +4345,8 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
 #ifdef REGEX_RUN_DEBUG
     if (fTraceDebug) {
         printf("MatchAt(startIdx=%d)\n", startIdx);
-        printf("Original Pattern: ");
-        UChar32 c = utext_next32From(fPattern->fPattern, 0);
-        while (c != U_SENTINEL) {
-            if (c<32 || c>256) {
-                c = '.';
-            }
-            printf("%c", c);
-
-            c = UTEXT_NEXT32(fPattern->fPattern);
-        }
-        printf("\n");
-        printf("Input String: ");
-        c = utext_next32From(fInputText, 0);
-        while (c != U_SENTINEL) {
-            if (c<32 || c>256) {
-                c = '.';
-            }
-            printf("%c", c);
-
-            c = UTEXT_NEXT32(fInputText);
-        }
-        printf("\n");
-        printf("\n");
+        printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
+        printf("Input String:     \"%s\"\n\n", CStr(StringFromUText(fInputText))());
     }
 #endif
 
@@ -4266,6 +4365,10 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
 
     fFrameSize = fPattern->fFrameSize;
     REStackFrame        *fp            = resetStack();
+    if (U_FAILURE(fDeferredStatus)) {
+        status = fDeferredStatus;
+        return;
+    }
 
     fp->fPatIdx   = 0;
     fp->fInputIdx = startIdx;
@@ -4421,7 +4524,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
                 UChar32 c;
                 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
 
-                if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
+                if (isLineTerminator(c)) {
                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
                         // At new-line at end of input. Success
                         fHitEnd = TRUE;
@@ -4475,7 +4578,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
                 // If we are positioned just before a new-line, succeed.
                 // It makes no difference where the new-line is within the input.
                 UChar32 c = inputBuf[fp->fInputIdx];
-                if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
+                if (isLineTerminator(c)) {
                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
                     //  In multi-line mode, hitting a new-line just before the end of input does not
                     //   set the hitEnd or requireEnd flags
@@ -4523,7 +4626,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
                 //   unless we are at the end of input
                 UChar  c = inputBuf[fp->fInputIdx - 1];
                 if ((fp->fInputIdx < fAnchorLimit) &&
-                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
+                    isLineTerminator(c)) {
                     //  It's a new-line.  ^ is true.  Success.
                     //  TODO:  what should be done with positions between a CR and LF?
                     break;
@@ -4600,6 +4703,69 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
             break;
 
 
+        case URX_BACKSLASH_H:            // Test for \h, horizontal white space.
+            {
+                if (fp->fInputIdx >= fActiveLimit) {
+                    fHitEnd = TRUE;
+                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
+                    break;
+                }
+                UChar32 c;
+                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
+                int8_t ctype = u_charType(c);
+                UBool success = (ctype == U_SPACE_SEPARATOR || c == 9);  // SPACE_SEPARATOR || TAB
+                success ^= (UBool)(opValue != 0);        // flip sense for \H
+                if (!success) {
+                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
+                }
+            }
+            break;
+
+
+        case URX_BACKSLASH_R:            // Test for \R, any line break sequence.
+            {
+                if (fp->fInputIdx >= fActiveLimit) {
+                    fHitEnd = TRUE;
+                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
+                    break;
+                }
+                UChar32 c;
+                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
+                if (isLineTerminator(c)) {
+                    if (c == 0x0d && fp->fInputIdx < fActiveLimit) {
+                        // Check for CR/LF sequence. Consume both together when found.
+                        UChar c2;
+                        U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2);
+                        if (c2 != 0x0a) {
+                            U16_PREV(inputBuf, 0, fp->fInputIdx, c2);
+                        }
+                    }
+                } else {
+                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
+                }
+            }
+            break;
+
+
+        case URX_BACKSLASH_V:         // Any single code point line ending.
+            {
+                if (fp->fInputIdx >= fActiveLimit) {
+                    fHitEnd = TRUE;
+                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
+                    break;
+                }
+                UChar32 c;
+                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
+                UBool success = isLineTerminator(c);
+                success ^= (UBool)(opValue != 0);        // flip sense for \V
+                if (!success) {
+                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
+                }
+            }
+            break;
+
+
+
         case URX_BACKSLASH_X:
         //  Match a Grapheme, as defined by Unicode TR 29.
         //  Differs slightly from Perl, which consumes combining marks independently
@@ -4809,8 +4975,7 @@ GC_Done:
                 // There is input left.  Advance over one char, unless we've hit end-of-line
                 UChar32  c;
                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
-                if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
-                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
+                if (isLineTerminator(c)) {
                     // End of line in normal mode.   . does not match.
                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
                     break;
@@ -4955,6 +5120,12 @@ GC_Done:
                         }
                     }
                     fp = StateSave(fp, fp->fPatIdx, status);
+                } else {
+                    // Increment time-out counter. (StateSave() does it if count >= minCount)
+                    fTickCounter--;
+                    if (fTickCounter <= 0) {
+                        IncrementTime(status);    // Re-initializes fTickCounter
+                    }
                 }
                 fp->fPatIdx = opValue + 4;    // Loop back.
             }
@@ -5012,6 +5183,10 @@ GC_Done:
                     // We haven't met the minimum number of matches yet.
                     //   Loop back for another one.
                     fp->fPatIdx = opValue + 4;    // Loop back.
+                    fTickCounter--;
+                    if (fTickCounter <= 0) {
+                        IncrementTime(status);    // Re-initializes fTickCounter
+                    }
                 } else {
                     // We do have the minimum number of matches.
 
@@ -5081,6 +5256,12 @@ GC_Done:
                         break;
                     }
                 }
+                if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) &&
+                        inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) {
+                    // Capture group ended with an unpaired lead surrogate.
+                    // Back reference is not permitted to match only the lead of a surrogate pair.
+                    success = FALSE;
+                }
                 if (success) {
                     fp->fInputIdx = inputIndex;
                 } else {
@@ -5293,21 +5474,24 @@ GC_Done:
 
                 // Fetch (from data) the last input index where a match was attempted.
                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
-                int64_t  *lbStartIdx = &fData[opValue+2];
-                if (*lbStartIdx < 0) {
+                int64_t  &lbStartIdx = fData[opValue+2];
+                if (lbStartIdx < 0) {
                     // First time through loop.
-                    *lbStartIdx = fp->fInputIdx - minML;
+                    lbStartIdx = fp->fInputIdx - minML;
+                    if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
+                        U16_SET_CP_START(inputBuf, 0, lbStartIdx);
+                    }
                 } else {
                     // 2nd through nth time through the loop.
                     // Back up start position for match by one.
-                    if (*lbStartIdx == 0) {
-                        (*lbStartIdx)--;
+                    if (lbStartIdx == 0) {
+                        lbStartIdx--;
                     } else {
-                        U16_BACK_1(inputBuf, 0, *lbStartIdx);
+                        U16_BACK_1(inputBuf, 0, lbStartIdx);
                     }
                 }
 
-                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
+                if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
                     // We have tried all potential match starting points without
                     //  getting a match.  Backtrack out, and out of the
                     //   Look Behind altogether.
@@ -5322,7 +5506,7 @@ GC_Done:
                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
                 //      (successful match will fall off the end of the loop.)
                 fp = StateSave(fp, fp->fPatIdx-3, status);
-                fp->fInputIdx =  *lbStartIdx;
+                fp->fInputIdx =  lbStartIdx;
             }
             break;
 
@@ -5367,21 +5551,24 @@ GC_Done:
 
                 // Fetch (from data) the last input index where a match was attempted.
                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
-                int64_t  *lbStartIdx = &fData[opValue+2];
-                if (*lbStartIdx < 0) {
+                int64_t  &lbStartIdx = fData[opValue+2];
+                if (lbStartIdx < 0) {
                     // First time through loop.
-                    *lbStartIdx = fp->fInputIdx - minML;
+                    lbStartIdx = fp->fInputIdx - minML;
+                    if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
+                        U16_SET_CP_START(inputBuf, 0, lbStartIdx);
+                    }
                 } else {
                     // 2nd through nth time through the loop.
                     // Back up start position for match by one.
-                    if (*lbStartIdx == 0) {
-                        (*lbStartIdx)--;   // Because U16_BACK is unsafe starting at 0.
+                    if (lbStartIdx == 0) {
+                        lbStartIdx--;   // Because U16_BACK is unsafe starting at 0.
                     } else {
-                        U16_BACK_1(inputBuf, 0, *lbStartIdx);
+                        U16_BACK_1(inputBuf, 0, lbStartIdx);
                     }
                 }
 
-                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
+                if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
                     // We have tried all potential match starting points without
                     //  getting a match, which means that the negative lookbehind as
                     //  a whole has succeeded.  Jump forward to the continue location
@@ -5396,7 +5583,7 @@ GC_Done:
                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
                 //      (successful match will cause a FAIL out of the loop altogether.)
                 fp = StateSave(fp, fp->fPatIdx-4, status);
-                fp->fInputIdx =  *lbStartIdx;
+                fp->fInputIdx =  lbStartIdx;
             }
             break;
 
@@ -5524,7 +5711,7 @@ GC_Done:
                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
                                 (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
-                                   ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
+                                   isLineTerminator(c))) {
                                 //  char is a line ending.  Put the input pos back to the
                                 //    line ending char, and exit the scanning loop.
                                 U16_BACK_1(inputBuf, 0, ix);
@@ -5640,3 +5827,4 @@ UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
 U_NAMESPACE_END
 
 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
+