ICU-66108.tar.gz

[apple/icu.git] / icuSources / i18n / regexcmp.cpp
diff --git a/icuSources/i18n/regexcmp.cpp b/icuSources/i18n/regexcmp.cpp

index e518e84cd3520972ca2326f7fd5deb0405de9ac1..7274ca9a9255091b4644b6a56aa44588a16684ec 100644 (file)
--- a/icuSources/i18n/regexcmp.cpp
+++ b/icuSources/i18n/regexcmp.cpp
@@ -1,7 +1,9 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
  //
  //  file:  regexcmp.cpp
  //
-//  Copyright (C) 2002-2015 International Business Machines Corporation and others.
+//  Copyright (C) 2002-2016 International Business Machines Corporation and others.
  //  All Rights Reserved.
  //
  //  This file contains the ICU regular expression compiler, which is responsible
@@ -26,6 +28,7 @@
  #include "patternprops.h"
  #include "putilimp.h"
  #include "cmemory.h"
+#include "cstr.h"
  #include "cstring.h"
  #include "uvectr32.h"
  #include "uvectr64.h"
@@ -71,6 +74,7 @@ RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
      fMatchOpenParen   = -1;
      fMatchCloseParen  = -1;
      fCaptureName      = NULL;
+    fLastSetLiteral   = U_SENTINEL;
  
      if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
          status = rxp->fDeferredStatus;
@@ -486,6 +490,12 @@ UBool RegexCompile::doParseActions(int32_t action)
  
              // If this is a named capture group, add the name->group number mapping.
              if (fCaptureName != NULL) {
+                if (!fRXPat->initNamedCaptureMap()) {
+                    if (U_SUCCESS(*fStatus)) {
+                        error(fRXPat->fDeferredStatus);
+                    }
+                    break;
+                }
                  int32_t groupNumber = fRXPat->fGroupMap->size();
                  int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
                  fCaptureName = NULL;    // hash table takes ownership of the name (key) string.
@@ -557,7 +567,7 @@ UBool RegexCompile::doParseActions(int32_t action)
          //               sequence; don't change without making updates there too.
          //
          // Compiles to
-        //    1    START_LA     dataLoc     Saves SP, Input Pos
+        //    1    LA_START     dataLoc     Saves SP, Input Pos, Active input region.
          //    2.   STATE_SAVE   4            on failure of lookahead, goto 4
          //    3    JMP          6           continue ...
          //
@@ -571,10 +581,14 @@ UBool RegexCompile::doParseActions(int32_t action)
          //    8.     code for parenthesized stuff.
          //    9.   LA_END
          //
-        //  Two data slots are reserved, for saving the stack ptr and the input position.
+        //  Four data slots are reserved, for saving state on entry to the look-around
+        //    0:   stack pointer on entry.
+        //    1:   input position on entry.
+        //    2:   fActiveStart, the active bounds start on entry.
+        //    3:   fActiveLimit, the active bounds limit on entry.
          {
              fixLiterals();
-            int32_t dataLoc = allocateData(2);
+            int32_t dataLoc = allocateData(4);
              appendOp(URX_LA_START, dataLoc);
              appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
              appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
@@ -595,18 +609,23 @@ UBool RegexCompile::doParseActions(int32_t action)
      case doOpenLookAheadNeg:
          // Negated Lookahead.   (?! stuff )
          // Compiles to
-        //    1.    START_LA    dataloc
+        //    1.    LA_START    dataloc
          //    2.    SAVE_STATE  7         // Fail within look-ahead block restores to this state,
          //                                //   which continues with the match.
          //    3.    NOP                   // Std. Open Paren sequence, for possible '|'
          //    4.       code for parenthesized stuff.
-        //    5.    END_LA                // Cut back stack, remove saved state from step 2.
+        //    5.    LA_END                // Cut back stack, remove saved state from step 2.
          //    6.    BACKTRACK             // code in block succeeded, so neg. lookahead fails.
          //    7.    END_LA                // Restore match region, in case look-ahead was using
          //                                        an alternate (transparent) region.
+        //  Four data slots are reserved, for saving state on entry to the look-around
+        //    0:   stack pointer on entry.
+        //    1:   input position on entry.
+        //    2:   fActiveStart, the active bounds start on entry.
+        //    3:   fActiveLimit, the active bounds limit on entry.
          {
              fixLiterals();
-            int32_t dataLoc = allocateData(2);
+            int32_t dataLoc = allocateData(4);
              appendOp(URX_LA_START, dataLoc);
              appendOp(URX_STATE_SAVE, 0);    // dest address will be patched later.
              appendOp(URX_NOP, 0);
@@ -640,14 +659,16 @@ UBool RegexCompile::doParseActions(int32_t action)
              //          Allocate a block of matcher data, to contain (when running a match)
              //              0:    Stack ptr on entry
              //              1:    Input Index on entry
-            //              2:    Start index of match current match attempt.
-            //              3:    Original Input String len.
+            //              2:    fActiveStart, the active bounds start on entry.
+            //              3:    fActiveLimit, the active bounds limit on entry.
+            //              4:    Start index of the current match attempt.
+            //          The first four items must match the layout of data for LA_START / LA_END
  
              // Generate match code for any pending literals.
              fixLiterals();
  
              // Allocate data space
-            int32_t dataLoc = allocateData(4);
+            int32_t dataLoc = allocateData(5);
  
              // Emit URX_LB_START
              appendOp(URX_LB_START, dataLoc);
@@ -692,14 +713,16 @@ UBool RegexCompile::doParseActions(int32_t action)
              //          Allocate a block of matcher data, to contain (when running a match)
              //              0:    Stack ptr on entry
              //              1:    Input Index on entry
-            //              2:    Start index of match current match attempt.
-            //              3:    Original Input String len.
+            //              2:    fActiveStart, the active bounds start on entry.
+            //              3:    fActiveLimit, the active bounds limit on entry.
+            //              4:    Start index of the current match attempt.
+            //          The first four items must match the layout of data for LA_START / LA_END
  
              // Generate match code for any pending literals.
              fixLiterals();
  
              // Allocate data space
-            int32_t dataLoc = allocateData(4);
+            int32_t dataLoc = allocateData(5);
  
              // Emit URX_LB_START
              appendOp(URX_LB_START, dataLoc);
@@ -1328,7 +1351,8 @@ UBool RegexCompile::doParseActions(int32_t action)
  
      case doCompleteNamedBackRef:
          {
-        int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName);
+        int32_t groupNumber =
+            fRXPat->fNamedCaptureMap ? uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName) : 0;
          if (groupNumber == 0) {
              // Group name has not been defined.
              //   Could be a forward reference. If we choose to support them at some
@@ -1461,7 +1485,7 @@ UBool RegexCompile::doParseActions(int32_t action)
              case 0x78: /* 'x' */   bit = UREGEX_COMMENTS;         break;
              case 0x2d: /* '-' */   fSetModeFlag = FALSE;          break;
              default:
-                U_ASSERT(FALSE);   // Should never happen.  Other chars are filtered out
+                UPRV_UNREACHABLE;   // Should never happen.  Other chars are filtered out
                                     // by the scanner.
              }
              if (fSetModeFlag) {
@@ -1757,7 +1781,7 @@ UBool RegexCompile::doParseActions(int32_t action)
          //        and ICU UnicodeSet behavior.
          {
              UChar32  c = scanNamedChar();
-            if (U_SUCCESS(*fStatus) && fLastSetLiteral > c) {
+            if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > c)) {
                  error(U_REGEX_INVALID_RANGE);
              }
              UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
@@ -1826,7 +1850,8 @@ UBool RegexCompile::doParseActions(int32_t action)
          // Lower Limit > Upper limit being an error matches both Java
          //        and ICU UnicodeSet behavior.
          {
-        if (fLastSetLiteral > fC.fChar) {
+
+        if (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > fC.fChar) {
              error(U_REGEX_INVALID_RANGE);
          }
          UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
@@ -1835,9 +1860,7 @@ UBool RegexCompile::doParseActions(int32_t action)
          }
  
      default:
-        U_ASSERT(FALSE);
-        error(U_REGEX_INTERNAL_ERROR);
-        break;
+        UPRV_UNREACHABLE;
      }
  
      if (U_FAILURE(*fStatus)) {
@@ -1944,25 +1967,17 @@ int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
          return 0;
      }
      if (type < 0 || type > 255) {
-        U_ASSERT(FALSE);
-        error(U_REGEX_INTERNAL_ERROR);
-        type = URX_RESERVED_OP;
+        UPRV_UNREACHABLE;
      }
      if (val > 0x00ffffff) {
-        U_ASSERT(FALSE);
-        error(U_REGEX_INTERNAL_ERROR);
-        val = 0;
+        UPRV_UNREACHABLE;
      }
      if (val < 0) {
          if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) {
-            U_ASSERT(FALSE);
-            error(U_REGEX_INTERNAL_ERROR);
-            return -1;
+            UPRV_UNREACHABLE;
          }
          if (URX_TYPE(val) != 0xff) {
-            U_ASSERT(FALSE);
-            error(U_REGEX_INTERNAL_ERROR);
-            return -1;
+            UPRV_UNREACHABLE;
          }
          type = URX_RESERVED_OP_N;
      }
@@ -2290,6 +2305,13 @@ void  RegexCompile::handleCloseParen() {
                  error(U_REGEX_LOOK_BEHIND_LIMIT);
                  break;
              }
+            if (minML == INT32_MAX) {
+                // This condition happens when no match is possible, such as with a
+                // [set] expression containing no elements.
+                // In principle, the generated code to evaluate the expression could be deleted,
+                // but it's probably not worth the complication.
+                minML = 0;
+            }
              U_ASSERT(minML <= maxML);
  
              // Insert the min and max match len bounds into the URX_LB_CONT op that
@@ -2326,6 +2348,14 @@ void  RegexCompile::handleCloseParen() {
                  error(U_REGEX_LOOK_BEHIND_LIMIT);
                  break;
              }
+            if (minML == INT32_MAX) {
+                // This condition happens when no match is possible, such as with a
+                // [set] expression containing no elements.
+                // In principle, the generated code to evaluate the expression could be deleted,
+                // but it's probably not worth the complication.
+                minML = 0;
+            }
+
              U_ASSERT(minML <= maxML);
  
              // Insert the min and max match len bounds into the URX_LB_CONT op that
@@ -2343,7 +2373,7 @@ void  RegexCompile::handleCloseParen() {
  
  
      default:
-        U_ASSERT(FALSE);
+        UPRV_UNREACHABLE;
      }
  
      // remember the next location in the compiled pattern.
@@ -2601,7 +2631,10 @@ void  RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterCh
  
  // End of machine generated data.
  
-    if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
+    if (c < UCHAR_MIN_VALUE || c > UCHAR_MAX_VALUE) {
+        // This function should never be called with an invalid input character.
+        UPRV_UNREACHABLE;
+    } else if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
          UChar32 caseFoldedC  = u_foldCase(c, U_FOLD_CASE_DEFAULT);
          starterChars->set(caseFoldedC, caseFoldedC);
  
@@ -2629,6 +2662,16 @@ void  RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterCh
  }
  
  
+// Increment with overflow check: returns val + delta, saturating at INT32_MAX
+// rather than overflowing. val and delta will both be positive.
+
+static int32_t safeIncrement(int32_t val, int32_t delta) {
+    if (INT32_MAX - val > delta) {   // headroom exceeds delta, so the sum stays below INT32_MAX
+        return val + delta;
+    } else {
+        return INT32_MAX;            // the sum would reach or pass INT32_MAX; clamp to it
+    }
+}
  
  
  //------------------------------------------------------------------------------
@@ -2729,7 +2772,7 @@ void   RegexCompile::matchStartType() {
                  fRXPat->fInitialChars->add(URX_VAL(op));
                  numInitialStrings += 2;
              }
-            currentLen++;
+            currentLen = safeIncrement(currentLen, 1);
              atStart = FALSE;
              break;
  
@@ -2742,7 +2785,7 @@ void   RegexCompile::matchStartType() {
                  fRXPat->fInitialChars->addAll(*s);
                  numInitialStrings += 2;
              }
-            currentLen++;
+            currentLen = safeIncrement(currentLen, 1);
              atStart = FALSE;
              break;
  
@@ -2779,7 +2822,7 @@ void   RegexCompile::matchStartType() {
                  fRXPat->fInitialChars->addAll(*s);
                  numInitialStrings += 2;
              }
-            currentLen++;
+            currentLen = safeIncrement(currentLen, 1);
              atStart = FALSE;
              break;
  
@@ -2794,7 +2837,7 @@ void   RegexCompile::matchStartType() {
                  fRXPat->fInitialChars->addAll(sc);
                  numInitialStrings += 2;
              }
-            currentLen++;
+            currentLen = safeIncrement(currentLen, 1);
              atStart = FALSE;
              break;
  
@@ -2811,7 +2854,7 @@ void   RegexCompile::matchStartType() {
                   fRXPat->fInitialChars->addAll(s);
                   numInitialStrings += 2;
              }
-            currentLen++;
+            currentLen = safeIncrement(currentLen, 1);
              atStart = FALSE;
              break;
  
@@ -2828,7 +2871,7 @@ void   RegexCompile::matchStartType() {
                  fRXPat->fInitialChars->addAll(s);
                  numInitialStrings += 2;
              }
-            currentLen++;
+            currentLen = safeIncrement(currentLen, 1);
              atStart = FALSE;
              break;
  
@@ -2847,7 +2890,7 @@ void   RegexCompile::matchStartType() {
                  fRXPat->fInitialChars->addAll(s);
                  numInitialStrings += 2;
              }
-            currentLen++;
+            currentLen = safeIncrement(currentLen, 1);
              atStart = FALSE;
              break;
  
@@ -2871,7 +2914,7 @@ void   RegexCompile::matchStartType() {
                  }
                  numInitialStrings += 2;
              }
-            currentLen++;
+            currentLen = safeIncrement(currentLen, 1);
              atStart = FALSE;
              break;
  
@@ -2887,13 +2930,14 @@ void   RegexCompile::matchStartType() {
                  fRXPat->fInitialChars->complement();
                  numInitialStrings += 2;
              }
-            currentLen++;
+            currentLen = safeIncrement(currentLen, 1);
              atStart = FALSE;
              break;
  
  
          case URX_JMPX:
              loc++;             // Except for extra operand on URX_JMPX, same as URX_JMP.
+            U_FALLTHROUGH;
          case URX_JMP:
              {
                  int32_t  jmpDest = URX_VAL(op);
@@ -2966,7 +3010,7 @@ void   RegexCompile::matchStartType() {
                      fRXPat->fInitialStringLen = stringLen;
                  }
  
-                currentLen += stringLen;
+                currentLen = safeIncrement(currentLen, stringLen);
                  atStart = FALSE;
              }
              break;
@@ -2991,7 +3035,7 @@ void   RegexCompile::matchStartType() {
                      fRXPat->fInitialChars->addAll(s);
                      numInitialStrings += 2;  // Matching on an initial string not possible.
                  }
-                currentLen += stringLen;
+                currentLen = safeIncrement(currentLen, stringLen);
                  atStart = FALSE;
              }
              break;
@@ -3083,13 +3127,10 @@ void   RegexCompile::matchStartType() {
          case URX_LB_END:
          case URX_LBN_CONT:
          case URX_LBN_END:
-            U_ASSERT(FALSE);     // Shouldn't get here.  These ops should be
+            UPRV_UNREACHABLE;     // Shouldn't get here.  These ops should be
                                   //  consumed by the scan in URX_LA_START and LB_START
-
-            break;
-
          default:
-            U_ASSERT(FALSE);
+            UPRV_UNREACHABLE;
              }
  
          }
@@ -3249,13 +3290,14 @@ int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
          case URX_DOTANY_ALL:    // . matches one or two.
          case URX_DOTANY:
          case URX_DOTANY_UNIX:
-            currentLen++;
+            currentLen = safeIncrement(currentLen, 1);
              break;
  
  
          case URX_JMPX:
              loc++;              // URX_JMPX has an extra operand, ignored here,
                                  //   otherwise processed identically to URX_JMP.
+            U_FALLTHROUGH;
          case URX_JMP:
              {
                  int32_t  jmpDest = URX_VAL(op);
@@ -3300,7 +3342,7 @@ int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
              {
                  loc++;
                  int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
-                currentLen += URX_VAL(stringLenOp);
+                currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
              }
              break;
  
@@ -3313,7 +3355,7 @@ int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
                  //       Assume a min length of one for now.  A min length of zero causes
                  //        optimization failures for a pattern like "string"+
                  // currentLen += URX_VAL(stringLenOp);
-                currentLen += 1;
+                currentLen = safeIncrement(currentLen, 1);
              }
              break;
  
@@ -3359,7 +3401,7 @@ int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
                  //   it assumes that the look-ahead match might be zero-length.
                  //   TODO:  Positive lookahead could recursively do the block, then continue
                  //          with the longer of the block or the value coming in.  Ticket 6060
-                int32_t  depth = (opType == URX_LA_START? 2: 1);;
+                int32_t  depth = (opType == URX_LA_START? 2: 1);
                  for (;;) {
                      loc++;
                      op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
@@ -3408,7 +3450,7 @@ int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
              break;
  
          default:
-            U_ASSERT(FALSE);
+            UPRV_UNREACHABLE;
              }
  
          }
@@ -3423,18 +3465,6 @@ int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
      return currentLen;
  }
  
-// Increment with overflow check.
-// val and delta will both be positive.
-
-static int32_t safeIncrement(int32_t val, int32_t delta) {
-    if (INT32_MAX - val > delta) {
-        return val + delta;
-    } else {
-        return INT32_MAX;
-    }
-}
-
-
  //------------------------------------------------------------------------------
  //
  //   maxMatchLength    Calculate the length of the longest string that could
@@ -3453,7 +3483,6 @@ int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
      U_ASSERT(start <= end);
      U_ASSERT(end < fRXPat->fCompiledPat->size());
  
-
      int32_t    loc;
      int32_t    op;
      int32_t    opType;
@@ -3662,10 +3691,9 @@ int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
  
          case URX_CTR_LOOP:
          case URX_CTR_LOOP_NG:
-            // These opcodes will be skipped over by code for URX_CRT_INIT.
+            // These opcodes will be skipped over by code for URX_CTR_INIT.
              // We shouldn't encounter them here.
-            U_ASSERT(FALSE);
-            break;
+            UPRV_UNREACHABLE;
  
          case URX_LOOP_SR_I:
          case URX_LOOP_DOT_I:
@@ -3685,33 +3713,26 @@ int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
  
              // End of look-ahead ops should always be consumed by the processing at
              //  the URX_LA_START op.
-            // U_ASSERT(FALSE);
-            // break;
+            // UPRV_UNREACHABLE;
  
          case URX_LB_START:
              {
                  // Look-behind.  Scan forward until the matching look-around end,
                  //   without processing the look-behind block.
-                int32_t  depth = 0;
-                for (;;) {
-                    loc++;
+                int32_t dataLoc = URX_VAL(op);
+                for (loc = loc + 1; loc < end; ++loc) {
                      op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
-                    if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) {
-                        depth++;
-                    }
-                    if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
-                        if (depth == 0) {
-                            break;
-                        }
-                        depth--;
+                    int32_t opType = URX_TYPE(op);
+                    if ((opType == URX_LA_END || opType == URX_LBN_END) && (URX_VAL(op) == dataLoc)) {
+                        break;
                      }
-                    U_ASSERT(loc < end);
                  }
+                U_ASSERT(loc < end);
              }
              break;
  
          default:
-            U_ASSERT(FALSE);
+            UPRV_UNREACHABLE;
          }
  
  
@@ -3866,8 +3887,7 @@ void RegexCompile::stripNOPs() {
  
          default:
              // Some op is unaccounted for.
-            U_ASSERT(FALSE);
-            error(U_REGEX_INTERNAL_ERROR);
+            UPRV_UNREACHABLE;
          }
      }
  
@@ -3884,7 +3904,7 @@ void RegexCompile::stripNOPs() {
  //
  //------------------------------------------------------------------------------
  void RegexCompile::error(UErrorCode e) {
-    if (U_SUCCESS(*fStatus)) {
+    if (U_SUCCESS(*fStatus) || e == U_MEMORY_ALLOCATION_ERROR) {
          *fStatus = e;
          // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
          // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
@@ -4003,7 +4023,7 @@ UChar32  RegexCompile::peekCharLL() {
  //
  //------------------------------------------------------------------------------
  void RegexCompile::nextChar(RegexPatternChar &c) {
-
+  tailRecursion:
      fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
      c.fChar    = nextCharLL();
      c.fQuoted  = FALSE;
@@ -4014,7 +4034,9 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
              c.fChar == (UChar32)-1) {
              fQuoteMode = FALSE;  //  Exit quote mode,
              nextCharLL();        // discard the E
-            nextChar(c);         // recurse to get the real next char
+            // nextChar(c);      // recurse to get the real next char
+            goto tailRecursion;  // Note: fuzz testing produced testcases that
+                                 //       resulted in stack overflow here.
          }
      }
      else if (fInBackslashQuote) {
@@ -4132,8 +4154,10 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
              else if (peekCharLL() == chQ) {
                  //  "\Q"  enter quote mode, which will continue until "\E"
                  fQuoteMode = TRUE;
-                nextCharLL();       // discard the 'Q'.
-                nextChar(c);        // recurse to get the real next char.
+                nextCharLL();        // discard the 'Q'.
+                // nextChar(c);      // recurse to get the real next char.
+                goto tailRecursion;  // Note: fuzz testing produced test cases that
+                //                            resulted in stack overflow here.
              }
              else
              {
@@ -4362,211 +4386,209 @@ static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
  //     Includes trying the Java "properties" that aren't supported as
  //     normal ICU UnicodeSet properties
  //
-static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{"
-static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{"
  UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
-    UnicodeString   setExpr;
-    UnicodeSet      *set;
-    uint32_t        usetFlags = 0;
  
      if (U_FAILURE(*fStatus)) {
-        return NULL;
+        return nullptr;
      }
+    LocalPointer<UnicodeSet> set;
+    UErrorCode status = U_ZERO_ERROR;
  
-    //
-    //  First try the property as we received it
-    //
-    if (negated) {
-        setExpr.append(negSetPrefix, -1);
-    } else {
-        setExpr.append(posSetPrefix, -1);
-    }
-    setExpr.append(propName);
-    setExpr.append(chRBrace);
-    setExpr.append(chRBracket);
-    if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
-        usetFlags |= USET_CASE_INSENSITIVE;
-    }
-    set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
-    if (U_SUCCESS(*fStatus)) {
-       return set;
-    }
-    delete set;
-    set = NULL;
-
-    //
-    //  The property as it was didn't work.
-
-    //  Do [:word:]. It is not recognized as a property by UnicodeSet.  "word" not standard POSIX
-    //     or standard Java, but many other regular expression packages do recognize it.
-
-    if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) {
-        *fStatus = U_ZERO_ERROR;
-        set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET]));
-        if (set == NULL) {
-            *fStatus = U_MEMORY_ALLOCATION_ERROR;
-            return set;
+    do {      // non-loop, exists to allow breaks from the block.
+        //
+        //  First try the property as we received it
+        //
+        UnicodeString   setExpr;
+        uint32_t        usetFlags = 0;
+        setExpr.append(u"[\\p{", -1);
+        setExpr.append(propName);
+        setExpr.append(u"}]", -1);
+        if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
+            usetFlags |= USET_CASE_INSENSITIVE;
          }
-        if (negated) {
-            set->complement();
+        set.adoptInsteadAndCheckErrorCode(new UnicodeSet(setExpr, usetFlags, NULL, status), status);
+        if (U_SUCCESS(status) || status == U_MEMORY_ALLOCATION_ERROR) {
+            break;
          }
-        return set;
-    }
  
+        //
+        //  The incoming property wasn't directly recognized by ICU.
  
-    //    Do Java fixes -
-    //       InGreek -> InGreek or Coptic, that being the official Unicode name for that block.
-    //       InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols.
-    //
-    //       Note on Spaces:  either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols"
-    //                        is accepted by Java.  The property part of the name is compared
-    //                        case-insenstively.  The spaces must be exactly as shown, either
-    //                        all there, or all omitted, with exactly one at each position
-    //                        if they are present.  From checking against JDK 1.6
-    //
-    //       This code should be removed when ICU properties support the Java  compatibility names
-    //          (ICU 4.0?)
-    //
-    UnicodeString mPropName = propName;
-    if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) {
-        mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic");
-    }
-    if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbols"), 0) == 0 ||
-        mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"), 0) == 0) {
-        mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Symbols");
-    }
-    else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
-        mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint");
-    }
+        //  Check [:word:] and [:all:]. These are not recognized as properties by ICU UnicodeSet.
+        //     Java accepts 'word' with mixed case.
+        //     Java accepts 'all' only in all lower case.
  
-    //    See if the property looks like a Java "InBlockName", which
-    //    we will recast as "Block=BlockName"
-    //
-    static const UChar IN[] = {0x49, 0x6E, 0};  // "In"
-    static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00};  // "Block="
-    if (mPropName.startsWith(IN, 2) && propName.length()>=3) {
-        setExpr.truncate(4);   // Leaves "[\p{", or "[\P{"
-        setExpr.append(BLOCK, -1);
-        setExpr.append(UnicodeString(mPropName, 2));  // Property with the leading "In" removed.
-        setExpr.append(chRBrace);
-        setExpr.append(chRBracket);
-        *fStatus = U_ZERO_ERROR;
-        set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
-        if (U_SUCCESS(*fStatus)) {
-            return set;
+        status = U_ZERO_ERROR;
+        if (propName.caseCompare(u"word", -1, 0) == 0) {
+            set.adoptInsteadAndCheckErrorCode(new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET])), status);
+            break;
+        }
+        if (propName.compare(u"all", -1) == 0) {
+            set.adoptInsteadAndCheckErrorCode(new UnicodeSet(0, 0x10ffff), status);
+            break;
          }
-        delete set;
-        set = NULL;
-    }
  
-    if (propName.startsWith(UNICODE_STRING_SIMPLE("java")) ||
-        propName.compare(UNICODE_STRING_SIMPLE("all")) == 0)
-    {
-        UErrorCode localStatus = U_ZERO_ERROR;
-        //setExpr.remove();
-        set = new UnicodeSet();
-        //
-        //  Try the various Java specific properties.
-        //   These all begin with "java"
+
+        //    Do Java InBlock expressions
          //
-        if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDefined")) == 0) {
-            addCategory(set, U_GC_CN_MASK, localStatus);
-            set->complement();
-        }
-        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDigit")) == 0) {
-            addCategory(set, U_GC_ND_MASK, localStatus);
-        }
-        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaIdentifierIgnorable")) == 0) {
-            addIdentifierIgnorable(set, localStatus);
-        }
-        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaISOControl")) == 0) {
-            set->add(0, 0x1F).add(0x7F, 0x9F);
-        }
-        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierPart")) == 0) {
-            addCategory(set, U_GC_L_MASK, localStatus);
-            addCategory(set, U_GC_SC_MASK, localStatus);
-            addCategory(set, U_GC_PC_MASK, localStatus);
-            addCategory(set, U_GC_ND_MASK, localStatus);
-            addCategory(set, U_GC_NL_MASK, localStatus);
-            addCategory(set, U_GC_MC_MASK, localStatus);
-            addCategory(set, U_GC_MN_MASK, localStatus);
-            addIdentifierIgnorable(set, localStatus);
-        }
-        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierStart")) == 0) {
-            addCategory(set, U_GC_L_MASK, localStatus);
-            addCategory(set, U_GC_NL_MASK, localStatus);
-            addCategory(set, U_GC_SC_MASK, localStatus);
-            addCategory(set, U_GC_PC_MASK, localStatus);
-        }
-        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetter")) == 0) {
-            addCategory(set, U_GC_L_MASK, localStatus);
-        }
-        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetterOrDigit")) == 0) {
-            addCategory(set, U_GC_L_MASK, localStatus);
-            addCategory(set, U_GC_ND_MASK, localStatus);
-        }
-        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLowerCase")) == 0) {
-            addCategory(set, U_GC_LL_MASK, localStatus);
-        }
-        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaMirrored")) == 0) {
-            set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, localStatus);
-        }
-        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSpaceChar")) == 0) {
-            addCategory(set, U_GC_Z_MASK, localStatus);
-        }
-        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSupplementaryCodePoint")) == 0) {
-            set->add(0x10000, UnicodeSet::MAX_VALUE);
-        }
-        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaTitleCase")) == 0) {
-            addCategory(set, U_GC_LT_MASK, localStatus);
-        }
-        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierStart")) == 0) {
-            addCategory(set, U_GC_L_MASK, localStatus);
-            addCategory(set, U_GC_NL_MASK, localStatus);
-        }
-        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierPart")) == 0) {
-            addCategory(set, U_GC_L_MASK, localStatus);
-            addCategory(set, U_GC_PC_MASK, localStatus);
-            addCategory(set, U_GC_ND_MASK, localStatus);
-            addCategory(set, U_GC_NL_MASK, localStatus);
-            addCategory(set, U_GC_MC_MASK, localStatus);
-            addCategory(set, U_GC_MN_MASK, localStatus);
-            addIdentifierIgnorable(set, localStatus);
-        }
-        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUpperCase")) == 0) {
-            addCategory(set, U_GC_LU_MASK, localStatus);
-        }
-        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaValidCodePoint")) == 0) {
-            set->add(0, UnicodeSet::MAX_VALUE);
-        }
-        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaWhitespace")) == 0) {
-            addCategory(set, U_GC_Z_MASK, localStatus);
-            set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
-            set->add(9, 0x0d).add(0x1c, 0x1f);
-        }
-        else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
-            set->add(0, UnicodeSet::MAX_VALUE);
+        UnicodeString mPropName = propName;
+        if (mPropName.startsWith(u"In", 2) && mPropName.length() >= 3) {
+            status = U_ZERO_ERROR;
+            set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
+            if (U_FAILURE(status)) {
+                break;
+            }
+            UnicodeString blockName(mPropName, 2);  // Property with the leading "In" removed.
+            set->applyPropertyAlias(UnicodeString(u"Block"), blockName, status);
+            break;
          }
  
-        if (U_SUCCESS(localStatus) && !set->isEmpty()) {
-            *fStatus = U_ZERO_ERROR;
-            if (usetFlags & USET_CASE_INSENSITIVE) {
+        //  Check for the Java form "IsBooleanPropertyValue", which we will recast
+        //  as "BooleanPropertyValue". The property value can be either
+        //  a General Category or a Script Name.
+
+        if (propName.startsWith(u"Is", 2) && propName.length()>=3) {
+            mPropName.remove(0, 2);      // Strip the "Is"
+            if (mPropName.indexOf(u'=') >= 0) {
+                // Reject any "Is..." property expression containing an '=', that is,
+                // any non-binary property expression.
+                status = U_REGEX_PROPERTY_SYNTAX;
+                break;
+            }
+
+            if (mPropName.caseCompare(u"assigned", -1, 0) == 0) {
+                mPropName.setTo(u"unassigned", -1);
+                negated = !negated;
+            } else if (mPropName.caseCompare(u"TitleCase", -1, 0) == 0) {
+                mPropName.setTo(u"Titlecase_Letter", -1);
+            }
+
+            mPropName.insert(0, u"[\\p{", -1);
+            mPropName.append(u"}]", -1);
+            set.adoptInsteadAndCheckErrorCode(new UnicodeSet(mPropName, *fStatus), status);
+
+            if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
                  set->closeOver(USET_CASE_INSENSITIVE);
              }
-            if (negated) {
+            break;
+
+        }
+
+        if (propName.startsWith(u"java", -1)) {
+            status = U_ZERO_ERROR;
+            set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
+            if (U_FAILURE(status)) {
+                break;
+            }
+            //
+            //  Try the various Java specific properties.
+            //   These all begin with "java"
+            //
+            if (propName.compare(u"javaDefined", -1) == 0) {
+                addCategory(set.getAlias(), U_GC_CN_MASK, status);
                  set->complement();
              }
-            return set;
+            else if (propName.compare(u"javaDigit", -1) == 0) {
+                addCategory(set.getAlias(), U_GC_ND_MASK, status);
+            }
+            else if (propName.compare(u"javaIdentifierIgnorable", -1) == 0) {
+                addIdentifierIgnorable(set.getAlias(), status);
+            }
+            else if (propName.compare(u"javaISOControl", -1) == 0) {
+                set->add(0, 0x1F).add(0x7F, 0x9F);
+            }
+            else if (propName.compare(u"javaJavaIdentifierPart", -1) == 0) {
+                addCategory(set.getAlias(), U_GC_L_MASK, status);
+                addCategory(set.getAlias(), U_GC_SC_MASK, status);
+                addCategory(set.getAlias(), U_GC_PC_MASK, status);
+                addCategory(set.getAlias(), U_GC_ND_MASK, status);
+                addCategory(set.getAlias(), U_GC_NL_MASK, status);
+                addCategory(set.getAlias(), U_GC_MC_MASK, status);
+                addCategory(set.getAlias(), U_GC_MN_MASK, status);
+                addIdentifierIgnorable(set.getAlias(), status);
+            }
+            else if (propName.compare(u"javaJavaIdentifierStart", -1) == 0) {
+                addCategory(set.getAlias(), U_GC_L_MASK, status);
+                addCategory(set.getAlias(), U_GC_NL_MASK, status);
+                addCategory(set.getAlias(), U_GC_SC_MASK, status);
+                addCategory(set.getAlias(), U_GC_PC_MASK, status);
+            }
+            else if (propName.compare(u"javaLetter", -1) == 0) {
+                addCategory(set.getAlias(), U_GC_L_MASK, status);
+            }
+            else if (propName.compare(u"javaLetterOrDigit", -1) == 0) {
+                addCategory(set.getAlias(), U_GC_L_MASK, status);
+                addCategory(set.getAlias(), U_GC_ND_MASK, status);
+            }
+            else if (propName.compare(u"javaLowerCase", -1) == 0) {
+                addCategory(set.getAlias(), U_GC_LL_MASK, status);
+            }
+            else if (propName.compare(u"javaMirrored", -1) == 0) {
+                set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, status);
+            }
+            else if (propName.compare(u"javaSpaceChar", -1) == 0) {
+                addCategory(set.getAlias(), U_GC_Z_MASK, status);
+            }
+            else if (propName.compare(u"javaSupplementaryCodePoint", -1) == 0) {
+                set->add(0x10000, UnicodeSet::MAX_VALUE);
+            }
+            else if (propName.compare(u"javaTitleCase", -1) == 0) {
+                addCategory(set.getAlias(), U_GC_LT_MASK, status);
+            }
+            else if (propName.compare(u"javaUnicodeIdentifierStart", -1) == 0) {
+                addCategory(set.getAlias(), U_GC_L_MASK, status);
+                addCategory(set.getAlias(), U_GC_NL_MASK, status);
+            }
+            else if (propName.compare(u"javaUnicodeIdentifierPart", -1) == 0) {
+                addCategory(set.getAlias(), U_GC_L_MASK, status);
+                addCategory(set.getAlias(), U_GC_PC_MASK, status);
+                addCategory(set.getAlias(), U_GC_ND_MASK, status);
+                addCategory(set.getAlias(), U_GC_NL_MASK, status);
+                addCategory(set.getAlias(), U_GC_MC_MASK, status);
+                addCategory(set.getAlias(), U_GC_MN_MASK, status);
+                addIdentifierIgnorable(set.getAlias(), status);
+            }
+            else if (propName.compare(u"javaUpperCase", -1) == 0) {
+                addCategory(set.getAlias(), U_GC_LU_MASK, status);
+            }
+            else if (propName.compare(u"javaValidCodePoint", -1) == 0) {
+                set->add(0, UnicodeSet::MAX_VALUE);
+            }
+            else if (propName.compare(u"javaWhitespace", -1) == 0) {
+                addCategory(set.getAlias(), U_GC_Z_MASK, status);
+                set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
+                set->add(9, 0x0d).add(0x1c, 0x1f);
+            } else {
+                status = U_REGEX_PROPERTY_SYNTAX;
+            }
+
+            if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
+                set->closeOver(USET_CASE_INSENSITIVE);
+            }
+            break;
+        }
+
+        // Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility
+        // extensions matched it.
+        status = U_REGEX_PROPERTY_SYNTAX;
+    } while (false);   // End of do loop block. Code above breaks out of the block on success or hard failure.
+
+    if (U_SUCCESS(status)) {
+        U_ASSERT(set.isValid());
+        if (negated) {
+            set->complement();
+        }
+        return set.orphan();
+    } else {
+        if (status == U_ILLEGAL_ARGUMENT_ERROR) {
+            status = U_REGEX_PROPERTY_SYNTAX;
          }
-        delete set;
-        set = NULL;
+        error(status);
+        return nullptr;
      }
-    error(*fStatus);
-    return NULL;
  }
  
  
-
  //
  //  SetEval   Part of the evaluation of [set expressions].
  //            Perform any pending (stacked) operations with precedence
@@ -4615,8 +4637,7 @@ void RegexCompile::setEval(int32_t nextOp) {
                  delete rightOperand;
                  break;
              default:
-                U_ASSERT(FALSE);
-                break;
+                UPRV_UNREACHABLE;
              }
          }
      }