icuSources/i18n/regexcmp.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 //
   4 //  file:  regexcmp.cpp
   5 //
   6 //  Copyright (C) 2002-2016 International Business Machines Corporation and others.
   7 //  All Rights Reserved.
   8 //
   9 //  This file contains the ICU regular expression compiler, which is responsible
  10 //  for processing a regular expression pattern into the compiled form that
  11 //  is used by the match finding engine.
  12 //
  13
  14 #include "unicode/utypes.h"
  15
  16 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  17
  18 #include "unicode/ustring.h"
  19 #include "unicode/unistr.h"
  20 #include "unicode/uniset.h"
  21 #include "unicode/uchar.h"
  22 #include "unicode/uchriter.h"
  23 #include "unicode/parsepos.h"
  24 #include "unicode/parseerr.h"
  25 #include "unicode/regex.h"
  26 #include "unicode/utf.h"
  27 #include "unicode/utf16.h"
  28 #include "patternprops.h"
  29 #include "putilimp.h"
  30 #include "cmemory.h"
  31 #include "cstring.h"
  32 #include "uvectr32.h"
  33 #include "uvectr64.h"
  34 #include "uassert.h"
  35 #include "uinvchar.h"
  36
  37 #include "regeximp.h"
  38 #include "regexcst.h"   // Contains state table for the regex pattern parser.
  39                         //   generated by a Perl script.
  40 #include "regexcmp.h"
  41 #include "regexst.h"
  42 #include "regextxt.h"
  43
  44
  45
  46 U_NAMESPACE_BEGIN
  47
  48
  49 //------------------------------------------------------------------------------
  50 //
  51 //  Constructor.
  52 //
  53 //------------------------------------------------------------------------------
  54 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
  55    fParenStack(status), fSetStack(status), fSetOpStack(status)
  56 {
  57     // Lazy init of all shared global sets (needed for init()'s empty text)
  58     RegexStaticSets::initGlobals(&status);
  59
  60     fStatus           = &status;
  61
  62     fRXPat            = rxp;
  63     fScanIndex        = 0;
  64     fLastChar         = -1;
  65     fPeekChar         = -1;
  66     fLineNum          = 1;
  67     fCharNum          = 0;
  68     fQuoteMode        = FALSE;
  69     fInBackslashQuote = FALSE;
  70     fModeFlags        = fRXPat->fFlags | 0x80000000;
  71     fEOLComments      = TRUE;
  72
  73     fMatchOpenParen   = -1;
  74     fMatchCloseParen  = -1;
  75     fCaptureName      = NULL;
  76     fLastSetLiteral   = U_SENTINEL;
  77
  78     if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
  79         status = rxp->fDeferredStatus;
  80     }
  81 }
  82
  83 static const UChar      chAmp       = 0x26;      // '&'
  84 static const UChar      chDash      = 0x2d;      // '-'
  85
  86
  87 //------------------------------------------------------------------------------
  88 //
  89 //  Destructor
  90 //
  91 //------------------------------------------------------------------------------
  92 RegexCompile::~RegexCompile() {
  93     delete fCaptureName;         // Normally will be NULL, but can exist if pattern
  94                                  //   compilation stops with a syntax error.
  95 }
  96
  97 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
  98     set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
  99 }
 100
 101 //------------------------------------------------------------------------------
 102 //
 103 //  Compile regex pattern.   The state machine for rexexp pattern parsing is here.
 104 //                           The state tables are hand-written in the file regexcst.txt,
 105 //                           and converted to the form used here by a perl
 106 //                           script regexcst.pl
 107 //
 108 //------------------------------------------------------------------------------
 109 void    RegexCompile::compile(
 110                          const UnicodeString &pat,   // Source pat to be compiled.
 111                          UParseError &pp,            // Error position info
 112                          UErrorCode &e)              // Error Code
 113 {
 114     fRXPat->fPatternString = new UnicodeString(pat);
 115     UText patternText = UTEXT_INITIALIZER;
 116     utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
 117
 118     if (U_SUCCESS(e)) {
 119         compile(&patternText, pp, e);
 120         utext_close(&patternText);
 121     }
 122 }
 123
 124 //
 125 //   compile, UText mode
 126 //     All the work is actually done here.
 127 //
 128 void    RegexCompile::compile(
 129                          UText *pat,                 // Source pat to be compiled.
 130                          UParseError &pp,            // Error position info
 131                          UErrorCode &e)              // Error Code
 132 {
 133     fStatus             = &e;
 134     fParseErr           = &pp;
 135     fStackPtr           = 0;
 136     fStack[fStackPtr]   = 0;
 137
 138     if (U_FAILURE(*fStatus)) {
 139         return;
 140     }
 141
 142     // There should be no pattern stuff in the RegexPattern object.  They can not be reused.
 143     U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0);
 144
 145     // Prepare the RegexPattern object to receive the compiled pattern.
 146     fRXPat->fPattern        = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
 147     if (U_FAILURE(*fStatus)) {
 148         return;
 149     }
 150     fRXPat->fStaticSets     = RegexStaticSets::gStaticSets->fPropSets;
 151     fRXPat->fStaticSets8    = RegexStaticSets::gStaticSets->fPropSets8;
 152
 153
 154     // Initialize the pattern scanning state machine
 155     fPatternLength = utext_nativeLength(pat);
 156     uint16_t                state = 1;
 157     const RegexTableEl      *tableEl;
 158
 159     // UREGEX_LITERAL force entire pattern to be treated as a literal string.
 160     if (fModeFlags & UREGEX_LITERAL) {
 161         fQuoteMode = TRUE;
 162     }
 163
 164     nextChar(fC);                        // Fetch the first char from the pattern string.
 165
 166     //
 167     // Main loop for the regex pattern parsing state machine.
 168     //   Runs once per state transition.
 169     //   Each time through optionally performs, depending on the state table,
 170     //      - an advance to the the next pattern char
 171     //      - an action to be performed.
 172     //      - pushing or popping a state to/from the local state return stack.
 173     //   file regexcst.txt is the source for the state table.  The logic behind
 174     //     recongizing the pattern syntax is there, not here.
 175     //
 176     for (;;) {
 177         //  Bail out if anything has gone wrong.
 178         //  Regex pattern parsing stops on the first error encountered.
 179         if (U_FAILURE(*fStatus)) {
 180             break;
 181         }
 182
 183         U_ASSERT(state != 0);
 184
 185         // Find the state table element that matches the input char from the pattern, or the
 186         //    class of the input character.  Start with the first table row for this
 187         //    state, then linearly scan forward until we find a row that matches the
 188         //    character.  The last row for each state always matches all characters, so
 189         //    the search will stop there, if not before.
 190         //
 191         tableEl = &gRuleParseStateTable[state];
 192         REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d)    state=%s ",
 193             fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
 194
 195         for (;;) {    // loop through table rows belonging to this state, looking for one
 196                       //   that matches the current input char.
 197             REGEX_SCAN_DEBUG_PRINTF(("."));
 198             if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE &&   tableEl->fCharClass == fC.fChar) {
 199                 // Table row specified an individual character, not a set, and
 200                 //   the input character is not quoted, and
 201                 //   the input character matched it.
 202                 break;
 203             }
 204             if (tableEl->fCharClass == 255) {
 205                 // Table row specified default, match anything character class.
 206                 break;
 207             }
 208             if (tableEl->fCharClass == 254 && fC.fQuoted)  {
 209                 // Table row specified "quoted" and the char was quoted.
 210                 break;
 211             }
 212             if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1)  {
 213                 // Table row specified eof and we hit eof on the input.
 214                 break;
 215             }
 216
 217             if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
 218                 fC.fQuoted == FALSE &&                                       //   char is not escaped &&
 219                 fC.fChar != (UChar32)-1) {                                   //   char is not EOF
 220                 U_ASSERT(tableEl->fCharClass <= 137);
 221                 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
 222                     // Table row specified a character class, or set of characters,
 223                     //   and the current char matches it.
 224                     break;
 225                 }
 226             }
 227
 228             // No match on this row, advance to the next  row for this state,
 229             tableEl++;
 230         }
 231         REGEX_SCAN_DEBUG_PRINTF(("\n"));
 232
 233         //
 234         // We've found the row of the state table that matches the current input
 235         //   character from the rules string.
 236         // Perform any action specified  by this row in the state table.
 237         if (doParseActions(tableEl->fAction) == FALSE) {
 238             // Break out of the state machine loop if the
 239             //   the action signalled some kind of error, or
 240             //   the action was to exit, occurs on normal end-of-rules-input.
 241             break;
 242         }
 243
 244         if (tableEl->fPushState != 0) {
 245             fStackPtr++;
 246             if (fStackPtr >= kStackSize) {
 247                 error(U_REGEX_INTERNAL_ERROR);
 248                 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
 249                 fStackPtr--;
 250             }
 251             fStack[fStackPtr] = tableEl->fPushState;
 252         }
 253
 254         //
 255         //  NextChar.  This is where characters are actually fetched from the pattern.
 256         //             Happens under control of the 'n' tag in the state table.
 257         //
 258         if (tableEl->fNextChar) {
 259             nextChar(fC);
 260         }
 261
 262         // Get the next state from the table entry, or from the
 263         //   state stack if the next state was specified as "pop".
 264         if (tableEl->fNextState != 255) {
 265             state = tableEl->fNextState;
 266         } else {
 267             state = fStack[fStackPtr];
 268             fStackPtr--;
 269             if (fStackPtr < 0) {
 270                 // state stack underflow
 271                 // This will occur if the user pattern has mis-matched parentheses,
 272                 //   with extra close parens.
 273                 //
 274                 fStackPtr++;
 275                 error(U_REGEX_MISMATCHED_PAREN);
 276             }
 277         }
 278
 279     }
 280
 281     if (U_FAILURE(*fStatus)) {
 282         // Bail out if the pattern had errors.
 283         //   Set stack cleanup:  a successful compile would have left it empty,
 284         //   but errors can leave temporary sets hanging around.
 285         while (!fSetStack.empty()) {
 286             delete (UnicodeSet *)fSetStack.pop();
 287         }
 288         return;
 289     }
 290
 291     //
 292     // The pattern has now been read and processed, and the compiled code generated.
 293     //
 294
 295     //
 296     // The pattern's fFrameSize so far has accumulated the requirements for
 297     //   storage for capture parentheses, counters, etc. that are encountered
 298     //   in the pattern.  Add space for the two variables that are always
 299     //   present in the saved state:  the input string position (int64_t) and
 300     //   the position in the compiled pattern.
 301     //
 302     allocateStackData(RESTACKFRAME_HDRCOUNT);
 303
 304     //
 305     // Optimization pass 1: NOPs, back-references, and case-folding
 306     //
 307     stripNOPs();
 308
 309     //
 310     // Get bounds for the minimum and maximum length of a string that this
 311     //   pattern can match.  Used to avoid looking for matches in strings that
 312     //   are too short.
 313     //
 314     fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
 315
 316     //
 317     // Optimization pass 2: match start type
 318     //
 319     matchStartType();
 320
 321     //
 322     // Set up fast latin-1 range sets
 323     //
 324     int32_t numSets = fRXPat->fSets->size();
 325     fRXPat->fSets8 = new Regex8BitSet[numSets];
 326     // Null pointer check.
 327     if (fRXPat->fSets8 == NULL) {
 328         e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
 329         return;
 330     }
 331     int32_t i;
 332     for (i=0; i<numSets; i++) {
 333         UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
 334         fRXPat->fSets8[i].init(s);
 335     }
 336
 337 }
 338
 339
 340
 341
 342
 343 //------------------------------------------------------------------------------
 344 //
 345 //  doParseAction        Do some action during regex pattern parsing.
 346 //                       Called by the parse state machine.
 347 //
 348 //                       Generation of the match engine PCode happens here, or
 349 //                       in functions called from the parse actions defined here.
 350 //
 351 //
 352 //------------------------------------------------------------------------------
 353 UBool RegexCompile::doParseActions(int32_t action)
 354 {
 355     UBool   returnVal = TRUE;
 356
 357     switch ((Regex_PatternParseAction)action) {
 358
 359     case doPatStart:
 360         // Start of pattern compiles to:
 361         //0   SAVE   2        Fall back to position of FAIL
 362         //1   jmp    3
 363         //2   FAIL            Stop if we ever reach here.
 364         //3   NOP             Dummy, so start of pattern looks the same as
 365         //                    the start of an ( grouping.
 366         //4   NOP             Resreved, will be replaced by a save if there are
 367         //                    OR | operators at the top level
 368         appendOp(URX_STATE_SAVE, 2);
 369         appendOp(URX_JMP,  3);
 370         appendOp(URX_FAIL, 0);
 371
 372         // Standard open nonCapture paren action emits the two NOPs and
 373         //   sets up the paren stack frame.
 374         doParseActions(doOpenNonCaptureParen);
 375         break;
 376
 377     case doPatFinish:
 378         // We've scanned to the end of the pattern
 379         //  The end of pattern compiles to:
 380         //        URX_END
 381         //    which will stop the runtime match engine.
 382         //  Encountering end of pattern also behaves like a close paren,
 383         //   and forces fixups of the State Save at the beginning of the compiled pattern
 384         //   and of any OR operations at the top level.
 385         //
 386         handleCloseParen();
 387         if (fParenStack.size() > 0) {
 388             // Missing close paren in pattern.
 389             error(U_REGEX_MISMATCHED_PAREN);
 390         }
 391
 392         // add the END operation to the compiled pattern.
 393         appendOp(URX_END, 0);
 394
 395         // Terminate the pattern compilation state machine.
 396         returnVal = FALSE;
 397         break;
 398
 399
 400
 401     case doOrOperator:
 402         // Scanning a '|', as in (A|B)
 403         {
 404             // Generate code for any pending literals preceding the '|'
 405             fixLiterals(FALSE);
 406
 407             // Insert a SAVE operation at the start of the pattern section preceding
 408             //   this OR at this level.  This SAVE will branch the match forward
 409             //   to the right hand side of the OR in the event that the left hand
 410             //   side fails to match and backtracks.  Locate the position for the
 411             //   save from the location on the top of the parentheses stack.
 412             int32_t savePosition = fParenStack.popi();
 413             int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
 414             U_ASSERT(URX_TYPE(op) == URX_NOP);  // original contents of reserved location
 415             op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
 416             fRXPat->fCompiledPat->setElementAt(op, savePosition);
 417
 418             // Append an JMP operation into the compiled pattern.  The operand for
 419             //  the JMP will eventually be the location following the ')' for the
 420             //  group.  This will be patched in later, when the ')' is encountered.
 421             appendOp(URX_JMP, 0);
 422
 423             // Push the position of the newly added JMP op onto the parentheses stack.
 424             // This registers if for fixup when this block's close paren is encountered.
 425             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
 426
 427             // Append a NOP to the compiled pattern.  This is the slot reserved
 428             //   for a SAVE in the event that there is yet another '|' following
 429             //   this one.
 430             appendOp(URX_NOP, 0);
 431             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
 432         }
 433         break;
 434
 435
 436     case doBeginNamedCapture:
 437         // Scanning (?<letter.
 438         //   The first letter of the name will come through again under doConinueNamedCapture.
 439         fCaptureName = new UnicodeString();
 440         if (fCaptureName == NULL) {
 441             error(U_MEMORY_ALLOCATION_ERROR);
 442         }
 443         break;
 444
 445     case  doContinueNamedCapture:
 446         fCaptureName->append(fC.fChar);
 447         break;
 448
 449     case doBadNamedCapture:
 450         error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
 451         break;
 452
 453     case doOpenCaptureParen:
 454         // Open Capturing Paren, possibly named.
 455         //   Compile to a
 456         //      - NOP, which later may be replaced by a save-state if the
 457         //         parenthesized group gets a * quantifier, followed by
 458         //      - START_CAPTURE  n    where n is stack frame offset to the capture group variables.
 459         //      - NOP, which may later be replaced by a save-state if there
 460         //             is an '|' alternation within the parens.
 461         //
 462         //    Each capture group gets three slots in the save stack frame:
 463         //         0: Capture Group start position (in input string being matched.)
 464         //         1: Capture Group end position.
 465         //         2: Start of Match-in-progress.
 466         //    The first two locations are for a completed capture group, and are
 467         //     referred to by back references and the like.
 468         //    The third location stores the capture start position when an START_CAPTURE is
 469         //      encountered.  This will be promoted to a completed capture when (and if) the corresponding
 470         //      END_CAPTURE is encountered.
 471         {
 472             fixLiterals();
 473             appendOp(URX_NOP, 0);
 474             int32_t  varsLoc = allocateStackData(3);    // Reserve three slots in match stack frame.
 475             appendOp(URX_START_CAPTURE, varsLoc);
 476             appendOp(URX_NOP, 0);
 477
 478             // On the Parentheses stack, start a new frame and add the postions
 479             //   of the two NOPs.  Depending on what follows in the pattern, the
 480             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
 481             //   address of the end of the parenthesized group.
 482             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
 483             fParenStack.push(capturing, *fStatus);                        // Frame type.
 484             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first  NOP location
 485             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
 486
 487             // Save the mapping from group number to stack frame variable position.
 488             fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
 489
 490             // If this is a named capture group, add the name->group number mapping.
 491             if (fCaptureName != NULL) {
 492                 int32_t groupNumber = fRXPat->fGroupMap->size();
 493                 int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
 494                 fCaptureName = NULL;    // hash table takes ownership of the name (key) string.
 495                 if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
 496                     error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
 497                 }
 498             }
 499         }
 500         break;
 501
 502     case doOpenNonCaptureParen:
 503         // Open non-caputuring (grouping only) Paren.
 504         //   Compile to a
 505         //      - NOP, which later may be replaced by a save-state if the
 506         //         parenthesized group gets a * quantifier, followed by
 507         //      - NOP, which may later be replaced by a save-state if there
 508         //             is an '|' alternation within the parens.
 509         {
 510             fixLiterals();
 511             appendOp(URX_NOP, 0);
 512             appendOp(URX_NOP, 0);
 513
 514             // On the Parentheses stack, start a new frame and add the postions
 515             //   of the two NOPs.
 516             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
 517             fParenStack.push(plain,      *fStatus);                       // Begin a new frame.
 518             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
 519             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
 520         }
 521          break;
 522
 523
 524     case doOpenAtomicParen:
 525         // Open Atomic Paren.  (?>
 526         //   Compile to a
 527         //      - NOP, which later may be replaced if the parenthesized group
 528         //         has a quantifier, followed by
 529         //      - STO_SP  save state stack position, so it can be restored at the ")"
 530         //      - NOP, which may later be replaced by a save-state if there
 531         //             is an '|' alternation within the parens.
 532         {
 533             fixLiterals();
 534             appendOp(URX_NOP, 0);
 535             int32_t  varLoc = allocateData(1);    // Reserve a data location for saving the state stack ptr.
 536             appendOp(URX_STO_SP, varLoc);
 537             appendOp(URX_NOP, 0);
 538
 539             // On the Parentheses stack, start a new frame and add the postions
 540             //   of the two NOPs.  Depending on what follows in the pattern, the
 541             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
 542             //   address of the end of the parenthesized group.
 543             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
 544             fParenStack.push(atomic, *fStatus);                           // Frame type.
 545             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
 546             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
 547         }
 548         break;
 549
 550
 551     case doOpenLookAhead:
 552         // Positive Look-ahead   (?=  stuff  )
 553         //
 554         //   Note:   Addition of transparent input regions, with the need to
 555         //           restore the original regions when failing out of a lookahead
 556         //           block, complicated this sequence.  Some conbined opcodes
 557         //           might make sense - or might not, lookahead aren't that common.
 558         //
 559         //      Caution:  min match length optimization knows about this
 560         //               sequence; don't change without making updates there too.
 561         //
 562         // Compiles to
 563         //    1    START_LA     dataLoc     Saves SP, Input Pos
 564         //    2.   STATE_SAVE   4            on failure of lookahead, goto 4
 565         //    3    JMP          6           continue ...
 566         //
 567         //    4.   LA_END                   Look Ahead failed.  Restore regions.
 568         //    5.   BACKTRACK                and back track again.
 569         //
 570         //    6.   NOP              reserved for use by quantifiers on the block.
 571         //                          Look-ahead can't have quantifiers, but paren stack
 572         //                             compile time conventions require the slot anyhow.
 573         //    7.   NOP              may be replaced if there is are '|' ops in the block.
 574         //    8.     code for parenthesized stuff.
 575         //    9.   LA_END
 576         //
 577         //  Two data slots are reserved, for saving the stack ptr and the input position.
 578         {
 579             fixLiterals();
 580             int32_t dataLoc = allocateData(2);
 581             appendOp(URX_LA_START, dataLoc);
 582             appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
 583             appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
 584             appendOp(URX_LA_END, dataLoc);
 585             appendOp(URX_BACKTRACK, 0);
 586             appendOp(URX_NOP, 0);
 587             appendOp(URX_NOP, 0);
 588
 589             // On the Parentheses stack, start a new frame and add the postions
 590             //   of the NOPs.
 591             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
 592             fParenStack.push(lookAhead, *fStatus);                        // Frame type.
 593             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
 594             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
 595         }
 596         break;
 597
 598     case doOpenLookAheadNeg:
 599         // Negated Lookahead.   (?! stuff )
 600         // Compiles to
 601         //    1.    START_LA    dataloc
 602         //    2.    SAVE_STATE  7         // Fail within look-ahead block restores to this state,
 603         //                                //   which continues with the match.
 604         //    3.    NOP                   // Std. Open Paren sequence, for possible '|'
 605         //    4.       code for parenthesized stuff.
 606         //    5.    END_LA                // Cut back stack, remove saved state from step 2.
 607         //    6.    BACKTRACK             // code in block succeeded, so neg. lookahead fails.
 608         //    7.    END_LA                // Restore match region, in case look-ahead was using
 609         //                                        an alternate (transparent) region.
 610         {
 611             fixLiterals();
 612             int32_t dataLoc = allocateData(2);
 613             appendOp(URX_LA_START, dataLoc);
 614             appendOp(URX_STATE_SAVE, 0);    // dest address will be patched later.
 615             appendOp(URX_NOP, 0);
 616
 617             // On the Parentheses stack, start a new frame and add the postions
 618             //   of the StateSave and NOP.
 619             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
 620             fParenStack.push(negLookAhead, *fStatus);                    // Frame type
 621             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The STATE_SAVE location
 622             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
 623
 624             // Instructions #5 - #7 will be added when the ')' is encountered.
 625         }
 626         break;
 627
 628     case doOpenLookBehind:
 629         {
 630             //   Compile a (?<= look-behind open paren.
 631             //
 632             //          Compiles to
 633             //              0       URX_LB_START     dataLoc
 634             //              1       URX_LB_CONT      dataLoc
 635             //              2                        MinMatchLen
 636             //              3                        MaxMatchLen
 637             //              4       URX_NOP          Standard '(' boilerplate.
 638             //              5       URX_NOP          Reserved slot for use with '|' ops within (block).
 639             //              6         <code for LookBehind expression>
 640             //              7       URX_LB_END       dataLoc    # Check match len, restore input  len
 641             //              8       URX_LA_END       dataLoc    # Restore stack, input pos
 642             //
 643             //          Allocate a block of matcher data, to contain (when running a match)
 644             //              0:    Stack ptr on entry
 645             //              1:    Input Index on entry
 646             //              2:    Start index of match current match attempt.
 647             //              3:    Original Input String len.
 648
 649             // Generate match code for any pending literals.
 650             fixLiterals();
 651
 652             // Allocate data space
 653             int32_t dataLoc = allocateData(4);
 654
 655             // Emit URX_LB_START
 656             appendOp(URX_LB_START, dataLoc);
 657
 658             // Emit URX_LB_CONT
 659             appendOp(URX_LB_CONT, dataLoc);
 660             appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
 661             appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
 662
 663             // Emit the NOPs
 664             appendOp(URX_NOP, 0);
 665             appendOp(URX_NOP, 0);
 666
 667             // On the Parentheses stack, start a new frame and add the postions
 668             //   of the URX_LB_CONT and the NOP.
 669             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
 670             fParenStack.push(lookBehind, *fStatus);                       // Frame type
 671             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
 672             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
 673
 674             // The final two instructions will be added when the ')' is encountered.
 675         }
 676
 677         break;
 678
 679     case doOpenLookBehindNeg:
 680         {
 681             //   Compile a (?<! negated look-behind open paren.
 682             //
 683             //          Compiles to
 684             //              0       URX_LB_START     dataLoc    # Save entry stack, input len
 685             //              1       URX_LBN_CONT     dataLoc    # Iterate possible match positions
 686             //              2                        MinMatchLen
 687             //              3                        MaxMatchLen
 688             //              4                        continueLoc (9)
 689             //              5       URX_NOP          Standard '(' boilerplate.
 690             //              6       URX_NOP          Reserved slot for use with '|' ops within (block).
 691             //              7         <code for LookBehind expression>
 692             //              8       URX_LBN_END      dataLoc    # Check match len, cause a FAIL
 693             //              9       ...
 694             //
 695             //          Allocate a block of matcher data, to contain (when running a match)
 696             //              0:    Stack ptr on entry
 697             //              1:    Input Index on entry
 698             //              2:    Start index of match current match attempt.
 699             //              3:    Original Input String len.
 700
 701             // Generate match code for any pending literals.
 702             fixLiterals();
 703
 704             // Allocate data space
 705             int32_t dataLoc = allocateData(4);
 706
 707             // Emit URX_LB_START
 708             appendOp(URX_LB_START, dataLoc);
 709
 710             // Emit URX_LBN_CONT
 711             appendOp(URX_LBN_CONT, dataLoc);
 712             appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
 713             appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
 714             appendOp(URX_RESERVED_OP, 0);    // Continue Loc.    To be filled later.
 715
 716             // Emit the NOPs
 717             appendOp(URX_NOP, 0);
 718             appendOp(URX_NOP, 0);
 719
 720             // On the Parentheses stack, start a new frame and add the postions
 721             //   of the URX_LB_CONT and the NOP.
 722             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
 723             fParenStack.push(lookBehindN, *fStatus);                      // Frame type
 724             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
 725             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
 726
 727             // The final two instructions will be added when the ')' is encountered.
 728         }
 729         break;
 730
 731     case doConditionalExpr:
 732         // Conditionals such as (?(1)a:b)
 733     case doPerlInline:
 734         // Perl inline-condtionals.  (?{perl code}a|b) We're not perl, no way to do them.
 735         error(U_REGEX_UNIMPLEMENTED);
 736         break;
 737
 738
 739     case doCloseParen:
 740         handleCloseParen();
 741         if (fParenStack.size() <= 0) {
 742             //  Extra close paren, or missing open paren.
 743             error(U_REGEX_MISMATCHED_PAREN);
 744         }
 745         break;
 746
 747     case doNOP:
 748         break;
 749
 750
 751     case doBadOpenParenType:
 752     case doRuleError:
 753         error(U_REGEX_RULE_SYNTAX);
 754         break;
 755
 756
 757     case doMismatchedParenErr:
 758         error(U_REGEX_MISMATCHED_PAREN);
 759         break;
 760
 761     case doPlus:
 762         //  Normal '+'  compiles to
 763         //     1.   stuff to be repeated  (already built)
 764         //     2.   jmp-sav 1
 765         //     3.   ...
 766         //
 767         //  Or, if the item to be repeated can match a zero length string,
 768         //     1.   STO_INP_LOC  data-loc
 769         //     2.      body of stuff to be repeated
 770         //     3.   JMP_SAV_X    2
 771         //     4.   ...
 772
 773         //
 774         //  Or, if the item to be repeated is simple
 775         //     1.   Item to be repeated.
 776         //     2.   LOOP_SR_I    set number  (assuming repeated item is a set ref)
 777         //     3.   LOOP_C       stack location
 778         {
 779             int32_t  topLoc = blockTopLoc(FALSE);        // location of item #1
 780             int32_t  frameLoc;
 781
 782             // Check for simple constructs, which may get special optimized code.
 783             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
 784                 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
 785
 786                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
 787                     // Emit optimized code for [char set]+
 788                     appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
 789                     frameLoc = allocateStackData(1);
 790                     appendOp(URX_LOOP_C, frameLoc);
 791                     break;
 792                 }
 793
 794                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
 795                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
 796                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
 797                     // Emit Optimized code for .+ operations.
 798                     int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
 799                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
 800                         // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
 801                         loopOpI |= 1;
 802                     }
 803                     if (fModeFlags & UREGEX_UNIX_LINES) {
 804                         loopOpI |= 2;
 805                     }
 806                     appendOp(loopOpI);
 807                     frameLoc = allocateStackData(1);
 808                     appendOp(URX_LOOP_C, frameLoc);
 809                     break;
 810                 }
 811
 812             }
 813
 814             // General case.
 815
 816             // Check for minimum match length of zero, which requires
 817             //    extra loop-breaking code.
 818             if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
 819                 // Zero length match is possible.
 820                 // Emit the code sequence that can handle it.
 821                 insertOp(topLoc);
 822                 frameLoc = allocateStackData(1);
 823
 824                 int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
 825                 fRXPat->fCompiledPat->setElementAt(op, topLoc);
 826
 827                 appendOp(URX_JMP_SAV_X, topLoc+1);
 828             } else {
 829                 // Simpler code when the repeated body must match something non-empty
 830                 appendOp(URX_JMP_SAV, topLoc);
 831             }
 832         }
 833         break;
 834
 835     case doNGPlus:
 836         //  Non-greedy '+?'  compiles to
 837         //     1.   stuff to be repeated  (already built)
 838         //     2.   state-save  1
 839         //     3.   ...
 840         {
 841             int32_t topLoc      = blockTopLoc(FALSE);
 842             appendOp(URX_STATE_SAVE, topLoc);
 843         }
 844         break;
 845
 846
 847     case doOpt:
 848         // Normal (greedy) ? quantifier.
 849         //  Compiles to
 850         //     1. state save 3
 851         //     2.    body of optional block
 852         //     3. ...
 853         // Insert the state save into the compiled pattern, and we're done.
 854         {
 855             int32_t   saveStateLoc = blockTopLoc(TRUE);
 856             int32_t   saveStateOp  = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
 857             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
 858         }
 859         break;
 860
 861     case doNGOpt:
 862         // Non-greedy ?? quantifier
 863         //   compiles to
 864         //    1.  jmp   4
 865         //    2.     body of optional block
 866         //    3   jmp   5
 867         //    4.  state save 2
 868         //    5    ...
 869         //  This code is less than ideal, with two jmps instead of one, because we can only
 870         //  insert one instruction at the top of the block being iterated.
 871         {
 872             int32_t  jmp1_loc = blockTopLoc(TRUE);
 873             int32_t  jmp2_loc = fRXPat->fCompiledPat->size();
 874
 875             int32_t  jmp1_op  = buildOp(URX_JMP, jmp2_loc+1);
 876             fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
 877
 878             appendOp(URX_JMP, jmp2_loc+2);
 879
 880             appendOp(URX_STATE_SAVE, jmp1_loc+1);
 881         }
 882         break;
 883
 884
 885     case doStar:
 886         // Normal (greedy) * quantifier.
 887         // Compiles to
 888         //       1.   STATE_SAVE   4
 889         //       2.      body of stuff being iterated over
 890         //       3.   JMP_SAV      2
 891         //       4.   ...
 892         //
 893         // Or, if the body is a simple [Set],
 894         //       1.   LOOP_SR_I    set number
 895         //       2.   LOOP_C       stack location
 896         //       ...
 897         //
 898         // Or if this is a .*
 899         //       1.   LOOP_DOT_I    (. matches all mode flag)
 900         //       2.   LOOP_C        stack location
 901         //
 902         // Or, if the body can match a zero-length string, to inhibit infinite loops,
 903         //       1.   STATE_SAVE   5
 904         //       2.   STO_INP_LOC  data-loc
 905         //       3.      body of stuff
 906         //       4.   JMP_SAV_X    2
 907         //       5.   ...
 908         {
 909             // location of item #1, the STATE_SAVE
 910             int32_t   topLoc = blockTopLoc(FALSE);
 911             int32_t   dataLoc = -1;
 912
 913             // Check for simple *, where the construct being repeated
 914             //   compiled to single opcode, and might be optimizable.
 915             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
 916                 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
 917
 918                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
 919                     // Emit optimized code for a [char set]*
 920                     int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
 921                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
 922                     dataLoc = allocateStackData(1);
 923                     appendOp(URX_LOOP_C, dataLoc);
 924                     break;
 925                 }
 926
 927                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
 928                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
 929                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
 930                     // Emit Optimized code for .* operations.
 931                     int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
 932                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
 933                         // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
 934                         loopOpI |= 1;
 935                     }
 936                     if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
 937                         loopOpI |= 2;
 938                     }
 939                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
 940                     dataLoc = allocateStackData(1);
 941                     appendOp(URX_LOOP_C, dataLoc);
 942                     break;
 943                 }
 944             }
 945
 946             // Emit general case code for this *
 947             // The optimizations did not apply.
 948
 949             int32_t   saveStateLoc = blockTopLoc(TRUE);
 950             int32_t   jmpOp        = buildOp(URX_JMP_SAV, saveStateLoc+1);
 951
 952             // Check for minimum match length of zero, which requires
 953             //    extra loop-breaking code.
 954             if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
 955                 insertOp(saveStateLoc);
 956                 dataLoc = allocateStackData(1);
 957
 958                 int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
 959                 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
 960                 jmpOp      = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
 961             }
 962
 963             // Locate the position in the compiled pattern where the match will continue
 964             //   after completing the *.   (4 or 5 in the comment above)
 965             int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
 966
 967             // Put together the save state op and store it into the compiled code.
 968             int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
 969             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
 970
 971             // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
 972             appendOp(jmpOp);
 973         }
 974         break;
 975
 976     case doNGStar:
 977         // Non-greedy *? quantifier
 978         // compiles to
 979         //     1.   JMP    3
 980         //     2.      body of stuff being iterated over
 981         //     3.   STATE_SAVE  2
 982         //     4    ...
 983         {
 984             int32_t     jmpLoc  = blockTopLoc(TRUE);                   // loc  1.
 985             int32_t     saveLoc = fRXPat->fCompiledPat->size();        // loc  3.
 986             int32_t     jmpOp   = buildOp(URX_JMP, saveLoc);
 987             fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
 988             appendOp(URX_STATE_SAVE, jmpLoc+1);
 989         }
 990         break;
 991
 992
 993     case doIntervalInit:
 994         // The '{' opening an interval quantifier was just scanned.
 995         // Init the counter varaiables that will accumulate the values as the digits
 996         //    are scanned.
 997         fIntervalLow = 0;
 998         fIntervalUpper = -1;
 999         break;
1000
1001     case doIntevalLowerDigit:
1002         // Scanned a digit from the lower value of an {lower,upper} interval
1003         {
1004             int32_t digitValue = u_charDigitValue(fC.fChar);
1005             U_ASSERT(digitValue >= 0);
1006             int64_t val = (int64_t)fIntervalLow*10 + digitValue;
1007             if (val > INT32_MAX) {
1008                 error(U_REGEX_NUMBER_TOO_BIG);
1009             } else {
1010                 fIntervalLow = (int32_t)val;
1011             }
1012         }
1013         break;
1014
1015     case doIntervalUpperDigit:
1016         // Scanned a digit from the upper value of an {lower,upper} interval
1017         {
1018             if (fIntervalUpper < 0) {
1019                 fIntervalUpper = 0;
1020             }
1021             int32_t digitValue = u_charDigitValue(fC.fChar);
1022             U_ASSERT(digitValue >= 0);
1023             int64_t val = (int64_t)fIntervalUpper*10 + digitValue;
1024             if (val > INT32_MAX) {
1025                 error(U_REGEX_NUMBER_TOO_BIG);
1026             } else {
1027                 fIntervalUpper = (int32_t)val;
1028             }
1029         }
1030         break;
1031
1032     case doIntervalSame:
1033         // Scanned a single value interval like {27}.  Upper = Lower.
1034         fIntervalUpper = fIntervalLow;
1035         break;
1036
1037     case doInterval:
1038         // Finished scanning a normal {lower,upper} interval.  Generate the code for it.
1039         if (compileInlineInterval() == FALSE) {
1040             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1041         }
1042         break;
1043
1044     case doPossessiveInterval:
1045         // Finished scanning a Possessive {lower,upper}+ interval.  Generate the code for it.
1046         {
1047             // Remember the loc for the top of the block being looped over.
1048             //   (Can not reserve a slot in the compiled pattern at this time, because
1049             //    compileInterval needs to reserve also, and blockTopLoc can only reserve
1050             //    once per block.)
1051             int32_t topLoc = blockTopLoc(FALSE);
1052
1053             // Produce normal looping code.
1054             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1055
1056             // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1057             //  just as if the loop was inclosed in atomic parentheses.
1058
1059             // First the STO_SP before the start of the loop
1060             insertOp(topLoc);
1061
1062             int32_t  varLoc = allocateData(1);   // Reserve a data location for saving the
1063             int32_t  op     = buildOp(URX_STO_SP, varLoc);
1064             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1065
1066             int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
1067             U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1068             loopOp++;     // point LoopOp after the just-inserted STO_SP
1069             fRXPat->fCompiledPat->push(loopOp, *fStatus);
1070
1071             // Then the LD_SP after the end of the loop
1072             appendOp(URX_LD_SP, varLoc);
1073         }
1074
1075         break;
1076
1077     case doNGInterval:
1078         // Finished scanning a non-greedy {lower,upper}? interval.  Generate the code for it.
1079         compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1080         break;
1081
1082     case doIntervalError:
1083         error(U_REGEX_BAD_INTERVAL);
1084         break;
1085
1086     case doLiteralChar:
1087         // We've just scanned a "normal" character from the pattern,
1088         literalChar(fC.fChar);
1089         break;
1090
1091
1092     case doEscapedLiteralChar:
1093         // We've just scanned an backslashed escaped character with  no
1094         //   special meaning.  It represents itself.
1095         if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1096             ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1097             (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1098                error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1099              }
1100         literalChar(fC.fChar);
1101         break;
1102
1103
1104     case doDotAny:
1105         // scanned a ".",  match any single character.
1106         {
1107             fixLiterals(FALSE);
1108             if (fModeFlags & UREGEX_DOTALL) {
1109                 appendOp(URX_DOTANY_ALL, 0);
1110             } else if (fModeFlags & UREGEX_UNIX_LINES) {
1111                 appendOp(URX_DOTANY_UNIX, 0);
1112             } else {
1113                 appendOp(URX_DOTANY, 0);
1114             }
1115         }
1116         break;
1117
1118     case doCaret:
1119         {
1120             fixLiterals(FALSE);
1121             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1122                 appendOp(URX_CARET, 0);
1123             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1124                 appendOp(URX_CARET_M, 0);
1125             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1126                 appendOp(URX_CARET, 0);   // Only testing true start of input.
1127             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1128                 appendOp(URX_CARET_M_UNIX, 0);
1129             }
1130         }
1131         break;
1132
1133     case doDollar:
1134         {
1135             fixLiterals(FALSE);
1136             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1137                 appendOp(URX_DOLLAR, 0);
1138             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1139                 appendOp(URX_DOLLAR_M, 0);
1140             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1141                 appendOp(URX_DOLLAR_D, 0);
1142             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1143                 appendOp(URX_DOLLAR_MD, 0);
1144             }
1145         }
1146         break;
1147
1148     case doBackslashA:
1149         fixLiterals(FALSE);
1150         appendOp(URX_CARET, 0);
1151         break;
1152
1153     case doBackslashB:
1154         {
1155             #if  UCONFIG_NO_BREAK_ITERATION==1
1156             if (fModeFlags & UREGEX_UWORD) {
1157                 error(U_UNSUPPORTED_ERROR);
1158             }
1159             #endif
1160             fixLiterals(FALSE);
1161             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1162             appendOp(op, 1);
1163         }
1164         break;
1165
1166     case doBackslashb:
1167         {
1168             #if  UCONFIG_NO_BREAK_ITERATION==1
1169             if (fModeFlags & UREGEX_UWORD) {
1170                 error(U_UNSUPPORTED_ERROR);
1171             }
1172             #endif
1173             fixLiterals(FALSE);
1174             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1175             appendOp(op, 0);
1176         }
1177         break;
1178
1179     case doBackslashD:
1180         fixLiterals(FALSE);
1181         appendOp(URX_BACKSLASH_D, 1);
1182         break;
1183
1184     case doBackslashd:
1185         fixLiterals(FALSE);
1186         appendOp(URX_BACKSLASH_D, 0);
1187         break;
1188
1189     case doBackslashG:
1190         fixLiterals(FALSE);
1191         appendOp(URX_BACKSLASH_G, 0);
1192         break;
1193
1194     case doBackslashH:
1195         fixLiterals(FALSE);
1196         appendOp(URX_BACKSLASH_H, 1);
1197         break;
1198
1199     case doBackslashh:
1200         fixLiterals(FALSE);
1201         appendOp(URX_BACKSLASH_H, 0);
1202         break;
1203
1204     case doBackslashR:
1205         fixLiterals(FALSE);
1206         appendOp(URX_BACKSLASH_R, 0);
1207         break;
1208
1209     case doBackslashS:
1210         fixLiterals(FALSE);
1211         appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
1212         break;
1213
1214     case doBackslashs:
1215         fixLiterals(FALSE);
1216         appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
1217         break;
1218
1219     case doBackslashV:
1220         fixLiterals(FALSE);
1221         appendOp(URX_BACKSLASH_V, 1);
1222         break;
1223
1224     case doBackslashv:
1225         fixLiterals(FALSE);
1226         appendOp(URX_BACKSLASH_V, 0);
1227         break;
1228
1229     case doBackslashW:
1230         fixLiterals(FALSE);
1231         appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
1232         break;
1233
1234     case doBackslashw:
1235         fixLiterals(FALSE);
1236         appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
1237         break;
1238
1239     case doBackslashX:
1240         fixLiterals(FALSE);
1241         appendOp(URX_BACKSLASH_X, 0);
1242         break;
1243
1244
1245     case doBackslashZ:
1246         fixLiterals(FALSE);
1247         appendOp(URX_DOLLAR, 0);
1248         break;
1249
1250     case doBackslashz:
1251         fixLiterals(FALSE);
1252         appendOp(URX_BACKSLASH_Z, 0);
1253         break;
1254
1255     case doEscapeError:
1256         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1257         break;
1258
1259     case doExit:
1260         fixLiterals(FALSE);
1261         returnVal = FALSE;
1262         break;
1263
1264     case doProperty:
1265         {
1266             fixLiterals(FALSE);
1267             UnicodeSet *theSet = scanProp();
1268             compileSet(theSet);
1269         }
1270         break;
1271
1272     case doNamedChar:
1273         {
1274             UChar32 c = scanNamedChar();
1275             literalChar(c);
1276         }
1277         break;
1278
1279
1280     case doBackRef:
1281         // BackReference.  Somewhat unusual in that the front-end can not completely parse
1282         //                 the regular expression, because the number of digits to be consumed
1283         //                 depends on the number of capture groups that have been defined.  So
1284         //                 we have to do it here instead.
1285         {
1286             int32_t  numCaptureGroups = fRXPat->fGroupMap->size();
1287             int32_t  groupNum = 0;
1288             UChar32  c        = fC.fChar;
1289
1290             for (;;) {
1291                 // Loop once per digit, for max allowed number of digits in a back reference.
1292                 int32_t digit = u_charDigitValue(c);
1293                 groupNum = groupNum * 10 + digit;
1294                 if (groupNum >= numCaptureGroups) {
1295                     break;
1296                 }
1297                 c = peekCharLL();
1298                 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
1299                     break;
1300                 }
1301                 nextCharLL();
1302             }
1303
1304             // Scan of the back reference in the source regexp is complete.  Now generate
1305             //  the compiled code for it.
1306             // Because capture groups can be forward-referenced by back-references,
1307             //  we fill the operand with the capture group number.  At the end
1308             //  of compilation, it will be changed to the variable's location.
1309             U_ASSERT(groupNum > 0);  // Shouldn't happen.  '\0' begins an octal escape sequence,
1310                                      //    and shouldn't enter this code path at all.
1311             fixLiterals(FALSE);
1312             if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1313                 appendOp(URX_BACKREF_I, groupNum);
1314             } else {
1315                 appendOp(URX_BACKREF, groupNum);
1316             }
1317         }
1318         break;
1319
1320     case doBeginNamedBackRef:
1321         U_ASSERT(fCaptureName == NULL);
1322         fCaptureName = new UnicodeString;
1323         if (fCaptureName == NULL) {
1324             error(U_MEMORY_ALLOCATION_ERROR);
1325         }
1326         break;
1327
1328     case doContinueNamedBackRef:
1329         fCaptureName->append(fC.fChar);
1330         break;
1331
1332     case doCompleteNamedBackRef:
1333         {
1334         int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName);
1335         if (groupNumber == 0) {
1336             // Group name has not been defined.
1337             //   Could be a forward reference. If we choose to support them at some
1338             //   future time, extra mechanism will be required at this point.
1339             error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
1340         } else {
1341             // Given the number, handle identically to a \n numbered back reference.
1342             // See comments above, under doBackRef
1343             fixLiterals(FALSE);
1344             if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1345                 appendOp(URX_BACKREF_I, groupNumber);
1346             } else {
1347                 appendOp(URX_BACKREF, groupNumber);
1348             }
1349         }
1350         delete fCaptureName;
1351         fCaptureName = NULL;
1352         break;
1353         }
1354
1355     case doPossessivePlus:
1356         // Possessive ++ quantifier.
1357         // Compiles to
1358         //       1.   STO_SP
1359         //       2.      body of stuff being iterated over
1360         //       3.   STATE_SAVE 5
1361         //       4.   JMP        2
1362         //       5.   LD_SP
1363         //       6.   ...
1364         //
1365         //  Note:  TODO:  This is pretty inefficient.  A mass of saved state is built up
1366         //                then unconditionally discarded.  Perhaps introduce a new opcode.  Ticket 6056
1367         //
1368         {
1369             // Emit the STO_SP
1370             int32_t   topLoc = blockTopLoc(TRUE);
1371             int32_t   stoLoc = allocateData(1);  // Reserve the data location for storing save stack ptr.
1372             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1373             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1374
1375             // Emit the STATE_SAVE
1376             appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
1377
1378             // Emit the JMP
1379             appendOp(URX_JMP, topLoc+1);
1380
1381             // Emit the LD_SP
1382             appendOp(URX_LD_SP, stoLoc);
1383         }
1384         break;
1385
1386     case doPossessiveStar:
1387         // Possessive *+ quantifier.
1388         // Compiles to
1389         //       1.   STO_SP       loc
1390         //       2.   STATE_SAVE   5
1391         //       3.      body of stuff being iterated over
1392         //       4.   JMP          2
1393         //       5.   LD_SP        loc
1394         //       6    ...
1395         // TODO:  do something to cut back the state stack each time through the loop.
1396         {
1397             // Reserve two slots at the top of the block.
1398             int32_t   topLoc = blockTopLoc(TRUE);
1399             insertOp(topLoc);
1400
1401             // emit   STO_SP     loc
1402             int32_t   stoLoc = allocateData(1);    // Reserve the data location for storing save stack ptr.
1403             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1404             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1405
1406             // Emit the SAVE_STATE   5
1407             int32_t L7 = fRXPat->fCompiledPat->size()+1;
1408             op = buildOp(URX_STATE_SAVE, L7);
1409             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1410
1411             // Append the JMP operation.
1412             appendOp(URX_JMP, topLoc+1);
1413
1414             // Emit the LD_SP       loc
1415             appendOp(URX_LD_SP, stoLoc);
1416         }
1417         break;
1418
1419     case doPossessiveOpt:
1420         // Possessive  ?+ quantifier.
1421         //  Compiles to
1422         //     1. STO_SP      loc
1423         //     2. SAVE_STATE  5
1424         //     3.    body of optional block
1425         //     4. LD_SP       loc
1426         //     5. ...
1427         //
1428         {
1429             // Reserve two slots at the top of the block.
1430             int32_t   topLoc = blockTopLoc(TRUE);
1431             insertOp(topLoc);
1432
1433             // Emit the STO_SP
1434             int32_t   stoLoc = allocateData(1);   // Reserve the data location for storing save stack ptr.
1435             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1436             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1437
1438             // Emit the SAVE_STATE
1439             int32_t   continueLoc = fRXPat->fCompiledPat->size()+1;
1440             op = buildOp(URX_STATE_SAVE, continueLoc);
1441             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1442
1443             // Emit the LD_SP
1444             appendOp(URX_LD_SP, stoLoc);
1445         }
1446         break;
1447
1448
1449     case doBeginMatchMode:
1450         fNewModeFlags = fModeFlags;
1451         fSetModeFlag  = TRUE;
1452         break;
1453
1454     case doMatchMode:   //  (?i)    and similar
1455         {
1456             int32_t  bit = 0;
1457             switch (fC.fChar) {
1458             case 0x69: /* 'i' */   bit = UREGEX_CASE_INSENSITIVE; break;
1459             case 0x64: /* 'd' */   bit = UREGEX_UNIX_LINES;       break;
1460             case 0x6d: /* 'm' */   bit = UREGEX_MULTILINE;        break;
1461             case 0x73: /* 's' */   bit = UREGEX_DOTALL;           break;
1462             case 0x75: /* 'u' */   bit = 0; /* Unicode casing */  break;
1463             case 0x77: /* 'w' */   bit = UREGEX_UWORD;            break;
1464             case 0x78: /* 'x' */   bit = UREGEX_COMMENTS;         break;
1465             case 0x2d: /* '-' */   fSetModeFlag = FALSE;          break;
1466             default:
1467                 U_ASSERT(FALSE);   // Should never happen.  Other chars are filtered out
1468                                    // by the scanner.
1469             }
1470             if (fSetModeFlag) {
1471                 fNewModeFlags |= bit;
1472             } else {
1473                 fNewModeFlags &= ~bit;
1474             }
1475         }
1476         break;
1477
1478     case doSetMatchMode:
1479         // Emit code to match any pending literals, using the not-yet changed match mode.
1480         fixLiterals();
1481
1482         // We've got a (?i) or similar.  The match mode is being changed, but
1483         //   the change is not scoped to a parenthesized block.
1484         U_ASSERT(fNewModeFlags < 0);
1485         fModeFlags = fNewModeFlags;
1486
1487         break;
1488
1489
1490     case doMatchModeParen:
1491         // We've got a (?i: or similar.  Begin a parenthesized block, save old
1492         //   mode flags so they can be restored at the close of the block.
1493         //
1494         //   Compile to a
1495         //      - NOP, which later may be replaced by a save-state if the
1496         //         parenthesized group gets a * quantifier, followed by
1497         //      - NOP, which may later be replaced by a save-state if there
1498         //             is an '|' alternation within the parens.
1499         {
1500             fixLiterals(FALSE);
1501             appendOp(URX_NOP, 0);
1502             appendOp(URX_NOP, 0);
1503
1504             // On the Parentheses stack, start a new frame and add the postions
1505             //   of the two NOPs (a normal non-capturing () frame, except for the
1506             //   saving of the orignal mode flags.)
1507             fParenStack.push(fModeFlags, *fStatus);
1508             fParenStack.push(flags, *fStatus);                            // Frame Marker
1509             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
1510             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
1511
1512             // Set the current mode flags to the new values.
1513             U_ASSERT(fNewModeFlags < 0);
1514             fModeFlags = fNewModeFlags;
1515         }
1516         break;
1517
1518     case doBadModeFlag:
1519         error(U_REGEX_INVALID_FLAG);
1520         break;
1521
1522     case doSuppressComments:
1523         // We have just scanned a '(?'.  We now need to prevent the character scanner from
1524         // treating a '#' as a to-the-end-of-line comment.
1525         //   (This Perl compatibility just gets uglier and uglier to do...)
1526         fEOLComments = FALSE;
1527         break;
1528
1529
1530     case doSetAddAmp:
1531         {
1532           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1533           set->add(chAmp);
1534         }
1535         break;
1536
1537     case doSetAddDash:
1538         {
1539           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1540           set->add(chDash);
1541         }
1542         break;
1543
1544      case doSetBackslash_s:
1545         {
1546          UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1547          set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1548          break;
1549         }
1550
1551      case doSetBackslash_S:
1552         {
1553             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1554             UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1555             SSet.complement();
1556             set->addAll(SSet);
1557             break;
1558         }
1559
1560     case doSetBackslash_d:
1561         {
1562             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1563             // TODO - make a static set, ticket 6058.
1564             addCategory(set, U_GC_ND_MASK, *fStatus);
1565             break;
1566         }
1567
1568     case doSetBackslash_D:
1569         {
1570             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1571             UnicodeSet digits;
1572             // TODO - make a static set, ticket 6058.
1573             digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
1574             digits.complement();
1575             set->addAll(digits);
1576             break;
1577         }
1578
1579     case doSetBackslash_h:
1580         {
1581             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1582             UnicodeSet h;
1583             h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1584             h.add((UChar32)9);   // Tab
1585             set->addAll(h);
1586             break;
1587         }
1588
1589     case doSetBackslash_H:
1590         {
1591             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1592             UnicodeSet h;
1593             h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1594             h.add((UChar32)9);   // Tab
1595             h.complement();
1596             set->addAll(h);
1597             break;
1598         }
1599
1600     case doSetBackslash_v:
1601         {
1602             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1603             set->add((UChar32)0x0a, (UChar32)0x0d);  // add range
1604             set->add((UChar32)0x85);
1605             set->add((UChar32)0x2028, (UChar32)0x2029);
1606             break;
1607         }
1608
1609     case doSetBackslash_V:
1610         {
1611             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1612             UnicodeSet v;
1613             v.add((UChar32)0x0a, (UChar32)0x0d);  // add range
1614             v.add((UChar32)0x85);
1615             v.add((UChar32)0x2028, (UChar32)0x2029);
1616             v.complement();
1617             set->addAll(v);
1618             break;
1619         }
1620
1621     case doSetBackslash_w:
1622         {
1623             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1624             set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1625             break;
1626         }
1627
1628     case doSetBackslash_W:
1629         {
1630             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1631             UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1632             SSet.complement();
1633             set->addAll(SSet);
1634             break;
1635         }
1636
1637     case doSetBegin:
1638         fixLiterals(FALSE);
1639         fSetStack.push(new UnicodeSet(), *fStatus);
1640         fSetOpStack.push(setStart, *fStatus);
1641         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1642             fSetOpStack.push(setCaseClose, *fStatus);
1643         }
1644         break;
1645
1646     case doSetBeginDifference1:
1647         //  We have scanned something like [[abc]-[
1648         //  Set up a new UnicodeSet for the set beginning with the just-scanned '['
1649         //  Push a Difference operator, which will cause the new set to be subtracted from what
1650         //    went before once it is created.
1651         setPushOp(setDifference1);
1652         fSetOpStack.push(setStart, *fStatus);
1653         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1654             fSetOpStack.push(setCaseClose, *fStatus);
1655         }
1656         break;
1657
1658     case doSetBeginIntersection1:
1659         //  We have scanned something like  [[abc]&[
1660         //   Need both the '&' operator and the open '[' operator.
1661         setPushOp(setIntersection1);
1662         fSetOpStack.push(setStart, *fStatus);
1663         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1664             fSetOpStack.push(setCaseClose, *fStatus);
1665         }
1666         break;
1667
1668     case doSetBeginUnion:
1669         //  We have scanned something like  [[abc][
1670         //     Need to handle the union operation explicitly [[abc] | [
1671         setPushOp(setUnion);
1672         fSetOpStack.push(setStart, *fStatus);
1673         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1674             fSetOpStack.push(setCaseClose, *fStatus);
1675         }
1676         break;
1677
1678     case doSetDifference2:
1679         // We have scanned something like [abc--
1680         //   Consider this to unambiguously be a set difference operator.
1681         setPushOp(setDifference2);
1682         break;
1683
1684     case doSetEnd:
1685         // Have encountered the ']' that closes a set.
1686         //    Force the evaluation of any pending operations within this set,
1687         //    leave the completed set on the top of the set stack.
1688         setEval(setEnd);
1689         U_ASSERT(fSetOpStack.peeki()==setStart);
1690         fSetOpStack.popi();
1691         break;
1692
1693     case doSetFinish:
1694         {
1695         // Finished a complete set expression, including all nested sets.
1696         //   The close bracket has already triggered clearing out pending set operators,
1697         //    the operator stack should be empty and the operand stack should have just
1698         //    one entry, the result set.
1699         U_ASSERT(fSetOpStack.empty());
1700         UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
1701         U_ASSERT(fSetStack.empty());
1702         compileSet(theSet);
1703         break;
1704         }
1705
1706     case doSetIntersection2:
1707         // Have scanned something like [abc&&
1708         setPushOp(setIntersection2);
1709         break;
1710
1711     case doSetLiteral:
1712         // Union the just-scanned literal character into the set being built.
1713         //    This operation is the highest precedence set operation, so we can always do
1714         //    it immediately, without waiting to see what follows.  It is necessary to perform
1715         //    any pending '-' or '&' operation first, because these have the same precedence
1716         //    as union-ing in a literal'
1717         {
1718             setEval(setUnion);
1719             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1720             s->add(fC.fChar);
1721             fLastSetLiteral = fC.fChar;
1722             break;
1723         }
1724
1725     case doSetLiteralEscaped:
1726         // A back-slash escaped literal character was encountered.
1727         // Processing is the same as with setLiteral, above, with the addition of
1728         //  the optional check for errors on escaped ASCII letters.
1729         {
1730             if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1731                 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1732                  (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1733                 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1734             }
1735             setEval(setUnion);
1736             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1737             s->add(fC.fChar);
1738             fLastSetLiteral = fC.fChar;
1739             break;
1740         }
1741
1742         case doSetNamedChar:
1743         // Scanning a \N{UNICODE CHARACTER NAME}
1744         //  Aside from the source of the character, the processing is identical to doSetLiteral,
1745         //    above.
1746         {
1747             UChar32  c = scanNamedChar();
1748             setEval(setUnion);
1749             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1750             s->add(c);
1751             fLastSetLiteral = c;
1752             break;
1753         }
1754
1755     case doSetNamedRange:
1756         // We have scanned literal-\N{CHAR NAME}.  Add the range to the set.
1757         // The left character is already in the set, and is saved in fLastSetLiteral.
1758         // The right side needs to be picked up, the scan is at the 'N'.
1759         // Lower Limit > Upper limit being an error matches both Java
1760         //        and ICU UnicodeSet behavior.
1761         {
1762             UChar32  c = scanNamedChar();
1763             if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > c)) {
1764                 error(U_REGEX_INVALID_RANGE);
1765             }
1766             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1767             s->add(fLastSetLiteral, c);
1768             fLastSetLiteral = c;
1769             break;
1770         }
1771
1772
1773     case  doSetNegate:
1774         // Scanned a '^' at the start of a set.
1775         // Push the negation operator onto the set op stack.
1776         // A twist for case-insensitive matching:
1777         //   the case closure operation must happen _before_ negation.
1778         //   But the case closure operation will already be on the stack if it's required.
1779         //   This requires checking for case closure, and swapping the stack order
1780         //    if it is present.
1781         {
1782             int32_t  tosOp = fSetOpStack.peeki();
1783             if (tosOp == setCaseClose) {
1784                 fSetOpStack.popi();
1785                 fSetOpStack.push(setNegation, *fStatus);
1786                 fSetOpStack.push(setCaseClose, *fStatus);
1787             } else {
1788                 fSetOpStack.push(setNegation, *fStatus);
1789             }
1790         }
1791         break;
1792
1793     case doSetNoCloseError:
1794         error(U_REGEX_MISSING_CLOSE_BRACKET);
1795         break;
1796
1797     case doSetOpError:
1798         error(U_REGEX_RULE_SYNTAX);   //  -- or && at the end of a set.  Illegal.
1799         break;
1800
1801     case doSetPosixProp:
1802         {
1803             UnicodeSet *s = scanPosixProp();
1804             if (s != NULL) {
1805                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1806                 tos->addAll(*s);
1807                 delete s;
1808             }  // else error.  scanProp() reported the error status already.
1809         }
1810         break;
1811
1812     case doSetProp:
1813         //  Scanned a \p \P within [brackets].
1814         {
1815             UnicodeSet *s = scanProp();
1816             if (s != NULL) {
1817                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1818                 tos->addAll(*s);
1819                 delete s;
1820             }  // else error.  scanProp() reported the error status already.
1821         }
1822         break;
1823
1824
1825     case doSetRange:
1826         // We have scanned literal-literal.  Add the range to the set.
1827         // The left character is already in the set, and is saved in fLastSetLiteral.
1828         // The right side is the current character.
1829         // Lower Limit > Upper limit being an error matches both Java
1830         //        and ICU UnicodeSet behavior.
1831         {
1832
1833         if (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > fC.fChar) {
1834             error(U_REGEX_INVALID_RANGE);
1835         }
1836         UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1837         s->add(fLastSetLiteral, fC.fChar);
1838         break;
1839         }
1840
1841     default:
1842         U_ASSERT(FALSE);
1843         error(U_REGEX_INTERNAL_ERROR);
1844         break;
1845     }
1846
1847     if (U_FAILURE(*fStatus)) {
1848         returnVal = FALSE;
1849     }
1850
1851     return returnVal;
1852 }
1853
1854
1855
1856 //------------------------------------------------------------------------------
1857 //
1858 //   literalChar           We've encountered a literal character from the pattern,
1859 //                             or an escape sequence that reduces to a character.
1860 //                         Add it to the string containing all literal chars/strings from
1861 //                             the pattern.
1862 //
1863 //------------------------------------------------------------------------------
1864 void RegexCompile::literalChar(UChar32 c)  {
1865     fLiteralChars.append(c);
1866 }
1867
1868
1869 //------------------------------------------------------------------------------
1870 //
1871 //    fixLiterals           When compiling something that can follow a literal
1872 //                          string in a pattern, emit the code to match the
1873 //                          accumulated literal string.
1874 //
1875 //                          Optionally, split the last char of the string off into
1876 //                          a single "ONE_CHAR" operation, so that quantifiers can
1877 //                          apply to that char alone.  Example:   abc*
1878 //                          The * must apply to the 'c' only.
1879 //
1880 //------------------------------------------------------------------------------
1881 void    RegexCompile::fixLiterals(UBool split) {
1882
1883     // If no literal characters have been scanned but not yet had code generated
1884     //   for them, nothing needs to be done.
1885     if (fLiteralChars.length() == 0) {
1886         return;
1887     }
1888
1889     int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1890     UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1891
1892     // Split:  We need to  ensure that the last item in the compiled pattern
1893     //     refers only to the last literal scanned in the pattern, so that
1894     //     quantifiers (*, +, etc.) affect only it, and not a longer string.
1895     //     Split before case folding for case insensitive matches.
1896
1897     if (split) {
1898         fLiteralChars.truncate(indexOfLastCodePoint);
1899         fixLiterals(FALSE);   // Recursive call, emit code to match the first part of the string.
1900                               //  Note that the truncated literal string may be empty, in which case
1901                               //  nothing will be emitted.
1902
1903         literalChar(lastCodePoint);  // Re-add the last code point as if it were a new literal.
1904         fixLiterals(FALSE);          // Second recursive call, code for the final code point.
1905         return;
1906     }
1907
1908     // If we are doing case-insensitive matching, case fold the string.  This may expand
1909     //   the string, e.g. the German sharp-s turns into "ss"
1910     if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1911         fLiteralChars.foldCase();
1912         indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1913         lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1914     }
1915
1916     if (indexOfLastCodePoint == 0) {
1917         // Single character, emit a URX_ONECHAR op to match it.
1918         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1919                  u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
1920             appendOp(URX_ONECHAR_I, lastCodePoint);
1921         } else {
1922             appendOp(URX_ONECHAR, lastCodePoint);
1923         }
1924     } else {
1925         // Two or more chars, emit a URX_STRING to match them.
1926         if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) {
1927             error(U_REGEX_PATTERN_TOO_BIG);
1928         }
1929         if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1930             appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
1931         } else {
1932             // TODO here:  add optimization to split case sensitive strings of length two
1933             //             into two single char ops, for efficiency.
1934             appendOp(URX_STRING, fRXPat->fLiteralText.length());
1935         }
1936         appendOp(URX_STRING_LEN, fLiteralChars.length());
1937
1938         // Add this string into the accumulated strings of the compiled pattern.
1939         fRXPat->fLiteralText.append(fLiteralChars);
1940     }
1941
1942     fLiteralChars.remove();
1943 }
1944
1945
1946 int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
1947     if (U_FAILURE(*fStatus)) {
1948         return 0;
1949     }
1950     if (type < 0 || type > 255) {
1951         U_ASSERT(FALSE);
1952         error(U_REGEX_INTERNAL_ERROR);
1953         type = URX_RESERVED_OP;
1954     }
1955     if (val > 0x00ffffff) {
1956         U_ASSERT(FALSE);
1957         error(U_REGEX_INTERNAL_ERROR);
1958         val = 0;
1959     }
1960     if (val < 0) {
1961         if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) {
1962             U_ASSERT(FALSE);
1963             error(U_REGEX_INTERNAL_ERROR);
1964             return -1;
1965         }
1966         if (URX_TYPE(val) != 0xff) {
1967             U_ASSERT(FALSE);
1968             error(U_REGEX_INTERNAL_ERROR);
1969             return -1;
1970         }
1971         type = URX_RESERVED_OP_N;
1972     }
1973     return (type << 24) | val;
1974 }
1975
1976
1977 //------------------------------------------------------------------------------
1978 //
1979 //   appendOp()             Append a new instruction onto the compiled pattern
1980 //                          Includes error checking, limiting the size of the
1981 //                          pattern to lengths that can be represented in the
1982 //                          24 bit operand field of an instruction.
1983 //
1984 //------------------------------------------------------------------------------
1985 void RegexCompile::appendOp(int32_t op) {
1986     if (U_FAILURE(*fStatus)) {
1987         return;
1988     }
1989     fRXPat->fCompiledPat->addElement(op, *fStatus);
1990     if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
1991         error(U_REGEX_PATTERN_TOO_BIG);
1992     }
1993 }
1994
1995 void RegexCompile::appendOp(int32_t type, int32_t val) {
1996     appendOp(buildOp(type, val));
1997 }
1998
1999
2000 //------------------------------------------------------------------------------
2001 //
2002 //   insertOp()             Insert a slot for a new opcode into the already
2003 //                          compiled pattern code.
2004 //
2005 //                          Fill the slot with a NOP.  Our caller will replace it
2006 //                          with what they really wanted.
2007 //
2008 //------------------------------------------------------------------------------
2009 void   RegexCompile::insertOp(int32_t where) {
2010     UVector64 *code = fRXPat->fCompiledPat;
2011     U_ASSERT(where>0 && where < code->size());
2012
2013     int32_t  nop = buildOp(URX_NOP, 0);
2014     code->insertElementAt(nop, where, *fStatus);
2015
2016     // Walk through the pattern, looking for any ops with targets that
2017     //  were moved down by the insert.  Fix them.
2018     int32_t loc;
2019     for (loc=0; loc<code->size(); loc++) {
2020         int32_t op = (int32_t)code->elementAti(loc);
2021         int32_t opType = URX_TYPE(op);
2022         int32_t opValue = URX_VAL(op);
2023         if ((opType == URX_JMP         ||
2024             opType == URX_JMPX         ||
2025             opType == URX_STATE_SAVE   ||
2026             opType == URX_CTR_LOOP     ||
2027             opType == URX_CTR_LOOP_NG  ||
2028             opType == URX_JMP_SAV      ||
2029             opType == URX_JMP_SAV_X    ||
2030             opType == URX_RELOC_OPRND)    && opValue > where) {
2031             // Target location for this opcode is after the insertion point and
2032             //   needs to be incremented to adjust for the insertion.
2033             opValue++;
2034             op = buildOp(opType, opValue);
2035             code->setElementAt(op, loc);
2036         }
2037     }
2038
2039     // Now fix up the parentheses stack.  All positive values in it are locations in
2040     //  the compiled pattern.   (Negative values are frame boundaries, and don't need fixing.)
2041     for (loc=0; loc<fParenStack.size(); loc++) {
2042         int32_t x = fParenStack.elementAti(loc);
2043         U_ASSERT(x < code->size());
2044         if (x>where) {
2045             x++;
2046             fParenStack.setElementAt(x, loc);
2047         }
2048     }
2049
2050     if (fMatchCloseParen > where) {
2051         fMatchCloseParen++;
2052     }
2053     if (fMatchOpenParen > where) {
2054         fMatchOpenParen++;
2055     }
2056 }
2057
2058
2059 //------------------------------------------------------------------------------
2060 //
2061 //   allocateData()        Allocate storage in the matcher's static data area.
2062 //                         Return the index for the newly allocated data.
2063 //                         The storage won't actually exist until we are running a match
2064 //                         operation, but the storage indexes are inserted into various
2065 //                         opcodes while compiling the pattern.
2066 //
2067 //------------------------------------------------------------------------------
2068 int32_t RegexCompile::allocateData(int32_t size) {
2069     if (U_FAILURE(*fStatus)) {
2070         return 0;
2071     }
2072     if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
2073         error(U_REGEX_INTERNAL_ERROR);
2074         return 0;
2075     }
2076     int32_t dataIndex = fRXPat->fDataSize;
2077     fRXPat->fDataSize += size;
2078     if (fRXPat->fDataSize >= 0x00fffff0) {
2079         error(U_REGEX_INTERNAL_ERROR);
2080     }
2081     return dataIndex;
2082 }
2083
2084
2085 //------------------------------------------------------------------------------
2086 //
2087 //   allocateStackData()   Allocate space in the back-tracking stack frame.
2088 //                         Return the index for the newly allocated data.
2089 //                         The frame indexes are inserted into various
2090 //                         opcodes while compiling the pattern, meaning that frame
2091 //                         size must be restricted to the size that will fit
2092 //                         as an operand (24 bits).
2093 //
2094 //------------------------------------------------------------------------------
2095 int32_t RegexCompile::allocateStackData(int32_t size) {
2096     if (U_FAILURE(*fStatus)) {
2097         return 0;
2098     }
2099     if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
2100         error(U_REGEX_INTERNAL_ERROR);
2101         return 0;
2102     }
2103     int32_t dataIndex = fRXPat->fFrameSize;
2104     fRXPat->fFrameSize += size;
2105     if (fRXPat->fFrameSize >= 0x00fffff0) {
2106         error(U_REGEX_PATTERN_TOO_BIG);
2107     }
2108     return dataIndex;
2109 }
2110
2111
2112 //------------------------------------------------------------------------------
2113 //
2114 //   blockTopLoc()          Find or create a location in the compiled pattern
2115 //                          at the start of the operation or block that has
2116 //                          just been compiled.  Needed when a quantifier (* or
2117 //                          whatever) appears, and we need to add an operation
2118 //                          at the start of the thing being quantified.
2119 //
2120 //                          (Parenthesized Blocks) have a slot with a NOP that
2121 //                          is reserved for this purpose.  .* or similar don't
2122 //                          and a slot needs to be added.
2123 //
2124 //       parameter reserveLoc   :  TRUE -  ensure that there is space to add an opcode
2125 //                                         at the returned location.
2126 //                                 FALSE - just return the address,
2127 //                                         do not reserve a location there.
2128 //
2129 //------------------------------------------------------------------------------
2130 int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
2131     int32_t   theLoc;
2132     fixLiterals(TRUE);  // Emit code for any pending literals.
2133                         //   If last item was a string, emit separate op for the its last char.
2134     if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
2135     {
2136         // The item just processed is a parenthesized block.
2137         theLoc = fMatchOpenParen;   // A slot is already reserved for us.
2138         U_ASSERT(theLoc > 0);
2139         U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
2140     }
2141     else {
2142         // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2143         // No slot for STATE_SAVE was pre-reserved in the compiled code.
2144         // We need to make space now.
2145         theLoc = fRXPat->fCompiledPat->size()-1;
2146         int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
2147         if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
2148             // Strings take two opcode, we want the position of the first one.
2149             // We can have a string at this point if a single character case-folded to two.
2150             theLoc--;
2151         }
2152         if (reserveLoc) {
2153             int32_t  nop = buildOp(URX_NOP, 0);
2154             fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
2155         }
2156     }
2157     return theLoc;
2158 }
2159
2160
2161
2162 //------------------------------------------------------------------------------
2163 //
2164 //    handleCloseParen      When compiling a close paren, we need to go back
2165 //                          and fix up any JMP or SAVE operations within the
2166 //                          parenthesized block that need to target the end
2167 //                          of the block.  The locations of these are kept on
2168 //                          the paretheses stack.
2169 //
2170 //                          This function is called both when encountering a
2171 //                          real ) and at the end of the pattern.
2172 //
2173 //------------------------------------------------------------------------------
2174 void  RegexCompile::handleCloseParen() {
2175     int32_t   patIdx;
2176     int32_t   patOp;
2177     if (fParenStack.size() <= 0) {
2178         error(U_REGEX_MISMATCHED_PAREN);
2179         return;
2180     }
2181
2182     // Emit code for any pending literals.
2183     fixLiterals(FALSE);
2184
2185     // Fixup any operations within the just-closed parenthesized group
2186     //    that need to reference the end of the (block).
2187     //    (The first one popped from the stack is an unused slot for
2188     //     alternation (OR) state save, but applying the fixup to it does no harm.)
2189     for (;;) {
2190         patIdx = fParenStack.popi();
2191         if (patIdx < 0) {
2192             // value < 0 flags the start of the frame on the paren stack.
2193             break;
2194         }
2195         U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
2196         patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
2197         U_ASSERT(URX_VAL(patOp) == 0);          // Branch target for JMP should not be set.
2198         patOp |= fRXPat->fCompiledPat->size();  // Set it now.
2199         fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
2200         fMatchOpenParen     = patIdx;
2201     }
2202
2203     //  At the close of any parenthesized block, restore the match mode flags  to
2204     //  the value they had at the open paren.  Saved value is
2205     //  at the top of the paren stack.
2206     fModeFlags = fParenStack.popi();
2207     U_ASSERT(fModeFlags < 0);
2208
2209     // DO any additional fixups, depending on the specific kind of
2210     // parentesized grouping this is
2211
2212     switch (patIdx) {
2213     case plain:
2214     case flags:
2215         // No additional fixups required.
2216         //   (Grouping-only parentheses)
2217         break;
2218     case capturing:
2219         // Capturing Parentheses.
2220         //   Insert a End Capture op into the pattern.
2221         //   The frame offset of the variables for this cg is obtained from the
2222         //       start capture op and put it into the end-capture op.
2223         {
2224             int32_t   captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2225             U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
2226
2227             int32_t   frameVarLocation = URX_VAL(captureOp);
2228             appendOp(URX_END_CAPTURE, frameVarLocation);
2229         }
2230         break;
2231     case atomic:
2232         // Atomic Parenthesis.
2233         //   Insert a LD_SP operation to restore the state stack to the position
2234         //   it was when the atomic parens were entered.
2235         {
2236             int32_t   stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2237             U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
2238             int32_t   stoLoc = URX_VAL(stoOp);
2239             appendOp(URX_LD_SP, stoLoc);
2240         }
2241         break;
2242
2243     case lookAhead:
2244         {
2245             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2246             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2247             int32_t dataLoc  = URX_VAL(startOp);
2248             appendOp(URX_LA_END, dataLoc);
2249         }
2250         break;
2251
2252     case negLookAhead:
2253         {
2254             // See comment at doOpenLookAheadNeg
2255             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
2256             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2257             int32_t dataLoc  = URX_VAL(startOp);
2258             appendOp(URX_LA_END, dataLoc);
2259             appendOp(URX_BACKTRACK, 0);
2260             appendOp(URX_LA_END, dataLoc);
2261
2262             // Patch the URX_SAVE near the top of the block.
2263             // The destination of the SAVE is the final LA_END that was just added.
2264             int32_t saveOp   = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
2265             U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
2266             int32_t dest     = fRXPat->fCompiledPat->size()-1;
2267             saveOp           = buildOp(URX_STATE_SAVE, dest);
2268             fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2269         }
2270         break;
2271
2272     case lookBehind:
2273         {
2274             // See comment at doOpenLookBehind.
2275
2276             // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2277             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
2278             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2279             int32_t dataLoc  = URX_VAL(startOp);
2280             appendOp(URX_LB_END, dataLoc);
2281             appendOp(URX_LA_END, dataLoc);
2282
2283             // Determine the min and max bounds for the length of the
2284             //  string that the pattern can match.
2285             //  An unbounded upper limit is an error.
2286             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2287             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2288             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2289             if (URX_TYPE(maxML) != 0) {
2290                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2291                 break;
2292             }
2293             if (maxML == INT32_MAX) {
2294                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2295                 break;
2296             }
2297             U_ASSERT(minML <= maxML);
2298
2299             // Insert the min and max match len bounds into the URX_LB_CONT op that
2300             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2301             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-2);
2302             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-1);
2303
2304         }
2305         break;
2306
2307
2308
2309     case lookBehindN:
2310         {
2311             // See comment at doOpenLookBehindNeg.
2312
2313             // Append the URX_LBN_END to the compiled pattern.
2314             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2315             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2316             int32_t dataLoc  = URX_VAL(startOp);
2317             appendOp(URX_LBN_END, dataLoc);
2318
2319             // Determine the min and max bounds for the length of the
2320             //  string that the pattern can match.
2321             //  An unbounded upper limit is an error.
2322             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2323             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2324             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2325             if (URX_TYPE(maxML) != 0) {
2326                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2327                 break;
2328             }
2329             if (maxML == INT32_MAX) {
2330                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2331                 break;
2332             }
2333             U_ASSERT(minML <= maxML);
2334
2335             // Insert the min and max match len bounds into the URX_LB_CONT op that
2336             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2337             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-3);
2338             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-2);
2339
2340             // Insert the pattern location to continue at after a successful match
2341             //  as the last operand of the URX_LBN_CONT
2342             int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
2343             fRXPat->fCompiledPat->setElementAt(op,  fMatchOpenParen-1);
2344         }
2345         break;
2346
2347
2348
2349     default:
2350         U_ASSERT(FALSE);
2351     }
2352
2353     // remember the next location in the compiled pattern.
2354     // The compilation of Quantifiers will look at this to see whether its looping
2355     //   over a parenthesized block or a single item
2356     fMatchCloseParen = fRXPat->fCompiledPat->size();
2357 }
2358
2359
2360
2361 //------------------------------------------------------------------------------
2362 //
2363 //   compileSet       Compile the pattern operations for a reference to a
2364 //                    UnicodeSet.
2365 //
2366 //------------------------------------------------------------------------------
2367 void        RegexCompile::compileSet(UnicodeSet *theSet)
2368 {
2369     if (theSet == NULL) {
2370         return;
2371     }
2372     //  Remove any strings from the set.
2373     //  There shoudn't be any, but just in case.
2374     //     (Case Closure can add them; if we had a simple case closure avaialble that
2375     //      ignored strings, that would be better.)
2376     theSet->removeAllStrings();
2377     int32_t  setSize = theSet->size();
2378
2379     switch (setSize) {
2380     case 0:
2381         {
2382             // Set of no elements.   Always fails to match.
2383             appendOp(URX_BACKTRACK, 0);
2384             delete theSet;
2385         }
2386         break;
2387
2388     case 1:
2389         {
2390             // The set contains only a single code point.  Put it into
2391             //   the compiled pattern as a single char operation rather
2392             //   than a set, and discard the set itself.
2393             literalChar(theSet->charAt(0));
2394             delete theSet;
2395         }
2396         break;
2397
2398     default:
2399         {
2400             //  The set contains two or more chars.  (the normal case)
2401             //  Put it into the compiled pattern as a set.
2402             int32_t setNumber = fRXPat->fSets->size();
2403             fRXPat->fSets->addElement(theSet, *fStatus);
2404             appendOp(URX_SETREF, setNumber);
2405         }
2406     }
2407 }
2408
2409
2410 //------------------------------------------------------------------------------
2411 //
2412 //   compileInterval    Generate the code for a {min, max} style interval quantifier.
2413 //                      Except for the specific opcodes used, the code is the same
2414 //                      for all three types (greedy, non-greedy, possessive) of
2415 //                      intervals.  The opcodes are supplied as parameters.
2416 //                      (There are two sets of opcodes - greedy & possessive use the
2417 //                      same ones, while non-greedy has it's own.)
2418 //
2419 //                      The code for interval loops has this form:
2420 //                         0  CTR_INIT   counter loc (in stack frame)
2421 //                         1             5  patt address of CTR_LOOP at bottom of block
2422 //                         2             min count
2423 //                         3             max count   (-1 for unbounded)
2424 //                         4  ...        block to be iterated over
2425 //                         5  CTR_LOOP
2426 //
2427 //                       In
2428 //------------------------------------------------------------------------------
2429 void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)
2430 {
2431     // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2432     //   four slots in the compiled code.  Reserve them.
2433     int32_t   topOfBlock = blockTopLoc(TRUE);
2434     insertOp(topOfBlock);
2435     insertOp(topOfBlock);
2436     insertOp(topOfBlock);
2437
2438     // The operands for the CTR_INIT opcode include the index in the matcher data
2439     //   of the counter.  Allocate it now. There are two data items
2440     //        counterLoc   -->  Loop counter
2441     //               +1    -->  Input index (for breaking non-progressing loops)
2442     //                          (Only present if unbounded upper limit on loop)
2443     int32_t   dataSize = fIntervalUpper < 0 ? 2 : 1;
2444     int32_t   counterLoc = allocateStackData(dataSize);
2445
2446     int32_t   op = buildOp(InitOp, counterLoc);
2447     fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2448
2449     // The second operand of CTR_INIT is the location following the end of the loop.
2450     //   Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2451     //   compilation of something later on causes the code to grow and the target
2452     //   position to move.
2453     int32_t loopEnd = fRXPat->fCompiledPat->size();
2454     op = buildOp(URX_RELOC_OPRND, loopEnd);
2455     fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
2456
2457     // Followed by the min and max counts.
2458     fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
2459     fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
2460
2461     // Apend the CTR_LOOP op.  The operand is the location of the CTR_INIT op.
2462     //   Goes at end of the block being looped over, so just append to the code so far.
2463     appendOp(LoopOp, topOfBlock);
2464
2465     if ((fIntervalLow & 0xff000000) != 0 ||
2466         (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
2467             error(U_REGEX_NUMBER_TOO_BIG);
2468         }
2469
2470     if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
2471         error(U_REGEX_MAX_LT_MIN);
2472     }
2473 }
2474
2475
2476
2477 UBool RegexCompile::compileInlineInterval() {
2478     if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
2479         // Too big to inline.  Fail, which will cause looping code to be generated.
2480         //   (Upper < Lower picks up unbounded upper and errors, both.)
2481         return FALSE;
2482     }
2483
2484     int32_t   topOfBlock = blockTopLoc(FALSE);
2485     if (fIntervalUpper == 0) {
2486         // Pathological case.  Attempt no matches, as if the block doesn't exist.
2487         // Discard the generated code for the block.
2488         // If the block included parens, discard the info pertaining to them as well.
2489         fRXPat->fCompiledPat->setSize(topOfBlock);
2490         if (fMatchOpenParen >= topOfBlock) {
2491             fMatchOpenParen = -1;
2492         }
2493         if (fMatchCloseParen >= topOfBlock) {
2494             fMatchCloseParen = -1;
2495         }
2496         return TRUE;
2497     }
2498
2499     if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
2500         // The thing being repeated is not a single op, but some
2501         //   more complex block.  Do it as a loop, not inlines.
2502         //   Note that things "repeated" a max of once are handled as inline, because
2503         //     the one copy of the code already generated is just fine.
2504         return FALSE;
2505     }
2506
2507     // Pick up the opcode that is to be repeated
2508     //
2509     int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
2510
2511     // Compute the pattern location where the inline sequence
2512     //   will end, and set up the state save op that will be needed.
2513     //
2514     int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
2515                                 + fIntervalUpper + (fIntervalUpper-fIntervalLow);
2516     int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
2517     if (fIntervalLow == 0) {
2518         insertOp(topOfBlock);
2519         fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2520     }
2521
2522
2523
2524     //  Loop, emitting the op for the thing being repeated each time.
2525     //    Loop starts at 1 because one instance of the op already exists in the pattern,
2526     //    it was put there when it was originally encountered.
2527     int32_t i;
2528     for (i=1; i<fIntervalUpper; i++ ) {
2529         if (i >= fIntervalLow) {
2530             appendOp(saveOp);
2531         }
2532         appendOp(op);
2533     }
2534     return TRUE;
2535 }
2536
2537
2538
2539 //------------------------------------------------------------------------------
2540 //
2541 //   caseInsensitiveStart  given a single code point from a pattern string, determine the
2542 //                         set of characters that could potentially begin a case-insensitive
2543 //                         match of a string beginning with that character, using full Unicode
2544 //                         case insensitive matching.
2545 //
2546 //          This is used in optimizing find().
2547 //
2548 //          closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2549 //          misses cases like this:
2550 //             A string from the pattern begins with 'ss' (although all we know
2551 //                 in this context is that it begins with 's')
2552 //             The pattern could match a string beginning with a German sharp-s
2553 //
2554 //           To the ordinary case closure for a character c, we add all other
2555 //           characters cx where the case closure of cx incudes a string form that begins
2556 //           with the original character c.
2557 //
2558 //           This function could be made smarter. The full pattern string is available
2559 //           and it would be possible to verify that the extra characters being added
2560 //           to the starting set fully match, rather than having just a first-char of the
2561 //           folded form match.
2562 //
2563 //------------------------------------------------------------------------------
2564 void  RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
2565
2566 // Machine Generated below.
2567 // It may need updating with new versions of Unicode.
2568 // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2569 // The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
2570
2571 // Machine Generated Data. Do not hand edit.
2572     static const UChar32 RECaseFixCodePoints[] = {
2573         0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc,
2574         0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565,
2575         0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07,
2576         0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61,
2577         0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
2578
2579     static const int16_t RECaseFixStringOffsets[] = {
2580         0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10,
2581         0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f,
2582         0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43,
2583         0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57,
2584         0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
2585
2586     static const int16_t RECaseFixCounts[] = {
2587         0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1,
2588         0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1,
2589         0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2590         0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2591         0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
2592
2593     static const UChar RECaseFixData[] = {
2594         0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf,
2595         0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3,
2596         0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3,
2597         0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3,
2598         0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14,
2599         0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83,
2600         0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90,
2601         0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95,
2602         0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2,
2603         0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7,
2604         0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
2605
2606 // End of machine generated data.
2607
2608     if (c < UCHAR_MIN_VALUE || c > UCHAR_MAX_VALUE) {
2609         // This function should never be called with an invalid input character.
2610         U_ASSERT(FALSE);
2611         starterChars->clear();
2612     } else if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2613         UChar32 caseFoldedC  = u_foldCase(c, U_FOLD_CASE_DEFAULT);
2614         starterChars->set(caseFoldedC, caseFoldedC);
2615
2616         int32_t i;
2617         for (i=0; RECaseFixCodePoints[i]<c ; i++) {
2618             // Simple linear search through the sorted list of interesting code points.
2619         }
2620
2621         if (RECaseFixCodePoints[i] == c) {
2622             int32_t dataIndex = RECaseFixStringOffsets[i];
2623             int32_t numCharsToAdd = RECaseFixCounts[i];
2624             UChar32 cpToAdd = 0;
2625             for (int32_t j=0; j<numCharsToAdd; j++) {
2626                 U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
2627                 starterChars->add(cpToAdd);
2628             }
2629         }
2630
2631         starterChars->closeOver(USET_CASE_INSENSITIVE);
2632         starterChars->removeAllStrings();
2633     } else {
2634         // Not a cased character. Just return it alone.
2635         starterChars->set(c, c);
2636     }
2637 }
2638
2639
2640 // Increment with overflow check.
2641 // val and delta will both be positive.
2642
2643 static int32_t safeIncrement(int32_t val, int32_t delta) {
2644     if (INT32_MAX - val > delta) {
2645         return val + delta;
2646     } else {
2647         return INT32_MAX;
2648     }
2649 }
2650
2651
2652 //------------------------------------------------------------------------------
2653 //
2654 //   matchStartType    Determine how a match can start.
2655 //                     Used to optimize find() operations.
2656 //
2657 //                     Operation is very similar to minMatchLength().  Walk the compiled
2658 //                     pattern, keeping an on-going minimum-match-length.  For any
2659 //                     op where the min match coming in is zero, add that ops possible
2660 //                     starting matches to the possible starts for the overall pattern.
2661 //
2662 //------------------------------------------------------------------------------
2663 void   RegexCompile::matchStartType() {
2664     if (U_FAILURE(*fStatus)) {
2665         return;
2666     }
2667
2668
2669     int32_t    loc;                    // Location in the pattern of the current op being processed.
2670     int32_t    op;                     // The op being processed
2671     int32_t    opType;                 // The opcode type of the op
2672     int32_t    currentLen = 0;         // Minimum length of a match to this point (loc) in the pattern
2673     int32_t    numInitialStrings = 0;  // Number of strings encountered that could match at start.
2674
2675     UBool      atStart = TRUE;         // True if no part of the pattern yet encountered
2676                                        //   could have advanced the position in a match.
2677                                        //   (Maximum match length so far == 0)
2678
2679     // forwardedLength is a vector holding minimum-match-length values that
2680     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
2681     //   It must be one longer than the pattern being checked because some  ops
2682     //   will jmp to a end-of-block+1 location from within a block, and we must
2683     //   count those when checking the block.
2684     int32_t end = fRXPat->fCompiledPat->size();
2685     UVector32  forwardedLength(end+1, *fStatus);
2686     forwardedLength.setSize(end+1);
2687     for (loc=3; loc<end; loc++) {
2688         forwardedLength.setElementAt(INT32_MAX, loc);
2689     }
2690
2691     for (loc = 3; loc<end; loc++) {
2692         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2693         opType = URX_TYPE(op);
2694
2695         // The loop is advancing linearly through the pattern.
2696         // If the op we are now at was the destination of a branch in the pattern,
2697         // and that path has a shorter minimum length than the current accumulated value,
2698         // replace the current accumulated value.
2699         if (forwardedLength.elementAti(loc) < currentLen) {
2700             currentLen = forwardedLength.elementAti(loc);
2701             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2702         }
2703
2704         switch (opType) {
2705             // Ops that don't change the total length matched
2706         case URX_RESERVED_OP:
2707         case URX_END:
2708         case URX_FAIL:
2709         case URX_STRING_LEN:
2710         case URX_NOP:
2711         case URX_START_CAPTURE:
2712         case URX_END_CAPTURE:
2713         case URX_BACKSLASH_B:
2714         case URX_BACKSLASH_BU:
2715         case URX_BACKSLASH_G:
2716         case URX_BACKSLASH_Z:
2717         case URX_DOLLAR:
2718         case URX_DOLLAR_M:
2719         case URX_DOLLAR_D:
2720         case URX_DOLLAR_MD:
2721         case URX_RELOC_OPRND:
2722         case URX_STO_INP_LOC:
2723         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
2724         case URX_BACKREF_I:
2725
2726         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
2727         case URX_LD_SP:
2728             break;
2729
2730         case URX_CARET:
2731             if (atStart) {
2732                 fRXPat->fStartType = START_START;
2733             }
2734             break;
2735
2736         case URX_CARET_M:
2737         case URX_CARET_M_UNIX:
2738             if (atStart) {
2739                 fRXPat->fStartType = START_LINE;
2740             }
2741             break;
2742
2743         case URX_ONECHAR:
2744             if (currentLen == 0) {
2745                 // This character could appear at the start of a match.
2746                 //   Add it to the set of possible starting characters.
2747                 fRXPat->fInitialChars->add(URX_VAL(op));
2748                 numInitialStrings += 2;
2749             }
2750             currentLen = safeIncrement(currentLen, 1);
2751             atStart = FALSE;
2752             break;
2753
2754
2755         case URX_SETREF:
2756             if (currentLen == 0) {
2757                 int32_t  sn = URX_VAL(op);
2758                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2759                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2760                 fRXPat->fInitialChars->addAll(*s);
2761                 numInitialStrings += 2;
2762             }
2763             currentLen = safeIncrement(currentLen, 1);
2764             atStart = FALSE;
2765             break;
2766
2767         case URX_LOOP_SR_I:
2768             // [Set]*, like a SETREF, above, in what it can match,
2769             //  but may not match at all, so currentLen is not incremented.
2770             if (currentLen == 0) {
2771                 int32_t  sn = URX_VAL(op);
2772                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2773                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2774                 fRXPat->fInitialChars->addAll(*s);
2775                 numInitialStrings += 2;
2776             }
2777             atStart = FALSE;
2778             break;
2779
2780         case URX_LOOP_DOT_I:
2781             if (currentLen == 0) {
2782                 // .* at the start of a pattern.
2783                 //    Any character can begin the match.
2784                 fRXPat->fInitialChars->clear();
2785                 fRXPat->fInitialChars->complement();
2786                 numInitialStrings += 2;
2787             }
2788             atStart = FALSE;
2789             break;
2790
2791
2792         case URX_STATIC_SETREF:
2793             if (currentLen == 0) {
2794                 int32_t  sn = URX_VAL(op);
2795                 U_ASSERT(sn>0 && sn<URX_LAST_SET);
2796                 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2797                 fRXPat->fInitialChars->addAll(*s);
2798                 numInitialStrings += 2;
2799             }
2800             currentLen = safeIncrement(currentLen, 1);
2801             atStart = FALSE;
2802             break;
2803
2804
2805
2806         case URX_STAT_SETREF_N:
2807             if (currentLen == 0) {
2808                 int32_t  sn = URX_VAL(op);
2809                 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2810                 UnicodeSet sc(*s);
2811                 sc.complement();
2812                 fRXPat->fInitialChars->addAll(sc);
2813                 numInitialStrings += 2;
2814             }
2815             currentLen = safeIncrement(currentLen, 1);
2816             atStart = FALSE;
2817             break;
2818
2819
2820
2821         case URX_BACKSLASH_D:
2822             // Digit Char
2823              if (currentLen == 0) {
2824                  UnicodeSet s;
2825                  s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
2826                  if (URX_VAL(op) != 0) {
2827                      s.complement();
2828                  }
2829                  fRXPat->fInitialChars->addAll(s);
2830                  numInitialStrings += 2;
2831             }
2832             currentLen = safeIncrement(currentLen, 1);
2833             atStart = FALSE;
2834             break;
2835
2836
2837         case URX_BACKSLASH_H:
2838             // Horiz white space
2839             if (currentLen == 0) {
2840                 UnicodeSet s;
2841                 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
2842                 s.add((UChar32)9);   // Tab
2843                 if (URX_VAL(op) != 0) {
2844                     s.complement();
2845                 }
2846                 fRXPat->fInitialChars->addAll(s);
2847                 numInitialStrings += 2;
2848             }
2849             currentLen = safeIncrement(currentLen, 1);
2850             atStart = FALSE;
2851             break;
2852
2853
2854         case URX_BACKSLASH_R:       // Any line ending sequence
2855         case URX_BACKSLASH_V:       // Any line ending code point, with optional negation
2856             if (currentLen == 0) {
2857                 UnicodeSet s;
2858                 s.add((UChar32)0x0a, (UChar32)0x0d);  // add range
2859                 s.add((UChar32)0x85);
2860                 s.add((UChar32)0x2028, (UChar32)0x2029);
2861                 if (URX_VAL(op) != 0) {
2862                      // Complement option applies to URX_BACKSLASH_V only.
2863                      s.complement();
2864                 }
2865                 fRXPat->fInitialChars->addAll(s);
2866                 numInitialStrings += 2;
2867             }
2868             currentLen = safeIncrement(currentLen, 1);
2869             atStart = FALSE;
2870             break;
2871
2872
2873
2874         case URX_ONECHAR_I:
2875             // Case Insensitive Single Character.
2876             if (currentLen == 0) {
2877                 UChar32  c = URX_VAL(op);
2878                 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2879                     UnicodeSet starters(c, c);
2880                     starters.closeOver(USET_CASE_INSENSITIVE);
2881                     // findCaseInsensitiveStarters(c, &starters);
2882                     //   For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2883                     //   The expanded folding can't match the pattern.
2884                     fRXPat->fInitialChars->addAll(starters);
2885                 } else {
2886                     // Char has no case variants.  Just add it as-is to the
2887                     //   set of possible starting chars.
2888                     fRXPat->fInitialChars->add(c);
2889                 }
2890                 numInitialStrings += 2;
2891             }
2892             currentLen = safeIncrement(currentLen, 1);
2893             atStart = FALSE;
2894             break;
2895
2896
2897         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
2898         case URX_DOTANY_ALL:    // . matches one or two.
2899         case URX_DOTANY:
2900         case URX_DOTANY_UNIX:
2901             if (currentLen == 0) {
2902                 // These constructs are all bad news when they appear at the start
2903                 //   of a match.  Any character can begin the match.
2904                 fRXPat->fInitialChars->clear();
2905                 fRXPat->fInitialChars->complement();
2906                 numInitialStrings += 2;
2907             }
2908             currentLen = safeIncrement(currentLen, 1);
2909             atStart = FALSE;
2910             break;
2911
2912
2913         case URX_JMPX:
2914             loc++;             // Except for extra operand on URX_JMPX, same as URX_JMP.
2915             U_FALLTHROUGH;
2916         case URX_JMP:
2917             {
2918                 int32_t  jmpDest = URX_VAL(op);
2919                 if (jmpDest < loc) {
2920                     // Loop of some kind.  Can safely ignore, the worst that will happen
2921                     //  is that we understate the true minimum length
2922                     currentLen = forwardedLength.elementAti(loc+1);
2923
2924                 } else {
2925                     // Forward jump.  Propagate the current min length to the target loc of the jump.
2926                     U_ASSERT(jmpDest <= end+1);
2927                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
2928                         forwardedLength.setElementAt(currentLen, jmpDest);
2929                     }
2930                 }
2931             }
2932             atStart = FALSE;
2933             break;
2934
2935         case URX_JMP_SAV:
2936         case URX_JMP_SAV_X:
2937             // Combo of state save to the next loc, + jmp backwards.
2938             //   Net effect on min. length computation is nothing.
2939             atStart = FALSE;
2940             break;
2941
2942         case URX_BACKTRACK:
2943             // Fails are kind of like a branch, except that the min length was
2944             //   propagated already, by the state save.
2945             currentLen = forwardedLength.elementAti(loc+1);
2946             atStart = FALSE;
2947             break;
2948
2949
2950         case URX_STATE_SAVE:
2951             {
2952                 // State Save, for forward jumps, propagate the current minimum.
2953                 //             of the state save.
2954                 int32_t  jmpDest = URX_VAL(op);
2955                 if (jmpDest > loc) {
2956                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
2957                         forwardedLength.setElementAt(currentLen, jmpDest);
2958                     }
2959                 }
2960             }
2961             atStart = FALSE;
2962             break;
2963
2964
2965
2966
2967         case URX_STRING:
2968             {
2969                 loc++;
2970                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2971                 int32_t stringLen   = URX_VAL(stringLenOp);
2972                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2973                 U_ASSERT(stringLenOp >= 2);
2974                 if (currentLen == 0) {
2975                     // Add the starting character of this string to the set of possible starting
2976                     //   characters for this pattern.
2977                     int32_t stringStartIdx = URX_VAL(op);
2978                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
2979                     fRXPat->fInitialChars->add(c);
2980
2981                     // Remember this string.  After the entire pattern has been checked,
2982                     //  if nothing else is identified that can start a match, we'll use it.
2983                     numInitialStrings++;
2984                     fRXPat->fInitialStringIdx = stringStartIdx;
2985                     fRXPat->fInitialStringLen = stringLen;
2986                 }
2987
2988                 currentLen = safeIncrement(currentLen, stringLen);
2989                 atStart = FALSE;
2990             }
2991             break;
2992
2993         case URX_STRING_I:
2994             {
2995                 // Case-insensitive string.  Unlike exact-match strings, we won't
2996                 //   attempt a string search for possible match positions.  But we
2997                 //   do update the set of possible starting characters.
2998                 loc++;
2999                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3000                 int32_t stringLen   = URX_VAL(stringLenOp);
3001                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
3002                 U_ASSERT(stringLenOp >= 2);
3003                 if (currentLen == 0) {
3004                     // Add the starting character of this string to the set of possible starting
3005                     //   characters for this pattern.
3006                     int32_t stringStartIdx = URX_VAL(op);
3007                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
3008                     UnicodeSet s;
3009                     findCaseInsensitiveStarters(c, &s);
3010                     fRXPat->fInitialChars->addAll(s);
3011                     numInitialStrings += 2;  // Matching on an initial string not possible.
3012                 }
3013                 currentLen = safeIncrement(currentLen, stringLen);
3014                 atStart = FALSE;
3015             }
3016             break;
3017
3018         case URX_CTR_INIT:
3019         case URX_CTR_INIT_NG:
3020             {
3021                 // Loop Init Ops.  These don't change the min length, but they are 4 word ops
3022                 //   so location must be updated accordingly.
3023                 // Loop Init Ops.
3024                 //   If the min loop count == 0
3025                 //      move loc forwards to the end of the loop, skipping over the body.
3026                 //   If the min count is > 0,
3027                 //      continue normal processing of the body of the loop.
3028                 int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3029                         loopEndLoc   = URX_VAL(loopEndLoc);
3030                 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3031                 if (minLoopCount == 0) {
3032                     // Min Loop Count of 0, treat like a forward branch and
3033                     //   move the current minimum length up to the target
3034                     //   (end of loop) location.
3035                     U_ASSERT(loopEndLoc <= end+1);
3036                     if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
3037                         forwardedLength.setElementAt(currentLen, loopEndLoc);
3038                     }
3039                 }
3040                 loc+=3;  // Skips over operands of CTR_INIT
3041             }
3042             atStart = FALSE;
3043             break;
3044
3045
3046         case URX_CTR_LOOP:
3047         case URX_CTR_LOOP_NG:
3048             // Loop ops.
3049             //  The jump is conditional, backwards only.
3050             atStart = FALSE;
3051             break;
3052
3053         case URX_LOOP_C:
3054             // More loop ops.  These state-save to themselves.
3055             //   don't change the minimum match
3056             atStart = FALSE;
3057             break;
3058
3059
3060         case URX_LA_START:
3061         case URX_LB_START:
3062             {
3063                 // Look-around.  Scan forward until the matching look-ahead end,
3064                 //   without processing the look-around block.  This is overly pessimistic.
3065
3066                 // Keep track of the nesting depth of look-around blocks.  Boilerplate code for
3067                 //   lookahead contains two LA_END instructions, so count goes up by two
3068                 //   for each LA_START.
3069                 int32_t  depth = (opType == URX_LA_START? 2: 1);
3070                 for (;;) {
3071                     loc++;
3072                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3073                     if (URX_TYPE(op) == URX_LA_START) {
3074                         depth+=2;
3075                     }
3076                     if (URX_TYPE(op) == URX_LB_START) {
3077                         depth++;
3078                     }
3079                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3080                         depth--;
3081                         if (depth == 0) {
3082                             break;
3083                         }
3084                     }
3085                     if (URX_TYPE(op) == URX_STATE_SAVE) {
3086                         // Need this because neg lookahead blocks will FAIL to outside
3087                         //   of the block.
3088                         int32_t  jmpDest = URX_VAL(op);
3089                         if (jmpDest > loc) {
3090                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
3091                                 forwardedLength.setElementAt(currentLen, jmpDest);
3092                             }
3093                         }
3094                     }
3095                     U_ASSERT(loc <= end);
3096                 }
3097             }
3098             break;
3099
3100         case URX_LA_END:
3101         case URX_LB_CONT:
3102         case URX_LB_END:
3103         case URX_LBN_CONT:
3104         case URX_LBN_END:
3105             U_ASSERT(FALSE);     // Shouldn't get here.  These ops should be
3106                                  //  consumed by the scan in URX_LA_START and LB_START
3107
3108             break;
3109
3110         default:
3111             U_ASSERT(FALSE);
3112             }
3113
3114         }
3115
3116
3117     // We have finished walking through the ops.  Check whether some forward jump
3118     //   propagated a shorter length to location end+1.
3119     if (forwardedLength.elementAti(end+1) < currentLen) {
3120         currentLen = forwardedLength.elementAti(end+1);
3121     }
3122
3123
3124     fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
3125
3126
3127     // Sort out what we should check for when looking for candidate match start positions.
3128     // In order of preference,
3129     //     1.   Start of input text buffer.
3130     //     2.   A literal string.
3131     //     3.   Start of line in multi-line mode.
3132     //     4.   A single literal character.
3133     //     5.   A character from a set of characters.
3134     //
3135     if (fRXPat->fStartType == START_START) {
3136         // Match only at the start of an input text string.
3137         //    start type is already set.  We're done.
3138     } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
3139         // Match beginning only with a literal string.
3140         UChar32  c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
3141         U_ASSERT(fRXPat->fInitialChars->contains(c));
3142         fRXPat->fStartType   = START_STRING;
3143         fRXPat->fInitialChar = c;
3144     } else if (fRXPat->fStartType == START_LINE) {
3145         // Match at start of line in Multi-Line mode.
3146         // Nothing to do here; everything is already set.
3147     } else if (fRXPat->fMinMatchLen == 0) {
3148         // Zero length match possible.  We could start anywhere.
3149         fRXPat->fStartType = START_NO_INFO;
3150     } else if (fRXPat->fInitialChars->size() == 1) {
3151         // All matches begin with the same char.
3152         fRXPat->fStartType   = START_CHAR;
3153         fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
3154         U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
3155     } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE &&
3156         fRXPat->fMinMatchLen > 0) {
3157         // Matches start with a set of character smaller than the set of all chars.
3158         fRXPat->fStartType = START_SET;
3159     } else {
3160         // Matches can start with anything
3161         fRXPat->fStartType = START_NO_INFO;
3162     }
3163
3164     return;
3165 }
3166
3167
3168
3169 //------------------------------------------------------------------------------
3170 //
3171 //   minMatchLength    Calculate the length of the shortest string that could
3172 //                     match the specified pattern.
3173 //                     Length is in 16 bit code units, not code points.
3174 //
3175 //                     The calculated length may not be exact.  The returned
3176 //                     value may be shorter than the actual minimum; it must
3177 //                     never be longer.
3178 //
3179 //                     start and end are the range of p-code operations to be
3180 //                     examined.  The endpoints are included in the range.
3181 //
3182 //------------------------------------------------------------------------------
3183 int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
3184     if (U_FAILURE(*fStatus)) {
3185         return 0;
3186     }
3187
3188     U_ASSERT(start <= end);
3189     U_ASSERT(end < fRXPat->fCompiledPat->size());
3190
3191
3192     int32_t    loc;
3193     int32_t    op;
3194     int32_t    opType;
3195     int32_t    currentLen = 0;
3196
3197
3198     // forwardedLength is a vector holding minimum-match-length values that
3199     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
3200     //   It must be one longer than the pattern being checked because some  ops
3201     //   will jmp to a end-of-block+1 location from within a block, and we must
3202     //   count those when checking the block.
3203     UVector32  forwardedLength(end+2, *fStatus);
3204     forwardedLength.setSize(end+2);
3205     for (loc=start; loc<=end+1; loc++) {
3206         forwardedLength.setElementAt(INT32_MAX, loc);
3207     }
3208
3209     for (loc = start; loc<=end; loc++) {
3210         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3211         opType = URX_TYPE(op);
3212
3213         // The loop is advancing linearly through the pattern.
3214         // If the op we are now at was the destination of a branch in the pattern,
3215         // and that path has a shorter minimum length than the current accumulated value,
3216         // replace the current accumulated value.
3217         // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);  // MinLength == INT32_MAX for some
3218                                                                //   no-match-possible cases.
3219         if (forwardedLength.elementAti(loc) < currentLen) {
3220             currentLen = forwardedLength.elementAti(loc);
3221             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3222         }
3223
3224         switch (opType) {
3225             // Ops that don't change the total length matched
3226         case URX_RESERVED_OP:
3227         case URX_END:
3228         case URX_STRING_LEN:
3229         case URX_NOP:
3230         case URX_START_CAPTURE:
3231         case URX_END_CAPTURE:
3232         case URX_BACKSLASH_B:
3233         case URX_BACKSLASH_BU:
3234         case URX_BACKSLASH_G:
3235         case URX_BACKSLASH_Z:
3236         case URX_CARET:
3237         case URX_DOLLAR:
3238         case URX_DOLLAR_M:
3239         case URX_DOLLAR_D:
3240         case URX_DOLLAR_MD:
3241         case URX_RELOC_OPRND:
3242         case URX_STO_INP_LOC:
3243         case URX_CARET_M:
3244         case URX_CARET_M_UNIX:
3245         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3246         case URX_BACKREF_I:
3247
3248         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3249         case URX_LD_SP:
3250
3251         case URX_JMP_SAV:
3252         case URX_JMP_SAV_X:
3253             break;
3254
3255
3256             // Ops that match a minimum of one character (one or two 16 bit code units.)
3257             //
3258         case URX_ONECHAR:
3259         case URX_STATIC_SETREF:
3260         case URX_STAT_SETREF_N:
3261         case URX_SETREF:
3262         case URX_BACKSLASH_D:
3263         case URX_BACKSLASH_H:
3264         case URX_BACKSLASH_R:
3265         case URX_BACKSLASH_V:
3266         case URX_ONECHAR_I:
3267         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
3268         case URX_DOTANY_ALL:    // . matches one or two.
3269         case URX_DOTANY:
3270         case URX_DOTANY_UNIX:
3271             currentLen = safeIncrement(currentLen, 1);
3272             break;
3273
3274
3275         case URX_JMPX:
3276             loc++;              // URX_JMPX has an extra operand, ignored here,
3277                                 //   otherwise processed identically to URX_JMP.
3278             U_FALLTHROUGH;
3279         case URX_JMP:
3280             {
3281                 int32_t  jmpDest = URX_VAL(op);
3282                 if (jmpDest < loc) {
3283                     // Loop of some kind.  Can safely ignore, the worst that will happen
3284                     //  is that we understate the true minimum length
3285                     currentLen = forwardedLength.elementAti(loc+1);
3286                 } else {
3287                     // Forward jump.  Propagate the current min length to the target loc of the jump.
3288                     U_ASSERT(jmpDest <= end+1);
3289                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
3290                         forwardedLength.setElementAt(currentLen, jmpDest);
3291                     }
3292                 }
3293             }
3294             break;
3295
3296         case URX_BACKTRACK:
3297             {
3298                 // Back-tracks are kind of like a branch, except that the min length was
3299                 //   propagated already, by the state save.
3300                 currentLen = forwardedLength.elementAti(loc+1);
3301             }
3302             break;
3303
3304
3305         case URX_STATE_SAVE:
3306             {
3307                 // State Save, for forward jumps, propagate the current minimum.
3308                 //             of the state save.
3309                 int32_t  jmpDest = URX_VAL(op);
3310                 if (jmpDest > loc) {
3311                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
3312                         forwardedLength.setElementAt(currentLen, jmpDest);
3313                     }
3314                 }
3315             }
3316             break;
3317
3318
3319         case URX_STRING:
3320             {
3321                 loc++;
3322                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3323                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3324             }
3325             break;
3326
3327
3328         case URX_STRING_I:
3329             {
3330                 loc++;
3331                 // TODO: with full case folding, matching input text may be shorter than
3332                 //       the string we have here.  More smarts could put some bounds on it.
3333                 //       Assume a min length of one for now.  A min length of zero causes
3334                 //        optimization failures for a pattern like "string"+
3335                 // currentLen += URX_VAL(stringLenOp);
3336                 currentLen = safeIncrement(currentLen, 1);
3337             }
3338             break;
3339
3340         case URX_CTR_INIT:
3341         case URX_CTR_INIT_NG:
3342             {
3343                 // Loop Init Ops.
3344                 //   If the min loop count == 0
3345                 //      move loc forwards to the end of the loop, skipping over the body.
3346                 //   If the min count is > 0,
3347                 //      continue normal processing of the body of the loop.
3348                 int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3349                         loopEndLoc   = URX_VAL(loopEndLoc);
3350                 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3351                 if (minLoopCount == 0) {
3352                     loc = loopEndLoc;
3353                 } else {
3354                     loc+=3;  // Skips over operands of CTR_INIT
3355                 }
3356             }
3357             break;
3358
3359
3360         case URX_CTR_LOOP:
3361         case URX_CTR_LOOP_NG:
3362             // Loop ops.
3363             //  The jump is conditional, backwards only.
3364             break;
3365
3366         case URX_LOOP_SR_I:
3367         case URX_LOOP_DOT_I:
3368         case URX_LOOP_C:
3369             // More loop ops.  These state-save to themselves.
3370             //   don't change the minimum match - could match nothing at all.
3371             break;
3372
3373
3374         case URX_LA_START:
3375         case URX_LB_START:
3376             {
3377                 // Look-around.  Scan forward until the matching look-ahead end,
3378                 //   without processing the look-around block.  This is overly pessimistic for look-ahead,
3379                 //   it assumes that the look-ahead match might be zero-length.
3380                 //   TODO:  Positive lookahead could recursively do the block, then continue
3381                 //          with the longer of the block or the value coming in.  Ticket 6060
3382                 int32_t  depth = (opType == URX_LA_START? 2: 1);;
3383                 for (;;) {
3384                     loc++;
3385                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3386                     if (URX_TYPE(op) == URX_LA_START) {
3387                         // The boilerplate for look-ahead includes two LA_END insturctions,
3388                         //    Depth will be decremented by each one when it is seen.
3389                         depth += 2;
3390                     }
3391                     if (URX_TYPE(op) == URX_LB_START) {
3392                         depth++;
3393                     }
3394                     if (URX_TYPE(op) == URX_LA_END) {
3395                         depth--;
3396                         if (depth == 0) {
3397                             break;
3398                         }
3399                     }
3400                     if (URX_TYPE(op)==URX_LBN_END) {
3401                         depth--;
3402                         if (depth == 0) {
3403                             break;
3404                         }
3405                     }
3406                     if (URX_TYPE(op) == URX_STATE_SAVE) {
3407                         // Need this because neg lookahead blocks will FAIL to outside
3408                         //   of the block.
3409                         int32_t  jmpDest = URX_VAL(op);
3410                         if (jmpDest > loc) {
3411                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
3412                                 forwardedLength.setElementAt(currentLen, jmpDest);
3413                             }
3414                         }
3415                     }
3416                     U_ASSERT(loc <= end);
3417                 }
3418             }
3419             break;
3420
3421         case URX_LA_END:
3422         case URX_LB_CONT:
3423         case URX_LB_END:
3424         case URX_LBN_CONT:
3425         case URX_LBN_END:
3426             // Only come here if the matching URX_LA_START or URX_LB_START was not in the
3427             //   range being sized, which happens when measuring size of look-behind blocks.
3428             break;
3429
3430         default:
3431             U_ASSERT(FALSE);
3432             }
3433
3434         }
3435
3436     // We have finished walking through the ops.  Check whether some forward jump
3437     //   propagated a shorter length to location end+1.
3438     if (forwardedLength.elementAti(end+1) < currentLen) {
3439         currentLen = forwardedLength.elementAti(end+1);
3440         U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3441     }
3442
3443     return currentLen;
3444 }
3445
3446 //------------------------------------------------------------------------------
3447 //
3448 //   maxMatchLength    Calculate the length of the longest string that could
3449 //                     match the specified pattern.
3450 //                     Length is in 16 bit code units, not code points.
3451 //
3452 //                     The calculated length may not be exact.  The returned
3453 //                     value may be longer than the actual maximum; it must
3454 //                     never be shorter.
3455 //
3456 //------------------------------------------------------------------------------
3457 int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
3458     if (U_FAILURE(*fStatus)) {
3459         return 0;
3460     }
3461     U_ASSERT(start <= end);
3462     U_ASSERT(end < fRXPat->fCompiledPat->size());
3463
3464
3465     int32_t    loc;
3466     int32_t    op;
3467     int32_t    opType;
3468     int32_t    currentLen = 0;
3469     UVector32  forwardedLength(end+1, *fStatus);
3470     forwardedLength.setSize(end+1);
3471
3472     for (loc=start; loc<=end; loc++) {
3473         forwardedLength.setElementAt(0, loc);
3474     }
3475
3476     for (loc = start; loc<=end; loc++) {
3477         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3478         opType = URX_TYPE(op);
3479
3480         // The loop is advancing linearly through the pattern.
3481         // If the op we are now at was the destination of a branch in the pattern,
3482         // and that path has a longer maximum length than the current accumulated value,
3483         // replace the current accumulated value.
3484         if (forwardedLength.elementAti(loc) > currentLen) {
3485             currentLen = forwardedLength.elementAti(loc);
3486         }
3487
3488         switch (opType) {
3489             // Ops that don't change the total length matched
3490         case URX_RESERVED_OP:
3491         case URX_END:
3492         case URX_STRING_LEN:
3493         case URX_NOP:
3494         case URX_START_CAPTURE:
3495         case URX_END_CAPTURE:
3496         case URX_BACKSLASH_B:
3497         case URX_BACKSLASH_BU:
3498         case URX_BACKSLASH_G:
3499         case URX_BACKSLASH_Z:
3500         case URX_CARET:
3501         case URX_DOLLAR:
3502         case URX_DOLLAR_M:
3503         case URX_DOLLAR_D:
3504         case URX_DOLLAR_MD:
3505         case URX_RELOC_OPRND:
3506         case URX_STO_INP_LOC:
3507         case URX_CARET_M:
3508         case URX_CARET_M_UNIX:
3509
3510         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3511         case URX_LD_SP:
3512
3513         case URX_LB_END:
3514         case URX_LB_CONT:
3515         case URX_LBN_CONT:
3516         case URX_LBN_END:
3517             break;
3518
3519
3520             // Ops that increase that cause an unbounded increase in the length
3521             //   of a matched string, or that increase it a hard to characterize way.
3522             //   Call the max length unbounded, and stop further checking.
3523         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3524         case URX_BACKREF_I:
3525         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
3526             currentLen = INT32_MAX;
3527             break;
3528
3529
3530             // Ops that match a max of one character (possibly two 16 bit code units.)
3531             //
3532         case URX_STATIC_SETREF:
3533         case URX_STAT_SETREF_N:
3534         case URX_SETREF:
3535         case URX_BACKSLASH_D:
3536         case URX_BACKSLASH_H:
3537         case URX_BACKSLASH_R:
3538         case URX_BACKSLASH_V:
3539         case URX_ONECHAR_I:
3540         case URX_DOTANY_ALL:
3541         case URX_DOTANY:
3542         case URX_DOTANY_UNIX:
3543             currentLen = safeIncrement(currentLen, 2);
3544             break;
3545
3546             // Single literal character.  Increase current max length by one or two,
3547             //       depending on whether the char is in the supplementary range.
3548         case URX_ONECHAR:
3549             currentLen = safeIncrement(currentLen, 1);
3550             if (URX_VAL(op) > 0x10000) {
3551                 currentLen = safeIncrement(currentLen, 1);
3552             }
3553             break;
3554
3555             // Jumps.
3556             //
3557         case URX_JMP:
3558         case URX_JMPX:
3559         case URX_JMP_SAV:
3560         case URX_JMP_SAV_X:
3561             {
3562                 int32_t  jmpDest = URX_VAL(op);
3563                 if (jmpDest < loc) {
3564                     // Loop of some kind.  Max match length is unbounded.
3565                     currentLen = INT32_MAX;
3566                 } else {
3567                     // Forward jump.  Propagate the current min length to the target loc of the jump.
3568                     if (forwardedLength.elementAti(jmpDest) < currentLen) {
3569                         forwardedLength.setElementAt(currentLen, jmpDest);
3570                     }
3571                     currentLen = 0;
3572                 }
3573             }
3574             break;
3575
3576         case URX_BACKTRACK:
3577             // back-tracks are kind of like a branch, except that the max length was
3578             //   propagated already, by the state save.
3579             currentLen = forwardedLength.elementAti(loc+1);
3580             break;
3581
3582
3583         case URX_STATE_SAVE:
3584             {
3585                 // State Save, for forward jumps, propagate the current minimum.
3586                 //               of the state save.
3587                 //             For backwards jumps, they create a loop, maximum
3588                 //               match length is unbounded.
3589                 int32_t  jmpDest = URX_VAL(op);
3590                 if (jmpDest > loc) {
3591                     if (currentLen > forwardedLength.elementAti(jmpDest)) {
3592                         forwardedLength.setElementAt(currentLen, jmpDest);
3593                     }
3594                 } else {
3595                     currentLen = INT32_MAX;
3596                 }
3597             }
3598             break;
3599
3600
3601
3602
3603         case URX_STRING:
3604             {
3605                 loc++;
3606                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3607                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3608                 break;
3609             }
3610
3611         case URX_STRING_I:
3612             // TODO:  This code assumes that any user string that matches will be no longer
3613             //        than our compiled string, with case insensitive matching.
3614             //        Our compiled string has been case-folded already.
3615             //
3616             //        Any matching user string will have no more code points than our
3617             //        compiled (folded) string.  Folding may add code points, but
3618             //        not remove them.
3619             //
3620             //        There is a potential problem if a supplemental code point
3621             //        case-folds to a BMP code point.  In this case our compiled string
3622             //        could be shorter (in code units) than a matching user string.
3623             //
3624             //        At this time (Unicode 6.1) there are no such characters, and this case
3625             //        is not being handled.  A test, intltest regex/Bug9283, will fail if
3626             //        any problematic characters are added to Unicode.
3627             //
3628             //        If this happens, we can make a set of the BMP chars that the
3629             //        troublesome supplementals fold to, scan our string, and bump the
3630             //        currentLen one extra for each that is found.
3631             //
3632             {
3633                 loc++;
3634                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3635                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3636             }
3637             break;
3638
3639         case URX_CTR_INIT:
3640         case URX_CTR_INIT_NG:
3641             // For Loops, recursively call this function on the pattern for the loop body,
3642             //   then multiply the result by the maximum loop count.
3643             {
3644                 int32_t  loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
3645                 if (loopEndLoc == loc+4) {
3646                     // Loop has an empty body. No affect on max match length.
3647                     // Continue processing with code after the loop end.
3648                     loc = loopEndLoc;
3649                     break;
3650                 }
3651
3652                 int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3));
3653                 if (maxLoopCount == -1) {
3654                     // Unbounded Loop. No upper bound on match length.
3655                     currentLen = INT32_MAX;
3656                     break;
3657                 }
3658
3659                 U_ASSERT(loopEndLoc >= loc+4);
3660                 int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1);  // Recursive call.
3661                 int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount;
3662                 if (updatedLen >= INT32_MAX) {
3663                     currentLen = INT32_MAX;
3664                     break;
3665                 }
3666                 currentLen = (int32_t)updatedLen;
3667                 loc = loopEndLoc;
3668                 break;
3669             }
3670
3671         case URX_CTR_LOOP:
3672         case URX_CTR_LOOP_NG:
3673             // These opcodes will be skipped over by code for URX_CRT_INIT.
3674             // We shouldn't encounter them here.
3675             U_ASSERT(FALSE);
3676             break;
3677
3678         case URX_LOOP_SR_I:
3679         case URX_LOOP_DOT_I:
3680         case URX_LOOP_C:
3681             // For anything to do with loops, make the match length unbounded.
3682             currentLen = INT32_MAX;
3683             break;
3684
3685
3686
3687         case URX_LA_START:
3688         case URX_LA_END:
3689             // Look-ahead.  Just ignore, treat the look-ahead block as if
3690             // it were normal pattern.  Gives a too-long match length,
3691             //  but good enough for now.
3692             break;
3693
3694             // End of look-ahead ops should always be consumed by the processing at
3695             //  the URX_LA_START op.
3696             // U_ASSERT(FALSE);
3697             // break;
3698
3699         case URX_LB_START:
3700             {
3701                 // Look-behind.  Scan forward until the matching look-around end,
3702                 //   without processing the look-behind block.
3703                 int32_t  depth = 0;
3704                 for (;;) {
3705                     loc++;
3706                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3707                     if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) {
3708                         depth++;
3709                     }
3710                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3711                         if (depth == 0) {
3712                             break;
3713                         }
3714                         depth--;
3715                     }
3716                     U_ASSERT(loc < end);
3717                 }
3718             }
3719             break;
3720
3721         default:
3722             U_ASSERT(FALSE);
3723         }
3724
3725
3726         if (currentLen == INT32_MAX) {
3727             //  The maximum length is unbounded.
3728             //  Stop further processing of the pattern.
3729             break;
3730         }
3731
3732     }
3733     return currentLen;
3734
3735 }
3736
3737
3738 //------------------------------------------------------------------------------
3739 //
3740 //   stripNOPs    Remove any NOP operations from the compiled pattern code.
3741 //                Extra NOPs are inserted for some constructs during the initial
3742 //                code generation to provide locations that may be patched later.
3743 //                Many end up unneeded, and are removed by this function.
3744 //
3745 //                In order to minimize the number of passes through the pattern,
3746 //                back-reference fixup is also performed here (adjusting
3747 //                back-reference operands to point to the correct frame offsets).
3748 //
3749 //------------------------------------------------------------------------------
3750 void RegexCompile::stripNOPs() {
3751
3752     if (U_FAILURE(*fStatus)) {
3753         return;
3754     }
3755
3756     int32_t    end = fRXPat->fCompiledPat->size();
3757     UVector32  deltas(end, *fStatus);
3758
3759     // Make a first pass over the code, computing the amount that things
3760     //   will be offset at each location in the original code.
3761     int32_t   loc;
3762     int32_t   d = 0;
3763     for (loc=0; loc<end; loc++) {
3764         deltas.addElement(d, *fStatus);
3765         int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3766         if (URX_TYPE(op) == URX_NOP) {
3767             d++;
3768         }
3769     }
3770
3771     UnicodeString caseStringBuffer;
3772
3773     // Make a second pass over the code, removing the NOPs by moving following
3774     //  code up, and patching operands that refer to code locations that
3775     //  are being moved.  The array of offsets from the first step is used
3776     //  to compute the new operand values.
3777     int32_t src;
3778     int32_t dst = 0;
3779     for (src=0; src<end; src++) {
3780         int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
3781         int32_t opType = URX_TYPE(op);
3782         switch (opType) {
3783         case URX_NOP:
3784             break;
3785
3786         case URX_STATE_SAVE:
3787         case URX_JMP:
3788         case URX_CTR_LOOP:
3789         case URX_CTR_LOOP_NG:
3790         case URX_RELOC_OPRND:
3791         case URX_JMPX:
3792         case URX_JMP_SAV:
3793         case URX_JMP_SAV_X:
3794             // These are instructions with operands that refer to code locations.
3795             {
3796                 int32_t  operandAddress = URX_VAL(op);
3797                 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
3798                 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3799                 op = buildOp(opType, fixedOperandAddress);
3800                 fRXPat->fCompiledPat->setElementAt(op, dst);
3801                 dst++;
3802                 break;
3803             }
3804
3805         case URX_BACKREF:
3806         case URX_BACKREF_I:
3807             {
3808                 int32_t where = URX_VAL(op);
3809                 if (where > fRXPat->fGroupMap->size()) {
3810                     error(U_REGEX_INVALID_BACK_REF);
3811                     break;
3812                 }
3813                 where = fRXPat->fGroupMap->elementAti(where-1);
3814                 op    = buildOp(opType, where);
3815                 fRXPat->fCompiledPat->setElementAt(op, dst);
3816                 dst++;
3817
3818                 fRXPat->fNeedsAltInput = TRUE;
3819                 break;
3820             }
3821         case URX_RESERVED_OP:
3822         case URX_RESERVED_OP_N:
3823         case URX_BACKTRACK:
3824         case URX_END:
3825         case URX_ONECHAR:
3826         case URX_STRING:
3827         case URX_STRING_LEN:
3828         case URX_START_CAPTURE:
3829         case URX_END_CAPTURE:
3830         case URX_STATIC_SETREF:
3831         case URX_STAT_SETREF_N:
3832         case URX_SETREF:
3833         case URX_DOTANY:
3834         case URX_FAIL:
3835         case URX_BACKSLASH_B:
3836         case URX_BACKSLASH_BU:
3837         case URX_BACKSLASH_G:
3838         case URX_BACKSLASH_X:
3839         case URX_BACKSLASH_Z:
3840         case URX_DOTANY_ALL:
3841         case URX_BACKSLASH_D:
3842         case URX_CARET:
3843         case URX_DOLLAR:
3844         case URX_CTR_INIT:
3845         case URX_CTR_INIT_NG:
3846         case URX_DOTANY_UNIX:
3847         case URX_STO_SP:
3848         case URX_LD_SP:
3849         case URX_STO_INP_LOC:
3850         case URX_LA_START:
3851         case URX_LA_END:
3852         case URX_ONECHAR_I:
3853         case URX_STRING_I:
3854         case URX_DOLLAR_M:
3855         case URX_CARET_M:
3856         case URX_CARET_M_UNIX:
3857         case URX_LB_START:
3858         case URX_LB_CONT:
3859         case URX_LB_END:
3860         case URX_LBN_CONT:
3861         case URX_LBN_END:
3862         case URX_LOOP_SR_I:
3863         case URX_LOOP_DOT_I:
3864         case URX_LOOP_C:
3865         case URX_DOLLAR_D:
3866         case URX_DOLLAR_MD:
3867         case URX_BACKSLASH_H:
3868         case URX_BACKSLASH_R:
3869         case URX_BACKSLASH_V:
3870             // These instructions are unaltered by the relocation.
3871             fRXPat->fCompiledPat->setElementAt(op, dst);
3872             dst++;
3873             break;
3874
3875         default:
3876             // Some op is unaccounted for.
3877             U_ASSERT(FALSE);
3878             error(U_REGEX_INTERNAL_ERROR);
3879         }
3880     }
3881
3882     fRXPat->fCompiledPat->setSize(dst);
3883 }
3884
3885
3886
3887
3888 //------------------------------------------------------------------------------
3889 //
3890 //  Error         Report a rule parse error.
3891 //                Only report it if no previous error has been recorded.
3892 //
3893 //------------------------------------------------------------------------------
3894 void RegexCompile::error(UErrorCode e) {
3895     if (U_SUCCESS(*fStatus)) {
3896         *fStatus = e;
3897         // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3898         // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3899         // int64_t. If the values of the latter are out of range for the former,
3900         // set them to the appropriate "field not supported" values.
3901         if (fLineNum > 0x7FFFFFFF) {
3902             fParseErr->line   = 0;
3903             fParseErr->offset = -1;
3904         } else if (fCharNum > 0x7FFFFFFF) {
3905             fParseErr->line   = (int32_t)fLineNum;
3906             fParseErr->offset = -1;
3907         } else {
3908             fParseErr->line   = (int32_t)fLineNum;
3909             fParseErr->offset = (int32_t)fCharNum;
3910         }
3911
3912         UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
3913
3914         // Fill in the context.
3915         //   Note: extractBetween() pins supplied indicies to the string bounds.
3916         uprv_memset(fParseErr->preContext,  0, sizeof(fParseErr->preContext));
3917         uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
3918         utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
3919         utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
3920     }
3921 }
3922
3923
3924 //
3925 //  Assorted Unicode character constants.
3926 //     Numeric because there is no portable way to enter them as literals.
3927 //     (Think EBCDIC).
3928 //
3929 static const UChar      chCR        = 0x0d;      // New lines, for terminating comments.
3930 static const UChar      chLF        = 0x0a;      // Line Feed
3931 static const UChar      chPound     = 0x23;      // '#', introduces a comment.
3932 static const UChar      chDigit0    = 0x30;      // '0'
3933 static const UChar      chDigit7    = 0x37;      // '9'
3934 static const UChar      chColon     = 0x3A;      // ':'
3935 static const UChar      chE         = 0x45;      // 'E'
3936 static const UChar      chQ         = 0x51;      // 'Q'
3937 //static const UChar      chN         = 0x4E;      // 'N'
3938 static const UChar      chP         = 0x50;      // 'P'
3939 static const UChar      chBackSlash = 0x5c;      // '\'  introduces a char escape
3940 //static const UChar      chLBracket  = 0x5b;      // '['
3941 static const UChar      chRBracket  = 0x5d;      // ']'
3942 static const UChar      chUp        = 0x5e;      // '^'
3943 static const UChar      chLowerP    = 0x70;
3944 static const UChar      chLBrace    = 0x7b;      // '{'
3945 static const UChar      chRBrace    = 0x7d;      // '}'
3946 static const UChar      chNEL       = 0x85;      //    NEL newline variant
3947 static const UChar      chLS        = 0x2028;    //    Unicode Line Separator
3948
3949
3950 //------------------------------------------------------------------------------
3951 //
3952 //  nextCharLL    Low Level Next Char from the regex pattern.
3953 //                Get a char from the string, keep track of input position
3954 //                     for error reporting.
3955 //
3956 //------------------------------------------------------------------------------
3957 UChar32  RegexCompile::nextCharLL() {
3958     UChar32       ch;
3959
3960     if (fPeekChar != -1) {
3961         ch = fPeekChar;
3962         fPeekChar = -1;
3963         return ch;
3964     }
3965
3966     // assume we're already in the right place
3967     ch = UTEXT_NEXT32(fRXPat->fPattern);
3968     if (ch == U_SENTINEL) {
3969         return ch;
3970     }
3971
3972     if (ch == chCR ||
3973         ch == chNEL ||
3974         ch == chLS   ||
3975         (ch == chLF && fLastChar != chCR)) {
3976         // Character is starting a new line.  Bump up the line number, and
3977         //  reset the column to 0.
3978         fLineNum++;
3979         fCharNum=0;
3980     }
3981     else {
3982         // Character is not starting a new line.  Except in the case of a
3983         //   LF following a CR, increment the column position.
3984         if (ch != chLF) {
3985             fCharNum++;
3986         }
3987     }
3988     fLastChar = ch;
3989     return ch;
3990 }
3991
3992 //------------------------------------------------------------------------------
3993 //
3994 //   peekCharLL    Low Level Character Scanning, sneak a peek at the next
3995 //                 character without actually getting it.
3996 //
3997 //------------------------------------------------------------------------------
3998 UChar32  RegexCompile::peekCharLL() {
3999     if (fPeekChar == -1) {
4000         fPeekChar = nextCharLL();
4001     }
4002     return fPeekChar;
4003 }
4004
4005
4006 //------------------------------------------------------------------------------
4007 //
4008 //   nextChar     for pattern scanning.  At this level, we handle stripping
4009 //                out comments and processing some backslash character escapes.
4010 //                The rest of the pattern grammar is handled at the next level up.
4011 //
4012 //------------------------------------------------------------------------------
4013 void RegexCompile::nextChar(RegexPatternChar &c) {
4014
4015     fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4016     c.fChar    = nextCharLL();
4017     c.fQuoted  = FALSE;
4018
4019     if (fQuoteMode) {
4020         c.fQuoted = TRUE;
4021         if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
4022             c.fChar == (UChar32)-1) {
4023             fQuoteMode = FALSE;  //  Exit quote mode,
4024             nextCharLL();        // discard the E
4025             nextChar(c);         // recurse to get the real next char
4026         }
4027     }
4028     else if (fInBackslashQuote) {
4029         // The current character immediately follows a '\'
4030         // Don't check for any further escapes, just return it as-is.
4031         // Don't set c.fQuoted, because that would prevent the state machine from
4032         //    dispatching on the character.
4033         fInBackslashQuote = FALSE;
4034     }
4035     else
4036     {
4037         // We are not in a \Q quoted region \E of the source.
4038         //
4039         if (fModeFlags & UREGEX_COMMENTS) {
4040             //
4041             // We are in free-spacing and comments mode.
4042             //  Scan through any white space and comments, until we
4043             //  reach a significant character or the end of inut.
4044             for (;;) {
4045                 if (c.fChar == (UChar32)-1) {
4046                     break;     // End of Input
4047                 }
4048                 if  (c.fChar == chPound && fEOLComments == TRUE) {
4049                     // Start of a comment.  Consume the rest of it, until EOF or a new line
4050                     for (;;) {
4051                         c.fChar = nextCharLL();
4052                         if (c.fChar == (UChar32)-1 ||  // EOF
4053                             c.fChar == chCR        ||
4054                             c.fChar == chLF        ||
4055                             c.fChar == chNEL       ||
4056                             c.fChar == chLS)       {
4057                             break;
4058                         }
4059                     }
4060                 }
4061                 // TODO:  check what Java & Perl do with non-ASCII white spaces.  Ticket 6061.
4062                 if (PatternProps::isWhiteSpace(c.fChar) == FALSE) {
4063                     break;
4064                 }
4065                 c.fChar = nextCharLL();
4066             }
4067         }
4068
4069         //
4070         //  check for backslash escaped characters.
4071         //
4072         if (c.fChar == chBackSlash) {
4073             int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4074             if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
4075                 //
4076                 // A '\' sequence that is handled by ICU's standard unescapeAt function.
4077                 //   Includes \uxxxx, \n, \r, many others.
4078                 //   Return the single equivalent character.
4079                 //
4080                 nextCharLL();                 // get & discard the peeked char.
4081                 c.fQuoted = TRUE;
4082
4083                 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
4084                     int32_t endIndex = (int32_t)pos;
4085                     c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
4086
4087                     if (endIndex == pos) {
4088                         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4089                     }
4090                     fCharNum += endIndex - pos;
4091                     UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
4092                 } else {
4093                     int32_t offset = 0;
4094                     struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
4095
4096                     UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
4097                     c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
4098
4099                     if (offset == 0) {
4100                         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4101                     } else if (context.lastOffset == offset) {
4102                         UTEXT_PREVIOUS32(fRXPat->fPattern);
4103                     } else if (context.lastOffset != offset-1) {
4104                         utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
4105                     }
4106                     fCharNum += offset;
4107                 }
4108             }
4109             else if (peekCharLL() == chDigit0) {
4110                 //  Octal Escape, using Java Regexp Conventions
4111                 //    which are \0 followed by 1-3 octal digits.
4112                 //    Different from ICU Unescape handling of Octal, which does not
4113                 //    require the leading 0.
4114                 //  Java also has the convention of only consuming 2 octal digits if
4115                 //    the three digit number would be > 0xff
4116                 //
4117                 c.fChar = 0;
4118                 nextCharLL();    // Consume the initial 0.
4119                 int index;
4120                 for (index=0; index<3; index++) {
4121                     int32_t ch = peekCharLL();
4122                     if (ch<chDigit0 || ch>chDigit7) {
4123                         if (index==0) {
4124                            // \0 is not followed by any octal digits.
4125                            error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4126                         }
4127                         break;
4128                     }
4129                     c.fChar <<= 3;
4130                     c.fChar += ch&7;
4131                     if (c.fChar <= 255) {
4132                         nextCharLL();
4133                     } else {
4134                         // The last digit made the number too big.  Forget we saw it.
4135                         c.fChar >>= 3;
4136                     }
4137                 }
4138                 c.fQuoted = TRUE;
4139             }
4140             else if (peekCharLL() == chQ) {
4141                 //  "\Q"  enter quote mode, which will continue until "\E"
4142                 fQuoteMode = TRUE;
4143                 nextCharLL();       // discard the 'Q'.
4144                 nextChar(c);        // recurse to get the real next char.
4145             }
4146             else
4147             {
4148                 // We are in a '\' escape that will be handled by the state table scanner.
4149                 // Just return the backslash, but remember that the following char is to
4150                 //  be taken literally.
4151                 fInBackslashQuote = TRUE;
4152             }
4153         }
4154     }
4155
4156     // re-enable # to end-of-line comments, in case they were disabled.
4157     // They are disabled by the parser upon seeing '(?', but this lasts for
4158     //  the fetching of the next character only.
4159     fEOLComments = TRUE;
4160
4161     // putc(c.fChar, stdout);
4162 }
4163
4164
4165
4166 //------------------------------------------------------------------------------
4167 //
4168 //  scanNamedChar
4169 //            Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4170 //
4171 //             The scan position will be at the 'N'.  On return
4172 //             the scan position should be just after the '}'
4173 //
4174 //             Return the UChar32
4175 //
4176 //------------------------------------------------------------------------------
4177 UChar32  RegexCompile::scanNamedChar() {
4178     if (U_FAILURE(*fStatus)) {
4179         return 0;
4180     }
4181
4182     nextChar(fC);
4183     if (fC.fChar != chLBrace) {
4184         error(U_REGEX_PROPERTY_SYNTAX);
4185         return 0;
4186     }
4187
4188     UnicodeString  charName;
4189     for (;;) {
4190         nextChar(fC);
4191         if (fC.fChar == chRBrace) {
4192             break;
4193         }
4194         if (fC.fChar == -1) {
4195             error(U_REGEX_PROPERTY_SYNTAX);
4196             return 0;
4197         }
4198         charName.append(fC.fChar);
4199     }
4200
4201     char name[100];
4202     if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
4203          (uint32_t)charName.length()>=sizeof(name)) {
4204         // All Unicode character names have only invariant characters.
4205         // The API to get a character, given a name, accepts only char *, forcing us to convert,
4206         //   which requires this error check
4207         error(U_REGEX_PROPERTY_SYNTAX);
4208         return 0;
4209     }
4210     charName.extract(0, charName.length(), name, sizeof(name), US_INV);
4211
4212     UChar32  theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
4213     if (U_FAILURE(*fStatus)) {
4214         error(U_REGEX_PROPERTY_SYNTAX);
4215     }
4216
4217     nextChar(fC);      // Continue overall regex pattern processing with char after the '}'
4218     return theChar;
4219 }
4220
4221 //------------------------------------------------------------------------------
4222 //
4223 //  scanProp   Construct a UnicodeSet from the text at the current scan
4224 //             position, which will be of the form \p{whaterver}
4225 //
4226 //             The scan position will be at the 'p' or 'P'.  On return
4227 //             the scan position should be just after the '}'
4228 //
4229 //             Return a UnicodeSet, constructed from the \P pattern,
4230 //             or NULL if the pattern is invalid.
4231 //
4232 //------------------------------------------------------------------------------
4233 UnicodeSet *RegexCompile::scanProp() {
4234     UnicodeSet    *uset = NULL;
4235
4236     if (U_FAILURE(*fStatus)) {
4237         return NULL;
4238     }
4239     (void)chLowerP;   // Suppress compiler unused variable warning.
4240     U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
4241     UBool negated = (fC.fChar == chP);
4242
4243     UnicodeString propertyName;
4244     nextChar(fC);
4245     if (fC.fChar != chLBrace) {
4246         error(U_REGEX_PROPERTY_SYNTAX);
4247         return NULL;
4248     }
4249     for (;;) {
4250         nextChar(fC);
4251         if (fC.fChar == chRBrace) {
4252             break;
4253         }
4254         if (fC.fChar == -1) {
4255             // Hit the end of the input string without finding the closing '}'
4256             error(U_REGEX_PROPERTY_SYNTAX);
4257             return NULL;
4258         }
4259         propertyName.append(fC.fChar);
4260     }
4261     uset = createSetForProperty(propertyName, negated);
4262     nextChar(fC);    // Move input scan to position following the closing '}'
4263     return uset;
4264 }
4265
4266 //------------------------------------------------------------------------------
4267 //
4268 //  scanPosixProp   Construct a UnicodeSet from the text at the current scan
4269 //             position, which is expected be of the form [:property expression:]
4270 //
4271 //             The scan position will be at the opening ':'.  On return
4272 //             the scan position must be on the closing ']'
4273 //
4274 //             Return a UnicodeSet constructed from the pattern,
4275 //             or NULL if this is not a valid POSIX-style set expression.
4276 //             If not a property expression, restore the initial scan position
4277 //                (to the opening ':')
4278 //
4279 //               Note:  the opening '[:' is not sufficient to guarantee that
4280 //                      this is a [:property:] expression.
4281 //                      [:'+=,] is a perfectly good ordinary set expression that
4282 //                              happens to include ':' as one of its characters.
4283 //
4284 //------------------------------------------------------------------------------
4285 UnicodeSet *RegexCompile::scanPosixProp() {
4286     UnicodeSet    *uset = NULL;
4287
4288     if (U_FAILURE(*fStatus)) {
4289         return NULL;
4290     }
4291
4292     U_ASSERT(fC.fChar == chColon);
4293
4294     // Save the scanner state.
4295     // TODO:  move this into the scanner, with the state encapsulated in some way.  Ticket 6062
4296     int64_t     savedScanIndex        = fScanIndex;
4297     int64_t     savedNextIndex        = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4298     UBool       savedQuoteMode        = fQuoteMode;
4299     UBool       savedInBackslashQuote = fInBackslashQuote;
4300     UBool       savedEOLComments      = fEOLComments;
4301     int64_t     savedLineNum          = fLineNum;
4302     int64_t     savedCharNum          = fCharNum;
4303     UChar32     savedLastChar         = fLastChar;
4304     UChar32     savedPeekChar         = fPeekChar;
4305     RegexPatternChar savedfC          = fC;
4306
4307     // Scan for a closing ].   A little tricky because there are some perverse
4308     //   edge cases possible.  "[:abc\Qdef:] \E]"  is a valid non-property expression,
4309     //   ending on the second closing ].
4310
4311     UnicodeString propName;
4312     UBool         negated  = FALSE;
4313
4314     // Check for and consume the '^' in a negated POSIX property, e.g.  [:^Letter:]
4315     nextChar(fC);
4316     if (fC.fChar == chUp) {
4317        negated = TRUE;
4318        nextChar(fC);
4319     }
4320
4321     // Scan for the closing ":]", collecting the property name along the way.
4322     UBool  sawPropSetTerminator = FALSE;
4323     for (;;) {
4324         propName.append(fC.fChar);
4325         nextChar(fC);
4326         if (fC.fQuoted || fC.fChar == -1) {
4327             // Escaped characters or end of input - either says this isn't a [:Property:]
4328             break;
4329         }
4330         if (fC.fChar == chColon) {
4331             nextChar(fC);
4332             if (fC.fChar == chRBracket) {
4333                 sawPropSetTerminator = TRUE;
4334             }
4335             break;
4336         }
4337     }
4338
4339     if (sawPropSetTerminator) {
4340         uset = createSetForProperty(propName, negated);
4341     }
4342     else
4343     {
4344         // No closing ":]".
4345         //  Restore the original scan position.
4346         //  The main scanner will retry the input as a normal set expression,
4347         //    not a [:Property:] expression.
4348         fScanIndex        = savedScanIndex;
4349         fQuoteMode        = savedQuoteMode;
4350         fInBackslashQuote = savedInBackslashQuote;
4351         fEOLComments      = savedEOLComments;
4352         fLineNum          = savedLineNum;
4353         fCharNum          = savedCharNum;
4354         fLastChar         = savedLastChar;
4355         fPeekChar         = savedPeekChar;
4356         fC                = savedfC;
4357         UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
4358     }
4359     return uset;
4360 }
4361
4362 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
4363     set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4364     addCategory(set, U_GC_CF_MASK, ec);
4365 }
4366
4367 //
4368 //  Create a Unicode Set from a Unicode Property expression.
4369 //     This is common code underlying both \p{...} ane [:...:] expressions.
4370 //     Includes trying the Java "properties" that aren't supported as
4371 //     normal ICU UnicodeSet properties
4372 //
4373 static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{"
4374 static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{"
4375 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
4376     UnicodeString   setExpr;
4377     UnicodeSet      *set;
4378     uint32_t        usetFlags = 0;
4379
4380     if (U_FAILURE(*fStatus)) {
4381         return NULL;
4382     }
4383
4384     //
4385     //  First try the property as we received it
4386     //
4387     if (negated) {
4388         setExpr.append(negSetPrefix, -1);
4389     } else {
4390         setExpr.append(posSetPrefix, -1);
4391     }
4392     setExpr.append(propName);
4393     setExpr.append(chRBrace);
4394     setExpr.append(chRBracket);
4395     if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
4396         usetFlags |= USET_CASE_INSENSITIVE;
4397     }
4398     set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
4399     if (U_SUCCESS(*fStatus)) {
4400        return set;
4401     }
4402     delete set;
4403     set = NULL;
4404
4405     //
4406     //  The property as it was didn't work.
4407
4408     //  Do [:word:]. It is not recognized as a property by UnicodeSet.  "word" not standard POSIX
4409     //     or standard Java, but many other regular expression packages do recognize it.
4410
4411     if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) {
4412         *fStatus = U_ZERO_ERROR;
4413         set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET]));
4414         if (set == NULL) {
4415             *fStatus = U_MEMORY_ALLOCATION_ERROR;
4416             return set;
4417         }
4418         if (negated) {
4419             set->complement();
4420         }
4421         return set;
4422     }
4423
4424
4425     //    Do Java fixes -
4426     //       InGreek -> InGreek or Coptic, that being the official Unicode name for that block.
4427     //       InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols.
4428     //
4429     //       Note on Spaces:  either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols"
4430     //                        is accepted by Java.  The property part of the name is compared
4431     //                        case-insenstively.  The spaces must be exactly as shown, either
4432     //                        all there, or all omitted, with exactly one at each position
4433     //                        if they are present.  From checking against JDK 1.6
4434     //
4435     //       This code should be removed when ICU properties support the Java  compatibility names
4436     //          (ICU 4.0?)
4437     //
4438     UnicodeString mPropName = propName;
4439     if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) {
4440         mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic");
4441     }
4442     if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbols"), 0) == 0 ||
4443         mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"), 0) == 0) {
4444         mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Symbols");
4445     }
4446     else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
4447         mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint");
4448     }
4449
4450     //    See if the property looks like a Java "InBlockName", which
4451     //    we will recast as "Block=BlockName"
4452     //
4453     if (mPropName.startsWith(u"In", 2) && propName.length()>=3) {
4454         setExpr.truncate(4);   // Leaves "[\p{", or "[\P{"
4455         setExpr.append(u"Block=", -1);
4456         setExpr.append(UnicodeString(mPropName, 2));  // Property with the leading "In" removed.
4457         setExpr.append(chRBrace);
4458         setExpr.append(chRBracket);
4459         *fStatus = U_ZERO_ERROR;
4460         set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
4461         if (U_SUCCESS(*fStatus)) {
4462             return set;
4463         }
4464         delete set;
4465         set = NULL;
4466     }
4467
4468     if (propName.startsWith(UNICODE_STRING_SIMPLE("java")) ||
4469         propName.compare(UNICODE_STRING_SIMPLE("all")) == 0)
4470     {
4471         UErrorCode localStatus = U_ZERO_ERROR;
4472         //setExpr.remove();
4473         set = new UnicodeSet();
4474         //
4475         //  Try the various Java specific properties.
4476         //   These all begin with "java"
4477         //
4478         if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDefined")) == 0) {
4479             addCategory(set, U_GC_CN_MASK, localStatus);
4480             set->complement();
4481         }
4482         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDigit")) == 0) {
4483             addCategory(set, U_GC_ND_MASK, localStatus);
4484         }
4485         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaIdentifierIgnorable")) == 0) {
4486             addIdentifierIgnorable(set, localStatus);
4487         }
4488         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaISOControl")) == 0) {
4489             set->add(0, 0x1F).add(0x7F, 0x9F);
4490         }
4491         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierPart")) == 0) {
4492             addCategory(set, U_GC_L_MASK, localStatus);
4493             addCategory(set, U_GC_SC_MASK, localStatus);
4494             addCategory(set, U_GC_PC_MASK, localStatus);
4495             addCategory(set, U_GC_ND_MASK, localStatus);
4496             addCategory(set, U_GC_NL_MASK, localStatus);
4497             addCategory(set, U_GC_MC_MASK, localStatus);
4498             addCategory(set, U_GC_MN_MASK, localStatus);
4499             addIdentifierIgnorable(set, localStatus);
4500         }
4501         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierStart")) == 0) {
4502             addCategory(set, U_GC_L_MASK, localStatus);
4503             addCategory(set, U_GC_NL_MASK, localStatus);
4504             addCategory(set, U_GC_SC_MASK, localStatus);
4505             addCategory(set, U_GC_PC_MASK, localStatus);
4506         }
4507         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetter")) == 0) {
4508             addCategory(set, U_GC_L_MASK, localStatus);
4509         }
4510         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetterOrDigit")) == 0) {
4511             addCategory(set, U_GC_L_MASK, localStatus);
4512             addCategory(set, U_GC_ND_MASK, localStatus);
4513         }
4514         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLowerCase")) == 0) {
4515             addCategory(set, U_GC_LL_MASK, localStatus);
4516         }
4517         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaMirrored")) == 0) {
4518             set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, localStatus);
4519         }
4520         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSpaceChar")) == 0) {
4521             addCategory(set, U_GC_Z_MASK, localStatus);
4522         }
4523         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSupplementaryCodePoint")) == 0) {
4524             set->add(0x10000, UnicodeSet::MAX_VALUE);
4525         }
4526         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaTitleCase")) == 0) {
4527             addCategory(set, U_GC_LT_MASK, localStatus);
4528         }
4529         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierStart")) == 0) {
4530             addCategory(set, U_GC_L_MASK, localStatus);
4531             addCategory(set, U_GC_NL_MASK, localStatus);
4532         }
4533         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierPart")) == 0) {
4534             addCategory(set, U_GC_L_MASK, localStatus);
4535             addCategory(set, U_GC_PC_MASK, localStatus);
4536             addCategory(set, U_GC_ND_MASK, localStatus);
4537             addCategory(set, U_GC_NL_MASK, localStatus);
4538             addCategory(set, U_GC_MC_MASK, localStatus);
4539             addCategory(set, U_GC_MN_MASK, localStatus);
4540             addIdentifierIgnorable(set, localStatus);
4541         }
4542         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUpperCase")) == 0) {
4543             addCategory(set, U_GC_LU_MASK, localStatus);
4544         }
4545         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaValidCodePoint")) == 0) {
4546             set->add(0, UnicodeSet::MAX_VALUE);
4547         }
4548         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaWhitespace")) == 0) {
4549             addCategory(set, U_GC_Z_MASK, localStatus);
4550             set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4551             set->add(9, 0x0d).add(0x1c, 0x1f);
4552         }
4553         else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
4554             set->add(0, UnicodeSet::MAX_VALUE);
4555         }
4556
4557         if (U_SUCCESS(localStatus) && !set->isEmpty()) {
4558             *fStatus = U_ZERO_ERROR;
4559             if (usetFlags & USET_CASE_INSENSITIVE) {
4560                 set->closeOver(USET_CASE_INSENSITIVE);
4561             }
4562             if (negated) {
4563                 set->complement();
4564             }
4565             return set;
4566         }
4567         delete set;
4568         set = NULL;
4569     }
4570     error(*fStatus);
4571     return NULL;
4572 }
4573
4574
4575
4576 //
4577 //  SetEval   Part of the evaluation of [set expressions].
4578 //            Perform any pending (stacked) operations with precedence
4579 //            equal or greater to that of the next operator encountered
4580 //            in the expression.
4581 //
4582 void RegexCompile::setEval(int32_t nextOp) {
4583     UnicodeSet *rightOperand = NULL;
4584     UnicodeSet *leftOperand  = NULL;
4585     for (;;) {
4586         U_ASSERT(fSetOpStack.empty()==FALSE);
4587         int32_t pendingSetOperation = fSetOpStack.peeki();
4588         if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
4589             break;
4590         }
4591         fSetOpStack.popi();
4592         U_ASSERT(fSetStack.empty() == FALSE);
4593         rightOperand = (UnicodeSet *)fSetStack.peek();
4594         switch (pendingSetOperation) {
4595             case setNegation:
4596                 rightOperand->complement();
4597                 break;
4598             case setCaseClose:
4599                 // TODO: need a simple close function.  Ticket 6065
4600                 rightOperand->closeOver(USET_CASE_INSENSITIVE);
4601                 rightOperand->removeAllStrings();
4602                 break;
4603             case setDifference1:
4604             case setDifference2:
4605                 fSetStack.pop();
4606                 leftOperand = (UnicodeSet *)fSetStack.peek();
4607                 leftOperand->removeAll(*rightOperand);
4608                 delete rightOperand;
4609                 break;
4610             case setIntersection1:
4611             case setIntersection2:
4612                 fSetStack.pop();
4613                 leftOperand = (UnicodeSet *)fSetStack.peek();
4614                 leftOperand->retainAll(*rightOperand);
4615                 delete rightOperand;
4616                 break;
4617             case setUnion:
4618                 fSetStack.pop();
4619                 leftOperand = (UnicodeSet *)fSetStack.peek();
4620                 leftOperand->addAll(*rightOperand);
4621                 delete rightOperand;
4622                 break;
4623             default:
4624                 U_ASSERT(FALSE);
4625                 break;
4626             }
4627         }
4628     }
4629
4630 void RegexCompile::setPushOp(int32_t op) {
4631     setEval(op);
4632     fSetOpStack.push(op, *fStatus);
4633     fSetStack.push(new UnicodeSet(), *fStatus);
4634 }
4635
4636 U_NAMESPACE_END
4637 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
4638