icuSources/i18n/regexcmp.cpp

   1 //
   2 //  file:  regexcmp.cpp
   3 //
   4 //  Copyright (C) 2002-2015 International Business Machines Corporation and others.
   5 //  All Rights Reserved.
   6 //
   7 //  This file contains the ICU regular expression compiler, which is responsible
   8 //  for processing a regular expression pattern into the compiled form that
   9 //  is used by the match finding engine.
  10 //
  11
  12 #include "unicode/utypes.h"
  13
  14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  15
  16 #include "unicode/ustring.h"
  17 #include "unicode/unistr.h"
  18 #include "unicode/uniset.h"
  19 #include "unicode/uchar.h"
  20 #include "unicode/uchriter.h"
  21 #include "unicode/parsepos.h"
  22 #include "unicode/parseerr.h"
  23 #include "unicode/regex.h"
  24 #include "unicode/utf.h"
  25 #include "unicode/utf16.h"
  26 #include "patternprops.h"
  27 #include "putilimp.h"
  28 #include "cmemory.h"
  29 #include "cstring.h"
  30 #include "uvectr32.h"
  31 #include "uvectr64.h"
  32 #include "uassert.h"
  33 #include "uinvchar.h"
  34
  35 #include "regeximp.h"
  36 #include "regexcst.h"   // Contains state table for the regex pattern parser.
  37                         //   generated by a Perl script.
  38 #include "regexcmp.h"
  39 #include "regexst.h"
  40 #include "regextxt.h"
  41
  42
  43
  44 U_NAMESPACE_BEGIN
  45
  46
  47 //------------------------------------------------------------------------------
  48 //
  49 //  Constructor.
  50 //
  51 //------------------------------------------------------------------------------
  52 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
  53    fParenStack(status), fSetStack(status), fSetOpStack(status)
  54 {
  55     // Lazy init of all shared global sets (needed for init()'s empty text)
  56     RegexStaticSets::initGlobals(&status);
  57
  58     fStatus           = &status;
  59
  60     fRXPat            = rxp;
  61     fScanIndex        = 0;
  62     fLastChar         = -1;
  63     fPeekChar         = -1;
  64     fLineNum          = 1;
  65     fCharNum          = 0;
  66     fQuoteMode        = FALSE;
  67     fInBackslashQuote = FALSE;
  68     fModeFlags        = fRXPat->fFlags | 0x80000000;
  69     fEOLComments      = TRUE;
  70
  71     fMatchOpenParen   = -1;
  72     fMatchCloseParen  = -1;
  73     fCaptureName      = NULL;
  74
  75     if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
  76         status = rxp->fDeferredStatus;
  77     }
  78 }
  79
  80 static const UChar      chAmp       = 0x26;      // '&'
  81 static const UChar      chDash      = 0x2d;      // '-'
  82
  83
  84 //------------------------------------------------------------------------------
  85 //
  86 //  Destructor
  87 //
  88 //------------------------------------------------------------------------------
  89 RegexCompile::~RegexCompile() {
  90     delete fCaptureName;         // Normally will be NULL, but can exist if pattern
  91                                  //   compilation stops with a syntax error.
  92 }
  93
  94 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
  95     set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
  96 }
  97
  98 //------------------------------------------------------------------------------
  99 //
 100 //  Compile regex pattern.   The state machine for rexexp pattern parsing is here.
 101 //                           The state tables are hand-written in the file regexcst.txt,
 102 //                           and converted to the form used here by a perl
 103 //                           script regexcst.pl
 104 //
 105 //------------------------------------------------------------------------------
 106 void    RegexCompile::compile(
 107                          const UnicodeString &pat,   // Source pat to be compiled.
 108                          UParseError &pp,            // Error position info
 109                          UErrorCode &e)              // Error Code
 110 {
 111     fRXPat->fPatternString = new UnicodeString(pat);
 112     UText patternText = UTEXT_INITIALIZER;
 113     utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
 114
 115     if (U_SUCCESS(e)) {
 116         compile(&patternText, pp, e);
 117         utext_close(&patternText);
 118     }
 119 }
 120
 121 //
 122 //   compile, UText mode
 123 //     All the work is actually done here.
 124 //
 125 void    RegexCompile::compile(
 126                          UText *pat,                 // Source pat to be compiled.
 127                          UParseError &pp,            // Error position info
 128                          UErrorCode &e)              // Error Code
 129 {
 130     fStatus             = &e;
 131     fParseErr           = &pp;
 132     fStackPtr           = 0;
 133     fStack[fStackPtr]   = 0;
 134
 135     if (U_FAILURE(*fStatus)) {
 136         return;
 137     }
 138
 139     // There should be no pattern stuff in the RegexPattern object.  They can not be reused.
 140     U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0);
 141
 142     // Prepare the RegexPattern object to receive the compiled pattern.
 143     fRXPat->fPattern        = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
 144     if (U_FAILURE(*fStatus)) {
 145         return;
 146     }
 147     fRXPat->fStaticSets     = RegexStaticSets::gStaticSets->fPropSets;
 148     fRXPat->fStaticSets8    = RegexStaticSets::gStaticSets->fPropSets8;
 149
 150
 151     // Initialize the pattern scanning state machine
 152     fPatternLength = utext_nativeLength(pat);
 153     uint16_t                state = 1;
 154     const RegexTableEl      *tableEl;
 155
 156     // UREGEX_LITERAL force entire pattern to be treated as a literal string.
 157     if (fModeFlags & UREGEX_LITERAL) {
 158         fQuoteMode = TRUE;
 159     }
 160
 161     nextChar(fC);                        // Fetch the first char from the pattern string.
 162
 163     //
 164     // Main loop for the regex pattern parsing state machine.
 165     //   Runs once per state transition.
 166     //   Each time through optionally performs, depending on the state table,
 167     //      - an advance to the the next pattern char
 168     //      - an action to be performed.
 169     //      - pushing or popping a state to/from the local state return stack.
 170     //   file regexcst.txt is the source for the state table.  The logic behind
 171     //     recongizing the pattern syntax is there, not here.
 172     //
 173     for (;;) {
 174         //  Bail out if anything has gone wrong.
 175         //  Regex pattern parsing stops on the first error encountered.
 176         if (U_FAILURE(*fStatus)) {
 177             break;
 178         }
 179
 180         U_ASSERT(state != 0);
 181
 182         // Find the state table element that matches the input char from the pattern, or the
 183         //    class of the input character.  Start with the first table row for this
 184         //    state, then linearly scan forward until we find a row that matches the
 185         //    character.  The last row for each state always matches all characters, so
 186         //    the search will stop there, if not before.
 187         //
 188         tableEl = &gRuleParseStateTable[state];
 189         REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d)    state=%s ",
 190             fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
 191
 192         for (;;) {    // loop through table rows belonging to this state, looking for one
 193                       //   that matches the current input char.
 194             REGEX_SCAN_DEBUG_PRINTF(("."));
 195             if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE &&   tableEl->fCharClass == fC.fChar) {
 196                 // Table row specified an individual character, not a set, and
 197                 //   the input character is not quoted, and
 198                 //   the input character matched it.
 199                 break;
 200             }
 201             if (tableEl->fCharClass == 255) {
 202                 // Table row specified default, match anything character class.
 203                 break;
 204             }
 205             if (tableEl->fCharClass == 254 && fC.fQuoted)  {
 206                 // Table row specified "quoted" and the char was quoted.
 207                 break;
 208             }
 209             if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1)  {
 210                 // Table row specified eof and we hit eof on the input.
 211                 break;
 212             }
 213
 214             if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
 215                 fC.fQuoted == FALSE &&                                       //   char is not escaped &&
 216                 fC.fChar != (UChar32)-1) {                                   //   char is not EOF
 217                 U_ASSERT(tableEl->fCharClass <= 137);
 218                 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
 219                     // Table row specified a character class, or set of characters,
 220                     //   and the current char matches it.
 221                     break;
 222                 }
 223             }
 224
 225             // No match on this row, advance to the next  row for this state,
 226             tableEl++;
 227         }
 228         REGEX_SCAN_DEBUG_PRINTF(("\n"));
 229
 230         //
 231         // We've found the row of the state table that matches the current input
 232         //   character from the rules string.
 233         // Perform any action specified  by this row in the state table.
 234         if (doParseActions(tableEl->fAction) == FALSE) {
 235             // Break out of the state machine loop if the
 236             //   the action signalled some kind of error, or
 237             //   the action was to exit, occurs on normal end-of-rules-input.
 238             break;
 239         }
 240
 241         if (tableEl->fPushState != 0) {
 242             fStackPtr++;
 243             if (fStackPtr >= kStackSize) {
 244                 error(U_REGEX_INTERNAL_ERROR);
 245                 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
 246                 fStackPtr--;
 247             }
 248             fStack[fStackPtr] = tableEl->fPushState;
 249         }
 250
 251         //
 252         //  NextChar.  This is where characters are actually fetched from the pattern.
 253         //             Happens under control of the 'n' tag in the state table.
 254         //
 255         if (tableEl->fNextChar) {
 256             nextChar(fC);
 257         }
 258
 259         // Get the next state from the table entry, or from the
 260         //   state stack if the next state was specified as "pop".
 261         if (tableEl->fNextState != 255) {
 262             state = tableEl->fNextState;
 263         } else {
 264             state = fStack[fStackPtr];
 265             fStackPtr--;
 266             if (fStackPtr < 0) {
 267                 // state stack underflow
 268                 // This will occur if the user pattern has mis-matched parentheses,
 269                 //   with extra close parens.
 270                 //
 271                 fStackPtr++;
 272                 error(U_REGEX_MISMATCHED_PAREN);
 273             }
 274         }
 275
 276     }
 277
 278     if (U_FAILURE(*fStatus)) {
 279         // Bail out if the pattern had errors.
 280         //   Set stack cleanup:  a successful compile would have left it empty,
 281         //   but errors can leave temporary sets hanging around.
 282         while (!fSetStack.empty()) {
 283             delete (UnicodeSet *)fSetStack.pop();
 284         }
 285         return;
 286     }
 287
 288     //
 289     // The pattern has now been read and processed, and the compiled code generated.
 290     //
 291
 292     //
 293     // The pattern's fFrameSize so far has accumulated the requirements for
 294     //   storage for capture parentheses, counters, etc. that are encountered
 295     //   in the pattern.  Add space for the two variables that are always
 296     //   present in the saved state:  the input string position (int64_t) and
 297     //   the position in the compiled pattern.
 298     //
 299     allocateStackData(RESTACKFRAME_HDRCOUNT);
 300
 301     //
 302     // Optimization pass 1: NOPs, back-references, and case-folding
 303     //
 304     stripNOPs();
 305
 306     //
 307     // Get bounds for the minimum and maximum length of a string that this
 308     //   pattern can match.  Used to avoid looking for matches in strings that
 309     //   are too short.
 310     //
 311     fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
 312
 313     //
 314     // Optimization pass 2: match start type
 315     //
 316     matchStartType();
 317
 318     //
 319     // Set up fast latin-1 range sets
 320     //
 321     int32_t numSets = fRXPat->fSets->size();
 322     fRXPat->fSets8 = new Regex8BitSet[numSets];
 323     // Null pointer check.
 324     if (fRXPat->fSets8 == NULL) {
 325         e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
 326         return;
 327     }
 328     int32_t i;
 329     for (i=0; i<numSets; i++) {
 330         UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
 331         fRXPat->fSets8[i].init(s);
 332     }
 333
 334 }
 335
 336
 337
 338
 339
 340 //------------------------------------------------------------------------------
 341 //
 342 //  doParseAction        Do some action during regex pattern parsing.
 343 //                       Called by the parse state machine.
 344 //
 345 //                       Generation of the match engine PCode happens here, or
 346 //                       in functions called from the parse actions defined here.
 347 //
 348 //
 349 //------------------------------------------------------------------------------
 350 UBool RegexCompile::doParseActions(int32_t action)
 351 {
 352     UBool   returnVal = TRUE;
 353
 354     switch ((Regex_PatternParseAction)action) {
 355
 356     case doPatStart:
 357         // Start of pattern compiles to:
 358         //0   SAVE   2        Fall back to position of FAIL
 359         //1   jmp    3
 360         //2   FAIL            Stop if we ever reach here.
 361         //3   NOP             Dummy, so start of pattern looks the same as
 362         //                    the start of an ( grouping.
 363         //4   NOP             Resreved, will be replaced by a save if there are
 364         //                    OR | operators at the top level
 365         appendOp(URX_STATE_SAVE, 2);
 366         appendOp(URX_JMP,  3);
 367         appendOp(URX_FAIL, 0);
 368
 369         // Standard open nonCapture paren action emits the two NOPs and
 370         //   sets up the paren stack frame.
 371         doParseActions(doOpenNonCaptureParen);
 372         break;
 373
 374     case doPatFinish:
 375         // We've scanned to the end of the pattern
 376         //  The end of pattern compiles to:
 377         //        URX_END
 378         //    which will stop the runtime match engine.
 379         //  Encountering end of pattern also behaves like a close paren,
 380         //   and forces fixups of the State Save at the beginning of the compiled pattern
 381         //   and of any OR operations at the top level.
 382         //
 383         handleCloseParen();
 384         if (fParenStack.size() > 0) {
 385             // Missing close paren in pattern.
 386             error(U_REGEX_MISMATCHED_PAREN);
 387         }
 388
 389         // add the END operation to the compiled pattern.
 390         appendOp(URX_END, 0);
 391
 392         // Terminate the pattern compilation state machine.
 393         returnVal = FALSE;
 394         break;
 395
 396
 397
 398     case doOrOperator:
 399         // Scanning a '|', as in (A|B)
 400         {
 401             // Generate code for any pending literals preceding the '|'
 402             fixLiterals(FALSE);
 403
 404             // Insert a SAVE operation at the start of the pattern section preceding
 405             //   this OR at this level.  This SAVE will branch the match forward
 406             //   to the right hand side of the OR in the event that the left hand
 407             //   side fails to match and backtracks.  Locate the position for the
 408             //   save from the location on the top of the parentheses stack.
 409             int32_t savePosition = fParenStack.popi();
 410             int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
 411             U_ASSERT(URX_TYPE(op) == URX_NOP);  // original contents of reserved location
 412             op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
 413             fRXPat->fCompiledPat->setElementAt(op, savePosition);
 414
 415             // Append an JMP operation into the compiled pattern.  The operand for
 416             //  the JMP will eventually be the location following the ')' for the
 417             //  group.  This will be patched in later, when the ')' is encountered.
 418             appendOp(URX_JMP, 0);
 419
 420             // Push the position of the newly added JMP op onto the parentheses stack.
 421             // This registers if for fixup when this block's close paren is encountered.
 422             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
 423
 424             // Append a NOP to the compiled pattern.  This is the slot reserved
 425             //   for a SAVE in the event that there is yet another '|' following
 426             //   this one.
 427             appendOp(URX_NOP, 0);
 428             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
 429         }
 430         break;
 431
 432
 433     case doBeginNamedCapture:
 434         // Scanning (?<letter.
 435         //   The first letter of the name will come through again under doConinueNamedCapture.
 436         fCaptureName = new UnicodeString();
 437         if (fCaptureName == NULL) {
 438             error(U_MEMORY_ALLOCATION_ERROR);
 439         }
 440         break;
 441
 442     case  doContinueNamedCapture:
 443         fCaptureName->append(fC.fChar);
 444         break;
 445
 446     case doBadNamedCapture:
 447         error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
 448         break;
 449
 450     case doOpenCaptureParen:
 451         // Open Capturing Paren, possibly named.
 452         //   Compile to a
 453         //      - NOP, which later may be replaced by a save-state if the
 454         //         parenthesized group gets a * quantifier, followed by
 455         //      - START_CAPTURE  n    where n is stack frame offset to the capture group variables.
 456         //      - NOP, which may later be replaced by a save-state if there
 457         //             is an '|' alternation within the parens.
 458         //
 459         //    Each capture group gets three slots in the save stack frame:
 460         //         0: Capture Group start position (in input string being matched.)
 461         //         1: Capture Group end position.
 462         //         2: Start of Match-in-progress.
 463         //    The first two locations are for a completed capture group, and are
 464         //     referred to by back references and the like.
 465         //    The third location stores the capture start position when an START_CAPTURE is
 466         //      encountered.  This will be promoted to a completed capture when (and if) the corresponding
 467         //      END_CAPTURE is encountered.
 468         {
 469             fixLiterals();
 470             appendOp(URX_NOP, 0);
 471             int32_t  varsLoc = allocateStackData(3);    // Reserve three slots in match stack frame.
 472             appendOp(URX_START_CAPTURE, varsLoc);
 473             appendOp(URX_NOP, 0);
 474
 475             // On the Parentheses stack, start a new frame and add the postions
 476             //   of the two NOPs.  Depending on what follows in the pattern, the
 477             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
 478             //   address of the end of the parenthesized group.
 479             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
 480             fParenStack.push(capturing, *fStatus);                        // Frame type.
 481             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first  NOP location
 482             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
 483
 484             // Save the mapping from group number to stack frame variable position.
 485             fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
 486
 487             // If this is a named capture group, add the name->group number mapping.
 488             if (fCaptureName != NULL) {
 489                 int32_t groupNumber = fRXPat->fGroupMap->size();
 490                 int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
 491                 fCaptureName = NULL;    // hash table takes ownership of the name (key) string.
 492                 if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
 493                     error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
 494                 }
 495             }
 496         }
 497         break;
 498
 499     case doOpenNonCaptureParen:
 500         // Open non-caputuring (grouping only) Paren.
 501         //   Compile to a
 502         //      - NOP, which later may be replaced by a save-state if the
 503         //         parenthesized group gets a * quantifier, followed by
 504         //      - NOP, which may later be replaced by a save-state if there
 505         //             is an '|' alternation within the parens.
 506         {
 507             fixLiterals();
 508             appendOp(URX_NOP, 0);
 509             appendOp(URX_NOP, 0);
 510
 511             // On the Parentheses stack, start a new frame and add the postions
 512             //   of the two NOPs.
 513             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
 514             fParenStack.push(plain,      *fStatus);                       // Begin a new frame.
 515             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
 516             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
 517         }
 518          break;
 519
 520
 521     case doOpenAtomicParen:
 522         // Open Atomic Paren.  (?>
 523         //   Compile to a
 524         //      - NOP, which later may be replaced if the parenthesized group
 525         //         has a quantifier, followed by
 526         //      - STO_SP  save state stack position, so it can be restored at the ")"
 527         //      - NOP, which may later be replaced by a save-state if there
 528         //             is an '|' alternation within the parens.
 529         {
 530             fixLiterals();
 531             appendOp(URX_NOP, 0);
 532             int32_t  varLoc = allocateData(1);    // Reserve a data location for saving the state stack ptr.
 533             appendOp(URX_STO_SP, varLoc);
 534             appendOp(URX_NOP, 0);
 535
 536             // On the Parentheses stack, start a new frame and add the postions
 537             //   of the two NOPs.  Depending on what follows in the pattern, the
 538             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
 539             //   address of the end of the parenthesized group.
 540             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
 541             fParenStack.push(atomic, *fStatus);                           // Frame type.
 542             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
 543             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
 544         }
 545         break;
 546
 547
 548     case doOpenLookAhead:
 549         // Positive Look-ahead   (?=  stuff  )
 550         //
 551         //   Note:   Addition of transparent input regions, with the need to
 552         //           restore the original regions when failing out of a lookahead
 553         //           block, complicated this sequence.  Some conbined opcodes
 554         //           might make sense - or might not, lookahead aren't that common.
 555         //
 556         //      Caution:  min match length optimization knows about this
 557         //               sequence; don't change without making updates there too.
 558         //
 559         // Compiles to
 560         //    1    START_LA     dataLoc     Saves SP, Input Pos
 561         //    2.   STATE_SAVE   4            on failure of lookahead, goto 4
 562         //    3    JMP          6           continue ...
 563         //
 564         //    4.   LA_END                   Look Ahead failed.  Restore regions.
 565         //    5.   BACKTRACK                and back track again.
 566         //
 567         //    6.   NOP              reserved for use by quantifiers on the block.
 568         //                          Look-ahead can't have quantifiers, but paren stack
 569         //                             compile time conventions require the slot anyhow.
 570         //    7.   NOP              may be replaced if there is are '|' ops in the block.
 571         //    8.     code for parenthesized stuff.
 572         //    9.   LA_END
 573         //
 574         //  Two data slots are reserved, for saving the stack ptr and the input position.
 575         {
 576             fixLiterals();
 577             int32_t dataLoc = allocateData(2);
 578             appendOp(URX_LA_START, dataLoc);
 579             appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
 580             appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
 581             appendOp(URX_LA_END, dataLoc);
 582             appendOp(URX_BACKTRACK, 0);
 583             appendOp(URX_NOP, 0);
 584             appendOp(URX_NOP, 0);
 585
 586             // On the Parentheses stack, start a new frame and add the postions
 587             //   of the NOPs.
 588             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
 589             fParenStack.push(lookAhead, *fStatus);                        // Frame type.
 590             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
 591             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
 592         }
 593         break;
 594
 595     case doOpenLookAheadNeg:
 596         // Negated Lookahead.   (?! stuff )
 597         // Compiles to
 598         //    1.    START_LA    dataloc
 599         //    2.    SAVE_STATE  7         // Fail within look-ahead block restores to this state,
 600         //                                //   which continues with the match.
 601         //    3.    NOP                   // Std. Open Paren sequence, for possible '|'
 602         //    4.       code for parenthesized stuff.
 603         //    5.    END_LA                // Cut back stack, remove saved state from step 2.
 604         //    6.    BACKTRACK             // code in block succeeded, so neg. lookahead fails.
 605         //    7.    END_LA                // Restore match region, in case look-ahead was using
 606         //                                        an alternate (transparent) region.
 607         {
 608             fixLiterals();
 609             int32_t dataLoc = allocateData(2);
 610             appendOp(URX_LA_START, dataLoc);
 611             appendOp(URX_STATE_SAVE, 0);    // dest address will be patched later.
 612             appendOp(URX_NOP, 0);
 613
 614             // On the Parentheses stack, start a new frame and add the postions
 615             //   of the StateSave and NOP.
 616             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
 617             fParenStack.push(negLookAhead, *fStatus);                    // Frame type
 618             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The STATE_SAVE location
 619             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
 620
 621             // Instructions #5 - #7 will be added when the ')' is encountered.
 622         }
 623         break;
 624
 625     case doOpenLookBehind:
 626         {
 627             //   Compile a (?<= look-behind open paren.
 628             //
 629             //          Compiles to
 630             //              0       URX_LB_START     dataLoc
 631             //              1       URX_LB_CONT      dataLoc
 632             //              2                        MinMatchLen
 633             //              3                        MaxMatchLen
 634             //              4       URX_NOP          Standard '(' boilerplate.
 635             //              5       URX_NOP          Reserved slot for use with '|' ops within (block).
 636             //              6         <code for LookBehind expression>
 637             //              7       URX_LB_END       dataLoc    # Check match len, restore input  len
 638             //              8       URX_LA_END       dataLoc    # Restore stack, input pos
 639             //
 640             //          Allocate a block of matcher data, to contain (when running a match)
 641             //              0:    Stack ptr on entry
 642             //              1:    Input Index on entry
 643             //              2:    Start index of match current match attempt.
 644             //              3:    Original Input String len.
 645
 646             // Generate match code for any pending literals.
 647             fixLiterals();
 648
 649             // Allocate data space
 650             int32_t dataLoc = allocateData(4);
 651
 652             // Emit URX_LB_START
 653             appendOp(URX_LB_START, dataLoc);
 654
 655             // Emit URX_LB_CONT
 656             appendOp(URX_LB_CONT, dataLoc);
 657             appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
 658             appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
 659
 660             // Emit the NOPs
 661             appendOp(URX_NOP, 0);
 662             appendOp(URX_NOP, 0);
 663
 664             // On the Parentheses stack, start a new frame and add the postions
 665             //   of the URX_LB_CONT and the NOP.
 666             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
 667             fParenStack.push(lookBehind, *fStatus);                       // Frame type
 668             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
 669             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
 670
 671             // The final two instructions will be added when the ')' is encountered.
 672         }
 673
 674         break;
 675
 676     case doOpenLookBehindNeg:
 677         {
 678             //   Compile a (?<! negated look-behind open paren.
 679             //
 680             //          Compiles to
 681             //              0       URX_LB_START     dataLoc    # Save entry stack, input len
 682             //              1       URX_LBN_CONT     dataLoc    # Iterate possible match positions
 683             //              2                        MinMatchLen
 684             //              3                        MaxMatchLen
 685             //              4                        continueLoc (9)
 686             //              5       URX_NOP          Standard '(' boilerplate.
 687             //              6       URX_NOP          Reserved slot for use with '|' ops within (block).
 688             //              7         <code for LookBehind expression>
 689             //              8       URX_LBN_END      dataLoc    # Check match len, cause a FAIL
 690             //              9       ...
 691             //
 692             //          Allocate a block of matcher data, to contain (when running a match)
 693             //              0:    Stack ptr on entry
 694             //              1:    Input Index on entry
 695             //              2:    Start index of match current match attempt.
 696             //              3:    Original Input String len.
 697
 698             // Generate match code for any pending literals.
 699             fixLiterals();
 700
 701             // Allocate data space
 702             int32_t dataLoc = allocateData(4);
 703
 704             // Emit URX_LB_START
 705             appendOp(URX_LB_START, dataLoc);
 706
 707             // Emit URX_LBN_CONT
 708             appendOp(URX_LBN_CONT, dataLoc);
 709             appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
 710             appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
 711             appendOp(URX_RESERVED_OP, 0);    // Continue Loc.    To be filled later.
 712
 713             // Emit the NOPs
 714             appendOp(URX_NOP, 0);
 715             appendOp(URX_NOP, 0);
 716
 717             // On the Parentheses stack, start a new frame and add the postions
 718             //   of the URX_LB_CONT and the NOP.
 719             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
 720             fParenStack.push(lookBehindN, *fStatus);                      // Frame type
 721             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
 722             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
 723
 724             // The final two instructions will be added when the ')' is encountered.
 725         }
 726         break;
 727
 728     case doConditionalExpr:
 729         // Conditionals such as (?(1)a:b)
 730     case doPerlInline:
 731         // Perl inline-condtionals.  (?{perl code}a|b) We're not perl, no way to do them.
 732         error(U_REGEX_UNIMPLEMENTED);
 733         break;
 734
 735
 736     case doCloseParen:
 737         handleCloseParen();
 738         if (fParenStack.size() <= 0) {
 739             //  Extra close paren, or missing open paren.
 740             error(U_REGEX_MISMATCHED_PAREN);
 741         }
 742         break;
 743
 744     case doNOP:
 745         break;
 746
 747
 748     case doBadOpenParenType:
 749     case doRuleError:
 750         error(U_REGEX_RULE_SYNTAX);
 751         break;
 752
 753
 754     case doMismatchedParenErr:
 755         error(U_REGEX_MISMATCHED_PAREN);
 756         break;
 757
 758     case doPlus:
 759         //  Normal '+'  compiles to
 760         //     1.   stuff to be repeated  (already built)
 761         //     2.   jmp-sav 1
 762         //     3.   ...
 763         //
 764         //  Or, if the item to be repeated can match a zero length string,
 765         //     1.   STO_INP_LOC  data-loc
 766         //     2.      body of stuff to be repeated
 767         //     3.   JMP_SAV_X    2
 768         //     4.   ...
 769
 770         //
 771         //  Or, if the item to be repeated is simple
 772         //     1.   Item to be repeated.
 773         //     2.   LOOP_SR_I    set number  (assuming repeated item is a set ref)
 774         //     3.   LOOP_C       stack location
 775         {
 776             int32_t  topLoc = blockTopLoc(FALSE);        // location of item #1
 777             int32_t  frameLoc;
 778
 779             // Check for simple constructs, which may get special optimized code.
 780             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
 781                 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
 782
 783                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
 784                     // Emit optimized code for [char set]+
 785                     appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
 786                     frameLoc = allocateStackData(1);
 787                     appendOp(URX_LOOP_C, frameLoc);
 788                     break;
 789                 }
 790
 791                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
 792                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
 793                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
 794                     // Emit Optimized code for .+ operations.
 795                     int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
 796                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
 797                         // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
 798                         loopOpI |= 1;
 799                     }
 800                     if (fModeFlags & UREGEX_UNIX_LINES) {
 801                         loopOpI |= 2;
 802                     }
 803                     appendOp(loopOpI);
 804                     frameLoc = allocateStackData(1);
 805                     appendOp(URX_LOOP_C, frameLoc);
 806                     break;
 807                 }
 808
 809             }
 810
 811             // General case.
 812
 813             // Check for minimum match length of zero, which requires
 814             //    extra loop-breaking code.
 815             if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
 816                 // Zero length match is possible.
 817                 // Emit the code sequence that can handle it.
 818                 insertOp(topLoc);
 819                 frameLoc = allocateStackData(1);
 820
 821                 int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
 822                 fRXPat->fCompiledPat->setElementAt(op, topLoc);
 823
 824                 appendOp(URX_JMP_SAV_X, topLoc+1);
 825             } else {
 826                 // Simpler code when the repeated body must match something non-empty
 827                 appendOp(URX_JMP_SAV, topLoc);
 828             }
 829         }
 830         break;
 831
 832     case doNGPlus:
 833         //  Non-greedy '+?'  compiles to
 834         //     1.   stuff to be repeated  (already built)
 835         //     2.   state-save  1
 836         //     3.   ...
 837         {
 838             int32_t topLoc      = blockTopLoc(FALSE);
 839             appendOp(URX_STATE_SAVE, topLoc);
 840         }
 841         break;
 842
 843
 844     case doOpt:
 845         // Normal (greedy) ? quantifier.
 846         //  Compiles to
 847         //     1. state save 3
 848         //     2.    body of optional block
 849         //     3. ...
 850         // Insert the state save into the compiled pattern, and we're done.
 851         {
 852             int32_t   saveStateLoc = blockTopLoc(TRUE);
 853             int32_t   saveStateOp  = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
 854             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
 855         }
 856         break;
 857
 858     case doNGOpt:
 859         // Non-greedy ?? quantifier
 860         //   compiles to
 861         //    1.  jmp   4
 862         //    2.     body of optional block
 863         //    3   jmp   5
 864         //    4.  state save 2
 865         //    5    ...
 866         //  This code is less than ideal, with two jmps instead of one, because we can only
 867         //  insert one instruction at the top of the block being iterated.
 868         {
 869             int32_t  jmp1_loc = blockTopLoc(TRUE);
 870             int32_t  jmp2_loc = fRXPat->fCompiledPat->size();
 871
 872             int32_t  jmp1_op  = buildOp(URX_JMP, jmp2_loc+1);
 873             fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
 874
 875             appendOp(URX_JMP, jmp2_loc+2);
 876
 877             appendOp(URX_STATE_SAVE, jmp1_loc+1);
 878         }
 879         break;
 880
 881
 882     case doStar:
 883         // Normal (greedy) * quantifier.
 884         // Compiles to
 885         //       1.   STATE_SAVE   4
 886         //       2.      body of stuff being iterated over
 887         //       3.   JMP_SAV      2
 888         //       4.   ...
 889         //
 890         // Or, if the body is a simple [Set],
 891         //       1.   LOOP_SR_I    set number
 892         //       2.   LOOP_C       stack location
 893         //       ...
 894         //
 895         // Or if this is a .*
 896         //       1.   LOOP_DOT_I    (. matches all mode flag)
 897         //       2.   LOOP_C        stack location
 898         //
 899         // Or, if the body can match a zero-length string, to inhibit infinite loops,
 900         //       1.   STATE_SAVE   5
 901         //       2.   STO_INP_LOC  data-loc
 902         //       3.      body of stuff
 903         //       4.   JMP_SAV_X    2
 904         //       5.   ...
 905         {
 906             // location of item #1, the STATE_SAVE
 907             int32_t   topLoc = blockTopLoc(FALSE);
 908             int32_t   dataLoc = -1;
 909
 910             // Check for simple *, where the construct being repeated
 911             //   compiled to single opcode, and might be optimizable.
 912             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
 913                 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
 914
 915                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
 916                     // Emit optimized code for a [char set]*
 917                     int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
 918                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
 919                     dataLoc = allocateStackData(1);
 920                     appendOp(URX_LOOP_C, dataLoc);
 921                     break;
 922                 }
 923
 924                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
 925                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
 926                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
 927                     // Emit Optimized code for .* operations.
 928                     int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
 929                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
 930                         // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
 931                         loopOpI |= 1;
 932                     }
 933                     if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
 934                         loopOpI |= 2;
 935                     }
 936                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
 937                     dataLoc = allocateStackData(1);
 938                     appendOp(URX_LOOP_C, dataLoc);
 939                     break;
 940                 }
 941             }
 942
 943             // Emit general case code for this *
 944             // The optimizations did not apply.
 945
 946             int32_t   saveStateLoc = blockTopLoc(TRUE);
 947             int32_t   jmpOp        = buildOp(URX_JMP_SAV, saveStateLoc+1);
 948
 949             // Check for minimum match length of zero, which requires
 950             //    extra loop-breaking code.
 951             if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
 952                 insertOp(saveStateLoc);
 953                 dataLoc = allocateStackData(1);
 954
 955                 int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
 956                 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
 957                 jmpOp      = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
 958             }
 959
 960             // Locate the position in the compiled pattern where the match will continue
 961             //   after completing the *.   (4 or 5 in the comment above)
 962             int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
 963
 964             // Put together the save state op and store it into the compiled code.
 965             int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
 966             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
 967
 968             // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
 969             appendOp(jmpOp);
 970         }
 971         break;
 972
 973     case doNGStar:
 974         // Non-greedy *? quantifier
 975         // compiles to
 976         //     1.   JMP    3
 977         //     2.      body of stuff being iterated over
 978         //     3.   STATE_SAVE  2
 979         //     4    ...
 980         {
 981             int32_t     jmpLoc  = blockTopLoc(TRUE);                   // loc  1.
 982             int32_t     saveLoc = fRXPat->fCompiledPat->size();        // loc  3.
 983             int32_t     jmpOp   = buildOp(URX_JMP, saveLoc);
 984             fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
 985             appendOp(URX_STATE_SAVE, jmpLoc+1);
 986         }
 987         break;
 988
 989
 990     case doIntervalInit:
 991         // The '{' opening an interval quantifier was just scanned.
 992         // Init the counter varaiables that will accumulate the values as the digits
 993         //    are scanned.
 994         fIntervalLow = 0;
 995         fIntervalUpper = -1;
 996         break;
 997
 998     case doIntevalLowerDigit:
 999         // Scanned a digit from the lower value of an {lower,upper} interval
1000         {
1001             int32_t digitValue = u_charDigitValue(fC.fChar);
1002             U_ASSERT(digitValue >= 0);
1003             int64_t val = (int64_t)fIntervalLow*10 + digitValue;
1004             if (val > INT32_MAX) {
1005                 error(U_REGEX_NUMBER_TOO_BIG);
1006             } else {
1007                 fIntervalLow = (int32_t)val;
1008             }
1009         }
1010         break;
1011
1012     case doIntervalUpperDigit:
1013         // Scanned a digit from the upper value of an {lower,upper} interval
1014         {
1015             if (fIntervalUpper < 0) {
1016                 fIntervalUpper = 0;
1017             }
1018             int32_t digitValue = u_charDigitValue(fC.fChar);
1019             U_ASSERT(digitValue >= 0);
1020             int64_t val = (int64_t)fIntervalUpper*10 + digitValue;
1021             if (val > INT32_MAX) {
1022                 error(U_REGEX_NUMBER_TOO_BIG);
1023             } else {
1024                 fIntervalUpper = (int32_t)val;
1025             }
1026         }
1027         break;
1028
1029     case doIntervalSame:
1030         // Scanned a single value interval like {27}.  Upper = Lower.
1031         fIntervalUpper = fIntervalLow;
1032         break;
1033
1034     case doInterval:
1035         // Finished scanning a normal {lower,upper} interval.  Generate the code for it.
1036         if (compileInlineInterval() == FALSE) {
1037             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1038         }
1039         break;
1040
1041     case doPossessiveInterval:
1042         // Finished scanning a Possessive {lower,upper}+ interval.  Generate the code for it.
1043         {
1044             // Remember the loc for the top of the block being looped over.
1045             //   (Can not reserve a slot in the compiled pattern at this time, because
1046             //    compileInterval needs to reserve also, and blockTopLoc can only reserve
1047             //    once per block.)
1048             int32_t topLoc = blockTopLoc(FALSE);
1049
1050             // Produce normal looping code.
1051             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1052
1053             // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1054             //  just as if the loop was inclosed in atomic parentheses.
1055
1056             // First the STO_SP before the start of the loop
1057             insertOp(topLoc);
1058
1059             int32_t  varLoc = allocateData(1);   // Reserve a data location for saving the
1060             int32_t  op     = buildOp(URX_STO_SP, varLoc);
1061             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1062
1063             int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
1064             U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1065             loopOp++;     // point LoopOp after the just-inserted STO_SP
1066             fRXPat->fCompiledPat->push(loopOp, *fStatus);
1067
1068             // Then the LD_SP after the end of the loop
1069             appendOp(URX_LD_SP, varLoc);
1070         }
1071
1072         break;
1073
1074     case doNGInterval:
1075         // Finished scanning a non-greedy {lower,upper}? interval.  Generate the code for it.
1076         compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1077         break;
1078
1079     case doIntervalError:
1080         error(U_REGEX_BAD_INTERVAL);
1081         break;
1082
1083     case doLiteralChar:
1084         // We've just scanned a "normal" character from the pattern,
1085         literalChar(fC.fChar);
1086         break;
1087
1088
1089     case doEscapedLiteralChar:
1090         // We've just scanned an backslashed escaped character with  no
1091         //   special meaning.  It represents itself.
1092         if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1093             ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1094             (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1095                error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1096              }
1097         literalChar(fC.fChar);
1098         break;
1099
1100
1101     case doDotAny:
1102         // scanned a ".",  match any single character.
1103         {
1104             fixLiterals(FALSE);
1105             if (fModeFlags & UREGEX_DOTALL) {
1106                 appendOp(URX_DOTANY_ALL, 0);
1107             } else if (fModeFlags & UREGEX_UNIX_LINES) {
1108                 appendOp(URX_DOTANY_UNIX, 0);
1109             } else {
1110                 appendOp(URX_DOTANY, 0);
1111             }
1112         }
1113         break;
1114
1115     case doCaret:
1116         {
1117             fixLiterals(FALSE);
1118             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1119                 appendOp(URX_CARET, 0);
1120             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1121                 appendOp(URX_CARET_M, 0);
1122             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1123                 appendOp(URX_CARET, 0);   // Only testing true start of input.
1124             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1125                 appendOp(URX_CARET_M_UNIX, 0);
1126             }
1127         }
1128         break;
1129
1130     case doDollar:
1131         {
1132             fixLiterals(FALSE);
1133             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1134                 appendOp(URX_DOLLAR, 0);
1135             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1136                 appendOp(URX_DOLLAR_M, 0);
1137             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1138                 appendOp(URX_DOLLAR_D, 0);
1139             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1140                 appendOp(URX_DOLLAR_MD, 0);
1141             }
1142         }
1143         break;
1144
1145     case doBackslashA:
1146         fixLiterals(FALSE);
1147         appendOp(URX_CARET, 0);
1148         break;
1149
1150     case doBackslashB:
1151         {
1152             #if  UCONFIG_NO_BREAK_ITERATION==1
1153             if (fModeFlags & UREGEX_UWORD) {
1154                 error(U_UNSUPPORTED_ERROR);
1155             }
1156             #endif
1157             fixLiterals(FALSE);
1158             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1159             appendOp(op, 1);
1160         }
1161         break;
1162
1163     case doBackslashb:
1164         {
1165             #if  UCONFIG_NO_BREAK_ITERATION==1
1166             if (fModeFlags & UREGEX_UWORD) {
1167                 error(U_UNSUPPORTED_ERROR);
1168             }
1169             #endif
1170             fixLiterals(FALSE);
1171             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1172             appendOp(op, 0);
1173         }
1174         break;
1175
1176     case doBackslashD:
1177         fixLiterals(FALSE);
1178         appendOp(URX_BACKSLASH_D, 1);
1179         break;
1180
1181     case doBackslashd:
1182         fixLiterals(FALSE);
1183         appendOp(URX_BACKSLASH_D, 0);
1184         break;
1185
1186     case doBackslashG:
1187         fixLiterals(FALSE);
1188         appendOp(URX_BACKSLASH_G, 0);
1189         break;
1190
1191     case doBackslashH:
1192         fixLiterals(FALSE);
1193         appendOp(URX_BACKSLASH_H, 1);
1194         break;
1195
1196     case doBackslashh:
1197         fixLiterals(FALSE);
1198         appendOp(URX_BACKSLASH_H, 0);
1199         break;
1200
1201     case doBackslashR:
1202         fixLiterals(FALSE);
1203         appendOp(URX_BACKSLASH_R, 0);
1204         break;
1205
1206     case doBackslashS:
1207         fixLiterals(FALSE);
1208         appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
1209         break;
1210
1211     case doBackslashs:
1212         fixLiterals(FALSE);
1213         appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
1214         break;
1215
1216     case doBackslashV:
1217         fixLiterals(FALSE);
1218         appendOp(URX_BACKSLASH_V, 1);
1219         break;
1220
1221     case doBackslashv:
1222         fixLiterals(FALSE);
1223         appendOp(URX_BACKSLASH_V, 0);
1224         break;
1225
1226     case doBackslashW:
1227         fixLiterals(FALSE);
1228         appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
1229         break;
1230
1231     case doBackslashw:
1232         fixLiterals(FALSE);
1233         appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
1234         break;
1235
1236     case doBackslashX:
1237         fixLiterals(FALSE);
1238         appendOp(URX_BACKSLASH_X, 0);
1239         break;
1240
1241
1242     case doBackslashZ:
1243         fixLiterals(FALSE);
1244         appendOp(URX_DOLLAR, 0);
1245         break;
1246
1247     case doBackslashz:
1248         fixLiterals(FALSE);
1249         appendOp(URX_BACKSLASH_Z, 0);
1250         break;
1251
1252     case doEscapeError:
1253         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1254         break;
1255
1256     case doExit:
1257         fixLiterals(FALSE);
1258         returnVal = FALSE;
1259         break;
1260
1261     case doProperty:
1262         {
1263             fixLiterals(FALSE);
1264             UnicodeSet *theSet = scanProp();
1265             compileSet(theSet);
1266         }
1267         break;
1268
1269     case doNamedChar:
1270         {
1271             UChar32 c = scanNamedChar();
1272             literalChar(c);
1273         }
1274         break;
1275
1276
1277     case doBackRef:
1278         // BackReference.  Somewhat unusual in that the front-end can not completely parse
1279         //                 the regular expression, because the number of digits to be consumed
1280         //                 depends on the number of capture groups that have been defined.  So
1281         //                 we have to do it here instead.
1282         {
1283             int32_t  numCaptureGroups = fRXPat->fGroupMap->size();
1284             int32_t  groupNum = 0;
1285             UChar32  c        = fC.fChar;
1286
1287             for (;;) {
1288                 // Loop once per digit, for max allowed number of digits in a back reference.
1289                 int32_t digit = u_charDigitValue(c);
1290                 groupNum = groupNum * 10 + digit;
1291                 if (groupNum >= numCaptureGroups) {
1292                     break;
1293                 }
1294                 c = peekCharLL();
1295                 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
1296                     break;
1297                 }
1298                 nextCharLL();
1299             }
1300
1301             // Scan of the back reference in the source regexp is complete.  Now generate
1302             //  the compiled code for it.
1303             // Because capture groups can be forward-referenced by back-references,
1304             //  we fill the operand with the capture group number.  At the end
1305             //  of compilation, it will be changed to the variable's location.
1306             U_ASSERT(groupNum > 0);  // Shouldn't happen.  '\0' begins an octal escape sequence,
1307                                      //    and shouldn't enter this code path at all.
1308             fixLiterals(FALSE);
1309             if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1310                 appendOp(URX_BACKREF_I, groupNum);
1311             } else {
1312                 appendOp(URX_BACKREF, groupNum);
1313             }
1314         }
1315         break;
1316
1317     case doBeginNamedBackRef:
1318         U_ASSERT(fCaptureName == NULL);
1319         fCaptureName = new UnicodeString;
1320         if (fCaptureName == NULL) {
1321             error(U_MEMORY_ALLOCATION_ERROR);
1322         }
1323         break;
1324
1325     case doContinueNamedBackRef:
1326         fCaptureName->append(fC.fChar);
1327         break;
1328
1329     case doCompleteNamedBackRef:
1330         {
1331         int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName);
1332         if (groupNumber == 0) {
1333             // Group name has not been defined.
1334             //   Could be a forward reference. If we choose to support them at some
1335             //   future time, extra mechanism will be required at this point.
1336             error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
1337         } else {
1338             // Given the number, handle identically to a \n numbered back reference.
1339             // See comments above, under doBackRef
1340             fixLiterals(FALSE);
1341             if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1342                 appendOp(URX_BACKREF_I, groupNumber);
1343             } else {
1344                 appendOp(URX_BACKREF, groupNumber);
1345             }
1346         }
1347         delete fCaptureName;
1348         fCaptureName = NULL;
1349         break;
1350         }
1351
1352     case doPossessivePlus:
1353         // Possessive ++ quantifier.
1354         // Compiles to
1355         //       1.   STO_SP
1356         //       2.      body of stuff being iterated over
1357         //       3.   STATE_SAVE 5
1358         //       4.   JMP        2
1359         //       5.   LD_SP
1360         //       6.   ...
1361         //
1362         //  Note:  TODO:  This is pretty inefficient.  A mass of saved state is built up
1363         //                then unconditionally discarded.  Perhaps introduce a new opcode.  Ticket 6056
1364         //
1365         {
1366             // Emit the STO_SP
1367             int32_t   topLoc = blockTopLoc(TRUE);
1368             int32_t   stoLoc = allocateData(1);  // Reserve the data location for storing save stack ptr.
1369             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1370             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1371
1372             // Emit the STATE_SAVE
1373             appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
1374
1375             // Emit the JMP
1376             appendOp(URX_JMP, topLoc+1);
1377
1378             // Emit the LD_SP
1379             appendOp(URX_LD_SP, stoLoc);
1380         }
1381         break;
1382
1383     case doPossessiveStar:
1384         // Possessive *+ quantifier.
1385         // Compiles to
1386         //       1.   STO_SP       loc
1387         //       2.   STATE_SAVE   5
1388         //       3.      body of stuff being iterated over
1389         //       4.   JMP          2
1390         //       5.   LD_SP        loc
1391         //       6    ...
1392         // TODO:  do something to cut back the state stack each time through the loop.
1393         {
1394             // Reserve two slots at the top of the block.
1395             int32_t   topLoc = blockTopLoc(TRUE);
1396             insertOp(topLoc);
1397
1398             // emit   STO_SP     loc
1399             int32_t   stoLoc = allocateData(1);    // Reserve the data location for storing save stack ptr.
1400             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1401             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1402
1403             // Emit the SAVE_STATE   5
1404             int32_t L7 = fRXPat->fCompiledPat->size()+1;
1405             op = buildOp(URX_STATE_SAVE, L7);
1406             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1407
1408             // Append the JMP operation.
1409             appendOp(URX_JMP, topLoc+1);
1410
1411             // Emit the LD_SP       loc
1412             appendOp(URX_LD_SP, stoLoc);
1413         }
1414         break;
1415
1416     case doPossessiveOpt:
1417         // Possessive  ?+ quantifier.
1418         //  Compiles to
1419         //     1. STO_SP      loc
1420         //     2. SAVE_STATE  5
1421         //     3.    body of optional block
1422         //     4. LD_SP       loc
1423         //     5. ...
1424         //
1425         {
1426             // Reserve two slots at the top of the block.
1427             int32_t   topLoc = blockTopLoc(TRUE);
1428             insertOp(topLoc);
1429
1430             // Emit the STO_SP
1431             int32_t   stoLoc = allocateData(1);   // Reserve the data location for storing save stack ptr.
1432             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1433             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1434
1435             // Emit the SAVE_STATE
1436             int32_t   continueLoc = fRXPat->fCompiledPat->size()+1;
1437             op = buildOp(URX_STATE_SAVE, continueLoc);
1438             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1439
1440             // Emit the LD_SP
1441             appendOp(URX_LD_SP, stoLoc);
1442         }
1443         break;
1444
1445
1446     case doBeginMatchMode:
1447         fNewModeFlags = fModeFlags;
1448         fSetModeFlag  = TRUE;
1449         break;
1450
1451     case doMatchMode:   //  (?i)    and similar
1452         {
1453             int32_t  bit = 0;
1454             switch (fC.fChar) {
1455             case 0x69: /* 'i' */   bit = UREGEX_CASE_INSENSITIVE; break;
1456             case 0x64: /* 'd' */   bit = UREGEX_UNIX_LINES;       break;
1457             case 0x6d: /* 'm' */   bit = UREGEX_MULTILINE;        break;
1458             case 0x73: /* 's' */   bit = UREGEX_DOTALL;           break;
1459             case 0x75: /* 'u' */   bit = 0; /* Unicode casing */  break;
1460             case 0x77: /* 'w' */   bit = UREGEX_UWORD;            break;
1461             case 0x78: /* 'x' */   bit = UREGEX_COMMENTS;         break;
1462             case 0x2d: /* '-' */   fSetModeFlag = FALSE;          break;
1463             default:
1464                 U_ASSERT(FALSE);   // Should never happen.  Other chars are filtered out
1465                                    // by the scanner.
1466             }
1467             if (fSetModeFlag) {
1468                 fNewModeFlags |= bit;
1469             } else {
1470                 fNewModeFlags &= ~bit;
1471             }
1472         }
1473         break;
1474
1475     case doSetMatchMode:
1476         // Emit code to match any pending literals, using the not-yet changed match mode.
1477         fixLiterals();
1478
1479         // We've got a (?i) or similar.  The match mode is being changed, but
1480         //   the change is not scoped to a parenthesized block.
1481         U_ASSERT(fNewModeFlags < 0);
1482         fModeFlags = fNewModeFlags;
1483
1484         break;
1485
1486
1487     case doMatchModeParen:
1488         // We've got a (?i: or similar.  Begin a parenthesized block, save old
1489         //   mode flags so they can be restored at the close of the block.
1490         //
1491         //   Compile to a
1492         //      - NOP, which later may be replaced by a save-state if the
1493         //         parenthesized group gets a * quantifier, followed by
1494         //      - NOP, which may later be replaced by a save-state if there
1495         //             is an '|' alternation within the parens.
1496         {
1497             fixLiterals(FALSE);
1498             appendOp(URX_NOP, 0);
1499             appendOp(URX_NOP, 0);
1500
1501             // On the Parentheses stack, start a new frame and add the postions
1502             //   of the two NOPs (a normal non-capturing () frame, except for the
1503             //   saving of the orignal mode flags.)
1504             fParenStack.push(fModeFlags, *fStatus);
1505             fParenStack.push(flags, *fStatus);                            // Frame Marker
1506             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
1507             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
1508
1509             // Set the current mode flags to the new values.
1510             U_ASSERT(fNewModeFlags < 0);
1511             fModeFlags = fNewModeFlags;
1512         }
1513         break;
1514
1515     case doBadModeFlag:
1516         error(U_REGEX_INVALID_FLAG);
1517         break;
1518
1519     case doSuppressComments:
1520         // We have just scanned a '(?'.  We now need to prevent the character scanner from
1521         // treating a '#' as a to-the-end-of-line comment.
1522         //   (This Perl compatibility just gets uglier and uglier to do...)
1523         fEOLComments = FALSE;
1524         break;
1525
1526
1527     case doSetAddAmp:
1528         {
1529           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1530           set->add(chAmp);
1531         }
1532         break;
1533
1534     case doSetAddDash:
1535         {
1536           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1537           set->add(chDash);
1538         }
1539         break;
1540
1541      case doSetBackslash_s:
1542         {
1543          UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1544          set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1545          break;
1546         }
1547
1548      case doSetBackslash_S:
1549         {
1550             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1551             UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1552             SSet.complement();
1553             set->addAll(SSet);
1554             break;
1555         }
1556
1557     case doSetBackslash_d:
1558         {
1559             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1560             // TODO - make a static set, ticket 6058.
1561             addCategory(set, U_GC_ND_MASK, *fStatus);
1562             break;
1563         }
1564
1565     case doSetBackslash_D:
1566         {
1567             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1568             UnicodeSet digits;
1569             // TODO - make a static set, ticket 6058.
1570             digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
1571             digits.complement();
1572             set->addAll(digits);
1573             break;
1574         }
1575
1576     case doSetBackslash_h:
1577         {
1578             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1579             UnicodeSet h;
1580             h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1581             h.add((UChar32)9);   // Tab
1582             set->addAll(h);
1583             break;
1584         }
1585
1586     case doSetBackslash_H:
1587         {
1588             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1589             UnicodeSet h;
1590             h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1591             h.add((UChar32)9);   // Tab
1592             h.complement();
1593             set->addAll(h);
1594             break;
1595         }
1596
1597     case doSetBackslash_v:
1598         {
1599             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1600             set->add((UChar32)0x0a, (UChar32)0x0d);  // add range
1601             set->add((UChar32)0x85);
1602             set->add((UChar32)0x2028, (UChar32)0x2029);
1603             break;
1604         }
1605
1606     case doSetBackslash_V:
1607         {
1608             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1609             UnicodeSet v;
1610             v.add((UChar32)0x0a, (UChar32)0x0d);  // add range
1611             v.add((UChar32)0x85);
1612             v.add((UChar32)0x2028, (UChar32)0x2029);
1613             v.complement();
1614             set->addAll(v);
1615             break;
1616         }
1617
1618     case doSetBackslash_w:
1619         {
1620             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1621             set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1622             break;
1623         }
1624
1625     case doSetBackslash_W:
1626         {
1627             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1628             UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1629             SSet.complement();
1630             set->addAll(SSet);
1631             break;
1632         }
1633
1634     case doSetBegin:
1635         fixLiterals(FALSE);
1636         fSetStack.push(new UnicodeSet(), *fStatus);
1637         fSetOpStack.push(setStart, *fStatus);
1638         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1639             fSetOpStack.push(setCaseClose, *fStatus);
1640         }
1641         break;
1642
1643     case doSetBeginDifference1:
1644         //  We have scanned something like [[abc]-[
1645         //  Set up a new UnicodeSet for the set beginning with the just-scanned '['
1646         //  Push a Difference operator, which will cause the new set to be subtracted from what
1647         //    went before once it is created.
1648         setPushOp(setDifference1);
1649         fSetOpStack.push(setStart, *fStatus);
1650         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1651             fSetOpStack.push(setCaseClose, *fStatus);
1652         }
1653         break;
1654
1655     case doSetBeginIntersection1:
1656         //  We have scanned something like  [[abc]&[
1657         //   Need both the '&' operator and the open '[' operator.
1658         setPushOp(setIntersection1);
1659         fSetOpStack.push(setStart, *fStatus);
1660         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1661             fSetOpStack.push(setCaseClose, *fStatus);
1662         }
1663         break;
1664
1665     case doSetBeginUnion:
1666         //  We have scanned something like  [[abc][
1667         //     Need to handle the union operation explicitly [[abc] | [
1668         setPushOp(setUnion);
1669         fSetOpStack.push(setStart, *fStatus);
1670         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1671             fSetOpStack.push(setCaseClose, *fStatus);
1672         }
1673         break;
1674
1675     case doSetDifference2:
1676         // We have scanned something like [abc--
1677         //   Consider this to unambiguously be a set difference operator.
1678         setPushOp(setDifference2);
1679         break;
1680
1681     case doSetEnd:
1682         // Have encountered the ']' that closes a set.
1683         //    Force the evaluation of any pending operations within this set,
1684         //    leave the completed set on the top of the set stack.
1685         setEval(setEnd);
1686         U_ASSERT(fSetOpStack.peeki()==setStart);
1687         fSetOpStack.popi();
1688         break;
1689
1690     case doSetFinish:
1691         {
1692         // Finished a complete set expression, including all nested sets.
1693         //   The close bracket has already triggered clearing out pending set operators,
1694         //    the operator stack should be empty and the operand stack should have just
1695         //    one entry, the result set.
1696         U_ASSERT(fSetOpStack.empty());
1697         UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
1698         U_ASSERT(fSetStack.empty());
1699         compileSet(theSet);
1700         break;
1701         }
1702
1703     case doSetIntersection2:
1704         // Have scanned something like [abc&&
1705         setPushOp(setIntersection2);
1706         break;
1707
1708     case doSetLiteral:
1709         // Union the just-scanned literal character into the set being built.
1710         //    This operation is the highest precedence set operation, so we can always do
1711         //    it immediately, without waiting to see what follows.  It is necessary to perform
1712         //    any pending '-' or '&' operation first, because these have the same precedence
1713         //    as union-ing in a literal'
1714         {
1715             setEval(setUnion);
1716             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1717             s->add(fC.fChar);
1718             fLastSetLiteral = fC.fChar;
1719             break;
1720         }
1721
1722     case doSetLiteralEscaped:
1723         // A back-slash escaped literal character was encountered.
1724         // Processing is the same as with setLiteral, above, with the addition of
1725         //  the optional check for errors on escaped ASCII letters.
1726         {
1727             if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1728                 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1729                  (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1730                 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1731             }
1732             setEval(setUnion);
1733             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1734             s->add(fC.fChar);
1735             fLastSetLiteral = fC.fChar;
1736             break;
1737         }
1738
1739         case doSetNamedChar:
1740         // Scanning a \N{UNICODE CHARACTER NAME}
1741         //  Aside from the source of the character, the processing is identical to doSetLiteral,
1742         //    above.
1743         {
1744             UChar32  c = scanNamedChar();
1745             setEval(setUnion);
1746             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1747             s->add(c);
1748             fLastSetLiteral = c;
1749             break;
1750         }
1751
1752     case doSetNamedRange:
1753         // We have scanned literal-\N{CHAR NAME}.  Add the range to the set.
1754         // The left character is already in the set, and is saved in fLastSetLiteral.
1755         // The right side needs to be picked up, the scan is at the 'N'.
1756         // Lower Limit > Upper limit being an error matches both Java
1757         //        and ICU UnicodeSet behavior.
1758         {
1759             UChar32  c = scanNamedChar();
1760             if (U_SUCCESS(*fStatus) && fLastSetLiteral > c) {
1761                 error(U_REGEX_INVALID_RANGE);
1762             }
1763             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1764             s->add(fLastSetLiteral, c);
1765             fLastSetLiteral = c;
1766             break;
1767         }
1768
1769
1770     case  doSetNegate:
1771         // Scanned a '^' at the start of a set.
1772         // Push the negation operator onto the set op stack.
1773         // A twist for case-insensitive matching:
1774         //   the case closure operation must happen _before_ negation.
1775         //   But the case closure operation will already be on the stack if it's required.
1776         //   This requires checking for case closure, and swapping the stack order
1777         //    if it is present.
1778         {
1779             int32_t  tosOp = fSetOpStack.peeki();
1780             if (tosOp == setCaseClose) {
1781                 fSetOpStack.popi();
1782                 fSetOpStack.push(setNegation, *fStatus);
1783                 fSetOpStack.push(setCaseClose, *fStatus);
1784             } else {
1785                 fSetOpStack.push(setNegation, *fStatus);
1786             }
1787         }
1788         break;
1789
1790     case doSetNoCloseError:
1791         error(U_REGEX_MISSING_CLOSE_BRACKET);
1792         break;
1793
1794     case doSetOpError:
1795         error(U_REGEX_RULE_SYNTAX);   //  -- or && at the end of a set.  Illegal.
1796         break;
1797
1798     case doSetPosixProp:
1799         {
1800             UnicodeSet *s = scanPosixProp();
1801             if (s != NULL) {
1802                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1803                 tos->addAll(*s);
1804                 delete s;
1805             }  // else error.  scanProp() reported the error status already.
1806         }
1807         break;
1808
1809     case doSetProp:
1810         //  Scanned a \p \P within [brackets].
1811         {
1812             UnicodeSet *s = scanProp();
1813             if (s != NULL) {
1814                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1815                 tos->addAll(*s);
1816                 delete s;
1817             }  // else error.  scanProp() reported the error status already.
1818         }
1819         break;
1820
1821
1822     case doSetRange:
1823         // We have scanned literal-literal.  Add the range to the set.
1824         // The left character is already in the set, and is saved in fLastSetLiteral.
1825         // The right side is the current character.
1826         // Lower Limit > Upper limit being an error matches both Java
1827         //        and ICU UnicodeSet behavior.
1828         {
1829         if (fLastSetLiteral > fC.fChar) {
1830             error(U_REGEX_INVALID_RANGE);
1831         }
1832         UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1833         s->add(fLastSetLiteral, fC.fChar);
1834         break;
1835         }
1836
1837     default:
1838         U_ASSERT(FALSE);
1839         error(U_REGEX_INTERNAL_ERROR);
1840         break;
1841     }
1842
1843     if (U_FAILURE(*fStatus)) {
1844         returnVal = FALSE;
1845     }
1846
1847     return returnVal;
1848 }
1849
1850
1851
1852 //------------------------------------------------------------------------------
1853 //
1854 //   literalChar           We've encountered a literal character from the pattern,
1855 //                             or an escape sequence that reduces to a character.
1856 //                         Add it to the string containing all literal chars/strings from
1857 //                             the pattern.
1858 //
1859 //------------------------------------------------------------------------------
1860 void RegexCompile::literalChar(UChar32 c)  {
1861     fLiteralChars.append(c);
1862 }
1863
1864
1865 //------------------------------------------------------------------------------
1866 //
1867 //    fixLiterals           When compiling something that can follow a literal
1868 //                          string in a pattern, emit the code to match the
1869 //                          accumulated literal string.
1870 //
1871 //                          Optionally, split the last char of the string off into
1872 //                          a single "ONE_CHAR" operation, so that quantifiers can
1873 //                          apply to that char alone.  Example:   abc*
1874 //                          The * must apply to the 'c' only.
1875 //
1876 //------------------------------------------------------------------------------
1877 void    RegexCompile::fixLiterals(UBool split) {
1878
1879     // If no literal characters have been scanned but not yet had code generated
1880     //   for them, nothing needs to be done.
1881     if (fLiteralChars.length() == 0) {
1882         return;
1883     }
1884
1885     int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1886     UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1887
1888     // Split:  We need to  ensure that the last item in the compiled pattern
1889     //     refers only to the last literal scanned in the pattern, so that
1890     //     quantifiers (*, +, etc.) affect only it, and not a longer string.
1891     //     Split before case folding for case insensitive matches.
1892
1893     if (split) {
1894         fLiteralChars.truncate(indexOfLastCodePoint);
1895         fixLiterals(FALSE);   // Recursive call, emit code to match the first part of the string.
1896                               //  Note that the truncated literal string may be empty, in which case
1897                               //  nothing will be emitted.
1898
1899         literalChar(lastCodePoint);  // Re-add the last code point as if it were a new literal.
1900         fixLiterals(FALSE);          // Second recursive call, code for the final code point.
1901         return;
1902     }
1903
1904     // If we are doing case-insensitive matching, case fold the string.  This may expand
1905     //   the string, e.g. the German sharp-s turns into "ss"
1906     if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1907         fLiteralChars.foldCase();
1908         indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1909         lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1910     }
1911
1912     if (indexOfLastCodePoint == 0) {
1913         // Single character, emit a URX_ONECHAR op to match it.
1914         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1915                  u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
1916             appendOp(URX_ONECHAR_I, lastCodePoint);
1917         } else {
1918             appendOp(URX_ONECHAR, lastCodePoint);
1919         }
1920     } else {
1921         // Two or more chars, emit a URX_STRING to match them.
1922         if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) {
1923             error(U_REGEX_PATTERN_TOO_BIG);
1924         }
1925         if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1926             appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
1927         } else {
1928             // TODO here:  add optimization to split case sensitive strings of length two
1929             //             into two single char ops, for efficiency.
1930             appendOp(URX_STRING, fRXPat->fLiteralText.length());
1931         }
1932         appendOp(URX_STRING_LEN, fLiteralChars.length());
1933
1934         // Add this string into the accumulated strings of the compiled pattern.
1935         fRXPat->fLiteralText.append(fLiteralChars);
1936     }
1937
1938     fLiteralChars.remove();
1939 }
1940
1941
1942 int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
1943     if (U_FAILURE(*fStatus)) {
1944         return 0;
1945     }
1946     if (type < 0 || type > 255) {
1947         U_ASSERT(FALSE);
1948         error(U_REGEX_INTERNAL_ERROR);
1949         type = URX_RESERVED_OP;
1950     }
1951     if (val > 0x00ffffff) {
1952         U_ASSERT(FALSE);
1953         error(U_REGEX_INTERNAL_ERROR);
1954         val = 0;
1955     }
1956     if (val < 0) {
1957         if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) {
1958             U_ASSERT(FALSE);
1959             error(U_REGEX_INTERNAL_ERROR);
1960             return -1;
1961         }
1962         if (URX_TYPE(val) != 0xff) {
1963             U_ASSERT(FALSE);
1964             error(U_REGEX_INTERNAL_ERROR);
1965             return -1;
1966         }
1967         type = URX_RESERVED_OP_N;
1968     }
1969     return (type << 24) | val;
1970 }
1971
1972
1973 //------------------------------------------------------------------------------
1974 //
1975 //   appendOp()             Append a new instruction onto the compiled pattern
1976 //                          Includes error checking, limiting the size of the
1977 //                          pattern to lengths that can be represented in the
1978 //                          24 bit operand field of an instruction.
1979 //
1980 //------------------------------------------------------------------------------
1981 void RegexCompile::appendOp(int32_t op) {
1982     if (U_FAILURE(*fStatus)) {
1983         return;
1984     }
1985     fRXPat->fCompiledPat->addElement(op, *fStatus);
1986     if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
1987         error(U_REGEX_PATTERN_TOO_BIG);
1988     }
1989 }
1990
1991 void RegexCompile::appendOp(int32_t type, int32_t val) {
1992     appendOp(buildOp(type, val));
1993 }
1994
1995
1996 //------------------------------------------------------------------------------
1997 //
1998 //   insertOp()             Insert a slot for a new opcode into the already
1999 //                          compiled pattern code.
2000 //
2001 //                          Fill the slot with a NOP.  Our caller will replace it
2002 //                          with what they really wanted.
2003 //
2004 //------------------------------------------------------------------------------
2005 void   RegexCompile::insertOp(int32_t where) {
2006     UVector64 *code = fRXPat->fCompiledPat;
2007     U_ASSERT(where>0 && where < code->size());
2008
2009     int32_t  nop = buildOp(URX_NOP, 0);
2010     code->insertElementAt(nop, where, *fStatus);
2011
2012     // Walk through the pattern, looking for any ops with targets that
2013     //  were moved down by the insert.  Fix them.
2014     int32_t loc;
2015     for (loc=0; loc<code->size(); loc++) {
2016         int32_t op = (int32_t)code->elementAti(loc);
2017         int32_t opType = URX_TYPE(op);
2018         int32_t opValue = URX_VAL(op);
2019         if ((opType == URX_JMP         ||
2020             opType == URX_JMPX         ||
2021             opType == URX_STATE_SAVE   ||
2022             opType == URX_CTR_LOOP     ||
2023             opType == URX_CTR_LOOP_NG  ||
2024             opType == URX_JMP_SAV      ||
2025             opType == URX_JMP_SAV_X    ||
2026             opType == URX_RELOC_OPRND)    && opValue > where) {
2027             // Target location for this opcode is after the insertion point and
2028             //   needs to be incremented to adjust for the insertion.
2029             opValue++;
2030             op = buildOp(opType, opValue);
2031             code->setElementAt(op, loc);
2032         }
2033     }
2034
2035     // Now fix up the parentheses stack.  All positive values in it are locations in
2036     //  the compiled pattern.   (Negative values are frame boundaries, and don't need fixing.)
2037     for (loc=0; loc<fParenStack.size(); loc++) {
2038         int32_t x = fParenStack.elementAti(loc);
2039         U_ASSERT(x < code->size());
2040         if (x>where) {
2041             x++;
2042             fParenStack.setElementAt(x, loc);
2043         }
2044     }
2045
2046     if (fMatchCloseParen > where) {
2047         fMatchCloseParen++;
2048     }
2049     if (fMatchOpenParen > where) {
2050         fMatchOpenParen++;
2051     }
2052 }
2053
2054
2055 //------------------------------------------------------------------------------
2056 //
2057 //   allocateData()        Allocate storage in the matcher's static data area.
2058 //                         Return the index for the newly allocated data.
2059 //                         The storage won't actually exist until we are running a match
2060 //                         operation, but the storage indexes are inserted into various
2061 //                         opcodes while compiling the pattern.
2062 //
2063 //------------------------------------------------------------------------------
2064 int32_t RegexCompile::allocateData(int32_t size) {
2065     if (U_FAILURE(*fStatus)) {
2066         return 0;
2067     }
2068     if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
2069         error(U_REGEX_INTERNAL_ERROR);
2070         return 0;
2071     }
2072     int32_t dataIndex = fRXPat->fDataSize;
2073     fRXPat->fDataSize += size;
2074     if (fRXPat->fDataSize >= 0x00fffff0) {
2075         error(U_REGEX_INTERNAL_ERROR);
2076     }
2077     return dataIndex;
2078 }
2079
2080
2081 //------------------------------------------------------------------------------
2082 //
2083 //   allocateStackData()   Allocate space in the back-tracking stack frame.
2084 //                         Return the index for the newly allocated data.
2085 //                         The frame indexes are inserted into various
2086 //                         opcodes while compiling the pattern, meaning that frame
2087 //                         size must be restricted to the size that will fit
2088 //                         as an operand (24 bits).
2089 //
2090 //------------------------------------------------------------------------------
2091 int32_t RegexCompile::allocateStackData(int32_t size) {
2092     if (U_FAILURE(*fStatus)) {
2093         return 0;
2094     }
2095     if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
2096         error(U_REGEX_INTERNAL_ERROR);
2097         return 0;
2098     }
2099     int32_t dataIndex = fRXPat->fFrameSize;
2100     fRXPat->fFrameSize += size;
2101     if (fRXPat->fFrameSize >= 0x00fffff0) {
2102         error(U_REGEX_PATTERN_TOO_BIG);
2103     }
2104     return dataIndex;
2105 }
2106
2107
2108 //------------------------------------------------------------------------------
2109 //
2110 //   blockTopLoc()          Find or create a location in the compiled pattern
2111 //                          at the start of the operation or block that has
2112 //                          just been compiled.  Needed when a quantifier (* or
2113 //                          whatever) appears, and we need to add an operation
2114 //                          at the start of the thing being quantified.
2115 //
2116 //                          (Parenthesized Blocks) have a slot with a NOP that
2117 //                          is reserved for this purpose.  .* or similar don't
2118 //                          and a slot needs to be added.
2119 //
2120 //       parameter reserveLoc   :  TRUE -  ensure that there is space to add an opcode
2121 //                                         at the returned location.
2122 //                                 FALSE - just return the address,
2123 //                                         do not reserve a location there.
2124 //
2125 //------------------------------------------------------------------------------
2126 int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
2127     int32_t   theLoc;
2128     fixLiterals(TRUE);  // Emit code for any pending literals.
2129                         //   If last item was a string, emit separate op for the its last char.
2130     if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
2131     {
2132         // The item just processed is a parenthesized block.
2133         theLoc = fMatchOpenParen;   // A slot is already reserved for us.
2134         U_ASSERT(theLoc > 0);
2135         U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
2136     }
2137     else {
2138         // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2139         // No slot for STATE_SAVE was pre-reserved in the compiled code.
2140         // We need to make space now.
2141         theLoc = fRXPat->fCompiledPat->size()-1;
2142         int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
2143         if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
2144             // Strings take two opcode, we want the position of the first one.
2145             // We can have a string at this point if a single character case-folded to two.
2146             theLoc--;
2147         }
2148         if (reserveLoc) {
2149             int32_t  nop = buildOp(URX_NOP, 0);
2150             fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
2151         }
2152     }
2153     return theLoc;
2154 }
2155
2156
2157
2158 //------------------------------------------------------------------------------
2159 //
2160 //    handleCloseParen      When compiling a close paren, we need to go back
2161 //                          and fix up any JMP or SAVE operations within the
2162 //                          parenthesized block that need to target the end
2163 //                          of the block.  The locations of these are kept on
2164 //                          the paretheses stack.
2165 //
2166 //                          This function is called both when encountering a
2167 //                          real ) and at the end of the pattern.
2168 //
2169 //------------------------------------------------------------------------------
2170 void  RegexCompile::handleCloseParen() {
2171     int32_t   patIdx;
2172     int32_t   patOp;
2173     if (fParenStack.size() <= 0) {
2174         error(U_REGEX_MISMATCHED_PAREN);
2175         return;
2176     }
2177
2178     // Emit code for any pending literals.
2179     fixLiterals(FALSE);
2180
2181     // Fixup any operations within the just-closed parenthesized group
2182     //    that need to reference the end of the (block).
2183     //    (The first one popped from the stack is an unused slot for
2184     //     alternation (OR) state save, but applying the fixup to it does no harm.)
2185     for (;;) {
2186         patIdx = fParenStack.popi();
2187         if (patIdx < 0) {
2188             // value < 0 flags the start of the frame on the paren stack.
2189             break;
2190         }
2191         U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
2192         patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
2193         U_ASSERT(URX_VAL(patOp) == 0);          // Branch target for JMP should not be set.
2194         patOp |= fRXPat->fCompiledPat->size();  // Set it now.
2195         fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
2196         fMatchOpenParen     = patIdx;
2197     }
2198
2199     //  At the close of any parenthesized block, restore the match mode flags  to
2200     //  the value they had at the open paren.  Saved value is
2201     //  at the top of the paren stack.
2202     fModeFlags = fParenStack.popi();
2203     U_ASSERT(fModeFlags < 0);
2204
2205     // DO any additional fixups, depending on the specific kind of
2206     // parentesized grouping this is
2207
2208     switch (patIdx) {
2209     case plain:
2210     case flags:
2211         // No additional fixups required.
2212         //   (Grouping-only parentheses)
2213         break;
2214     case capturing:
2215         // Capturing Parentheses.
2216         //   Insert a End Capture op into the pattern.
2217         //   The frame offset of the variables for this cg is obtained from the
2218         //       start capture op and put it into the end-capture op.
2219         {
2220             int32_t   captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2221             U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
2222
2223             int32_t   frameVarLocation = URX_VAL(captureOp);
2224             appendOp(URX_END_CAPTURE, frameVarLocation);
2225         }
2226         break;
2227     case atomic:
2228         // Atomic Parenthesis.
2229         //   Insert a LD_SP operation to restore the state stack to the position
2230         //   it was when the atomic parens were entered.
2231         {
2232             int32_t   stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2233             U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
2234             int32_t   stoLoc = URX_VAL(stoOp);
2235             appendOp(URX_LD_SP, stoLoc);
2236         }
2237         break;
2238
2239     case lookAhead:
2240         {
2241             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2242             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2243             int32_t dataLoc  = URX_VAL(startOp);
2244             appendOp(URX_LA_END, dataLoc);
2245         }
2246         break;
2247
2248     case negLookAhead:
2249         {
2250             // See comment at doOpenLookAheadNeg
2251             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
2252             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2253             int32_t dataLoc  = URX_VAL(startOp);
2254             appendOp(URX_LA_END, dataLoc);
2255             appendOp(URX_BACKTRACK, 0);
2256             appendOp(URX_LA_END, dataLoc);
2257
2258             // Patch the URX_SAVE near the top of the block.
2259             // The destination of the SAVE is the final LA_END that was just added.
2260             int32_t saveOp   = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
2261             U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
2262             int32_t dest     = fRXPat->fCompiledPat->size()-1;
2263             saveOp           = buildOp(URX_STATE_SAVE, dest);
2264             fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2265         }
2266         break;
2267
2268     case lookBehind:
2269         {
2270             // See comment at doOpenLookBehind.
2271
2272             // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2273             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
2274             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2275             int32_t dataLoc  = URX_VAL(startOp);
2276             appendOp(URX_LB_END, dataLoc);
2277             appendOp(URX_LA_END, dataLoc);
2278
2279             // Determine the min and max bounds for the length of the
2280             //  string that the pattern can match.
2281             //  An unbounded upper limit is an error.
2282             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2283             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2284             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2285             if (URX_TYPE(maxML) != 0) {
2286                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2287                 break;
2288             }
2289             if (maxML == INT32_MAX) {
2290                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2291                 break;
2292             }
2293             U_ASSERT(minML <= maxML);
2294
2295             // Insert the min and max match len bounds into the URX_LB_CONT op that
2296             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2297             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-2);
2298             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-1);
2299
2300         }
2301         break;
2302
2303
2304
2305     case lookBehindN:
2306         {
2307             // See comment at doOpenLookBehindNeg.
2308
2309             // Append the URX_LBN_END to the compiled pattern.
2310             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2311             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2312             int32_t dataLoc  = URX_VAL(startOp);
2313             appendOp(URX_LBN_END, dataLoc);
2314
2315             // Determine the min and max bounds for the length of the
2316             //  string that the pattern can match.
2317             //  An unbounded upper limit is an error.
2318             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2319             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2320             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2321             if (URX_TYPE(maxML) != 0) {
2322                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2323                 break;
2324             }
2325             if (maxML == INT32_MAX) {
2326                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2327                 break;
2328             }
2329             U_ASSERT(minML <= maxML);
2330
2331             // Insert the min and max match len bounds into the URX_LB_CONT op that
2332             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2333             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-3);
2334             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-2);
2335
2336             // Insert the pattern location to continue at after a successful match
2337             //  as the last operand of the URX_LBN_CONT
2338             int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
2339             fRXPat->fCompiledPat->setElementAt(op,  fMatchOpenParen-1);
2340         }
2341         break;
2342
2343
2344
2345     default:
2346         U_ASSERT(FALSE);
2347     }
2348
2349     // remember the next location in the compiled pattern.
2350     // The compilation of Quantifiers will look at this to see whether its looping
2351     //   over a parenthesized block or a single item
2352     fMatchCloseParen = fRXPat->fCompiledPat->size();
2353 }
2354
2355
2356
2357 //------------------------------------------------------------------------------
2358 //
2359 //   compileSet       Compile the pattern operations for a reference to a
2360 //                    UnicodeSet.
2361 //
2362 //------------------------------------------------------------------------------
2363 void        RegexCompile::compileSet(UnicodeSet *theSet)
2364 {
2365     if (theSet == NULL) {
2366         return;
2367     }
2368     //  Remove any strings from the set.
2369     //  There shoudn't be any, but just in case.
2370     //     (Case Closure can add them; if we had a simple case closure avaialble that
2371     //      ignored strings, that would be better.)
2372     theSet->removeAllStrings();
2373     int32_t  setSize = theSet->size();
2374
2375     switch (setSize) {
2376     case 0:
2377         {
2378             // Set of no elements.   Always fails to match.
2379             appendOp(URX_BACKTRACK, 0);
2380             delete theSet;
2381         }
2382         break;
2383
2384     case 1:
2385         {
2386             // The set contains only a single code point.  Put it into
2387             //   the compiled pattern as a single char operation rather
2388             //   than a set, and discard the set itself.
2389             literalChar(theSet->charAt(0));
2390             delete theSet;
2391         }
2392         break;
2393
2394     default:
2395         {
2396             //  The set contains two or more chars.  (the normal case)
2397             //  Put it into the compiled pattern as a set.
2398             int32_t setNumber = fRXPat->fSets->size();
2399             fRXPat->fSets->addElement(theSet, *fStatus);
2400             appendOp(URX_SETREF, setNumber);
2401         }
2402     }
2403 }
2404
2405
2406 //------------------------------------------------------------------------------
2407 //
2408 //   compileInterval    Generate the code for a {min, max} style interval quantifier.
2409 //                      Except for the specific opcodes used, the code is the same
2410 //                      for all three types (greedy, non-greedy, possessive) of
2411 //                      intervals.  The opcodes are supplied as parameters.
2412 //                      (There are two sets of opcodes - greedy & possessive use the
2413 //                      same ones, while non-greedy has it's own.)
2414 //
2415 //                      The code for interval loops has this form:
2416 //                         0  CTR_INIT   counter loc (in stack frame)
2417 //                         1             5  patt address of CTR_LOOP at bottom of block
2418 //                         2             min count
2419 //                         3             max count   (-1 for unbounded)
2420 //                         4  ...        block to be iterated over
2421 //                         5  CTR_LOOP
2422 //
2423 //                       In
2424 //------------------------------------------------------------------------------
2425 void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)
2426 {
2427     // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2428     //   four slots in the compiled code.  Reserve them.
2429     int32_t   topOfBlock = blockTopLoc(TRUE);
2430     insertOp(topOfBlock);
2431     insertOp(topOfBlock);
2432     insertOp(topOfBlock);
2433
2434     // The operands for the CTR_INIT opcode include the index in the matcher data
2435     //   of the counter.  Allocate it now. There are two data items
2436     //        counterLoc   -->  Loop counter
2437     //               +1    -->  Input index (for breaking non-progressing loops)
2438     //                          (Only present if unbounded upper limit on loop)
2439     int32_t   dataSize = fIntervalUpper < 0 ? 2 : 1;
2440     int32_t   counterLoc = allocateStackData(dataSize);
2441
2442     int32_t   op = buildOp(InitOp, counterLoc);
2443     fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2444
2445     // The second operand of CTR_INIT is the location following the end of the loop.
2446     //   Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2447     //   compilation of something later on causes the code to grow and the target
2448     //   position to move.
2449     int32_t loopEnd = fRXPat->fCompiledPat->size();
2450     op = buildOp(URX_RELOC_OPRND, loopEnd);
2451     fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
2452
2453     // Followed by the min and max counts.
2454     fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
2455     fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
2456
2457     // Apend the CTR_LOOP op.  The operand is the location of the CTR_INIT op.
2458     //   Goes at end of the block being looped over, so just append to the code so far.
2459     appendOp(LoopOp, topOfBlock);
2460
2461     if ((fIntervalLow & 0xff000000) != 0 ||
2462         (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
2463             error(U_REGEX_NUMBER_TOO_BIG);
2464         }
2465
2466     if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
2467         error(U_REGEX_MAX_LT_MIN);
2468     }
2469 }
2470
2471
2472
2473 UBool RegexCompile::compileInlineInterval() {
2474     if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
2475         // Too big to inline.  Fail, which will cause looping code to be generated.
2476         //   (Upper < Lower picks up unbounded upper and errors, both.)
2477         return FALSE;
2478     }
2479
2480     int32_t   topOfBlock = blockTopLoc(FALSE);
2481     if (fIntervalUpper == 0) {
2482         // Pathological case.  Attempt no matches, as if the block doesn't exist.
2483         // Discard the generated code for the block.
2484         // If the block included parens, discard the info pertaining to them as well.
2485         fRXPat->fCompiledPat->setSize(topOfBlock);
2486         if (fMatchOpenParen >= topOfBlock) {
2487             fMatchOpenParen = -1;
2488         }
2489         if (fMatchCloseParen >= topOfBlock) {
2490             fMatchCloseParen = -1;
2491         }
2492         return TRUE;
2493     }
2494
2495     if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
2496         // The thing being repeated is not a single op, but some
2497         //   more complex block.  Do it as a loop, not inlines.
2498         //   Note that things "repeated" a max of once are handled as inline, because
2499         //     the one copy of the code already generated is just fine.
2500         return FALSE;
2501     }
2502
2503     // Pick up the opcode that is to be repeated
2504     //
2505     int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
2506
2507     // Compute the pattern location where the inline sequence
2508     //   will end, and set up the state save op that will be needed.
2509     //
2510     int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
2511                                 + fIntervalUpper + (fIntervalUpper-fIntervalLow);
2512     int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
2513     if (fIntervalLow == 0) {
2514         insertOp(topOfBlock);
2515         fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2516     }
2517
2518
2519
2520     //  Loop, emitting the op for the thing being repeated each time.
2521     //    Loop starts at 1 because one instance of the op already exists in the pattern,
2522     //    it was put there when it was originally encountered.
2523     int32_t i;
2524     for (i=1; i<fIntervalUpper; i++ ) {
2525         if (i >= fIntervalLow) {
2526             appendOp(saveOp);
2527         }
2528         appendOp(op);
2529     }
2530     return TRUE;
2531 }
2532
2533
2534
2535 //------------------------------------------------------------------------------
2536 //
2537 //   caseInsensitiveStart  given a single code point from a pattern string, determine the
2538 //                         set of characters that could potentially begin a case-insensitive
2539 //                         match of a string beginning with that character, using full Unicode
2540 //                         case insensitive matching.
2541 //
2542 //          This is used in optimizing find().
2543 //
2544 //          closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2545 //          misses cases like this:
2546 //             A string from the pattern begins with 'ss' (although all we know
2547 //                 in this context is that it begins with 's')
2548 //             The pattern could match a string beginning with a German sharp-s
2549 //
2550 //           To the ordinary case closure for a character c, we add all other
2551 //           characters cx where the case closure of cx incudes a string form that begins
2552 //           with the original character c.
2553 //
2554 //           This function could be made smarter. The full pattern string is available
2555 //           and it would be possible to verify that the extra characters being added
2556 //           to the starting set fully match, rather than having just a first-char of the
2557 //           folded form match.
2558 //
2559 //------------------------------------------------------------------------------
2560 void  RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
2561
2562 // Machine Generated below.
2563 // It may need updating with new versions of Unicode.
2564 // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2565 // The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
2566
2567 // Machine Generated Data. Do not hand edit.
2568     static const UChar32 RECaseFixCodePoints[] = {
2569         0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc,
2570         0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565,
2571         0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07,
2572         0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61,
2573         0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
2574
2575     static const int16_t RECaseFixStringOffsets[] = {
2576         0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10,
2577         0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f,
2578         0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43,
2579         0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57,
2580         0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
2581
2582     static const int16_t RECaseFixCounts[] = {
2583         0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1,
2584         0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1,
2585         0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2586         0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2587         0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
2588
2589     static const UChar RECaseFixData[] = {
2590         0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf,
2591         0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3,
2592         0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3,
2593         0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3,
2594         0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14,
2595         0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83,
2596         0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90,
2597         0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95,
2598         0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2,
2599         0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7,
2600         0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
2601
2602 // End of machine generated data.
2603
2604     if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2605         UChar32 caseFoldedC  = u_foldCase(c, U_FOLD_CASE_DEFAULT);
2606         starterChars->set(caseFoldedC, caseFoldedC);
2607
2608         int32_t i;
2609         for (i=0; RECaseFixCodePoints[i]<c ; i++) {
2610             // Simple linear search through the sorted list of interesting code points.
2611         }
2612
2613         if (RECaseFixCodePoints[i] == c) {
2614             int32_t dataIndex = RECaseFixStringOffsets[i];
2615             int32_t numCharsToAdd = RECaseFixCounts[i];
2616             UChar32 cpToAdd = 0;
2617             for (int32_t j=0; j<numCharsToAdd; j++) {
2618                 U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
2619                 starterChars->add(cpToAdd);
2620             }
2621         }
2622
2623         starterChars->closeOver(USET_CASE_INSENSITIVE);
2624         starterChars->removeAllStrings();
2625     } else {
2626         // Not a cased character. Just return it alone.
2627         starterChars->set(c, c);
2628     }
2629 }
2630
2631
2632
2633
2634 //------------------------------------------------------------------------------
2635 //
2636 //   matchStartType    Determine how a match can start.
2637 //                     Used to optimize find() operations.
2638 //
2639 //                     Operation is very similar to minMatchLength().  Walk the compiled
2640 //                     pattern, keeping an on-going minimum-match-length.  For any
2641 //                     op where the min match coming in is zero, add that ops possible
2642 //                     starting matches to the possible starts for the overall pattern.
2643 //
2644 //------------------------------------------------------------------------------
2645 void   RegexCompile::matchStartType() {
2646     if (U_FAILURE(*fStatus)) {
2647         return;
2648     }
2649
2650
2651     int32_t    loc;                    // Location in the pattern of the current op being processed.
2652     int32_t    op;                     // The op being processed
2653     int32_t    opType;                 // The opcode type of the op
2654     int32_t    currentLen = 0;         // Minimum length of a match to this point (loc) in the pattern
2655     int32_t    numInitialStrings = 0;  // Number of strings encountered that could match at start.
2656
2657     UBool      atStart = TRUE;         // True if no part of the pattern yet encountered
2658                                        //   could have advanced the position in a match.
2659                                        //   (Maximum match length so far == 0)
2660
2661     // forwardedLength is a vector holding minimum-match-length values that
2662     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
2663     //   It must be one longer than the pattern being checked because some  ops
2664     //   will jmp to a end-of-block+1 location from within a block, and we must
2665     //   count those when checking the block.
2666     int32_t end = fRXPat->fCompiledPat->size();
2667     UVector32  forwardedLength(end+1, *fStatus);
2668     forwardedLength.setSize(end+1);
2669     for (loc=3; loc<end; loc++) {
2670         forwardedLength.setElementAt(INT32_MAX, loc);
2671     }
2672
2673     for (loc = 3; loc<end; loc++) {
2674         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2675         opType = URX_TYPE(op);
2676
2677         // The loop is advancing linearly through the pattern.
2678         // If the op we are now at was the destination of a branch in the pattern,
2679         // and that path has a shorter minimum length than the current accumulated value,
2680         // replace the current accumulated value.
2681         if (forwardedLength.elementAti(loc) < currentLen) {
2682             currentLen = forwardedLength.elementAti(loc);
2683             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2684         }
2685
2686         switch (opType) {
2687             // Ops that don't change the total length matched
2688         case URX_RESERVED_OP:
2689         case URX_END:
2690         case URX_FAIL:
2691         case URX_STRING_LEN:
2692         case URX_NOP:
2693         case URX_START_CAPTURE:
2694         case URX_END_CAPTURE:
2695         case URX_BACKSLASH_B:
2696         case URX_BACKSLASH_BU:
2697         case URX_BACKSLASH_G:
2698         case URX_BACKSLASH_Z:
2699         case URX_DOLLAR:
2700         case URX_DOLLAR_M:
2701         case URX_DOLLAR_D:
2702         case URX_DOLLAR_MD:
2703         case URX_RELOC_OPRND:
2704         case URX_STO_INP_LOC:
2705         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
2706         case URX_BACKREF_I:
2707
2708         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
2709         case URX_LD_SP:
2710             break;
2711
2712         case URX_CARET:
2713             if (atStart) {
2714                 fRXPat->fStartType = START_START;
2715             }
2716             break;
2717
2718         case URX_CARET_M:
2719         case URX_CARET_M_UNIX:
2720             if (atStart) {
2721                 fRXPat->fStartType = START_LINE;
2722             }
2723             break;
2724
2725         case URX_ONECHAR:
2726             if (currentLen == 0) {
2727                 // This character could appear at the start of a match.
2728                 //   Add it to the set of possible starting characters.
2729                 fRXPat->fInitialChars->add(URX_VAL(op));
2730                 numInitialStrings += 2;
2731             }
2732             currentLen++;
2733             atStart = FALSE;
2734             break;
2735
2736
2737         case URX_SETREF:
2738             if (currentLen == 0) {
2739                 int32_t  sn = URX_VAL(op);
2740                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2741                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2742                 fRXPat->fInitialChars->addAll(*s);
2743                 numInitialStrings += 2;
2744             }
2745             currentLen++;
2746             atStart = FALSE;
2747             break;
2748
2749         case URX_LOOP_SR_I:
2750             // [Set]*, like a SETREF, above, in what it can match,
2751             //  but may not match at all, so currentLen is not incremented.
2752             if (currentLen == 0) {
2753                 int32_t  sn = URX_VAL(op);
2754                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2755                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2756                 fRXPat->fInitialChars->addAll(*s);
2757                 numInitialStrings += 2;
2758             }
2759             atStart = FALSE;
2760             break;
2761
2762         case URX_LOOP_DOT_I:
2763             if (currentLen == 0) {
2764                 // .* at the start of a pattern.
2765                 //    Any character can begin the match.
2766                 fRXPat->fInitialChars->clear();
2767                 fRXPat->fInitialChars->complement();
2768                 numInitialStrings += 2;
2769             }
2770             atStart = FALSE;
2771             break;
2772
2773
2774         case URX_STATIC_SETREF:
2775             if (currentLen == 0) {
2776                 int32_t  sn = URX_VAL(op);
2777                 U_ASSERT(sn>0 && sn<URX_LAST_SET);
2778                 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2779                 fRXPat->fInitialChars->addAll(*s);
2780                 numInitialStrings += 2;
2781             }
2782             currentLen++;
2783             atStart = FALSE;
2784             break;
2785
2786
2787
2788         case URX_STAT_SETREF_N:
2789             if (currentLen == 0) {
2790                 int32_t  sn = URX_VAL(op);
2791                 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2792                 UnicodeSet sc(*s);
2793                 sc.complement();
2794                 fRXPat->fInitialChars->addAll(sc);
2795                 numInitialStrings += 2;
2796             }
2797             currentLen++;
2798             atStart = FALSE;
2799             break;
2800
2801
2802
2803         case URX_BACKSLASH_D:
2804             // Digit Char
2805              if (currentLen == 0) {
2806                  UnicodeSet s;
2807                  s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
2808                  if (URX_VAL(op) != 0) {
2809                      s.complement();
2810                  }
2811                  fRXPat->fInitialChars->addAll(s);
2812                  numInitialStrings += 2;
2813             }
2814             currentLen++;
2815             atStart = FALSE;
2816             break;
2817
2818
2819         case URX_BACKSLASH_H:
2820             // Horiz white space
2821             if (currentLen == 0) {
2822                 UnicodeSet s;
2823                 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
2824                 s.add((UChar32)9);   // Tab
2825                 if (URX_VAL(op) != 0) {
2826                     s.complement();
2827                 }
2828                 fRXPat->fInitialChars->addAll(s);
2829                 numInitialStrings += 2;
2830             }
2831             currentLen++;
2832             atStart = FALSE;
2833             break;
2834
2835
2836         case URX_BACKSLASH_R:       // Any line ending sequence
2837         case URX_BACKSLASH_V:       // Any line ending code point, with optional negation
2838             if (currentLen == 0) {
2839                 UnicodeSet s;
2840                 s.add((UChar32)0x0a, (UChar32)0x0d);  // add range
2841                 s.add((UChar32)0x85);
2842                 s.add((UChar32)0x2028, (UChar32)0x2029);
2843                 if (URX_VAL(op) != 0) {
2844                      // Complement option applies to URX_BACKSLASH_V only.
2845                      s.complement();
2846                 }
2847                 fRXPat->fInitialChars->addAll(s);
2848                 numInitialStrings += 2;
2849             }
2850             currentLen++;
2851             atStart = FALSE;
2852             break;
2853
2854
2855
2856         case URX_ONECHAR_I:
2857             // Case Insensitive Single Character.
2858             if (currentLen == 0) {
2859                 UChar32  c = URX_VAL(op);
2860                 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2861                     UnicodeSet starters(c, c);
2862                     starters.closeOver(USET_CASE_INSENSITIVE);
2863                     // findCaseInsensitiveStarters(c, &starters);
2864                     //   For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2865                     //   The expanded folding can't match the pattern.
2866                     fRXPat->fInitialChars->addAll(starters);
2867                 } else {
2868                     // Char has no case variants.  Just add it as-is to the
2869                     //   set of possible starting chars.
2870                     fRXPat->fInitialChars->add(c);
2871                 }
2872                 numInitialStrings += 2;
2873             }
2874             currentLen++;
2875             atStart = FALSE;
2876             break;
2877
2878
2879         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
2880         case URX_DOTANY_ALL:    // . matches one or two.
2881         case URX_DOTANY:
2882         case URX_DOTANY_UNIX:
2883             if (currentLen == 0) {
2884                 // These constructs are all bad news when they appear at the start
2885                 //   of a match.  Any character can begin the match.
2886                 fRXPat->fInitialChars->clear();
2887                 fRXPat->fInitialChars->complement();
2888                 numInitialStrings += 2;
2889             }
2890             currentLen++;
2891             atStart = FALSE;
2892             break;
2893
2894
2895         case URX_JMPX:
2896             loc++;             // Except for extra operand on URX_JMPX, same as URX_JMP.
2897         case URX_JMP:
2898             {
2899                 int32_t  jmpDest = URX_VAL(op);
2900                 if (jmpDest < loc) {
2901                     // Loop of some kind.  Can safely ignore, the worst that will happen
2902                     //  is that we understate the true minimum length
2903                     currentLen = forwardedLength.elementAti(loc+1);
2904
2905                 } else {
2906                     // Forward jump.  Propagate the current min length to the target loc of the jump.
2907                     U_ASSERT(jmpDest <= end+1);
2908                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
2909                         forwardedLength.setElementAt(currentLen, jmpDest);
2910                     }
2911                 }
2912             }
2913             atStart = FALSE;
2914             break;
2915
2916         case URX_JMP_SAV:
2917         case URX_JMP_SAV_X:
2918             // Combo of state save to the next loc, + jmp backwards.
2919             //   Net effect on min. length computation is nothing.
2920             atStart = FALSE;
2921             break;
2922
2923         case URX_BACKTRACK:
2924             // Fails are kind of like a branch, except that the min length was
2925             //   propagated already, by the state save.
2926             currentLen = forwardedLength.elementAti(loc+1);
2927             atStart = FALSE;
2928             break;
2929
2930
2931         case URX_STATE_SAVE:
2932             {
2933                 // State Save, for forward jumps, propagate the current minimum.
2934                 //             of the state save.
2935                 int32_t  jmpDest = URX_VAL(op);
2936                 if (jmpDest > loc) {
2937                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
2938                         forwardedLength.setElementAt(currentLen, jmpDest);
2939                     }
2940                 }
2941             }
2942             atStart = FALSE;
2943             break;
2944
2945
2946
2947
2948         case URX_STRING:
2949             {
2950                 loc++;
2951                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2952                 int32_t stringLen   = URX_VAL(stringLenOp);
2953                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2954                 U_ASSERT(stringLenOp >= 2);
2955                 if (currentLen == 0) {
2956                     // Add the starting character of this string to the set of possible starting
2957                     //   characters for this pattern.
2958                     int32_t stringStartIdx = URX_VAL(op);
2959                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
2960                     fRXPat->fInitialChars->add(c);
2961
2962                     // Remember this string.  After the entire pattern has been checked,
2963                     //  if nothing else is identified that can start a match, we'll use it.
2964                     numInitialStrings++;
2965                     fRXPat->fInitialStringIdx = stringStartIdx;
2966                     fRXPat->fInitialStringLen = stringLen;
2967                 }
2968
2969                 currentLen += stringLen;
2970                 atStart = FALSE;
2971             }
2972             break;
2973
2974         case URX_STRING_I:
2975             {
2976                 // Case-insensitive string.  Unlike exact-match strings, we won't
2977                 //   attempt a string search for possible match positions.  But we
2978                 //   do update the set of possible starting characters.
2979                 loc++;
2980                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2981                 int32_t stringLen   = URX_VAL(stringLenOp);
2982                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2983                 U_ASSERT(stringLenOp >= 2);
2984                 if (currentLen == 0) {
2985                     // Add the starting character of this string to the set of possible starting
2986                     //   characters for this pattern.
2987                     int32_t stringStartIdx = URX_VAL(op);
2988                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
2989                     UnicodeSet s;
2990                     findCaseInsensitiveStarters(c, &s);
2991                     fRXPat->fInitialChars->addAll(s);
2992                     numInitialStrings += 2;  // Matching on an initial string not possible.
2993                 }
2994                 currentLen += stringLen;
2995                 atStart = FALSE;
2996             }
2997             break;
2998
2999         case URX_CTR_INIT:
3000         case URX_CTR_INIT_NG:
3001             {
3002                 // Loop Init Ops.  These don't change the min length, but they are 4 word ops
3003                 //   so location must be updated accordingly.
3004                 // Loop Init Ops.
3005                 //   If the min loop count == 0
3006                 //      move loc forwards to the end of the loop, skipping over the body.
3007                 //   If the min count is > 0,
3008                 //      continue normal processing of the body of the loop.
3009                 int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3010                         loopEndLoc   = URX_VAL(loopEndLoc);
3011                 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3012                 if (minLoopCount == 0) {
3013                     // Min Loop Count of 0, treat like a forward branch and
3014                     //   move the current minimum length up to the target
3015                     //   (end of loop) location.
3016                     U_ASSERT(loopEndLoc <= end+1);
3017                     if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
3018                         forwardedLength.setElementAt(currentLen, loopEndLoc);
3019                     }
3020                 }
3021                 loc+=3;  // Skips over operands of CTR_INIT
3022             }
3023             atStart = FALSE;
3024             break;
3025
3026
3027         case URX_CTR_LOOP:
3028         case URX_CTR_LOOP_NG:
3029             // Loop ops.
3030             //  The jump is conditional, backwards only.
3031             atStart = FALSE;
3032             break;
3033
3034         case URX_LOOP_C:
3035             // More loop ops.  These state-save to themselves.
3036             //   don't change the minimum match
3037             atStart = FALSE;
3038             break;
3039
3040
3041         case URX_LA_START:
3042         case URX_LB_START:
3043             {
3044                 // Look-around.  Scan forward until the matching look-ahead end,
3045                 //   without processing the look-around block.  This is overly pessimistic.
3046
3047                 // Keep track of the nesting depth of look-around blocks.  Boilerplate code for
3048                 //   lookahead contains two LA_END instructions, so count goes up by two
3049                 //   for each LA_START.
3050                 int32_t  depth = (opType == URX_LA_START? 2: 1);
3051                 for (;;) {
3052                     loc++;
3053                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3054                     if (URX_TYPE(op) == URX_LA_START) {
3055                         depth+=2;
3056                     }
3057                     if (URX_TYPE(op) == URX_LB_START) {
3058                         depth++;
3059                     }
3060                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3061                         depth--;
3062                         if (depth == 0) {
3063                             break;
3064                         }
3065                     }
3066                     if (URX_TYPE(op) == URX_STATE_SAVE) {
3067                         // Need this because neg lookahead blocks will FAIL to outside
3068                         //   of the block.
3069                         int32_t  jmpDest = URX_VAL(op);
3070                         if (jmpDest > loc) {
3071                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
3072                                 forwardedLength.setElementAt(currentLen, jmpDest);
3073                             }
3074                         }
3075                     }
3076                     U_ASSERT(loc <= end);
3077                 }
3078             }
3079             break;
3080
3081         case URX_LA_END:
3082         case URX_LB_CONT:
3083         case URX_LB_END:
3084         case URX_LBN_CONT:
3085         case URX_LBN_END:
3086             U_ASSERT(FALSE);     // Shouldn't get here.  These ops should be
3087                                  //  consumed by the scan in URX_LA_START and LB_START
3088
3089             break;
3090
3091         default:
3092             U_ASSERT(FALSE);
3093             }
3094
3095         }
3096
3097
3098     // We have finished walking through the ops.  Check whether some forward jump
3099     //   propagated a shorter length to location end+1.
3100     if (forwardedLength.elementAti(end+1) < currentLen) {
3101         currentLen = forwardedLength.elementAti(end+1);
3102     }
3103
3104
3105     fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
3106
3107
3108     // Sort out what we should check for when looking for candidate match start positions.
3109     // In order of preference,
3110     //     1.   Start of input text buffer.
3111     //     2.   A literal string.
3112     //     3.   Start of line in multi-line mode.
3113     //     4.   A single literal character.
3114     //     5.   A character from a set of characters.
3115     //
3116     if (fRXPat->fStartType == START_START) {
3117         // Match only at the start of an input text string.
3118         //    start type is already set.  We're done.
3119     } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
3120         // Match beginning only with a literal string.
3121         UChar32  c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
3122         U_ASSERT(fRXPat->fInitialChars->contains(c));
3123         fRXPat->fStartType   = START_STRING;
3124         fRXPat->fInitialChar = c;
3125     } else if (fRXPat->fStartType == START_LINE) {
3126         // Match at start of line in Multi-Line mode.
3127         // Nothing to do here; everything is already set.
3128     } else if (fRXPat->fMinMatchLen == 0) {
3129         // Zero length match possible.  We could start anywhere.
3130         fRXPat->fStartType = START_NO_INFO;
3131     } else if (fRXPat->fInitialChars->size() == 1) {
3132         // All matches begin with the same char.
3133         fRXPat->fStartType   = START_CHAR;
3134         fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
3135         U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
3136     } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE &&
3137         fRXPat->fMinMatchLen > 0) {
3138         // Matches start with a set of character smaller than the set of all chars.
3139         fRXPat->fStartType = START_SET;
3140     } else {
3141         // Matches can start with anything
3142         fRXPat->fStartType = START_NO_INFO;
3143     }
3144
3145     return;
3146 }
3147
3148
3149
3150 //------------------------------------------------------------------------------
3151 //
3152 //   minMatchLength    Calculate the length of the shortest string that could
3153 //                     match the specified pattern.
3154 //                     Length is in 16 bit code units, not code points.
3155 //
3156 //                     The calculated length may not be exact.  The returned
3157 //                     value may be shorter than the actual minimum; it must
3158 //                     never be longer.
3159 //
3160 //                     start and end are the range of p-code operations to be
3161 //                     examined.  The endpoints are included in the range.
3162 //
3163 //------------------------------------------------------------------------------
3164 int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
3165     if (U_FAILURE(*fStatus)) {
3166         return 0;
3167     }
3168
3169     U_ASSERT(start <= end);
3170     U_ASSERT(end < fRXPat->fCompiledPat->size());
3171
3172
3173     int32_t    loc;
3174     int32_t    op;
3175     int32_t    opType;
3176     int32_t    currentLen = 0;
3177
3178
3179     // forwardedLength is a vector holding minimum-match-length values that
3180     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
3181     //   It must be one longer than the pattern being checked because some  ops
3182     //   will jmp to a end-of-block+1 location from within a block, and we must
3183     //   count those when checking the block.
3184     UVector32  forwardedLength(end+2, *fStatus);
3185     forwardedLength.setSize(end+2);
3186     for (loc=start; loc<=end+1; loc++) {
3187         forwardedLength.setElementAt(INT32_MAX, loc);
3188     }
3189
3190     for (loc = start; loc<=end; loc++) {
3191         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3192         opType = URX_TYPE(op);
3193
3194         // The loop is advancing linearly through the pattern.
3195         // If the op we are now at was the destination of a branch in the pattern,
3196         // and that path has a shorter minimum length than the current accumulated value,
3197         // replace the current accumulated value.
3198         // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);  // MinLength == INT32_MAX for some
3199                                                                //   no-match-possible cases.
3200         if (forwardedLength.elementAti(loc) < currentLen) {
3201             currentLen = forwardedLength.elementAti(loc);
3202             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3203         }
3204
3205         switch (opType) {
3206             // Ops that don't change the total length matched
3207         case URX_RESERVED_OP:
3208         case URX_END:
3209         case URX_STRING_LEN:
3210         case URX_NOP:
3211         case URX_START_CAPTURE:
3212         case URX_END_CAPTURE:
3213         case URX_BACKSLASH_B:
3214         case URX_BACKSLASH_BU:
3215         case URX_BACKSLASH_G:
3216         case URX_BACKSLASH_Z:
3217         case URX_CARET:
3218         case URX_DOLLAR:
3219         case URX_DOLLAR_M:
3220         case URX_DOLLAR_D:
3221         case URX_DOLLAR_MD:
3222         case URX_RELOC_OPRND:
3223         case URX_STO_INP_LOC:
3224         case URX_CARET_M:
3225         case URX_CARET_M_UNIX:
3226         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3227         case URX_BACKREF_I:
3228
3229         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3230         case URX_LD_SP:
3231
3232         case URX_JMP_SAV:
3233         case URX_JMP_SAV_X:
3234             break;
3235
3236
3237             // Ops that match a minimum of one character (one or two 16 bit code units.)
3238             //
3239         case URX_ONECHAR:
3240         case URX_STATIC_SETREF:
3241         case URX_STAT_SETREF_N:
3242         case URX_SETREF:
3243         case URX_BACKSLASH_D:
3244         case URX_BACKSLASH_H:
3245         case URX_BACKSLASH_R:
3246         case URX_BACKSLASH_V:
3247         case URX_ONECHAR_I:
3248         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
3249         case URX_DOTANY_ALL:    // . matches one or two.
3250         case URX_DOTANY:
3251         case URX_DOTANY_UNIX:
3252             currentLen++;
3253             break;
3254
3255
3256         case URX_JMPX:
3257             loc++;              // URX_JMPX has an extra operand, ignored here,
3258                                 //   otherwise processed identically to URX_JMP.
3259         case URX_JMP:
3260             {
3261                 int32_t  jmpDest = URX_VAL(op);
3262                 if (jmpDest < loc) {
3263                     // Loop of some kind.  Can safely ignore, the worst that will happen
3264                     //  is that we understate the true minimum length
3265                     currentLen = forwardedLength.elementAti(loc+1);
3266                 } else {
3267                     // Forward jump.  Propagate the current min length to the target loc of the jump.
3268                     U_ASSERT(jmpDest <= end+1);
3269                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
3270                         forwardedLength.setElementAt(currentLen, jmpDest);
3271                     }
3272                 }
3273             }
3274             break;
3275
3276         case URX_BACKTRACK:
3277             {
3278                 // Back-tracks are kind of like a branch, except that the min length was
3279                 //   propagated already, by the state save.
3280                 currentLen = forwardedLength.elementAti(loc+1);
3281             }
3282             break;
3283
3284
3285         case URX_STATE_SAVE:
3286             {
3287                 // State Save, for forward jumps, propagate the current minimum.
3288                 //             of the state save.
3289                 int32_t  jmpDest = URX_VAL(op);
3290                 if (jmpDest > loc) {
3291                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
3292                         forwardedLength.setElementAt(currentLen, jmpDest);
3293                     }
3294                 }
3295             }
3296             break;
3297
3298
3299         case URX_STRING:
3300             {
3301                 loc++;
3302                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3303                 currentLen += URX_VAL(stringLenOp);
3304             }
3305             break;
3306
3307
3308         case URX_STRING_I:
3309             {
3310                 loc++;
3311                 // TODO: with full case folding, matching input text may be shorter than
3312                 //       the string we have here.  More smarts could put some bounds on it.
3313                 //       Assume a min length of one for now.  A min length of zero causes
3314                 //        optimization failures for a pattern like "string"+
3315                 // currentLen += URX_VAL(stringLenOp);
3316                 currentLen += 1;
3317             }
3318             break;
3319
3320         case URX_CTR_INIT:
3321         case URX_CTR_INIT_NG:
3322             {
3323                 // Loop Init Ops.
3324                 //   If the min loop count == 0
3325                 //      move loc forwards to the end of the loop, skipping over the body.
3326                 //   If the min count is > 0,
3327                 //      continue normal processing of the body of the loop.
3328                 int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3329                         loopEndLoc   = URX_VAL(loopEndLoc);
3330                 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3331                 if (minLoopCount == 0) {
3332                     loc = loopEndLoc;
3333                 } else {
3334                     loc+=3;  // Skips over operands of CTR_INIT
3335                 }
3336             }
3337             break;
3338
3339
3340         case URX_CTR_LOOP:
3341         case URX_CTR_LOOP_NG:
3342             // Loop ops.
3343             //  The jump is conditional, backwards only.
3344             break;
3345
3346         case URX_LOOP_SR_I:
3347         case URX_LOOP_DOT_I:
3348         case URX_LOOP_C:
3349             // More loop ops.  These state-save to themselves.
3350             //   don't change the minimum match - could match nothing at all.
3351             break;
3352
3353
3354         case URX_LA_START:
3355         case URX_LB_START:
3356             {
3357                 // Look-around.  Scan forward until the matching look-ahead end,
3358                 //   without processing the look-around block.  This is overly pessimistic for look-ahead,
3359                 //   it assumes that the look-ahead match might be zero-length.
3360                 //   TODO:  Positive lookahead could recursively do the block, then continue
3361                 //          with the longer of the block or the value coming in.  Ticket 6060
3362                 int32_t  depth = (opType == URX_LA_START? 2: 1);;
3363                 for (;;) {
3364                     loc++;
3365                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3366                     if (URX_TYPE(op) == URX_LA_START) {
3367                         // The boilerplate for look-ahead includes two LA_END insturctions,
3368                         //    Depth will be decremented by each one when it is seen.
3369                         depth += 2;
3370                     }
3371                     if (URX_TYPE(op) == URX_LB_START) {
3372                         depth++;
3373                     }
3374                     if (URX_TYPE(op) == URX_LA_END) {
3375                         depth--;
3376                         if (depth == 0) {
3377                             break;
3378                         }
3379                     }
3380                     if (URX_TYPE(op)==URX_LBN_END) {
3381                         depth--;
3382                         if (depth == 0) {
3383                             break;
3384                         }
3385                     }
3386                     if (URX_TYPE(op) == URX_STATE_SAVE) {
3387                         // Need this because neg lookahead blocks will FAIL to outside
3388                         //   of the block.
3389                         int32_t  jmpDest = URX_VAL(op);
3390                         if (jmpDest > loc) {
3391                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
3392                                 forwardedLength.setElementAt(currentLen, jmpDest);
3393                             }
3394                         }
3395                     }
3396                     U_ASSERT(loc <= end);
3397                 }
3398             }
3399             break;
3400
3401         case URX_LA_END:
3402         case URX_LB_CONT:
3403         case URX_LB_END:
3404         case URX_LBN_CONT:
3405         case URX_LBN_END:
3406             // Only come here if the matching URX_LA_START or URX_LB_START was not in the
3407             //   range being sized, which happens when measuring size of look-behind blocks.
3408             break;
3409
3410         default:
3411             U_ASSERT(FALSE);
3412             }
3413
3414         }
3415
3416     // We have finished walking through the ops.  Check whether some forward jump
3417     //   propagated a shorter length to location end+1.
3418     if (forwardedLength.elementAti(end+1) < currentLen) {
3419         currentLen = forwardedLength.elementAti(end+1);
3420         U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3421     }
3422
3423     return currentLen;
3424 }
3425
3426 // Increment with overflow check.
3427 // val and delta will both be positive.
3428
3429 static int32_t safeIncrement(int32_t val, int32_t delta) {
3430     if (INT32_MAX - val > delta) {
3431         return val + delta;
3432     } else {
3433         return INT32_MAX;
3434     }
3435 }
3436
3437
3438 //------------------------------------------------------------------------------
3439 //
3440 //   maxMatchLength    Calculate the length of the longest string that could
3441 //                     match the specified pattern.
3442 //                     Length is in 16 bit code units, not code points.
3443 //
3444 //                     The calculated length may not be exact.  The returned
3445 //                     value may be longer than the actual maximum; it must
3446 //                     never be shorter.
3447 //
3448 //------------------------------------------------------------------------------
3449 int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
3450     if (U_FAILURE(*fStatus)) {
3451         return 0;
3452     }
3453     U_ASSERT(start <= end);
3454     U_ASSERT(end < fRXPat->fCompiledPat->size());
3455
3456
3457     int32_t    loc;
3458     int32_t    op;
3459     int32_t    opType;
3460     int32_t    currentLen = 0;
3461     UVector32  forwardedLength(end+1, *fStatus);
3462     forwardedLength.setSize(end+1);
3463
3464     for (loc=start; loc<=end; loc++) {
3465         forwardedLength.setElementAt(0, loc);
3466     }
3467
3468     for (loc = start; loc<=end; loc++) {
3469         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3470         opType = URX_TYPE(op);
3471
3472         // The loop is advancing linearly through the pattern.
3473         // If the op we are now at was the destination of a branch in the pattern,
3474         // and that path has a longer maximum length than the current accumulated value,
3475         // replace the current accumulated value.
3476         if (forwardedLength.elementAti(loc) > currentLen) {
3477             currentLen = forwardedLength.elementAti(loc);
3478         }
3479
3480         switch (opType) {
3481             // Ops that don't change the total length matched
3482         case URX_RESERVED_OP:
3483         case URX_END:
3484         case URX_STRING_LEN:
3485         case URX_NOP:
3486         case URX_START_CAPTURE:
3487         case URX_END_CAPTURE:
3488         case URX_BACKSLASH_B:
3489         case URX_BACKSLASH_BU:
3490         case URX_BACKSLASH_G:
3491         case URX_BACKSLASH_Z:
3492         case URX_CARET:
3493         case URX_DOLLAR:
3494         case URX_DOLLAR_M:
3495         case URX_DOLLAR_D:
3496         case URX_DOLLAR_MD:
3497         case URX_RELOC_OPRND:
3498         case URX_STO_INP_LOC:
3499         case URX_CARET_M:
3500         case URX_CARET_M_UNIX:
3501
3502         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3503         case URX_LD_SP:
3504
3505         case URX_LB_END:
3506         case URX_LB_CONT:
3507         case URX_LBN_CONT:
3508         case URX_LBN_END:
3509             break;
3510
3511
3512             // Ops that increase that cause an unbounded increase in the length
3513             //   of a matched string, or that increase it a hard to characterize way.
3514             //   Call the max length unbounded, and stop further checking.
3515         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3516         case URX_BACKREF_I:
3517         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
3518             currentLen = INT32_MAX;
3519             break;
3520
3521
3522             // Ops that match a max of one character (possibly two 16 bit code units.)
3523             //
3524         case URX_STATIC_SETREF:
3525         case URX_STAT_SETREF_N:
3526         case URX_SETREF:
3527         case URX_BACKSLASH_D:
3528         case URX_BACKSLASH_H:
3529         case URX_BACKSLASH_R:
3530         case URX_BACKSLASH_V:
3531         case URX_ONECHAR_I:
3532         case URX_DOTANY_ALL:
3533         case URX_DOTANY:
3534         case URX_DOTANY_UNIX:
3535             currentLen = safeIncrement(currentLen, 2);
3536             break;
3537
3538             // Single literal character.  Increase current max length by one or two,
3539             //       depending on whether the char is in the supplementary range.
3540         case URX_ONECHAR:
3541             currentLen = safeIncrement(currentLen, 1);
3542             if (URX_VAL(op) > 0x10000) {
3543                 currentLen = safeIncrement(currentLen, 1);
3544             }
3545             break;
3546
3547             // Jumps.
3548             //
3549         case URX_JMP:
3550         case URX_JMPX:
3551         case URX_JMP_SAV:
3552         case URX_JMP_SAV_X:
3553             {
3554                 int32_t  jmpDest = URX_VAL(op);
3555                 if (jmpDest < loc) {
3556                     // Loop of some kind.  Max match length is unbounded.
3557                     currentLen = INT32_MAX;
3558                 } else {
3559                     // Forward jump.  Propagate the current min length to the target loc of the jump.
3560                     if (forwardedLength.elementAti(jmpDest) < currentLen) {
3561                         forwardedLength.setElementAt(currentLen, jmpDest);
3562                     }
3563                     currentLen = 0;
3564                 }
3565             }
3566             break;
3567
3568         case URX_BACKTRACK:
3569             // back-tracks are kind of like a branch, except that the max length was
3570             //   propagated already, by the state save.
3571             currentLen = forwardedLength.elementAti(loc+1);
3572             break;
3573
3574
3575         case URX_STATE_SAVE:
3576             {
3577                 // State Save, for forward jumps, propagate the current minimum.
3578                 //               of the state save.
3579                 //             For backwards jumps, they create a loop, maximum
3580                 //               match length is unbounded.
3581                 int32_t  jmpDest = URX_VAL(op);
3582                 if (jmpDest > loc) {
3583                     if (currentLen > forwardedLength.elementAti(jmpDest)) {
3584                         forwardedLength.setElementAt(currentLen, jmpDest);
3585                     }
3586                 } else {
3587                     currentLen = INT32_MAX;
3588                 }
3589             }
3590             break;
3591
3592
3593
3594
3595         case URX_STRING:
3596             {
3597                 loc++;
3598                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3599                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3600                 break;
3601             }
3602
3603         case URX_STRING_I:
3604             // TODO:  This code assumes that any user string that matches will be no longer
3605             //        than our compiled string, with case insensitive matching.
3606             //        Our compiled string has been case-folded already.
3607             //
3608             //        Any matching user string will have no more code points than our
3609             //        compiled (folded) string.  Folding may add code points, but
3610             //        not remove them.
3611             //
3612             //        There is a potential problem if a supplemental code point
3613             //        case-folds to a BMP code point.  In this case our compiled string
3614             //        could be shorter (in code units) than a matching user string.
3615             //
3616             //        At this time (Unicode 6.1) there are no such characters, and this case
3617             //        is not being handled.  A test, intltest regex/Bug9283, will fail if
3618             //        any problematic characters are added to Unicode.
3619             //
3620             //        If this happens, we can make a set of the BMP chars that the
3621             //        troublesome supplementals fold to, scan our string, and bump the
3622             //        currentLen one extra for each that is found.
3623             //
3624             {
3625                 loc++;
3626                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3627                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3628             }
3629             break;
3630
3631         case URX_CTR_INIT:
3632         case URX_CTR_INIT_NG:
3633             // For Loops, recursively call this function on the pattern for the loop body,
3634             //   then multiply the result by the maximum loop count.
3635             {
3636                 int32_t  loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
3637                 if (loopEndLoc == loc+4) {
3638                     // Loop has an empty body. No affect on max match length.
3639                     // Continue processing with code after the loop end.
3640                     loc = loopEndLoc;
3641                     break;
3642                 }
3643
3644                 int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3));
3645                 if (maxLoopCount == -1) {
3646                     // Unbounded Loop. No upper bound on match length.
3647                     currentLen = INT32_MAX;
3648                     break;
3649                 }
3650
3651                 U_ASSERT(loopEndLoc >= loc+4);
3652                 int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1);  // Recursive call.
3653                 int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount;
3654                 if (updatedLen >= INT32_MAX) {
3655                     currentLen = INT32_MAX;
3656                     break;
3657                 }
3658                 currentLen = (int32_t)updatedLen;
3659                 loc = loopEndLoc;
3660                 break;
3661             }
3662
3663         case URX_CTR_LOOP:
3664         case URX_CTR_LOOP_NG:
3665             // These opcodes will be skipped over by code for URX_CRT_INIT.
3666             // We shouldn't encounter them here.
3667             U_ASSERT(FALSE);
3668             break;
3669
3670         case URX_LOOP_SR_I:
3671         case URX_LOOP_DOT_I:
3672         case URX_LOOP_C:
3673             // For anything to do with loops, make the match length unbounded.
3674             currentLen = INT32_MAX;
3675             break;
3676
3677
3678
3679         case URX_LA_START:
3680         case URX_LA_END:
3681             // Look-ahead.  Just ignore, treat the look-ahead block as if
3682             // it were normal pattern.  Gives a too-long match length,
3683             //  but good enough for now.
3684             break;
3685
3686             // End of look-ahead ops should always be consumed by the processing at
3687             //  the URX_LA_START op.
3688             // U_ASSERT(FALSE);
3689             // break;
3690
3691         case URX_LB_START:
3692             {
3693                 // Look-behind.  Scan forward until the matching look-around end,
3694                 //   without processing the look-behind block.
3695                 int32_t  depth = 0;
3696                 for (;;) {
3697                     loc++;
3698                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3699                     if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) {
3700                         depth++;
3701                     }
3702                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3703                         if (depth == 0) {
3704                             break;
3705                         }
3706                         depth--;
3707                     }
3708                     U_ASSERT(loc < end);
3709                 }
3710             }
3711             break;
3712
3713         default:
3714             U_ASSERT(FALSE);
3715         }
3716
3717
3718         if (currentLen == INT32_MAX) {
3719             //  The maximum length is unbounded.
3720             //  Stop further processing of the pattern.
3721             break;
3722         }
3723
3724     }
3725     return currentLen;
3726
3727 }
3728
3729
3730 //------------------------------------------------------------------------------
3731 //
3732 //   stripNOPs    Remove any NOP operations from the compiled pattern code.
3733 //                Extra NOPs are inserted for some constructs during the initial
3734 //                code generation to provide locations that may be patched later.
3735 //                Many end up unneeded, and are removed by this function.
3736 //
3737 //                In order to minimize the number of passes through the pattern,
3738 //                back-reference fixup is also performed here (adjusting
3739 //                back-reference operands to point to the correct frame offsets).
3740 //
3741 //------------------------------------------------------------------------------
3742 void RegexCompile::stripNOPs() {
3743
3744     if (U_FAILURE(*fStatus)) {
3745         return;
3746     }
3747
3748     int32_t    end = fRXPat->fCompiledPat->size();
3749     UVector32  deltas(end, *fStatus);
3750
3751     // Make a first pass over the code, computing the amount that things
3752     //   will be offset at each location in the original code.
3753     int32_t   loc;
3754     int32_t   d = 0;
3755     for (loc=0; loc<end; loc++) {
3756         deltas.addElement(d, *fStatus);
3757         int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3758         if (URX_TYPE(op) == URX_NOP) {
3759             d++;
3760         }
3761     }
3762
3763     UnicodeString caseStringBuffer;
3764
3765     // Make a second pass over the code, removing the NOPs by moving following
3766     //  code up, and patching operands that refer to code locations that
3767     //  are being moved.  The array of offsets from the first step is used
3768     //  to compute the new operand values.
3769     int32_t src;
3770     int32_t dst = 0;
3771     for (src=0; src<end; src++) {
3772         int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
3773         int32_t opType = URX_TYPE(op);
3774         switch (opType) {
3775         case URX_NOP:
3776             break;
3777
3778         case URX_STATE_SAVE:
3779         case URX_JMP:
3780         case URX_CTR_LOOP:
3781         case URX_CTR_LOOP_NG:
3782         case URX_RELOC_OPRND:
3783         case URX_JMPX:
3784         case URX_JMP_SAV:
3785         case URX_JMP_SAV_X:
3786             // These are instructions with operands that refer to code locations.
3787             {
3788                 int32_t  operandAddress = URX_VAL(op);
3789                 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
3790                 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3791                 op = buildOp(opType, fixedOperandAddress);
3792                 fRXPat->fCompiledPat->setElementAt(op, dst);
3793                 dst++;
3794                 break;
3795             }
3796
3797         case URX_BACKREF:
3798         case URX_BACKREF_I:
3799             {
3800                 int32_t where = URX_VAL(op);
3801                 if (where > fRXPat->fGroupMap->size()) {
3802                     error(U_REGEX_INVALID_BACK_REF);
3803                     break;
3804                 }
3805                 where = fRXPat->fGroupMap->elementAti(where-1);
3806                 op    = buildOp(opType, where);
3807                 fRXPat->fCompiledPat->setElementAt(op, dst);
3808                 dst++;
3809
3810                 fRXPat->fNeedsAltInput = TRUE;
3811                 break;
3812             }
3813         case URX_RESERVED_OP:
3814         case URX_RESERVED_OP_N:
3815         case URX_BACKTRACK:
3816         case URX_END:
3817         case URX_ONECHAR:
3818         case URX_STRING:
3819         case URX_STRING_LEN:
3820         case URX_START_CAPTURE:
3821         case URX_END_CAPTURE:
3822         case URX_STATIC_SETREF:
3823         case URX_STAT_SETREF_N:
3824         case URX_SETREF:
3825         case URX_DOTANY:
3826         case URX_FAIL:
3827         case URX_BACKSLASH_B:
3828         case URX_BACKSLASH_BU:
3829         case URX_BACKSLASH_G:
3830         case URX_BACKSLASH_X:
3831         case URX_BACKSLASH_Z:
3832         case URX_DOTANY_ALL:
3833         case URX_BACKSLASH_D:
3834         case URX_CARET:
3835         case URX_DOLLAR:
3836         case URX_CTR_INIT:
3837         case URX_CTR_INIT_NG:
3838         case URX_DOTANY_UNIX:
3839         case URX_STO_SP:
3840         case URX_LD_SP:
3841         case URX_STO_INP_LOC:
3842         case URX_LA_START:
3843         case URX_LA_END:
3844         case URX_ONECHAR_I:
3845         case URX_STRING_I:
3846         case URX_DOLLAR_M:
3847         case URX_CARET_M:
3848         case URX_CARET_M_UNIX:
3849         case URX_LB_START:
3850         case URX_LB_CONT:
3851         case URX_LB_END:
3852         case URX_LBN_CONT:
3853         case URX_LBN_END:
3854         case URX_LOOP_SR_I:
3855         case URX_LOOP_DOT_I:
3856         case URX_LOOP_C:
3857         case URX_DOLLAR_D:
3858         case URX_DOLLAR_MD:
3859         case URX_BACKSLASH_H:
3860         case URX_BACKSLASH_R:
3861         case URX_BACKSLASH_V:
3862             // These instructions are unaltered by the relocation.
3863             fRXPat->fCompiledPat->setElementAt(op, dst);
3864             dst++;
3865             break;
3866
3867         default:
3868             // Some op is unaccounted for.
3869             U_ASSERT(FALSE);
3870             error(U_REGEX_INTERNAL_ERROR);
3871         }
3872     }
3873
3874     fRXPat->fCompiledPat->setSize(dst);
3875 }
3876
3877
3878
3879
3880 //------------------------------------------------------------------------------
3881 //
3882 //  Error         Report a rule parse error.
3883 //                Only report it if no previous error has been recorded.
3884 //
3885 //------------------------------------------------------------------------------
3886 void RegexCompile::error(UErrorCode e) {
3887     if (U_SUCCESS(*fStatus)) {
3888         *fStatus = e;
3889         // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3890         // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3891         // int64_t. If the values of the latter are out of range for the former,
3892         // set them to the appropriate "field not supported" values.
3893         if (fLineNum > 0x7FFFFFFF) {
3894             fParseErr->line   = 0;
3895             fParseErr->offset = -1;
3896         } else if (fCharNum > 0x7FFFFFFF) {
3897             fParseErr->line   = (int32_t)fLineNum;
3898             fParseErr->offset = -1;
3899         } else {
3900             fParseErr->line   = (int32_t)fLineNum;
3901             fParseErr->offset = (int32_t)fCharNum;
3902         }
3903
3904         UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
3905
3906         // Fill in the context.
3907         //   Note: extractBetween() pins supplied indicies to the string bounds.
3908         uprv_memset(fParseErr->preContext,  0, sizeof(fParseErr->preContext));
3909         uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
3910         utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
3911         utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
3912     }
3913 }
3914
3915
3916 //
3917 //  Assorted Unicode character constants.
3918 //     Numeric because there is no portable way to enter them as literals.
3919 //     (Think EBCDIC).
3920 //
3921 static const UChar      chCR        = 0x0d;      // New lines, for terminating comments.
3922 static const UChar      chLF        = 0x0a;      // Line Feed
3923 static const UChar      chPound     = 0x23;      // '#', introduces a comment.
3924 static const UChar      chDigit0    = 0x30;      // '0'
3925 static const UChar      chDigit7    = 0x37;      // '9'
3926 static const UChar      chColon     = 0x3A;      // ':'
3927 static const UChar      chE         = 0x45;      // 'E'
3928 static const UChar      chQ         = 0x51;      // 'Q'
3929 //static const UChar      chN         = 0x4E;      // 'N'
3930 static const UChar      chP         = 0x50;      // 'P'
3931 static const UChar      chBackSlash = 0x5c;      // '\'  introduces a char escape
3932 //static const UChar      chLBracket  = 0x5b;      // '['
3933 static const UChar      chRBracket  = 0x5d;      // ']'
3934 static const UChar      chUp        = 0x5e;      // '^'
3935 static const UChar      chLowerP    = 0x70;
3936 static const UChar      chLBrace    = 0x7b;      // '{'
3937 static const UChar      chRBrace    = 0x7d;      // '}'
3938 static const UChar      chNEL       = 0x85;      //    NEL newline variant
3939 static const UChar      chLS        = 0x2028;    //    Unicode Line Separator
3940
3941
3942 //------------------------------------------------------------------------------
3943 //
3944 //  nextCharLL    Low Level Next Char from the regex pattern.
3945 //                Get a char from the string, keep track of input position
3946 //                     for error reporting.
3947 //
3948 //------------------------------------------------------------------------------
3949 UChar32  RegexCompile::nextCharLL() {
3950     UChar32       ch;
3951
3952     if (fPeekChar != -1) {
3953         ch = fPeekChar;
3954         fPeekChar = -1;
3955         return ch;
3956     }
3957
3958     // assume we're already in the right place
3959     ch = UTEXT_NEXT32(fRXPat->fPattern);
3960     if (ch == U_SENTINEL) {
3961         return ch;
3962     }
3963
3964     if (ch == chCR ||
3965         ch == chNEL ||
3966         ch == chLS   ||
3967         (ch == chLF && fLastChar != chCR)) {
3968         // Character is starting a new line.  Bump up the line number, and
3969         //  reset the column to 0.
3970         fLineNum++;
3971         fCharNum=0;
3972     }
3973     else {
3974         // Character is not starting a new line.  Except in the case of a
3975         //   LF following a CR, increment the column position.
3976         if (ch != chLF) {
3977             fCharNum++;
3978         }
3979     }
3980     fLastChar = ch;
3981     return ch;
3982 }
3983
3984 //------------------------------------------------------------------------------
3985 //
3986 //   peekCharLL    Low Level Character Scanning, sneak a peek at the next
3987 //                 character without actually getting it.
3988 //
3989 //------------------------------------------------------------------------------
3990 UChar32  RegexCompile::peekCharLL() {
3991     if (fPeekChar == -1) {
3992         fPeekChar = nextCharLL();
3993     }
3994     return fPeekChar;
3995 }
3996
3997
3998 //------------------------------------------------------------------------------
3999 //
4000 //   nextChar     for pattern scanning.  At this level, we handle stripping
4001 //                out comments and processing some backslash character escapes.
4002 //                The rest of the pattern grammar is handled at the next level up.
4003 //
4004 //------------------------------------------------------------------------------
4005 void RegexCompile::nextChar(RegexPatternChar &c) {
4006
4007     fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4008     c.fChar    = nextCharLL();
4009     c.fQuoted  = FALSE;
4010
4011     if (fQuoteMode) {
4012         c.fQuoted = TRUE;
4013         if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
4014             c.fChar == (UChar32)-1) {
4015             fQuoteMode = FALSE;  //  Exit quote mode,
4016             nextCharLL();        // discard the E
4017             nextChar(c);         // recurse to get the real next char
4018         }
4019     }
4020     else if (fInBackslashQuote) {
4021         // The current character immediately follows a '\'
4022         // Don't check for any further escapes, just return it as-is.
4023         // Don't set c.fQuoted, because that would prevent the state machine from
4024         //    dispatching on the character.
4025         fInBackslashQuote = FALSE;
4026     }
4027     else
4028     {
4029         // We are not in a \Q quoted region \E of the source.
4030         //
4031         if (fModeFlags & UREGEX_COMMENTS) {
4032             //
4033             // We are in free-spacing and comments mode.
4034             //  Scan through any white space and comments, until we
4035             //  reach a significant character or the end of inut.
4036             for (;;) {
4037                 if (c.fChar == (UChar32)-1) {
4038                     break;     // End of Input
4039                 }
4040                 if  (c.fChar == chPound && fEOLComments == TRUE) {
4041                     // Start of a comment.  Consume the rest of it, until EOF or a new line
4042                     for (;;) {
4043                         c.fChar = nextCharLL();
4044                         if (c.fChar == (UChar32)-1 ||  // EOF
4045                             c.fChar == chCR        ||
4046                             c.fChar == chLF        ||
4047                             c.fChar == chNEL       ||
4048                             c.fChar == chLS)       {
4049                             break;
4050                         }
4051                     }
4052                 }
4053                 // TODO:  check what Java & Perl do with non-ASCII white spaces.  Ticket 6061.
4054                 if (PatternProps::isWhiteSpace(c.fChar) == FALSE) {
4055                     break;
4056                 }
4057                 c.fChar = nextCharLL();
4058             }
4059         }
4060
4061         //
4062         //  check for backslash escaped characters.
4063         //
4064         if (c.fChar == chBackSlash) {
4065             int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4066             if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
4067                 //
4068                 // A '\' sequence that is handled by ICU's standard unescapeAt function.
4069                 //   Includes \uxxxx, \n, \r, many others.
4070                 //   Return the single equivalent character.
4071                 //
4072                 nextCharLL();                 // get & discard the peeked char.
4073                 c.fQuoted = TRUE;
4074
4075                 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
4076                     int32_t endIndex = (int32_t)pos;
4077                     c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
4078
4079                     if (endIndex == pos) {
4080                         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4081                     }
4082                     fCharNum += endIndex - pos;
4083                     UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
4084                 } else {
4085                     int32_t offset = 0;
4086                     struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
4087
4088                     UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
4089                     c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
4090
4091                     if (offset == 0) {
4092                         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4093                     } else if (context.lastOffset == offset) {
4094                         UTEXT_PREVIOUS32(fRXPat->fPattern);
4095                     } else if (context.lastOffset != offset-1) {
4096                         utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
4097                     }
4098                     fCharNum += offset;
4099                 }
4100             }
4101             else if (peekCharLL() == chDigit0) {
4102                 //  Octal Escape, using Java Regexp Conventions
4103                 //    which are \0 followed by 1-3 octal digits.
4104                 //    Different from ICU Unescape handling of Octal, which does not
4105                 //    require the leading 0.
4106                 //  Java also has the convention of only consuming 2 octal digits if
4107                 //    the three digit number would be > 0xff
4108                 //
4109                 c.fChar = 0;
4110                 nextCharLL();    // Consume the initial 0.
4111                 int index;
4112                 for (index=0; index<3; index++) {
4113                     int32_t ch = peekCharLL();
4114                     if (ch<chDigit0 || ch>chDigit7) {
4115                         if (index==0) {
4116                            // \0 is not followed by any octal digits.
4117                            error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4118                         }
4119                         break;
4120                     }
4121                     c.fChar <<= 3;
4122                     c.fChar += ch&7;
4123                     if (c.fChar <= 255) {
4124                         nextCharLL();
4125                     } else {
4126                         // The last digit made the number too big.  Forget we saw it.
4127                         c.fChar >>= 3;
4128                     }
4129                 }
4130                 c.fQuoted = TRUE;
4131             }
4132             else if (peekCharLL() == chQ) {
4133                 //  "\Q"  enter quote mode, which will continue until "\E"
4134                 fQuoteMode = TRUE;
4135                 nextCharLL();       // discard the 'Q'.
4136                 nextChar(c);        // recurse to get the real next char.
4137             }
4138             else
4139             {
4140                 // We are in a '\' escape that will be handled by the state table scanner.
4141                 // Just return the backslash, but remember that the following char is to
4142                 //  be taken literally.
4143                 fInBackslashQuote = TRUE;
4144             }
4145         }
4146     }
4147
4148     // re-enable # to end-of-line comments, in case they were disabled.
4149     // They are disabled by the parser upon seeing '(?', but this lasts for
4150     //  the fetching of the next character only.
4151     fEOLComments = TRUE;
4152
4153     // putc(c.fChar, stdout);
4154 }
4155
4156
4157
4158 //------------------------------------------------------------------------------
4159 //
4160 //  scanNamedChar
4161 //            Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4162 //
4163 //             The scan position will be at the 'N'.  On return
4164 //             the scan position should be just after the '}'
4165 //
4166 //             Return the UChar32
4167 //
4168 //------------------------------------------------------------------------------
4169 UChar32  RegexCompile::scanNamedChar() {
4170     if (U_FAILURE(*fStatus)) {
4171         return 0;
4172     }
4173
4174     nextChar(fC);
4175     if (fC.fChar != chLBrace) {
4176         error(U_REGEX_PROPERTY_SYNTAX);
4177         return 0;
4178     }
4179
4180     UnicodeString  charName;
4181     for (;;) {
4182         nextChar(fC);
4183         if (fC.fChar == chRBrace) {
4184             break;
4185         }
4186         if (fC.fChar == -1) {
4187             error(U_REGEX_PROPERTY_SYNTAX);
4188             return 0;
4189         }
4190         charName.append(fC.fChar);
4191     }
4192
4193     char name[100];
4194     if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
4195          (uint32_t)charName.length()>=sizeof(name)) {
4196         // All Unicode character names have only invariant characters.
4197         // The API to get a character, given a name, accepts only char *, forcing us to convert,
4198         //   which requires this error check
4199         error(U_REGEX_PROPERTY_SYNTAX);
4200         return 0;
4201     }
4202     charName.extract(0, charName.length(), name, sizeof(name), US_INV);
4203
4204     UChar32  theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
4205     if (U_FAILURE(*fStatus)) {
4206         error(U_REGEX_PROPERTY_SYNTAX);
4207     }
4208
4209     nextChar(fC);      // Continue overall regex pattern processing with char after the '}'
4210     return theChar;
4211 }
4212
4213 //------------------------------------------------------------------------------
4214 //
4215 //  scanProp   Construct a UnicodeSet from the text at the current scan
4216 //             position, which will be of the form \p{whaterver}
4217 //
4218 //             The scan position will be at the 'p' or 'P'.  On return
4219 //             the scan position should be just after the '}'
4220 //
4221 //             Return a UnicodeSet, constructed from the \P pattern,
4222 //             or NULL if the pattern is invalid.
4223 //
4224 //------------------------------------------------------------------------------
4225 UnicodeSet *RegexCompile::scanProp() {
4226     UnicodeSet    *uset = NULL;
4227
4228     if (U_FAILURE(*fStatus)) {
4229         return NULL;
4230     }
4231     (void)chLowerP;   // Suppress compiler unused variable warning.
4232     U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
4233     UBool negated = (fC.fChar == chP);
4234
4235     UnicodeString propertyName;
4236     nextChar(fC);
4237     if (fC.fChar != chLBrace) {
4238         error(U_REGEX_PROPERTY_SYNTAX);
4239         return NULL;
4240     }
4241     for (;;) {
4242         nextChar(fC);
4243         if (fC.fChar == chRBrace) {
4244             break;
4245         }
4246         if (fC.fChar == -1) {
4247             // Hit the end of the input string without finding the closing '}'
4248             error(U_REGEX_PROPERTY_SYNTAX);
4249             return NULL;
4250         }
4251         propertyName.append(fC.fChar);
4252     }
4253     uset = createSetForProperty(propertyName, negated);
4254     nextChar(fC);    // Move input scan to position following the closing '}'
4255     return uset;
4256 }
4257
4258 //------------------------------------------------------------------------------
4259 //
4260 //  scanPosixProp   Construct a UnicodeSet from the text at the current scan
4261 //             position, which is expected be of the form [:property expression:]
4262 //
4263 //             The scan position will be at the opening ':'.  On return
4264 //             the scan position must be on the closing ']'
4265 //
4266 //             Return a UnicodeSet constructed from the pattern,
4267 //             or NULL if this is not a valid POSIX-style set expression.
4268 //             If not a property expression, restore the initial scan position
4269 //                (to the opening ':')
4270 //
4271 //               Note:  the opening '[:' is not sufficient to guarantee that
4272 //                      this is a [:property:] expression.
4273 //                      [:'+=,] is a perfectly good ordinary set expression that
4274 //                              happens to include ':' as one of its characters.
4275 //
4276 //------------------------------------------------------------------------------
4277 UnicodeSet *RegexCompile::scanPosixProp() {
4278     UnicodeSet    *uset = NULL;
4279
4280     if (U_FAILURE(*fStatus)) {
4281         return NULL;
4282     }
4283
4284     U_ASSERT(fC.fChar == chColon);
4285
4286     // Save the scanner state.
4287     // TODO:  move this into the scanner, with the state encapsulated in some way.  Ticket 6062
4288     int64_t     savedScanIndex        = fScanIndex;
4289     int64_t     savedNextIndex        = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4290     UBool       savedQuoteMode        = fQuoteMode;
4291     UBool       savedInBackslashQuote = fInBackslashQuote;
4292     UBool       savedEOLComments      = fEOLComments;
4293     int64_t     savedLineNum          = fLineNum;
4294     int64_t     savedCharNum          = fCharNum;
4295     UChar32     savedLastChar         = fLastChar;
4296     UChar32     savedPeekChar         = fPeekChar;
4297     RegexPatternChar savedfC          = fC;
4298
4299     // Scan for a closing ].   A little tricky because there are some perverse
4300     //   edge cases possible.  "[:abc\Qdef:] \E]"  is a valid non-property expression,
4301     //   ending on the second closing ].
4302
4303     UnicodeString propName;
4304     UBool         negated  = FALSE;
4305
4306     // Check for and consume the '^' in a negated POSIX property, e.g.  [:^Letter:]
4307     nextChar(fC);
4308     if (fC.fChar == chUp) {
4309        negated = TRUE;
4310        nextChar(fC);
4311     }
4312
4313     // Scan for the closing ":]", collecting the property name along the way.
4314     UBool  sawPropSetTerminator = FALSE;
4315     for (;;) {
4316         propName.append(fC.fChar);
4317         nextChar(fC);
4318         if (fC.fQuoted || fC.fChar == -1) {
4319             // Escaped characters or end of input - either says this isn't a [:Property:]
4320             break;
4321         }
4322         if (fC.fChar == chColon) {
4323             nextChar(fC);
4324             if (fC.fChar == chRBracket) {
4325                 sawPropSetTerminator = TRUE;
4326             }
4327             break;
4328         }
4329     }
4330
4331     if (sawPropSetTerminator) {
4332         uset = createSetForProperty(propName, negated);
4333     }
4334     else
4335     {
4336         // No closing ":]".
4337         //  Restore the original scan position.
4338         //  The main scanner will retry the input as a normal set expression,
4339         //    not a [:Property:] expression.
4340         fScanIndex        = savedScanIndex;
4341         fQuoteMode        = savedQuoteMode;
4342         fInBackslashQuote = savedInBackslashQuote;
4343         fEOLComments      = savedEOLComments;
4344         fLineNum          = savedLineNum;
4345         fCharNum          = savedCharNum;
4346         fLastChar         = savedLastChar;
4347         fPeekChar         = savedPeekChar;
4348         fC                = savedfC;
4349         UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
4350     }
4351     return uset;
4352 }
4353
4354 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
4355     set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4356     addCategory(set, U_GC_CF_MASK, ec);
4357 }
4358
4359 //
4360 //  Create a Unicode Set from a Unicode Property expression.
4361 //     This is common code underlying both \p{...} ane [:...:] expressions.
4362 //     Includes trying the Java "properties" that aren't supported as
4363 //     normal ICU UnicodeSet properties
4364 //
4365 static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{"
4366 static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{"
4367 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
4368     UnicodeString   setExpr;
4369     UnicodeSet      *set;
4370     uint32_t        usetFlags = 0;
4371
4372     if (U_FAILURE(*fStatus)) {
4373         return NULL;
4374     }
4375
4376     //
4377     //  First try the property as we received it
4378     //
4379     if (negated) {
4380         setExpr.append(negSetPrefix, -1);
4381     } else {
4382         setExpr.append(posSetPrefix, -1);
4383     }
4384     setExpr.append(propName);
4385     setExpr.append(chRBrace);
4386     setExpr.append(chRBracket);
4387     if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
4388         usetFlags |= USET_CASE_INSENSITIVE;
4389     }
4390     set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
4391     if (U_SUCCESS(*fStatus)) {
4392        return set;
4393     }
4394     delete set;
4395     set = NULL;
4396
4397     //
4398     //  The property as it was didn't work.
4399
4400     //  Do [:word:]. It is not recognized as a property by UnicodeSet.  "word" not standard POSIX
4401     //     or standard Java, but many other regular expression packages do recognize it.
4402
4403     if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) {
4404         *fStatus = U_ZERO_ERROR;
4405         set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET]));
4406         if (set == NULL) {
4407             *fStatus = U_MEMORY_ALLOCATION_ERROR;
4408             return set;
4409         }
4410         if (negated) {
4411             set->complement();
4412         }
4413         return set;
4414     }
4415
4416
4417     //    Do Java fixes -
4418     //       InGreek -> InGreek or Coptic, that being the official Unicode name for that block.
4419     //       InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols.
4420     //
4421     //       Note on Spaces:  either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols"
4422     //                        is accepted by Java.  The property part of the name is compared
4423     //                        case-insenstively.  The spaces must be exactly as shown, either
4424     //                        all there, or all omitted, with exactly one at each position
4425     //                        if they are present.  From checking against JDK 1.6
4426     //
4427     //       This code should be removed when ICU properties support the Java  compatibility names
4428     //          (ICU 4.0?)
4429     //
4430     UnicodeString mPropName = propName;
4431     if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) {
4432         mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic");
4433     }
4434     if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbols"), 0) == 0 ||
4435         mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"), 0) == 0) {
4436         mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Symbols");
4437     }
4438     else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
4439         mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint");
4440     }
4441
4442     //    See if the property looks like a Java "InBlockName", which
4443     //    we will recast as "Block=BlockName"
4444     //
4445     static const UChar IN[] = {0x49, 0x6E, 0};  // "In"
4446     static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00};  // "Block="
4447     if (mPropName.startsWith(IN, 2) && propName.length()>=3) {
4448         setExpr.truncate(4);   // Leaves "[\p{", or "[\P{"
4449         setExpr.append(BLOCK, -1);
4450         setExpr.append(UnicodeString(mPropName, 2));  // Property with the leading "In" removed.
4451         setExpr.append(chRBrace);
4452         setExpr.append(chRBracket);
4453         *fStatus = U_ZERO_ERROR;
4454         set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
4455         if (U_SUCCESS(*fStatus)) {
4456             return set;
4457         }
4458         delete set;
4459         set = NULL;
4460     }
4461
4462     if (propName.startsWith(UNICODE_STRING_SIMPLE("java")) ||
4463         propName.compare(UNICODE_STRING_SIMPLE("all")) == 0)
4464     {
4465         UErrorCode localStatus = U_ZERO_ERROR;
4466         //setExpr.remove();
4467         set = new UnicodeSet();
4468         //
4469         //  Try the various Java specific properties.
4470         //   These all begin with "java"
4471         //
4472         if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDefined")) == 0) {
4473             addCategory(set, U_GC_CN_MASK, localStatus);
4474             set->complement();
4475         }
4476         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDigit")) == 0) {
4477             addCategory(set, U_GC_ND_MASK, localStatus);
4478         }
4479         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaIdentifierIgnorable")) == 0) {
4480             addIdentifierIgnorable(set, localStatus);
4481         }
4482         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaISOControl")) == 0) {
4483             set->add(0, 0x1F).add(0x7F, 0x9F);
4484         }
4485         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierPart")) == 0) {
4486             addCategory(set, U_GC_L_MASK, localStatus);
4487             addCategory(set, U_GC_SC_MASK, localStatus);
4488             addCategory(set, U_GC_PC_MASK, localStatus);
4489             addCategory(set, U_GC_ND_MASK, localStatus);
4490             addCategory(set, U_GC_NL_MASK, localStatus);
4491             addCategory(set, U_GC_MC_MASK, localStatus);
4492             addCategory(set, U_GC_MN_MASK, localStatus);
4493             addIdentifierIgnorable(set, localStatus);
4494         }
4495         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierStart")) == 0) {
4496             addCategory(set, U_GC_L_MASK, localStatus);
4497             addCategory(set, U_GC_NL_MASK, localStatus);
4498             addCategory(set, U_GC_SC_MASK, localStatus);
4499             addCategory(set, U_GC_PC_MASK, localStatus);
4500         }
4501         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetter")) == 0) {
4502             addCategory(set, U_GC_L_MASK, localStatus);
4503         }
4504         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetterOrDigit")) == 0) {
4505             addCategory(set, U_GC_L_MASK, localStatus);
4506             addCategory(set, U_GC_ND_MASK, localStatus);
4507         }
4508         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLowerCase")) == 0) {
4509             addCategory(set, U_GC_LL_MASK, localStatus);
4510         }
4511         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaMirrored")) == 0) {
4512             set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, localStatus);
4513         }
4514         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSpaceChar")) == 0) {
4515             addCategory(set, U_GC_Z_MASK, localStatus);
4516         }
4517         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSupplementaryCodePoint")) == 0) {
4518             set->add(0x10000, UnicodeSet::MAX_VALUE);
4519         }
4520         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaTitleCase")) == 0) {
4521             addCategory(set, U_GC_LT_MASK, localStatus);
4522         }
4523         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierStart")) == 0) {
4524             addCategory(set, U_GC_L_MASK, localStatus);
4525             addCategory(set, U_GC_NL_MASK, localStatus);
4526         }
4527         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierPart")) == 0) {
4528             addCategory(set, U_GC_L_MASK, localStatus);
4529             addCategory(set, U_GC_PC_MASK, localStatus);
4530             addCategory(set, U_GC_ND_MASK, localStatus);
4531             addCategory(set, U_GC_NL_MASK, localStatus);
4532             addCategory(set, U_GC_MC_MASK, localStatus);
4533             addCategory(set, U_GC_MN_MASK, localStatus);
4534             addIdentifierIgnorable(set, localStatus);
4535         }
4536         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUpperCase")) == 0) {
4537             addCategory(set, U_GC_LU_MASK, localStatus);
4538         }
4539         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaValidCodePoint")) == 0) {
4540             set->add(0, UnicodeSet::MAX_VALUE);
4541         }
4542         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaWhitespace")) == 0) {
4543             addCategory(set, U_GC_Z_MASK, localStatus);
4544             set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4545             set->add(9, 0x0d).add(0x1c, 0x1f);
4546         }
4547         else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
4548             set->add(0, UnicodeSet::MAX_VALUE);
4549         }
4550
4551         if (U_SUCCESS(localStatus) && !set->isEmpty()) {
4552             *fStatus = U_ZERO_ERROR;
4553             if (usetFlags & USET_CASE_INSENSITIVE) {
4554                 set->closeOver(USET_CASE_INSENSITIVE);
4555             }
4556             if (negated) {
4557                 set->complement();
4558             }
4559             return set;
4560         }
4561         delete set;
4562         set = NULL;
4563     }
4564     error(*fStatus);
4565     return NULL;
4566 }
4567
4568
4569
4570 //
4571 //  SetEval   Part of the evaluation of [set expressions].
4572 //            Perform any pending (stacked) operations with precedence
4573 //            equal or greater to that of the next operator encountered
4574 //            in the expression.
4575 //
4576 void RegexCompile::setEval(int32_t nextOp) {
4577     UnicodeSet *rightOperand = NULL;
4578     UnicodeSet *leftOperand  = NULL;
4579     for (;;) {
4580         U_ASSERT(fSetOpStack.empty()==FALSE);
4581         int32_t pendingSetOperation = fSetOpStack.peeki();
4582         if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
4583             break;
4584         }
4585         fSetOpStack.popi();
4586         U_ASSERT(fSetStack.empty() == FALSE);
4587         rightOperand = (UnicodeSet *)fSetStack.peek();
4588         switch (pendingSetOperation) {
4589             case setNegation:
4590                 rightOperand->complement();
4591                 break;
4592             case setCaseClose:
4593                 // TODO: need a simple close function.  Ticket 6065
4594                 rightOperand->closeOver(USET_CASE_INSENSITIVE);
4595                 rightOperand->removeAllStrings();
4596                 break;
4597             case setDifference1:
4598             case setDifference2:
4599                 fSetStack.pop();
4600                 leftOperand = (UnicodeSet *)fSetStack.peek();
4601                 leftOperand->removeAll(*rightOperand);
4602                 delete rightOperand;
4603                 break;
4604             case setIntersection1:
4605             case setIntersection2:
4606                 fSetStack.pop();
4607                 leftOperand = (UnicodeSet *)fSetStack.peek();
4608                 leftOperand->retainAll(*rightOperand);
4609                 delete rightOperand;
4610                 break;
4611             case setUnion:
4612                 fSetStack.pop();
4613                 leftOperand = (UnicodeSet *)fSetStack.peek();
4614                 leftOperand->addAll(*rightOperand);
4615                 delete rightOperand;
4616                 break;
4617             default:
4618                 U_ASSERT(FALSE);
4619                 break;
4620             }
4621         }
4622     }
4623
4624 void RegexCompile::setPushOp(int32_t op) {
4625     setEval(op);
4626     fSetOpStack.push(op, *fStatus);
4627     fSetStack.push(new UnicodeSet(), *fStatus);
4628 }
4629
4630 U_NAMESPACE_END
4631 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
4632