icuSources/i18n/regexcmp.cpp

   1 //
   2 //  file:  regexcmp.cpp
   3 //
   4 //  Copyright (C) 2002-2016 International Business Machines Corporation and others.
   5 //  All Rights Reserved.
   6 //
   7 //  This file contains the ICU regular expression compiler, which is responsible
   8 //  for processing a regular expression pattern into the compiled form that
   9 //  is used by the match finding engine.
  10 //
  11
  12 #include "unicode/utypes.h"
  13
  14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  15
  16 #include "unicode/ustring.h"
  17 #include "unicode/unistr.h"
  18 #include "unicode/uniset.h"
  19 #include "unicode/uchar.h"
  20 #include "unicode/uchriter.h"
  21 #include "unicode/parsepos.h"
  22 #include "unicode/parseerr.h"
  23 #include "unicode/regex.h"
  24 #include "unicode/utf.h"
  25 #include "unicode/utf16.h"
  26 #include "patternprops.h"
  27 #include "putilimp.h"
  28 #include "cmemory.h"
  29 #include "cstring.h"
  30 #include "uvectr32.h"
  31 #include "uvectr64.h"
  32 #include "uassert.h"
  33 #include "uinvchar.h"
  34
  35 #include "regeximp.h"
  36 #include "regexcst.h"   // Contains state table for the regex pattern parser.
  37                         //   generated by a Perl script.
  38 #include "regexcmp.h"
  39 #include "regexst.h"
  40 #include "regextxt.h"
  41
  42
  43
  44 U_NAMESPACE_BEGIN
  45
  46
  47 //------------------------------------------------------------------------------
  48 //
  49 //  Constructor.
  50 //
  51 //------------------------------------------------------------------------------
  52 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
  53    fParenStack(status), fSetStack(status), fSetOpStack(status)
  54 {
  55     // Lazy init of all shared global sets (needed for init()'s empty text)
  56     RegexStaticSets::initGlobals(&status);
  57
  58     fStatus           = &status;
  59
  60     fRXPat            = rxp;
  61     fScanIndex        = 0;
  62     fLastChar         = -1;
  63     fPeekChar         = -1;
  64     fLineNum          = 1;
  65     fCharNum          = 0;
  66     fQuoteMode        = FALSE;
  67     fInBackslashQuote = FALSE;
  68     fModeFlags        = fRXPat->fFlags | 0x80000000;
  69     fEOLComments      = TRUE;
  70
  71     fMatchOpenParen   = -1;
  72     fMatchCloseParen  = -1;
  73     fCaptureName      = NULL;
  74     fLastSetLiteral   = U_SENTINEL;
  75
  76     if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
  77         status = rxp->fDeferredStatus;
  78     }
  79 }
  80
  81 static const UChar      chAmp       = 0x26;      // '&'
  82 static const UChar      chDash      = 0x2d;      // '-'
  83
  84
  85 //------------------------------------------------------------------------------
  86 //
  87 //  Destructor
  88 //
  89 //------------------------------------------------------------------------------
  90 RegexCompile::~RegexCompile() {
  91     delete fCaptureName;         // Normally will be NULL, but can exist if pattern
  92                                  //   compilation stops with a syntax error.
  93 }
  94
  95 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
  96     set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
  97 }
  98
  99 //------------------------------------------------------------------------------
 100 //
 101 //  Compile regex pattern.   The state machine for rexexp pattern parsing is here.
 102 //                           The state tables are hand-written in the file regexcst.txt,
 103 //                           and converted to the form used here by a perl
 104 //                           script regexcst.pl
 105 //
 106 //------------------------------------------------------------------------------
 107 void    RegexCompile::compile(
 108                          const UnicodeString &pat,   // Source pat to be compiled.
 109                          UParseError &pp,            // Error position info
 110                          UErrorCode &e)              // Error Code
 111 {
 112     fRXPat->fPatternString = new UnicodeString(pat);
 113     UText patternText = UTEXT_INITIALIZER;
 114     utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
 115
 116     if (U_SUCCESS(e)) {
 117         compile(&patternText, pp, e);
 118         utext_close(&patternText);
 119     }
 120 }
 121
 122 //
 123 //   compile, UText mode
 124 //     All the work is actually done here.
 125 //
 126 void    RegexCompile::compile(
 127                          UText *pat,                 // Source pat to be compiled.
 128                          UParseError &pp,            // Error position info
 129                          UErrorCode &e)              // Error Code
 130 {
 131     fStatus             = &e;
 132     fParseErr           = &pp;
 133     fStackPtr           = 0;
 134     fStack[fStackPtr]   = 0;
 135
 136     if (U_FAILURE(*fStatus)) {
 137         return;
 138     }
 139
 140     // There should be no pattern stuff in the RegexPattern object.  They can not be reused.
 141     U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0);
 142
 143     // Prepare the RegexPattern object to receive the compiled pattern.
 144     fRXPat->fPattern        = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
 145     if (U_FAILURE(*fStatus)) {
 146         return;
 147     }
 148     fRXPat->fStaticSets     = RegexStaticSets::gStaticSets->fPropSets;
 149     fRXPat->fStaticSets8    = RegexStaticSets::gStaticSets->fPropSets8;
 150
 151
 152     // Initialize the pattern scanning state machine
 153     fPatternLength = utext_nativeLength(pat);
 154     uint16_t                state = 1;
 155     const RegexTableEl      *tableEl;
 156
 157     // UREGEX_LITERAL force entire pattern to be treated as a literal string.
 158     if (fModeFlags & UREGEX_LITERAL) {
 159         fQuoteMode = TRUE;
 160     }
 161
 162     nextChar(fC);                        // Fetch the first char from the pattern string.
 163
 164     //
 165     // Main loop for the regex pattern parsing state machine.
 166     //   Runs once per state transition.
 167     //   Each time through optionally performs, depending on the state table,
 168     //      - an advance to the the next pattern char
 169     //      - an action to be performed.
 170     //      - pushing or popping a state to/from the local state return stack.
 171     //   file regexcst.txt is the source for the state table.  The logic behind
 172     //     recongizing the pattern syntax is there, not here.
 173     //
 174     for (;;) {
 175         //  Bail out if anything has gone wrong.
 176         //  Regex pattern parsing stops on the first error encountered.
 177         if (U_FAILURE(*fStatus)) {
 178             break;
 179         }
 180
 181         U_ASSERT(state != 0);
 182
 183         // Find the state table element that matches the input char from the pattern, or the
 184         //    class of the input character.  Start with the first table row for this
 185         //    state, then linearly scan forward until we find a row that matches the
 186         //    character.  The last row for each state always matches all characters, so
 187         //    the search will stop there, if not before.
 188         //
 189         tableEl = &gRuleParseStateTable[state];
 190         REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d)    state=%s ",
 191             fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
 192
 193         for (;;) {    // loop through table rows belonging to this state, looking for one
 194                       //   that matches the current input char.
 195             REGEX_SCAN_DEBUG_PRINTF(("."));
 196             if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE &&   tableEl->fCharClass == fC.fChar) {
 197                 // Table row specified an individual character, not a set, and
 198                 //   the input character is not quoted, and
 199                 //   the input character matched it.
 200                 break;
 201             }
 202             if (tableEl->fCharClass == 255) {
 203                 // Table row specified default, match anything character class.
 204                 break;
 205             }
 206             if (tableEl->fCharClass == 254 && fC.fQuoted)  {
 207                 // Table row specified "quoted" and the char was quoted.
 208                 break;
 209             }
 210             if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1)  {
 211                 // Table row specified eof and we hit eof on the input.
 212                 break;
 213             }
 214
 215             if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
 216                 fC.fQuoted == FALSE &&                                       //   char is not escaped &&
 217                 fC.fChar != (UChar32)-1) {                                   //   char is not EOF
 218                 U_ASSERT(tableEl->fCharClass <= 137);
 219                 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
 220                     // Table row specified a character class, or set of characters,
 221                     //   and the current char matches it.
 222                     break;
 223                 }
 224             }
 225
 226             // No match on this row, advance to the next  row for this state,
 227             tableEl++;
 228         }
 229         REGEX_SCAN_DEBUG_PRINTF(("\n"));
 230
 231         //
 232         // We've found the row of the state table that matches the current input
 233         //   character from the rules string.
 234         // Perform any action specified  by this row in the state table.
 235         if (doParseActions(tableEl->fAction) == FALSE) {
 236             // Break out of the state machine loop if the
 237             //   the action signalled some kind of error, or
 238             //   the action was to exit, occurs on normal end-of-rules-input.
 239             break;
 240         }
 241
 242         if (tableEl->fPushState != 0) {
 243             fStackPtr++;
 244             if (fStackPtr >= kStackSize) {
 245                 error(U_REGEX_INTERNAL_ERROR);
 246                 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
 247                 fStackPtr--;
 248             }
 249             fStack[fStackPtr] = tableEl->fPushState;
 250         }
 251
 252         //
 253         //  NextChar.  This is where characters are actually fetched from the pattern.
 254         //             Happens under control of the 'n' tag in the state table.
 255         //
 256         if (tableEl->fNextChar) {
 257             nextChar(fC);
 258         }
 259
 260         // Get the next state from the table entry, or from the
 261         //   state stack if the next state was specified as "pop".
 262         if (tableEl->fNextState != 255) {
 263             state = tableEl->fNextState;
 264         } else {
 265             state = fStack[fStackPtr];
 266             fStackPtr--;
 267             if (fStackPtr < 0) {
 268                 // state stack underflow
 269                 // This will occur if the user pattern has mis-matched parentheses,
 270                 //   with extra close parens.
 271                 //
 272                 fStackPtr++;
 273                 error(U_REGEX_MISMATCHED_PAREN);
 274             }
 275         }
 276
 277     }
 278
 279     if (U_FAILURE(*fStatus)) {
 280         // Bail out if the pattern had errors.
 281         //   Set stack cleanup:  a successful compile would have left it empty,
 282         //   but errors can leave temporary sets hanging around.
 283         while (!fSetStack.empty()) {
 284             delete (UnicodeSet *)fSetStack.pop();
 285         }
 286         return;
 287     }
 288
 289     //
 290     // The pattern has now been read and processed, and the compiled code generated.
 291     //
 292
 293     //
 294     // The pattern's fFrameSize so far has accumulated the requirements for
 295     //   storage for capture parentheses, counters, etc. that are encountered
 296     //   in the pattern.  Add space for the two variables that are always
 297     //   present in the saved state:  the input string position (int64_t) and
 298     //   the position in the compiled pattern.
 299     //
 300     allocateStackData(RESTACKFRAME_HDRCOUNT);
 301
 302     //
 303     // Optimization pass 1: NOPs, back-references, and case-folding
 304     //
 305     stripNOPs();
 306
 307     //
 308     // Get bounds for the minimum and maximum length of a string that this
 309     //   pattern can match.  Used to avoid looking for matches in strings that
 310     //   are too short.
 311     //
 312     fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
 313
 314     //
 315     // Optimization pass 2: match start type
 316     //
 317     matchStartType();
 318
 319     //
 320     // Set up fast latin-1 range sets
 321     //
 322     int32_t numSets = fRXPat->fSets->size();
 323     fRXPat->fSets8 = new Regex8BitSet[numSets];
 324     // Null pointer check.
 325     if (fRXPat->fSets8 == NULL) {
 326         e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
 327         return;
 328     }
 329     int32_t i;
 330     for (i=0; i<numSets; i++) {
 331         UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
 332         fRXPat->fSets8[i].init(s);
 333     }
 334
 335 }
 336
 337
 338
 339
 340
 341 //------------------------------------------------------------------------------
 342 //
 343 //  doParseAction        Do some action during regex pattern parsing.
 344 //                       Called by the parse state machine.
 345 //
 346 //                       Generation of the match engine PCode happens here, or
 347 //                       in functions called from the parse actions defined here.
 348 //
 349 //
 350 //------------------------------------------------------------------------------
 351 UBool RegexCompile::doParseActions(int32_t action)
 352 {
 353     UBool   returnVal = TRUE;
 354
 355     switch ((Regex_PatternParseAction)action) {
 356
 357     case doPatStart:
 358         // Start of pattern compiles to:
 359         //0   SAVE   2        Fall back to position of FAIL
 360         //1   jmp    3
 361         //2   FAIL            Stop if we ever reach here.
 362         //3   NOP             Dummy, so start of pattern looks the same as
 363         //                    the start of an ( grouping.
 364         //4   NOP             Resreved, will be replaced by a save if there are
 365         //                    OR | operators at the top level
 366         appendOp(URX_STATE_SAVE, 2);
 367         appendOp(URX_JMP,  3);
 368         appendOp(URX_FAIL, 0);
 369
 370         // Standard open nonCapture paren action emits the two NOPs and
 371         //   sets up the paren stack frame.
 372         doParseActions(doOpenNonCaptureParen);
 373         break;
 374
 375     case doPatFinish:
 376         // We've scanned to the end of the pattern
 377         //  The end of pattern compiles to:
 378         //        URX_END
 379         //    which will stop the runtime match engine.
 380         //  Encountering end of pattern also behaves like a close paren,
 381         //   and forces fixups of the State Save at the beginning of the compiled pattern
 382         //   and of any OR operations at the top level.
 383         //
 384         handleCloseParen();
 385         if (fParenStack.size() > 0) {
 386             // Missing close paren in pattern.
 387             error(U_REGEX_MISMATCHED_PAREN);
 388         }
 389
 390         // add the END operation to the compiled pattern.
 391         appendOp(URX_END, 0);
 392
 393         // Terminate the pattern compilation state machine.
 394         returnVal = FALSE;
 395         break;
 396
 397
 398
 399     case doOrOperator:
 400         // Scanning a '|', as in (A|B)
 401         {
 402             // Generate code for any pending literals preceding the '|'
 403             fixLiterals(FALSE);
 404
 405             // Insert a SAVE operation at the start of the pattern section preceding
 406             //   this OR at this level.  This SAVE will branch the match forward
 407             //   to the right hand side of the OR in the event that the left hand
 408             //   side fails to match and backtracks.  Locate the position for the
 409             //   save from the location on the top of the parentheses stack.
 410             int32_t savePosition = fParenStack.popi();
 411             int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
 412             U_ASSERT(URX_TYPE(op) == URX_NOP);  // original contents of reserved location
 413             op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
 414             fRXPat->fCompiledPat->setElementAt(op, savePosition);
 415
 416             // Append an JMP operation into the compiled pattern.  The operand for
 417             //  the JMP will eventually be the location following the ')' for the
 418             //  group.  This will be patched in later, when the ')' is encountered.
 419             appendOp(URX_JMP, 0);
 420
 421             // Push the position of the newly added JMP op onto the parentheses stack.
 422             // This registers if for fixup when this block's close paren is encountered.
 423             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
 424
 425             // Append a NOP to the compiled pattern.  This is the slot reserved
 426             //   for a SAVE in the event that there is yet another '|' following
 427             //   this one.
 428             appendOp(URX_NOP, 0);
 429             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
 430         }
 431         break;
 432
 433
 434     case doBeginNamedCapture:
 435         // Scanning (?<letter.
 436         //   The first letter of the name will come through again under doConinueNamedCapture.
 437         fCaptureName = new UnicodeString();
 438         if (fCaptureName == NULL) {
 439             error(U_MEMORY_ALLOCATION_ERROR);
 440         }
 441         break;
 442
 443     case  doContinueNamedCapture:
 444         fCaptureName->append(fC.fChar);
 445         break;
 446
 447     case doBadNamedCapture:
 448         error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
 449         break;
 450
 451     case doOpenCaptureParen:
 452         // Open Capturing Paren, possibly named.
 453         //   Compile to a
 454         //      - NOP, which later may be replaced by a save-state if the
 455         //         parenthesized group gets a * quantifier, followed by
 456         //      - START_CAPTURE  n    where n is stack frame offset to the capture group variables.
 457         //      - NOP, which may later be replaced by a save-state if there
 458         //             is an '|' alternation within the parens.
 459         //
 460         //    Each capture group gets three slots in the save stack frame:
 461         //         0: Capture Group start position (in input string being matched.)
 462         //         1: Capture Group end position.
 463         //         2: Start of Match-in-progress.
 464         //    The first two locations are for a completed capture group, and are
 465         //     referred to by back references and the like.
 466         //    The third location stores the capture start position when an START_CAPTURE is
 467         //      encountered.  This will be promoted to a completed capture when (and if) the corresponding
 468         //      END_CAPTURE is encountered.
 469         {
 470             fixLiterals();
 471             appendOp(URX_NOP, 0);
 472             int32_t  varsLoc = allocateStackData(3);    // Reserve three slots in match stack frame.
 473             appendOp(URX_START_CAPTURE, varsLoc);
 474             appendOp(URX_NOP, 0);
 475
 476             // On the Parentheses stack, start a new frame and add the postions
 477             //   of the two NOPs.  Depending on what follows in the pattern, the
 478             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
 479             //   address of the end of the parenthesized group.
 480             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
 481             fParenStack.push(capturing, *fStatus);                        // Frame type.
 482             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first  NOP location
 483             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
 484
 485             // Save the mapping from group number to stack frame variable position.
 486             fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
 487
 488             // If this is a named capture group, add the name->group number mapping.
 489             if (fCaptureName != NULL) {
 490                 int32_t groupNumber = fRXPat->fGroupMap->size();
 491                 int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
 492                 fCaptureName = NULL;    // hash table takes ownership of the name (key) string.
 493                 if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
 494                     error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
 495                 }
 496             }
 497         }
 498         break;
 499
 500     case doOpenNonCaptureParen:
 501         // Open non-caputuring (grouping only) Paren.
 502         //   Compile to a
 503         //      - NOP, which later may be replaced by a save-state if the
 504         //         parenthesized group gets a * quantifier, followed by
 505         //      - NOP, which may later be replaced by a save-state if there
 506         //             is an '|' alternation within the parens.
 507         {
 508             fixLiterals();
 509             appendOp(URX_NOP, 0);
 510             appendOp(URX_NOP, 0);
 511
 512             // On the Parentheses stack, start a new frame and add the postions
 513             //   of the two NOPs.
 514             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
 515             fParenStack.push(plain,      *fStatus);                       // Begin a new frame.
 516             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
 517             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
 518         }
 519          break;
 520
 521
 522     case doOpenAtomicParen:
 523         // Open Atomic Paren.  (?>
 524         //   Compile to a
 525         //      - NOP, which later may be replaced if the parenthesized group
 526         //         has a quantifier, followed by
 527         //      - STO_SP  save state stack position, so it can be restored at the ")"
 528         //      - NOP, which may later be replaced by a save-state if there
 529         //             is an '|' alternation within the parens.
 530         {
 531             fixLiterals();
 532             appendOp(URX_NOP, 0);
 533             int32_t  varLoc = allocateData(1);    // Reserve a data location for saving the state stack ptr.
 534             appendOp(URX_STO_SP, varLoc);
 535             appendOp(URX_NOP, 0);
 536
 537             // On the Parentheses stack, start a new frame and add the postions
 538             //   of the two NOPs.  Depending on what follows in the pattern, the
 539             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
 540             //   address of the end of the parenthesized group.
 541             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
 542             fParenStack.push(atomic, *fStatus);                           // Frame type.
 543             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
 544             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
 545         }
 546         break;
 547
 548
 549     case doOpenLookAhead:
 550         // Positive Look-ahead   (?=  stuff  )
 551         //
 552         //   Note:   Addition of transparent input regions, with the need to
 553         //           restore the original regions when failing out of a lookahead
 554         //           block, complicated this sequence.  Some conbined opcodes
 555         //           might make sense - or might not, lookahead aren't that common.
 556         //
 557         //      Caution:  min match length optimization knows about this
 558         //               sequence; don't change without making updates there too.
 559         //
 560         // Compiles to
 561         //    1    START_LA     dataLoc     Saves SP, Input Pos
 562         //    2.   STATE_SAVE   4            on failure of lookahead, goto 4
 563         //    3    JMP          6           continue ...
 564         //
 565         //    4.   LA_END                   Look Ahead failed.  Restore regions.
 566         //    5.   BACKTRACK                and back track again.
 567         //
 568         //    6.   NOP              reserved for use by quantifiers on the block.
 569         //                          Look-ahead can't have quantifiers, but paren stack
 570         //                             compile time conventions require the slot anyhow.
 571         //    7.   NOP              may be replaced if there is are '|' ops in the block.
 572         //    8.     code for parenthesized stuff.
 573         //    9.   LA_END
 574         //
 575         //  Two data slots are reserved, for saving the stack ptr and the input position.
 576         {
 577             fixLiterals();
 578             int32_t dataLoc = allocateData(2);
 579             appendOp(URX_LA_START, dataLoc);
 580             appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
 581             appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
 582             appendOp(URX_LA_END, dataLoc);
 583             appendOp(URX_BACKTRACK, 0);
 584             appendOp(URX_NOP, 0);
 585             appendOp(URX_NOP, 0);
 586
 587             // On the Parentheses stack, start a new frame and add the postions
 588             //   of the NOPs.
 589             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
 590             fParenStack.push(lookAhead, *fStatus);                        // Frame type.
 591             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
 592             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
 593         }
 594         break;
 595
 596     case doOpenLookAheadNeg:
 597         // Negated Lookahead.   (?! stuff )
 598         // Compiles to
 599         //    1.    START_LA    dataloc
 600         //    2.    SAVE_STATE  7         // Fail within look-ahead block restores to this state,
 601         //                                //   which continues with the match.
 602         //    3.    NOP                   // Std. Open Paren sequence, for possible '|'
 603         //    4.       code for parenthesized stuff.
 604         //    5.    END_LA                // Cut back stack, remove saved state from step 2.
 605         //    6.    BACKTRACK             // code in block succeeded, so neg. lookahead fails.
 606         //    7.    END_LA                // Restore match region, in case look-ahead was using
 607         //                                        an alternate (transparent) region.
 608         {
 609             fixLiterals();
 610             int32_t dataLoc = allocateData(2);
 611             appendOp(URX_LA_START, dataLoc);
 612             appendOp(URX_STATE_SAVE, 0);    // dest address will be patched later.
 613             appendOp(URX_NOP, 0);
 614
 615             // On the Parentheses stack, start a new frame and add the postions
 616             //   of the StateSave and NOP.
 617             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
 618             fParenStack.push(negLookAhead, *fStatus);                    // Frame type
 619             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The STATE_SAVE location
 620             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
 621
 622             // Instructions #5 - #7 will be added when the ')' is encountered.
 623         }
 624         break;
 625
 626     case doOpenLookBehind:
 627         {
 628             //   Compile a (?<= look-behind open paren.
 629             //
 630             //          Compiles to
 631             //              0       URX_LB_START     dataLoc
 632             //              1       URX_LB_CONT      dataLoc
 633             //              2                        MinMatchLen
 634             //              3                        MaxMatchLen
 635             //              4       URX_NOP          Standard '(' boilerplate.
 636             //              5       URX_NOP          Reserved slot for use with '|' ops within (block).
 637             //              6         <code for LookBehind expression>
 638             //              7       URX_LB_END       dataLoc    # Check match len, restore input  len
 639             //              8       URX_LA_END       dataLoc    # Restore stack, input pos
 640             //
 641             //          Allocate a block of matcher data, to contain (when running a match)
 642             //              0:    Stack ptr on entry
 643             //              1:    Input Index on entry
 644             //              2:    Start index of match current match attempt.
 645             //              3:    Original Input String len.
 646
 647             // Generate match code for any pending literals.
 648             fixLiterals();
 649
 650             // Allocate data space
 651             int32_t dataLoc = allocateData(4);
 652
 653             // Emit URX_LB_START
 654             appendOp(URX_LB_START, dataLoc);
 655
 656             // Emit URX_LB_CONT
 657             appendOp(URX_LB_CONT, dataLoc);
 658             appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
 659             appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
 660
 661             // Emit the NOPs
 662             appendOp(URX_NOP, 0);
 663             appendOp(URX_NOP, 0);
 664
 665             // On the Parentheses stack, start a new frame and add the postions
 666             //   of the URX_LB_CONT and the NOP.
 667             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
 668             fParenStack.push(lookBehind, *fStatus);                       // Frame type
 669             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
 670             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
 671
 672             // The final two instructions will be added when the ')' is encountered.
 673         }
 674
 675         break;
 676
 677     case doOpenLookBehindNeg:
 678         {
 679             //   Compile a (?<! negated look-behind open paren.
 680             //
 681             //          Compiles to
 682             //              0       URX_LB_START     dataLoc    # Save entry stack, input len
 683             //              1       URX_LBN_CONT     dataLoc    # Iterate possible match positions
 684             //              2                        MinMatchLen
 685             //              3                        MaxMatchLen
 686             //              4                        continueLoc (9)
 687             //              5       URX_NOP          Standard '(' boilerplate.
 688             //              6       URX_NOP          Reserved slot for use with '|' ops within (block).
 689             //              7         <code for LookBehind expression>
 690             //              8       URX_LBN_END      dataLoc    # Check match len, cause a FAIL
 691             //              9       ...
 692             //
 693             //          Allocate a block of matcher data, to contain (when running a match)
 694             //              0:    Stack ptr on entry
 695             //              1:    Input Index on entry
 696             //              2:    Start index of match current match attempt.
 697             //              3:    Original Input String len.
 698
 699             // Generate match code for any pending literals.
 700             fixLiterals();
 701
 702             // Allocate data space
 703             int32_t dataLoc = allocateData(4);
 704
 705             // Emit URX_LB_START
 706             appendOp(URX_LB_START, dataLoc);
 707
 708             // Emit URX_LBN_CONT
 709             appendOp(URX_LBN_CONT, dataLoc);
 710             appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
 711             appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
 712             appendOp(URX_RESERVED_OP, 0);    // Continue Loc.    To be filled later.
 713
 714             // Emit the NOPs
 715             appendOp(URX_NOP, 0);
 716             appendOp(URX_NOP, 0);
 717
 718             // On the Parentheses stack, start a new frame and add the postions
 719             //   of the URX_LB_CONT and the NOP.
 720             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
 721             fParenStack.push(lookBehindN, *fStatus);                      // Frame type
 722             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
 723             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
 724
 725             // The final two instructions will be added when the ')' is encountered.
 726         }
 727         break;
 728
 729     case doConditionalExpr:
 730         // Conditionals such as (?(1)a:b)
 731     case doPerlInline:
 732         // Perl inline-condtionals.  (?{perl code}a|b) We're not perl, no way to do them.
 733         error(U_REGEX_UNIMPLEMENTED);
 734         break;
 735
 736
 737     case doCloseParen:
 738         handleCloseParen();
 739         if (fParenStack.size() <= 0) {
 740             //  Extra close paren, or missing open paren.
 741             error(U_REGEX_MISMATCHED_PAREN);
 742         }
 743         break;
 744
 745     case doNOP:
 746         break;
 747
 748
 749     case doBadOpenParenType:
 750     case doRuleError:
 751         error(U_REGEX_RULE_SYNTAX);
 752         break;
 753
 754
 755     case doMismatchedParenErr:
 756         error(U_REGEX_MISMATCHED_PAREN);
 757         break;
 758
 759     case doPlus:
 760         //  Normal '+'  compiles to
 761         //     1.   stuff to be repeated  (already built)
 762         //     2.   jmp-sav 1
 763         //     3.   ...
 764         //
 765         //  Or, if the item to be repeated can match a zero length string,
 766         //     1.   STO_INP_LOC  data-loc
 767         //     2.      body of stuff to be repeated
 768         //     3.   JMP_SAV_X    2
 769         //     4.   ...
 770
 771         //
 772         //  Or, if the item to be repeated is simple
 773         //     1.   Item to be repeated.
 774         //     2.   LOOP_SR_I    set number  (assuming repeated item is a set ref)
 775         //     3.   LOOP_C       stack location
 776         {
 777             int32_t  topLoc = blockTopLoc(FALSE);        // location of item #1
 778             int32_t  frameLoc;
 779
 780             // Check for simple constructs, which may get special optimized code.
 781             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
 782                 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
 783
 784                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
 785                     // Emit optimized code for [char set]+
 786                     appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
 787                     frameLoc = allocateStackData(1);
 788                     appendOp(URX_LOOP_C, frameLoc);
 789                     break;
 790                 }
 791
 792                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
 793                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
 794                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
 795                     // Emit Optimized code for .+ operations.
 796                     int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
 797                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
 798                         // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
 799                         loopOpI |= 1;
 800                     }
 801                     if (fModeFlags & UREGEX_UNIX_LINES) {
 802                         loopOpI |= 2;
 803                     }
 804                     appendOp(loopOpI);
 805                     frameLoc = allocateStackData(1);
 806                     appendOp(URX_LOOP_C, frameLoc);
 807                     break;
 808                 }
 809
 810             }
 811
 812             // General case.
 813
 814             // Check for minimum match length of zero, which requires
 815             //    extra loop-breaking code.
 816             if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
 817                 // Zero length match is possible.
 818                 // Emit the code sequence that can handle it.
 819                 insertOp(topLoc);
 820                 frameLoc = allocateStackData(1);
 821
 822                 int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
 823                 fRXPat->fCompiledPat->setElementAt(op, topLoc);
 824
 825                 appendOp(URX_JMP_SAV_X, topLoc+1);
 826             } else {
 827                 // Simpler code when the repeated body must match something non-empty
 828                 appendOp(URX_JMP_SAV, topLoc);
 829             }
 830         }
 831         break;
 832
 833     case doNGPlus:
 834         //  Non-greedy '+?'  compiles to
 835         //     1.   stuff to be repeated  (already built)
 836         //     2.   state-save  1
 837         //     3.   ...
 838         {
 839             int32_t topLoc      = blockTopLoc(FALSE);
 840             appendOp(URX_STATE_SAVE, topLoc);
 841         }
 842         break;
 843
 844
 845     case doOpt:
 846         // Normal (greedy) ? quantifier.
 847         //  Compiles to
 848         //     1. state save 3
 849         //     2.    body of optional block
 850         //     3. ...
 851         // Insert the state save into the compiled pattern, and we're done.
 852         {
 853             int32_t   saveStateLoc = blockTopLoc(TRUE);
 854             int32_t   saveStateOp  = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
 855             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
 856         }
 857         break;
 858
 859     case doNGOpt:
 860         // Non-greedy ?? quantifier
 861         //   compiles to
 862         //    1.  jmp   4
 863         //    2.     body of optional block
 864         //    3   jmp   5
 865         //    4.  state save 2
 866         //    5    ...
 867         //  This code is less than ideal, with two jmps instead of one, because we can only
 868         //  insert one instruction at the top of the block being iterated.
 869         {
 870             int32_t  jmp1_loc = blockTopLoc(TRUE);
 871             int32_t  jmp2_loc = fRXPat->fCompiledPat->size();
 872
 873             int32_t  jmp1_op  = buildOp(URX_JMP, jmp2_loc+1);
 874             fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
 875
 876             appendOp(URX_JMP, jmp2_loc+2);
 877
 878             appendOp(URX_STATE_SAVE, jmp1_loc+1);
 879         }
 880         break;
 881
 882
 883     case doStar:
 884         // Normal (greedy) * quantifier.
 885         // Compiles to
 886         //       1.   STATE_SAVE   4
 887         //       2.      body of stuff being iterated over
 888         //       3.   JMP_SAV      2
 889         //       4.   ...
 890         //
 891         // Or, if the body is a simple [Set],
 892         //       1.   LOOP_SR_I    set number
 893         //       2.   LOOP_C       stack location
 894         //       ...
 895         //
 896         // Or if this is a .*
 897         //       1.   LOOP_DOT_I    (. matches all mode flag)
 898         //       2.   LOOP_C        stack location
 899         //
 900         // Or, if the body can match a zero-length string, to inhibit infinite loops,
 901         //       1.   STATE_SAVE   5
 902         //       2.   STO_INP_LOC  data-loc
 903         //       3.      body of stuff
 904         //       4.   JMP_SAV_X    2
 905         //       5.   ...
 906         {
 907             // location of item #1, the STATE_SAVE
 908             int32_t   topLoc = blockTopLoc(FALSE);
 909             int32_t   dataLoc = -1;
 910
 911             // Check for simple *, where the construct being repeated
 912             //   compiled to single opcode, and might be optimizable.
 913             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
 914                 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
 915
 916                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
 917                     // Emit optimized code for a [char set]*
 918                     int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
 919                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
 920                     dataLoc = allocateStackData(1);
 921                     appendOp(URX_LOOP_C, dataLoc);
 922                     break;
 923                 }
 924
 925                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
 926                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
 927                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
 928                     // Emit Optimized code for .* operations.
 929                     int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
 930                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
 931                         // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
 932                         loopOpI |= 1;
 933                     }
 934                     if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
 935                         loopOpI |= 2;
 936                     }
 937                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
 938                     dataLoc = allocateStackData(1);
 939                     appendOp(URX_LOOP_C, dataLoc);
 940                     break;
 941                 }
 942             }
 943
 944             // Emit general case code for this *
 945             // The optimizations did not apply.
 946
 947             int32_t   saveStateLoc = blockTopLoc(TRUE);
 948             int32_t   jmpOp        = buildOp(URX_JMP_SAV, saveStateLoc+1);
 949
 950             // Check for minimum match length of zero, which requires
 951             //    extra loop-breaking code.
 952             if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
 953                 insertOp(saveStateLoc);
 954                 dataLoc = allocateStackData(1);
 955
 956                 int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
 957                 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
 958                 jmpOp      = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
 959             }
 960
 961             // Locate the position in the compiled pattern where the match will continue
 962             //   after completing the *.   (4 or 5 in the comment above)
 963             int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
 964
 965             // Put together the save state op and store it into the compiled code.
 966             int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
 967             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
 968
 969             // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
 970             appendOp(jmpOp);
 971         }
 972         break;
 973
 974     case doNGStar:
 975         // Non-greedy *? quantifier
 976         // compiles to
 977         //     1.   JMP    3
 978         //     2.      body of stuff being iterated over
 979         //     3.   STATE_SAVE  2
 980         //     4    ...
 981         {
 982             int32_t     jmpLoc  = blockTopLoc(TRUE);                   // loc  1.
 983             int32_t     saveLoc = fRXPat->fCompiledPat->size();        // loc  3.
 984             int32_t     jmpOp   = buildOp(URX_JMP, saveLoc);
 985             fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
 986             appendOp(URX_STATE_SAVE, jmpLoc+1);
 987         }
 988         break;
 989
 990
 991     case doIntervalInit:
 992         // The '{' opening an interval quantifier was just scanned.
 993         // Init the counter varaiables that will accumulate the values as the digits
 994         //    are scanned.
 995         fIntervalLow = 0;
 996         fIntervalUpper = -1;
 997         break;
 998
 999     case doIntevalLowerDigit:
1000         // Scanned a digit from the lower value of an {lower,upper} interval
1001         {
1002             int32_t digitValue = u_charDigitValue(fC.fChar);
1003             U_ASSERT(digitValue >= 0);
1004             int64_t val = (int64_t)fIntervalLow*10 + digitValue;
1005             if (val > INT32_MAX) {
1006                 error(U_REGEX_NUMBER_TOO_BIG);
1007             } else {
1008                 fIntervalLow = (int32_t)val;
1009             }
1010         }
1011         break;
1012
1013     case doIntervalUpperDigit:
1014         // Scanned a digit from the upper value of an {lower,upper} interval
1015         {
1016             if (fIntervalUpper < 0) {
1017                 fIntervalUpper = 0;
1018             }
1019             int32_t digitValue = u_charDigitValue(fC.fChar);
1020             U_ASSERT(digitValue >= 0);
1021             int64_t val = (int64_t)fIntervalUpper*10 + digitValue;
1022             if (val > INT32_MAX) {
1023                 error(U_REGEX_NUMBER_TOO_BIG);
1024             } else {
1025                 fIntervalUpper = (int32_t)val;
1026             }
1027         }
1028         break;
1029
1030     case doIntervalSame:
1031         // Scanned a single value interval like {27}.  Upper = Lower.
1032         fIntervalUpper = fIntervalLow;
1033         break;
1034
1035     case doInterval:
1036         // Finished scanning a normal {lower,upper} interval.  Generate the code for it.
1037         if (compileInlineInterval() == FALSE) {
1038             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1039         }
1040         break;
1041
1042     case doPossessiveInterval:
1043         // Finished scanning a Possessive {lower,upper}+ interval.  Generate the code for it.
1044         {
1045             // Remember the loc for the top of the block being looped over.
1046             //   (Can not reserve a slot in the compiled pattern at this time, because
1047             //    compileInterval needs to reserve also, and blockTopLoc can only reserve
1048             //    once per block.)
1049             int32_t topLoc = blockTopLoc(FALSE);
1050
1051             // Produce normal looping code.
1052             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1053
1054             // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1055             //  just as if the loop was inclosed in atomic parentheses.
1056
1057             // First the STO_SP before the start of the loop
1058             insertOp(topLoc);
1059
1060             int32_t  varLoc = allocateData(1);   // Reserve a data location for saving the
1061             int32_t  op     = buildOp(URX_STO_SP, varLoc);
1062             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1063
1064             int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
1065             U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1066             loopOp++;     // point LoopOp after the just-inserted STO_SP
1067             fRXPat->fCompiledPat->push(loopOp, *fStatus);
1068
1069             // Then the LD_SP after the end of the loop
1070             appendOp(URX_LD_SP, varLoc);
1071         }
1072
1073         break;
1074
1075     case doNGInterval:
1076         // Finished scanning a non-greedy {lower,upper}? interval.  Generate the code for it.
1077         compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1078         break;
1079
1080     case doIntervalError:
1081         error(U_REGEX_BAD_INTERVAL);
1082         break;
1083
1084     case doLiteralChar:
1085         // We've just scanned a "normal" character from the pattern,
1086         literalChar(fC.fChar);
1087         break;
1088
1089
1090     case doEscapedLiteralChar:
1091         // We've just scanned an backslashed escaped character with  no
1092         //   special meaning.  It represents itself.
1093         if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1094             ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1095             (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1096                error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1097              }
1098         literalChar(fC.fChar);
1099         break;
1100
1101
1102     case doDotAny:
1103         // scanned a ".",  match any single character.
1104         {
1105             fixLiterals(FALSE);
1106             if (fModeFlags & UREGEX_DOTALL) {
1107                 appendOp(URX_DOTANY_ALL, 0);
1108             } else if (fModeFlags & UREGEX_UNIX_LINES) {
1109                 appendOp(URX_DOTANY_UNIX, 0);
1110             } else {
1111                 appendOp(URX_DOTANY, 0);
1112             }
1113         }
1114         break;
1115
1116     case doCaret:
1117         {
1118             fixLiterals(FALSE);
1119             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1120                 appendOp(URX_CARET, 0);
1121             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1122                 appendOp(URX_CARET_M, 0);
1123             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1124                 appendOp(URX_CARET, 0);   // Only testing true start of input.
1125             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1126                 appendOp(URX_CARET_M_UNIX, 0);
1127             }
1128         }
1129         break;
1130
1131     case doDollar:
1132         {
1133             fixLiterals(FALSE);
1134             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1135                 appendOp(URX_DOLLAR, 0);
1136             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1137                 appendOp(URX_DOLLAR_M, 0);
1138             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1139                 appendOp(URX_DOLLAR_D, 0);
1140             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1141                 appendOp(URX_DOLLAR_MD, 0);
1142             }
1143         }
1144         break;
1145
1146     case doBackslashA:
1147         fixLiterals(FALSE);
1148         appendOp(URX_CARET, 0);
1149         break;
1150
1151     case doBackslashB:
1152         {
1153             #if  UCONFIG_NO_BREAK_ITERATION==1
1154             if (fModeFlags & UREGEX_UWORD) {
1155                 error(U_UNSUPPORTED_ERROR);
1156             }
1157             #endif
1158             fixLiterals(FALSE);
1159             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1160             appendOp(op, 1);
1161         }
1162         break;
1163
1164     case doBackslashb:
1165         {
1166             #if  UCONFIG_NO_BREAK_ITERATION==1
1167             if (fModeFlags & UREGEX_UWORD) {
1168                 error(U_UNSUPPORTED_ERROR);
1169             }
1170             #endif
1171             fixLiterals(FALSE);
1172             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1173             appendOp(op, 0);
1174         }
1175         break;
1176
1177     case doBackslashD:
1178         fixLiterals(FALSE);
1179         appendOp(URX_BACKSLASH_D, 1);
1180         break;
1181
1182     case doBackslashd:
1183         fixLiterals(FALSE);
1184         appendOp(URX_BACKSLASH_D, 0);
1185         break;
1186
1187     case doBackslashG:
1188         fixLiterals(FALSE);
1189         appendOp(URX_BACKSLASH_G, 0);
1190         break;
1191
1192     case doBackslashH:
1193         fixLiterals(FALSE);
1194         appendOp(URX_BACKSLASH_H, 1);
1195         break;
1196
1197     case doBackslashh:
1198         fixLiterals(FALSE);
1199         appendOp(URX_BACKSLASH_H, 0);
1200         break;
1201
1202     case doBackslashR:
1203         fixLiterals(FALSE);
1204         appendOp(URX_BACKSLASH_R, 0);
1205         break;
1206
1207     case doBackslashS:
1208         fixLiterals(FALSE);
1209         appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
1210         break;
1211
1212     case doBackslashs:
1213         fixLiterals(FALSE);
1214         appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
1215         break;
1216
1217     case doBackslashV:
1218         fixLiterals(FALSE);
1219         appendOp(URX_BACKSLASH_V, 1);
1220         break;
1221
1222     case doBackslashv:
1223         fixLiterals(FALSE);
1224         appendOp(URX_BACKSLASH_V, 0);
1225         break;
1226
1227     case doBackslashW:
1228         fixLiterals(FALSE);
1229         appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
1230         break;
1231
1232     case doBackslashw:
1233         fixLiterals(FALSE);
1234         appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
1235         break;
1236
1237     case doBackslashX:
1238         fixLiterals(FALSE);
1239         appendOp(URX_BACKSLASH_X, 0);
1240         break;
1241
1242
1243     case doBackslashZ:
1244         fixLiterals(FALSE);
1245         appendOp(URX_DOLLAR, 0);
1246         break;
1247
1248     case doBackslashz:
1249         fixLiterals(FALSE);
1250         appendOp(URX_BACKSLASH_Z, 0);
1251         break;
1252
1253     case doEscapeError:
1254         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1255         break;
1256
1257     case doExit:
1258         fixLiterals(FALSE);
1259         returnVal = FALSE;
1260         break;
1261
1262     case doProperty:
1263         {
1264             fixLiterals(FALSE);
1265             UnicodeSet *theSet = scanProp();
1266             compileSet(theSet);
1267         }
1268         break;
1269
1270     case doNamedChar:
1271         {
1272             UChar32 c = scanNamedChar();
1273             literalChar(c);
1274         }
1275         break;
1276
1277
1278     case doBackRef:
1279         // BackReference.  Somewhat unusual in that the front-end can not completely parse
1280         //                 the regular expression, because the number of digits to be consumed
1281         //                 depends on the number of capture groups that have been defined.  So
1282         //                 we have to do it here instead.
1283         {
1284             int32_t  numCaptureGroups = fRXPat->fGroupMap->size();
1285             int32_t  groupNum = 0;
1286             UChar32  c        = fC.fChar;
1287
1288             for (;;) {
1289                 // Loop once per digit, for max allowed number of digits in a back reference.
1290                 int32_t digit = u_charDigitValue(c);
1291                 groupNum = groupNum * 10 + digit;
1292                 if (groupNum >= numCaptureGroups) {
1293                     break;
1294                 }
1295                 c = peekCharLL();
1296                 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
1297                     break;
1298                 }
1299                 nextCharLL();
1300             }
1301
1302             // Scan of the back reference in the source regexp is complete.  Now generate
1303             //  the compiled code for it.
1304             // Because capture groups can be forward-referenced by back-references,
1305             //  we fill the operand with the capture group number.  At the end
1306             //  of compilation, it will be changed to the variable's location.
1307             U_ASSERT(groupNum > 0);  // Shouldn't happen.  '\0' begins an octal escape sequence,
1308                                      //    and shouldn't enter this code path at all.
1309             fixLiterals(FALSE);
1310             if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1311                 appendOp(URX_BACKREF_I, groupNum);
1312             } else {
1313                 appendOp(URX_BACKREF, groupNum);
1314             }
1315         }
1316         break;
1317
1318     case doBeginNamedBackRef:
1319         U_ASSERT(fCaptureName == NULL);
1320         fCaptureName = new UnicodeString;
1321         if (fCaptureName == NULL) {
1322             error(U_MEMORY_ALLOCATION_ERROR);
1323         }
1324         break;
1325
1326     case doContinueNamedBackRef:
1327         fCaptureName->append(fC.fChar);
1328         break;
1329
1330     case doCompleteNamedBackRef:
1331         {
1332         int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName);
1333         if (groupNumber == 0) {
1334             // Group name has not been defined.
1335             //   Could be a forward reference. If we choose to support them at some
1336             //   future time, extra mechanism will be required at this point.
1337             error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
1338         } else {
1339             // Given the number, handle identically to a \n numbered back reference.
1340             // See comments above, under doBackRef
1341             fixLiterals(FALSE);
1342             if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1343                 appendOp(URX_BACKREF_I, groupNumber);
1344             } else {
1345                 appendOp(URX_BACKREF, groupNumber);
1346             }
1347         }
1348         delete fCaptureName;
1349         fCaptureName = NULL;
1350         break;
1351         }
1352
1353     case doPossessivePlus:
1354         // Possessive ++ quantifier.
1355         // Compiles to
1356         //       1.   STO_SP
1357         //       2.      body of stuff being iterated over
1358         //       3.   STATE_SAVE 5
1359         //       4.   JMP        2
1360         //       5.   LD_SP
1361         //       6.   ...
1362         //
1363         //  Note:  TODO:  This is pretty inefficient.  A mass of saved state is built up
1364         //                then unconditionally discarded.  Perhaps introduce a new opcode.  Ticket 6056
1365         //
1366         {
1367             // Emit the STO_SP
1368             int32_t   topLoc = blockTopLoc(TRUE);
1369             int32_t   stoLoc = allocateData(1);  // Reserve the data location for storing save stack ptr.
1370             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1371             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1372
1373             // Emit the STATE_SAVE
1374             appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
1375
1376             // Emit the JMP
1377             appendOp(URX_JMP, topLoc+1);
1378
1379             // Emit the LD_SP
1380             appendOp(URX_LD_SP, stoLoc);
1381         }
1382         break;
1383
1384     case doPossessiveStar:
1385         // Possessive *+ quantifier.
1386         // Compiles to
1387         //       1.   STO_SP       loc
1388         //       2.   STATE_SAVE   5
1389         //       3.      body of stuff being iterated over
1390         //       4.   JMP          2
1391         //       5.   LD_SP        loc
1392         //       6    ...
1393         // TODO:  do something to cut back the state stack each time through the loop.
1394         {
1395             // Reserve two slots at the top of the block.
1396             int32_t   topLoc = blockTopLoc(TRUE);
1397             insertOp(topLoc);
1398
1399             // emit   STO_SP     loc
1400             int32_t   stoLoc = allocateData(1);    // Reserve the data location for storing save stack ptr.
1401             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1402             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1403
1404             // Emit the SAVE_STATE   5
1405             int32_t L7 = fRXPat->fCompiledPat->size()+1;
1406             op = buildOp(URX_STATE_SAVE, L7);
1407             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1408
1409             // Append the JMP operation.
1410             appendOp(URX_JMP, topLoc+1);
1411
1412             // Emit the LD_SP       loc
1413             appendOp(URX_LD_SP, stoLoc);
1414         }
1415         break;
1416
1417     case doPossessiveOpt:
1418         // Possessive  ?+ quantifier.
1419         //  Compiles to
1420         //     1. STO_SP      loc
1421         //     2. SAVE_STATE  5
1422         //     3.    body of optional block
1423         //     4. LD_SP       loc
1424         //     5. ...
1425         //
1426         {
1427             // Reserve two slots at the top of the block.
1428             int32_t   topLoc = blockTopLoc(TRUE);
1429             insertOp(topLoc);
1430
1431             // Emit the STO_SP
1432             int32_t   stoLoc = allocateData(1);   // Reserve the data location for storing save stack ptr.
1433             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1434             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1435
1436             // Emit the SAVE_STATE
1437             int32_t   continueLoc = fRXPat->fCompiledPat->size()+1;
1438             op = buildOp(URX_STATE_SAVE, continueLoc);
1439             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1440
1441             // Emit the LD_SP
1442             appendOp(URX_LD_SP, stoLoc);
1443         }
1444         break;
1445
1446
1447     case doBeginMatchMode:
1448         fNewModeFlags = fModeFlags;
1449         fSetModeFlag  = TRUE;
1450         break;
1451
1452     case doMatchMode:   //  (?i)    and similar
1453         {
1454             int32_t  bit = 0;
1455             switch (fC.fChar) {
1456             case 0x69: /* 'i' */   bit = UREGEX_CASE_INSENSITIVE; break;
1457             case 0x64: /* 'd' */   bit = UREGEX_UNIX_LINES;       break;
1458             case 0x6d: /* 'm' */   bit = UREGEX_MULTILINE;        break;
1459             case 0x73: /* 's' */   bit = UREGEX_DOTALL;           break;
1460             case 0x75: /* 'u' */   bit = 0; /* Unicode casing */  break;
1461             case 0x77: /* 'w' */   bit = UREGEX_UWORD;            break;
1462             case 0x78: /* 'x' */   bit = UREGEX_COMMENTS;         break;
1463             case 0x2d: /* '-' */   fSetModeFlag = FALSE;          break;
1464             default:
1465                 U_ASSERT(FALSE);   // Should never happen.  Other chars are filtered out
1466                                    // by the scanner.
1467             }
1468             if (fSetModeFlag) {
1469                 fNewModeFlags |= bit;
1470             } else {
1471                 fNewModeFlags &= ~bit;
1472             }
1473         }
1474         break;
1475
1476     case doSetMatchMode:
1477         // Emit code to match any pending literals, using the not-yet changed match mode.
1478         fixLiterals();
1479
1480         // We've got a (?i) or similar.  The match mode is being changed, but
1481         //   the change is not scoped to a parenthesized block.
1482         U_ASSERT(fNewModeFlags < 0);
1483         fModeFlags = fNewModeFlags;
1484
1485         break;
1486
1487
1488     case doMatchModeParen:
1489         // We've got a (?i: or similar.  Begin a parenthesized block, save old
1490         //   mode flags so they can be restored at the close of the block.
1491         //
1492         //   Compile to a
1493         //      - NOP, which later may be replaced by a save-state if the
1494         //         parenthesized group gets a * quantifier, followed by
1495         //      - NOP, which may later be replaced by a save-state if there
1496         //             is an '|' alternation within the parens.
1497         {
1498             fixLiterals(FALSE);
1499             appendOp(URX_NOP, 0);
1500             appendOp(URX_NOP, 0);
1501
1502             // On the Parentheses stack, start a new frame and add the postions
1503             //   of the two NOPs (a normal non-capturing () frame, except for the
1504             //   saving of the orignal mode flags.)
1505             fParenStack.push(fModeFlags, *fStatus);
1506             fParenStack.push(flags, *fStatus);                            // Frame Marker
1507             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
1508             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
1509
1510             // Set the current mode flags to the new values.
1511             U_ASSERT(fNewModeFlags < 0);
1512             fModeFlags = fNewModeFlags;
1513         }
1514         break;
1515
1516     case doBadModeFlag:
1517         error(U_REGEX_INVALID_FLAG);
1518         break;
1519
1520     case doSuppressComments:
1521         // We have just scanned a '(?'.  We now need to prevent the character scanner from
1522         // treating a '#' as a to-the-end-of-line comment.
1523         //   (This Perl compatibility just gets uglier and uglier to do...)
1524         fEOLComments = FALSE;
1525         break;
1526
1527
1528     case doSetAddAmp:
1529         {
1530           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1531           set->add(chAmp);
1532         }
1533         break;
1534
1535     case doSetAddDash:
1536         {
1537           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1538           set->add(chDash);
1539         }
1540         break;
1541
1542      case doSetBackslash_s:
1543         {
1544          UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1545          set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1546          break;
1547         }
1548
1549      case doSetBackslash_S:
1550         {
1551             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1552             UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1553             SSet.complement();
1554             set->addAll(SSet);
1555             break;
1556         }
1557
1558     case doSetBackslash_d:
1559         {
1560             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1561             // TODO - make a static set, ticket 6058.
1562             addCategory(set, U_GC_ND_MASK, *fStatus);
1563             break;
1564         }
1565
1566     case doSetBackslash_D:
1567         {
1568             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1569             UnicodeSet digits;
1570             // TODO - make a static set, ticket 6058.
1571             digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
1572             digits.complement();
1573             set->addAll(digits);
1574             break;
1575         }
1576
1577     case doSetBackslash_h:
1578         {
1579             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1580             UnicodeSet h;
1581             h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1582             h.add((UChar32)9);   // Tab
1583             set->addAll(h);
1584             break;
1585         }
1586
1587     case doSetBackslash_H:
1588         {
1589             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1590             UnicodeSet h;
1591             h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1592             h.add((UChar32)9);   // Tab
1593             h.complement();
1594             set->addAll(h);
1595             break;
1596         }
1597
1598     case doSetBackslash_v:
1599         {
1600             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1601             set->add((UChar32)0x0a, (UChar32)0x0d);  // add range
1602             set->add((UChar32)0x85);
1603             set->add((UChar32)0x2028, (UChar32)0x2029);
1604             break;
1605         }
1606
1607     case doSetBackslash_V:
1608         {
1609             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1610             UnicodeSet v;
1611             v.add((UChar32)0x0a, (UChar32)0x0d);  // add range
1612             v.add((UChar32)0x85);
1613             v.add((UChar32)0x2028, (UChar32)0x2029);
1614             v.complement();
1615             set->addAll(v);
1616             break;
1617         }
1618
1619     case doSetBackslash_w:
1620         {
1621             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1622             set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1623             break;
1624         }
1625
1626     case doSetBackslash_W:
1627         {
1628             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1629             UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1630             SSet.complement();
1631             set->addAll(SSet);
1632             break;
1633         }
1634
1635     case doSetBegin:
1636         fixLiterals(FALSE);
1637         fSetStack.push(new UnicodeSet(), *fStatus);
1638         fSetOpStack.push(setStart, *fStatus);
1639         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1640             fSetOpStack.push(setCaseClose, *fStatus);
1641         }
1642         break;
1643
1644     case doSetBeginDifference1:
1645         //  We have scanned something like [[abc]-[
1646         //  Set up a new UnicodeSet for the set beginning with the just-scanned '['
1647         //  Push a Difference operator, which will cause the new set to be subtracted from what
1648         //    went before once it is created.
1649         setPushOp(setDifference1);
1650         fSetOpStack.push(setStart, *fStatus);
1651         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1652             fSetOpStack.push(setCaseClose, *fStatus);
1653         }
1654         break;
1655
1656     case doSetBeginIntersection1:
1657         //  We have scanned something like  [[abc]&[
1658         //   Need both the '&' operator and the open '[' operator.
1659         setPushOp(setIntersection1);
1660         fSetOpStack.push(setStart, *fStatus);
1661         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1662             fSetOpStack.push(setCaseClose, *fStatus);
1663         }
1664         break;
1665
1666     case doSetBeginUnion:
1667         //  We have scanned something like  [[abc][
1668         //     Need to handle the union operation explicitly [[abc] | [
1669         setPushOp(setUnion);
1670         fSetOpStack.push(setStart, *fStatus);
1671         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1672             fSetOpStack.push(setCaseClose, *fStatus);
1673         }
1674         break;
1675
1676     case doSetDifference2:
1677         // We have scanned something like [abc--
1678         //   Consider this to unambiguously be a set difference operator.
1679         setPushOp(setDifference2);
1680         break;
1681
1682     case doSetEnd:
1683         // Have encountered the ']' that closes a set.
1684         //    Force the evaluation of any pending operations within this set,
1685         //    leave the completed set on the top of the set stack.
1686         setEval(setEnd);
1687         U_ASSERT(fSetOpStack.peeki()==setStart);
1688         fSetOpStack.popi();
1689         break;
1690
1691     case doSetFinish:
1692         {
1693         // Finished a complete set expression, including all nested sets.
1694         //   The close bracket has already triggered clearing out pending set operators,
1695         //    the operator stack should be empty and the operand stack should have just
1696         //    one entry, the result set.
1697         U_ASSERT(fSetOpStack.empty());
1698         UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
1699         U_ASSERT(fSetStack.empty());
1700         compileSet(theSet);
1701         break;
1702         }
1703
1704     case doSetIntersection2:
1705         // Have scanned something like [abc&&
1706         setPushOp(setIntersection2);
1707         break;
1708
1709     case doSetLiteral:
1710         // Union the just-scanned literal character into the set being built.
1711         //    This operation is the highest precedence set operation, so we can always do
1712         //    it immediately, without waiting to see what follows.  It is necessary to perform
1713         //    any pending '-' or '&' operation first, because these have the same precedence
1714         //    as union-ing in a literal'
1715         {
1716             setEval(setUnion);
1717             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1718             s->add(fC.fChar);
1719             fLastSetLiteral = fC.fChar;
1720             break;
1721         }
1722
1723     case doSetLiteralEscaped:
1724         // A back-slash escaped literal character was encountered.
1725         // Processing is the same as with setLiteral, above, with the addition of
1726         //  the optional check for errors on escaped ASCII letters.
1727         {
1728             if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1729                 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1730                  (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1731                 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1732             }
1733             setEval(setUnion);
1734             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1735             s->add(fC.fChar);
1736             fLastSetLiteral = fC.fChar;
1737             break;
1738         }
1739
1740         case doSetNamedChar:
1741         // Scanning a \N{UNICODE CHARACTER NAME}
1742         //  Aside from the source of the character, the processing is identical to doSetLiteral,
1743         //    above.
1744         {
1745             UChar32  c = scanNamedChar();
1746             setEval(setUnion);
1747             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1748             s->add(c);
1749             fLastSetLiteral = c;
1750             break;
1751         }
1752
1753     case doSetNamedRange:
1754         // We have scanned literal-\N{CHAR NAME}.  Add the range to the set.
1755         // The left character is already in the set, and is saved in fLastSetLiteral.
1756         // The right side needs to be picked up, the scan is at the 'N'.
1757         // Lower Limit > Upper limit being an error matches both Java
1758         //        and ICU UnicodeSet behavior.
1759         {
1760             UChar32  c = scanNamedChar();
1761             if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > c)) {
1762                 error(U_REGEX_INVALID_RANGE);
1763             }
1764             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1765             s->add(fLastSetLiteral, c);
1766             fLastSetLiteral = c;
1767             break;
1768         }
1769
1770
1771     case  doSetNegate:
1772         // Scanned a '^' at the start of a set.
1773         // Push the negation operator onto the set op stack.
1774         // A twist for case-insensitive matching:
1775         //   the case closure operation must happen _before_ negation.
1776         //   But the case closure operation will already be on the stack if it's required.
1777         //   This requires checking for case closure, and swapping the stack order
1778         //    if it is present.
1779         {
1780             int32_t  tosOp = fSetOpStack.peeki();
1781             if (tosOp == setCaseClose) {
1782                 fSetOpStack.popi();
1783                 fSetOpStack.push(setNegation, *fStatus);
1784                 fSetOpStack.push(setCaseClose, *fStatus);
1785             } else {
1786                 fSetOpStack.push(setNegation, *fStatus);
1787             }
1788         }
1789         break;
1790
1791     case doSetNoCloseError:
1792         error(U_REGEX_MISSING_CLOSE_BRACKET);
1793         break;
1794
1795     case doSetOpError:
1796         error(U_REGEX_RULE_SYNTAX);   //  -- or && at the end of a set.  Illegal.
1797         break;
1798
1799     case doSetPosixProp:
1800         {
1801             UnicodeSet *s = scanPosixProp();
1802             if (s != NULL) {
1803                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1804                 tos->addAll(*s);
1805                 delete s;
1806             }  // else error.  scanProp() reported the error status already.
1807         }
1808         break;
1809
1810     case doSetProp:
1811         //  Scanned a \p \P within [brackets].
1812         {
1813             UnicodeSet *s = scanProp();
1814             if (s != NULL) {
1815                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1816                 tos->addAll(*s);
1817                 delete s;
1818             }  // else error.  scanProp() reported the error status already.
1819         }
1820         break;
1821
1822
1823     case doSetRange:
1824         // We have scanned literal-literal.  Add the range to the set.
1825         // The left character is already in the set, and is saved in fLastSetLiteral.
1826         // The right side is the current character.
1827         // Lower Limit > Upper limit being an error matches both Java
1828         //        and ICU UnicodeSet behavior.
1829         {
1830
1831         if (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > fC.fChar) {
1832             error(U_REGEX_INVALID_RANGE);
1833         }
1834         UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1835         s->add(fLastSetLiteral, fC.fChar);
1836         break;
1837         }
1838
1839     default:
1840         U_ASSERT(FALSE);
1841         error(U_REGEX_INTERNAL_ERROR);
1842         break;
1843     }
1844
1845     if (U_FAILURE(*fStatus)) {
1846         returnVal = FALSE;
1847     }
1848
1849     return returnVal;
1850 }
1851
1852
1853
1854 //------------------------------------------------------------------------------
1855 //
1856 //   literalChar           We've encountered a literal character from the pattern,
1857 //                             or an escape sequence that reduces to a character.
1858 //                         Add it to the string containing all literal chars/strings from
1859 //                             the pattern.
1860 //
1861 //------------------------------------------------------------------------------
1862 void RegexCompile::literalChar(UChar32 c)  {
1863     fLiteralChars.append(c);
1864 }
1865
1866
1867 //------------------------------------------------------------------------------
1868 //
1869 //    fixLiterals           When compiling something that can follow a literal
1870 //                          string in a pattern, emit the code to match the
1871 //                          accumulated literal string.
1872 //
1873 //                          Optionally, split the last char of the string off into
1874 //                          a single "ONE_CHAR" operation, so that quantifiers can
1875 //                          apply to that char alone.  Example:   abc*
1876 //                          The * must apply to the 'c' only.
1877 //
1878 //------------------------------------------------------------------------------
1879 void    RegexCompile::fixLiterals(UBool split) {
1880
1881     // If no literal characters have been scanned but not yet had code generated
1882     //   for them, nothing needs to be done.
1883     if (fLiteralChars.length() == 0) {
1884         return;
1885     }
1886
1887     int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1888     UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1889
1890     // Split:  We need to  ensure that the last item in the compiled pattern
1891     //     refers only to the last literal scanned in the pattern, so that
1892     //     quantifiers (*, +, etc.) affect only it, and not a longer string.
1893     //     Split before case folding for case insensitive matches.
1894
1895     if (split) {
1896         fLiteralChars.truncate(indexOfLastCodePoint);
1897         fixLiterals(FALSE);   // Recursive call, emit code to match the first part of the string.
1898                               //  Note that the truncated literal string may be empty, in which case
1899                               //  nothing will be emitted.
1900
1901         literalChar(lastCodePoint);  // Re-add the last code point as if it were a new literal.
1902         fixLiterals(FALSE);          // Second recursive call, code for the final code point.
1903         return;
1904     }
1905
1906     // If we are doing case-insensitive matching, case fold the string.  This may expand
1907     //   the string, e.g. the German sharp-s turns into "ss"
1908     if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1909         fLiteralChars.foldCase();
1910         indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1911         lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1912     }
1913
1914     if (indexOfLastCodePoint == 0) {
1915         // Single character, emit a URX_ONECHAR op to match it.
1916         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1917                  u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
1918             appendOp(URX_ONECHAR_I, lastCodePoint);
1919         } else {
1920             appendOp(URX_ONECHAR, lastCodePoint);
1921         }
1922     } else {
1923         // Two or more chars, emit a URX_STRING to match them.
1924         if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) {
1925             error(U_REGEX_PATTERN_TOO_BIG);
1926         }
1927         if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1928             appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
1929         } else {
1930             // TODO here:  add optimization to split case sensitive strings of length two
1931             //             into two single char ops, for efficiency.
1932             appendOp(URX_STRING, fRXPat->fLiteralText.length());
1933         }
1934         appendOp(URX_STRING_LEN, fLiteralChars.length());
1935
1936         // Add this string into the accumulated strings of the compiled pattern.
1937         fRXPat->fLiteralText.append(fLiteralChars);
1938     }
1939
1940     fLiteralChars.remove();
1941 }
1942
1943
1944 int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
1945     if (U_FAILURE(*fStatus)) {
1946         return 0;
1947     }
1948     if (type < 0 || type > 255) {
1949         U_ASSERT(FALSE);
1950         error(U_REGEX_INTERNAL_ERROR);
1951         type = URX_RESERVED_OP;
1952     }
1953     if (val > 0x00ffffff) {
1954         U_ASSERT(FALSE);
1955         error(U_REGEX_INTERNAL_ERROR);
1956         val = 0;
1957     }
1958     if (val < 0) {
1959         if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) {
1960             U_ASSERT(FALSE);
1961             error(U_REGEX_INTERNAL_ERROR);
1962             return -1;
1963         }
1964         if (URX_TYPE(val) != 0xff) {
1965             U_ASSERT(FALSE);
1966             error(U_REGEX_INTERNAL_ERROR);
1967             return -1;
1968         }
1969         type = URX_RESERVED_OP_N;
1970     }
1971     return (type << 24) | val;
1972 }
1973
1974
1975 //------------------------------------------------------------------------------
1976 //
1977 //   appendOp()             Append a new instruction onto the compiled pattern
1978 //                          Includes error checking, limiting the size of the
1979 //                          pattern to lengths that can be represented in the
1980 //                          24 bit operand field of an instruction.
1981 //
1982 //------------------------------------------------------------------------------
1983 void RegexCompile::appendOp(int32_t op) {
1984     if (U_FAILURE(*fStatus)) {
1985         return;
1986     }
1987     fRXPat->fCompiledPat->addElement(op, *fStatus);
1988     if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
1989         error(U_REGEX_PATTERN_TOO_BIG);
1990     }
1991 }
1992
1993 void RegexCompile::appendOp(int32_t type, int32_t val) {
1994     appendOp(buildOp(type, val));
1995 }
1996
1997
1998 //------------------------------------------------------------------------------
1999 //
2000 //   insertOp()             Insert a slot for a new opcode into the already
2001 //                          compiled pattern code.
2002 //
2003 //                          Fill the slot with a NOP.  Our caller will replace it
2004 //                          with what they really wanted.
2005 //
2006 //------------------------------------------------------------------------------
2007 void   RegexCompile::insertOp(int32_t where) {
2008     UVector64 *code = fRXPat->fCompiledPat;
2009     U_ASSERT(where>0 && where < code->size());
2010
2011     int32_t  nop = buildOp(URX_NOP, 0);
2012     code->insertElementAt(nop, where, *fStatus);
2013
2014     // Walk through the pattern, looking for any ops with targets that
2015     //  were moved down by the insert.  Fix them.
2016     int32_t loc;
2017     for (loc=0; loc<code->size(); loc++) {
2018         int32_t op = (int32_t)code->elementAti(loc);
2019         int32_t opType = URX_TYPE(op);
2020         int32_t opValue = URX_VAL(op);
2021         if ((opType == URX_JMP         ||
2022             opType == URX_JMPX         ||
2023             opType == URX_STATE_SAVE   ||
2024             opType == URX_CTR_LOOP     ||
2025             opType == URX_CTR_LOOP_NG  ||
2026             opType == URX_JMP_SAV      ||
2027             opType == URX_JMP_SAV_X    ||
2028             opType == URX_RELOC_OPRND)    && opValue > where) {
2029             // Target location for this opcode is after the insertion point and
2030             //   needs to be incremented to adjust for the insertion.
2031             opValue++;
2032             op = buildOp(opType, opValue);
2033             code->setElementAt(op, loc);
2034         }
2035     }
2036
2037     // Now fix up the parentheses stack.  All positive values in it are locations in
2038     //  the compiled pattern.   (Negative values are frame boundaries, and don't need fixing.)
2039     for (loc=0; loc<fParenStack.size(); loc++) {
2040         int32_t x = fParenStack.elementAti(loc);
2041         U_ASSERT(x < code->size());
2042         if (x>where) {
2043             x++;
2044             fParenStack.setElementAt(x, loc);
2045         }
2046     }
2047
2048     if (fMatchCloseParen > where) {
2049         fMatchCloseParen++;
2050     }
2051     if (fMatchOpenParen > where) {
2052         fMatchOpenParen++;
2053     }
2054 }
2055
2056
2057 //------------------------------------------------------------------------------
2058 //
2059 //   allocateData()        Allocate storage in the matcher's static data area.
2060 //                         Return the index for the newly allocated data.
2061 //                         The storage won't actually exist until we are running a match
2062 //                         operation, but the storage indexes are inserted into various
2063 //                         opcodes while compiling the pattern.
2064 //
2065 //------------------------------------------------------------------------------
2066 int32_t RegexCompile::allocateData(int32_t size) {
2067     if (U_FAILURE(*fStatus)) {
2068         return 0;
2069     }
2070     if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
2071         error(U_REGEX_INTERNAL_ERROR);
2072         return 0;
2073     }
2074     int32_t dataIndex = fRXPat->fDataSize;
2075     fRXPat->fDataSize += size;
2076     if (fRXPat->fDataSize >= 0x00fffff0) {
2077         error(U_REGEX_INTERNAL_ERROR);
2078     }
2079     return dataIndex;
2080 }
2081
2082
2083 //------------------------------------------------------------------------------
2084 //
2085 //   allocateStackData()   Allocate space in the back-tracking stack frame.
2086 //                         Return the index for the newly allocated data.
2087 //                         The frame indexes are inserted into various
2088 //                         opcodes while compiling the pattern, meaning that frame
2089 //                         size must be restricted to the size that will fit
2090 //                         as an operand (24 bits).
2091 //
2092 //------------------------------------------------------------------------------
2093 int32_t RegexCompile::allocateStackData(int32_t size) {
2094     if (U_FAILURE(*fStatus)) {
2095         return 0;
2096     }
2097     if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
2098         error(U_REGEX_INTERNAL_ERROR);
2099         return 0;
2100     }
2101     int32_t dataIndex = fRXPat->fFrameSize;
2102     fRXPat->fFrameSize += size;
2103     if (fRXPat->fFrameSize >= 0x00fffff0) {
2104         error(U_REGEX_PATTERN_TOO_BIG);
2105     }
2106     return dataIndex;
2107 }
2108
2109
2110 //------------------------------------------------------------------------------
2111 //
2112 //   blockTopLoc()          Find or create a location in the compiled pattern
2113 //                          at the start of the operation or block that has
2114 //                          just been compiled.  Needed when a quantifier (* or
2115 //                          whatever) appears, and we need to add an operation
2116 //                          at the start of the thing being quantified.
2117 //
2118 //                          (Parenthesized Blocks) have a slot with a NOP that
2119 //                          is reserved for this purpose.  .* or similar don't
2120 //                          and a slot needs to be added.
2121 //
2122 //       parameter reserveLoc   :  TRUE -  ensure that there is space to add an opcode
2123 //                                         at the returned location.
2124 //                                 FALSE - just return the address,
2125 //                                         do not reserve a location there.
2126 //
2127 //------------------------------------------------------------------------------
2128 int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
2129     int32_t   theLoc;
2130     fixLiterals(TRUE);  // Emit code for any pending literals.
2131                         //   If last item was a string, emit separate op for the its last char.
2132     if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
2133     {
2134         // The item just processed is a parenthesized block.
2135         theLoc = fMatchOpenParen;   // A slot is already reserved for us.
2136         U_ASSERT(theLoc > 0);
2137         U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
2138     }
2139     else {
2140         // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2141         // No slot for STATE_SAVE was pre-reserved in the compiled code.
2142         // We need to make space now.
2143         theLoc = fRXPat->fCompiledPat->size()-1;
2144         int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
2145         if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
2146             // Strings take two opcode, we want the position of the first one.
2147             // We can have a string at this point if a single character case-folded to two.
2148             theLoc--;
2149         }
2150         if (reserveLoc) {
2151             int32_t  nop = buildOp(URX_NOP, 0);
2152             fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
2153         }
2154     }
2155     return theLoc;
2156 }
2157
2158
2159
2160 //------------------------------------------------------------------------------
2161 //
2162 //    handleCloseParen      When compiling a close paren, we need to go back
2163 //                          and fix up any JMP or SAVE operations within the
2164 //                          parenthesized block that need to target the end
2165 //                          of the block.  The locations of these are kept on
2166 //                          the paretheses stack.
2167 //
2168 //                          This function is called both when encountering a
2169 //                          real ) and at the end of the pattern.
2170 //
2171 //------------------------------------------------------------------------------
2172 void  RegexCompile::handleCloseParen() {
2173     int32_t   patIdx;
2174     int32_t   patOp;
2175     if (fParenStack.size() <= 0) {
2176         error(U_REGEX_MISMATCHED_PAREN);
2177         return;
2178     }
2179
2180     // Emit code for any pending literals.
2181     fixLiterals(FALSE);
2182
2183     // Fixup any operations within the just-closed parenthesized group
2184     //    that need to reference the end of the (block).
2185     //    (The first one popped from the stack is an unused slot for
2186     //     alternation (OR) state save, but applying the fixup to it does no harm.)
2187     for (;;) {
2188         patIdx = fParenStack.popi();
2189         if (patIdx < 0) {
2190             // value < 0 flags the start of the frame on the paren stack.
2191             break;
2192         }
2193         U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
2194         patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
2195         U_ASSERT(URX_VAL(patOp) == 0);          // Branch target for JMP should not be set.
2196         patOp |= fRXPat->fCompiledPat->size();  // Set it now.
2197         fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
2198         fMatchOpenParen     = patIdx;
2199     }
2200
2201     //  At the close of any parenthesized block, restore the match mode flags  to
2202     //  the value they had at the open paren.  Saved value is
2203     //  at the top of the paren stack.
2204     fModeFlags = fParenStack.popi();
2205     U_ASSERT(fModeFlags < 0);
2206
2207     // DO any additional fixups, depending on the specific kind of
2208     // parentesized grouping this is
2209
2210     switch (patIdx) {
2211     case plain:
2212     case flags:
2213         // No additional fixups required.
2214         //   (Grouping-only parentheses)
2215         break;
2216     case capturing:
2217         // Capturing Parentheses.
2218         //   Insert a End Capture op into the pattern.
2219         //   The frame offset of the variables for this cg is obtained from the
2220         //       start capture op and put it into the end-capture op.
2221         {
2222             int32_t   captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2223             U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
2224
2225             int32_t   frameVarLocation = URX_VAL(captureOp);
2226             appendOp(URX_END_CAPTURE, frameVarLocation);
2227         }
2228         break;
2229     case atomic:
2230         // Atomic Parenthesis.
2231         //   Insert a LD_SP operation to restore the state stack to the position
2232         //   it was when the atomic parens were entered.
2233         {
2234             int32_t   stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2235             U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
2236             int32_t   stoLoc = URX_VAL(stoOp);
2237             appendOp(URX_LD_SP, stoLoc);
2238         }
2239         break;
2240
2241     case lookAhead:
2242         {
2243             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2244             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2245             int32_t dataLoc  = URX_VAL(startOp);
2246             appendOp(URX_LA_END, dataLoc);
2247         }
2248         break;
2249
2250     case negLookAhead:
2251         {
2252             // See comment at doOpenLookAheadNeg
2253             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
2254             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2255             int32_t dataLoc  = URX_VAL(startOp);
2256             appendOp(URX_LA_END, dataLoc);
2257             appendOp(URX_BACKTRACK, 0);
2258             appendOp(URX_LA_END, dataLoc);
2259
2260             // Patch the URX_SAVE near the top of the block.
2261             // The destination of the SAVE is the final LA_END that was just added.
2262             int32_t saveOp   = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
2263             U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
2264             int32_t dest     = fRXPat->fCompiledPat->size()-1;
2265             saveOp           = buildOp(URX_STATE_SAVE, dest);
2266             fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2267         }
2268         break;
2269
2270     case lookBehind:
2271         {
2272             // See comment at doOpenLookBehind.
2273
2274             // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2275             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
2276             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2277             int32_t dataLoc  = URX_VAL(startOp);
2278             appendOp(URX_LB_END, dataLoc);
2279             appendOp(URX_LA_END, dataLoc);
2280
2281             // Determine the min and max bounds for the length of the
2282             //  string that the pattern can match.
2283             //  An unbounded upper limit is an error.
2284             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2285             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2286             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2287             if (URX_TYPE(maxML) != 0) {
2288                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2289                 break;
2290             }
2291             if (maxML == INT32_MAX) {
2292                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2293                 break;
2294             }
2295             U_ASSERT(minML <= maxML);
2296
2297             // Insert the min and max match len bounds into the URX_LB_CONT op that
2298             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2299             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-2);
2300             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-1);
2301
2302         }
2303         break;
2304
2305
2306
2307     case lookBehindN:
2308         {
2309             // See comment at doOpenLookBehindNeg.
2310
2311             // Append the URX_LBN_END to the compiled pattern.
2312             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2313             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2314             int32_t dataLoc  = URX_VAL(startOp);
2315             appendOp(URX_LBN_END, dataLoc);
2316
2317             // Determine the min and max bounds for the length of the
2318             //  string that the pattern can match.
2319             //  An unbounded upper limit is an error.
2320             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2321             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2322             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2323             if (URX_TYPE(maxML) != 0) {
2324                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2325                 break;
2326             }
2327             if (maxML == INT32_MAX) {
2328                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2329                 break;
2330             }
2331             U_ASSERT(minML <= maxML);
2332
2333             // Insert the min and max match len bounds into the URX_LB_CONT op that
2334             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2335             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-3);
2336             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-2);
2337
2338             // Insert the pattern location to continue at after a successful match
2339             //  as the last operand of the URX_LBN_CONT
2340             int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
2341             fRXPat->fCompiledPat->setElementAt(op,  fMatchOpenParen-1);
2342         }
2343         break;
2344
2345
2346
2347     default:
2348         U_ASSERT(FALSE);
2349     }
2350
2351     // remember the next location in the compiled pattern.
2352     // The compilation of Quantifiers will look at this to see whether its looping
2353     //   over a parenthesized block or a single item
2354     fMatchCloseParen = fRXPat->fCompiledPat->size();
2355 }
2356
2357
2358
2359 //------------------------------------------------------------------------------
2360 //
2361 //   compileSet       Compile the pattern operations for a reference to a
2362 //                    UnicodeSet.
2363 //
2364 //------------------------------------------------------------------------------
2365 void        RegexCompile::compileSet(UnicodeSet *theSet)
2366 {
2367     if (theSet == NULL) {
2368         return;
2369     }
2370     //  Remove any strings from the set.
2371     //  There shoudn't be any, but just in case.
2372     //     (Case Closure can add them; if we had a simple case closure avaialble that
2373     //      ignored strings, that would be better.)
2374     theSet->removeAllStrings();
2375     int32_t  setSize = theSet->size();
2376
2377     switch (setSize) {
2378     case 0:
2379         {
2380             // Set of no elements.   Always fails to match.
2381             appendOp(URX_BACKTRACK, 0);
2382             delete theSet;
2383         }
2384         break;
2385
2386     case 1:
2387         {
2388             // The set contains only a single code point.  Put it into
2389             //   the compiled pattern as a single char operation rather
2390             //   than a set, and discard the set itself.
2391             literalChar(theSet->charAt(0));
2392             delete theSet;
2393         }
2394         break;
2395
2396     default:
2397         {
2398             //  The set contains two or more chars.  (the normal case)
2399             //  Put it into the compiled pattern as a set.
2400             int32_t setNumber = fRXPat->fSets->size();
2401             fRXPat->fSets->addElement(theSet, *fStatus);
2402             appendOp(URX_SETREF, setNumber);
2403         }
2404     }
2405 }
2406
2407
2408 //------------------------------------------------------------------------------
2409 //
2410 //   compileInterval    Generate the code for a {min, max} style interval quantifier.
2411 //                      Except for the specific opcodes used, the code is the same
2412 //                      for all three types (greedy, non-greedy, possessive) of
2413 //                      intervals.  The opcodes are supplied as parameters.
2414 //                      (There are two sets of opcodes - greedy & possessive use the
2415 //                      same ones, while non-greedy has it's own.)
2416 //
2417 //                      The code for interval loops has this form:
2418 //                         0  CTR_INIT   counter loc (in stack frame)
2419 //                         1             5  patt address of CTR_LOOP at bottom of block
2420 //                         2             min count
2421 //                         3             max count   (-1 for unbounded)
2422 //                         4  ...        block to be iterated over
2423 //                         5  CTR_LOOP
2424 //
2425 //                       In
2426 //------------------------------------------------------------------------------
2427 void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)
2428 {
2429     // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2430     //   four slots in the compiled code.  Reserve them.
2431     int32_t   topOfBlock = blockTopLoc(TRUE);
2432     insertOp(topOfBlock);
2433     insertOp(topOfBlock);
2434     insertOp(topOfBlock);
2435
2436     // The operands for the CTR_INIT opcode include the index in the matcher data
2437     //   of the counter.  Allocate it now. There are two data items
2438     //        counterLoc   -->  Loop counter
2439     //               +1    -->  Input index (for breaking non-progressing loops)
2440     //                          (Only present if unbounded upper limit on loop)
2441     int32_t   dataSize = fIntervalUpper < 0 ? 2 : 1;
2442     int32_t   counterLoc = allocateStackData(dataSize);
2443
2444     int32_t   op = buildOp(InitOp, counterLoc);
2445     fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2446
2447     // The second operand of CTR_INIT is the location following the end of the loop.
2448     //   Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2449     //   compilation of something later on causes the code to grow and the target
2450     //   position to move.
2451     int32_t loopEnd = fRXPat->fCompiledPat->size();
2452     op = buildOp(URX_RELOC_OPRND, loopEnd);
2453     fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
2454
2455     // Followed by the min and max counts.
2456     fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
2457     fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
2458
2459     // Apend the CTR_LOOP op.  The operand is the location of the CTR_INIT op.
2460     //   Goes at end of the block being looped over, so just append to the code so far.
2461     appendOp(LoopOp, topOfBlock);
2462
2463     if ((fIntervalLow & 0xff000000) != 0 ||
2464         (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
2465             error(U_REGEX_NUMBER_TOO_BIG);
2466         }
2467
2468     if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
2469         error(U_REGEX_MAX_LT_MIN);
2470     }
2471 }
2472
2473
2474
2475 UBool RegexCompile::compileInlineInterval() {
2476     if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
2477         // Too big to inline.  Fail, which will cause looping code to be generated.
2478         //   (Upper < Lower picks up unbounded upper and errors, both.)
2479         return FALSE;
2480     }
2481
2482     int32_t   topOfBlock = blockTopLoc(FALSE);
2483     if (fIntervalUpper == 0) {
2484         // Pathological case.  Attempt no matches, as if the block doesn't exist.
2485         // Discard the generated code for the block.
2486         // If the block included parens, discard the info pertaining to them as well.
2487         fRXPat->fCompiledPat->setSize(topOfBlock);
2488         if (fMatchOpenParen >= topOfBlock) {
2489             fMatchOpenParen = -1;
2490         }
2491         if (fMatchCloseParen >= topOfBlock) {
2492             fMatchCloseParen = -1;
2493         }
2494         return TRUE;
2495     }
2496
2497     if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
2498         // The thing being repeated is not a single op, but some
2499         //   more complex block.  Do it as a loop, not inlines.
2500         //   Note that things "repeated" a max of once are handled as inline, because
2501         //     the one copy of the code already generated is just fine.
2502         return FALSE;
2503     }
2504
2505     // Pick up the opcode that is to be repeated
2506     //
2507     int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
2508
2509     // Compute the pattern location where the inline sequence
2510     //   will end, and set up the state save op that will be needed.
2511     //
2512     int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
2513                                 + fIntervalUpper + (fIntervalUpper-fIntervalLow);
2514     int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
2515     if (fIntervalLow == 0) {
2516         insertOp(topOfBlock);
2517         fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2518     }
2519
2520
2521
2522     //  Loop, emitting the op for the thing being repeated each time.
2523     //    Loop starts at 1 because one instance of the op already exists in the pattern,
2524     //    it was put there when it was originally encountered.
2525     int32_t i;
2526     for (i=1; i<fIntervalUpper; i++ ) {
2527         if (i >= fIntervalLow) {
2528             appendOp(saveOp);
2529         }
2530         appendOp(op);
2531     }
2532     return TRUE;
2533 }
2534
2535
2536
2537 //------------------------------------------------------------------------------
2538 //
2539 //   caseInsensitiveStart  given a single code point from a pattern string, determine the
2540 //                         set of characters that could potentially begin a case-insensitive
2541 //                         match of a string beginning with that character, using full Unicode
2542 //                         case insensitive matching.
2543 //
2544 //          This is used in optimizing find().
2545 //
2546 //          closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2547 //          misses cases like this:
2548 //             A string from the pattern begins with 'ss' (although all we know
2549 //                 in this context is that it begins with 's')
2550 //             The pattern could match a string beginning with a German sharp-s
2551 //
2552 //           To the ordinary case closure for a character c, we add all other
2553 //           characters cx where the case closure of cx incudes a string form that begins
2554 //           with the original character c.
2555 //
2556 //           This function could be made smarter. The full pattern string is available
2557 //           and it would be possible to verify that the extra characters being added
2558 //           to the starting set fully match, rather than having just a first-char of the
2559 //           folded form match.
2560 //
2561 //------------------------------------------------------------------------------
2562 void  RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
2563
2564 // Machine Generated below.
2565 // It may need updating with new versions of Unicode.
2566 // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2567 // The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
2568
2569 // Machine Generated Data. Do not hand edit.
2570     static const UChar32 RECaseFixCodePoints[] = {
2571         0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc,
2572         0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565,
2573         0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07,
2574         0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61,
2575         0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
2576
2577     static const int16_t RECaseFixStringOffsets[] = {
2578         0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10,
2579         0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f,
2580         0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43,
2581         0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57,
2582         0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
2583
2584     static const int16_t RECaseFixCounts[] = {
2585         0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1,
2586         0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1,
2587         0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2588         0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2589         0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
2590
2591     static const UChar RECaseFixData[] = {
2592         0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf,
2593         0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3,
2594         0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3,
2595         0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3,
2596         0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14,
2597         0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83,
2598         0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90,
2599         0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95,
2600         0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2,
2601         0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7,
2602         0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
2603
2604 // End of machine generated data.
2605
2606     if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2607         UChar32 caseFoldedC  = u_foldCase(c, U_FOLD_CASE_DEFAULT);
2608         starterChars->set(caseFoldedC, caseFoldedC);
2609
2610         int32_t i;
2611         for (i=0; RECaseFixCodePoints[i]<c ; i++) {
2612             // Simple linear search through the sorted list of interesting code points.
2613         }
2614
2615         if (RECaseFixCodePoints[i] == c) {
2616             int32_t dataIndex = RECaseFixStringOffsets[i];
2617             int32_t numCharsToAdd = RECaseFixCounts[i];
2618             UChar32 cpToAdd = 0;
2619             for (int32_t j=0; j<numCharsToAdd; j++) {
2620                 U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
2621                 starterChars->add(cpToAdd);
2622             }
2623         }
2624
2625         starterChars->closeOver(USET_CASE_INSENSITIVE);
2626         starterChars->removeAllStrings();
2627     } else {
2628         // Not a cased character. Just return it alone.
2629         starterChars->set(c, c);
2630     }
2631 }
2632
2633
2634
2635
2636 //------------------------------------------------------------------------------
2637 //
2638 //   matchStartType    Determine how a match can start.
2639 //                     Used to optimize find() operations.
2640 //
2641 //                     Operation is very similar to minMatchLength().  Walk the compiled
2642 //                     pattern, keeping an on-going minimum-match-length.  For any
2643 //                     op where the min match coming in is zero, add that ops possible
2644 //                     starting matches to the possible starts for the overall pattern.
2645 //
2646 //------------------------------------------------------------------------------
2647 void   RegexCompile::matchStartType() {
2648     if (U_FAILURE(*fStatus)) {
2649         return;
2650     }
2651
2652
2653     int32_t    loc;                    // Location in the pattern of the current op being processed.
2654     int32_t    op;                     // The op being processed
2655     int32_t    opType;                 // The opcode type of the op
2656     int32_t    currentLen = 0;         // Minimum length of a match to this point (loc) in the pattern
2657     int32_t    numInitialStrings = 0;  // Number of strings encountered that could match at start.
2658
2659     UBool      atStart = TRUE;         // True if no part of the pattern yet encountered
2660                                        //   could have advanced the position in a match.
2661                                        //   (Maximum match length so far == 0)
2662
2663     // forwardedLength is a vector holding minimum-match-length values that
2664     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
2665     //   It must be one longer than the pattern being checked because some  ops
2666     //   will jmp to a end-of-block+1 location from within a block, and we must
2667     //   count those when checking the block.
2668     int32_t end = fRXPat->fCompiledPat->size();
2669     UVector32  forwardedLength(end+1, *fStatus);
2670     forwardedLength.setSize(end+1);
2671     for (loc=3; loc<end; loc++) {
2672         forwardedLength.setElementAt(INT32_MAX, loc);
2673     }
2674
2675     for (loc = 3; loc<end; loc++) {
2676         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2677         opType = URX_TYPE(op);
2678
2679         // The loop is advancing linearly through the pattern.
2680         // If the op we are now at was the destination of a branch in the pattern,
2681         // and that path has a shorter minimum length than the current accumulated value,
2682         // replace the current accumulated value.
2683         if (forwardedLength.elementAti(loc) < currentLen) {
2684             currentLen = forwardedLength.elementAti(loc);
2685             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2686         }
2687
2688         switch (opType) {
2689             // Ops that don't change the total length matched
2690         case URX_RESERVED_OP:
2691         case URX_END:
2692         case URX_FAIL:
2693         case URX_STRING_LEN:
2694         case URX_NOP:
2695         case URX_START_CAPTURE:
2696         case URX_END_CAPTURE:
2697         case URX_BACKSLASH_B:
2698         case URX_BACKSLASH_BU:
2699         case URX_BACKSLASH_G:
2700         case URX_BACKSLASH_Z:
2701         case URX_DOLLAR:
2702         case URX_DOLLAR_M:
2703         case URX_DOLLAR_D:
2704         case URX_DOLLAR_MD:
2705         case URX_RELOC_OPRND:
2706         case URX_STO_INP_LOC:
2707         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
2708         case URX_BACKREF_I:
2709
2710         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
2711         case URX_LD_SP:
2712             break;
2713
2714         case URX_CARET:
2715             if (atStart) {
2716                 fRXPat->fStartType = START_START;
2717             }
2718             break;
2719
2720         case URX_CARET_M:
2721         case URX_CARET_M_UNIX:
2722             if (atStart) {
2723                 fRXPat->fStartType = START_LINE;
2724             }
2725             break;
2726
2727         case URX_ONECHAR:
2728             if (currentLen == 0) {
2729                 // This character could appear at the start of a match.
2730                 //   Add it to the set of possible starting characters.
2731                 fRXPat->fInitialChars->add(URX_VAL(op));
2732                 numInitialStrings += 2;
2733             }
2734             currentLen++;
2735             atStart = FALSE;
2736             break;
2737
2738
2739         case URX_SETREF:
2740             if (currentLen == 0) {
2741                 int32_t  sn = URX_VAL(op);
2742                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2743                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2744                 fRXPat->fInitialChars->addAll(*s);
2745                 numInitialStrings += 2;
2746             }
2747             currentLen++;
2748             atStart = FALSE;
2749             break;
2750
2751         case URX_LOOP_SR_I:
2752             // [Set]*, like a SETREF, above, in what it can match,
2753             //  but may not match at all, so currentLen is not incremented.
2754             if (currentLen == 0) {
2755                 int32_t  sn = URX_VAL(op);
2756                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2757                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2758                 fRXPat->fInitialChars->addAll(*s);
2759                 numInitialStrings += 2;
2760             }
2761             atStart = FALSE;
2762             break;
2763
2764         case URX_LOOP_DOT_I:
2765             if (currentLen == 0) {
2766                 // .* at the start of a pattern.
2767                 //    Any character can begin the match.
2768                 fRXPat->fInitialChars->clear();
2769                 fRXPat->fInitialChars->complement();
2770                 numInitialStrings += 2;
2771             }
2772             atStart = FALSE;
2773             break;
2774
2775
2776         case URX_STATIC_SETREF:
2777             if (currentLen == 0) {
2778                 int32_t  sn = URX_VAL(op);
2779                 U_ASSERT(sn>0 && sn<URX_LAST_SET);
2780                 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2781                 fRXPat->fInitialChars->addAll(*s);
2782                 numInitialStrings += 2;
2783             }
2784             currentLen++;
2785             atStart = FALSE;
2786             break;
2787
2788
2789
2790         case URX_STAT_SETREF_N:
2791             if (currentLen == 0) {
2792                 int32_t  sn = URX_VAL(op);
2793                 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2794                 UnicodeSet sc(*s);
2795                 sc.complement();
2796                 fRXPat->fInitialChars->addAll(sc);
2797                 numInitialStrings += 2;
2798             }
2799             currentLen++;
2800             atStart = FALSE;
2801             break;
2802
2803
2804
2805         case URX_BACKSLASH_D:
2806             // Digit Char
2807              if (currentLen == 0) {
2808                  UnicodeSet s;
2809                  s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
2810                  if (URX_VAL(op) != 0) {
2811                      s.complement();
2812                  }
2813                  fRXPat->fInitialChars->addAll(s);
2814                  numInitialStrings += 2;
2815             }
2816             currentLen++;
2817             atStart = FALSE;
2818             break;
2819
2820
2821         case URX_BACKSLASH_H:
2822             // Horiz white space
2823             if (currentLen == 0) {
2824                 UnicodeSet s;
2825                 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
2826                 s.add((UChar32)9);   // Tab
2827                 if (URX_VAL(op) != 0) {
2828                     s.complement();
2829                 }
2830                 fRXPat->fInitialChars->addAll(s);
2831                 numInitialStrings += 2;
2832             }
2833             currentLen++;
2834             atStart = FALSE;
2835             break;
2836
2837
2838         case URX_BACKSLASH_R:       // Any line ending sequence
2839         case URX_BACKSLASH_V:       // Any line ending code point, with optional negation
2840             if (currentLen == 0) {
2841                 UnicodeSet s;
2842                 s.add((UChar32)0x0a, (UChar32)0x0d);  // add range
2843                 s.add((UChar32)0x85);
2844                 s.add((UChar32)0x2028, (UChar32)0x2029);
2845                 if (URX_VAL(op) != 0) {
2846                      // Complement option applies to URX_BACKSLASH_V only.
2847                      s.complement();
2848                 }
2849                 fRXPat->fInitialChars->addAll(s);
2850                 numInitialStrings += 2;
2851             }
2852             currentLen++;
2853             atStart = FALSE;
2854             break;
2855
2856
2857
2858         case URX_ONECHAR_I:
2859             // Case Insensitive Single Character.
2860             if (currentLen == 0) {
2861                 UChar32  c = URX_VAL(op);
2862                 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2863                     UnicodeSet starters(c, c);
2864                     starters.closeOver(USET_CASE_INSENSITIVE);
2865                     // findCaseInsensitiveStarters(c, &starters);
2866                     //   For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2867                     //   The expanded folding can't match the pattern.
2868                     fRXPat->fInitialChars->addAll(starters);
2869                 } else {
2870                     // Char has no case variants.  Just add it as-is to the
2871                     //   set of possible starting chars.
2872                     fRXPat->fInitialChars->add(c);
2873                 }
2874                 numInitialStrings += 2;
2875             }
2876             currentLen++;
2877             atStart = FALSE;
2878             break;
2879
2880
2881         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
2882         case URX_DOTANY_ALL:    // . matches one or two.
2883         case URX_DOTANY:
2884         case URX_DOTANY_UNIX:
2885             if (currentLen == 0) {
2886                 // These constructs are all bad news when they appear at the start
2887                 //   of a match.  Any character can begin the match.
2888                 fRXPat->fInitialChars->clear();
2889                 fRXPat->fInitialChars->complement();
2890                 numInitialStrings += 2;
2891             }
2892             currentLen++;
2893             atStart = FALSE;
2894             break;
2895
2896
2897         case URX_JMPX:
2898             loc++;             // Except for extra operand on URX_JMPX, same as URX_JMP.
2899             U_FALLTHROUGH;
2900         case URX_JMP:
2901             {
2902                 int32_t  jmpDest = URX_VAL(op);
2903                 if (jmpDest < loc) {
2904                     // Loop of some kind.  Can safely ignore, the worst that will happen
2905                     //  is that we understate the true minimum length
2906                     currentLen = forwardedLength.elementAti(loc+1);
2907
2908                 } else {
2909                     // Forward jump.  Propagate the current min length to the target loc of the jump.
2910                     U_ASSERT(jmpDest <= end+1);
2911                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
2912                         forwardedLength.setElementAt(currentLen, jmpDest);
2913                     }
2914                 }
2915             }
2916             atStart = FALSE;
2917             break;
2918
2919         case URX_JMP_SAV:
2920         case URX_JMP_SAV_X:
2921             // Combo of state save to the next loc, + jmp backwards.
2922             //   Net effect on min. length computation is nothing.
2923             atStart = FALSE;
2924             break;
2925
2926         case URX_BACKTRACK:
2927             // Fails are kind of like a branch, except that the min length was
2928             //   propagated already, by the state save.
2929             currentLen = forwardedLength.elementAti(loc+1);
2930             atStart = FALSE;
2931             break;
2932
2933
2934         case URX_STATE_SAVE:
2935             {
2936                 // State Save, for forward jumps, propagate the current minimum.
2937                 //             of the state save.
2938                 int32_t  jmpDest = URX_VAL(op);
2939                 if (jmpDest > loc) {
2940                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
2941                         forwardedLength.setElementAt(currentLen, jmpDest);
2942                     }
2943                 }
2944             }
2945             atStart = FALSE;
2946             break;
2947
2948
2949
2950
2951         case URX_STRING:
2952             {
2953                 loc++;
2954                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2955                 int32_t stringLen   = URX_VAL(stringLenOp);
2956                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2957                 U_ASSERT(stringLenOp >= 2);
2958                 if (currentLen == 0) {
2959                     // Add the starting character of this string to the set of possible starting
2960                     //   characters for this pattern.
2961                     int32_t stringStartIdx = URX_VAL(op);
2962                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
2963                     fRXPat->fInitialChars->add(c);
2964
2965                     // Remember this string.  After the entire pattern has been checked,
2966                     //  if nothing else is identified that can start a match, we'll use it.
2967                     numInitialStrings++;
2968                     fRXPat->fInitialStringIdx = stringStartIdx;
2969                     fRXPat->fInitialStringLen = stringLen;
2970                 }
2971
2972                 currentLen += stringLen;
2973                 atStart = FALSE;
2974             }
2975             break;
2976
2977         case URX_STRING_I:
2978             {
2979                 // Case-insensitive string.  Unlike exact-match strings, we won't
2980                 //   attempt a string search for possible match positions.  But we
2981                 //   do update the set of possible starting characters.
2982                 loc++;
2983                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2984                 int32_t stringLen   = URX_VAL(stringLenOp);
2985                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2986                 U_ASSERT(stringLenOp >= 2);
2987                 if (currentLen == 0) {
2988                     // Add the starting character of this string to the set of possible starting
2989                     //   characters for this pattern.
2990                     int32_t stringStartIdx = URX_VAL(op);
2991                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
2992                     UnicodeSet s;
2993                     findCaseInsensitiveStarters(c, &s);
2994                     fRXPat->fInitialChars->addAll(s);
2995                     numInitialStrings += 2;  // Matching on an initial string not possible.
2996                 }
2997                 currentLen += stringLen;
2998                 atStart = FALSE;
2999             }
3000             break;
3001
3002         case URX_CTR_INIT:
3003         case URX_CTR_INIT_NG:
3004             {
3005                 // Loop Init Ops.  These don't change the min length, but they are 4 word ops
3006                 //   so location must be updated accordingly.
3007                 // Loop Init Ops.
3008                 //   If the min loop count == 0
3009                 //      move loc forwards to the end of the loop, skipping over the body.
3010                 //   If the min count is > 0,
3011                 //      continue normal processing of the body of the loop.
3012                 int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3013                         loopEndLoc   = URX_VAL(loopEndLoc);
3014                 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3015                 if (minLoopCount == 0) {
3016                     // Min Loop Count of 0, treat like a forward branch and
3017                     //   move the current minimum length up to the target
3018                     //   (end of loop) location.
3019                     U_ASSERT(loopEndLoc <= end+1);
3020                     if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
3021                         forwardedLength.setElementAt(currentLen, loopEndLoc);
3022                     }
3023                 }
3024                 loc+=3;  // Skips over operands of CTR_INIT
3025             }
3026             atStart = FALSE;
3027             break;
3028
3029
3030         case URX_CTR_LOOP:
3031         case URX_CTR_LOOP_NG:
3032             // Loop ops.
3033             //  The jump is conditional, backwards only.
3034             atStart = FALSE;
3035             break;
3036
3037         case URX_LOOP_C:
3038             // More loop ops.  These state-save to themselves.
3039             //   don't change the minimum match
3040             atStart = FALSE;
3041             break;
3042
3043
3044         case URX_LA_START:
3045         case URX_LB_START:
3046             {
3047                 // Look-around.  Scan forward until the matching look-ahead end,
3048                 //   without processing the look-around block.  This is overly pessimistic.
3049
3050                 // Keep track of the nesting depth of look-around blocks.  Boilerplate code for
3051                 //   lookahead contains two LA_END instructions, so count goes up by two
3052                 //   for each LA_START.
3053                 int32_t  depth = (opType == URX_LA_START? 2: 1);
3054                 for (;;) {
3055                     loc++;
3056                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3057                     if (URX_TYPE(op) == URX_LA_START) {
3058                         depth+=2;
3059                     }
3060                     if (URX_TYPE(op) == URX_LB_START) {
3061                         depth++;
3062                     }
3063                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3064                         depth--;
3065                         if (depth == 0) {
3066                             break;
3067                         }
3068                     }
3069                     if (URX_TYPE(op) == URX_STATE_SAVE) {
3070                         // Need this because neg lookahead blocks will FAIL to outside
3071                         //   of the block.
3072                         int32_t  jmpDest = URX_VAL(op);
3073                         if (jmpDest > loc) {
3074                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
3075                                 forwardedLength.setElementAt(currentLen, jmpDest);
3076                             }
3077                         }
3078                     }
3079                     U_ASSERT(loc <= end);
3080                 }
3081             }
3082             break;
3083
3084         case URX_LA_END:
3085         case URX_LB_CONT:
3086         case URX_LB_END:
3087         case URX_LBN_CONT:
3088         case URX_LBN_END:
3089             U_ASSERT(FALSE);     // Shouldn't get here.  These ops should be
3090                                  //  consumed by the scan in URX_LA_START and LB_START
3091
3092             break;
3093
3094         default:
3095             U_ASSERT(FALSE);
3096             }
3097
3098         }
3099
3100
3101     // We have finished walking through the ops.  Check whether some forward jump
3102     //   propagated a shorter length to location end+1.
3103     if (forwardedLength.elementAti(end+1) < currentLen) {
3104         currentLen = forwardedLength.elementAti(end+1);
3105     }
3106
3107
3108     fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
3109
3110
3111     // Sort out what we should check for when looking for candidate match start positions.
3112     // In order of preference,
3113     //     1.   Start of input text buffer.
3114     //     2.   A literal string.
3115     //     3.   Start of line in multi-line mode.
3116     //     4.   A single literal character.
3117     //     5.   A character from a set of characters.
3118     //
3119     if (fRXPat->fStartType == START_START) {
3120         // Match only at the start of an input text string.
3121         //    start type is already set.  We're done.
3122     } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
3123         // Match beginning only with a literal string.
3124         UChar32  c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
3125         U_ASSERT(fRXPat->fInitialChars->contains(c));
3126         fRXPat->fStartType   = START_STRING;
3127         fRXPat->fInitialChar = c;
3128     } else if (fRXPat->fStartType == START_LINE) {
3129         // Match at start of line in Multi-Line mode.
3130         // Nothing to do here; everything is already set.
3131     } else if (fRXPat->fMinMatchLen == 0) {
3132         // Zero length match possible.  We could start anywhere.
3133         fRXPat->fStartType = START_NO_INFO;
3134     } else if (fRXPat->fInitialChars->size() == 1) {
3135         // All matches begin with the same char.
3136         fRXPat->fStartType   = START_CHAR;
3137         fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
3138         U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
3139     } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE &&
3140         fRXPat->fMinMatchLen > 0) {
3141         // Matches start with a set of character smaller than the set of all chars.
3142         fRXPat->fStartType = START_SET;
3143     } else {
3144         // Matches can start with anything
3145         fRXPat->fStartType = START_NO_INFO;
3146     }
3147
3148     return;
3149 }
3150
3151
3152
3153 //------------------------------------------------------------------------------
3154 //
3155 //   minMatchLength    Calculate the length of the shortest string that could
3156 //                     match the specified pattern.
3157 //                     Length is in 16 bit code units, not code points.
3158 //
3159 //                     The calculated length may not be exact.  The returned
3160 //                     value may be shorter than the actual minimum; it must
3161 //                     never be longer.
3162 //
3163 //                     start and end are the range of p-code operations to be
3164 //                     examined.  The endpoints are included in the range.
3165 //
3166 //------------------------------------------------------------------------------
3167 int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
3168     if (U_FAILURE(*fStatus)) {
3169         return 0;
3170     }
3171
3172     U_ASSERT(start <= end);
3173     U_ASSERT(end < fRXPat->fCompiledPat->size());
3174
3175
3176     int32_t    loc;
3177     int32_t    op;
3178     int32_t    opType;
3179     int32_t    currentLen = 0;
3180
3181
3182     // forwardedLength is a vector holding minimum-match-length values that
3183     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
3184     //   It must be one longer than the pattern being checked because some  ops
3185     //   will jmp to a end-of-block+1 location from within a block, and we must
3186     //   count those when checking the block.
3187     UVector32  forwardedLength(end+2, *fStatus);
3188     forwardedLength.setSize(end+2);
3189     for (loc=start; loc<=end+1; loc++) {
3190         forwardedLength.setElementAt(INT32_MAX, loc);
3191     }
3192
3193     for (loc = start; loc<=end; loc++) {
3194         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3195         opType = URX_TYPE(op);
3196
3197         // The loop is advancing linearly through the pattern.
3198         // If the op we are now at was the destination of a branch in the pattern,
3199         // and that path has a shorter minimum length than the current accumulated value,
3200         // replace the current accumulated value.
3201         // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);  // MinLength == INT32_MAX for some
3202                                                                //   no-match-possible cases.
3203         if (forwardedLength.elementAti(loc) < currentLen) {
3204             currentLen = forwardedLength.elementAti(loc);
3205             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3206         }
3207
3208         switch (opType) {
3209             // Ops that don't change the total length matched
3210         case URX_RESERVED_OP:
3211         case URX_END:
3212         case URX_STRING_LEN:
3213         case URX_NOP:
3214         case URX_START_CAPTURE:
3215         case URX_END_CAPTURE:
3216         case URX_BACKSLASH_B:
3217         case URX_BACKSLASH_BU:
3218         case URX_BACKSLASH_G:
3219         case URX_BACKSLASH_Z:
3220         case URX_CARET:
3221         case URX_DOLLAR:
3222         case URX_DOLLAR_M:
3223         case URX_DOLLAR_D:
3224         case URX_DOLLAR_MD:
3225         case URX_RELOC_OPRND:
3226         case URX_STO_INP_LOC:
3227         case URX_CARET_M:
3228         case URX_CARET_M_UNIX:
3229         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3230         case URX_BACKREF_I:
3231
3232         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3233         case URX_LD_SP:
3234
3235         case URX_JMP_SAV:
3236         case URX_JMP_SAV_X:
3237             break;
3238
3239
3240             // Ops that match a minimum of one character (one or two 16 bit code units.)
3241             //
3242         case URX_ONECHAR:
3243         case URX_STATIC_SETREF:
3244         case URX_STAT_SETREF_N:
3245         case URX_SETREF:
3246         case URX_BACKSLASH_D:
3247         case URX_BACKSLASH_H:
3248         case URX_BACKSLASH_R:
3249         case URX_BACKSLASH_V:
3250         case URX_ONECHAR_I:
3251         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
3252         case URX_DOTANY_ALL:    // . matches one or two.
3253         case URX_DOTANY:
3254         case URX_DOTANY_UNIX:
3255             currentLen++;
3256             break;
3257
3258
3259         case URX_JMPX:
3260             loc++;              // URX_JMPX has an extra operand, ignored here,
3261                                 //   otherwise processed identically to URX_JMP.
3262             U_FALLTHROUGH;
3263         case URX_JMP:
3264             {
3265                 int32_t  jmpDest = URX_VAL(op);
3266                 if (jmpDest < loc) {
3267                     // Loop of some kind.  Can safely ignore, the worst that will happen
3268                     //  is that we understate the true minimum length
3269                     currentLen = forwardedLength.elementAti(loc+1);
3270                 } else {
3271                     // Forward jump.  Propagate the current min length to the target loc of the jump.
3272                     U_ASSERT(jmpDest <= end+1);
3273                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
3274                         forwardedLength.setElementAt(currentLen, jmpDest);
3275                     }
3276                 }
3277             }
3278             break;
3279
3280         case URX_BACKTRACK:
3281             {
3282                 // Back-tracks are kind of like a branch, except that the min length was
3283                 //   propagated already, by the state save.
3284                 currentLen = forwardedLength.elementAti(loc+1);
3285             }
3286             break;
3287
3288
3289         case URX_STATE_SAVE:
3290             {
3291                 // State Save, for forward jumps, propagate the current minimum.
3292                 //             of the state save.
3293                 int32_t  jmpDest = URX_VAL(op);
3294                 if (jmpDest > loc) {
3295                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
3296                         forwardedLength.setElementAt(currentLen, jmpDest);
3297                     }
3298                 }
3299             }
3300             break;
3301
3302
3303         case URX_STRING:
3304             {
3305                 loc++;
3306                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3307                 currentLen += URX_VAL(stringLenOp);
3308             }
3309             break;
3310
3311
3312         case URX_STRING_I:
3313             {
3314                 loc++;
3315                 // TODO: with full case folding, matching input text may be shorter than
3316                 //       the string we have here.  More smarts could put some bounds on it.
3317                 //       Assume a min length of one for now.  A min length of zero causes
3318                 //        optimization failures for a pattern like "string"+
3319                 // currentLen += URX_VAL(stringLenOp);
3320                 currentLen += 1;
3321             }
3322             break;
3323
3324         case URX_CTR_INIT:
3325         case URX_CTR_INIT_NG:
3326             {
3327                 // Loop Init Ops.
3328                 //   If the min loop count == 0
3329                 //      move loc forwards to the end of the loop, skipping over the body.
3330                 //   If the min count is > 0,
3331                 //      continue normal processing of the body of the loop.
3332                 int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3333                         loopEndLoc   = URX_VAL(loopEndLoc);
3334                 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3335                 if (minLoopCount == 0) {
3336                     loc = loopEndLoc;
3337                 } else {
3338                     loc+=3;  // Skips over operands of CTR_INIT
3339                 }
3340             }
3341             break;
3342
3343
3344         case URX_CTR_LOOP:
3345         case URX_CTR_LOOP_NG:
3346             // Loop ops.
3347             //  The jump is conditional, backwards only.
3348             break;
3349
3350         case URX_LOOP_SR_I:
3351         case URX_LOOP_DOT_I:
3352         case URX_LOOP_C:
3353             // More loop ops.  These state-save to themselves.
3354             //   don't change the minimum match - could match nothing at all.
3355             break;
3356
3357
3358         case URX_LA_START:
3359         case URX_LB_START:
3360             {
3361                 // Look-around.  Scan forward until the matching look-ahead end,
3362                 //   without processing the look-around block.  This is overly pessimistic for look-ahead,
3363                 //   it assumes that the look-ahead match might be zero-length.
3364                 //   TODO:  Positive lookahead could recursively do the block, then continue
3365                 //          with the longer of the block or the value coming in.  Ticket 6060
3366                 int32_t  depth = (opType == URX_LA_START? 2: 1);;
3367                 for (;;) {
3368                     loc++;
3369                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3370                     if (URX_TYPE(op) == URX_LA_START) {
3371                         // The boilerplate for look-ahead includes two LA_END insturctions,
3372                         //    Depth will be decremented by each one when it is seen.
3373                         depth += 2;
3374                     }
3375                     if (URX_TYPE(op) == URX_LB_START) {
3376                         depth++;
3377                     }
3378                     if (URX_TYPE(op) == URX_LA_END) {
3379                         depth--;
3380                         if (depth == 0) {
3381                             break;
3382                         }
3383                     }
3384                     if (URX_TYPE(op)==URX_LBN_END) {
3385                         depth--;
3386                         if (depth == 0) {
3387                             break;
3388                         }
3389                     }
3390                     if (URX_TYPE(op) == URX_STATE_SAVE) {
3391                         // Need this because neg lookahead blocks will FAIL to outside
3392                         //   of the block.
3393                         int32_t  jmpDest = URX_VAL(op);
3394                         if (jmpDest > loc) {
3395                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
3396                                 forwardedLength.setElementAt(currentLen, jmpDest);
3397                             }
3398                         }
3399                     }
3400                     U_ASSERT(loc <= end);
3401                 }
3402             }
3403             break;
3404
3405         case URX_LA_END:
3406         case URX_LB_CONT:
3407         case URX_LB_END:
3408         case URX_LBN_CONT:
3409         case URX_LBN_END:
3410             // Only come here if the matching URX_LA_START or URX_LB_START was not in the
3411             //   range being sized, which happens when measuring size of look-behind blocks.
3412             break;
3413
3414         default:
3415             U_ASSERT(FALSE);
3416             }
3417
3418         }
3419
3420     // We have finished walking through the ops.  Check whether some forward jump
3421     //   propagated a shorter length to location end+1.
3422     if (forwardedLength.elementAti(end+1) < currentLen) {
3423         currentLen = forwardedLength.elementAti(end+1);
3424         U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3425     }
3426
3427     return currentLen;
3428 }
3429
3430 // Increment with overflow check.
3431 // val and delta will both be positive.
3432
3433 static int32_t safeIncrement(int32_t val, int32_t delta) {
3434     if (INT32_MAX - val > delta) {
3435         return val + delta;
3436     } else {
3437         return INT32_MAX;
3438     }
3439 }
3440
3441
3442 //------------------------------------------------------------------------------
3443 //
3444 //   maxMatchLength    Calculate the length of the longest string that could
3445 //                     match the specified pattern.
3446 //                     Length is in 16 bit code units, not code points.
3447 //
3448 //                     The calculated length may not be exact.  The returned
3449 //                     value may be longer than the actual maximum; it must
3450 //                     never be shorter.
3451 //
3452 //------------------------------------------------------------------------------
3453 int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
3454     if (U_FAILURE(*fStatus)) {
3455         return 0;
3456     }
3457     U_ASSERT(start <= end);
3458     U_ASSERT(end < fRXPat->fCompiledPat->size());
3459
3460
3461     int32_t    loc;
3462     int32_t    op;
3463     int32_t    opType;
3464     int32_t    currentLen = 0;
3465     UVector32  forwardedLength(end+1, *fStatus);
3466     forwardedLength.setSize(end+1);
3467
3468     for (loc=start; loc<=end; loc++) {
3469         forwardedLength.setElementAt(0, loc);
3470     }
3471
3472     for (loc = start; loc<=end; loc++) {
3473         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3474         opType = URX_TYPE(op);
3475
3476         // The loop is advancing linearly through the pattern.
3477         // If the op we are now at was the destination of a branch in the pattern,
3478         // and that path has a longer maximum length than the current accumulated value,
3479         // replace the current accumulated value.
3480         if (forwardedLength.elementAti(loc) > currentLen) {
3481             currentLen = forwardedLength.elementAti(loc);
3482         }
3483
3484         switch (opType) {
3485             // Ops that don't change the total length matched
3486         case URX_RESERVED_OP:
3487         case URX_END:
3488         case URX_STRING_LEN:
3489         case URX_NOP:
3490         case URX_START_CAPTURE:
3491         case URX_END_CAPTURE:
3492         case URX_BACKSLASH_B:
3493         case URX_BACKSLASH_BU:
3494         case URX_BACKSLASH_G:
3495         case URX_BACKSLASH_Z:
3496         case URX_CARET:
3497         case URX_DOLLAR:
3498         case URX_DOLLAR_M:
3499         case URX_DOLLAR_D:
3500         case URX_DOLLAR_MD:
3501         case URX_RELOC_OPRND:
3502         case URX_STO_INP_LOC:
3503         case URX_CARET_M:
3504         case URX_CARET_M_UNIX:
3505
3506         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3507         case URX_LD_SP:
3508
3509         case URX_LB_END:
3510         case URX_LB_CONT:
3511         case URX_LBN_CONT:
3512         case URX_LBN_END:
3513             break;
3514
3515
3516             // Ops that increase that cause an unbounded increase in the length
3517             //   of a matched string, or that increase it a hard to characterize way.
3518             //   Call the max length unbounded, and stop further checking.
3519         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3520         case URX_BACKREF_I:
3521         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
3522             currentLen = INT32_MAX;
3523             break;
3524
3525
3526             // Ops that match a max of one character (possibly two 16 bit code units.)
3527             //
3528         case URX_STATIC_SETREF:
3529         case URX_STAT_SETREF_N:
3530         case URX_SETREF:
3531         case URX_BACKSLASH_D:
3532         case URX_BACKSLASH_H:
3533         case URX_BACKSLASH_R:
3534         case URX_BACKSLASH_V:
3535         case URX_ONECHAR_I:
3536         case URX_DOTANY_ALL:
3537         case URX_DOTANY:
3538         case URX_DOTANY_UNIX:
3539             currentLen = safeIncrement(currentLen, 2);
3540             break;
3541
3542             // Single literal character.  Increase current max length by one or two,
3543             //       depending on whether the char is in the supplementary range.
3544         case URX_ONECHAR:
3545             currentLen = safeIncrement(currentLen, 1);
3546             if (URX_VAL(op) > 0x10000) {
3547                 currentLen = safeIncrement(currentLen, 1);
3548             }
3549             break;
3550
3551             // Jumps.
3552             //
3553         case URX_JMP:
3554         case URX_JMPX:
3555         case URX_JMP_SAV:
3556         case URX_JMP_SAV_X:
3557             {
3558                 int32_t  jmpDest = URX_VAL(op);
3559                 if (jmpDest < loc) {
3560                     // Loop of some kind.  Max match length is unbounded.
3561                     currentLen = INT32_MAX;
3562                 } else {
3563                     // Forward jump.  Propagate the current min length to the target loc of the jump.
3564                     if (forwardedLength.elementAti(jmpDest) < currentLen) {
3565                         forwardedLength.setElementAt(currentLen, jmpDest);
3566                     }
3567                     currentLen = 0;
3568                 }
3569             }
3570             break;
3571
3572         case URX_BACKTRACK:
3573             // back-tracks are kind of like a branch, except that the max length was
3574             //   propagated already, by the state save.
3575             currentLen = forwardedLength.elementAti(loc+1);
3576             break;
3577
3578
3579         case URX_STATE_SAVE:
3580             {
3581                 // State Save, for forward jumps, propagate the current minimum.
3582                 //               of the state save.
3583                 //             For backwards jumps, they create a loop, maximum
3584                 //               match length is unbounded.
3585                 int32_t  jmpDest = URX_VAL(op);
3586                 if (jmpDest > loc) {
3587                     if (currentLen > forwardedLength.elementAti(jmpDest)) {
3588                         forwardedLength.setElementAt(currentLen, jmpDest);
3589                     }
3590                 } else {
3591                     currentLen = INT32_MAX;
3592                 }
3593             }
3594             break;
3595
3596
3597
3598
3599         case URX_STRING:
3600             {
3601                 loc++;
3602                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3603                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3604                 break;
3605             }
3606
3607         case URX_STRING_I:
3608             // TODO:  This code assumes that any user string that matches will be no longer
3609             //        than our compiled string, with case insensitive matching.
3610             //        Our compiled string has been case-folded already.
3611             //
3612             //        Any matching user string will have no more code points than our
3613             //        compiled (folded) string.  Folding may add code points, but
3614             //        not remove them.
3615             //
3616             //        There is a potential problem if a supplemental code point
3617             //        case-folds to a BMP code point.  In this case our compiled string
3618             //        could be shorter (in code units) than a matching user string.
3619             //
3620             //        At this time (Unicode 6.1) there are no such characters, and this case
3621             //        is not being handled.  A test, intltest regex/Bug9283, will fail if
3622             //        any problematic characters are added to Unicode.
3623             //
3624             //        If this happens, we can make a set of the BMP chars that the
3625             //        troublesome supplementals fold to, scan our string, and bump the
3626             //        currentLen one extra for each that is found.
3627             //
3628             {
3629                 loc++;
3630                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3631                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3632             }
3633             break;
3634
3635         case URX_CTR_INIT:
3636         case URX_CTR_INIT_NG:
3637             // For Loops, recursively call this function on the pattern for the loop body,
3638             //   then multiply the result by the maximum loop count.
3639             {
3640                 int32_t  loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
3641                 if (loopEndLoc == loc+4) {
3642                     // Loop has an empty body. No affect on max match length.
3643                     // Continue processing with code after the loop end.
3644                     loc = loopEndLoc;
3645                     break;
3646                 }
3647
3648                 int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3));
3649                 if (maxLoopCount == -1) {
3650                     // Unbounded Loop. No upper bound on match length.
3651                     currentLen = INT32_MAX;
3652                     break;
3653                 }
3654
3655                 U_ASSERT(loopEndLoc >= loc+4);
3656                 int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1);  // Recursive call.
3657                 int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount;
3658                 if (updatedLen >= INT32_MAX) {
3659                     currentLen = INT32_MAX;
3660                     break;
3661                 }
3662                 currentLen = (int32_t)updatedLen;
3663                 loc = loopEndLoc;
3664                 break;
3665             }
3666
3667         case URX_CTR_LOOP:
3668         case URX_CTR_LOOP_NG:
3669             // These opcodes will be skipped over by code for URX_CRT_INIT.
3670             // We shouldn't encounter them here.
3671             U_ASSERT(FALSE);
3672             break;
3673
3674         case URX_LOOP_SR_I:
3675         case URX_LOOP_DOT_I:
3676         case URX_LOOP_C:
3677             // For anything to do with loops, make the match length unbounded.
3678             currentLen = INT32_MAX;
3679             break;
3680
3681
3682
3683         case URX_LA_START:
3684         case URX_LA_END:
3685             // Look-ahead.  Just ignore, treat the look-ahead block as if
3686             // it were normal pattern.  Gives a too-long match length,
3687             //  but good enough for now.
3688             break;
3689
3690             // End of look-ahead ops should always be consumed by the processing at
3691             //  the URX_LA_START op.
3692             // U_ASSERT(FALSE);
3693             // break;
3694
3695         case URX_LB_START:
3696             {
3697                 // Look-behind.  Scan forward until the matching look-around end,
3698                 //   without processing the look-behind block.
3699                 int32_t  depth = 0;
3700                 for (;;) {
3701                     loc++;
3702                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3703                     if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) {
3704                         depth++;
3705                     }
3706                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3707                         if (depth == 0) {
3708                             break;
3709                         }
3710                         depth--;
3711                     }
3712                     U_ASSERT(loc < end);
3713                 }
3714             }
3715             break;
3716
3717         default:
3718             U_ASSERT(FALSE);
3719         }
3720
3721
3722         if (currentLen == INT32_MAX) {
3723             //  The maximum length is unbounded.
3724             //  Stop further processing of the pattern.
3725             break;
3726         }
3727
3728     }
3729     return currentLen;
3730
3731 }
3732
3733
3734 //------------------------------------------------------------------------------
3735 //
3736 //   stripNOPs    Remove any NOP operations from the compiled pattern code.
3737 //                Extra NOPs are inserted for some constructs during the initial
3738 //                code generation to provide locations that may be patched later.
3739 //                Many end up unneeded, and are removed by this function.
3740 //
3741 //                In order to minimize the number of passes through the pattern,
3742 //                back-reference fixup is also performed here (adjusting
3743 //                back-reference operands to point to the correct frame offsets).
3744 //
3745 //------------------------------------------------------------------------------
3746 void RegexCompile::stripNOPs() {
3747
3748     if (U_FAILURE(*fStatus)) {
3749         return;
3750     }
3751
3752     int32_t    end = fRXPat->fCompiledPat->size();
3753     UVector32  deltas(end, *fStatus);
3754
3755     // Make a first pass over the code, computing the amount that things
3756     //   will be offset at each location in the original code.
3757     int32_t   loc;
3758     int32_t   d = 0;
3759     for (loc=0; loc<end; loc++) {
3760         deltas.addElement(d, *fStatus);
3761         int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3762         if (URX_TYPE(op) == URX_NOP) {
3763             d++;
3764         }
3765     }
3766
3767     UnicodeString caseStringBuffer;
3768
3769     // Make a second pass over the code, removing the NOPs by moving following
3770     //  code up, and patching operands that refer to code locations that
3771     //  are being moved.  The array of offsets from the first step is used
3772     //  to compute the new operand values.
3773     int32_t src;
3774     int32_t dst = 0;
3775     for (src=0; src<end; src++) {
3776         int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
3777         int32_t opType = URX_TYPE(op);
3778         switch (opType) {
3779         case URX_NOP:
3780             break;
3781
3782         case URX_STATE_SAVE:
3783         case URX_JMP:
3784         case URX_CTR_LOOP:
3785         case URX_CTR_LOOP_NG:
3786         case URX_RELOC_OPRND:
3787         case URX_JMPX:
3788         case URX_JMP_SAV:
3789         case URX_JMP_SAV_X:
3790             // These are instructions with operands that refer to code locations.
3791             {
3792                 int32_t  operandAddress = URX_VAL(op);
3793                 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
3794                 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3795                 op = buildOp(opType, fixedOperandAddress);
3796                 fRXPat->fCompiledPat->setElementAt(op, dst);
3797                 dst++;
3798                 break;
3799             }
3800
3801         case URX_BACKREF:
3802         case URX_BACKREF_I:
3803             {
3804                 int32_t where = URX_VAL(op);
3805                 if (where > fRXPat->fGroupMap->size()) {
3806                     error(U_REGEX_INVALID_BACK_REF);
3807                     break;
3808                 }
3809                 where = fRXPat->fGroupMap->elementAti(where-1);
3810                 op    = buildOp(opType, where);
3811                 fRXPat->fCompiledPat->setElementAt(op, dst);
3812                 dst++;
3813
3814                 fRXPat->fNeedsAltInput = TRUE;
3815                 break;
3816             }
3817         case URX_RESERVED_OP:
3818         case URX_RESERVED_OP_N:
3819         case URX_BACKTRACK:
3820         case URX_END:
3821         case URX_ONECHAR:
3822         case URX_STRING:
3823         case URX_STRING_LEN:
3824         case URX_START_CAPTURE:
3825         case URX_END_CAPTURE:
3826         case URX_STATIC_SETREF:
3827         case URX_STAT_SETREF_N:
3828         case URX_SETREF:
3829         case URX_DOTANY:
3830         case URX_FAIL:
3831         case URX_BACKSLASH_B:
3832         case URX_BACKSLASH_BU:
3833         case URX_BACKSLASH_G:
3834         case URX_BACKSLASH_X:
3835         case URX_BACKSLASH_Z:
3836         case URX_DOTANY_ALL:
3837         case URX_BACKSLASH_D:
3838         case URX_CARET:
3839         case URX_DOLLAR:
3840         case URX_CTR_INIT:
3841         case URX_CTR_INIT_NG:
3842         case URX_DOTANY_UNIX:
3843         case URX_STO_SP:
3844         case URX_LD_SP:
3845         case URX_STO_INP_LOC:
3846         case URX_LA_START:
3847         case URX_LA_END:
3848         case URX_ONECHAR_I:
3849         case URX_STRING_I:
3850         case URX_DOLLAR_M:
3851         case URX_CARET_M:
3852         case URX_CARET_M_UNIX:
3853         case URX_LB_START:
3854         case URX_LB_CONT:
3855         case URX_LB_END:
3856         case URX_LBN_CONT:
3857         case URX_LBN_END:
3858         case URX_LOOP_SR_I:
3859         case URX_LOOP_DOT_I:
3860         case URX_LOOP_C:
3861         case URX_DOLLAR_D:
3862         case URX_DOLLAR_MD:
3863         case URX_BACKSLASH_H:
3864         case URX_BACKSLASH_R:
3865         case URX_BACKSLASH_V:
3866             // These instructions are unaltered by the relocation.
3867             fRXPat->fCompiledPat->setElementAt(op, dst);
3868             dst++;
3869             break;
3870
3871         default:
3872             // Some op is unaccounted for.
3873             U_ASSERT(FALSE);
3874             error(U_REGEX_INTERNAL_ERROR);
3875         }
3876     }
3877
3878     fRXPat->fCompiledPat->setSize(dst);
3879 }
3880
3881
3882
3883
3884 //------------------------------------------------------------------------------
3885 //
3886 //  Error         Report a rule parse error.
3887 //                Only report it if no previous error has been recorded.
3888 //
3889 //------------------------------------------------------------------------------
3890 void RegexCompile::error(UErrorCode e) {
3891     if (U_SUCCESS(*fStatus)) {
3892         *fStatus = e;
3893         // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3894         // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3895         // int64_t. If the values of the latter are out of range for the former,
3896         // set them to the appropriate "field not supported" values.
3897         if (fLineNum > 0x7FFFFFFF) {
3898             fParseErr->line   = 0;
3899             fParseErr->offset = -1;
3900         } else if (fCharNum > 0x7FFFFFFF) {
3901             fParseErr->line   = (int32_t)fLineNum;
3902             fParseErr->offset = -1;
3903         } else {
3904             fParseErr->line   = (int32_t)fLineNum;
3905             fParseErr->offset = (int32_t)fCharNum;
3906         }
3907
3908         UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
3909
3910         // Fill in the context.
3911         //   Note: extractBetween() pins supplied indicies to the string bounds.
3912         uprv_memset(fParseErr->preContext,  0, sizeof(fParseErr->preContext));
3913         uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
3914         utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
3915         utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
3916     }
3917 }
3918
3919
3920 //
3921 //  Assorted Unicode character constants.
3922 //     Numeric because there is no portable way to enter them as literals.
3923 //     (Think EBCDIC).
3924 //
3925 static const UChar      chCR        = 0x0d;      // New lines, for terminating comments.
3926 static const UChar      chLF        = 0x0a;      // Line Feed
3927 static const UChar      chPound     = 0x23;      // '#', introduces a comment.
3928 static const UChar      chDigit0    = 0x30;      // '0'
3929 static const UChar      chDigit7    = 0x37;      // '9'
3930 static const UChar      chColon     = 0x3A;      // ':'
3931 static const UChar      chE         = 0x45;      // 'E'
3932 static const UChar      chQ         = 0x51;      // 'Q'
3933 //static const UChar      chN         = 0x4E;      // 'N'
3934 static const UChar      chP         = 0x50;      // 'P'
3935 static const UChar      chBackSlash = 0x5c;      // '\'  introduces a char escape
3936 //static const UChar      chLBracket  = 0x5b;      // '['
3937 static const UChar      chRBracket  = 0x5d;      // ']'
3938 static const UChar      chUp        = 0x5e;      // '^'
3939 static const UChar      chLowerP    = 0x70;
3940 static const UChar      chLBrace    = 0x7b;      // '{'
3941 static const UChar      chRBrace    = 0x7d;      // '}'
3942 static const UChar      chNEL       = 0x85;      //    NEL newline variant
3943 static const UChar      chLS        = 0x2028;    //    Unicode Line Separator
3944
3945
3946 //------------------------------------------------------------------------------
3947 //
3948 //  nextCharLL    Low Level Next Char from the regex pattern.
3949 //                Get a char from the string, keep track of input position
3950 //                     for error reporting.
3951 //
3952 //------------------------------------------------------------------------------
3953 UChar32  RegexCompile::nextCharLL() {
3954     UChar32       ch;
3955
3956     if (fPeekChar != -1) {
3957         ch = fPeekChar;
3958         fPeekChar = -1;
3959         return ch;
3960     }
3961
3962     // assume we're already in the right place
3963     ch = UTEXT_NEXT32(fRXPat->fPattern);
3964     if (ch == U_SENTINEL) {
3965         return ch;
3966     }
3967
3968     if (ch == chCR ||
3969         ch == chNEL ||
3970         ch == chLS   ||
3971         (ch == chLF && fLastChar != chCR)) {
3972         // Character is starting a new line.  Bump up the line number, and
3973         //  reset the column to 0.
3974         fLineNum++;
3975         fCharNum=0;
3976     }
3977     else {
3978         // Character is not starting a new line.  Except in the case of a
3979         //   LF following a CR, increment the column position.
3980         if (ch != chLF) {
3981             fCharNum++;
3982         }
3983     }
3984     fLastChar = ch;
3985     return ch;
3986 }
3987
3988 //------------------------------------------------------------------------------
3989 //
3990 //   peekCharLL    Low Level Character Scanning, sneak a peek at the next
3991 //                 character without actually getting it.
3992 //
3993 //------------------------------------------------------------------------------
3994 UChar32  RegexCompile::peekCharLL() {
3995     if (fPeekChar == -1) {
3996         fPeekChar = nextCharLL();
3997     }
3998     return fPeekChar;
3999 }
4000
4001
4002 //------------------------------------------------------------------------------
4003 //
4004 //   nextChar     for pattern scanning.  At this level, we handle stripping
4005 //                out comments and processing some backslash character escapes.
4006 //                The rest of the pattern grammar is handled at the next level up.
4007 //
4008 //------------------------------------------------------------------------------
4009 void RegexCompile::nextChar(RegexPatternChar &c) {
4010
4011     fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4012     c.fChar    = nextCharLL();
4013     c.fQuoted  = FALSE;
4014
4015     if (fQuoteMode) {
4016         c.fQuoted = TRUE;
4017         if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
4018             c.fChar == (UChar32)-1) {
4019             fQuoteMode = FALSE;  //  Exit quote mode,
4020             nextCharLL();        // discard the E
4021             nextChar(c);         // recurse to get the real next char
4022         }
4023     }
4024     else if (fInBackslashQuote) {
4025         // The current character immediately follows a '\'
4026         // Don't check for any further escapes, just return it as-is.
4027         // Don't set c.fQuoted, because that would prevent the state machine from
4028         //    dispatching on the character.
4029         fInBackslashQuote = FALSE;
4030     }
4031     else
4032     {
4033         // We are not in a \Q quoted region \E of the source.
4034         //
4035         if (fModeFlags & UREGEX_COMMENTS) {
4036             //
4037             // We are in free-spacing and comments mode.
4038             //  Scan through any white space and comments, until we
4039             //  reach a significant character or the end of inut.
4040             for (;;) {
4041                 if (c.fChar == (UChar32)-1) {
4042                     break;     // End of Input
4043                 }
4044                 if  (c.fChar == chPound && fEOLComments == TRUE) {
4045                     // Start of a comment.  Consume the rest of it, until EOF or a new line
4046                     for (;;) {
4047                         c.fChar = nextCharLL();
4048                         if (c.fChar == (UChar32)-1 ||  // EOF
4049                             c.fChar == chCR        ||
4050                             c.fChar == chLF        ||
4051                             c.fChar == chNEL       ||
4052                             c.fChar == chLS)       {
4053                             break;
4054                         }
4055                     }
4056                 }
4057                 // TODO:  check what Java & Perl do with non-ASCII white spaces.  Ticket 6061.
4058                 if (PatternProps::isWhiteSpace(c.fChar) == FALSE) {
4059                     break;
4060                 }
4061                 c.fChar = nextCharLL();
4062             }
4063         }
4064
4065         //
4066         //  check for backslash escaped characters.
4067         //
4068         if (c.fChar == chBackSlash) {
4069             int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4070             if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
4071                 //
4072                 // A '\' sequence that is handled by ICU's standard unescapeAt function.
4073                 //   Includes \uxxxx, \n, \r, many others.
4074                 //   Return the single equivalent character.
4075                 //
4076                 nextCharLL();                 // get & discard the peeked char.
4077                 c.fQuoted = TRUE;
4078
4079                 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
4080                     int32_t endIndex = (int32_t)pos;
4081                     c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
4082
4083                     if (endIndex == pos) {
4084                         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4085                     }
4086                     fCharNum += endIndex - pos;
4087                     UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
4088                 } else {
4089                     int32_t offset = 0;
4090                     struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
4091
4092                     UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
4093                     c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
4094
4095                     if (offset == 0) {
4096                         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4097                     } else if (context.lastOffset == offset) {
4098                         UTEXT_PREVIOUS32(fRXPat->fPattern);
4099                     } else if (context.lastOffset != offset-1) {
4100                         utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
4101                     }
4102                     fCharNum += offset;
4103                 }
4104             }
4105             else if (peekCharLL() == chDigit0) {
4106                 //  Octal Escape, using Java Regexp Conventions
4107                 //    which are \0 followed by 1-3 octal digits.
4108                 //    Different from ICU Unescape handling of Octal, which does not
4109                 //    require the leading 0.
4110                 //  Java also has the convention of only consuming 2 octal digits if
4111                 //    the three digit number would be > 0xff
4112                 //
4113                 c.fChar = 0;
4114                 nextCharLL();    // Consume the initial 0.
4115                 int index;
4116                 for (index=0; index<3; index++) {
4117                     int32_t ch = peekCharLL();
4118                     if (ch<chDigit0 || ch>chDigit7) {
4119                         if (index==0) {
4120                            // \0 is not followed by any octal digits.
4121                            error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4122                         }
4123                         break;
4124                     }
4125                     c.fChar <<= 3;
4126                     c.fChar += ch&7;
4127                     if (c.fChar <= 255) {
4128                         nextCharLL();
4129                     } else {
4130                         // The last digit made the number too big.  Forget we saw it.
4131                         c.fChar >>= 3;
4132                     }
4133                 }
4134                 c.fQuoted = TRUE;
4135             }
4136             else if (peekCharLL() == chQ) {
4137                 //  "\Q"  enter quote mode, which will continue until "\E"
4138                 fQuoteMode = TRUE;
4139                 nextCharLL();       // discard the 'Q'.
4140                 nextChar(c);        // recurse to get the real next char.
4141             }
4142             else
4143             {
4144                 // We are in a '\' escape that will be handled by the state table scanner.
4145                 // Just return the backslash, but remember that the following char is to
4146                 //  be taken literally.
4147                 fInBackslashQuote = TRUE;
4148             }
4149         }
4150     }
4151
4152     // re-enable # to end-of-line comments, in case they were disabled.
4153     // They are disabled by the parser upon seeing '(?', but this lasts for
4154     //  the fetching of the next character only.
4155     fEOLComments = TRUE;
4156
4157     // putc(c.fChar, stdout);
4158 }
4159
4160
4161
4162 //------------------------------------------------------------------------------
4163 //
4164 //  scanNamedChar
4165 //            Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4166 //
4167 //             The scan position will be at the 'N'.  On return
4168 //             the scan position should be just after the '}'
4169 //
4170 //             Return the UChar32
4171 //
4172 //------------------------------------------------------------------------------
4173 UChar32  RegexCompile::scanNamedChar() {
4174     if (U_FAILURE(*fStatus)) {
4175         return 0;
4176     }
4177
4178     nextChar(fC);
4179     if (fC.fChar != chLBrace) {
4180         error(U_REGEX_PROPERTY_SYNTAX);
4181         return 0;
4182     }
4183
4184     UnicodeString  charName;
4185     for (;;) {
4186         nextChar(fC);
4187         if (fC.fChar == chRBrace) {
4188             break;
4189         }
4190         if (fC.fChar == -1) {
4191             error(U_REGEX_PROPERTY_SYNTAX);
4192             return 0;
4193         }
4194         charName.append(fC.fChar);
4195     }
4196
4197     char name[100];
4198     if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
4199          (uint32_t)charName.length()>=sizeof(name)) {
4200         // All Unicode character names have only invariant characters.
4201         // The API to get a character, given a name, accepts only char *, forcing us to convert,
4202         //   which requires this error check
4203         error(U_REGEX_PROPERTY_SYNTAX);
4204         return 0;
4205     }
4206     charName.extract(0, charName.length(), name, sizeof(name), US_INV);
4207
4208     UChar32  theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
4209     if (U_FAILURE(*fStatus)) {
4210         error(U_REGEX_PROPERTY_SYNTAX);
4211     }
4212
4213     nextChar(fC);      // Continue overall regex pattern processing with char after the '}'
4214     return theChar;
4215 }
4216
4217 //------------------------------------------------------------------------------
4218 //
4219 //  scanProp   Construct a UnicodeSet from the text at the current scan
4220 //             position, which will be of the form \p{whaterver}
4221 //
4222 //             The scan position will be at the 'p' or 'P'.  On return
4223 //             the scan position should be just after the '}'
4224 //
4225 //             Return a UnicodeSet, constructed from the \P pattern,
4226 //             or NULL if the pattern is invalid.
4227 //
4228 //------------------------------------------------------------------------------
4229 UnicodeSet *RegexCompile::scanProp() {
4230     UnicodeSet    *uset = NULL;
4231
4232     if (U_FAILURE(*fStatus)) {
4233         return NULL;
4234     }
4235     (void)chLowerP;   // Suppress compiler unused variable warning.
4236     U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
4237     UBool negated = (fC.fChar == chP);
4238
4239     UnicodeString propertyName;
4240     nextChar(fC);
4241     if (fC.fChar != chLBrace) {
4242         error(U_REGEX_PROPERTY_SYNTAX);
4243         return NULL;
4244     }
4245     for (;;) {
4246         nextChar(fC);
4247         if (fC.fChar == chRBrace) {
4248             break;
4249         }
4250         if (fC.fChar == -1) {
4251             // Hit the end of the input string without finding the closing '}'
4252             error(U_REGEX_PROPERTY_SYNTAX);
4253             return NULL;
4254         }
4255         propertyName.append(fC.fChar);
4256     }
4257     uset = createSetForProperty(propertyName, negated);
4258     nextChar(fC);    // Move input scan to position following the closing '}'
4259     return uset;
4260 }
4261
4262 //------------------------------------------------------------------------------
4263 //
4264 //  scanPosixProp   Construct a UnicodeSet from the text at the current scan
4265 //             position, which is expected be of the form [:property expression:]
4266 //
4267 //             The scan position will be at the opening ':'.  On return
4268 //             the scan position must be on the closing ']'
4269 //
4270 //             Return a UnicodeSet constructed from the pattern,
4271 //             or NULL if this is not a valid POSIX-style set expression.
4272 //             If not a property expression, restore the initial scan position
4273 //                (to the opening ':')
4274 //
4275 //               Note:  the opening '[:' is not sufficient to guarantee that
4276 //                      this is a [:property:] expression.
4277 //                      [:'+=,] is a perfectly good ordinary set expression that
4278 //                              happens to include ':' as one of its characters.
4279 //
4280 //------------------------------------------------------------------------------
4281 UnicodeSet *RegexCompile::scanPosixProp() {
4282     UnicodeSet    *uset = NULL;
4283
4284     if (U_FAILURE(*fStatus)) {
4285         return NULL;
4286     }
4287
4288     U_ASSERT(fC.fChar == chColon);
4289
4290     // Save the scanner state.
4291     // TODO:  move this into the scanner, with the state encapsulated in some way.  Ticket 6062
4292     int64_t     savedScanIndex        = fScanIndex;
4293     int64_t     savedNextIndex        = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4294     UBool       savedQuoteMode        = fQuoteMode;
4295     UBool       savedInBackslashQuote = fInBackslashQuote;
4296     UBool       savedEOLComments      = fEOLComments;
4297     int64_t     savedLineNum          = fLineNum;
4298     int64_t     savedCharNum          = fCharNum;
4299     UChar32     savedLastChar         = fLastChar;
4300     UChar32     savedPeekChar         = fPeekChar;
4301     RegexPatternChar savedfC          = fC;
4302
4303     // Scan for a closing ].   A little tricky because there are some perverse
4304     //   edge cases possible.  "[:abc\Qdef:] \E]"  is a valid non-property expression,
4305     //   ending on the second closing ].
4306
4307     UnicodeString propName;
4308     UBool         negated  = FALSE;
4309
4310     // Check for and consume the '^' in a negated POSIX property, e.g.  [:^Letter:]
4311     nextChar(fC);
4312     if (fC.fChar == chUp) {
4313        negated = TRUE;
4314        nextChar(fC);
4315     }
4316
4317     // Scan for the closing ":]", collecting the property name along the way.
4318     UBool  sawPropSetTerminator = FALSE;
4319     for (;;) {
4320         propName.append(fC.fChar);
4321         nextChar(fC);
4322         if (fC.fQuoted || fC.fChar == -1) {
4323             // Escaped characters or end of input - either says this isn't a [:Property:]
4324             break;
4325         }
4326         if (fC.fChar == chColon) {
4327             nextChar(fC);
4328             if (fC.fChar == chRBracket) {
4329                 sawPropSetTerminator = TRUE;
4330             }
4331             break;
4332         }
4333     }
4334
4335     if (sawPropSetTerminator) {
4336         uset = createSetForProperty(propName, negated);
4337     }
4338     else
4339     {
4340         // No closing ":]".
4341         //  Restore the original scan position.
4342         //  The main scanner will retry the input as a normal set expression,
4343         //    not a [:Property:] expression.
4344         fScanIndex        = savedScanIndex;
4345         fQuoteMode        = savedQuoteMode;
4346         fInBackslashQuote = savedInBackslashQuote;
4347         fEOLComments      = savedEOLComments;
4348         fLineNum          = savedLineNum;
4349         fCharNum          = savedCharNum;
4350         fLastChar         = savedLastChar;
4351         fPeekChar         = savedPeekChar;
4352         fC                = savedfC;
4353         UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
4354     }
4355     return uset;
4356 }
4357
4358 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
4359     set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4360     addCategory(set, U_GC_CF_MASK, ec);
4361 }
4362
4363 //
4364 //  Create a Unicode Set from a Unicode Property expression.
4365 //     This is common code underlying both \p{...} ane [:...:] expressions.
4366 //     Includes trying the Java "properties" that aren't supported as
4367 //     normal ICU UnicodeSet properties
4368 //
4369 static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{"
4370 static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{"
4371 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
4372     UnicodeString   setExpr;
4373     UnicodeSet      *set;
4374     uint32_t        usetFlags = 0;
4375
4376     if (U_FAILURE(*fStatus)) {
4377         return NULL;
4378     }
4379
4380     //
4381     //  First try the property as we received it
4382     //
4383     if (negated) {
4384         setExpr.append(negSetPrefix, -1);
4385     } else {
4386         setExpr.append(posSetPrefix, -1);
4387     }
4388     setExpr.append(propName);
4389     setExpr.append(chRBrace);
4390     setExpr.append(chRBracket);
4391     if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
4392         usetFlags |= USET_CASE_INSENSITIVE;
4393     }
4394     set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
4395     if (U_SUCCESS(*fStatus)) {
4396        return set;
4397     }
4398     delete set;
4399     set = NULL;
4400
4401     //
4402     //  The property as it was didn't work.
4403
4404     //  Do [:word:]. It is not recognized as a property by UnicodeSet.  "word" not standard POSIX
4405     //     or standard Java, but many other regular expression packages do recognize it.
4406
4407     if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) {
4408         *fStatus = U_ZERO_ERROR;
4409         set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET]));
4410         if (set == NULL) {
4411             *fStatus = U_MEMORY_ALLOCATION_ERROR;
4412             return set;
4413         }
4414         if (negated) {
4415             set->complement();
4416         }
4417         return set;
4418     }
4419
4420
4421     //    Do Java fixes -
4422     //       InGreek -> InGreek or Coptic, that being the official Unicode name for that block.
4423     //       InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols.
4424     //
4425     //       Note on Spaces:  either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols"
4426     //                        is accepted by Java.  The property part of the name is compared
4427     //                        case-insenstively.  The spaces must be exactly as shown, either
4428     //                        all there, or all omitted, with exactly one at each position
4429     //                        if they are present.  From checking against JDK 1.6
4430     //
4431     //       This code should be removed when ICU properties support the Java  compatibility names
4432     //          (ICU 4.0?)
4433     //
4434     UnicodeString mPropName = propName;
4435     if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) {
4436         mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic");
4437     }
4438     if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbols"), 0) == 0 ||
4439         mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"), 0) == 0) {
4440         mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Symbols");
4441     }
4442     else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
4443         mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint");
4444     }
4445
4446     //    See if the property looks like a Java "InBlockName", which
4447     //    we will recast as "Block=BlockName"
4448     //
4449     static const UChar IN[] = {0x49, 0x6E, 0};  // "In"
4450     static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00};  // "Block="
4451     if (mPropName.startsWith(IN, 2) && propName.length()>=3) {
4452         setExpr.truncate(4);   // Leaves "[\p{", or "[\P{"
4453         setExpr.append(BLOCK, -1);
4454         setExpr.append(UnicodeString(mPropName, 2));  // Property with the leading "In" removed.
4455         setExpr.append(chRBrace);
4456         setExpr.append(chRBracket);
4457         *fStatus = U_ZERO_ERROR;
4458         set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
4459         if (U_SUCCESS(*fStatus)) {
4460             return set;
4461         }
4462         delete set;
4463         set = NULL;
4464     }
4465
4466     if (propName.startsWith(UNICODE_STRING_SIMPLE("java")) ||
4467         propName.compare(UNICODE_STRING_SIMPLE("all")) == 0)
4468     {
4469         UErrorCode localStatus = U_ZERO_ERROR;
4470         //setExpr.remove();
4471         set = new UnicodeSet();
4472         //
4473         //  Try the various Java specific properties.
4474         //   These all begin with "java"
4475         //
4476         if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDefined")) == 0) {
4477             addCategory(set, U_GC_CN_MASK, localStatus);
4478             set->complement();
4479         }
4480         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDigit")) == 0) {
4481             addCategory(set, U_GC_ND_MASK, localStatus);
4482         }
4483         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaIdentifierIgnorable")) == 0) {
4484             addIdentifierIgnorable(set, localStatus);
4485         }
4486         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaISOControl")) == 0) {
4487             set->add(0, 0x1F).add(0x7F, 0x9F);
4488         }
4489         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierPart")) == 0) {
4490             addCategory(set, U_GC_L_MASK, localStatus);
4491             addCategory(set, U_GC_SC_MASK, localStatus);
4492             addCategory(set, U_GC_PC_MASK, localStatus);
4493             addCategory(set, U_GC_ND_MASK, localStatus);
4494             addCategory(set, U_GC_NL_MASK, localStatus);
4495             addCategory(set, U_GC_MC_MASK, localStatus);
4496             addCategory(set, U_GC_MN_MASK, localStatus);
4497             addIdentifierIgnorable(set, localStatus);
4498         }
4499         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierStart")) == 0) {
4500             addCategory(set, U_GC_L_MASK, localStatus);
4501             addCategory(set, U_GC_NL_MASK, localStatus);
4502             addCategory(set, U_GC_SC_MASK, localStatus);
4503             addCategory(set, U_GC_PC_MASK, localStatus);
4504         }
4505         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetter")) == 0) {
4506             addCategory(set, U_GC_L_MASK, localStatus);
4507         }
4508         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetterOrDigit")) == 0) {
4509             addCategory(set, U_GC_L_MASK, localStatus);
4510             addCategory(set, U_GC_ND_MASK, localStatus);
4511         }
4512         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLowerCase")) == 0) {
4513             addCategory(set, U_GC_LL_MASK, localStatus);
4514         }
4515         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaMirrored")) == 0) {
4516             set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, localStatus);
4517         }
4518         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSpaceChar")) == 0) {
4519             addCategory(set, U_GC_Z_MASK, localStatus);
4520         }
4521         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSupplementaryCodePoint")) == 0) {
4522             set->add(0x10000, UnicodeSet::MAX_VALUE);
4523         }
4524         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaTitleCase")) == 0) {
4525             addCategory(set, U_GC_LT_MASK, localStatus);
4526         }
4527         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierStart")) == 0) {
4528             addCategory(set, U_GC_L_MASK, localStatus);
4529             addCategory(set, U_GC_NL_MASK, localStatus);
4530         }
4531         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierPart")) == 0) {
4532             addCategory(set, U_GC_L_MASK, localStatus);
4533             addCategory(set, U_GC_PC_MASK, localStatus);
4534             addCategory(set, U_GC_ND_MASK, localStatus);
4535             addCategory(set, U_GC_NL_MASK, localStatus);
4536             addCategory(set, U_GC_MC_MASK, localStatus);
4537             addCategory(set, U_GC_MN_MASK, localStatus);
4538             addIdentifierIgnorable(set, localStatus);
4539         }
4540         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUpperCase")) == 0) {
4541             addCategory(set, U_GC_LU_MASK, localStatus);
4542         }
4543         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaValidCodePoint")) == 0) {
4544             set->add(0, UnicodeSet::MAX_VALUE);
4545         }
4546         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaWhitespace")) == 0) {
4547             addCategory(set, U_GC_Z_MASK, localStatus);
4548             set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4549             set->add(9, 0x0d).add(0x1c, 0x1f);
4550         }
4551         else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
4552             set->add(0, UnicodeSet::MAX_VALUE);
4553         }
4554
4555         if (U_SUCCESS(localStatus) && !set->isEmpty()) {
4556             *fStatus = U_ZERO_ERROR;
4557             if (usetFlags & USET_CASE_INSENSITIVE) {
4558                 set->closeOver(USET_CASE_INSENSITIVE);
4559             }
4560             if (negated) {
4561                 set->complement();
4562             }
4563             return set;
4564         }
4565         delete set;
4566         set = NULL;
4567     }
4568     error(*fStatus);
4569     return NULL;
4570 }
4571
4572
4573
4574 //
4575 //  SetEval   Part of the evaluation of [set expressions].
4576 //            Perform any pending (stacked) operations with precedence
4577 //            equal or greater to that of the next operator encountered
4578 //            in the expression.
4579 //
4580 void RegexCompile::setEval(int32_t nextOp) {
4581     UnicodeSet *rightOperand = NULL;
4582     UnicodeSet *leftOperand  = NULL;
4583     for (;;) {
4584         U_ASSERT(fSetOpStack.empty()==FALSE);
4585         int32_t pendingSetOperation = fSetOpStack.peeki();
4586         if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
4587             break;
4588         }
4589         fSetOpStack.popi();
4590         U_ASSERT(fSetStack.empty() == FALSE);
4591         rightOperand = (UnicodeSet *)fSetStack.peek();
4592         switch (pendingSetOperation) {
4593             case setNegation:
4594                 rightOperand->complement();
4595                 break;
4596             case setCaseClose:
4597                 // TODO: need a simple close function.  Ticket 6065
4598                 rightOperand->closeOver(USET_CASE_INSENSITIVE);
4599                 rightOperand->removeAllStrings();
4600                 break;
4601             case setDifference1:
4602             case setDifference2:
4603                 fSetStack.pop();
4604                 leftOperand = (UnicodeSet *)fSetStack.peek();
4605                 leftOperand->removeAll(*rightOperand);
4606                 delete rightOperand;
4607                 break;
4608             case setIntersection1:
4609             case setIntersection2:
4610                 fSetStack.pop();
4611                 leftOperand = (UnicodeSet *)fSetStack.peek();
4612                 leftOperand->retainAll(*rightOperand);
4613                 delete rightOperand;
4614                 break;
4615             case setUnion:
4616                 fSetStack.pop();
4617                 leftOperand = (UnicodeSet *)fSetStack.peek();
4618                 leftOperand->addAll(*rightOperand);
4619                 delete rightOperand;
4620                 break;
4621             default:
4622                 U_ASSERT(FALSE);
4623                 break;
4624             }
4625         }
4626     }
4627
4628 void RegexCompile::setPushOp(int32_t op) {
4629     setEval(op);
4630     fSetOpStack.push(op, *fStatus);
4631     fSetStack.push(new UnicodeSet(), *fStatus);
4632 }
4633
4634 U_NAMESPACE_END
4635 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
4636