pcre/pcre_compile.cpp

   1 /* This is JavaScriptCore's variant of the PCRE library. While this library
   2 started out as a copy of PCRE, many of the features of PCRE have been
   3 removed. This library now supports only the regular expression features
   4 required by the JavaScript language specification, and has only the functions
   5 needed by JavaScriptCore and the rest of WebKit.
   6
   7                  Originally written by Philip Hazel
   8            Copyright (c) 1997-2006 University of Cambridge
   9     Copyright (C) 2002, 2004, 2006, 2007 Apple Inc. All rights reserved.
  10     Copyright (C) 2007 Eric Seidel <eric@webkit.org>
  11
  12 -----------------------------------------------------------------------------
  13 Redistribution and use in source and binary forms, with or without
  14 modification, are permitted provided that the following conditions are met:
  15
  16     * Redistributions of source code must retain the above copyright notice,
  17       this list of conditions and the following disclaimer.
  18
  19     * Redistributions in binary form must reproduce the above copyright
  20       notice, this list of conditions and the following disclaimer in the
  21       documentation and/or other materials provided with the distribution.
  22
  23     * Neither the name of the University of Cambridge nor the names of its
  24       contributors may be used to endorse or promote products derived from
  25       this software without specific prior written permission.
  26
  27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  37 POSSIBILITY OF SUCH DAMAGE.
  38 -----------------------------------------------------------------------------
  39 */
  40
  41 /* This module contains the external function jsRegExpExecute(), along with
  42 supporting internal functions that are not used by other modules. */
  43
  44 #include "config.h"
  45
  46 #include "pcre_internal.h"
  47
  48 #include <string.h>
  49 #include <wtf/ASCIICType.h>
  50 #include <wtf/FastMalloc.h>
  51
  52 using namespace WTF;
  53
  54 /* Negative values for the firstchar and reqchar variables */
  55
  56 #define REQ_UNSET (-2)
  57 #define REQ_NONE  (-1)
  58
  59 /*************************************************
  60 *      Code parameters and static tables         *
  61 *************************************************/
  62
  63 /* Maximum number of items on the nested bracket stacks at compile time. This
  64 applies to the nesting of all kinds of parentheses. It does not limit
  65 un-nested, non-capturing parentheses. This number can be made bigger if
  66 necessary - it is used to dimension one int and one unsigned char vector at
  67 compile time. */
  68
  69 #define BRASTACK_SIZE 200
  70
  71 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
  72 are simple data values; negative values are for special things like \d and so
  73 on. Zero means further processing is needed (for things like \x), or the escape
  74 is invalid. */
  75
  76 static const short escapes[] = {
  77      0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
  78      0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
  79    '@',      0, -ESC_B,      0, -ESC_D,      0,      0,      0,   /* @ - G */
  80      0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
  81      0,      0,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
  82      0,      0,      0,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
  83    '`',      7, -ESC_b,      0, -ESC_d,      0,   '\f',      0,   /* ` - g */
  84      0,      0,      0,      0,      0,      0,   '\n',      0,   /* h - o */
  85      0,      0,    '\r', -ESC_s,   '\t',      0,  '\v', -ESC_w,   /* p - w */
  86      0,      0,      0                                            /* x - z */
  87 };
  88
  89 /* Error code numbers. They are given names so that they can more easily be
  90 tracked. */
  91
  92 enum ErrorCode {
  93     ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
  94     ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17
  95 };
  96
  97 /* The texts of compile-time error messages. These are "char *" because they
  98 are passed to the outside world. */
  99
 100 static const char* errorText(ErrorCode code)
 101 {
 102     static const char errorTexts[] =
 103       /* 1 */
 104       "\\ at end of pattern\0"
 105       "\\c at end of pattern\0"
 106       "character value in \\x{...} sequence is too large\0"
 107       "numbers out of order in {} quantifier\0"
 108       /* 5 */
 109       "number too big in {} quantifier\0"
 110       "missing terminating ] for character class\0"
 111       "internal error: code overflow\0"
 112       "range out of order in character class\0"
 113       "nothing to repeat\0"
 114       /* 10 */
 115       "unmatched parentheses\0"
 116       "internal error: unexpected repeat\0"
 117       "unrecognized character after (?\0"
 118       "failed to get memory\0"
 119       "missing )\0"
 120       /* 15 */
 121       "reference to non-existent subpattern\0"
 122       "regular expression too large\0"
 123       "parentheses nested too deeply"
 124     ;
 125
 126     int i = code;
 127     const char* text = errorTexts;
 128     while (i > 1)
 129         i -= !*text++;
 130     return text;
 131 }
 132
 133 /* Structure for passing "static" information around between the functions
 134 doing the compiling. */
 135
 136 struct CompileData {
 137     CompileData() {
 138         top_backref = 0;
 139         backrefMap = 0;
 140         req_varyopt = 0;
 141         needOuterBracket = false;
 142         numCapturingBrackets = 0;
 143     }
 144     int top_backref;            /* Maximum back reference */
 145     unsigned backrefMap;       /* Bitmap of low back refs */
 146     int req_varyopt;            /* "After variable item" flag for reqbyte */
 147     bool needOuterBracket;
 148     int numCapturingBrackets;
 149 };
 150
 151 /* Definitions to allow mutual recursion */
 152
 153 static bool compileBracket(int, int*, unsigned char**, const UChar**, const UChar*, ErrorCode*, int, int*, int*, CompileData&);
 154 static bool bracketIsAnchored(const unsigned char* code);
 155 static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap);
 156 static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool inassert);
 157
 158 /*************************************************
 159 *            Handle escapes                      *
 160 *************************************************/
 161
 162 /* This function is called when a \ has been encountered. It either returns a
 163 positive value for a simple escape such as \n, or a negative value which
 164 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
 165 a positive value greater than 255 may be returned. On entry, ptr is pointing at
 166 the \. On exit, it is on the final character of the escape sequence.
 167
 168 Arguments:
 169   ptrptr         points to the pattern position pointer
 170   errorcodeptr   points to the errorcode variable
 171   bracount       number of previous extracting brackets
 172   options        the options bits
 173   isclass        true if inside a character class
 174
 175 Returns:         zero or positive => a data character
 176                  negative => a special escape sequence
 177                  on error, errorptr is set
 178 */
 179
 180 static int checkEscape(const UChar** ptrptr, const UChar* patternEnd, ErrorCode* errorcodeptr, int bracount, bool isclass)
 181 {
 182     const UChar* ptr = *ptrptr + 1;
 183
 184     /* If backslash is at the end of the pattern, it's an error. */
 185     if (ptr == patternEnd) {
 186         *errorcodeptr = ERR1;
 187         *ptrptr = ptr;
 188         return 0;
 189     }
 190
 191     int c = *ptr;
 192
 193     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
 194      a table. A non-zero result is something that can be returned immediately.
 195      Otherwise further processing may be required. */
 196
 197     if (c < '0' || c > 'z') { /* Not alphameric */
 198     } else if (int escapeValue = escapes[c - '0']) {
 199         c = escapeValue;
 200         if (isclass) {
 201             if (-c == ESC_b)
 202                 c = '\b'; /* \b is backslash in a class */
 203             else if (-c == ESC_B)
 204                 c = 'B'; /* and \B is a capital B in a class (in browsers event though ECMAScript 15.10.2.19 says it raises an error) */
 205         }
 206     /* Escapes that need further processing, or are illegal. */
 207
 208     } else {
 209         switch (c) {
 210             case '1':
 211             case '2':
 212             case '3':
 213             case '4':
 214             case '5':
 215             case '6':
 216             case '7':
 217             case '8':
 218             case '9':
 219                 /* Escape sequences starting with a non-zero digit are backreferences,
 220                  unless there are insufficient brackets, in which case they are octal
 221                  escape sequences. Those sequences end on the first non-octal character
 222                  or when we overflow 0-255, whichever comes first. */
 223
 224                 if (!isclass) {
 225                     const UChar* oldptr = ptr;
 226                     c -= '0';
 227                     while ((ptr + 1 < patternEnd) && isASCIIDigit(ptr[1]) && c <= bracount)
 228                         c = c * 10 + *(++ptr) - '0';
 229                     if (c <= bracount) {
 230                         c = -(ESC_REF + c);
 231                         break;
 232                     }
 233                     ptr = oldptr;      /* Put the pointer back and fall through */
 234                 }
 235
 236                 /* Handle an octal number following \. If the first digit is 8 or 9,
 237                  this is not octal. */
 238
 239                 if ((c = *ptr) >= '8')
 240                     break;
 241
 242             /* \0 always starts an octal number, but we may drop through to here with a
 243              larger first octal digit. */
 244
 245             case '0': {
 246                 c -= '0';
 247                 int i;
 248                 for (i = 1; i <= 2; ++i) {
 249                     if (ptr + i >= patternEnd || ptr[i] < '0' || ptr[i] > '7')
 250                         break;
 251                     int cc = c * 8 + ptr[i] - '0';
 252                     if (cc > 255)
 253                         break;
 254                     c = cc;
 255                 }
 256                 ptr += i - 1;
 257                 break;
 258             }
 259
 260             case 'x': {
 261                 c = 0;
 262                 int i;
 263                 for (i = 1; i <= 2; ++i) {
 264                     if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i])) {
 265                         c = 'x';
 266                         i = 1;
 267                         break;
 268                     }
 269                     int cc = ptr[i];
 270                     if (cc >= 'a')
 271                         cc -= 32;             /* Convert to upper case */
 272                     c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10));
 273                 }
 274                 ptr += i - 1;
 275                 break;
 276             }
 277
 278             case 'u': {
 279                 c = 0;
 280                 int i;
 281                 for (i = 1; i <= 4; ++i) {
 282                     if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i])) {
 283                         c = 'u';
 284                         i = 1;
 285                         break;
 286                     }
 287                     int cc = ptr[i];
 288                     if (cc >= 'a')
 289                         cc -= 32;             /* Convert to upper case */
 290                     c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10));
 291                 }
 292                 ptr += i - 1;
 293                 break;
 294             }
 295
 296             case 'c':
 297                 if (++ptr == patternEnd) {
 298                     *errorcodeptr = ERR2;
 299                     return 0;
 300                 }
 301                 c = *ptr;
 302
 303                 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
 304                  is ASCII-specific, but then the whole concept of \cx is ASCII-specific. */
 305                 c = toASCIIUpper(c) ^ 0x40;
 306                 break;
 307             }
 308     }
 309
 310     *ptrptr = ptr;
 311     return c;
 312 }
 313
 314 /*************************************************
 315 *            Check for counted repeat            *
 316 *************************************************/
 317
 318 /* This function is called when a '{' is encountered in a place where it might
 319 start a quantifier. It looks ahead to see if it really is a quantifier or not.
 320 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
 321 where the ddds are digits.
 322
 323 Arguments:
 324   p         pointer to the first char after '{'
 325
 326 Returns:    true or false
 327 */
 328
 329 static bool isCountedRepeat(const UChar* p, const UChar* patternEnd)
 330 {
 331     if (p >= patternEnd || !isASCIIDigit(*p))
 332         return false;
 333     p++;
 334     while (p < patternEnd && isASCIIDigit(*p))
 335         p++;
 336     if (p < patternEnd && *p == '}')
 337         return true;
 338
 339     if (p >= patternEnd || *p++ != ',')
 340         return false;
 341     if (p < patternEnd && *p == '}')
 342         return true;
 343
 344     if (p >= patternEnd || !isASCIIDigit(*p))
 345         return false;
 346     p++;
 347     while (p < patternEnd && isASCIIDigit(*p))
 348         p++;
 349
 350     return (p < patternEnd && *p == '}');
 351 }
 352
 353 /*************************************************
 354 *         Read repeat counts                     *
 355 *************************************************/
 356
 357 /* Read an item of the form {n,m} and return the values. This is called only
 358 after isCountedRepeat() has confirmed that a repeat-count quantifier exists,
 359 so the syntax is guaranteed to be correct, but we need to check the values.
 360
 361 Arguments:
 362   p              pointer to first char after '{'
 363   minp           pointer to int for min
 364   maxp           pointer to int for max
 365                  returned as -1 if no max
 366   errorcodeptr   points to error code variable
 367
 368 Returns:         pointer to '}' on success;
 369                  current ptr on error, with errorcodeptr set non-zero
 370 */
 371
 372 static const UChar* readRepeatCounts(const UChar* p, int* minp, int* maxp, ErrorCode* errorcodeptr)
 373 {
 374     int min = 0;
 375     int max = -1;
 376
 377     /* Read the minimum value and do a paranoid check: a negative value indicates
 378      an integer overflow. */
 379
 380     while (isASCIIDigit(*p))
 381         min = min * 10 + *p++ - '0';
 382     if (min < 0 || min > 65535) {
 383         *errorcodeptr = ERR5;
 384         return p;
 385     }
 386
 387     /* Read the maximum value if there is one, and again do a paranoid on its size.
 388      Also, max must not be less than min. */
 389
 390     if (*p == '}')
 391         max = min;
 392     else {
 393         if (*(++p) != '}') {
 394             max = 0;
 395             while (isASCIIDigit(*p))
 396                 max = max * 10 + *p++ - '0';
 397             if (max < 0 || max > 65535) {
 398                 *errorcodeptr = ERR5;
 399                 return p;
 400             }
 401             if (max < min) {
 402                 *errorcodeptr = ERR4;
 403                 return p;
 404             }
 405         }
 406     }
 407
 408     /* Fill in the required variables, and pass back the pointer to the terminating
 409      '}'. */
 410
 411     *minp = min;
 412     *maxp = max;
 413     return p;
 414 }
 415
 416 /*************************************************
 417 *      Find first significant op code            *
 418 *************************************************/
 419
 420 /* This is called by several functions that scan a compiled expression looking
 421 for a fixed first character, or an anchoring op code etc. It skips over things
 422 that do not influence this.
 423
 424 Arguments:
 425   code         pointer to the start of the group
 426 Returns:       pointer to the first significant opcode
 427 */
 428
 429 static const unsigned char* firstSignificantOpcode(const unsigned char* code)
 430 {
 431     while (*code == OP_BRANUMBER)
 432         code += 3;
 433     return code;
 434 }
 435
 436 static const unsigned char* firstSignificantOpcodeSkippingAssertions(const unsigned char* code)
 437 {
 438     while (true) {
 439         switch (*code) {
 440             case OP_ASSERT_NOT:
 441                 advanceToEndOfBracket(code);
 442                 code += 1 + LINK_SIZE;
 443                 break;
 444             case OP_WORD_BOUNDARY:
 445             case OP_NOT_WORD_BOUNDARY:
 446                 ++code;
 447                 break;
 448             case OP_BRANUMBER:
 449                 code += 3;
 450                 break;
 451             default:
 452                 return code;
 453         }
 454     }
 455 }
 456
 457 /*************************************************
 458 *           Get othercase range                  *
 459 *************************************************/
 460
 461 /* This function is passed the start and end of a class range, in UTF-8 mode
 462 with UCP support. It searches up the characters, looking for internal ranges of
 463 characters in the "other" case. Each call returns the next one, updating the
 464 start address.
 465
 466 Arguments:
 467   cptr        points to starting character value; updated
 468   d           end value
 469   ocptr       where to put start of othercase range
 470   odptr       where to put end of othercase range
 471
 472 Yield:        true when range returned; false when no more
 473 */
 474
 475 static bool getOthercaseRange(int* cptr, int d, int* ocptr, int* odptr)
 476 {
 477     int c, othercase = 0;
 478
 479     for (c = *cptr; c <= d; c++) {
 480         if ((othercase = kjs_pcre_ucp_othercase(c)) >= 0)
 481             break;
 482     }
 483
 484     if (c > d)
 485         return false;
 486
 487     *ocptr = othercase;
 488     int next = othercase + 1;
 489
 490     for (++c; c <= d; c++) {
 491         if (kjs_pcre_ucp_othercase(c) != next)
 492             break;
 493         next++;
 494     }
 495
 496     *odptr = next - 1;
 497     *cptr = c;
 498
 499     return true;
 500 }
 501
 502 /*************************************************
 503  *       Convert character value to UTF-8         *
 504  *************************************************/
 505
 506 /* This function takes an integer value in the range 0 - 0x7fffffff
 507  and encodes it as a UTF-8 character in 0 to 6 bytes.
 508
 509  Arguments:
 510  cvalue     the character value
 511  buffer     pointer to buffer for result - at least 6 bytes long
 512
 513  Returns:     number of characters placed in the buffer
 514  */
 515
 516 static int encodeUTF8(int cvalue, unsigned char *buffer)
 517 {
 518     int i;
 519     for (i = 0; i < kjs_pcre_utf8_table1_size; i++)
 520         if (cvalue <= kjs_pcre_utf8_table1[i])
 521             break;
 522     buffer += i;
 523     for (int j = i; j > 0; j--) {
 524         *buffer-- = 0x80 | (cvalue & 0x3f);
 525         cvalue >>= 6;
 526     }
 527     *buffer = kjs_pcre_utf8_table2[i] | cvalue;
 528     return i + 1;
 529 }
 530
 531 /*************************************************
 532 *           Compile one branch                   *
 533 *************************************************/
 534
 535 /* Scan the pattern, compiling it into the code vector.
 536
 537 Arguments:
 538   options        the option bits
 539   brackets       points to number of extracting brackets used
 540   codeptr        points to the pointer to the current code point
 541   ptrptr         points to the current pattern pointer
 542   errorcodeptr   points to error code variable
 543   firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
 544   reqbyteptr     set to the last literal character required, else < 0
 545   cd             contains pointers to tables etc.
 546
 547 Returns:         true on success
 548                  false, with *errorcodeptr set non-zero on error
 549 */
 550
 551 static inline bool safelyCheckNextChar(const UChar* ptr, const UChar* patternEnd, UChar expected)
 552 {
 553     return ((ptr + 1 < patternEnd) && ptr[1] == expected);
 554 }
 555
 556 static bool
 557 compileBranch(int options, int* brackets, unsigned char** codeptr,
 558                const UChar** ptrptr, const UChar* patternEnd, ErrorCode* errorcodeptr, int *firstbyteptr,
 559                int* reqbyteptr, CompileData& cd)
 560 {
 561     int repeat_type, op_type;
 562     int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
 563     int bravalue = 0;
 564     int reqvary, tempreqvary;
 565     int c;
 566     unsigned char* code = *codeptr;
 567     unsigned char* tempcode;
 568     bool groupsetfirstbyte = false;
 569     const UChar* ptr = *ptrptr;
 570     const UChar* tempptr;
 571     unsigned char* previous = NULL;
 572     unsigned char classbits[32];
 573
 574     bool class_utf8;
 575     unsigned char* class_utf8data;
 576     unsigned char utf8_char[6];
 577
 578     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
 579      matching encountered yet". It gets changed to REQ_NONE if we hit something that
 580      matches a non-fixed char first char; reqbyte just remains unset if we never
 581      find one.
 582
 583      When we hit a repeat whose minimum is zero, we may have to adjust these values
 584      to take the zero repeat into account. This is implemented by setting them to
 585      zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
 586      item types that can be repeated set these backoff variables appropriately. */
 587
 588     int firstbyte = REQ_UNSET;
 589     int reqbyte = REQ_UNSET;
 590     int zeroreqbyte = REQ_UNSET;
 591     int zerofirstbyte = REQ_UNSET;
 592
 593     /* The variable req_caseopt contains either the REQ_IGNORE_CASE value or zero,
 594      according to the current setting of the ignores-case flag. REQ_IGNORE_CASE is a bit
 595      value > 255. It is added into the firstbyte or reqbyte variables to record the
 596      case status of the value. This is used only for ASCII characters. */
 597
 598     int req_caseopt = (options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0;
 599
 600     /* Switch on next character until the end of the branch */
 601
 602     for (;; ptr++) {
 603         bool negate_class;
 604         bool should_flip_negation; /* If a negative special such as \S is used, we should negate the whole class to properly support Unicode. */
 605         int class_charcount;
 606         int class_lastchar;
 607         int skipbytes;
 608         int subreqbyte;
 609         int subfirstbyte;
 610         int mclength;
 611         unsigned char mcbuffer[8];
 612
 613         /* Next byte in the pattern */
 614
 615         c = ptr < patternEnd ? *ptr : 0;
 616
 617         /* Fill in length of a previous callout, except when the next thing is
 618          a quantifier. */
 619
 620         bool is_quantifier = c == '*' || c == '+' || c == '?' || (c == '{' && isCountedRepeat(ptr + 1, patternEnd));
 621
 622         switch (c) {
 623             /* The branch terminates at end of string, |, or ). */
 624
 625             case 0:
 626                 if (ptr < patternEnd)
 627                     goto NORMAL_CHAR;
 628                 // End of string; fall through
 629             case '|':
 630             case ')':
 631                 *firstbyteptr = firstbyte;
 632                 *reqbyteptr = reqbyte;
 633                 *codeptr = code;
 634                 *ptrptr = ptr;
 635                 return true;
 636
 637             /* Handle single-character metacharacters. In multiline mode, ^ disables
 638              the setting of any following char as a first character. */
 639
 640             case '^':
 641                 if (options & MatchAcrossMultipleLinesOption) {
 642                     if (firstbyte == REQ_UNSET)
 643                         firstbyte = REQ_NONE;
 644                 }
 645                 previous = NULL;
 646                 *code++ = OP_CIRC;
 647                 break;
 648
 649             case '$':
 650                 previous = NULL;
 651                 *code++ = OP_DOLL;
 652                 break;
 653
 654             /* There can never be a first char if '.' is first, whatever happens about
 655              repeats. The value of reqbyte doesn't change either. */
 656
 657             case '.':
 658                 if (firstbyte == REQ_UNSET)
 659                     firstbyte = REQ_NONE;
 660                 zerofirstbyte = firstbyte;
 661                 zeroreqbyte = reqbyte;
 662                 previous = code;
 663                 *code++ = OP_NOT_NEWLINE;
 664                 break;
 665
 666             /* Character classes. If the included characters are all < 256, we build a
 667              32-byte bitmap of the permitted characters, except in the special case
 668              where there is only one such character. For negated classes, we build the
 669              map as usual, then invert it at the end. However, we use a different opcode
 670              so that data characters > 255 can be handled correctly.
 671
 672              If the class contains characters outside the 0-255 range, a different
 673              opcode is compiled. It may optionally have a bit map for characters < 256,
 674              but those above are are explicitly listed afterwards. A flag byte tells
 675              whether the bitmap is present, and whether this is a negated class or not.
 676              */
 677
 678             case '[': {
 679                 previous = code;
 680                 should_flip_negation = false;
 681
 682                 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
 683                  they are encountered at the top level, so we'll do that too. */
 684
 685                 /* If the first character is '^', set the negation flag and skip it. */
 686
 687                 if (ptr + 1 >= patternEnd) {
 688                     *errorcodeptr = ERR6;
 689                     return false;
 690                 }
 691
 692                 if (ptr[1] == '^') {
 693                     negate_class = true;
 694                     ++ptr;
 695                 } else
 696                     negate_class = false;
 697
 698                 /* Keep a count of chars with values < 256 so that we can optimize the case
 699                  of just a single character (as long as it's < 256). For higher valued UTF-8
 700                  characters, we don't yet do any optimization. */
 701
 702                 class_charcount = 0;
 703                 class_lastchar = -1;
 704
 705                 class_utf8 = false;                       /* No chars >= 256 */
 706                 class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */
 707
 708                 /* Initialize the 32-char bit map to all zeros. We have to build the
 709                  map in a temporary bit of store, in case the class contains only 1
 710                  character (< 256), because in that case the compiled code doesn't use the
 711                  bit map. */
 712
 713                 memset(classbits, 0, 32 * sizeof(unsigned char));
 714
 715                 /* Process characters until ] is reached. The first pass
 716                  through the regex checked the overall syntax, so we don't need to be very
 717                  strict here. At the start of the loop, c contains the first byte of the
 718                  character. */
 719
 720                 while ((++ptr < patternEnd) && (c = *ptr) != ']') {
 721                     /* Backslash may introduce a single character, or it may introduce one
 722                      of the specials, which just set a flag. Escaped items are checked for
 723                      validity in the pre-compiling pass. The sequence \b is a special case.
 724                      Inside a class (and only there) it is treated as backspace. Elsewhere
 725                      it marks a word boundary. Other escapes have preset maps ready to
 726                      or into the one we are building. We assume they have more than one
 727                      character in them, so set class_charcount bigger than one. */
 728
 729                     if (c == '\\') {
 730                         c = checkEscape(&ptr, patternEnd, errorcodeptr, cd.numCapturingBrackets, true);
 731                         if (c < 0) {
 732                             class_charcount += 2;     /* Greater than 1 is what matters */
 733                             switch (-c) {
 734                                 case ESC_d:
 735                                     for (c = 0; c < 32; c++)
 736                                         classbits[c] |= classBitmapForChar(c + cbit_digit);
 737                                     continue;
 738
 739                                 case ESC_D:
 740                                     should_flip_negation = true;
 741                                     for (c = 0; c < 32; c++)
 742                                         classbits[c] |= ~classBitmapForChar(c + cbit_digit);
 743                                     continue;
 744
 745                                 case ESC_w:
 746                                     for (c = 0; c < 32; c++)
 747                                         classbits[c] |= classBitmapForChar(c + cbit_word);
 748                                     continue;
 749
 750                                 case ESC_W:
 751                                     should_flip_negation = true;
 752                                     for (c = 0; c < 32; c++)
 753                                         classbits[c] |= ~classBitmapForChar(c + cbit_word);
 754                                     continue;
 755
 756                                 case ESC_s:
 757                                     for (c = 0; c < 32; c++)
 758                                          classbits[c] |= classBitmapForChar(c + cbit_space);
 759                                     continue;
 760
 761                                 case ESC_S:
 762                                     should_flip_negation = true;
 763                                     for (c = 0; c < 32; c++)
 764                                          classbits[c] |= ~classBitmapForChar(c + cbit_space);
 765                                     continue;
 766
 767                                     /* Unrecognized escapes are faulted if PCRE is running in its
 768                                      strict mode. By default, for compatibility with Perl, they are
 769                                      treated as literals. */
 770
 771                                 default:
 772                                     c = *ptr;              /* The final character */
 773                                     class_charcount -= 2;  /* Undo the default count from above */
 774                             }
 775                         }
 776
 777                         /* Fall through if we have a single character (c >= 0). This may be
 778                          > 256 in UTF-8 mode. */
 779
 780                     }   /* End of backslash handling */
 781
 782                     /* A single character may be followed by '-' to form a range. However,
 783                      Perl does not permit ']' to be the end of the range. A '-' character
 784                      here is treated as a literal. */
 785
 786                     if ((ptr + 2 < patternEnd) && ptr[1] == '-' && ptr[2] != ']') {
 787                         ptr += 2;
 788
 789                         int d = *ptr;
 790
 791                         /* The second part of a range can be a single-character escape, but
 792                          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
 793                          in such circumstances. */
 794
 795                         if (d == '\\') {
 796                             const UChar* oldptr = ptr;
 797                             d = checkEscape(&ptr, patternEnd, errorcodeptr, cd.numCapturingBrackets, true);
 798
 799                             /* \X is literal X; any other special means the '-' was literal */
 800                             if (d < 0) {
 801                                 ptr = oldptr - 2;
 802                                 goto LONE_SINGLE_CHARACTER;  /* A few lines below */
 803                             }
 804                         }
 805
 806                         /* The check that the two values are in the correct order happens in
 807                          the pre-pass. Optimize one-character ranges */
 808
 809                         if (d == c)
 810                             goto LONE_SINGLE_CHARACTER;  /* A few lines below */
 811
 812                         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
 813                          matching, we have to use an XCLASS with extra data items. Caseless
 814                          matching for characters > 127 is available only if UCP support is
 815                          available. */
 816
 817                         if ((d > 255 || ((options & IgnoreCaseOption) && d > 127))) {
 818                             class_utf8 = true;
 819
 820                             /* With UCP support, we can find the other case equivalents of
 821                              the relevant characters. There may be several ranges. Optimize how
 822                              they fit with the basic range. */
 823
 824                             if (options & IgnoreCaseOption) {
 825                                 int occ, ocd;
 826                                 int cc = c;
 827                                 int origd = d;
 828                                 while (getOthercaseRange(&cc, origd, &occ, &ocd)) {
 829                                     if (occ >= c && ocd <= d)
 830                                         continue;  /* Skip embedded ranges */
 831
 832                                     if (occ < c  && ocd >= c - 1)        /* Extend the basic range */
 833                                     {                                  /* if there is overlap,   */
 834                                         c = occ;                           /* noting that if occ < c */
 835                                         continue;                          /* we can't have ocd > d  */
 836                                     }                                  /* because a subrange is  */
 837                                     if (ocd > d && occ <= d + 1)         /* always shorter than    */
 838                                     {                                  /* the basic range.       */
 839                                         d = ocd;
 840                                         continue;
 841                                     }
 842
 843                                     if (occ == ocd)
 844                                         *class_utf8data++ = XCL_SINGLE;
 845                                     else {
 846                                         *class_utf8data++ = XCL_RANGE;
 847                                         class_utf8data += encodeUTF8(occ, class_utf8data);
 848                                     }
 849                                     class_utf8data += encodeUTF8(ocd, class_utf8data);
 850                                 }
 851                             }
 852
 853                             /* Now record the original range, possibly modified for UCP caseless
 854                              overlapping ranges. */
 855
 856                             *class_utf8data++ = XCL_RANGE;
 857                             class_utf8data += encodeUTF8(c, class_utf8data);
 858                             class_utf8data += encodeUTF8(d, class_utf8data);
 859
 860                             /* With UCP support, we are done. Without UCP support, there is no
 861                              caseless matching for UTF-8 characters > 127; we can use the bit map
 862                              for the smaller ones. */
 863
 864                             continue;    /* With next character in the class */
 865                         }
 866
 867                         /* We use the bit map for all cases when not in UTF-8 mode; else
 868                          ranges that lie entirely within 0-127 when there is UCP support; else
 869                          for partial ranges without UCP support. */
 870
 871                         for (; c <= d; c++) {
 872                             classbits[c/8] |= (1 << (c&7));
 873                             if (options & IgnoreCaseOption) {
 874                                 int uc = flipCase(c);
 875                                 classbits[uc/8] |= (1 << (uc&7));
 876                             }
 877                             class_charcount++;                /* in case a one-char range */
 878                             class_lastchar = c;
 879                         }
 880
 881                         continue;   /* Go get the next char in the class */
 882                     }
 883
 884                     /* Handle a lone single character - we can get here for a normal
 885                      non-escape char, or after \ that introduces a single character or for an
 886                      apparent range that isn't. */
 887
 888                 LONE_SINGLE_CHARACTER:
 889
 890                     /* Handle a character that cannot go in the bit map */
 891
 892                     if ((c > 255 || ((options & IgnoreCaseOption) && c > 127))) {
 893                         class_utf8 = true;
 894                         *class_utf8data++ = XCL_SINGLE;
 895                         class_utf8data += encodeUTF8(c, class_utf8data);
 896
 897                         if (options & IgnoreCaseOption) {
 898                             int othercase;
 899                             if ((othercase = kjs_pcre_ucp_othercase(c)) >= 0) {
 900                                 *class_utf8data++ = XCL_SINGLE;
 901                                 class_utf8data += encodeUTF8(othercase, class_utf8data);
 902                             }
 903                         }
 904                     } else {
 905                         /* Handle a single-byte character */
 906                         classbits[c/8] |= (1 << (c&7));
 907                         if (options & IgnoreCaseOption) {
 908                             c = flipCase(c);
 909                             classbits[c/8] |= (1 << (c&7));
 910                         }
 911                         class_charcount++;
 912                         class_lastchar = c;
 913                     }
 914                 }
 915
 916                 /* If class_charcount is 1, we saw precisely one character whose value is
 917                  less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
 918                  can optimize the negative case only if there were no characters >= 128
 919                  because OP_NOT and the related opcodes like OP_NOTSTAR operate on
 920                  single-bytes only. This is an historical hangover. Maybe one day we can
 921                  tidy these opcodes to handle multi-byte characters.
 922
 923                  The optimization throws away the bit map. We turn the item into a
 924                  1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
 925                  that OP_NOT does not support multibyte characters. In the positive case, it
 926                  can cause firstbyte to be set. Otherwise, there can be no first char if
 927                  this item is first, whatever repeat count may follow. In the case of
 928                  reqbyte, save the previous value for reinstating. */
 929
 930                 if (class_charcount == 1 && (!class_utf8 && (!negate_class || class_lastchar < 128))) {
 931                     zeroreqbyte = reqbyte;
 932
 933                     /* The OP_NOT opcode works on one-byte characters only. */
 934
 935                     if (negate_class) {
 936                         if (firstbyte == REQ_UNSET)
 937                             firstbyte = REQ_NONE;
 938                         zerofirstbyte = firstbyte;
 939                         *code++ = OP_NOT;
 940                         *code++ = class_lastchar;
 941                         break;
 942                     }
 943
 944                     /* For a single, positive character, get the value into c, and
 945                      then we can handle this with the normal one-character code. */
 946
 947                     c = class_lastchar;
 948                     goto NORMAL_CHAR;
 949                 }       /* End of 1-char optimization */
 950
 951                 /* The general case - not the one-char optimization. If this is the first
 952                  thing in the branch, there can be no first char setting, whatever the
 953                  repeat count. Any reqbyte setting must remain unchanged after any kind of
 954                  repeat. */
 955
 956                 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
 957                 zerofirstbyte = firstbyte;
 958                 zeroreqbyte = reqbyte;
 959
 960                 /* If there are characters with values > 255, we have to compile an
 961                  extended class, with its own opcode. If there are no characters < 256,
 962                  we can omit the bitmap. */
 963
 964                 if (class_utf8 && !should_flip_negation) {
 965                     *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
 966                     *code++ = OP_XCLASS;
 967                     code += LINK_SIZE;
 968                     *code = negate_class? XCL_NOT : 0;
 969
 970                     /* If the map is required, install it, and move on to the end of
 971                      the extra data */
 972
 973                     if (class_charcount > 0) {
 974                         *code++ |= XCL_MAP;
 975                         memcpy(code, classbits, 32);
 976                         code = class_utf8data;
 977                     }
 978
 979                     /* If the map is not required, slide down the extra data. */
 980
 981                     else {
 982                         int len = class_utf8data - (code + 33);
 983                         memmove(code + 1, code + 33, len);
 984                         code += len + 1;
 985                     }
 986
 987                     /* Now fill in the complete length of the item */
 988
 989                     putLinkValue(previous + 1, code - previous);
 990                     break;   /* End of class handling */
 991                 }
 992
 993                 /* If there are no characters > 255, negate the 32-byte map if necessary,
 994                  and copy it into the code vector. If this is the first thing in the branch,
 995                  there can be no first char setting, whatever the repeat count. Any reqbyte
 996                  setting must remain unchanged after any kind of repeat. */
 997
 998                 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
 999                 if (negate_class)
1000                     for (c = 0; c < 32; c++)
1001                         code[c] = ~classbits[c];
1002                 else
1003                     memcpy(code, classbits, 32);
1004                 code += 32;
1005                 break;
1006             }
1007
1008             /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
1009              has been tested above. */
1010
1011             case '{':
1012                 if (!is_quantifier)
1013                     goto NORMAL_CHAR;
1014                 ptr = readRepeatCounts(ptr + 1, &repeat_min, &repeat_max, errorcodeptr);
1015                 if (*errorcodeptr)
1016                     goto FAILED;
1017                 goto REPEAT;
1018
1019             case '*':
1020                 repeat_min = 0;
1021                 repeat_max = -1;
1022                 goto REPEAT;
1023
1024             case '+':
1025                 repeat_min = 1;
1026                 repeat_max = -1;
1027                 goto REPEAT;
1028
1029             case '?':
1030                 repeat_min = 0;
1031                 repeat_max = 1;
1032
1033             REPEAT:
1034                 if (!previous) {
1035                     *errorcodeptr = ERR9;
1036                     goto FAILED;
1037                 }
1038
1039                 if (repeat_min == 0) {
1040                     firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
1041                     reqbyte = zeroreqbyte;        /* Ditto */
1042                 }
1043
1044                 /* Remember whether this is a variable length repeat */
1045
1046                 reqvary = (repeat_min == repeat_max) ? 0 : REQ_VARY;
1047
1048                 op_type = 0;                    /* Default single-char op codes */
1049
1050                 /* Save start of previous item, in case we have to move it up to make space
1051                  for an inserted OP_ONCE for the additional '+' extension. */
1052                 /* FIXME: Probably don't need this because we don't use OP_ONCE. */
1053
1054                 tempcode = previous;
1055
1056                 /* If the next character is '+', we have a possessive quantifier. This
1057                  implies greediness, whatever the setting of the PCRE_UNGREEDY option.
1058                  If the next character is '?' this is a minimizing repeat, by default,
1059                  but if PCRE_UNGREEDY is set, it works the other way round. We change the
1060                  repeat type to the non-default. */
1061
1062                 if (safelyCheckNextChar(ptr, patternEnd, '?')) {
1063                     repeat_type = 1;
1064                     ptr++;
1065                 } else
1066                     repeat_type = 0;
1067
1068                 /* If previous was a character match, abolish the item and generate a
1069                  repeat item instead. If a char item has a minumum of more than one, ensure
1070                  that it is set in reqbyte - it might not be if a sequence such as x{3} is
1071                  the first thing in a branch because the x will have gone into firstbyte
1072                  instead.  */
1073
1074                 if (*previous == OP_CHAR || *previous == OP_CHAR_IGNORING_CASE) {
1075                     /* Deal with UTF-8 characters that take up more than one byte. It's
1076                      easier to write this out separately than try to macrify it. Use c to
1077                      hold the length of the character in bytes, plus 0x80 to flag that it's a
1078                      length rather than a small character. */
1079
1080                     if (code[-1] & 0x80) {
1081                         unsigned char *lastchar = code - 1;
1082                         while((*lastchar & 0xc0) == 0x80)
1083                             lastchar--;
1084                         c = code - lastchar;            /* Length of UTF-8 character */
1085                         memcpy(utf8_char, lastchar, c); /* Save the char */
1086                         c |= 0x80;                      /* Flag c as a length */
1087                     }
1088                     else {
1089                         c = code[-1];
1090                         if (repeat_min > 1)
1091                             reqbyte = c | req_caseopt | cd.req_varyopt;
1092                     }
1093
1094                     goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
1095                 }
1096
1097                 else if (*previous == OP_ASCII_CHAR || *previous == OP_ASCII_LETTER_IGNORING_CASE) {
1098                     c = previous[1];
1099                     if (repeat_min > 1)
1100                         reqbyte = c | req_caseopt | cd.req_varyopt;
1101                     goto OUTPUT_SINGLE_REPEAT;
1102                 }
1103
1104                 /* If previous was a single negated character ([^a] or similar), we use
1105                  one of the special opcodes, replacing it. The code is shared with single-
1106                  character repeats by setting opt_type to add a suitable offset into
1107                  repeat_type. OP_NOT is currently used only for single-byte chars. */
1108
1109                 else if (*previous == OP_NOT) {
1110                     op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
1111                     c = previous[1];
1112                     goto OUTPUT_SINGLE_REPEAT;
1113                 }
1114
1115                 /* If previous was a character type match (\d or similar), abolish it and
1116                  create a suitable repeat item. The code is shared with single-character
1117                  repeats by setting op_type to add a suitable offset into repeat_type. */
1118
1119                 else if (*previous <= OP_NOT_NEWLINE) {
1120                     op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
1121                     c = *previous;
1122
1123                 OUTPUT_SINGLE_REPEAT:
1124                     int prop_type = -1;
1125                     int prop_value = -1;
1126
1127                     unsigned char* oldcode = code;
1128                     code = previous;                  /* Usually overwrite previous item */
1129
1130                     /* If the maximum is zero then the minimum must also be zero; Perl allows
1131                      this case, so we do too - by simply omitting the item altogether. */
1132
1133                     if (repeat_max == 0)
1134                         goto END_REPEAT;
1135
1136                     /* Combine the op_type with the repeat_type */
1137
1138                     repeat_type += op_type;
1139
1140                     /* A minimum of zero is handled either as the special case * or ?, or as
1141                      an UPTO, with the maximum given. */
1142
1143                     if (repeat_min == 0) {
1144                         if (repeat_max == -1)
1145                             *code++ = OP_STAR + repeat_type;
1146                         else if (repeat_max == 1)
1147                             *code++ = OP_QUERY + repeat_type;
1148                         else {
1149                             *code++ = OP_UPTO + repeat_type;
1150                             put2ByteValueAndAdvance(code, repeat_max);
1151                         }
1152                     }
1153
1154                     /* A repeat minimum of 1 is optimized into some special cases. If the
1155                      maximum is unlimited, we use OP_PLUS. Otherwise, the original item it
1156                      left in place and, if the maximum is greater than 1, we use OP_UPTO with
1157                      one less than the maximum. */
1158
1159                     else if (repeat_min == 1) {
1160                         if (repeat_max == -1)
1161                             *code++ = OP_PLUS + repeat_type;
1162                         else {
1163                             code = oldcode;                 /* leave previous item in place */
1164                             if (repeat_max == 1)
1165                                 goto END_REPEAT;
1166                             *code++ = OP_UPTO + repeat_type;
1167                             put2ByteValueAndAdvance(code, repeat_max - 1);
1168                         }
1169                     }
1170
1171                     /* The case {n,n} is just an EXACT, while the general case {n,m} is
1172                      handled as an EXACT followed by an UPTO. */
1173
1174                     else {
1175                         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
1176                         put2ByteValueAndAdvance(code, repeat_min);
1177
1178                         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
1179                          we have to insert the character for the previous code. For a repeated
1180                          Unicode property match, there are two extra bytes that define the
1181                          required property. In UTF-8 mode, long characters have their length in
1182                          c, with the 0x80 bit as a flag. */
1183
1184                         if (repeat_max < 0) {
1185                             if (c >= 128) {
1186                                 memcpy(code, utf8_char, c & 7);
1187                                 code += c & 7;
1188                             } else {
1189                                 *code++ = c;
1190                                 if (prop_type >= 0) {
1191                                     *code++ = prop_type;
1192                                     *code++ = prop_value;
1193                                 }
1194                             }
1195                             *code++ = OP_STAR + repeat_type;
1196                         }
1197
1198                         /* Else insert an UPTO if the max is greater than the min, again
1199                          preceded by the character, for the previously inserted code. */
1200
1201                         else if (repeat_max != repeat_min) {
1202                             if (c >= 128) {
1203                                 memcpy(code, utf8_char, c & 7);
1204                                 code += c & 7;
1205                             } else
1206                                 *code++ = c;
1207                             if (prop_type >= 0) {
1208                                 *code++ = prop_type;
1209                                 *code++ = prop_value;
1210                             }
1211                             repeat_max -= repeat_min;
1212                             *code++ = OP_UPTO + repeat_type;
1213                             put2ByteValueAndAdvance(code, repeat_max);
1214                         }
1215                     }
1216
1217                     /* The character or character type itself comes last in all cases. */
1218
1219                     if (c >= 128) {
1220                         memcpy(code, utf8_char, c & 7);
1221                         code += c & 7;
1222                     } else
1223                         *code++ = c;
1224
1225                     /* For a repeated Unicode property match, there are two extra bytes that
1226                      define the required property. */
1227
1228                     if (prop_type >= 0) {
1229                         *code++ = prop_type;
1230                         *code++ = prop_value;
1231                     }
1232                 }
1233
1234                 /* If previous was a character class or a back reference, we put the repeat
1235                  stuff after it, but just skip the item if the repeat was {0,0}. */
1236
1237                 else if (*previous == OP_CLASS ||
1238                          *previous == OP_NCLASS ||
1239                          *previous == OP_XCLASS ||
1240                          *previous == OP_REF)
1241                 {
1242                     if (repeat_max == 0) {
1243                         code = previous;
1244                         goto END_REPEAT;
1245                     }
1246
1247                     if (repeat_min == 0 && repeat_max == -1)
1248                         *code++ = OP_CRSTAR + repeat_type;
1249                     else if (repeat_min == 1 && repeat_max == -1)
1250                         *code++ = OP_CRPLUS + repeat_type;
1251                     else if (repeat_min == 0 && repeat_max == 1)
1252                         *code++ = OP_CRQUERY + repeat_type;
1253                     else {
1254                         *code++ = OP_CRRANGE + repeat_type;
1255                         put2ByteValueAndAdvance(code, repeat_min);
1256                         if (repeat_max == -1)
1257                             repeat_max = 0;  /* 2-byte encoding for max */
1258                         put2ByteValueAndAdvance(code, repeat_max);
1259                     }
1260                 }
1261
1262                 /* If previous was a bracket group, we may have to replicate it in certain
1263                  cases. */
1264
1265                 else if (*previous >= OP_BRA) {
1266                     int ketoffset = 0;
1267                     int len = code - previous;
1268                     unsigned char* bralink = NULL;
1269
1270                     /* If the maximum repeat count is unlimited, find the end of the bracket
1271                      by scanning through from the start, and compute the offset back to it
1272                      from the current code pointer. There may be an OP_OPT setting following
1273                      the final KET, so we can't find the end just by going back from the code
1274                      pointer. */
1275
1276                     if (repeat_max == -1) {
1277                         const unsigned char* ket = previous;
1278                         advanceToEndOfBracket(ket);
1279                         ketoffset = code - ket;
1280                     }
1281
1282                     /* The case of a zero minimum is special because of the need to stick
1283                      OP_BRAZERO in front of it, and because the group appears once in the
1284                      data, whereas in other cases it appears the minimum number of times. For
1285                      this reason, it is simplest to treat this case separately, as otherwise
1286                      the code gets far too messy. There are several special subcases when the
1287                      minimum is zero. */
1288
1289                     if (repeat_min == 0) {
1290                         /* If the maximum is also zero, we just omit the group from the output
1291                          altogether. */
1292
1293                         if (repeat_max == 0) {
1294                             code = previous;
1295                             goto END_REPEAT;
1296                         }
1297
1298                         /* If the maximum is 1 or unlimited, we just have to stick in the
1299                          BRAZERO and do no more at this point. However, we do need to adjust
1300                          any OP_RECURSE calls inside the group that refer to the group itself or
1301                          any internal group, because the offset is from the start of the whole
1302                          regex. Temporarily terminate the pattern while doing this. */
1303
1304                         if (repeat_max <= 1) {
1305                             *code = OP_END;
1306                             memmove(previous+1, previous, len);
1307                             code++;
1308                             *previous++ = OP_BRAZERO + repeat_type;
1309                         }
1310
1311                         /* If the maximum is greater than 1 and limited, we have to replicate
1312                          in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1313                          The first one has to be handled carefully because it's the original
1314                          copy, which has to be moved up. The remainder can be handled by code
1315                          that is common with the non-zero minimum case below. We have to
1316                          adjust the value of repeat_max, since one less copy is required. */
1317
1318                         else {
1319                             *code = OP_END;
1320                             memmove(previous + 2 + LINK_SIZE, previous, len);
1321                             code += 2 + LINK_SIZE;
1322                             *previous++ = OP_BRAZERO + repeat_type;
1323                             *previous++ = OP_BRA;
1324
1325                             /* We chain together the bracket offset fields that have to be
1326                              filled in later when the ends of the brackets are reached. */
1327
1328                             int offset = (!bralink) ? 0 : previous - bralink;
1329                             bralink = previous;
1330                             putLinkValueAllowZeroAndAdvance(previous, offset);
1331                         }
1332
1333                         repeat_max--;
1334                     }
1335
1336                     /* If the minimum is greater than zero, replicate the group as many
1337                      times as necessary, and adjust the maximum to the number of subsequent
1338                      copies that we need. If we set a first char from the group, and didn't
1339                      set a required char, copy the latter from the former. */
1340
1341                     else {
1342                         if (repeat_min > 1) {
1343                             if (groupsetfirstbyte && reqbyte < 0)
1344                                 reqbyte = firstbyte;
1345                             for (int i = 1; i < repeat_min; i++) {
1346                                 memcpy(code, previous, len);
1347                                 code += len;
1348                             }
1349                         }
1350                         if (repeat_max > 0)
1351                             repeat_max -= repeat_min;
1352                     }
1353
1354                     /* This code is common to both the zero and non-zero minimum cases. If
1355                      the maximum is limited, it replicates the group in a nested fashion,
1356                      remembering the bracket starts on a stack. In the case of a zero minimum,
1357                      the first one was set up above. In all cases the repeat_max now specifies
1358                      the number of additional copies needed. */
1359
1360                     if (repeat_max >= 0) {
1361                         for (int i = repeat_max - 1; i >= 0; i--) {
1362                             *code++ = OP_BRAZERO + repeat_type;
1363
1364                             /* All but the final copy start a new nesting, maintaining the
1365                              chain of brackets outstanding. */
1366
1367                             if (i != 0) {
1368                                 *code++ = OP_BRA;
1369                                 int offset = (!bralink) ? 0 : code - bralink;
1370                                 bralink = code;
1371                                 putLinkValueAllowZeroAndAdvance(code, offset);
1372                             }
1373
1374                             memcpy(code, previous, len);
1375                             code += len;
1376                         }
1377
1378                         /* Now chain through the pending brackets, and fill in their length
1379                          fields (which are holding the chain links pro tem). */
1380
1381                         while (bralink) {
1382                             int offset = code - bralink + 1;
1383                             unsigned char* bra = code - offset;
1384                             int oldlinkoffset = getLinkValueAllowZero(bra + 1);
1385                             bralink = (!oldlinkoffset) ? 0 : bralink - oldlinkoffset;
1386                             *code++ = OP_KET;
1387                             putLinkValueAndAdvance(code, offset);
1388                             putLinkValue(bra + 1, offset);
1389                         }
1390                     }
1391
1392                     /* If the maximum is unlimited, set a repeater in the final copy. We
1393                      can't just offset backwards from the current code point, because we
1394                      don't know if there's been an options resetting after the ket. The
1395                      correct offset was computed above. */
1396
1397                     else
1398                         code[-ketoffset] = OP_KETRMAX + repeat_type;
1399                 }
1400
1401                 /* Else there's some kind of shambles */
1402
1403                 else {
1404                     *errorcodeptr = ERR11;
1405                     goto FAILED;
1406                 }
1407
1408                 /* In all case we no longer have a previous item. We also set the
1409                  "follows varying string" flag for subsequently encountered reqbytes if
1410                  it isn't already set and we have just passed a varying length item. */
1411
1412             END_REPEAT:
1413                 previous = NULL;
1414                 cd.req_varyopt |= reqvary;
1415                 break;
1416
1417             /* Start of nested bracket sub-expression, or comment or lookahead or
1418              lookbehind or option setting or condition. First deal with special things
1419              that can come after a bracket; all are introduced by ?, and the appearance
1420              of any of them means that this is not a referencing group. They were
1421              checked for validity in the first pass over the string, so we don't have to
1422              check for syntax errors here.  */
1423
1424             case '(':
1425                 skipbytes = 0;
1426
1427                 if (*(++ptr) == '?') {
1428                     switch (*(++ptr)) {
1429                         case ':':                 /* Non-extracting bracket */
1430                             bravalue = OP_BRA;
1431                             ptr++;
1432                             break;
1433
1434                         case '=':                 /* Positive lookahead */
1435                             bravalue = OP_ASSERT;
1436                             ptr++;
1437                             break;
1438
1439                         case '!':                 /* Negative lookahead */
1440                             bravalue = OP_ASSERT_NOT;
1441                             ptr++;
1442                             break;
1443
1444                         /* Character after (? not specially recognized */
1445
1446                         default:
1447                             *errorcodeptr = ERR12;
1448                             goto FAILED;
1449                         }
1450                 }
1451
1452                 /* Else we have a referencing group; adjust the opcode. If the bracket
1453                  number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
1454                  arrange for the true number to follow later, in an OP_BRANUMBER item. */
1455
1456                 else {
1457                     if (++(*brackets) > EXTRACT_BASIC_MAX) {
1458                         bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
1459                         code[1 + LINK_SIZE] = OP_BRANUMBER;
1460                         put2ByteValue(code + 2 + LINK_SIZE, *brackets);
1461                         skipbytes = 3;
1462                     }
1463                     else
1464                         bravalue = OP_BRA + *brackets;
1465                 }
1466
1467                 /* Process nested bracketed re. Assertions may not be repeated, but other
1468                  kinds can be. We copy code into a non-variable in order to be able
1469                  to pass its address because some compilers complain otherwise. Pass in a
1470                  new setting for the ims options if they have changed. */
1471
1472                 previous = (bravalue >= OP_BRAZERO) ? code : 0;
1473                 *code = bravalue;
1474                 tempcode = code;
1475                 tempreqvary = cd.req_varyopt;     /* Save value before bracket */
1476
1477                 if (!compileBracket(
1478                                    options,
1479                                    brackets,                     /* Extracting bracket count */
1480                                    &tempcode,                    /* Where to put code (updated) */
1481                                    &ptr,                         /* Input pointer (updated) */
1482                                    patternEnd,
1483                                    errorcodeptr,                 /* Where to put an error message */
1484                                    skipbytes,                    /* Skip over OP_BRANUMBER */
1485                                    &subfirstbyte,                /* For possible first char */
1486                                    &subreqbyte,                  /* For possible last char */
1487                                    cd))                          /* Tables block */
1488                     goto FAILED;
1489
1490                 /* At the end of compiling, code is still pointing to the start of the
1491                  group, while tempcode has been updated to point past the end of the group
1492                  and any option resetting that may follow it. The pattern pointer (ptr)
1493                  is on the bracket. */
1494
1495                 /* Handle updating of the required and first characters. Update for normal
1496                  brackets of all kinds, and conditions with two branches (see code above).
1497                  If the bracket is followed by a quantifier with zero repeat, we have to
1498                  back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
1499                  main loop so that they can be accessed for the back off. */
1500
1501                 zeroreqbyte = reqbyte;
1502                 zerofirstbyte = firstbyte;
1503                 groupsetfirstbyte = false;
1504
1505                 if (bravalue >= OP_BRA) {
1506                     /* If we have not yet set a firstbyte in this branch, take it from the
1507                      subpattern, remembering that it was set here so that a repeat of more
1508                      than one can replicate it as reqbyte if necessary. If the subpattern has
1509                      no firstbyte, set "none" for the whole branch. In both cases, a zero
1510                      repeat forces firstbyte to "none". */
1511
1512                     if (firstbyte == REQ_UNSET) {
1513                         if (subfirstbyte >= 0) {
1514                             firstbyte = subfirstbyte;
1515                             groupsetfirstbyte = true;
1516                         }
1517                         else
1518                             firstbyte = REQ_NONE;
1519                         zerofirstbyte = REQ_NONE;
1520                     }
1521
1522                     /* If firstbyte was previously set, convert the subpattern's firstbyte
1523                      into reqbyte if there wasn't one, using the vary flag that was in
1524                      existence beforehand. */
1525
1526                     else if (subfirstbyte >= 0 && subreqbyte < 0)
1527                         subreqbyte = subfirstbyte | tempreqvary;
1528
1529                     /* If the subpattern set a required byte (or set a first byte that isn't
1530                      really the first byte - see above), set it. */
1531
1532                     if (subreqbyte >= 0)
1533                         reqbyte = subreqbyte;
1534                 }
1535
1536                 /* For a forward assertion, we take the reqbyte, if set. This can be
1537                  helpful if the pattern that follows the assertion doesn't set a different
1538                  char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
1539                  for an assertion, however because it leads to incorrect effect for patterns
1540                  such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
1541                  of a firstbyte. This is overcome by a scan at the end if there's no
1542                  firstbyte, looking for an asserted first char. */
1543
1544                 else if (bravalue == OP_ASSERT && subreqbyte >= 0)
1545                     reqbyte = subreqbyte;
1546
1547                 /* Now update the main code pointer to the end of the group. */
1548
1549                 code = tempcode;
1550
1551                 /* Error if hit end of pattern */
1552
1553                 if (ptr >= patternEnd || *ptr != ')') {
1554                     *errorcodeptr = ERR14;
1555                     goto FAILED;
1556                 }
1557                 break;
1558
1559             /* Check \ for being a real metacharacter; if not, fall through and handle
1560              it as a data character at the start of a string. Escape items are checked
1561              for validity in the pre-compiling pass. */
1562
1563             case '\\':
1564                 tempptr = ptr;
1565                 c = checkEscape(&ptr, patternEnd, errorcodeptr, cd.numCapturingBrackets, false);
1566
1567                 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1568                  are arranged to be the negation of the corresponding OP_values. For the
1569                  back references, the values are ESC_REF plus the reference number. Only
1570                  back references and those types that consume a character may be repeated.
1571                  We can test for values between ESC_b and ESC_w for the latter; this may
1572                  have to change if any new ones are ever created. */
1573
1574                 if (c < 0) {
1575                     /* For metasequences that actually match a character, we disable the
1576                      setting of a first character if it hasn't already been set. */
1577
1578                     if (firstbyte == REQ_UNSET && -c > ESC_b && -c <= ESC_w)
1579                         firstbyte = REQ_NONE;
1580
1581                     /* Set values to reset to if this is followed by a zero repeat. */
1582
1583                     zerofirstbyte = firstbyte;
1584                     zeroreqbyte = reqbyte;
1585
1586                     /* Back references are handled specially */
1587
1588                     if (-c >= ESC_REF) {
1589                         int number = -c - ESC_REF;
1590                         previous = code;
1591                         *code++ = OP_REF;
1592                         put2ByteValueAndAdvance(code, number);
1593                     }
1594
1595                     /* For the rest, we can obtain the OP value by negating the escape
1596                      value */
1597
1598                     else {
1599                         previous = (-c > ESC_b && -c <= ESC_w) ? code : NULL;
1600                         *code++ = -c;
1601                     }
1602                     continue;
1603                 }
1604
1605                 /* Fall through. */
1606
1607                 /* Handle a literal character. It is guaranteed not to be whitespace or #
1608                  when the extended flag is set. If we are in UTF-8 mode, it may be a
1609                  multi-byte literal character. */
1610
1611                 default:
1612             NORMAL_CHAR:
1613
1614                 previous = code;
1615
1616                 if (c < 128) {
1617                     mclength = 1;
1618                     mcbuffer[0] = c;
1619
1620                     if ((options & IgnoreCaseOption) && (c | 0x20) >= 'a' && (c | 0x20) <= 'z') {
1621                         *code++ = OP_ASCII_LETTER_IGNORING_CASE;
1622                         *code++ = c | 0x20;
1623                     } else {
1624                         *code++ = OP_ASCII_CHAR;
1625                         *code++ = c;
1626                     }
1627                 } else {
1628                     mclength = encodeUTF8(c, mcbuffer);
1629
1630                     *code++ = (options & IgnoreCaseOption) ? OP_CHAR_IGNORING_CASE : OP_CHAR;
1631                     for (c = 0; c < mclength; c++)
1632                         *code++ = mcbuffer[c];
1633                 }
1634
1635                 /* Set the first and required bytes appropriately. If no previous first
1636                  byte, set it from this character, but revert to none on a zero repeat.
1637                  Otherwise, leave the firstbyte value alone, and don't change it on a zero
1638                  repeat. */
1639
1640                 if (firstbyte == REQ_UNSET) {
1641                     zerofirstbyte = REQ_NONE;
1642                     zeroreqbyte = reqbyte;
1643
1644                     /* If the character is more than one byte long, we can set firstbyte
1645                      only if it is not to be matched caselessly. */
1646
1647                     if (mclength == 1 || req_caseopt == 0) {
1648                         firstbyte = mcbuffer[0] | req_caseopt;
1649                         if (mclength != 1)
1650                             reqbyte = code[-1] | cd.req_varyopt;
1651                     }
1652                     else
1653                         firstbyte = reqbyte = REQ_NONE;
1654                 }
1655
1656                 /* firstbyte was previously set; we can set reqbyte only the length is
1657                  1 or the matching is caseful. */
1658
1659                 else {
1660                     zerofirstbyte = firstbyte;
1661                     zeroreqbyte = reqbyte;
1662                     if (mclength == 1 || req_caseopt == 0)
1663                         reqbyte = code[-1] | req_caseopt | cd.req_varyopt;
1664                 }
1665
1666                 break;            /* End of literal character handling */
1667         }
1668     }                   /* end of big loop */
1669
1670     /* Control never reaches here by falling through, only by a goto for all the
1671      error states. Pass back the position in the pattern so that it can be displayed
1672      to the user for diagnosing the error. */
1673
1674 FAILED:
1675     *ptrptr = ptr;
1676     return false;
1677 }
1678
1679 /*************************************************
1680 *     Compile sequence of alternatives           *
1681 *************************************************/
1682
1683 /* On entry, ptr is pointing past the bracket character, but on return
1684 it points to the closing bracket, or vertical bar, or end of string.
1685 The code variable is pointing at the byte into which the BRA operator has been
1686 stored. If the ims options are changed at the start (for a (?ims: group) or
1687 during any branch, we need to insert an OP_OPT item at the start of every
1688 following branch to ensure they get set correctly at run time, and also pass
1689 the new options into every subsequent branch compile.
1690
1691 Argument:
1692   options        option bits, including any changes for this subpattern
1693   brackets       -> int containing the number of extracting brackets used
1694   codeptr        -> the address of the current code pointer
1695   ptrptr         -> the address of the current pattern pointer
1696   errorcodeptr   -> pointer to error code variable
1697   skipbytes      skip this many bytes at start (for OP_BRANUMBER)
1698   firstbyteptr   place to put the first required character, or a negative number
1699   reqbyteptr     place to put the last required character, or a negative number
1700   cd             points to the data block with tables pointers etc.
1701
1702 Returns:      true on success
1703 */
1704
1705 static bool
1706 compileBracket(int options, int* brackets, unsigned char** codeptr,
1707     const UChar** ptrptr, const UChar* patternEnd, ErrorCode* errorcodeptr, int skipbytes,
1708     int* firstbyteptr, int* reqbyteptr, CompileData& cd)
1709 {
1710     const UChar* ptr = *ptrptr;
1711     unsigned char* code = *codeptr;
1712     unsigned char* last_branch = code;
1713     unsigned char* start_bracket = code;
1714     int firstbyte = REQ_UNSET;
1715     int reqbyte = REQ_UNSET;
1716
1717     /* Offset is set zero to mark that this bracket is still open */
1718
1719     putLinkValueAllowZero(code + 1, 0);
1720     code += 1 + LINK_SIZE + skipbytes;
1721
1722     /* Loop for each alternative branch */
1723
1724     while (true) {
1725         /* Now compile the branch */
1726
1727         int branchfirstbyte;
1728         int branchreqbyte;
1729         if (!compileBranch(options, brackets, &code, &ptr, patternEnd, errorcodeptr,
1730                             &branchfirstbyte, &branchreqbyte, cd)) {
1731             *ptrptr = ptr;
1732             return false;
1733         }
1734
1735         /* If this is the first branch, the firstbyte and reqbyte values for the
1736          branch become the values for the regex. */
1737
1738         if (*last_branch != OP_ALT) {
1739             firstbyte = branchfirstbyte;
1740             reqbyte = branchreqbyte;
1741         }
1742
1743         /* If this is not the first branch, the first char and reqbyte have to
1744          match the values from all the previous branches, except that if the previous
1745          value for reqbyte didn't have REQ_VARY set, it can still match, and we set
1746          REQ_VARY for the regex. */
1747
1748         else {
1749             /* If we previously had a firstbyte, but it doesn't match the new branch,
1750              we have to abandon the firstbyte for the regex, but if there was previously
1751              no reqbyte, it takes on the value of the old firstbyte. */
1752
1753             if (firstbyte >= 0 && firstbyte != branchfirstbyte) {
1754                 if (reqbyte < 0)
1755                     reqbyte = firstbyte;
1756                 firstbyte = REQ_NONE;
1757             }
1758
1759             /* If we (now or from before) have no firstbyte, a firstbyte from the
1760              branch becomes a reqbyte if there isn't a branch reqbyte. */
1761
1762             if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
1763                 branchreqbyte = branchfirstbyte;
1764
1765             /* Now ensure that the reqbytes match */
1766
1767             if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
1768                 reqbyte = REQ_NONE;
1769             else
1770                 reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
1771         }
1772
1773         /* Reached end of expression, either ')' or end of pattern. Go back through
1774          the alternative branches and reverse the chain of offsets, with the field in
1775          the BRA item now becoming an offset to the first alternative. If there are
1776          no alternatives, it points to the end of the group. The length in the
1777          terminating ket is always the length of the whole bracketed item. If any of
1778          the ims options were changed inside the group, compile a resetting op-code
1779          following, except at the very end of the pattern. Return leaving the pointer
1780          at the terminating char. */
1781
1782         if (ptr >= patternEnd || *ptr != '|') {
1783             int length = code - last_branch;
1784             do {
1785                 int prev_length = getLinkValueAllowZero(last_branch + 1);
1786                 putLinkValue(last_branch + 1, length);
1787                 length = prev_length;
1788                 last_branch -= length;
1789             } while (length > 0);
1790
1791             /* Fill in the ket */
1792
1793             *code = OP_KET;
1794             putLinkValue(code + 1, code - start_bracket);
1795             code += 1 + LINK_SIZE;
1796
1797             /* Set values to pass back */
1798
1799             *codeptr = code;
1800             *ptrptr = ptr;
1801             *firstbyteptr = firstbyte;
1802             *reqbyteptr = reqbyte;
1803             return true;
1804         }
1805
1806         /* Another branch follows; insert an "or" node. Its length field points back
1807          to the previous branch while the bracket remains open. At the end the chain
1808          is reversed. It's done like this so that the start of the bracket has a
1809          zero offset until it is closed, making it possible to detect recursion. */
1810
1811         *code = OP_ALT;
1812         putLinkValue(code + 1, code - last_branch);
1813         last_branch = code;
1814         code += 1 + LINK_SIZE;
1815         ptr++;
1816     }
1817     ASSERT_NOT_REACHED();
1818 }
1819
1820 /*************************************************
1821 *          Check for anchored expression         *
1822 *************************************************/
1823
1824 /* Try to find out if this is an anchored regular expression. Consider each
1825 alternative branch. If they all start OP_CIRC, or with a bracket
1826 all of whose alternatives start OP_CIRC (recurse ad lib), then
1827 it's anchored.
1828
1829 Arguments:
1830   code          points to start of expression (the bracket)
1831   captureMap    a bitmap of which brackets we are inside while testing; this
1832                  handles up to substring 31; all brackets after that share
1833                  the zero bit
1834   backrefMap    the back reference bitmap
1835 */
1836
1837 static bool branchIsAnchored(const unsigned char* code)
1838 {
1839     const unsigned char* scode = firstSignificantOpcode(code);
1840     int op = *scode;
1841
1842     /* Brackets */
1843     if (op >= OP_BRA || op == OP_ASSERT)
1844         return bracketIsAnchored(scode);
1845
1846     /* Check for explicit anchoring */
1847     return op == OP_CIRC;
1848 }
1849
1850 static bool bracketIsAnchored(const unsigned char* code)
1851 {
1852     do {
1853         if (!branchIsAnchored(code + 1 + LINK_SIZE))
1854             return false;
1855         code += getLinkValue(code + 1);
1856     } while (*code == OP_ALT);   /* Loop for each alternative */
1857     return true;
1858 }
1859
1860 /*************************************************
1861 *         Check for starting with ^ or .*        *
1862 *************************************************/
1863
1864 /* This is called to find out if every branch starts with ^ or .* so that
1865 "first char" processing can be done to speed things up in multiline
1866 matching and for non-DOTALL patterns that start with .* (which must start at
1867 the beginning or after \n)
1868
1869 Except when the .* appears inside capturing parentheses, and there is a
1870 subsequent back reference to those parentheses. By keeping a bitmap of the
1871 first 31 back references, we can catch some of the more common cases more
1872 precisely; all the greater back references share a single bit.
1873
1874 Arguments:
1875   code          points to start of expression (the bracket)
1876   captureMap    a bitmap of which brackets we are inside while testing; this
1877                  handles up to substring 31; all brackets after that share
1878                  the zero bit
1879   backrefMap    the back reference bitmap
1880 */
1881
1882 static bool branchNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap)
1883 {
1884     const unsigned char* scode = firstSignificantOpcode(code);
1885     int op = *scode;
1886
1887     /* Capturing brackets */
1888     if (op > OP_BRA) {
1889         int captureNum = op - OP_BRA;
1890         if (captureNum > EXTRACT_BASIC_MAX)
1891             captureNum = get2ByteValue(scode + 2 + LINK_SIZE);
1892         int bracketMask = (captureNum < 32) ? (1 << captureNum) : 1;
1893         return bracketNeedsLineStart(scode, captureMap | bracketMask, backrefMap);
1894     }
1895
1896     /* Other brackets */
1897     if (op == OP_BRA || op == OP_ASSERT)
1898         return bracketNeedsLineStart(scode, captureMap, backrefMap);
1899
1900     /* .* means "start at start or after \n" if it isn't in brackets that
1901      may be referenced. */
1902
1903     if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1904         return scode[1] == OP_NOT_NEWLINE && !(captureMap & backrefMap);
1905
1906     /* Explicit ^ */
1907     return op == OP_CIRC;
1908 }
1909
1910 static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap)
1911 {
1912     do {
1913         if (!branchNeedsLineStart(code + 1 + LINK_SIZE, captureMap, backrefMap))
1914             return false;
1915         code += getLinkValue(code + 1);
1916     } while (*code == OP_ALT);  /* Loop for each alternative */
1917     return true;
1918 }
1919
1920 /*************************************************
1921 *       Check for asserted fixed first char      *
1922 *************************************************/
1923
1924 /* During compilation, the "first char" settings from forward assertions are
1925 discarded, because they can cause conflicts with actual literals that follow.
1926 However, if we end up without a first char setting for an unanchored pattern,
1927 it is worth scanning the regex to see if there is an initial asserted first
1928 char. If all branches start with the same asserted char, or with a bracket all
1929 of whose alternatives start with the same asserted char (recurse ad lib), then
1930 we return that char, otherwise -1.
1931
1932 Arguments:
1933   code       points to start of expression (the bracket)
1934   options    pointer to the options (used to check casing changes)
1935   inassert   true if in an assertion
1936
1937 Returns:     -1 or the fixed first char
1938 */
1939
1940 static int branchFindFirstAssertedCharacter(const unsigned char* code, bool inassert)
1941 {
1942     const unsigned char* scode = firstSignificantOpcodeSkippingAssertions(code);
1943     int op = *scode;
1944
1945     if (op >= OP_BRA)
1946         op = OP_BRA;
1947
1948     switch (op) {
1949         default:
1950             return -1;
1951
1952         case OP_BRA:
1953         case OP_ASSERT:
1954             return bracketFindFirstAssertedCharacter(scode, op == OP_ASSERT);
1955
1956         case OP_EXACT:
1957             scode += 2;
1958             /* Fall through */
1959
1960         case OP_CHAR:
1961         case OP_CHAR_IGNORING_CASE:
1962         case OP_ASCII_CHAR:
1963         case OP_ASCII_LETTER_IGNORING_CASE:
1964         case OP_PLUS:
1965         case OP_MINPLUS:
1966             if (!inassert)
1967                 return -1;
1968             return scode[1];
1969     }
1970 }
1971
1972 static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool inassert)
1973 {
1974     int c = -1;
1975     do {
1976         int d = branchFindFirstAssertedCharacter(code + 1 + LINK_SIZE, inassert);
1977         if (d < 0)
1978             return -1;
1979         if (c < 0)
1980             c = d;
1981         else if (c != d)
1982             return -1;
1983         code += getLinkValue(code + 1);
1984     } while (*code == OP_ALT);
1985     return c;
1986 }
1987
1988 static inline int multiplyWithOverflowCheck(int a, int b)
1989 {
1990     if (!a || !b)
1991         return 0;
1992     if (a > MAX_PATTERN_SIZE / b)
1993         return -1;
1994     return a * b;
1995 }
1996
1997 static int calculateCompiledPatternLength(const UChar* pattern, int patternLength, JSRegExpIgnoreCaseOption ignoreCase,
1998     CompileData& cd, ErrorCode& errorcode)
1999 {
2000     /* Make a pass over the pattern to compute the
2001      amount of store required to hold the compiled code. This does not have to be
2002      perfect as long as errors are overestimates. */
2003
2004     if (patternLength > MAX_PATTERN_SIZE) {
2005         errorcode = ERR16;
2006         return -1;
2007     }
2008
2009     int length = 1 + LINK_SIZE;      /* For initial BRA plus length */
2010     int branch_extra = 0;
2011     int lastitemlength = 0;
2012     unsigned brastackptr = 0;
2013     int brastack[BRASTACK_SIZE];
2014     unsigned char bralenstack[BRASTACK_SIZE];
2015     int bracount = 0;
2016
2017     const UChar* ptr = (const UChar*)(pattern - 1);
2018     const UChar* patternEnd = (const UChar*)(pattern + patternLength);
2019
2020     while (++ptr < patternEnd) {
2021         int minRepeats = 0, maxRepeats = 0;
2022         int c = *ptr;
2023
2024         switch (c) {
2025             /* A backslashed item may be an escaped data character or it may be a
2026              character type. */
2027
2028             case '\\':
2029                 c = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, false);
2030                 if (errorcode != 0)
2031                     return -1;
2032
2033                 lastitemlength = 1;     /* Default length of last item for repeats */
2034
2035                 if (c >= 0) {            /* Data character */
2036                     length += 2;          /* For a one-byte character */
2037
2038                     if (c > 127) {
2039                         int i;
2040                         for (i = 0; i < kjs_pcre_utf8_table1_size; i++)
2041                             if (c <= kjs_pcre_utf8_table1[i]) break;
2042                         length += i;
2043                         lastitemlength += i;
2044                     }
2045
2046                     continue;
2047                 }
2048
2049                 /* Other escapes need one byte */
2050
2051                 length++;
2052
2053                 /* A back reference needs an additional 2 bytes, plus either one or 5
2054                  bytes for a repeat. We also need to keep the value of the highest
2055                  back reference. */
2056
2057                 if (c <= -ESC_REF) {
2058                     int refnum = -c - ESC_REF;
2059                     cd.backrefMap |= (refnum < 32) ? (1 << refnum) : 1;
2060                     if (refnum > cd.top_backref)
2061                         cd.top_backref = refnum;
2062                     length += 2;   /* For single back reference */
2063                     if (safelyCheckNextChar(ptr, patternEnd, '{') && isCountedRepeat(ptr + 2, patternEnd)) {
2064                         ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode);
2065                         if (errorcode)
2066                             return -1;
2067                         if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
2068                             (minRepeats == 1 && maxRepeats == -1))
2069                             length++;
2070                         else
2071                             length += 5;
2072                         if (safelyCheckNextChar(ptr, patternEnd, '?'))
2073                             ptr++;
2074                     }
2075                 }
2076                 continue;
2077
2078             case '^':     /* Single-byte metacharacters */
2079             case '.':
2080             case '$':
2081                 length++;
2082                 lastitemlength = 1;
2083                 continue;
2084
2085             case '*':            /* These repeats won't be after brackets; */
2086             case '+':            /* those are handled separately */
2087             case '?':
2088                 length++;
2089                 goto POSSESSIVE;
2090
2091             /* This covers the cases of braced repeats after a single char, metachar,
2092              class, or back reference. */
2093
2094             case '{':
2095                 if (!isCountedRepeat(ptr + 1, patternEnd))
2096                     goto NORMAL_CHAR;
2097                 ptr = readRepeatCounts(ptr + 1, &minRepeats, &maxRepeats, &errorcode);
2098                 if (errorcode != 0)
2099                     return -1;
2100
2101                 /* These special cases just insert one extra opcode */
2102
2103                 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
2104                     (minRepeats == 1 && maxRepeats == -1))
2105                     length++;
2106
2107                 /* These cases might insert additional copies of a preceding character. */
2108
2109                 else {
2110                     if (minRepeats != 1) {
2111                         length -= lastitemlength;   /* Uncount the original char or metachar */
2112                         if (minRepeats > 0)
2113                             length += 3 + lastitemlength;
2114                     }
2115                     length += lastitemlength + ((maxRepeats > 0) ? 3 : 1);
2116                 }
2117
2118                 if (safelyCheckNextChar(ptr, patternEnd, '?'))
2119                     ptr++;      /* Needs no extra length */
2120
2121             POSSESSIVE:                     /* Test for possessive quantifier */
2122                 if (safelyCheckNextChar(ptr, patternEnd, '+')) {
2123                     ptr++;
2124                     length += 2 + 2 * LINK_SIZE;   /* Allow for atomic brackets */
2125                 }
2126                 continue;
2127
2128             /* An alternation contains an offset to the next branch or ket. If any ims
2129              options changed in the previous branch(es), and/or if we are in a
2130              lookbehind assertion, extra space will be needed at the start of the
2131              branch. This is handled by branch_extra. */
2132
2133             case '|':
2134                 if (brastackptr == 0)
2135                     cd.needOuterBracket = true;
2136                 length += 1 + LINK_SIZE + branch_extra;
2137                 continue;
2138
2139             /* A character class uses 33 characters provided that all the character
2140              values are less than 256. Otherwise, it uses a bit map for low valued
2141              characters, and individual items for others. Don't worry about character
2142              types that aren't allowed in classes - they'll get picked up during the
2143              compile. A character class that contains only one single-byte character
2144              uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
2145              where we can. (In UTF-8 mode we can do this only for chars < 128.) */
2146
2147             case '[': {
2148                 int class_optcount;
2149                 if (*(++ptr) == '^') {
2150                     class_optcount = 10;  /* Greater than one */
2151                     ptr++;
2152                 }
2153                 else
2154                     class_optcount = 0;
2155
2156                 bool class_utf8 = false;
2157
2158                 for (; ptr < patternEnd && *ptr != ']'; ++ptr) {
2159                     /* Check for escapes */
2160
2161                     if (*ptr == '\\') {
2162                         c = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, true);
2163                         if (errorcode != 0)
2164                             return -1;
2165
2166                         /* Handle escapes that turn into characters */
2167
2168                         if (c >= 0)
2169                             goto NON_SPECIAL_CHARACTER;
2170
2171                         /* Escapes that are meta-things. The normal ones just affect the
2172                          bit map, but Unicode properties require an XCLASS extended item. */
2173
2174                         else
2175                             class_optcount = 10;         /* \d, \s etc; make sure > 1 */
2176                     }
2177
2178                     /* Anything else increments the possible optimization count. We have to
2179                      detect ranges here so that we can compute the number of extra ranges for
2180                      caseless wide characters when UCP support is available. If there are wide
2181                      characters, we are going to have to use an XCLASS, even for single
2182                      characters. */
2183
2184                     else {
2185                         c = *ptr;
2186
2187                         /* Come here from handling \ above when it escapes to a char value */
2188
2189                     NON_SPECIAL_CHARACTER:
2190                         class_optcount++;
2191
2192                         int d = -1;
2193                         if (safelyCheckNextChar(ptr, patternEnd, '-')) {
2194                             UChar const *hyptr = ptr++;
2195                             if (safelyCheckNextChar(ptr, patternEnd, '\\')) {
2196                                 ptr++;
2197                                 d = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, true);
2198                                 if (errorcode != 0)
2199                                     return -1;
2200                             }
2201                             else if ((ptr + 1 < patternEnd) && ptr[1] != ']')
2202                                 d = *++ptr;
2203                             if (d < 0)
2204                                 ptr = hyptr;      /* go back to hyphen as data */
2205                         }
2206
2207                         /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
2208                          127 for caseless matching, we will need to use an XCLASS. */
2209
2210                         if (d >= 0) {
2211                             class_optcount = 10;     /* Ensure > 1 */
2212                             if (d < c) {
2213                                 errorcode = ERR8;
2214                                 return -1;
2215                             }
2216
2217                             if ((d > 255 || (ignoreCase && d > 127))) {
2218                                 unsigned char buffer[6];
2219                                 if (!class_utf8)         /* Allow for XCLASS overhead */
2220                                 {
2221                                     class_utf8 = true;
2222                                     length += LINK_SIZE + 2;
2223                                 }
2224
2225                                 /* If we have UCP support, find out how many extra ranges are
2226                                  needed to map the other case of characters within this range. We
2227                                  have to mimic the range optimization here, because extending the
2228                                  range upwards might push d over a boundary that makes it use
2229                                  another byte in the UTF-8 representation. */
2230
2231                                 if (ignoreCase) {
2232                                     int occ, ocd;
2233                                     int cc = c;
2234                                     int origd = d;
2235                                     while (getOthercaseRange(&cc, origd, &occ, &ocd)) {
2236                                         if (occ >= c && ocd <= d)
2237                                             continue;   /* Skip embedded */
2238
2239                                         if (occ < c  && ocd >= c - 1)  /* Extend the basic range */
2240                                         {                            /* if there is overlap,   */
2241                                             c = occ;                     /* noting that if occ < c */
2242                                             continue;                    /* we can't have ocd > d  */
2243                                         }                            /* because a subrange is  */
2244                                         if (ocd > d && occ <= d + 1)   /* always shorter than    */
2245                                         {                            /* the basic range.       */
2246                                             d = ocd;
2247                                             continue;
2248                                         }
2249
2250                                         /* An extra item is needed */
2251
2252                                         length += 1 + encodeUTF8(occ, buffer) +
2253                                         ((occ == ocd) ? 0 : encodeUTF8(ocd, buffer));
2254                                     }
2255                                 }
2256
2257                                 /* The length of the (possibly extended) range */
2258
2259                                 length += 1 + encodeUTF8(c, buffer) + encodeUTF8(d, buffer);
2260                             }
2261
2262                         }
2263
2264                         /* We have a single character. There is nothing to be done unless we
2265                          are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
2266                          allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
2267                          support. */
2268
2269                         else {
2270                             if ((c > 255 || (ignoreCase && c > 127))) {
2271                                 unsigned char buffer[6];
2272                                 class_optcount = 10;     /* Ensure > 1 */
2273                                 if (!class_utf8)         /* Allow for XCLASS overhead */
2274                                 {
2275                                     class_utf8 = true;
2276                                     length += LINK_SIZE + 2;
2277                                 }
2278                                 length += (ignoreCase ? 2 : 1) * (1 + encodeUTF8(c, buffer));
2279                             }
2280                         }
2281                     }
2282                 }
2283
2284                 if (ptr >= patternEnd) {   /* Missing terminating ']' */
2285                     errorcode = ERR6;
2286                     return -1;
2287                 }
2288
2289                 /* We can optimize when there was only one optimizable character.
2290                  Note that this does not detect the case of a negated single character.
2291                  In that case we do an incorrect length computation, but it's not a serious
2292                  problem because the computed length is too large rather than too small. */
2293
2294                 if (class_optcount == 1)
2295                     goto NORMAL_CHAR;
2296
2297                 /* Here, we handle repeats for the class opcodes. */
2298                 {
2299                     length += 33;
2300
2301                     /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
2302                      we also need extra for wrapping the whole thing in a sub-pattern. */
2303
2304                     if (safelyCheckNextChar(ptr, patternEnd, '{') && isCountedRepeat(ptr + 2, patternEnd)) {
2305                         ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode);
2306                         if (errorcode != 0)
2307                             return -1;
2308                         if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
2309                             (minRepeats == 1 && maxRepeats == -1))
2310                             length++;
2311                         else
2312                             length += 5;
2313                         if (safelyCheckNextChar(ptr, patternEnd, '+')) {
2314                             ptr++;
2315                             length += 2 + 2 * LINK_SIZE;
2316                         } else if (safelyCheckNextChar(ptr, patternEnd, '?'))
2317                             ptr++;
2318                     }
2319                 }
2320                 continue;
2321             }
2322
2323             /* Brackets may be genuine groups or special things */
2324
2325             case '(': {
2326                 int branch_newextra = 0;
2327                 int bracket_length = 1 + LINK_SIZE;
2328                 bool capturing = false;
2329
2330                 /* Handle special forms of bracket, which all start (? */
2331
2332                 if (safelyCheckNextChar(ptr, patternEnd, '?')) {
2333                     switch (c = (ptr + 2 < patternEnd ? ptr[2] : 0)) {
2334                         /* Non-referencing groups and lookaheads just move the pointer on, and
2335                          then behave like a non-special bracket, except that they don't increment
2336                          the count of extracting brackets. Ditto for the "once only" bracket,
2337                          which is in Perl from version 5.005. */
2338
2339                         case ':':
2340                         case '=':
2341                         case '!':
2342                             ptr += 2;
2343                             break;
2344
2345                         /* Else loop checking valid options until ) is met. Anything else is an
2346                          error. If we are without any brackets, i.e. at top level, the settings
2347                          act as if specified in the options, so massage the options immediately.
2348                          This is for backward compatibility with Perl 5.004. */
2349
2350                         default:
2351                             errorcode = ERR12;
2352                             return -1;
2353                     }
2354                 } else
2355                     capturing = 1;
2356
2357                 /* Capturing brackets must be counted so we can process escapes in a
2358                  Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need
2359                  an additional 3 bytes of memory per capturing bracket. */
2360
2361                 if (capturing) {
2362                     bracount++;
2363                     if (bracount > EXTRACT_BASIC_MAX)
2364                         bracket_length += 3;
2365                 }
2366
2367                 /* Save length for computing whole length at end if there's a repeat that
2368                  requires duplication of the group. Also save the current value of
2369                  branch_extra, and start the new group with the new value. If non-zero, this
2370                  will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
2371
2372                 if (brastackptr >= sizeof(brastack)/sizeof(int)) {
2373                     errorcode = ERR17;
2374                     return -1;
2375                 }
2376
2377                 bralenstack[brastackptr] = branch_extra;
2378                 branch_extra = branch_newextra;
2379
2380                 brastack[brastackptr++] = length;
2381                 length += bracket_length;
2382                 continue;
2383             }
2384
2385             /* Handle ket. Look for subsequent maxRepeats/minRepeats; for certain sets of values we
2386              have to replicate this bracket up to that many times. If brastackptr is
2387              0 this is an unmatched bracket which will generate an error, but take care
2388              not to try to access brastack[-1] when computing the length and restoring
2389              the branch_extra value. */
2390
2391             case ')': {
2392                 int duplength;
2393                 length += 1 + LINK_SIZE;
2394                 if (brastackptr > 0) {
2395                     duplength = length - brastack[--brastackptr];
2396                     branch_extra = bralenstack[brastackptr];
2397                 }
2398                 else
2399                     duplength = 0;
2400
2401                 /* Leave ptr at the final char; for readRepeatCounts this happens
2402                  automatically; for the others we need an increment. */
2403
2404                 if ((ptr + 1 < patternEnd) && (c = ptr[1]) == '{' && isCountedRepeat(ptr + 2, patternEnd)) {
2405                     ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode);
2406                     if (errorcode)
2407                         return -1;
2408                 } else if (c == '*') {
2409                     minRepeats = 0;
2410                     maxRepeats = -1;
2411                     ptr++;
2412                 } else if (c == '+') {
2413                     minRepeats = 1;
2414                     maxRepeats = -1;
2415                     ptr++;
2416                 } else if (c == '?') {
2417                     minRepeats = 0;
2418                     maxRepeats = 1;
2419                     ptr++;
2420                 } else {
2421                     minRepeats = 1;
2422                     maxRepeats = 1;
2423                 }
2424
2425                 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2426                  group, and if the maximum is greater than zero, we have to replicate
2427                  maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2428                  bracket set. */
2429
2430                 int repeatsLength;
2431                 if (minRepeats == 0) {
2432                     length++;
2433                     if (maxRepeats > 0) {
2434                         repeatsLength = multiplyWithOverflowCheck(maxRepeats - 1, duplength + 3 + 2 * LINK_SIZE);
2435                         if (repeatsLength < 0) {
2436                             errorcode = ERR16;
2437                             return -1;
2438                         }
2439                         length += repeatsLength;
2440                         if (length > MAX_PATTERN_SIZE) {
2441                             errorcode = ERR16;
2442                             return -1;
2443                         }
2444                     }
2445                 }
2446
2447                 /* When the minimum is greater than zero, we have to replicate up to
2448                  minval-1 times, with no additions required in the copies. Then, if there
2449                  is a limited maximum we have to replicate up to maxval-1 times allowing
2450                  for a BRAZERO item before each optional copy and nesting brackets for all
2451                  but one of the optional copies. */
2452
2453                 else {
2454                     repeatsLength = multiplyWithOverflowCheck(minRepeats - 1, duplength);
2455                     if (repeatsLength < 0) {
2456                         errorcode = ERR16;
2457                         return -1;
2458                     }
2459                     length += repeatsLength;
2460                     if (maxRepeats > minRepeats) { /* Need this test as maxRepeats=-1 means no limit */
2461                         repeatsLength = multiplyWithOverflowCheck(maxRepeats - minRepeats, duplength + 3 + 2 * LINK_SIZE);
2462                         if (repeatsLength < 0) {
2463                             errorcode = ERR16;
2464                             return -1;
2465                         }
2466                         length += repeatsLength - (2 + 2 * LINK_SIZE);
2467                     }
2468                     if (length > MAX_PATTERN_SIZE) {
2469                         errorcode = ERR16;
2470                         return -1;
2471                     }
2472                 }
2473
2474                 /* Allow space for once brackets for "possessive quantifier" */
2475
2476                 if (safelyCheckNextChar(ptr, patternEnd, '+')) {
2477                     ptr++;
2478                     length += 2 + 2 * LINK_SIZE;
2479                 }
2480                 continue;
2481             }
2482
2483             /* Non-special character. It won't be space or # in extended mode, so it is
2484              always a genuine character. If we are in a \Q...\E sequence, check for the
2485              end; if not, we have a literal. */
2486
2487             default:
2488             NORMAL_CHAR:
2489                 length += 2;          /* For a one-byte character */
2490                 lastitemlength = 1;   /* Default length of last item for repeats */
2491
2492                 if (c > 127) {
2493                     int i;
2494                     for (i = 0; i < kjs_pcre_utf8_table1_size; i++)
2495                         if (c <= kjs_pcre_utf8_table1[i])
2496                             break;
2497                     length += i;
2498                     lastitemlength += i;
2499                 }
2500
2501                 continue;
2502         }
2503     }
2504
2505     length += 2 + LINK_SIZE;    /* For final KET and END */
2506
2507     cd.numCapturingBrackets = bracount;
2508     return length;
2509 }
2510
2511 /*************************************************
2512 *        Compile a Regular Expression            *
2513 *************************************************/
2514
2515 /* This function takes a string and returns a pointer to a block of store
2516 holding a compiled version of the expression. The original API for this
2517 function had no error code return variable; it is retained for backwards
2518 compatibility. The new function is given a new name.
2519
2520 Arguments:
2521   pattern       the regular expression
2522   options       various option bits
2523   errorcodeptr  pointer to error code variable (pcre_compile2() only)
2524                   can be NULL if you don't want a code value
2525   errorptr      pointer to pointer to error text
2526   erroroffset   ptr offset in pattern where error was detected
2527   tables        pointer to character tables or NULL
2528
2529 Returns:        pointer to compiled data block, or NULL on error,
2530                 with errorptr and erroroffset set
2531 */
2532
2533 static inline JSRegExp* returnError(ErrorCode errorcode, const char** errorptr)
2534 {
2535     *errorptr = errorText(errorcode);
2536     return 0;
2537 }
2538
2539 JSRegExp* jsRegExpCompile(const UChar* pattern, int patternLength,
2540                 JSRegExpIgnoreCaseOption ignoreCase, JSRegExpMultilineOption multiline,
2541                 unsigned* numSubpatterns, const char** errorptr)
2542 {
2543     /* We can't pass back an error message if errorptr is NULL; I guess the best we
2544      can do is just return NULL, but we can set a code value if there is a code pointer. */
2545     if (!errorptr)
2546         return 0;
2547     *errorptr = NULL;
2548
2549     CompileData cd;
2550
2551     ErrorCode errorcode = ERR0;
2552     /* Call this once just to count the brackets. */
2553     calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, errorcode);
2554     /* Call it again to compute the length. */
2555     int length = calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, errorcode);
2556     if (errorcode)
2557         return returnError(errorcode, errorptr);
2558
2559     if (length > MAX_PATTERN_SIZE)
2560         return returnError(ERR16, errorptr);
2561
2562     size_t size = length + sizeof(JSRegExp);
2563     JSRegExp* re = reinterpret_cast<JSRegExp*>(new char[size]);
2564
2565     if (!re)
2566         return returnError(ERR13, errorptr);
2567
2568     re->options = (ignoreCase ? IgnoreCaseOption : 0) | (multiline ? MatchAcrossMultipleLinesOption : 0);
2569
2570     /* The starting points of the name/number translation table and of the code are
2571      passed around in the compile data block. */
2572
2573     const unsigned char* codeStart = (const unsigned char*)(re + 1);
2574
2575     /* Set up a starting, non-extracting bracket, then compile the expression. On
2576      error, errorcode will be set non-zero, so we don't need to look at the result
2577      of the function here. */
2578
2579     const UChar* ptr = (const UChar*)pattern;
2580     const UChar* patternEnd = pattern + patternLength;
2581     unsigned char* code = (unsigned char*)codeStart;
2582     int firstbyte, reqbyte;
2583     int bracketCount = 0;
2584     if (!cd.needOuterBracket)
2585         compileBranch(re->options, &bracketCount, &code, &ptr, patternEnd, &errorcode, &firstbyte, &reqbyte, cd);
2586     else {
2587         *code = OP_BRA;
2588         compileBracket(re->options, &bracketCount, &code, &ptr, patternEnd, &errorcode, 0, &firstbyte, &reqbyte, cd);
2589     }
2590     re->top_bracket = bracketCount;
2591     re->top_backref = cd.top_backref;
2592
2593     /* If not reached end of pattern on success, there's an excess bracket. */
2594
2595     if (errorcode == 0 && ptr < patternEnd)
2596         errorcode = ERR10;
2597
2598     /* Fill in the terminating state and check for disastrous overflow, but
2599      if debugging, leave the test till after things are printed out. */
2600
2601     *code++ = OP_END;
2602
2603     ASSERT(code - codeStart <= length);
2604     if (code - codeStart > length)
2605         errorcode = ERR7;
2606
2607     /* Give an error if there's back reference to a non-existent capturing
2608      subpattern. */
2609
2610     if (re->top_backref > re->top_bracket)
2611         errorcode = ERR15;
2612
2613     /* Failed to compile, or error while post-processing */
2614
2615     if (errorcode != ERR0) {
2616         delete [] reinterpret_cast<char*>(re);
2617         return returnError(errorcode, errorptr);
2618     }
2619
2620     /* If the anchored option was not passed, set the flag if we can determine that
2621      the pattern is anchored by virtue of ^ characters or \A or anything else (such
2622      as starting with .* when DOTALL is set).
2623
2624      Otherwise, if we know what the first character has to be, save it, because that
2625      speeds up unanchored matches no end. If not, see if we can set the
2626      UseMultiLineFirstByteOptimizationOption flag. This is helpful for multiline matches when all branches
2627      start with ^. and also when all branches start with .* for non-DOTALL matches.
2628      */
2629
2630     if (cd.needOuterBracket ? bracketIsAnchored(codeStart) : branchIsAnchored(codeStart))
2631         re->options |= IsAnchoredOption;
2632     else {
2633         if (firstbyte < 0) {
2634             firstbyte = (cd.needOuterBracket
2635                     ? bracketFindFirstAssertedCharacter(codeStart, false)
2636                     : branchFindFirstAssertedCharacter(codeStart, false))
2637                 | ((re->options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0);
2638         }
2639         if (firstbyte >= 0) {
2640             int ch = firstbyte & 255;
2641             if (ch < 127) {
2642                 re->first_byte = ((firstbyte & REQ_IGNORE_CASE) && flipCase(ch) == ch) ? ch : firstbyte;
2643                 re->options |= UseFirstByteOptimizationOption;
2644             }
2645         } else {
2646             if (cd.needOuterBracket ? bracketNeedsLineStart(codeStart, 0, cd.backrefMap) : branchNeedsLineStart(codeStart, 0, cd.backrefMap))
2647                 re->options |= UseMultiLineFirstByteOptimizationOption;
2648         }
2649     }
2650
2651     /* For an anchored pattern, we use the "required byte" only if it follows a
2652      variable length item in the regex. Remove the caseless flag for non-caseable
2653      bytes. */
2654
2655     if (reqbyte >= 0 && (!(re->options & IsAnchoredOption) || (reqbyte & REQ_VARY))) {
2656         int ch = reqbyte & 255;
2657         if (ch < 127) {
2658             re->req_byte = ((reqbyte & REQ_IGNORE_CASE) && flipCase(ch) == ch) ? (reqbyte & ~REQ_IGNORE_CASE) : reqbyte;
2659             re->options |= UseRequiredByteOptimizationOption;
2660         }
2661     }
2662
2663     if (numSubpatterns)
2664         *numSubpatterns = re->top_bracket;
2665     return re;
2666 }
2667
2668 void jsRegExpFree(JSRegExp* re)
2669 {
2670     delete [] reinterpret_cast<char*>(re);
2671 }