yarr/YarrParser.h

   1 /*
   2  * Copyright (C) 2009 Apple Inc. All rights reserved.
   3  *
   4  * Redistribution and use in source and binary forms, with or without
   5  * modification, are permitted provided that the following conditions
   6  * are met:
   7  * 1. Redistributions of source code must retain the above copyright
   8  *    notice, this list of conditions and the following disclaimer.
   9  * 2. Redistributions in binary form must reproduce the above copyright
  10  *    notice, this list of conditions and the following disclaimer in the
  11  *    documentation and/or other materials provided with the distribution.
  12  *
  13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
  14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
  17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  24  */
  25
  26 #ifndef YarrParser_h
  27 #define YarrParser_h
  28
  29 #include "Yarr.h"
  30 #include <wtf/ASCIICType.h>
  31 #include <wtf/text/WTFString.h>
  32 #include <wtf/unicode/Unicode.h>
  33
  34 namespace JSC { namespace Yarr {
  35
  36 #define REGEXP_ERROR_PREFIX "Invalid regular expression: "
  37
  38 enum BuiltInCharacterClassID {
  39     DigitClassID,
  40     SpaceClassID,
  41     WordClassID,
  42     NewlineClassID,
  43 };
  44
  45 // The Parser class should not be used directly - only via the Yarr::parse() method.
  46 template<class Delegate, typename CharType>
  47 class Parser {
  48 private:
  49     template<class FriendDelegate>
  50     friend const char* parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit);
  51
  52     enum ErrorCode {
  53         NoError,
  54         PatternTooLarge,
  55         QuantifierOutOfOrder,
  56         QuantifierWithoutAtom,
  57         QuantifierTooLarge,
  58         MissingParentheses,
  59         ParenthesesUnmatched,
  60         ParenthesesTypeInvalid,
  61         CharacterClassUnmatched,
  62         CharacterClassOutOfOrder,
  63         EscapeUnterminated,
  64         NumberOfErrorCodes
  65     };
  66
  67     /*
  68      * CharacterClassParserDelegate:
  69      *
  70      * The class CharacterClassParserDelegate is used in the parsing of character
  71      * classes.  This class handles detection of character ranges.  This class
  72      * implements enough of the delegate interface such that it can be passed to
  73      * parseEscape() as an EscapeDelegate.  This allows parseEscape() to be reused
  74      * to perform the parsing of escape characters in character sets.
  75      */
  76     class CharacterClassParserDelegate {
  77     public:
  78         CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
  79             : m_delegate(delegate)
  80             , m_err(err)
  81             , m_state(Empty)
  82             , m_character(0)
  83         {
  84         }
  85
  86         /*
  87          * begin():
  88          *
  89          * Called at beginning of construction.
  90          */
  91         void begin(bool invert)
  92         {
  93             m_delegate.atomCharacterClassBegin(invert);
  94         }
  95
  96         /*
  97          * atomPatternCharacter():
  98          *
  99          * This method is called either from parseCharacterClass() (for an unescaped
 100          * character in a character class), or from parseEscape(). In the former case
 101          * the value true will be passed for the argument 'hyphenIsRange', and in this
 102          * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/
 103          * is different to /[a\-z]/).
 104          */
 105         void atomPatternCharacter(UChar ch, bool hyphenIsRange = false)
 106         {
 107             switch (m_state) {
 108             case AfterCharacterClass:
 109                 // Following a builtin character class we need look out for a hyphen.
 110                 // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
 111                 // If we see a hyphen following a charater class then unlike usual
 112                 // we'll report it to the delegate immediately, and put ourself into
 113                 // a poisoned state. Any following calls to add another character or
 114                 // character class will result in an error. (A hypen following a
 115                 // character-class is itself valid, but only  at the end of a regex).
 116                 if (hyphenIsRange && ch == '-') {
 117                     m_delegate.atomCharacterClassAtom('-');
 118                     m_state = AfterCharacterClassHyphen;
 119                     return;
 120                 }
 121                 // Otherwise just fall through - cached character so treat this as Empty.
 122
 123             case Empty:
 124                 m_character = ch;
 125                 m_state = CachedCharacter;
 126                 return;
 127
 128             case CachedCharacter:
 129                 if (hyphenIsRange && ch == '-')
 130                     m_state = CachedCharacterHyphen;
 131                 else {
 132                     m_delegate.atomCharacterClassAtom(m_character);
 133                     m_character = ch;
 134                 }
 135                 return;
 136
 137             case CachedCharacterHyphen:
 138                 if (ch < m_character) {
 139                     m_err = CharacterClassOutOfOrder;
 140                     return;
 141                 }
 142                 m_delegate.atomCharacterClassRange(m_character, ch);
 143                 m_state = Empty;
 144                 return;
 145
 146                 // See coment in atomBuiltInCharacterClass below.
 147                 // This too is technically an error, per ECMA-262, and again we
 148                 // we chose to allow this.  Note a subtlely here that while we
 149                 // diverge from the spec's definition of CharacterRange we do
 150                 // remain in compliance with the grammar.  For example, consider
 151                 // the expression /[\d-a-z]/.  We comply with the grammar in
 152                 // this case by not allowing a-z to be matched as a range.
 153             case AfterCharacterClassHyphen:
 154                 m_delegate.atomCharacterClassAtom(ch);
 155                 m_state = Empty;
 156                 return;
 157             }
 158         }
 159
 160         /*
 161          * atomBuiltInCharacterClass():
 162          *
 163          * Adds a built-in character class, called by parseEscape().
 164          */
 165         void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
 166         {
 167             switch (m_state) {
 168             case CachedCharacter:
 169                 // Flush the currently cached character, then fall through.
 170                 m_delegate.atomCharacterClassAtom(m_character);
 171
 172             case Empty:
 173             case AfterCharacterClass:
 174                 m_state = AfterCharacterClass;
 175                 m_delegate.atomCharacterClassBuiltIn(classID, invert);
 176                 return;
 177
 178                 // If we hit either of these cases, we have an invalid range that
 179                 // looks something like /[x-\d]/ or /[\d-\d]/.
 180                 // According to ECMA-262 this should be a syntax error, but
 181                 // empirical testing shows this to break teh webz.  Instead we
 182                 // comply with to the ECMA-262 grammar, and assume the grammar to
 183                 // have matched the range correctly, but tweak our interpretation
 184                 // of CharacterRange.  Effectively we implicitly handle the hyphen
 185                 // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/.
 186             case CachedCharacterHyphen:
 187                 m_delegate.atomCharacterClassAtom(m_character);
 188                 m_delegate.atomCharacterClassAtom('-');
 189                 // fall through
 190             case AfterCharacterClassHyphen:
 191                 m_delegate.atomCharacterClassBuiltIn(classID, invert);
 192                 m_state = Empty;
 193                 return;
 194             }
 195         }
 196
 197         /*
 198          * end():
 199          *
 200          * Called at end of construction.
 201          */
 202         void end()
 203         {
 204             if (m_state == CachedCharacter)
 205                 m_delegate.atomCharacterClassAtom(m_character);
 206             else if (m_state == CachedCharacterHyphen) {
 207                 m_delegate.atomCharacterClassAtom(m_character);
 208                 m_delegate.atomCharacterClassAtom('-');
 209             }
 210             m_delegate.atomCharacterClassEnd();
 211         }
 212
 213         // parseEscape() should never call these delegate methods when
 214         // invoked with inCharacterClass set.
 215         NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { RELEASE_ASSERT_NOT_REACHED(); }
 216         NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { RELEASE_ASSERT_NOT_REACHED(); }
 217
 218     private:
 219         Delegate& m_delegate;
 220         ErrorCode& m_err;
 221         enum CharacterClassConstructionState {
 222             Empty,
 223             CachedCharacter,
 224             CachedCharacterHyphen,
 225             AfterCharacterClass,
 226             AfterCharacterClassHyphen,
 227         } m_state;
 228         UChar m_character;
 229     };
 230
 231     Parser(Delegate& delegate, const String& pattern, unsigned backReferenceLimit)
 232         : m_delegate(delegate)
 233         , m_backReferenceLimit(backReferenceLimit)
 234         , m_err(NoError)
 235         , m_data(pattern.getCharacters<CharType>())
 236         , m_size(pattern.length())
 237         , m_index(0)
 238         , m_parenthesesNestingDepth(0)
 239     {
 240     }
 241
 242     /*
 243      * parseEscape():
 244      *
 245      * Helper for parseTokens() AND parseCharacterClass().
 246      * Unlike the other parser methods, this function does not report tokens
 247      * directly to the member delegate (m_delegate), instead tokens are
 248      * emitted to the delegate provided as an argument.  In the case of atom
 249      * escapes, parseTokens() will call parseEscape() passing m_delegate as
 250      * an argument, and as such the escape will be reported to the delegate.
 251      *
 252      * However this method may also be used by parseCharacterClass(), in which
 253      * case a CharacterClassParserDelegate will be passed as the delegate that
 254      * tokens should be added to.  A boolean flag is also provided to indicate
 255      * whether that an escape in a CharacterClass is being parsed (some parsing
 256      * rules change in this context).
 257      *
 258      * The boolean value returned by this method indicates whether the token
 259      * parsed was an atom (outside of a characted class \b and \B will be
 260      * interpreted as assertions).
 261      */
 262     template<bool inCharacterClass, class EscapeDelegate>
 263     bool parseEscape(EscapeDelegate& delegate)
 264     {
 265         ASSERT(!m_err);
 266         ASSERT(peek() == '\\');
 267         consume();
 268
 269         if (atEndOfPattern()) {
 270             m_err = EscapeUnterminated;
 271             return false;
 272         }
 273
 274         switch (peek()) {
 275         // Assertions
 276         case 'b':
 277             consume();
 278             if (inCharacterClass)
 279                 delegate.atomPatternCharacter('\b');
 280             else {
 281                 delegate.assertionWordBoundary(false);
 282                 return false;
 283             }
 284             break;
 285         case 'B':
 286             consume();
 287             if (inCharacterClass)
 288                 delegate.atomPatternCharacter('B');
 289             else {
 290                 delegate.assertionWordBoundary(true);
 291                 return false;
 292             }
 293             break;
 294
 295         // CharacterClassEscape
 296         case 'd':
 297             consume();
 298             delegate.atomBuiltInCharacterClass(DigitClassID, false);
 299             break;
 300         case 's':
 301             consume();
 302             delegate.atomBuiltInCharacterClass(SpaceClassID, false);
 303             break;
 304         case 'w':
 305             consume();
 306             delegate.atomBuiltInCharacterClass(WordClassID, false);
 307             break;
 308         case 'D':
 309             consume();
 310             delegate.atomBuiltInCharacterClass(DigitClassID, true);
 311             break;
 312         case 'S':
 313             consume();
 314             delegate.atomBuiltInCharacterClass(SpaceClassID, true);
 315             break;
 316         case 'W':
 317             consume();
 318             delegate.atomBuiltInCharacterClass(WordClassID, true);
 319             break;
 320
 321         // DecimalEscape
 322         case '1':
 323         case '2':
 324         case '3':
 325         case '4':
 326         case '5':
 327         case '6':
 328         case '7':
 329         case '8':
 330         case '9': {
 331             // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
 332             // First, try to parse this as backreference.
 333             if (!inCharacterClass) {
 334                 ParseState state = saveState();
 335
 336                 unsigned backReference = consumeNumber();
 337                 if (backReference <= m_backReferenceLimit) {
 338                     delegate.atomBackReference(backReference);
 339                     break;
 340                 }
 341
 342                 restoreState(state);
 343             }
 344
 345             // Not a backreference, and not octal.
 346             if (peek() >= '8') {
 347                 delegate.atomPatternCharacter('\\');
 348                 break;
 349             }
 350
 351             // Fall-through to handle this as an octal escape.
 352         }
 353
 354         // Octal escape
 355         case '0':
 356             delegate.atomPatternCharacter(consumeOctal());
 357             break;
 358
 359         // ControlEscape
 360         case 'f':
 361             consume();
 362             delegate.atomPatternCharacter('\f');
 363             break;
 364         case 'n':
 365             consume();
 366             delegate.atomPatternCharacter('\n');
 367             break;
 368         case 'r':
 369             consume();
 370             delegate.atomPatternCharacter('\r');
 371             break;
 372         case 't':
 373             consume();
 374             delegate.atomPatternCharacter('\t');
 375             break;
 376         case 'v':
 377             consume();
 378             delegate.atomPatternCharacter('\v');
 379             break;
 380
 381         // ControlLetter
 382         case 'c': {
 383             ParseState state = saveState();
 384             consume();
 385             if (!atEndOfPattern()) {
 386                 int control = consume();
 387
 388                 // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
 389                 if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
 390                     delegate.atomPatternCharacter(control & 0x1f);
 391                     break;
 392                 }
 393             }
 394             restoreState(state);
 395             delegate.atomPatternCharacter('\\');
 396             break;
 397         }
 398
 399         // HexEscape
 400         case 'x': {
 401             consume();
 402             int x = tryConsumeHex(2);
 403             if (x == -1)
 404                 delegate.atomPatternCharacter('x');
 405             else
 406                 delegate.atomPatternCharacter(x);
 407             break;
 408         }
 409
 410         // UnicodeEscape
 411         case 'u': {
 412             consume();
 413             int u = tryConsumeHex(4);
 414             if (u == -1)
 415                 delegate.atomPatternCharacter('u');
 416             else
 417                 delegate.atomPatternCharacter(u);
 418             break;
 419         }
 420
 421         // IdentityEscape
 422         default:
 423             delegate.atomPatternCharacter(consume());
 424         }
 425
 426         return true;
 427     }
 428
 429     /*
 430      * parseAtomEscape(), parseCharacterClassEscape():
 431      *
 432      * These methods alias to parseEscape().
 433      */
 434     bool parseAtomEscape()
 435     {
 436         return parseEscape<false>(m_delegate);
 437     }
 438     void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
 439     {
 440         parseEscape<true>(delegate);
 441     }
 442
 443     /*
 444      * parseCharacterClass():
 445      *
 446      * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
 447      * to an instance of CharacterClassParserDelegate, to describe the character class to the
 448      * delegate.
 449      */
 450     void parseCharacterClass()
 451     {
 452         ASSERT(!m_err);
 453         ASSERT(peek() == '[');
 454         consume();
 455
 456         CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);
 457
 458         characterClassConstructor.begin(tryConsume('^'));
 459
 460         while (!atEndOfPattern()) {
 461             switch (peek()) {
 462             case ']':
 463                 consume();
 464                 characterClassConstructor.end();
 465                 return;
 466
 467             case '\\':
 468                 parseCharacterClassEscape(characterClassConstructor);
 469                 break;
 470
 471             default:
 472                 characterClassConstructor.atomPatternCharacter(consume(), true);
 473             }
 474
 475             if (m_err)
 476                 return;
 477         }
 478
 479         m_err = CharacterClassUnmatched;
 480     }
 481
 482     /*
 483      * parseParenthesesBegin():
 484      *
 485      * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
 486      */
 487     void parseParenthesesBegin()
 488     {
 489         ASSERT(!m_err);
 490         ASSERT(peek() == '(');
 491         consume();
 492
 493         if (tryConsume('?')) {
 494             if (atEndOfPattern()) {
 495                 m_err = ParenthesesTypeInvalid;
 496                 return;
 497             }
 498
 499             switch (consume()) {
 500             case ':':
 501                 m_delegate.atomParenthesesSubpatternBegin(false);
 502                 break;
 503
 504             case '=':
 505                 m_delegate.atomParentheticalAssertionBegin();
 506                 break;
 507
 508             case '!':
 509                 m_delegate.atomParentheticalAssertionBegin(true);
 510                 break;
 511
 512             default:
 513                 m_err = ParenthesesTypeInvalid;
 514             }
 515         } else
 516             m_delegate.atomParenthesesSubpatternBegin();
 517
 518         ++m_parenthesesNestingDepth;
 519     }
 520
 521     /*
 522      * parseParenthesesEnd():
 523      *
 524      * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
 525      */
 526     void parseParenthesesEnd()
 527     {
 528         ASSERT(!m_err);
 529         ASSERT(peek() == ')');
 530         consume();
 531
 532         if (m_parenthesesNestingDepth > 0)
 533             m_delegate.atomParenthesesEnd();
 534         else
 535             m_err = ParenthesesUnmatched;
 536
 537         --m_parenthesesNestingDepth;
 538     }
 539
 540     /*
 541      * parseQuantifier():
 542      *
 543      * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
 544      */
 545     void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
 546     {
 547         ASSERT(!m_err);
 548         ASSERT(min <= max);
 549
 550         if (min == UINT_MAX) {
 551             m_err = QuantifierTooLarge;
 552             return;
 553         }
 554
 555         if (lastTokenWasAnAtom)
 556             m_delegate.quantifyAtom(min, max, !tryConsume('?'));
 557         else
 558             m_err = QuantifierWithoutAtom;
 559     }
 560
 561     /*
 562      * parseTokens():
 563      *
 564      * This method loops over the input pattern reporting tokens to the delegate.
 565      * The method returns when a parse error is detected, or the end of the pattern
 566      * is reached.  One piece of state is tracked around the loop, which is whether
 567      * the last token passed to the delegate was an atom (this is necessary to detect
 568      * a parse error when a quantifier provided without an atom to quantify).
 569      */
 570     void parseTokens()
 571     {
 572         bool lastTokenWasAnAtom = false;
 573
 574         while (!atEndOfPattern()) {
 575             switch (peek()) {
 576             case '|':
 577                 consume();
 578                 m_delegate.disjunction();
 579                 lastTokenWasAnAtom = false;
 580                 break;
 581
 582             case '(':
 583                 parseParenthesesBegin();
 584                 lastTokenWasAnAtom = false;
 585                 break;
 586
 587             case ')':
 588                 parseParenthesesEnd();
 589                 lastTokenWasAnAtom = true;
 590                 break;
 591
 592             case '^':
 593                 consume();
 594                 m_delegate.assertionBOL();
 595                 lastTokenWasAnAtom = false;
 596                 break;
 597
 598             case '$':
 599                 consume();
 600                 m_delegate.assertionEOL();
 601                 lastTokenWasAnAtom = false;
 602                 break;
 603
 604             case '.':
 605                 consume();
 606                 m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);
 607                 lastTokenWasAnAtom = true;
 608                 break;
 609
 610             case '[':
 611                 parseCharacterClass();
 612                 lastTokenWasAnAtom = true;
 613                 break;
 614
 615             case '\\':
 616                 lastTokenWasAnAtom = parseAtomEscape();
 617                 break;
 618
 619             case '*':
 620                 consume();
 621                 parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite);
 622                 lastTokenWasAnAtom = false;
 623                 break;
 624
 625             case '+':
 626                 consume();
 627                 parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite);
 628                 lastTokenWasAnAtom = false;
 629                 break;
 630
 631             case '?':
 632                 consume();
 633                 parseQuantifier(lastTokenWasAnAtom, 0, 1);
 634                 lastTokenWasAnAtom = false;
 635                 break;
 636
 637             case '{': {
 638                 ParseState state = saveState();
 639
 640                 consume();
 641                 if (peekIsDigit()) {
 642                     unsigned min = consumeNumber();
 643                     unsigned max = min;
 644
 645                     if (tryConsume(','))
 646                         max = peekIsDigit() ? consumeNumber() : quantifyInfinite;
 647
 648                     if (tryConsume('}')) {
 649                         if (min <= max)
 650                             parseQuantifier(lastTokenWasAnAtom, min, max);
 651                         else
 652                             m_err = QuantifierOutOfOrder;
 653                         lastTokenWasAnAtom = false;
 654                         break;
 655                     }
 656                 }
 657
 658                 restoreState(state);
 659             } // if we did not find a complete quantifer, fall through to the default case.
 660
 661             default:
 662                 m_delegate.atomPatternCharacter(consume());
 663                 lastTokenWasAnAtom = true;
 664             }
 665
 666             if (m_err)
 667                 return;
 668         }
 669
 670         if (m_parenthesesNestingDepth > 0)
 671             m_err = MissingParentheses;
 672     }
 673
 674     /*
 675      * parse():
 676      *
 677      * This method calls parseTokens() to parse over the input and converts any
 678      * error code to a const char* for a result.
 679      */
 680     const char* parse()
 681     {
 682         if (m_size > MAX_PATTERN_SIZE)
 683             m_err = PatternTooLarge;
 684         else
 685             parseTokens();
 686         ASSERT(atEndOfPattern() || m_err);
 687
 688         // The order of this array must match the ErrorCode enum.
 689         static const char* errorMessages[NumberOfErrorCodes] = {
 690             0, // NoError
 691             REGEXP_ERROR_PREFIX "regular expression too large",
 692             REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier",
 693             REGEXP_ERROR_PREFIX "nothing to repeat",
 694             REGEXP_ERROR_PREFIX "number too large in {} quantifier",
 695             REGEXP_ERROR_PREFIX "missing )",
 696             REGEXP_ERROR_PREFIX "unmatched parentheses",
 697             REGEXP_ERROR_PREFIX "unrecognized character after (?",
 698             REGEXP_ERROR_PREFIX "missing terminating ] for character class",
 699             REGEXP_ERROR_PREFIX "range out of order in character class",
 700             REGEXP_ERROR_PREFIX "\\ at end of pattern"
 701         };
 702
 703         return errorMessages[m_err];
 704     }
 705
 706     // Misc helper functions:
 707
 708     typedef unsigned ParseState;
 709
 710     ParseState saveState()
 711     {
 712         return m_index;
 713     }
 714
 715     void restoreState(ParseState state)
 716     {
 717         m_index = state;
 718     }
 719
 720     bool atEndOfPattern()
 721     {
 722         ASSERT(m_index <= m_size);
 723         return m_index == m_size;
 724     }
 725
 726     int peek()
 727     {
 728         ASSERT(m_index < m_size);
 729         return m_data[m_index];
 730     }
 731
 732     bool peekIsDigit()
 733     {
 734         return !atEndOfPattern() && WTF::isASCIIDigit(peek());
 735     }
 736
 737     unsigned peekDigit()
 738     {
 739         ASSERT(peekIsDigit());
 740         return peek() - '0';
 741     }
 742
 743     int consume()
 744     {
 745         ASSERT(m_index < m_size);
 746         return m_data[m_index++];
 747     }
 748
 749     unsigned consumeDigit()
 750     {
 751         ASSERT(peekIsDigit());
 752         return consume() - '0';
 753     }
 754
 755     unsigned consumeNumber()
 756     {
 757         unsigned n = consumeDigit();
 758         // check for overflow.
 759         for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) {
 760             n = newValue;
 761             consume();
 762         }
 763         return n;
 764     }
 765
 766     unsigned consumeOctal()
 767     {
 768         ASSERT(WTF::isASCIIOctalDigit(peek()));
 769
 770         unsigned n = consumeDigit();
 771         while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
 772             n = n * 8 + consumeDigit();
 773         return n;
 774     }
 775
 776     bool tryConsume(UChar ch)
 777     {
 778         if (atEndOfPattern() || (m_data[m_index] != ch))
 779             return false;
 780         ++m_index;
 781         return true;
 782     }
 783
 784     int tryConsumeHex(int count)
 785     {
 786         ParseState state = saveState();
 787
 788         int n = 0;
 789         while (count--) {
 790             if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
 791                 restoreState(state);
 792                 return -1;
 793             }
 794             n = (n << 4) | WTF::toASCIIHexValue(consume());
 795         }
 796         return n;
 797     }
 798
 799     Delegate& m_delegate;
 800     unsigned m_backReferenceLimit;
 801     ErrorCode m_err;
 802     const CharType* m_data;
 803     unsigned m_size;
 804     unsigned m_index;
 805     unsigned m_parenthesesNestingDepth;
 806
 807     // Derived by empirical testing of compile time in PCRE and WREC.
 808     static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
 809 };
 810
 811 /*
 812  * Yarr::parse():
 813  *
 814  * The parse method is passed a pattern to be parsed and a delegate upon which
 815  * callbacks will be made to record the parsed tokens forming the regex.
 816  * Yarr::parse() returns null on success, or a const C string providing an error
 817  * message where a parse error occurs.
 818  *
 819  * The Delegate must implement the following interface:
 820  *
 821  *    void assertionBOL();
 822  *    void assertionEOL();
 823  *    void assertionWordBoundary(bool invert);
 824  *
 825  *    void atomPatternCharacter(UChar ch);
 826  *    void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
 827  *    void atomCharacterClassBegin(bool invert)
 828  *    void atomCharacterClassAtom(UChar ch)
 829  *    void atomCharacterClassRange(UChar begin, UChar end)
 830  *    void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
 831  *    void atomCharacterClassEnd()
 832  *    void atomParenthesesSubpatternBegin(bool capture = true);
 833  *    void atomParentheticalAssertionBegin(bool invert = false);
 834  *    void atomParenthesesEnd();
 835  *    void atomBackReference(unsigned subpatternId);
 836  *
 837  *    void quantifyAtom(unsigned min, unsigned max, bool greedy);
 838  *
 839  *    void disjunction();
 840  *
 841  * The regular expression is described by a sequence of assertion*() and atom*()
 842  * callbacks to the delegate, describing the terms in the regular expression.
 843  * Following an atom a quantifyAtom() call may occur to indicate that the previous
 844  * atom should be quantified.  In the case of atoms described across multiple
 845  * calls (parentheses and character classes) the call to quantifyAtom() will come
 846  * after the call to the atom*End() method, never after atom*Begin().
 847  *
 848  * Character classes may either be described by a single call to
 849  * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
 850  * In the latter case, ...Begin() will be called, followed by a sequence of
 851  * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
 852  *
 853  * Sequences of atoms and assertions are broken into alternatives via calls to
 854  * disjunction().  Assertions, atoms, and disjunctions emitted between calls to
 855  * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
 856  * atomParenthesesBegin() is passed a subpatternId.  In the case of a regular
 857  * capturing subpattern, this will be the subpatternId associated with these
 858  * parentheses, and will also by definition be the lowest subpatternId of these
 859  * parentheses and of any nested paretheses.  The atomParenthesesEnd() method
 860  * is passed the subpatternId of the last capturing subexpression nested within
 861  * these paretheses.  In the case of a capturing subpattern with no nested
 862  * capturing subpatterns, the same subpatternId will be passed to the begin and
 863  * end functions.  In the case of non-capturing subpatterns the subpatternId
 864  * passed to the begin method is also the first possible subpatternId that might
 865  * be nested within these paretheses.  If a set of non-capturing parentheses does
 866  * not contain any capturing subpatterns, then the subpatternId passed to begin
 867  * will be greater than the subpatternId passed to end.
 868  */
 869
 870 template<class Delegate>
 871 const char* parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite)
 872 {
 873     if (pattern.is8Bit())
 874         return Parser<Delegate, LChar>(delegate, pattern, backReferenceLimit).parse();
 875     return Parser<Delegate, UChar>(delegate, pattern, backReferenceLimit).parse();
 876 }
 877
 878 } } // namespace JSC::Yarr
 879
 880 #endif // YarrParser_h