yarr/YarrParser.h

   1 /*
   2  * Copyright (C) 2009 Apple Inc. All rights reserved.
   3  *
   4  * Redistribution and use in source and binary forms, with or without
   5  * modification, are permitted provided that the following conditions
   6  * are met:
   7  * 1. Redistributions of source code must retain the above copyright
   8  *    notice, this list of conditions and the following disclaimer.
   9  * 2. Redistributions in binary form must reproduce the above copyright
  10  *    notice, this list of conditions and the following disclaimer in the
  11  *    documentation and/or other materials provided with the distribution.
  12  *
  13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
  14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
  17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  24  */
  25
  26 #ifndef YarrParser_h
  27 #define YarrParser_h
  28
  29 #include <runtime/UString.h>
  30 #include "Yarr.h"
  31 #include <wtf/ASCIICType.h>
  32 #include <wtf/unicode/Unicode.h>
  33
  34 namespace JSC { namespace Yarr {
  35
  36 #define REGEXP_ERROR_PREFIX "Invalid regular expression: "
  37
// Identifiers for the predefined character classes that the parser reports to
// the delegate through atomBuiltInCharacterClass() (outside a character class)
// and atomCharacterClassBuiltIn() (inside one).
enum BuiltInCharacterClassID {
    DigitClassID,   // reported for \d, and with invert set for \D
    SpaceClassID,   // reported for \s, and with invert set for \S
    WordClassID,    // reported for \w, and with invert set for \W
    NewlineClassID, // reported with invert set for '.'
};
  44
  45 // The Parser class should not be used directly - only via the Yarr::parse() method.
  46 template<class Delegate>
  47 class Parser {
  48 private:
  49     template<class FriendDelegate>
  50     friend const char* parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit);
  51
    // Reasons a parse can fail. The values are used to index the errorMessages
    // table in parse(), so the order here must match the order of that table.
    enum ErrorCode {
        NoError,
        PatternTooLarge,          // pattern length exceeds MAX_PATTERN_SIZE
        QuantifierOutOfOrder,     // {min,max} quantifier with min > max
        QuantifierWithoutAtom,    // quantifier that does not follow an atom
        MissingParentheses,       // end of pattern reached with a '(' still open
        ParenthesesUnmatched,     // ')' with no open '('
        ParenthesesTypeInvalid,   // "(?" followed by end of pattern or an unrecognized character
        CharacterClassUnmatched,  // '[' with no terminating ']'
        CharacterClassOutOfOrder, // character range whose end is less than its start
        EscapeUnterminated,       // '\' as the last character of the pattern
        NumberOfErrorCodes        // number of codes above; sizes the errorMessages table
    };
  65
  66     /*
  67      * CharacterClassParserDelegate:
  68      *
  69      * The class CharacterClassParserDelegate is used in the parsing of character
  70      * classes.  This class handles detection of character ranges.  This class
  71      * implements enough of the delegate interface such that it can be passed to
  72      * parseEscape() as an EscapeDelegate.  This allows parseEscape() to be reused
  73      * to perform the parsing of escape characters in character sets.
  74      */
    // Receives the contents of one character class from parseCharacterClass() and
    // parseEscape(), and forwards them to the real delegate. A plain character is
    // cached for one step (m_character / m_state) before being forwarded, so that
    // a following '-' can turn it into the start of a range.
    class CharacterClassParserDelegate {
    public:
        // 'delegate' receives the atomCharacterClass*() callbacks; 'err' is the
        // parser's error slot, written when an out-of-order range is found.
        CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
            : m_delegate(delegate)
            , m_err(err)
            , m_state(Empty)
            , m_character(0)
        {
        }

        /*
         * begin():
         *
         * Called at beginning of construction.
         */
        void begin(bool invert)
        {
            m_delegate.atomCharacterClassBegin(invert);
        }

        /*
         * atomPatternCharacter():
         *
         * This method is called either from parseCharacterClass() (for an unescaped
         * character in a character class), or from parseEscape(). In the former case
         * the value true will be passed for the argument 'hyphenIsRange', and in this
         * mode we will allow a hyphen to be treated as indicating a range (i.e. /[a-z]/
         * is different to /[a\-z]/).
         */
        void atomPatternCharacter(UChar ch, bool hyphenIsRange = false)
        {
            switch (m_state) {
            case AfterCharacterClass:
                // Following a builtin character class we need to look out for a hyphen.
                // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
                // If we see a hyphen following a character class then, unlike usual,
                // we report it to the delegate immediately and move to the
                // AfterCharacterClassHyphen state. In that state the next character
                // or character class is reported as a plain atom rather than being
                // treated as the end of a range.
                if (hyphenIsRange && ch == '-') {
                    m_delegate.atomCharacterClassAtom('-');
                    m_state = AfterCharacterClassHyphen;
                    return;
                }
                // Otherwise just fall through - cached character so treat this as Empty.

            case Empty:
                // Nothing pending: hold this character back until we know whether
                // it starts a range.
                m_character = ch;
                m_state = CachedCharacter;
                return;

            case CachedCharacter:
                if (hyphenIsRange && ch == '-')
                    m_state = CachedCharacterHyphen;
                else {
                    // Not a range: flush the cached character and cache the new one.
                    m_delegate.atomCharacterClassAtom(m_character);
                    m_character = ch;
                }
                return;

            case CachedCharacterHyphen:
                // 'ch' is the end of a range that starts at m_character.
                if (ch < m_character) {
                    m_err = CharacterClassOutOfOrder;
                    return;
                }
                m_delegate.atomCharacterClassRange(m_character, ch);
                m_state = Empty;
                return;

                // See comment in atomBuiltInCharacterClass below.
                // This too is technically an error, per ECMA-262, and again
                // we chose to allow this.  Note a subtlety here that while we
                // diverge from the spec's definition of CharacterRange we do
                // remain in compliance with the grammar.  For example, consider
                // the expression /[\d-a-z]/.  We comply with the grammar in
                // this case by not allowing a-z to be matched as a range.
            case AfterCharacterClassHyphen:
                m_delegate.atomCharacterClassAtom(ch);
                m_state = Empty;
                return;
            }
        }

        /*
         * atomBuiltInCharacterClass():
         *
         * Adds a built-in character class, called by parseEscape().
         */
        void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
        {
            switch (m_state) {
            case CachedCharacter:
                // Flush the currently cached character, then fall through.
                m_delegate.atomCharacterClassAtom(m_character);

            case Empty:
            case AfterCharacterClass:
                m_state = AfterCharacterClass;
                m_delegate.atomCharacterClassBuiltIn(classID, invert);
                return;

                // If we hit either of these cases, we have an invalid range that
                // looks something like /[x-\d]/ or /[\d-\d]/.
                // According to ECMA-262 this should be a syntax error, but
                // empirical testing shows this to break web content.  Instead we
                // comply with the ECMA-262 grammar, and assume the grammar to
                // have matched the range correctly, but tweak our interpretation
                // of CharacterRange.  Effectively we implicitly handle the hyphen
                // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/.
            case CachedCharacterHyphen:
                m_delegate.atomCharacterClassAtom(m_character);
                m_delegate.atomCharacterClassAtom('-');
                // fall through
            case AfterCharacterClassHyphen:
                m_delegate.atomCharacterClassBuiltIn(classID, invert);
                m_state = Empty;
                return;
            }
        }

        /*
         * end():
         *
         * Called at end of construction.
         */
        void end()
        {
            // Flush whatever is still cached: a lone character, or a character
            // followed by a trailing hyphen (which is then a literal '-').
            if (m_state == CachedCharacter)
                m_delegate.atomCharacterClassAtom(m_character);
            else if (m_state == CachedCharacterHyphen) {
                m_delegate.atomCharacterClassAtom(m_character);
                m_delegate.atomCharacterClassAtom('-');
            }
            m_delegate.atomCharacterClassEnd();
        }

        // parseEscape() should never call these delegate methods when
        // invoked with inCharacterClass set.
        void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); }
        void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }

    private:
        Delegate& m_delegate; // receives the atomCharacterClass*() callbacks
        ErrorCode& m_err;     // the parser's error slot
        enum CharacterClassConstructionState {
            Empty,                     // nothing pending
            CachedCharacter,           // m_character is held back, not yet reported
            CachedCharacterHyphen,     // m_character then an unescaped '-' were seen
            AfterCharacterClass,       // a built-in class was just reported
            AfterCharacterClassHyphen, // a built-in class then '-' were just reported
        } m_state;
        UChar m_character; // the held-back character; meaningful in the CachedCharacter* states
    };
 229
 230     Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit)
 231         : m_delegate(delegate)
 232         , m_backReferenceLimit(backReferenceLimit)
 233         , m_err(NoError)
 234         , m_data(pattern.characters())
 235         , m_size(pattern.length())
 236         , m_index(0)
 237         , m_parenthesesNestingDepth(0)
 238     {
 239     }
 240
    /*
     * parseEscape():
     *
     * Helper for parseTokens() AND parseCharacterClass().
     * Unlike the other parser methods, this function does not report tokens
     * directly to the member delegate (m_delegate), instead tokens are
     * emitted to the delegate provided as an argument.  In the case of atom
     * escapes, parseTokens() will call parseEscape() passing m_delegate as
     * an argument, and as such the escape will be reported to the delegate.
     *
     * However this method may also be used by parseCharacterClass(), in which
     * case a CharacterClassParserDelegate will be passed as the delegate that
     * tokens should be added to.  The boolean template argument indicates
     * whether an escape in a CharacterClass is being parsed (some parsing
     * rules change in this context).
     *
     * The boolean value returned by this method indicates whether the token
     * parsed was an atom (outside of a character class \b and \B will be
     * interpreted as assertions).  If the backslash is the last character of
     * the pattern, m_err is set to EscapeUnterminated and false is returned.
     */
    template<bool inCharacterClass, class EscapeDelegate>
    bool parseEscape(EscapeDelegate& delegate)
    {
        ASSERT(!m_err);
        ASSERT(peek() == '\\');
        consume();

        if (atEndOfPattern()) {
            m_err = EscapeUnterminated;
            return false;
        }

        switch (peek()) {
        // Assertions
        case 'b':
            consume();
            // Inside a character class \b is a backspace character, not an assertion.
            if (inCharacterClass)
                delegate.atomPatternCharacter('\b');
            else {
                delegate.assertionWordBoundary(false);
                return false;
            }
            break;
        case 'B':
            consume();
            // Inside a character class \B is just the letter 'B'.
            if (inCharacterClass)
                delegate.atomPatternCharacter('B');
            else {
                delegate.assertionWordBoundary(true);
                return false;
            }
            break;

        // CharacterClassEscape
        case 'd':
            consume();
            delegate.atomBuiltInCharacterClass(DigitClassID, false);
            break;
        case 's':
            consume();
            delegate.atomBuiltInCharacterClass(SpaceClassID, false);
            break;
        case 'w':
            consume();
            delegate.atomBuiltInCharacterClass(WordClassID, false);
            break;
        case 'D':
            consume();
            delegate.atomBuiltInCharacterClass(DigitClassID, true);
            break;
        case 'S':
            consume();
            delegate.atomBuiltInCharacterClass(SpaceClassID, true);
            break;
        case 'W':
            consume();
            delegate.atomBuiltInCharacterClass(WordClassID, true);
            break;

        // DecimalEscape
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9': {
            // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
            // First, try to parse this as backreference.
            if (!inCharacterClass) {
                ParseState state = saveState();

                unsigned backReference = consumeNumber();
                if (backReference <= m_backReferenceLimit) {
                    delegate.atomBackReference(backReference);
                    break;
                }

                // Too large to be a back-reference: rewind to the first digit.
                restoreState(state);
            }

            // Not a backreference, and not octal.
            // The backslash is reported as a literal; the digit is not consumed
            // here, so the caller's loop will parse it as the next token.
            if (peek() >= '8') {
                delegate.atomPatternCharacter('\\');
                break;
            }

            // Fall-through to handle this as an octal escape.
        }

        // Octal escape
        case '0':
            delegate.atomPatternCharacter(consumeOctal());
            break;

        // ControlEscape
        case 'f':
            consume();
            delegate.atomPatternCharacter('\f');
            break;
        case 'n':
            consume();
            delegate.atomPatternCharacter('\n');
            break;
        case 'r':
            consume();
            delegate.atomPatternCharacter('\r');
            break;
        case 't':
            consume();
            delegate.atomPatternCharacter('\t');
            break;
        case 'v':
            consume();
            delegate.atomPatternCharacter('\v');
            break;

        // ControlLetter
        case 'c': {
            ParseState state = saveState();
            consume();
            if (!atEndOfPattern()) {
                int control = consume();

                // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
                if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
                    delegate.atomPatternCharacter(control & 0x1f);
                    break;
                }
            }
            // Not a valid control escape: rewind to the 'c' and report the
            // backslash as a literal; the 'c' is left for the caller's loop.
            restoreState(state);
            delegate.atomPatternCharacter('\\');
            break;
        }

        // HexEscape
        case 'x': {
            consume();
            // Without two hex digits, \x is treated as a literal 'x'
            // (tryConsumeHex() consumes nothing when it fails).
            int x = tryConsumeHex(2);
            if (x == -1)
                delegate.atomPatternCharacter('x');
            else
                delegate.atomPatternCharacter(x);
            break;
        }

        // UnicodeEscape
        case 'u': {
            consume();
            // Without four hex digits, \u is treated as a literal 'u'.
            int u = tryConsumeHex(4);
            if (u == -1)
                delegate.atomPatternCharacter('u');
            else
                delegate.atomPatternCharacter(u);
            break;
        }

        // IdentityEscape
        default:
            delegate.atomPatternCharacter(consume());
        }

        return true;
    }
 427
 428     /*
 429      * parseAtomEscape(), parseCharacterClassEscape():
 430      *
 431      * These methods alias to parseEscape().
 432      */
 433     bool parseAtomEscape()
 434     {
 435         return parseEscape<false>(m_delegate);
 436     }
 437     void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
 438     {
 439         parseEscape<true>(delegate);
 440     }
 441
 442     /*
 443      * parseCharacterClass():
 444      *
 445      * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
 446      * to an instance of CharacterClassParserDelegate, to describe the character class to the
 447      * delegate.
 448      */
 449     void parseCharacterClass()
 450     {
 451         ASSERT(!m_err);
 452         ASSERT(peek() == '[');
 453         consume();
 454
 455         CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);
 456
 457         characterClassConstructor.begin(tryConsume('^'));
 458
 459         while (!atEndOfPattern()) {
 460             switch (peek()) {
 461             case ']':
 462                 consume();
 463                 characterClassConstructor.end();
 464                 return;
 465
 466             case '\\':
 467                 parseCharacterClassEscape(characterClassConstructor);
 468                 break;
 469
 470             default:
 471                 characterClassConstructor.atomPatternCharacter(consume(), true);
 472             }
 473
 474             if (m_err)
 475                 return;
 476         }
 477
 478         m_err = CharacterClassUnmatched;
 479     }
 480
 481     /*
 482      * parseParenthesesBegin():
 483      *
 484      * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
 485      */
 486     void parseParenthesesBegin()
 487     {
 488         ASSERT(!m_err);
 489         ASSERT(peek() == '(');
 490         consume();
 491
 492         if (tryConsume('?')) {
 493             if (atEndOfPattern()) {
 494                 m_err = ParenthesesTypeInvalid;
 495                 return;
 496             }
 497
 498             switch (consume()) {
 499             case ':':
 500                 m_delegate.atomParenthesesSubpatternBegin(false);
 501                 break;
 502
 503             case '=':
 504                 m_delegate.atomParentheticalAssertionBegin();
 505                 break;
 506
 507             case '!':
 508                 m_delegate.atomParentheticalAssertionBegin(true);
 509                 break;
 510
 511             default:
 512                 m_err = ParenthesesTypeInvalid;
 513             }
 514         } else
 515             m_delegate.atomParenthesesSubpatternBegin();
 516
 517         ++m_parenthesesNestingDepth;
 518     }
 519
 520     /*
 521      * parseParenthesesEnd():
 522      *
 523      * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
 524      */
 525     void parseParenthesesEnd()
 526     {
 527         ASSERT(!m_err);
 528         ASSERT(peek() == ')');
 529         consume();
 530
 531         if (m_parenthesesNestingDepth > 0)
 532             m_delegate.atomParenthesesEnd();
 533         else
 534             m_err = ParenthesesUnmatched;
 535
 536         --m_parenthesesNestingDepth;
 537     }
 538
 539     /*
 540      * parseQuantifier():
 541      *
 542      * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
 543      */
 544     void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
 545     {
 546         ASSERT(!m_err);
 547         ASSERT(min <= max);
 548
 549         if (lastTokenWasAnAtom)
 550             m_delegate.quantifyAtom(min, max, !tryConsume('?'));
 551         else
 552             m_err = QuantifierWithoutAtom;
 553     }
 554
    /*
     * parseTokens():
     *
     * This method loops over the input pattern reporting tokens to the delegate.
     * The method returns when a parse error is detected, or the end of the pattern
     * is reached.  One piece of state is tracked around the loop, which is whether
     * the last token passed to the delegate was an atom (this is necessary to detect
     * a parse error when a quantifier is provided without an atom to quantify).
     * If the end of the pattern is reached with parentheses still open, m_err is
     * set to MissingParentheses.
     */
    void parseTokens()
    {
        bool lastTokenWasAnAtom = false;

        while (!atEndOfPattern()) {
            switch (peek()) {
            case '|':
                consume();
                m_delegate.disjunction();
                lastTokenWasAnAtom = false;
                break;

            case '(':
                parseParenthesesBegin();
                lastTokenWasAnAtom = false;
                break;

            case ')':
                // A closed group counts as an atom, so it may be quantified.
                parseParenthesesEnd();
                lastTokenWasAnAtom = true;
                break;

            case '^':
                consume();
                m_delegate.assertionBOL();
                lastTokenWasAnAtom = false;
                break;

            case '$':
                consume();
                m_delegate.assertionEOL();
                lastTokenWasAnAtom = false;
                break;

            case '.':
                // '.' is reported as the inverted newline class.
                consume();
                m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);
                lastTokenWasAnAtom = true;
                break;

            case '[':
                parseCharacterClass();
                lastTokenWasAnAtom = true;
                break;

            case '\\':
                // parseAtomEscape() returns false for the \b and \B assertions.
                lastTokenWasAnAtom = parseAtomEscape();
                break;

            case '*':
                consume();
                parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite);
                lastTokenWasAnAtom = false;
                break;

            case '+':
                consume();
                parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite);
                lastTokenWasAnAtom = false;
                break;

            case '?':
                consume();
                parseQuantifier(lastTokenWasAnAtom, 0, 1);
                lastTokenWasAnAtom = false;
                break;

            case '{': {
                // Try to parse {n}, {n,} or {n,m}; on failure rewind and treat
                // the '{' as an ordinary pattern character.
                ParseState state = saveState();

                consume();
                if (peekIsDigit()) {
                    unsigned min = consumeNumber();
                    unsigned max = min;

                    // "{n,}" has no upper bound.
                    if (tryConsume(','))
                        max = peekIsDigit() ? consumeNumber() : quantifyInfinite;

                    if (tryConsume('}')) {
                        if (min <= max)
                            parseQuantifier(lastTokenWasAnAtom, min, max);
                        else
                            m_err = QuantifierOutOfOrder;
                        lastTokenWasAnAtom = false;
                        break;
                    }
                }

                restoreState(state);
            } // if we did not find a complete quantifier, fall through to the default case.

            default:
                m_delegate.atomPatternCharacter(consume());
                lastTokenWasAnAtom = true;
            }

            // Stop at the first error reported by any of the helpers above.
            if (m_err)
                return;
        }

        if (m_parenthesesNestingDepth > 0)
            m_err = MissingParentheses;
    }
 667
 668     /*
 669      * parse():
 670      *
 671      * This method calls parseTokens() to parse over the input and converts any
 672      * error code to a const char* for a result.
 673      */
 674     const char* parse()
 675     {
 676         if (m_size > MAX_PATTERN_SIZE)
 677             m_err = PatternTooLarge;
 678         else
 679             parseTokens();
 680         ASSERT(atEndOfPattern() || m_err);
 681
 682         // The order of this array must match the ErrorCode enum.
 683         static const char* errorMessages[NumberOfErrorCodes] = {
 684             0, // NoError
 685             REGEXP_ERROR_PREFIX "regular expression too large",
 686             REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier",
 687             REGEXP_ERROR_PREFIX "nothing to repeat",
 688             REGEXP_ERROR_PREFIX "missing )",
 689             REGEXP_ERROR_PREFIX "unmatched parentheses",
 690             REGEXP_ERROR_PREFIX "unrecognized character after (?",
 691             REGEXP_ERROR_PREFIX "missing terminating ] for character class",
 692             REGEXP_ERROR_PREFIX "range out of order in character class",
 693             REGEXP_ERROR_PREFIX "\\ at end of pattern"
 694         };
 695
 696         return errorMessages[m_err];
 697     }
 698
 699
 700     // Misc helper functions:
 701
    // A saved parse position: the value of m_index when saveState() was called.
    typedef unsigned ParseState;
 703
 704     ParseState saveState()
 705     {
 706         return m_index;
 707     }
 708
 709     void restoreState(ParseState state)
 710     {
 711         m_index = state;
 712     }
 713
 714     bool atEndOfPattern()
 715     {
 716         ASSERT(m_index <= m_size);
 717         return m_index == m_size;
 718     }
 719
 720     int peek()
 721     {
 722         ASSERT(m_index < m_size);
 723         return m_data[m_index];
 724     }
 725
 726     bool peekIsDigit()
 727     {
 728         return !atEndOfPattern() && WTF::isASCIIDigit(peek());
 729     }
 730
 731     unsigned peekDigit()
 732     {
 733         ASSERT(peekIsDigit());
 734         return peek() - '0';
 735     }
 736
 737     int consume()
 738     {
 739         ASSERT(m_index < m_size);
 740         return m_data[m_index++];
 741     }
 742
 743     unsigned consumeDigit()
 744     {
 745         ASSERT(peekIsDigit());
 746         return consume() - '0';
 747     }
 748
 749     unsigned consumeNumber()
 750     {
 751         unsigned n = consumeDigit();
 752         // check for overflow.
 753         for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) {
 754             n = newValue;
 755             consume();
 756         }
 757         return n;
 758     }
 759
 760     unsigned consumeOctal()
 761     {
 762         ASSERT(WTF::isASCIIOctalDigit(peek()));
 763
 764         unsigned n = consumeDigit();
 765         while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
 766             n = n * 8 + consumeDigit();
 767         return n;
 768     }
 769
 770     bool tryConsume(UChar ch)
 771     {
 772         if (atEndOfPattern() || (m_data[m_index] != ch))
 773             return false;
 774         ++m_index;
 775         return true;
 776     }
 777
 778     int tryConsumeHex(int count)
 779     {
 780         ParseState state = saveState();
 781
 782         int n = 0;
 783         while (count--) {
 784             if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
 785                 restoreState(state);
 786                 return -1;
 787             }
 788             n = (n << 4) | WTF::toASCIIHexValue(consume());
 789         }
 790         return n;
 791     }
 792
    Delegate& m_delegate;               // receives the token callbacks
    unsigned m_backReferenceLimit;      // largest \N that parseEscape() reports as a back-reference
    ErrorCode m_err;                    // error recorded by the parse, or NoError
    const UChar* m_data;                // the pattern's characters (from pattern.characters())
    unsigned m_size;                    // the pattern's length (from pattern.length())
    unsigned m_index;                   // offset in m_data of the next character to read
    unsigned m_parenthesesNestingDepth; // count of '(' groups opened and not yet closed

    // Longest pattern that parse() will accept; longer patterns fail with PatternTooLarge.
    // Derived by empirical testing of compile time in PCRE and WREC.
    static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
 803 };
 804
 805 /*
 806  * Yarr::parse():
 807  *
 808  * The parse method is passed a pattern to be parsed and a delegate upon which
 809  * callbacks will be made to record the parsed tokens forming the regex.
 810  * Yarr::parse() returns null on success, or a const C string providing an error
 811  * message where a parse error occurs.
 812  *
 813  * The Delegate must implement the following interface:
 814  *
 815  *    void assertionBOL();
 816  *    void assertionEOL();
 817  *    void assertionWordBoundary(bool invert);
 818  *
 819  *    void atomPatternCharacter(UChar ch);
 820  *    void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
 821  *    void atomCharacterClassBegin(bool invert)
 822  *    void atomCharacterClassAtom(UChar ch)
 823  *    void atomCharacterClassRange(UChar begin, UChar end)
 824  *    void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
 825  *    void atomCharacterClassEnd()
 826  *    void atomParenthesesSubpatternBegin(bool capture = true);
 827  *    void atomParentheticalAssertionBegin(bool invert = false);
 828  *    void atomParenthesesEnd();
 829  *    void atomBackReference(unsigned subpatternId);
 830  *
 831  *    void quantifyAtom(unsigned min, unsigned max, bool greedy);
 832  *
 833  *    void disjunction();
 834  *
 835  * The regular expression is described by a sequence of assertion*() and atom*()
 836  * callbacks to the delegate, describing the terms in the regular expression.
 837  * Following an atom a quantifyAtom() call may occur to indicate that the previous
 838  * atom should be quantified.  In the case of atoms described across multiple
 839  * calls (parentheses and character classes) the call to quantifyAtom() will come
 840  * after the call to the atom*End() method, never after atom*Begin().
 841  *
 842  * Character classes may either be described by a single call to
 843  * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
 844  * In the latter case, ...Begin() will be called, followed by a sequence of
 845  * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
 846  *
 847  * Sequences of atoms and assertions are broken into alternatives via calls to
 848  * disjunction().  Assertions, atoms, and disjunctions emitted between calls to
 849  * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
 850  * atomParenthesesBegin() is passed a subpatternId.  In the case of a regular
 851  * capturing subpattern, this will be the subpatternId associated with these
 852  * parentheses, and will also by definition be the lowest subpatternId of these
 853  * parentheses and of any nested paretheses.  The atomParenthesesEnd() method
 854  * is passed the subpatternId of the last capturing subexpression nested within
 855  * these paretheses.  In the case of a capturing subpattern with no nested
 856  * capturing subpatterns, the same subpatternId will be passed to the begin and
 857  * end functions.  In the case of non-capturing subpatterns the subpatternId
 858  * passed to the begin method is also the first possible subpatternId that might
 859  * be nested within these paretheses.  If a set of non-capturing parentheses does
 860  * not contain any capturing subpatterns, then the subpatternId passed to begin
 861  * will be greater than the subpatternId passed to end.
 862  */
 863
 864 template<class Delegate>
 865 const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = quantifyInfinite)
 866 {
 867     return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse();
 868 }
 869
 870 } } // namespace JSC::Yarr
 871
 872 #endif // YarrParser_h