yarr/YarrParser.h

   1 /*
   2  * Copyright (C) 2009 Apple Inc. All rights reserved.
   3  *
   4  * Redistribution and use in source and binary forms, with or without
   5  * modification, are permitted provided that the following conditions
   6  * are met:
   7  * 1. Redistributions of source code must retain the above copyright
   8  *    notice, this list of conditions and the following disclaimer.
   9  * 2. Redistributions in binary form must reproduce the above copyright
  10  *    notice, this list of conditions and the following disclaimer in the
  11  *    documentation and/or other materials provided with the distribution.
  12  *
  13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
  14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
  17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  24  */
  25
  26 #ifndef YarrParser_h
  27 #define YarrParser_h
  28
  29 #include "Yarr.h"
  30 #include <wtf/ASCIICType.h>
  31 #include <wtf/text/WTFString.h>
  32
  33 namespace JSC { namespace Yarr {
  34
  35 #define REGEXP_ERROR_PREFIX "Invalid regular expression: "
  36
  37 enum BuiltInCharacterClassID {
  38     DigitClassID,
  39     SpaceClassID,
  40     WordClassID,
  41     NewlineClassID,
  42 };
  43
  44 // The Parser class should not be used directly - only via the Yarr::parse() method.
  45 template<class Delegate, typename CharType>
  46 class Parser {
  47 private:
  48     template<class FriendDelegate>
  49     friend const char* parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit);
  50
  51     enum ErrorCode {
  52         NoError,
  53         PatternTooLarge,
  54         QuantifierOutOfOrder,
  55         QuantifierWithoutAtom,
  56         QuantifierTooLarge,
  57         MissingParentheses,
  58         ParenthesesUnmatched,
  59         ParenthesesTypeInvalid,
  60         CharacterClassUnmatched,
  61         CharacterClassOutOfOrder,
  62         EscapeUnterminated,
  63         NumberOfErrorCodes
  64     };
  65
  66     /*
  67      * CharacterClassParserDelegate:
  68      *
  69      * The class CharacterClassParserDelegate is used in the parsing of character
  70      * classes.  This class handles detection of character ranges.  This class
  71      * implements enough of the delegate interface such that it can be passed to
  72      * parseEscape() as an EscapeDelegate.  This allows parseEscape() to be reused
  73      * to perform the parsing of escape characters in character sets.
  74      */
  75     class CharacterClassParserDelegate {
  76     public:
  77         CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
  78             : m_delegate(delegate)
  79             , m_err(err)
  80             , m_state(Empty)
  81             , m_character(0)
  82         {
  83         }
  84
  85         /*
  86          * begin():
  87          *
  88          * Called at beginning of construction.
  89          */
  90         void begin(bool invert)
  91         {
  92             m_delegate.atomCharacterClassBegin(invert);
  93         }
  94
  95         /*
  96          * atomPatternCharacter():
  97          *
  98          * This method is called either from parseCharacterClass() (for an unescaped
  99          * character in a character class), or from parseEscape(). In the former case
 100          * the value true will be passed for the argument 'hyphenIsRange', and in this
 101          * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/
 102          * is different to /[a\-z]/).
 103          */
 104         void atomPatternCharacter(UChar ch, bool hyphenIsRange = false)
 105         {
 106             switch (m_state) {
 107             case AfterCharacterClass:
 108                 // Following a builtin character class we need look out for a hyphen.
 109                 // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
 110                 // If we see a hyphen following a charater class then unlike usual
 111                 // we'll report it to the delegate immediately, and put ourself into
 112                 // a poisoned state. Any following calls to add another character or
 113                 // character class will result in an error. (A hypen following a
 114                 // character-class is itself valid, but only  at the end of a regex).
 115                 if (hyphenIsRange && ch == '-') {
 116                     m_delegate.atomCharacterClassAtom('-');
 117                     m_state = AfterCharacterClassHyphen;
 118                     return;
 119                 }
 120                 // Otherwise just fall through - cached character so treat this as Empty.
 121                 FALLTHROUGH;
 122
 123             case Empty:
 124                 m_character = ch;
 125                 m_state = CachedCharacter;
 126                 return;
 127
 128             case CachedCharacter:
 129                 if (hyphenIsRange && ch == '-')
 130                     m_state = CachedCharacterHyphen;
 131                 else {
 132                     m_delegate.atomCharacterClassAtom(m_character);
 133                     m_character = ch;
 134                 }
 135                 return;
 136
 137             case CachedCharacterHyphen:
 138                 if (ch < m_character) {
 139                     m_err = CharacterClassOutOfOrder;
 140                     return;
 141                 }
 142                 m_delegate.atomCharacterClassRange(m_character, ch);
 143                 m_state = Empty;
 144                 return;
 145
 146                 // See coment in atomBuiltInCharacterClass below.
 147                 // This too is technically an error, per ECMA-262, and again we
 148                 // we chose to allow this.  Note a subtlely here that while we
 149                 // diverge from the spec's definition of CharacterRange we do
 150                 // remain in compliance with the grammar.  For example, consider
 151                 // the expression /[\d-a-z]/.  We comply with the grammar in
 152                 // this case by not allowing a-z to be matched as a range.
 153             case AfterCharacterClassHyphen:
 154                 m_delegate.atomCharacterClassAtom(ch);
 155                 m_state = Empty;
 156                 return;
 157             }
 158         }
 159
 160         /*
 161          * atomBuiltInCharacterClass():
 162          *
 163          * Adds a built-in character class, called by parseEscape().
 164          */
 165         void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
 166         {
 167             switch (m_state) {
 168             case CachedCharacter:
 169                 // Flush the currently cached character, then fall through.
 170                 m_delegate.atomCharacterClassAtom(m_character);
 171                 FALLTHROUGH;
 172             case Empty:
 173             case AfterCharacterClass:
 174                 m_state = AfterCharacterClass;
 175                 m_delegate.atomCharacterClassBuiltIn(classID, invert);
 176                 return;
 177
 178                 // If we hit either of these cases, we have an invalid range that
 179                 // looks something like /[x-\d]/ or /[\d-\d]/.
 180                 // According to ECMA-262 this should be a syntax error, but
 181                 // empirical testing shows this to break teh webz.  Instead we
 182                 // comply with to the ECMA-262 grammar, and assume the grammar to
 183                 // have matched the range correctly, but tweak our interpretation
 184                 // of CharacterRange.  Effectively we implicitly handle the hyphen
 185                 // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/.
 186             case CachedCharacterHyphen:
 187                 m_delegate.atomCharacterClassAtom(m_character);
 188                 m_delegate.atomCharacterClassAtom('-');
 189                 FALLTHROUGH;
 190             case AfterCharacterClassHyphen:
 191                 m_delegate.atomCharacterClassBuiltIn(classID, invert);
 192                 m_state = Empty;
 193                 return;
 194             }
 195         }
 196
 197         /*
 198          * end():
 199          *
 200          * Called at end of construction.
 201          */
 202         void end()
 203         {
 204             if (m_state == CachedCharacter)
 205                 m_delegate.atomCharacterClassAtom(m_character);
 206             else if (m_state == CachedCharacterHyphen) {
 207                 m_delegate.atomCharacterClassAtom(m_character);
 208                 m_delegate.atomCharacterClassAtom('-');
 209             }
 210             m_delegate.atomCharacterClassEnd();
 211         }
 212
 213         // parseEscape() should never call these delegate methods when
 214         // invoked with inCharacterClass set.
 215         NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { RELEASE_ASSERT_NOT_REACHED(); }
 216         NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { RELEASE_ASSERT_NOT_REACHED(); }
 217
 218     private:
 219         Delegate& m_delegate;
 220         ErrorCode& m_err;
 221         enum CharacterClassConstructionState {
 222             Empty,
 223             CachedCharacter,
 224             CachedCharacterHyphen,
 225             AfterCharacterClass,
 226             AfterCharacterClassHyphen,
 227         } m_state;
 228         UChar m_character;
 229     };
 230
 231     Parser(Delegate& delegate, const String& pattern, unsigned backReferenceLimit)
 232         : m_delegate(delegate)
 233         , m_backReferenceLimit(backReferenceLimit)
 234         , m_err(NoError)
 235         , m_data(pattern.characters<CharType>())
 236         , m_size(pattern.length())
 237         , m_index(0)
 238         , m_parenthesesNestingDepth(0)
 239     {
 240     }
 241
 242     /*
 243      * parseEscape():
 244      *
 245      * Helper for parseTokens() AND parseCharacterClass().
 246      * Unlike the other parser methods, this function does not report tokens
 247      * directly to the member delegate (m_delegate), instead tokens are
 248      * emitted to the delegate provided as an argument.  In the case of atom
 249      * escapes, parseTokens() will call parseEscape() passing m_delegate as
 250      * an argument, and as such the escape will be reported to the delegate.
 251      *
 252      * However this method may also be used by parseCharacterClass(), in which
 253      * case a CharacterClassParserDelegate will be passed as the delegate that
 254      * tokens should be added to.  A boolean flag is also provided to indicate
 255      * whether that an escape in a CharacterClass is being parsed (some parsing
 256      * rules change in this context).
 257      *
 258      * The boolean value returned by this method indicates whether the token
 259      * parsed was an atom (outside of a characted class \b and \B will be
 260      * interpreted as assertions).
 261      */
 262     template<bool inCharacterClass, class EscapeDelegate>
 263     bool parseEscape(EscapeDelegate& delegate)
 264     {
 265         ASSERT(!m_err);
 266         ASSERT(peek() == '\\');
 267         consume();
 268
 269         if (atEndOfPattern()) {
 270             m_err = EscapeUnterminated;
 271             return false;
 272         }
 273
 274         switch (peek()) {
 275         // Assertions
 276         case 'b':
 277             consume();
 278             if (inCharacterClass)
 279                 delegate.atomPatternCharacter('\b');
 280             else {
 281                 delegate.assertionWordBoundary(false);
 282                 return false;
 283             }
 284             break;
 285         case 'B':
 286             consume();
 287             if (inCharacterClass)
 288                 delegate.atomPatternCharacter('B');
 289             else {
 290                 delegate.assertionWordBoundary(true);
 291                 return false;
 292             }
 293             break;
 294
 295         // CharacterClassEscape
 296         case 'd':
 297             consume();
 298             delegate.atomBuiltInCharacterClass(DigitClassID, false);
 299             break;
 300         case 's':
 301             consume();
 302             delegate.atomBuiltInCharacterClass(SpaceClassID, false);
 303             break;
 304         case 'w':
 305             consume();
 306             delegate.atomBuiltInCharacterClass(WordClassID, false);
 307             break;
 308         case 'D':
 309             consume();
 310             delegate.atomBuiltInCharacterClass(DigitClassID, true);
 311             break;
 312         case 'S':
 313             consume();
 314             delegate.atomBuiltInCharacterClass(SpaceClassID, true);
 315             break;
 316         case 'W':
 317             consume();
 318             delegate.atomBuiltInCharacterClass(WordClassID, true);
 319             break;
 320
 321         // DecimalEscape
 322         case '1':
 323         case '2':
 324         case '3':
 325         case '4':
 326         case '5':
 327         case '6':
 328         case '7':
 329         case '8':
 330         case '9': {
 331             // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
 332             // First, try to parse this as backreference.
 333             if (!inCharacterClass) {
 334                 ParseState state = saveState();
 335
 336                 unsigned backReference = consumeNumber();
 337                 if (backReference <= m_backReferenceLimit) {
 338                     delegate.atomBackReference(backReference);
 339                     break;
 340                 }
 341
 342                 restoreState(state);
 343             }
 344
 345             // Not a backreference, and not octal.
 346             if (peek() >= '8') {
 347                 delegate.atomPatternCharacter('\\');
 348                 break;
 349             }
 350
 351             // Fall-through to handle this as an octal escape.
 352             FALLTHROUGH;
 353         }
 354
 355         // Octal escape
 356         case '0':
 357             delegate.atomPatternCharacter(consumeOctal());
 358             break;
 359
 360         // ControlEscape
 361         case 'f':
 362             consume();
 363             delegate.atomPatternCharacter('\f');
 364             break;
 365         case 'n':
 366             consume();
 367             delegate.atomPatternCharacter('\n');
 368             break;
 369         case 'r':
 370             consume();
 371             delegate.atomPatternCharacter('\r');
 372             break;
 373         case 't':
 374             consume();
 375             delegate.atomPatternCharacter('\t');
 376             break;
 377         case 'v':
 378             consume();
 379             delegate.atomPatternCharacter('\v');
 380             break;
 381
 382         // ControlLetter
 383         case 'c': {
 384             ParseState state = saveState();
 385             consume();
 386             if (!atEndOfPattern()) {
 387                 int control = consume();
 388
 389                 // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
 390                 if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
 391                     delegate.atomPatternCharacter(control & 0x1f);
 392                     break;
 393                 }
 394             }
 395             restoreState(state);
 396             delegate.atomPatternCharacter('\\');
 397             break;
 398         }
 399
 400         // HexEscape
 401         case 'x': {
 402             consume();
 403             int x = tryConsumeHex(2);
 404             if (x == -1)
 405                 delegate.atomPatternCharacter('x');
 406             else
 407                 delegate.atomPatternCharacter(x);
 408             break;
 409         }
 410
 411         // UnicodeEscape
 412         case 'u': {
 413             consume();
 414             int u = tryConsumeHex(4);
 415             if (u == -1)
 416                 delegate.atomPatternCharacter('u');
 417             else
 418                 delegate.atomPatternCharacter(u);
 419             break;
 420         }
 421
 422         // IdentityEscape
 423         default:
 424             delegate.atomPatternCharacter(consume());
 425         }
 426
 427         return true;
 428     }
 429
 430     /*
 431      * parseAtomEscape(), parseCharacterClassEscape():
 432      *
 433      * These methods alias to parseEscape().
 434      */
 435     bool parseAtomEscape()
 436     {
 437         return parseEscape<false>(m_delegate);
 438     }
 439     void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
 440     {
 441         parseEscape<true>(delegate);
 442     }
 443
 444     /*
 445      * parseCharacterClass():
 446      *
 447      * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
 448      * to an instance of CharacterClassParserDelegate, to describe the character class to the
 449      * delegate.
 450      */
 451     void parseCharacterClass()
 452     {
 453         ASSERT(!m_err);
 454         ASSERT(peek() == '[');
 455         consume();
 456
 457         CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);
 458
 459         characterClassConstructor.begin(tryConsume('^'));
 460
 461         while (!atEndOfPattern()) {
 462             switch (peek()) {
 463             case ']':
 464                 consume();
 465                 characterClassConstructor.end();
 466                 return;
 467
 468             case '\\':
 469                 parseCharacterClassEscape(characterClassConstructor);
 470                 break;
 471
 472             default:
 473                 characterClassConstructor.atomPatternCharacter(consume(), true);
 474             }
 475
 476             if (m_err)
 477                 return;
 478         }
 479
 480         m_err = CharacterClassUnmatched;
 481     }
 482
 483     /*
 484      * parseParenthesesBegin():
 485      *
 486      * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
 487      */
 488     void parseParenthesesBegin()
 489     {
 490         ASSERT(!m_err);
 491         ASSERT(peek() == '(');
 492         consume();
 493
 494         if (tryConsume('?')) {
 495             if (atEndOfPattern()) {
 496                 m_err = ParenthesesTypeInvalid;
 497                 return;
 498             }
 499
 500             switch (consume()) {
 501             case ':':
 502                 m_delegate.atomParenthesesSubpatternBegin(false);
 503                 break;
 504
 505             case '=':
 506                 m_delegate.atomParentheticalAssertionBegin();
 507                 break;
 508
 509             case '!':
 510                 m_delegate.atomParentheticalAssertionBegin(true);
 511                 break;
 512
 513             default:
 514                 m_err = ParenthesesTypeInvalid;
 515             }
 516         } else
 517             m_delegate.atomParenthesesSubpatternBegin();
 518
 519         ++m_parenthesesNestingDepth;
 520     }
 521
 522     /*
 523      * parseParenthesesEnd():
 524      *
 525      * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
 526      */
 527     void parseParenthesesEnd()
 528     {
 529         ASSERT(!m_err);
 530         ASSERT(peek() == ')');
 531         consume();
 532
 533         if (m_parenthesesNestingDepth > 0)
 534             m_delegate.atomParenthesesEnd();
 535         else
 536             m_err = ParenthesesUnmatched;
 537
 538         --m_parenthesesNestingDepth;
 539     }
 540
 541     /*
 542      * parseQuantifier():
 543      *
 544      * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
 545      */
 546     void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
 547     {
 548         ASSERT(!m_err);
 549         ASSERT(min <= max);
 550
 551         if (min == UINT_MAX) {
 552             m_err = QuantifierTooLarge;
 553             return;
 554         }
 555
 556         if (lastTokenWasAnAtom)
 557             m_delegate.quantifyAtom(min, max, !tryConsume('?'));
 558         else
 559             m_err = QuantifierWithoutAtom;
 560     }
 561
 562     /*
 563      * parseTokens():
 564      *
 565      * This method loops over the input pattern reporting tokens to the delegate.
 566      * The method returns when a parse error is detected, or the end of the pattern
 567      * is reached.  One piece of state is tracked around the loop, which is whether
 568      * the last token passed to the delegate was an atom (this is necessary to detect
 569      * a parse error when a quantifier provided without an atom to quantify).
 570      */
 571     void parseTokens()
 572     {
 573         bool lastTokenWasAnAtom = false;
 574
 575         while (!atEndOfPattern()) {
 576             switch (peek()) {
 577             case '|':
 578                 consume();
 579                 m_delegate.disjunction();
 580                 lastTokenWasAnAtom = false;
 581                 break;
 582
 583             case '(':
 584                 parseParenthesesBegin();
 585                 lastTokenWasAnAtom = false;
 586                 break;
 587
 588             case ')':
 589                 parseParenthesesEnd();
 590                 lastTokenWasAnAtom = true;
 591                 break;
 592
 593             case '^':
 594                 consume();
 595                 m_delegate.assertionBOL();
 596                 lastTokenWasAnAtom = false;
 597                 break;
 598
 599             case '$':
 600                 consume();
 601                 m_delegate.assertionEOL();
 602                 lastTokenWasAnAtom = false;
 603                 break;
 604
 605             case '.':
 606                 consume();
 607                 m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);
 608                 lastTokenWasAnAtom = true;
 609                 break;
 610
 611             case '[':
 612                 parseCharacterClass();
 613                 lastTokenWasAnAtom = true;
 614                 break;
 615
 616             case '\\':
 617                 lastTokenWasAnAtom = parseAtomEscape();
 618                 break;
 619
 620             case '*':
 621                 consume();
 622                 parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite);
 623                 lastTokenWasAnAtom = false;
 624                 break;
 625
 626             case '+':
 627                 consume();
 628                 parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite);
 629                 lastTokenWasAnAtom = false;
 630                 break;
 631
 632             case '?':
 633                 consume();
 634                 parseQuantifier(lastTokenWasAnAtom, 0, 1);
 635                 lastTokenWasAnAtom = false;
 636                 break;
 637
 638             case '{': {
 639                 ParseState state = saveState();
 640
 641                 consume();
 642                 if (peekIsDigit()) {
 643                     unsigned min = consumeNumber();
 644                     unsigned max = min;
 645
 646                     if (tryConsume(','))
 647                         max = peekIsDigit() ? consumeNumber() : quantifyInfinite;
 648
 649                     if (tryConsume('}')) {
 650                         if (min <= max)
 651                             parseQuantifier(lastTokenWasAnAtom, min, max);
 652                         else
 653                             m_err = QuantifierOutOfOrder;
 654                         lastTokenWasAnAtom = false;
 655                         break;
 656                     }
 657                 }
 658
 659                 restoreState(state);
 660             }
 661             // if we did not find a complete quantifer, fall through to the default case.
 662             FALLTHROUGH;
 663
 664             default:
 665                 m_delegate.atomPatternCharacter(consume());
 666                 lastTokenWasAnAtom = true;
 667             }
 668
 669             if (m_err)
 670                 return;
 671         }
 672
 673         if (m_parenthesesNestingDepth > 0)
 674             m_err = MissingParentheses;
 675     }
 676
 677     /*
 678      * parse():
 679      *
 680      * This method calls parseTokens() to parse over the input and converts any
 681      * error code to a const char* for a result.
 682      */
 683     const char* parse()
 684     {
 685         if (m_size > MAX_PATTERN_SIZE)
 686             m_err = PatternTooLarge;
 687         else
 688             parseTokens();
 689         ASSERT(atEndOfPattern() || m_err);
 690
 691         // The order of this array must match the ErrorCode enum.
 692         static const char* errorMessages[NumberOfErrorCodes] = {
 693             0, // NoError
 694             REGEXP_ERROR_PREFIX "regular expression too large",
 695             REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier",
 696             REGEXP_ERROR_PREFIX "nothing to repeat",
 697             REGEXP_ERROR_PREFIX "number too large in {} quantifier",
 698             REGEXP_ERROR_PREFIX "missing )",
 699             REGEXP_ERROR_PREFIX "unmatched parentheses",
 700             REGEXP_ERROR_PREFIX "unrecognized character after (?",
 701             REGEXP_ERROR_PREFIX "missing terminating ] for character class",
 702             REGEXP_ERROR_PREFIX "range out of order in character class",
 703             REGEXP_ERROR_PREFIX "\\ at end of pattern"
 704         };
 705
 706         return errorMessages[m_err];
 707     }
 708
 709     // Misc helper functions:
 710
 711     typedef unsigned ParseState;
 712
 713     ParseState saveState()
 714     {
 715         return m_index;
 716     }
 717
 718     void restoreState(ParseState state)
 719     {
 720         m_index = state;
 721     }
 722
 723     bool atEndOfPattern()
 724     {
 725         ASSERT(m_index <= m_size);
 726         return m_index == m_size;
 727     }
 728
 729     int peek()
 730     {
 731         ASSERT(m_index < m_size);
 732         return m_data[m_index];
 733     }
 734
 735     bool peekIsDigit()
 736     {
 737         return !atEndOfPattern() && WTF::isASCIIDigit(peek());
 738     }
 739
 740     unsigned peekDigit()
 741     {
 742         ASSERT(peekIsDigit());
 743         return peek() - '0';
 744     }
 745
 746     int consume()
 747     {
 748         ASSERT(m_index < m_size);
 749         return m_data[m_index++];
 750     }
 751
 752     unsigned consumeDigit()
 753     {
 754         ASSERT(peekIsDigit());
 755         return consume() - '0';
 756     }
 757
 758     unsigned consumeNumber()
 759     {
 760         unsigned n = consumeDigit();
 761         // check for overflow.
 762         for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) {
 763             n = newValue;
 764             consume();
 765         }
 766         return n;
 767     }
 768
 769     unsigned consumeOctal()
 770     {
 771         ASSERT(WTF::isASCIIOctalDigit(peek()));
 772
 773         unsigned n = consumeDigit();
 774         while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
 775             n = n * 8 + consumeDigit();
 776         return n;
 777     }
 778
 779     bool tryConsume(UChar ch)
 780     {
 781         if (atEndOfPattern() || (m_data[m_index] != ch))
 782             return false;
 783         ++m_index;
 784         return true;
 785     }
 786
 787     int tryConsumeHex(int count)
 788     {
 789         ParseState state = saveState();
 790
 791         int n = 0;
 792         while (count--) {
 793             if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
 794                 restoreState(state);
 795                 return -1;
 796             }
 797             n = (n << 4) | WTF::toASCIIHexValue(consume());
 798         }
 799         return n;
 800     }
 801
 802     Delegate& m_delegate;
 803     unsigned m_backReferenceLimit;
 804     ErrorCode m_err;
 805     const CharType* m_data;
 806     unsigned m_size;
 807     unsigned m_index;
 808     unsigned m_parenthesesNestingDepth;
 809
 810     // Derived by empirical testing of compile time in PCRE and WREC.
 811     static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
 812 };
 813
 814 /*
 815  * Yarr::parse():
 816  *
 817  * The parse method is passed a pattern to be parsed and a delegate upon which
 818  * callbacks will be made to record the parsed tokens forming the regex.
 819  * Yarr::parse() returns null on success, or a const C string providing an error
 820  * message where a parse error occurs.
 821  *
 822  * The Delegate must implement the following interface:
 823  *
 824  *    void assertionBOL();
 825  *    void assertionEOL();
 826  *    void assertionWordBoundary(bool invert);
 827  *
 828  *    void atomPatternCharacter(UChar ch);
 829  *    void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
 830  *    void atomCharacterClassBegin(bool invert)
 831  *    void atomCharacterClassAtom(UChar ch)
 832  *    void atomCharacterClassRange(UChar begin, UChar end)
 833  *    void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
 834  *    void atomCharacterClassEnd()
 835  *    void atomParenthesesSubpatternBegin(bool capture = true);
 836  *    void atomParentheticalAssertionBegin(bool invert = false);
 837  *    void atomParenthesesEnd();
 838  *    void atomBackReference(unsigned subpatternId);
 839  *
 840  *    void quantifyAtom(unsigned min, unsigned max, bool greedy);
 841  *
 842  *    void disjunction();
 843  *
 844  * The regular expression is described by a sequence of assertion*() and atom*()
 845  * callbacks to the delegate, describing the terms in the regular expression.
 846  * Following an atom a quantifyAtom() call may occur to indicate that the previous
 847  * atom should be quantified.  In the case of atoms described across multiple
 848  * calls (parentheses and character classes) the call to quantifyAtom() will come
 849  * after the call to the atom*End() method, never after atom*Begin().
 850  *
 851  * Character classes may either be described by a single call to
 852  * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
 853  * In the latter case, ...Begin() will be called, followed by a sequence of
 854  * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
 855  *
 856  * Sequences of atoms and assertions are broken into alternatives via calls to
 857  * disjunction().  Assertions, atoms, and disjunctions emitted between calls to
 858  * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
 859  * atomParenthesesBegin() is passed a subpatternId.  In the case of a regular
 860  * capturing subpattern, this will be the subpatternId associated with these
 861  * parentheses, and will also by definition be the lowest subpatternId of these
 862  * parentheses and of any nested paretheses.  The atomParenthesesEnd() method
 863  * is passed the subpatternId of the last capturing subexpression nested within
 864  * these paretheses.  In the case of a capturing subpattern with no nested
 865  * capturing subpatterns, the same subpatternId will be passed to the begin and
 866  * end functions.  In the case of non-capturing subpatterns the subpatternId
 867  * passed to the begin method is also the first possible subpatternId that might
 868  * be nested within these paretheses.  If a set of non-capturing parentheses does
 869  * not contain any capturing subpatterns, then the subpatternId passed to begin
 870  * will be greater than the subpatternId passed to end.
 871  */
 872
 873 template<class Delegate>
 874 const char* parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite)
 875 {
 876     if (pattern.is8Bit())
 877         return Parser<Delegate, LChar>(delegate, pattern, backReferenceLimit).parse();
 878     return Parser<Delegate, UChar>(delegate, pattern, backReferenceLimit).parse();
 879 }
 880
 881 } } // namespace JSC::Yarr
 882
 883 #endif // YarrParser_h