yarr/RegexParser.h

   1 /*
   2  * Copyright (C) 2009 Apple Inc. All rights reserved.
   3  *
   4  * Redistribution and use in source and binary forms, with or without
   5  * modification, are permitted provided that the following conditions
   6  * are met:
   7  * 1. Redistributions of source code must retain the above copyright
   8  *    notice, this list of conditions and the following disclaimer.
   9  * 2. Redistributions in binary form must reproduce the above copyright
  10  *    notice, this list of conditions and the following disclaimer in the
  11  *    documentation and/or other materials provided with the distribution.
  12  *
  13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
  14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
  17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  24  */
  25
  26 #ifndef RegexParser_h
  27 #define RegexParser_h
  28
  29 #include <wtf/Platform.h>
  30
  31 #if ENABLE(YARR)
  32
  33 #include <UString.h>
  34 #include <wtf/ASCIICType.h>
  35 #include <wtf/unicode/Unicode.h>
  36 #include <limits.h>
  37
  38 namespace JSC { namespace Yarr {
  39
  40 enum BuiltInCharacterClassID {
  41     DigitClassID,
  42     SpaceClassID,
  43     WordClassID,
  44     NewlineClassID,
  45 };
  46
  47 // The Parser class should not be used directly - only via the Yarr::parse() method.
  48 template<class Delegate>
  49 class Parser {
  50 private:
  51     template<class FriendDelegate>
  52     friend const char* parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit);
  53
  54     enum ErrorCode {
  55         NoError,
  56         PatternTooLarge,
  57         QuantifierOutOfOrder,
  58         QuantifierWithoutAtom,
  59         MissingParentheses,
  60         ParenthesesUnmatched,
  61         ParenthesesTypeInvalid,
  62         CharacterClassUnmatched,
  63         CharacterClassOutOfOrder,
  64         EscapeUnterminated,
  65         NumberOfErrorCodes
  66     };
  67
  68     /*
  69      * CharacterClassParserDelegate:
  70      *
  71      * The class CharacterClassParserDelegate is used in the parsing of character
  72      * classes.  This class handles detection of character ranges.  This class
  73      * implements enough of the delegate interface such that it can be passed to
  74      * parseEscape() as an EscapeDelegate.  This allows parseEscape() to be reused
  75      * to perform the parsing of escape characters in character sets.
  76      */
  77     class CharacterClassParserDelegate {
  78     public:
  79         CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
  80             : m_delegate(delegate)
  81             , m_err(err)
  82             , m_state(empty)
  83         {
  84         }
  85
  86         /*
  87          * begin():
  88          *
  89          * Called at beginning of construction.
  90          */
  91         void begin(bool invert)
  92         {
  93             m_delegate.atomCharacterClassBegin(invert);
  94         }
  95
  96         /*
  97          * atomPatternCharacterUnescaped():
  98          *
  99          * This method is called directly from parseCharacterClass(), to report a new
 100          * pattern character token.  This method differs from atomPatternCharacter(),
 101          * which will be called from parseEscape(), since a hypen provided via this
 102          * method may be indicating a character range, but a hyphen parsed by
 103          * parseEscape() cannot be interpreted as doing so.
 104          */
 105         void atomPatternCharacterUnescaped(UChar ch)
 106         {
 107             switch (m_state) {
 108             case empty:
 109                 m_character = ch;
 110                 m_state = cachedCharacter;
 111                 break;
 112
 113             case cachedCharacter:
 114                 if (ch == '-')
 115                     m_state = cachedCharacterHyphen;
 116                 else {
 117                     m_delegate.atomCharacterClassAtom(m_character);
 118                     m_character = ch;
 119                 }
 120                 break;
 121
 122             case cachedCharacterHyphen:
 123                 if (ch >= m_character)
 124                     m_delegate.atomCharacterClassRange(m_character, ch);
 125                 else
 126                     m_err = CharacterClassOutOfOrder;
 127                 m_state = empty;
 128             }
 129         }
 130
 131         /*
 132          * atomPatternCharacter():
 133          *
 134          * Adds a pattern character, called by parseEscape(), as such will not
 135          * interpret a hyphen as indicating a character range.
 136          */
 137         void atomPatternCharacter(UChar ch)
 138         {
 139             // Flush if a character is already pending to prevent the
 140             // hyphen from begin interpreted as indicating a range.
 141             if((ch == '-') && (m_state == cachedCharacter))
 142                 flush();
 143
 144             atomPatternCharacterUnescaped(ch);
 145         }
 146
 147         /*
 148          * atomBuiltInCharacterClass():
 149          *
 150          * Adds a built-in character class, called by parseEscape().
 151          */
 152         void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
 153         {
 154             flush();
 155             m_delegate.atomCharacterClassBuiltIn(classID, invert);
 156         }
 157
 158         /*
 159          * end():
 160          *
 161          * Called at end of construction.
 162          */
 163         void end()
 164         {
 165             flush();
 166             m_delegate.atomCharacterClassEnd();
 167         }
 168
 169         // parseEscape() should never call these delegate methods when
 170         // invoked with inCharacterClass set.
 171         void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); }
 172         void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }
 173
 174     private:
 175         void flush()
 176         {
 177             if (m_state != empty) // either cachedCharacter or cachedCharacterHyphen
 178                 m_delegate.atomCharacterClassAtom(m_character);
 179             if (m_state == cachedCharacterHyphen)
 180                 m_delegate.atomCharacterClassAtom('-');
 181             m_state = empty;
 182         }
 183
 184         Delegate& m_delegate;
 185         ErrorCode& m_err;
 186         enum CharacterClassConstructionState {
 187             empty,
 188             cachedCharacter,
 189             cachedCharacterHyphen,
 190         } m_state;
 191         UChar m_character;
 192     };
 193
 194     Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit)
 195         : m_delegate(delegate)
 196         , m_backReferenceLimit(backReferenceLimit)
 197         , m_err(NoError)
 198         , m_data(pattern.data())
 199         , m_size(pattern.size())
 200         , m_index(0)
 201         , m_parenthesesNestingDepth(0)
 202     {
 203     }
 204
 205     /*
 206      * parseEscape():
 207      *
 208      * Helper for parseTokens() AND parseCharacterClass().
 209      * Unlike the other parser methods, this function does not report tokens
 210      * directly to the member delegate (m_delegate), instead tokens are
 211      * emitted to the delegate provided as an argument.  In the case of atom
 212      * escapes, parseTokens() will call parseEscape() passing m_delegate as
 213      * an argument, and as such the escape will be reported to the delegate.
 214      *
 215      * However this method may also be used by parseCharacterClass(), in which
 216      * case a CharacterClassParserDelegate will be passed as the delegate that
 217      * tokens should be added to.  A boolean flag is also provided to indicate
 218      * whether that an escape in a CharacterClass is being parsed (some parsing
 219      * rules change in this context).
 220      *
 221      * The boolean value returned by this method indicates whether the token
 222      * parsed was an atom (outside of a characted class \b and \B will be
 223      * interpreted as assertions).
 224      */
 225     template<bool inCharacterClass, class EscapeDelegate>
 226     bool parseEscape(EscapeDelegate& delegate)
 227     {
 228         ASSERT(!m_err);
 229         ASSERT(peek() == '\\');
 230         consume();
 231
 232         if (atEndOfPattern()) {
 233             m_err = EscapeUnterminated;
 234             return false;
 235         }
 236
 237         switch (peek()) {
 238         // Assertions
 239         case 'b':
 240             consume();
 241             if (inCharacterClass)
 242                 delegate.atomPatternCharacter('\b');
 243             else {
 244                 delegate.assertionWordBoundary(false);
 245                 return false;
 246             }
 247             break;
 248         case 'B':
 249             consume();
 250             if (inCharacterClass)
 251                 delegate.atomPatternCharacter('B');
 252             else {
 253                 delegate.assertionWordBoundary(true);
 254                 return false;
 255             }
 256             break;
 257
 258         // CharacterClassEscape
 259         case 'd':
 260             consume();
 261             delegate.atomBuiltInCharacterClass(DigitClassID, false);
 262             break;
 263         case 's':
 264             consume();
 265             delegate.atomBuiltInCharacterClass(SpaceClassID, false);
 266             break;
 267         case 'w':
 268             consume();
 269             delegate.atomBuiltInCharacterClass(WordClassID, false);
 270             break;
 271         case 'D':
 272             consume();
 273             delegate.atomBuiltInCharacterClass(DigitClassID, true);
 274             break;
 275         case 'S':
 276             consume();
 277             delegate.atomBuiltInCharacterClass(SpaceClassID, true);
 278             break;
 279         case 'W':
 280             consume();
 281             delegate.atomBuiltInCharacterClass(WordClassID, true);
 282             break;
 283
 284         // DecimalEscape
 285         case '1':
 286         case '2':
 287         case '3':
 288         case '4':
 289         case '5':
 290         case '6':
 291         case '7':
 292         case '8':
 293         case '9': {
 294             // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
 295             // First, try to parse this as backreference.
 296             if (!inCharacterClass) {
 297                 ParseState state = saveState();
 298
 299                 unsigned backReference = consumeNumber();
 300                 if (backReference <= m_backReferenceLimit) {
 301                     delegate.atomBackReference(backReference);
 302                     break;
 303                 }
 304
 305                 restoreState(state);
 306             }
 307
 308             // Not a backreference, and not octal.
 309             if (peek() >= '8') {
 310                 delegate.atomPatternCharacter('\\');
 311                 break;
 312             }
 313
 314             // Fall-through to handle this as an octal escape.
 315         }
 316
 317         // Octal escape
 318         case '0':
 319             delegate.atomPatternCharacter(consumeOctal());
 320             break;
 321
 322         // ControlEscape
 323         case 'f':
 324             consume();
 325             delegate.atomPatternCharacter('\f');
 326             break;
 327         case 'n':
 328             consume();
 329             delegate.atomPatternCharacter('\n');
 330             break;
 331         case 'r':
 332             consume();
 333             delegate.atomPatternCharacter('\r');
 334             break;
 335         case 't':
 336             consume();
 337             delegate.atomPatternCharacter('\t');
 338             break;
 339         case 'v':
 340             consume();
 341             delegate.atomPatternCharacter('\v');
 342             break;
 343
 344         // ControlLetter
 345         case 'c': {
 346             ParseState state = saveState();
 347             consume();
 348             if (!atEndOfPattern()) {
 349                 int control = consume();
 350
 351                 // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
 352                 if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
 353                     delegate.atomPatternCharacter(control & 0x1f);
 354                     break;
 355                 }
 356             }
 357             restoreState(state);
 358             delegate.atomPatternCharacter('\\');
 359             break;
 360         }
 361
 362         // HexEscape
 363         case 'x': {
 364             consume();
 365             int x = tryConsumeHex(2);
 366             if (x == -1)
 367                 delegate.atomPatternCharacter('x');
 368             else
 369                 delegate.atomPatternCharacter(x);
 370             break;
 371         }
 372
 373         // UnicodeEscape
 374         case 'u': {
 375             consume();
 376             int u = tryConsumeHex(4);
 377             if (u == -1)
 378                 delegate.atomPatternCharacter('u');
 379             else
 380                 delegate.atomPatternCharacter(u);
 381             break;
 382         }
 383
 384         // IdentityEscape
 385         default:
 386             delegate.atomPatternCharacter(consume());
 387         }
 388
 389         return true;
 390     }
 391
 392     /*
 393      * parseAtomEscape(), parseCharacterClassEscape():
 394      *
 395      * These methods alias to parseEscape().
 396      */
 397     bool parseAtomEscape()
 398     {
 399         return parseEscape<false>(m_delegate);
 400     }
 401     void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
 402     {
 403         parseEscape<true>(delegate);
 404     }
 405
 406     /*
 407      * parseCharacterClass():
 408      *
 409      * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
 410      * to an instance of CharacterClassParserDelegate, to describe the character class to the
 411      * delegate.
 412      */
 413     void parseCharacterClass()
 414     {
 415         ASSERT(!m_err);
 416         ASSERT(peek() == '[');
 417         consume();
 418
 419         CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);
 420
 421         characterClassConstructor.begin(tryConsume('^'));
 422
 423         while (!atEndOfPattern()) {
 424             switch (peek()) {
 425             case ']':
 426                 consume();
 427                 characterClassConstructor.end();
 428                 return;
 429
 430             case '\\':
 431                 parseCharacterClassEscape(characterClassConstructor);
 432                 break;
 433
 434             default:
 435                 characterClassConstructor.atomPatternCharacterUnescaped(consume());
 436             }
 437
 438             if (m_err)
 439                 return;
 440         }
 441
 442         m_err = CharacterClassUnmatched;
 443     }
 444
 445     /*
 446      * parseParenthesesBegin():
 447      *
 448      * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
 449      */
 450     void parseParenthesesBegin()
 451     {
 452         ASSERT(!m_err);
 453         ASSERT(peek() == '(');
 454         consume();
 455
 456         if (tryConsume('?')) {
 457             if (atEndOfPattern()) {
 458                 m_err = ParenthesesTypeInvalid;
 459                 return;
 460             }
 461
 462             switch (consume()) {
 463             case ':':
 464                 m_delegate.atomParenthesesSubpatternBegin(false);
 465                 break;
 466
 467             case '=':
 468                 m_delegate.atomParentheticalAssertionBegin();
 469                 break;
 470
 471             case '!':
 472                 m_delegate.atomParentheticalAssertionBegin(true);
 473                 break;
 474
 475             default:
 476                 m_err = ParenthesesTypeInvalid;
 477             }
 478         } else
 479             m_delegate.atomParenthesesSubpatternBegin();
 480
 481         ++m_parenthesesNestingDepth;
 482     }
 483
 484     /*
 485      * parseParenthesesEnd():
 486      *
 487      * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
 488      */
 489     void parseParenthesesEnd()
 490     {
 491         ASSERT(!m_err);
 492         ASSERT(peek() == ')');
 493         consume();
 494
 495         if (m_parenthesesNestingDepth > 0)
 496             m_delegate.atomParenthesesEnd();
 497         else
 498             m_err = ParenthesesUnmatched;
 499
 500         --m_parenthesesNestingDepth;
 501     }
 502
 503     /*
 504      * parseQuantifier():
 505      *
 506      * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
 507      */
 508     void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
 509     {
 510         ASSERT(!m_err);
 511         ASSERT(min <= max);
 512
 513         if (lastTokenWasAnAtom)
 514             m_delegate.quantifyAtom(min, max, !tryConsume('?'));
 515         else
 516             m_err = QuantifierWithoutAtom;
 517     }
 518
 519     /*
 520      * parseTokens():
 521      *
 522      * This method loops over the input pattern reporting tokens to the delegate.
 523      * The method returns when a parse error is detected, or the end of the pattern
 524      * is reached.  One piece of state is tracked around the loop, which is whether
 525      * the last token passed to the delegate was an atom (this is necessary to detect
 526      * a parse error when a quantifier provided without an atom to quantify).
 527      */
 528     void parseTokens()
 529     {
 530         bool lastTokenWasAnAtom = false;
 531
 532         while (!atEndOfPattern()) {
 533             switch (peek()) {
 534             case '|':
 535                 consume();
 536                 m_delegate.disjunction();
 537                 lastTokenWasAnAtom = false;
 538                 break;
 539
 540             case '(':
 541                 parseParenthesesBegin();
 542                 lastTokenWasAnAtom = false;
 543                 break;
 544
 545             case ')':
 546                 parseParenthesesEnd();
 547                 lastTokenWasAnAtom = true;
 548                 break;
 549
 550             case '^':
 551                 consume();
 552                 m_delegate.assertionBOL();
 553                 lastTokenWasAnAtom = false;
 554                 break;
 555
 556             case '$':
 557                 consume();
 558                 m_delegate.assertionEOL();
 559                 lastTokenWasAnAtom = false;
 560                 break;
 561
 562             case '.':
 563                 consume();
 564                 m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);
 565                 lastTokenWasAnAtom = true;
 566                 break;
 567
 568             case '[':
 569                 parseCharacterClass();
 570                 lastTokenWasAnAtom = true;
 571                 break;
 572
 573             case '\\':
 574                 lastTokenWasAnAtom = parseAtomEscape();
 575                 break;
 576
 577             case '*':
 578                 consume();
 579                 parseQuantifier(lastTokenWasAnAtom, 0, UINT_MAX);
 580                 lastTokenWasAnAtom = false;
 581                 break;
 582
 583             case '+':
 584                 consume();
 585                 parseQuantifier(lastTokenWasAnAtom, 1, UINT_MAX);
 586                 lastTokenWasAnAtom = false;
 587                 break;
 588
 589             case '?':
 590                 consume();
 591                 parseQuantifier(lastTokenWasAnAtom, 0, 1);
 592                 lastTokenWasAnAtom = false;
 593                 break;
 594
 595             case '{': {
 596                 ParseState state = saveState();
 597
 598                 consume();
 599                 if (peekIsDigit()) {
 600                     unsigned min = consumeNumber();
 601                     unsigned max = min;
 602
 603                     if (tryConsume(','))
 604                         max = peekIsDigit() ? consumeNumber() : UINT_MAX;
 605
 606                     if (tryConsume('}')) {
 607                         if (min <= max)
 608                             parseQuantifier(lastTokenWasAnAtom, min, max);
 609                         else
 610                             m_err = QuantifierOutOfOrder;
 611                         lastTokenWasAnAtom = false;
 612                         break;
 613                     }
 614                 }
 615
 616                 restoreState(state);
 617             } // if we did not find a complete quantifer, fall through to the default case.
 618
 619             default:
 620                 m_delegate.atomPatternCharacter(consume());
 621                 lastTokenWasAnAtom = true;
 622             }
 623
 624             if (m_err)
 625                 return;
 626         }
 627
 628         if (m_parenthesesNestingDepth > 0)
 629             m_err = MissingParentheses;
 630     }
 631
 632     /*
 633      * parse():
 634      *
 635      * This method calls regexBegin(), calls parseTokens() to parse over the input
 636      * patterns, calls regexEnd() or regexError() as appropriate, and converts any
 637      * error code to a const char* for a result.
 638      */
 639     const char* parse()
 640     {
 641         m_delegate.regexBegin();
 642
 643         if (m_size > MAX_PATTERN_SIZE)
 644             m_err = PatternTooLarge;
 645         else
 646             parseTokens();
 647         ASSERT(atEndOfPattern() || m_err);
 648
 649         if (m_err)
 650             m_delegate.regexError();
 651         else
 652             m_delegate.regexEnd();
 653
 654         // The order of this array must match the ErrorCode enum.
 655         static const char* errorMessages[NumberOfErrorCodes] = {
 656             0, // NoError
 657             "regular expression too large",
 658             "numbers out of order in {} quantifier",
 659             "nothing to repeat",
 660             "missing )",
 661             "unmatched parentheses",
 662             "unrecognized character after (?",
 663             "missing terminating ] for character class",
 664             "range out of order in character class",
 665             "\\ at end of pattern"
 666         };
 667
 668         return errorMessages[m_err];
 669     }
 670
 671
 672     // Misc helper functions:
 673
 674     typedef unsigned ParseState;
 675
 676     ParseState saveState()
 677     {
 678         return m_index;
 679     }
 680
 681     void restoreState(ParseState state)
 682     {
 683         m_index = state;
 684     }
 685
 686     bool atEndOfPattern()
 687     {
 688         ASSERT(m_index <= m_size);
 689         return m_index == m_size;
 690     }
 691
 692     int peek()
 693     {
 694         ASSERT(m_index < m_size);
 695         return m_data[m_index];
 696     }
 697
 698     bool peekIsDigit()
 699     {
 700         return !atEndOfPattern() && WTF::isASCIIDigit(peek());
 701     }
 702
 703     unsigned peekDigit()
 704     {
 705         ASSERT(peekIsDigit());
 706         return peek() - '0';
 707     }
 708
 709     int consume()
 710     {
 711         ASSERT(m_index < m_size);
 712         return m_data[m_index++];
 713     }
 714
 715     unsigned consumeDigit()
 716     {
 717         ASSERT(peekIsDigit());
 718         return consume() - '0';
 719     }
 720
 721     unsigned consumeNumber()
 722     {
 723         unsigned n = consumeDigit();
 724         // check for overflow.
 725         for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) {
 726             n = newValue;
 727             consume();
 728         }
 729         return n;
 730     }
 731
 732     unsigned consumeOctal()
 733     {
 734         ASSERT(WTF::isASCIIOctalDigit(peek()));
 735
 736         unsigned n = consumeDigit();
 737         while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
 738             n = n * 8 + consumeDigit();
 739         return n;
 740     }
 741
 742     bool tryConsume(UChar ch)
 743     {
 744         if (atEndOfPattern() || (m_data[m_index] != ch))
 745             return false;
 746         ++m_index;
 747         return true;
 748     }
 749
 750     int tryConsumeHex(int count)
 751     {
 752         ParseState state = saveState();
 753
 754         int n = 0;
 755         while (count--) {
 756             if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
 757                 restoreState(state);
 758                 return -1;
 759             }
 760             n = (n << 4) | WTF::toASCIIHexValue(consume());
 761         }
 762         return n;
 763     }
 764
    // Receiver of the token callbacks made while parsing.
    Delegate& m_delegate;
    // Largest number that parseEscape() will report as a backreference.
    unsigned m_backReferenceLimit;
    // Current error state; NoError until a parse error is recorded.
    ErrorCode m_err;
    // The pattern's characters (pattern.data()) and their count (pattern.size()).
    const UChar* m_data;
    unsigned m_size;
    // Offset into m_data of the next character to be read.
    unsigned m_index;
    // Incremented by parseParenthesesBegin(), decremented by parseParenthesesEnd().
    unsigned m_parenthesesNestingDepth;

    // Derived by empirical testing of compile time in PCRE and WREC.
    static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
 775 };
 776
 777 /*
 778  * Yarr::parse():
 779  *
 780  * The parse method is passed a pattern to be parsed and a delegate upon which
 781  * callbacks will be made to record the parsed tokens forming the regex.
 782  * Yarr::parse() returns null on success, or a const C string providing an error
 783  * message where a parse error occurs.
 784  *
 785  * The Delegate must implement the following interface:
 786  *
 787  *    void assertionBOL();
 788  *    void assertionEOL();
 789  *    void assertionWordBoundary(bool invert);
 790  *
 791  *    void atomPatternCharacter(UChar ch);
 792  *    void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
 793  *    void atomCharacterClassBegin(bool invert)
 794  *    void atomCharacterClassAtom(UChar ch)
 795  *    void atomCharacterClassRange(UChar begin, UChar end)
 796  *    void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
 797  *    void atomCharacterClassEnd()
 798  *    void atomParenthesesSubpatternBegin(bool capture = true);
 799  *    void atomParentheticalAssertionBegin(bool invert = false);
 800  *    void atomParenthesesEnd();
 801  *    void atomBackReference(unsigned subpatternId);
 802  *
 803  *    void quantifyAtom(unsigned min, unsigned max, bool greedy);
 804  *
 805  *    void disjunction();
 806  *
 807  *    void regexBegin();
 808  *    void regexEnd();
 809  *    void regexError();
 810  *
 811  * Before any call recording tokens are made, regexBegin() will be called on the
 812  * delegate once.  Once parsing is complete either regexEnd() or regexError() will
 813  * be called, as appropriate.
 814  *
 815  * The regular expression is described by a sequence of assertion*() and atom*()
 816  * callbacks to the delegate, describing the terms in the regular expression.
 817  * Following an atom a quantifyAtom() call may occur to indicate that the previous
 818  * atom should be quantified.  In the case of atoms described across multiple
 819  * calls (parentheses and character classes) the call to quantifyAtom() will come
 820  * after the call to the atom*End() method, never after atom*Begin().
 821  *
 822  * Character classes may either be described by a single call to
 823  * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
 824  * In the latter case, ...Begin() will be called, followed by a sequence of
 825  * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
 826  *
 * Sequences of atoms and assertions are broken into alternatives via calls to
 * disjunction().  Assertions, atoms, and disjunctions emitted between calls to
 * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
 * atomParenthesesBegin() is passed a subpatternId.  In the case of a regular
 * capturing subpattern, this will be the subpatternId associated with these
 * parentheses, and will also by definition be the lowest subpatternId of these
 * parentheses and of any nested parentheses.  The atomParenthesesEnd() method
 * is passed the subpatternId of the last capturing subexpression nested within
 * these parentheses.  In the case of a capturing subpattern with no nested
 * capturing subpatterns, the same subpatternId will be passed to the begin and
 * end functions.  In the case of non-capturing subpatterns the subpatternId
 * passed to the begin method is also the first possible subpatternId that might
 * be nested within these parentheses.  If a set of non-capturing parentheses does
 * not contain any capturing subpatterns, then the subpatternId passed to begin
 * will be greater than the subpatternId passed to end.
 *
 * NOTE(review): the interface listed above declares
 * atomParenthesesSubpatternBegin(bool capture), atomParentheticalAssertionBegin(bool invert)
 * and atomParenthesesEnd() with no subpatternId argument, so the subpatternId
 * description in this paragraph appears to predate that interface - confirm.
 842  */
 843
 844 template<class Delegate>
 845 const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = UINT_MAX)
 846 {
 847     return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse();
 848 }
 849
 850 } } // namespace JSC::Yarr
 851
 852 #endif
 853
 854 #endif // RegexParser_h