yarr/RegexParser.h

   1 /*
   2  * Copyright (C) 2009 Apple Inc. All rights reserved.
   3  *
   4  * Redistribution and use in source and binary forms, with or without
   5  * modification, are permitted provided that the following conditions
   6  * are met:
   7  * 1. Redistributions of source code must retain the above copyright
   8  *    notice, this list of conditions and the following disclaimer.
   9  * 2. Redistributions in binary form must reproduce the above copyright
  10  *    notice, this list of conditions and the following disclaimer in the
  11  *    documentation and/or other materials provided with the distribution.
  12  *
  13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
  14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
  17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  24  */
  25
  26 #ifndef RegexParser_h
  27 #define RegexParser_h
  28
  29 #if ENABLE(YARR)
  30
  31 #include <UString.h>
  32 #include <limits.h>
  33 #include <wtf/ASCIICType.h>
  34 #include <wtf/unicode/Unicode.h>
  35
  36 namespace JSC { namespace Yarr {
  37
  38 enum BuiltInCharacterClassID {
  39     DigitClassID,
  40     SpaceClassID,
  41     WordClassID,
  42     NewlineClassID,
  43 };
  44
  45 // The Parser class should not be used directly - only via the Yarr::parse() method.
  46 template<class Delegate>
  47 class Parser {
  48 private:
  49     template<class FriendDelegate>
  50     friend const char* parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit);
  51
  52     enum ErrorCode {
  53         NoError,
  54         PatternTooLarge,
  55         QuantifierOutOfOrder,
  56         QuantifierWithoutAtom,
  57         MissingParentheses,
  58         ParenthesesUnmatched,
  59         ParenthesesTypeInvalid,
  60         CharacterClassUnmatched,
  61         CharacterClassOutOfOrder,
  62         EscapeUnterminated,
  63         NumberOfErrorCodes
  64     };
  65
  66     /*
  67      * CharacterClassParserDelegate:
  68      *
  69      * The class CharacterClassParserDelegate is used in the parsing of character
  70      * classes.  This class handles detection of character ranges.  This class
  71      * implements enough of the delegate interface such that it can be passed to
  72      * parseEscape() as an EscapeDelegate.  This allows parseEscape() to be reused
  73      * to perform the parsing of escape characters in character sets.
  74      */
  75     class CharacterClassParserDelegate {
  76     public:
  77         CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
  78             : m_delegate(delegate)
  79             , m_err(err)
  80             , m_state(empty)
  81         {
  82         }
  83
  84         /*
  85          * begin():
  86          *
  87          * Called at beginning of construction.
  88          */
  89         void begin(bool invert)
  90         {
  91             m_delegate.atomCharacterClassBegin(invert);
  92         }
  93
  94         /*
  95          * atomPatternCharacterUnescaped():
  96          *
  97          * This method is called directly from parseCharacterClass(), to report a new
  98          * pattern character token.  This method differs from atomPatternCharacter(),
  99          * which will be called from parseEscape(), since a hypen provided via this
 100          * method may be indicating a character range, but a hyphen parsed by
 101          * parseEscape() cannot be interpreted as doing so.
 102          */
 103         void atomPatternCharacterUnescaped(UChar ch)
 104         {
 105             switch (m_state) {
 106             case empty:
 107                 m_character = ch;
 108                 m_state = cachedCharacter;
 109                 break;
 110
 111             case cachedCharacter:
 112                 if (ch == '-')
 113                     m_state = cachedCharacterHyphen;
 114                 else {
 115                     m_delegate.atomCharacterClassAtom(m_character);
 116                     m_character = ch;
 117                 }
 118                 break;
 119
 120             case cachedCharacterHyphen:
 121                 if (ch >= m_character)
 122                     m_delegate.atomCharacterClassRange(m_character, ch);
 123                 else
 124                     m_err = CharacterClassOutOfOrder;
 125                 m_state = empty;
 126             }
 127         }
 128
 129         /*
 130          * atomPatternCharacter():
 131          *
 132          * Adds a pattern character, called by parseEscape(), as such will not
 133          * interpret a hyphen as indicating a character range.
 134          */
 135         void atomPatternCharacter(UChar ch)
 136         {
 137             // Flush if a character is already pending to prevent the
 138             // hyphen from begin interpreted as indicating a range.
 139             if((ch == '-') && (m_state == cachedCharacter))
 140                 flush();
 141
 142             atomPatternCharacterUnescaped(ch);
 143         }
 144
 145         /*
 146          * atomBuiltInCharacterClass():
 147          *
 148          * Adds a built-in character class, called by parseEscape().
 149          */
 150         void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
 151         {
 152             flush();
 153             m_delegate.atomCharacterClassBuiltIn(classID, invert);
 154         }
 155
 156         /*
 157          * end():
 158          *
 159          * Called at end of construction.
 160          */
 161         void end()
 162         {
 163             flush();
 164             m_delegate.atomCharacterClassEnd();
 165         }
 166
 167         // parseEscape() should never call these delegate methods when
 168         // invoked with inCharacterClass set.
 169         void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); }
 170         void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }
 171
 172     private:
 173         void flush()
 174         {
 175             if (m_state != empty) // either cachedCharacter or cachedCharacterHyphen
 176                 m_delegate.atomCharacterClassAtom(m_character);
 177             if (m_state == cachedCharacterHyphen)
 178                 m_delegate.atomCharacterClassAtom('-');
 179             m_state = empty;
 180         }
 181
 182         Delegate& m_delegate;
 183         ErrorCode& m_err;
 184         enum CharacterClassConstructionState {
 185             empty,
 186             cachedCharacter,
 187             cachedCharacterHyphen,
 188         } m_state;
 189         UChar m_character;
 190     };
 191
 192     Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit)
 193         : m_delegate(delegate)
 194         , m_backReferenceLimit(backReferenceLimit)
 195         , m_err(NoError)
 196         , m_data(pattern.data())
 197         , m_size(pattern.size())
 198         , m_index(0)
 199         , m_parenthesesNestingDepth(0)
 200     {
 201     }
 202
 203     /*
 204      * parseEscape():
 205      *
 206      * Helper for parseTokens() AND parseCharacterClass().
 207      * Unlike the other parser methods, this function does not report tokens
 208      * directly to the member delegate (m_delegate), instead tokens are
 209      * emitted to the delegate provided as an argument.  In the case of atom
 210      * escapes, parseTokens() will call parseEscape() passing m_delegate as
 211      * an argument, and as such the escape will be reported to the delegate.
 212      *
 213      * However this method may also be used by parseCharacterClass(), in which
 214      * case a CharacterClassParserDelegate will be passed as the delegate that
 215      * tokens should be added to.  A boolean flag is also provided to indicate
 216      * whether that an escape in a CharacterClass is being parsed (some parsing
 217      * rules change in this context).
 218      *
 219      * The boolean value returned by this method indicates whether the token
 220      * parsed was an atom (outside of a characted class \b and \B will be
 221      * interpreted as assertions).
 222      */
 223     template<bool inCharacterClass, class EscapeDelegate>
 224     bool parseEscape(EscapeDelegate& delegate)
 225     {
 226         ASSERT(!m_err);
 227         ASSERT(peek() == '\\');
 228         consume();
 229
 230         if (atEndOfPattern()) {
 231             m_err = EscapeUnterminated;
 232             return false;
 233         }
 234
 235         switch (peek()) {
 236         // Assertions
 237         case 'b':
 238             consume();
 239             if (inCharacterClass)
 240                 delegate.atomPatternCharacter('\b');
 241             else {
 242                 delegate.assertionWordBoundary(false);
 243                 return false;
 244             }
 245             break;
 246         case 'B':
 247             consume();
 248             if (inCharacterClass)
 249                 delegate.atomPatternCharacter('B');
 250             else {
 251                 delegate.assertionWordBoundary(true);
 252                 return false;
 253             }
 254             break;
 255
 256         // CharacterClassEscape
 257         case 'd':
 258             consume();
 259             delegate.atomBuiltInCharacterClass(DigitClassID, false);
 260             break;
 261         case 's':
 262             consume();
 263             delegate.atomBuiltInCharacterClass(SpaceClassID, false);
 264             break;
 265         case 'w':
 266             consume();
 267             delegate.atomBuiltInCharacterClass(WordClassID, false);
 268             break;
 269         case 'D':
 270             consume();
 271             delegate.atomBuiltInCharacterClass(DigitClassID, true);
 272             break;
 273         case 'S':
 274             consume();
 275             delegate.atomBuiltInCharacterClass(SpaceClassID, true);
 276             break;
 277         case 'W':
 278             consume();
 279             delegate.atomBuiltInCharacterClass(WordClassID, true);
 280             break;
 281
 282         // DecimalEscape
 283         case '1':
 284         case '2':
 285         case '3':
 286         case '4':
 287         case '5':
 288         case '6':
 289         case '7':
 290         case '8':
 291         case '9': {
 292             // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
 293             // First, try to parse this as backreference.
 294             if (!inCharacterClass) {
 295                 ParseState state = saveState();
 296
 297                 unsigned backReference = consumeNumber();
 298                 if (backReference <= m_backReferenceLimit) {
 299                     delegate.atomBackReference(backReference);
 300                     break;
 301                 }
 302
 303                 restoreState(state);
 304             }
 305
 306             // Not a backreference, and not octal.
 307             if (peek() >= '8') {
 308                 delegate.atomPatternCharacter('\\');
 309                 break;
 310             }
 311
 312             // Fall-through to handle this as an octal escape.
 313         }
 314
 315         // Octal escape
 316         case '0':
 317             delegate.atomPatternCharacter(consumeOctal());
 318             break;
 319
 320         // ControlEscape
 321         case 'f':
 322             consume();
 323             delegate.atomPatternCharacter('\f');
 324             break;
 325         case 'n':
 326             consume();
 327             delegate.atomPatternCharacter('\n');
 328             break;
 329         case 'r':
 330             consume();
 331             delegate.atomPatternCharacter('\r');
 332             break;
 333         case 't':
 334             consume();
 335             delegate.atomPatternCharacter('\t');
 336             break;
 337         case 'v':
 338             consume();
 339             delegate.atomPatternCharacter('\v');
 340             break;
 341
 342         // ControlLetter
 343         case 'c': {
 344             ParseState state = saveState();
 345             consume();
 346             if (!atEndOfPattern()) {
 347                 int control = consume();
 348
 349                 // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
 350                 if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
 351                     delegate.atomPatternCharacter(control & 0x1f);
 352                     break;
 353                 }
 354             }
 355             restoreState(state);
 356             delegate.atomPatternCharacter('\\');
 357             break;
 358         }
 359
 360         // HexEscape
 361         case 'x': {
 362             consume();
 363             int x = tryConsumeHex(2);
 364             if (x == -1)
 365                 delegate.atomPatternCharacter('x');
 366             else
 367                 delegate.atomPatternCharacter(x);
 368             break;
 369         }
 370
 371         // UnicodeEscape
 372         case 'u': {
 373             consume();
 374             int u = tryConsumeHex(4);
 375             if (u == -1)
 376                 delegate.atomPatternCharacter('u');
 377             else
 378                 delegate.atomPatternCharacter(u);
 379             break;
 380         }
 381
 382         // IdentityEscape
 383         default:
 384             delegate.atomPatternCharacter(consume());
 385         }
 386
 387         return true;
 388     }
 389
 390     /*
 391      * parseAtomEscape(), parseCharacterClassEscape():
 392      *
 393      * These methods alias to parseEscape().
 394      */
 395     bool parseAtomEscape()
 396     {
 397         return parseEscape<false>(m_delegate);
 398     }
 399     void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
 400     {
 401         parseEscape<true>(delegate);
 402     }
 403
 404     /*
 405      * parseCharacterClass():
 406      *
 407      * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
 408      * to an instance of CharacterClassParserDelegate, to describe the character class to the
 409      * delegate.
 410      */
 411     void parseCharacterClass()
 412     {
 413         ASSERT(!m_err);
 414         ASSERT(peek() == '[');
 415         consume();
 416
 417         CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);
 418
 419         characterClassConstructor.begin(tryConsume('^'));
 420
 421         while (!atEndOfPattern()) {
 422             switch (peek()) {
 423             case ']':
 424                 consume();
 425                 characterClassConstructor.end();
 426                 return;
 427
 428             case '\\':
 429                 parseCharacterClassEscape(characterClassConstructor);
 430                 break;
 431
 432             default:
 433                 characterClassConstructor.atomPatternCharacterUnescaped(consume());
 434             }
 435
 436             if (m_err)
 437                 return;
 438         }
 439
 440         m_err = CharacterClassUnmatched;
 441     }
 442
 443     /*
 444      * parseParenthesesBegin():
 445      *
 446      * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
 447      */
 448     void parseParenthesesBegin()
 449     {
 450         ASSERT(!m_err);
 451         ASSERT(peek() == '(');
 452         consume();
 453
 454         if (tryConsume('?')) {
 455             if (atEndOfPattern()) {
 456                 m_err = ParenthesesTypeInvalid;
 457                 return;
 458             }
 459
 460             switch (consume()) {
 461             case ':':
 462                 m_delegate.atomParenthesesSubpatternBegin(false);
 463                 break;
 464
 465             case '=':
 466                 m_delegate.atomParentheticalAssertionBegin();
 467                 break;
 468
 469             case '!':
 470                 m_delegate.atomParentheticalAssertionBegin(true);
 471                 break;
 472
 473             default:
 474                 m_err = ParenthesesTypeInvalid;
 475             }
 476         } else
 477             m_delegate.atomParenthesesSubpatternBegin();
 478
 479         ++m_parenthesesNestingDepth;
 480     }
 481
 482     /*
 483      * parseParenthesesEnd():
 484      *
 485      * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
 486      */
 487     void parseParenthesesEnd()
 488     {
 489         ASSERT(!m_err);
 490         ASSERT(peek() == ')');
 491         consume();
 492
 493         if (m_parenthesesNestingDepth > 0)
 494             m_delegate.atomParenthesesEnd();
 495         else
 496             m_err = ParenthesesUnmatched;
 497
 498         --m_parenthesesNestingDepth;
 499     }
 500
 501     /*
 502      * parseQuantifier():
 503      *
 504      * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
 505      */
 506     void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
 507     {
 508         ASSERT(!m_err);
 509         ASSERT(min <= max);
 510
 511         if (lastTokenWasAnAtom)
 512             m_delegate.quantifyAtom(min, max, !tryConsume('?'));
 513         else
 514             m_err = QuantifierWithoutAtom;
 515     }
 516
 517     /*
 518      * parseTokens():
 519      *
 520      * This method loops over the input pattern reporting tokens to the delegate.
 521      * The method returns when a parse error is detected, or the end of the pattern
 522      * is reached.  One piece of state is tracked around the loop, which is whether
 523      * the last token passed to the delegate was an atom (this is necessary to detect
 524      * a parse error when a quantifier provided without an atom to quantify).
 525      */
 526     void parseTokens()
 527     {
 528         bool lastTokenWasAnAtom = false;
 529
 530         while (!atEndOfPattern()) {
 531             switch (peek()) {
 532             case '|':
 533                 consume();
 534                 m_delegate.disjunction();
 535                 lastTokenWasAnAtom = false;
 536                 break;
 537
 538             case '(':
 539                 parseParenthesesBegin();
 540                 lastTokenWasAnAtom = false;
 541                 break;
 542
 543             case ')':
 544                 parseParenthesesEnd();
 545                 lastTokenWasAnAtom = true;
 546                 break;
 547
 548             case '^':
 549                 consume();
 550                 m_delegate.assertionBOL();
 551                 lastTokenWasAnAtom = false;
 552                 break;
 553
 554             case '$':
 555                 consume();
 556                 m_delegate.assertionEOL();
 557                 lastTokenWasAnAtom = false;
 558                 break;
 559
 560             case '.':
 561                 consume();
 562                 m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);
 563                 lastTokenWasAnAtom = true;
 564                 break;
 565
 566             case '[':
 567                 parseCharacterClass();
 568                 lastTokenWasAnAtom = true;
 569                 break;
 570
 571             case '\\':
 572                 lastTokenWasAnAtom = parseAtomEscape();
 573                 break;
 574
 575             case '*':
 576                 consume();
 577                 parseQuantifier(lastTokenWasAnAtom, 0, UINT_MAX);
 578                 lastTokenWasAnAtom = false;
 579                 break;
 580
 581             case '+':
 582                 consume();
 583                 parseQuantifier(lastTokenWasAnAtom, 1, UINT_MAX);
 584                 lastTokenWasAnAtom = false;
 585                 break;
 586
 587             case '?':
 588                 consume();
 589                 parseQuantifier(lastTokenWasAnAtom, 0, 1);
 590                 lastTokenWasAnAtom = false;
 591                 break;
 592
 593             case '{': {
 594                 ParseState state = saveState();
 595
 596                 consume();
 597                 if (peekIsDigit()) {
 598                     unsigned min = consumeNumber();
 599                     unsigned max = min;
 600
 601                     if (tryConsume(','))
 602                         max = peekIsDigit() ? consumeNumber() : UINT_MAX;
 603
 604                     if (tryConsume('}')) {
 605                         if (min <= max)
 606                             parseQuantifier(lastTokenWasAnAtom, min, max);
 607                         else
 608                             m_err = QuantifierOutOfOrder;
 609                         lastTokenWasAnAtom = false;
 610                         break;
 611                     }
 612                 }
 613
 614                 restoreState(state);
 615             } // if we did not find a complete quantifer, fall through to the default case.
 616
 617             default:
 618                 m_delegate.atomPatternCharacter(consume());
 619                 lastTokenWasAnAtom = true;
 620             }
 621
 622             if (m_err)
 623                 return;
 624         }
 625
 626         if (m_parenthesesNestingDepth > 0)
 627             m_err = MissingParentheses;
 628     }
 629
 630     /*
 631      * parse():
 632      *
 633      * This method calls regexBegin(), calls parseTokens() to parse over the input
 634      * patterns, calls regexEnd() or regexError() as appropriate, and converts any
 635      * error code to a const char* for a result.
 636      */
 637     const char* parse()
 638     {
 639         m_delegate.regexBegin();
 640
 641         if (m_size > MAX_PATTERN_SIZE)
 642             m_err = PatternTooLarge;
 643         else
 644             parseTokens();
 645         ASSERT(atEndOfPattern() || m_err);
 646
 647         if (m_err)
 648             m_delegate.regexError();
 649         else
 650             m_delegate.regexEnd();
 651
 652         // The order of this array must match the ErrorCode enum.
 653         static const char* errorMessages[NumberOfErrorCodes] = {
 654             0, // NoError
 655             "regular expression too large",
 656             "numbers out of order in {} quantifier",
 657             "nothing to repeat",
 658             "missing )",
 659             "unmatched parentheses",
 660             "unrecognized character after (?",
 661             "missing terminating ] for character class",
 662             "range out of order in character class",
 663             "\\ at end of pattern"
 664         };
 665
 666         return errorMessages[m_err];
 667     }
 668
 669
 670     // Misc helper functions:
 671
 672     typedef unsigned ParseState;
 673
 674     ParseState saveState()
 675     {
 676         return m_index;
 677     }
 678
 679     void restoreState(ParseState state)
 680     {
 681         m_index = state;
 682     }
 683
 684     bool atEndOfPattern()
 685     {
 686         ASSERT(m_index <= m_size);
 687         return m_index == m_size;
 688     }
 689
 690     int peek()
 691     {
 692         ASSERT(m_index < m_size);
 693         return m_data[m_index];
 694     }
 695
 696     bool peekIsDigit()
 697     {
 698         return !atEndOfPattern() && WTF::isASCIIDigit(peek());
 699     }
 700
 701     unsigned peekDigit()
 702     {
 703         ASSERT(peekIsDigit());
 704         return peek() - '0';
 705     }
 706
 707     int consume()
 708     {
 709         ASSERT(m_index < m_size);
 710         return m_data[m_index++];
 711     }
 712
 713     unsigned consumeDigit()
 714     {
 715         ASSERT(peekIsDigit());
 716         return consume() - '0';
 717     }
 718
 719     unsigned consumeNumber()
 720     {
 721         unsigned n = consumeDigit();
 722         // check for overflow.
 723         for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) {
 724             n = newValue;
 725             consume();
 726         }
 727         return n;
 728     }
 729
 730     unsigned consumeOctal()
 731     {
 732         ASSERT(WTF::isASCIIOctalDigit(peek()));
 733
 734         unsigned n = consumeDigit();
 735         while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
 736             n = n * 8 + consumeDigit();
 737         return n;
 738     }
 739
 740     bool tryConsume(UChar ch)
 741     {
 742         if (atEndOfPattern() || (m_data[m_index] != ch))
 743             return false;
 744         ++m_index;
 745         return true;
 746     }
 747
 748     int tryConsumeHex(int count)
 749     {
 750         ParseState state = saveState();
 751
 752         int n = 0;
 753         while (count--) {
 754             if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
 755                 restoreState(state);
 756                 return -1;
 757             }
 758             n = (n << 4) | WTF::toASCIIHexValue(consume());
 759         }
 760         return n;
 761     }
 762
 763     Delegate& m_delegate;
 764     unsigned m_backReferenceLimit;
 765     ErrorCode m_err;
 766     const UChar* m_data;
 767     unsigned m_size;
 768     unsigned m_index;
 769     unsigned m_parenthesesNestingDepth;
 770
 771     // Derived by empirical testing of compile time in PCRE and WREC.
 772     static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
 773 };
 774
/*
 * Yarr::parse():
 *
 * The parse method is passed a pattern to be parsed and a delegate upon which
 * callbacks will be made to record the parsed tokens forming the regex.
 * Yarr::parse() returns null on success, or a const C string providing an error
 * message where a parse error occurs.
 *
 * The Delegate must implement the following interface:
 *
 *    void assertionBOL();
 *    void assertionEOL();
 *    void assertionWordBoundary(bool invert);
 *
 *    void atomPatternCharacter(UChar ch);
 *    void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
 *    void atomCharacterClassBegin(bool invert);
 *    void atomCharacterClassAtom(UChar ch);
 *    void atomCharacterClassRange(UChar begin, UChar end);
 *    void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert);
 *    void atomCharacterClassEnd();
 *    void atomParenthesesSubpatternBegin(bool capture = true);
 *    void atomParentheticalAssertionBegin(bool invert = false);
 *    void atomParenthesesEnd();
 *    void atomBackReference(unsigned subpatternId);
 *
 *    void quantifyAtom(unsigned min, unsigned max, bool greedy);
 *
 *    void disjunction();
 *
 *    void regexBegin();
 *    void regexEnd();
 *    void regexError();
 *
 * Before any calls recording tokens are made, regexBegin() will be called on
 * the delegate once.  Once parsing is complete either regexEnd() or
 * regexError() will be called, as appropriate.
 *
 * The regular expression is described by a sequence of assertion*() and atom*()
 * callbacks to the delegate, describing the terms in the regular expression.
 * Following an atom a quantifyAtom() call may occur to indicate that the previous
 * atom should be quantified.  In the case of atoms described across multiple
 * calls (parentheses and character classes) the call to quantifyAtom() will come
 * after the call to the atom*End() method, never after atom*Begin().
 *
 * Character classes may either be described by a single call to
 * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
 * In the latter case, ...Begin() will be called, followed by a sequence of
 * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
 *
 * Sequences of atoms and assertions are broken into alternatives via calls to
 * disjunction().  Assertions, atoms, and disjunctions emitted between calls to
 * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
 * atomParenthesesBegin() is passed a subpatternId.  In the case of a regular
 * capturing subpattern, this will be the subpatternId associated with these
 * parentheses, and will also by definition be the lowest subpatternId of these
 * parentheses and of any nested parentheses.  The atomParenthesesEnd() method
 * is passed the subpatternId of the last capturing subexpression nested within
 * these parentheses.  In the case of a capturing subpattern with no nested
 * capturing subpatterns, the same subpatternId will be passed to the begin and
 * end functions.  In the case of non-capturing subpatterns the subpatternId
 * passed to the begin method is also the first possible subpatternId that might
 * be nested within these parentheses.  If a set of non-capturing parentheses does
 * not contain any capturing subpatterns, then the subpatternId passed to begin
 * will be greater than the subpatternId passed to end.
 *
 * NOTE(review): the subpatternId description above does not match the
 * interface listed earlier in this comment, in which the parentheses begin
 * methods take only a bool and atomParenthesesEnd() takes no arguments; it
 * appears to describe an earlier form of the interface - confirm.
 */

 841
 842 template<class Delegate>
 843 const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = UINT_MAX)
 844 {
 845     return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse();
 846 }
 847
 848 } } // namespace JSC::Yarr
 849
 850 #endif
 851
 852 #endif // RegexParser_h