]> git.saurik.com Git - apple/javascriptcore.git/blame - yarr/YarrParser.h
JavaScriptCore-903.5.tar.gz
[apple/javascriptcore.git] / yarr / YarrParser.h
CommitLineData
ba379fdc
A
1/*
2 * Copyright (C) 2009 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
14957cd0
A
26#ifndef YarrParser_h
27#define YarrParser_h
ba379fdc 28
14957cd0
A
29#include <runtime/UString.h>
30#include "Yarr.h"
ba379fdc
A
31#include <wtf/ASCIICType.h>
32#include <wtf/unicode/Unicode.h>
ba379fdc
A
33
34namespace JSC { namespace Yarr {
35
14957cd0
A
36#define REGEXP_ERROR_PREFIX "Invalid regular expression: "
37
ba379fdc
A
// Identifiers for the predefined character classes that the parser can
// report to a delegate. Each report is paired with an 'invert' flag.
enum BuiltInCharacterClassID {
    DigitClassID,   // reported for \d (and, inverted, for \D)
    SpaceClassID,   // reported for \s (and, inverted, for \S)
    WordClassID,    // reported for \w (and, inverted, for \W)
    NewlineClassID  // reported, inverted, for the '.' atom
};
44
45// The Parser class should not be used directly - only via the Yarr::parse() method.
46template<class Delegate>
47class Parser {
48private:
49 template<class FriendDelegate>
50 friend const char* parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit);
51
// Parse failure reasons. The numeric values index the errorMessages
// table built in parse(), so the order here must match that table.
enum ErrorCode {
    NoError,                   // parsing succeeded (null message)
    PatternTooLarge,           // pattern length exceeds MAX_PATTERN_SIZE
    QuantifierOutOfOrder,      // {min,max} with min > max
    QuantifierWithoutAtom,     // quantifier with nothing to repeat
    MissingParentheses,        // pattern ended with a group still open
    ParenthesesUnmatched,      // ')' with no open group
    ParenthesesTypeInvalid,    // unrecognized character after "(?"
    CharacterClassUnmatched,   // pattern ended before the closing ']'
    CharacterClassOutOfOrder,  // range whose end is below its start
    EscapeUnterminated,        // '\' as the last character of the pattern
    NumberOfErrorCodes         // count of the codes above; not an error
};
65
66 /*
67 * CharacterClassParserDelegate:
68 *
69 * The class CharacterClassParserDelegate is used in the parsing of character
70 * classes. This class handles detection of character ranges. This class
71 * implements enough of the delegate interface such that it can be passed to
72 * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused
73 * to perform the parsing of escape characters in character sets.
74 */
75 class CharacterClassParserDelegate {
76 public:
77 CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
78 : m_delegate(delegate)
79 , m_err(err)
14957cd0
A
80 , m_state(Empty)
81 , m_character(0)
ba379fdc
A
82 {
83 }
84
85 /*
86 * begin():
87 *
88 * Called at beginning of construction.
89 */
90 void begin(bool invert)
91 {
92 m_delegate.atomCharacterClassBegin(invert);
93 }
94
95 /*
14957cd0 96 * atomPatternCharacter():
ba379fdc 97 *
14957cd0
A
98 * This method is called either from parseCharacterClass() (for an unescaped
99 * character in a character class), or from parseEscape(). In the former case
100 * the value true will be passed for the argument 'hyphenIsRange', and in this
101 * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/
102 * is different to /[a\-z]/).
ba379fdc 103 */
14957cd0 104 void atomPatternCharacter(UChar ch, bool hyphenIsRange = false)
ba379fdc
A
105 {
106 switch (m_state) {
14957cd0
A
107 case AfterCharacterClass:
108 // Following a builtin character class we need look out for a hyphen.
109 // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
110 // If we see a hyphen following a charater class then unlike usual
111 // we'll report it to the delegate immediately, and put ourself into
112 // a poisoned state. Any following calls to add another character or
113 // character class will result in an error. (A hypen following a
114 // character-class is itself valid, but only at the end of a regex).
115 if (hyphenIsRange && ch == '-') {
116 m_delegate.atomCharacterClassAtom('-');
117 m_state = AfterCharacterClassHyphen;
118 return;
119 }
120 // Otherwise just fall through - cached character so treat this as Empty.
121
122 case Empty:
ba379fdc 123 m_character = ch;
14957cd0
A
124 m_state = CachedCharacter;
125 return;
ba379fdc 126
14957cd0
A
127 case CachedCharacter:
128 if (hyphenIsRange && ch == '-')
129 m_state = CachedCharacterHyphen;
ba379fdc
A
130 else {
131 m_delegate.atomCharacterClassAtom(m_character);
132 m_character = ch;
133 }
14957cd0 134 return;
ba379fdc 135
14957cd0
A
136 case CachedCharacterHyphen:
137 if (ch < m_character) {
ba379fdc 138 m_err = CharacterClassOutOfOrder;
14957cd0
A
139 return;
140 }
141 m_delegate.atomCharacterClassRange(m_character, ch);
142 m_state = Empty;
143 return;
ba379fdc 144
14957cd0
A
145 // See coment in atomBuiltInCharacterClass below.
146 // This too is technically an error, per ECMA-262, and again we
147 // we chose to allow this. Note a subtlely here that while we
148 // diverge from the spec's definition of CharacterRange we do
149 // remain in compliance with the grammar. For example, consider
150 // the expression /[\d-a-z]/. We comply with the grammar in
151 // this case by not allowing a-z to be matched as a range.
152 case AfterCharacterClassHyphen:
153 m_delegate.atomCharacterClassAtom(ch);
154 m_state = Empty;
155 return;
156 }
ba379fdc
A
157 }
158
159 /*
160 * atomBuiltInCharacterClass():
161 *
162 * Adds a built-in character class, called by parseEscape().
163 */
164 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
165 {
14957cd0
A
166 switch (m_state) {
167 case CachedCharacter:
168 // Flush the currently cached character, then fall through.
169 m_delegate.atomCharacterClassAtom(m_character);
170
171 case Empty:
172 case AfterCharacterClass:
173 m_state = AfterCharacterClass;
174 m_delegate.atomCharacterClassBuiltIn(classID, invert);
175 return;
176
177 // If we hit either of these cases, we have an invalid range that
178 // looks something like /[x-\d]/ or /[\d-\d]/.
179 // According to ECMA-262 this should be a syntax error, but
180 // empirical testing shows this to break teh webz. Instead we
181 // comply with to the ECMA-262 grammar, and assume the grammar to
182 // have matched the range correctly, but tweak our interpretation
183 // of CharacterRange. Effectively we implicitly handle the hyphen
184 // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/.
185 case CachedCharacterHyphen:
186 m_delegate.atomCharacterClassAtom(m_character);
187 m_delegate.atomCharacterClassAtom('-');
188 // fall through
189 case AfterCharacterClassHyphen:
190 m_delegate.atomCharacterClassBuiltIn(classID, invert);
191 m_state = Empty;
192 return;
193 }
ba379fdc
A
194 }
195
196 /*
197 * end():
198 *
199 * Called at end of construction.
200 */
201 void end()
202 {
14957cd0
A
203 if (m_state == CachedCharacter)
204 m_delegate.atomCharacterClassAtom(m_character);
205 else if (m_state == CachedCharacterHyphen) {
206 m_delegate.atomCharacterClassAtom(m_character);
207 m_delegate.atomCharacterClassAtom('-');
208 }
ba379fdc
A
209 m_delegate.atomCharacterClassEnd();
210 }
211
212 // parseEscape() should never call these delegate methods when
213 // invoked with inCharacterClass set.
214 void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); }
215 void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }
216
217 private:
ba379fdc
A
218 Delegate& m_delegate;
219 ErrorCode& m_err;
220 enum CharacterClassConstructionState {
14957cd0
A
221 Empty,
222 CachedCharacter,
223 CachedCharacterHyphen,
224 AfterCharacterClass,
225 AfterCharacterClassHyphen,
ba379fdc
A
226 } m_state;
227 UChar m_character;
228 };
229
230 Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit)
231 : m_delegate(delegate)
232 , m_backReferenceLimit(backReferenceLimit)
233 , m_err(NoError)
14957cd0
A
234 , m_data(pattern.characters())
235 , m_size(pattern.length())
ba379fdc
A
236 , m_index(0)
237 , m_parenthesesNestingDepth(0)
238 {
239 }
240
241 /*
242 * parseEscape():
243 *
244 * Helper for parseTokens() AND parseCharacterClass().
245 * Unlike the other parser methods, this function does not report tokens
246 * directly to the member delegate (m_delegate), instead tokens are
247 * emitted to the delegate provided as an argument. In the case of atom
248 * escapes, parseTokens() will call parseEscape() passing m_delegate as
249 * an argument, and as such the escape will be reported to the delegate.
250 *
251 * However this method may also be used by parseCharacterClass(), in which
252 * case a CharacterClassParserDelegate will be passed as the delegate that
253 * tokens should be added to. A boolean flag is also provided to indicate
254 * whether that an escape in a CharacterClass is being parsed (some parsing
255 * rules change in this context).
256 *
257 * The boolean value returned by this method indicates whether the token
258 * parsed was an atom (outside of a characted class \b and \B will be
259 * interpreted as assertions).
260 */
261 template<bool inCharacterClass, class EscapeDelegate>
262 bool parseEscape(EscapeDelegate& delegate)
263 {
264 ASSERT(!m_err);
265 ASSERT(peek() == '\\');
266 consume();
267
268 if (atEndOfPattern()) {
269 m_err = EscapeUnterminated;
270 return false;
271 }
272
273 switch (peek()) {
274 // Assertions
275 case 'b':
276 consume();
277 if (inCharacterClass)
278 delegate.atomPatternCharacter('\b');
279 else {
280 delegate.assertionWordBoundary(false);
281 return false;
282 }
283 break;
284 case 'B':
285 consume();
286 if (inCharacterClass)
287 delegate.atomPatternCharacter('B');
288 else {
289 delegate.assertionWordBoundary(true);
290 return false;
291 }
292 break;
293
294 // CharacterClassEscape
295 case 'd':
296 consume();
297 delegate.atomBuiltInCharacterClass(DigitClassID, false);
298 break;
299 case 's':
300 consume();
301 delegate.atomBuiltInCharacterClass(SpaceClassID, false);
302 break;
303 case 'w':
304 consume();
305 delegate.atomBuiltInCharacterClass(WordClassID, false);
306 break;
307 case 'D':
308 consume();
309 delegate.atomBuiltInCharacterClass(DigitClassID, true);
310 break;
311 case 'S':
312 consume();
313 delegate.atomBuiltInCharacterClass(SpaceClassID, true);
314 break;
315 case 'W':
316 consume();
317 delegate.atomBuiltInCharacterClass(WordClassID, true);
318 break;
319
320 // DecimalEscape
321 case '1':
322 case '2':
323 case '3':
324 case '4':
325 case '5':
326 case '6':
327 case '7':
328 case '8':
329 case '9': {
330 // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
331 // First, try to parse this as backreference.
332 if (!inCharacterClass) {
333 ParseState state = saveState();
334
335 unsigned backReference = consumeNumber();
336 if (backReference <= m_backReferenceLimit) {
337 delegate.atomBackReference(backReference);
338 break;
339 }
340
341 restoreState(state);
342 }
343
344 // Not a backreference, and not octal.
345 if (peek() >= '8') {
346 delegate.atomPatternCharacter('\\');
347 break;
348 }
349
350 // Fall-through to handle this as an octal escape.
351 }
352
353 // Octal escape
354 case '0':
355 delegate.atomPatternCharacter(consumeOctal());
356 break;
357
358 // ControlEscape
359 case 'f':
360 consume();
361 delegate.atomPatternCharacter('\f');
362 break;
363 case 'n':
364 consume();
365 delegate.atomPatternCharacter('\n');
366 break;
367 case 'r':
368 consume();
369 delegate.atomPatternCharacter('\r');
370 break;
371 case 't':
372 consume();
373 delegate.atomPatternCharacter('\t');
374 break;
375 case 'v':
376 consume();
377 delegate.atomPatternCharacter('\v');
378 break;
379
380 // ControlLetter
381 case 'c': {
382 ParseState state = saveState();
383 consume();
384 if (!atEndOfPattern()) {
385 int control = consume();
386
387 // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
388 if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
389 delegate.atomPatternCharacter(control & 0x1f);
390 break;
391 }
392 }
393 restoreState(state);
394 delegate.atomPatternCharacter('\\');
395 break;
396 }
397
398 // HexEscape
399 case 'x': {
400 consume();
401 int x = tryConsumeHex(2);
402 if (x == -1)
403 delegate.atomPatternCharacter('x');
404 else
405 delegate.atomPatternCharacter(x);
406 break;
407 }
408
409 // UnicodeEscape
410 case 'u': {
411 consume();
412 int u = tryConsumeHex(4);
413 if (u == -1)
414 delegate.atomPatternCharacter('u');
415 else
416 delegate.atomPatternCharacter(u);
417 break;
418 }
419
420 // IdentityEscape
421 default:
422 delegate.atomPatternCharacter(consume());
423 }
424
425 return true;
426 }
427
428 /*
429 * parseAtomEscape(), parseCharacterClassEscape():
430 *
431 * These methods alias to parseEscape().
432 */
433 bool parseAtomEscape()
434 {
435 return parseEscape<false>(m_delegate);
436 }
437 void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
438 {
439 parseEscape<true>(delegate);
440 }
441
442 /*
443 * parseCharacterClass():
444 *
445 * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
446 * to an instance of CharacterClassParserDelegate, to describe the character class to the
447 * delegate.
448 */
449 void parseCharacterClass()
450 {
451 ASSERT(!m_err);
452 ASSERT(peek() == '[');
453 consume();
454
455 CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);
456
457 characterClassConstructor.begin(tryConsume('^'));
458
459 while (!atEndOfPattern()) {
460 switch (peek()) {
461 case ']':
462 consume();
463 characterClassConstructor.end();
464 return;
465
466 case '\\':
467 parseCharacterClassEscape(characterClassConstructor);
468 break;
469
470 default:
14957cd0 471 characterClassConstructor.atomPatternCharacter(consume(), true);
ba379fdc
A
472 }
473
474 if (m_err)
475 return;
476 }
477
478 m_err = CharacterClassUnmatched;
479 }
480
481 /*
482 * parseParenthesesBegin():
483 *
484 * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
485 */
486 void parseParenthesesBegin()
487 {
488 ASSERT(!m_err);
489 ASSERT(peek() == '(');
490 consume();
491
492 if (tryConsume('?')) {
493 if (atEndOfPattern()) {
494 m_err = ParenthesesTypeInvalid;
495 return;
496 }
497
498 switch (consume()) {
499 case ':':
500 m_delegate.atomParenthesesSubpatternBegin(false);
501 break;
502
503 case '=':
504 m_delegate.atomParentheticalAssertionBegin();
505 break;
506
507 case '!':
508 m_delegate.atomParentheticalAssertionBegin(true);
509 break;
510
511 default:
512 m_err = ParenthesesTypeInvalid;
513 }
514 } else
515 m_delegate.atomParenthesesSubpatternBegin();
516
517 ++m_parenthesesNestingDepth;
518 }
519
520 /*
521 * parseParenthesesEnd():
522 *
523 * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
524 */
525 void parseParenthesesEnd()
526 {
527 ASSERT(!m_err);
528 ASSERT(peek() == ')');
529 consume();
530
531 if (m_parenthesesNestingDepth > 0)
532 m_delegate.atomParenthesesEnd();
533 else
534 m_err = ParenthesesUnmatched;
535
536 --m_parenthesesNestingDepth;
537 }
538
539 /*
540 * parseQuantifier():
541 *
542 * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
543 */
544 void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
545 {
546 ASSERT(!m_err);
547 ASSERT(min <= max);
548
549 if (lastTokenWasAnAtom)
550 m_delegate.quantifyAtom(min, max, !tryConsume('?'));
551 else
552 m_err = QuantifierWithoutAtom;
553 }
554
555 /*
556 * parseTokens():
557 *
558 * This method loops over the input pattern reporting tokens to the delegate.
559 * The method returns when a parse error is detected, or the end of the pattern
560 * is reached. One piece of state is tracked around the loop, which is whether
561 * the last token passed to the delegate was an atom (this is necessary to detect
562 * a parse error when a quantifier provided without an atom to quantify).
563 */
564 void parseTokens()
565 {
566 bool lastTokenWasAnAtom = false;
567
568 while (!atEndOfPattern()) {
569 switch (peek()) {
570 case '|':
571 consume();
572 m_delegate.disjunction();
573 lastTokenWasAnAtom = false;
574 break;
575
576 case '(':
577 parseParenthesesBegin();
578 lastTokenWasAnAtom = false;
579 break;
580
581 case ')':
582 parseParenthesesEnd();
583 lastTokenWasAnAtom = true;
584 break;
585
586 case '^':
587 consume();
588 m_delegate.assertionBOL();
589 lastTokenWasAnAtom = false;
590 break;
591
592 case '$':
593 consume();
594 m_delegate.assertionEOL();
595 lastTokenWasAnAtom = false;
596 break;
597
598 case '.':
599 consume();
600 m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);
601 lastTokenWasAnAtom = true;
602 break;
603
604 case '[':
605 parseCharacterClass();
606 lastTokenWasAnAtom = true;
607 break;
608
609 case '\\':
610 lastTokenWasAnAtom = parseAtomEscape();
611 break;
612
613 case '*':
614 consume();
14957cd0 615 parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite);
ba379fdc
A
616 lastTokenWasAnAtom = false;
617 break;
618
619 case '+':
620 consume();
14957cd0 621 parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite);
ba379fdc
A
622 lastTokenWasAnAtom = false;
623 break;
624
625 case '?':
626 consume();
627 parseQuantifier(lastTokenWasAnAtom, 0, 1);
628 lastTokenWasAnAtom = false;
629 break;
630
631 case '{': {
632 ParseState state = saveState();
633
634 consume();
635 if (peekIsDigit()) {
636 unsigned min = consumeNumber();
637 unsigned max = min;
638
639 if (tryConsume(','))
14957cd0 640 max = peekIsDigit() ? consumeNumber() : quantifyInfinite;
ba379fdc
A
641
642 if (tryConsume('}')) {
643 if (min <= max)
644 parseQuantifier(lastTokenWasAnAtom, min, max);
645 else
646 m_err = QuantifierOutOfOrder;
647 lastTokenWasAnAtom = false;
648 break;
649 }
650 }
651
652 restoreState(state);
653 } // if we did not find a complete quantifer, fall through to the default case.
654
655 default:
656 m_delegate.atomPatternCharacter(consume());
657 lastTokenWasAnAtom = true;
658 }
659
660 if (m_err)
661 return;
662 }
663
664 if (m_parenthesesNestingDepth > 0)
665 m_err = MissingParentheses;
666 }
667
668 /*
669 * parse():
670 *
14957cd0 671 * This method calls parseTokens() to parse over the input and converts any
ba379fdc
A
672 * error code to a const char* for a result.
673 */
674 const char* parse()
675 {
ba379fdc
A
676 if (m_size > MAX_PATTERN_SIZE)
677 m_err = PatternTooLarge;
678 else
679 parseTokens();
680 ASSERT(atEndOfPattern() || m_err);
681
ba379fdc
A
682 // The order of this array must match the ErrorCode enum.
683 static const char* errorMessages[NumberOfErrorCodes] = {
684 0, // NoError
14957cd0
A
685 REGEXP_ERROR_PREFIX "regular expression too large",
686 REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier",
687 REGEXP_ERROR_PREFIX "nothing to repeat",
688 REGEXP_ERROR_PREFIX "missing )",
689 REGEXP_ERROR_PREFIX "unmatched parentheses",
690 REGEXP_ERROR_PREFIX "unrecognized character after (?",
691 REGEXP_ERROR_PREFIX "missing terminating ] for character class",
692 REGEXP_ERROR_PREFIX "range out of order in character class",
693 REGEXP_ERROR_PREFIX "\\ at end of pattern"
ba379fdc
A
694 };
695
696 return errorMessages[m_err];
697 }
698
699
700 // Misc helper functions:
701
702 typedef unsigned ParseState;
703
704 ParseState saveState()
705 {
706 return m_index;
707 }
708
709 void restoreState(ParseState state)
710 {
711 m_index = state;
712 }
713
714 bool atEndOfPattern()
715 {
716 ASSERT(m_index <= m_size);
717 return m_index == m_size;
718 }
719
720 int peek()
721 {
722 ASSERT(m_index < m_size);
723 return m_data[m_index];
724 }
725
726 bool peekIsDigit()
727 {
728 return !atEndOfPattern() && WTF::isASCIIDigit(peek());
729 }
730
731 unsigned peekDigit()
732 {
733 ASSERT(peekIsDigit());
734 return peek() - '0';
735 }
736
737 int consume()
738 {
739 ASSERT(m_index < m_size);
740 return m_data[m_index++];
741 }
742
743 unsigned consumeDigit()
744 {
745 ASSERT(peekIsDigit());
746 return consume() - '0';
747 }
748
749 unsigned consumeNumber()
750 {
751 unsigned n = consumeDigit();
752 // check for overflow.
753 for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) {
754 n = newValue;
755 consume();
756 }
757 return n;
758 }
759
760 unsigned consumeOctal()
761 {
762 ASSERT(WTF::isASCIIOctalDigit(peek()));
763
764 unsigned n = consumeDigit();
765 while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
766 n = n * 8 + consumeDigit();
767 return n;
768 }
769
770 bool tryConsume(UChar ch)
771 {
772 if (atEndOfPattern() || (m_data[m_index] != ch))
773 return false;
774 ++m_index;
775 return true;
776 }
777
778 int tryConsumeHex(int count)
779 {
780 ParseState state = saveState();
781
782 int n = 0;
783 while (count--) {
784 if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
785 restoreState(state);
786 return -1;
787 }
788 n = (n << 4) | WTF::toASCIIHexValue(consume());
789 }
790 return n;
791 }
792
793 Delegate& m_delegate;
794 unsigned m_backReferenceLimit;
795 ErrorCode m_err;
796 const UChar* m_data;
797 unsigned m_size;
798 unsigned m_index;
799 unsigned m_parenthesesNestingDepth;
800
801 // Derived by empirical testing of compile time in PCRE and WREC.
802 static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
803};
804
805/*
806 * Yarr::parse():
807 *
808 * The parse method is passed a pattern to be parsed and a delegate upon which
809 * callbacks will be made to record the parsed tokens forming the regex.
810 * Yarr::parse() returns null on success, or a const C string providing an error
811 * message where a parse error occurs.
812 *
813 * The Delegate must implement the following interface:
814 *
815 * void assertionBOL();
816 * void assertionEOL();
817 * void assertionWordBoundary(bool invert);
818 *
819 * void atomPatternCharacter(UChar ch);
820 * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
821 * void atomCharacterClassBegin(bool invert)
822 * void atomCharacterClassAtom(UChar ch)
823 * void atomCharacterClassRange(UChar begin, UChar end)
824 * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
825 * void atomCharacterClassEnd()
826 * void atomParenthesesSubpatternBegin(bool capture = true);
827 * void atomParentheticalAssertionBegin(bool invert = false);
828 * void atomParenthesesEnd();
829 * void atomBackReference(unsigned subpatternId);
830 *
831 * void quantifyAtom(unsigned min, unsigned max, bool greedy);
832 *
833 * void disjunction();
834 *
ba379fdc
A
835 * The regular expression is described by a sequence of assertion*() and atom*()
836 * callbacks to the delegate, describing the terms in the regular expression.
837 * Following an atom a quantifyAtom() call may occur to indicate that the previous
838 * atom should be quantified. In the case of atoms described across multiple
839 * calls (parentheses and character classes) the call to quantifyAtom() will come
840 * after the call to the atom*End() method, never after atom*Begin().
841 *
842 * Character classes may either be described by a single call to
843 * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
844 * In the latter case, ...Begin() will be called, followed by a sequence of
845 * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
846 *
847 * Sequences of atoms and assertions are broken into alternatives via calls to
848 * disjunction(). Assertions, atoms, and disjunctions emitted between calls to
849 * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
850 * atomParenthesesBegin() is passed a subpatternId. In the case of a regular
851 * capturing subpattern, this will be the subpatternId associated with these
852 * parentheses, and will also by definition be the lowest subpatternId of these
853 * parentheses and of any nested parentheses. The atomParenthesesEnd() method
854 * is passed the subpatternId of the last capturing subexpression nested within
855 * these parentheses. In the case of a capturing subpattern with no nested
856 * capturing subpatterns, the same subpatternId will be passed to the begin and
857 * end functions. In the case of non-capturing subpatterns the subpatternId
858 * passed to the begin method is also the first possible subpatternId that might
859 * be nested within these parentheses. If a set of non-capturing parentheses does
860 * not contain any capturing subpatterns, then the subpatternId passed to begin
861 * will be greater than the subpatternId passed to end.
862 */
863
864template<class Delegate>
14957cd0 865const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = quantifyInfinite)
ba379fdc
A
866{
867 return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse();
868}
869
870} } // namespace JSC::Yarr
871
14957cd0 872#endif // YarrParser_h