]> git.saurik.com Git - apple/javascriptcore.git/blame - yarr/YarrParser.h
JavaScriptCore-1218.34.tar.gz
[apple/javascriptcore.git] / yarr / YarrParser.h
CommitLineData
ba379fdc
A
1/*
2 * Copyright (C) 2009 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
14957cd0
A
26#ifndef YarrParser_h
27#define YarrParser_h
ba379fdc 28
14957cd0 29#include "Yarr.h"
ba379fdc 30#include <wtf/ASCIICType.h>
93a37866 31#include <wtf/text/WTFString.h>
ba379fdc 32#include <wtf/unicode/Unicode.h>
ba379fdc
A
33
34namespace JSC { namespace Yarr {
35
14957cd0
A
36#define REGEXP_ERROR_PREFIX "Invalid regular expression: "
37
ba379fdc
A
38enum BuiltInCharacterClassID {
39 DigitClassID,
40 SpaceClassID,
41 WordClassID,
42 NewlineClassID,
43};
44
45// The Parser class should not be used directly - only via the Yarr::parse() method.
6fe7ccc8 46template<class Delegate, typename CharType>
ba379fdc
A
47class Parser {
48private:
49 template<class FriendDelegate>
93a37866 50 friend const char* parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit);
ba379fdc
A
51
52 enum ErrorCode {
53 NoError,
54 PatternTooLarge,
55 QuantifierOutOfOrder,
56 QuantifierWithoutAtom,
6fe7ccc8 57 QuantifierTooLarge,
ba379fdc
A
58 MissingParentheses,
59 ParenthesesUnmatched,
60 ParenthesesTypeInvalid,
61 CharacterClassUnmatched,
62 CharacterClassOutOfOrder,
63 EscapeUnterminated,
64 NumberOfErrorCodes
65 };
66
67 /*
68 * CharacterClassParserDelegate:
69 *
70 * The class CharacterClassParserDelegate is used in the parsing of character
71 * classes. This class handles detection of character ranges. This class
72 * implements enough of the delegate interface such that it can be passed to
73 * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused
74 * to perform the parsing of escape characters in character sets.
75 */
76 class CharacterClassParserDelegate {
77 public:
78 CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
79 : m_delegate(delegate)
80 , m_err(err)
14957cd0
A
81 , m_state(Empty)
82 , m_character(0)
ba379fdc
A
83 {
84 }
85
86 /*
87 * begin():
88 *
89 * Called at beginning of construction.
90 */
91 void begin(bool invert)
92 {
93 m_delegate.atomCharacterClassBegin(invert);
94 }
95
96 /*
14957cd0 97 * atomPatternCharacter():
ba379fdc 98 *
14957cd0
A
99 * This method is called either from parseCharacterClass() (for an unescaped
100 * character in a character class), or from parseEscape(). In the former case
101 * the value true will be passed for the argument 'hyphenIsRange', and in this
102 * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/
103 * is different to /[a\-z]/).
ba379fdc 104 */
14957cd0 105 void atomPatternCharacter(UChar ch, bool hyphenIsRange = false)
ba379fdc
A
106 {
107 switch (m_state) {
14957cd0
A
108 case AfterCharacterClass:
109 // Following a builtin character class we need look out for a hyphen.
110 // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
111 // If we see a hyphen following a charater class then unlike usual
112 // we'll report it to the delegate immediately, and put ourself into
113 // a poisoned state. Any following calls to add another character or
114 // character class will result in an error. (A hypen following a
115 // character-class is itself valid, but only at the end of a regex).
116 if (hyphenIsRange && ch == '-') {
117 m_delegate.atomCharacterClassAtom('-');
118 m_state = AfterCharacterClassHyphen;
119 return;
120 }
121 // Otherwise just fall through - cached character so treat this as Empty.
122
123 case Empty:
ba379fdc 124 m_character = ch;
14957cd0
A
125 m_state = CachedCharacter;
126 return;
ba379fdc 127
14957cd0
A
128 case CachedCharacter:
129 if (hyphenIsRange && ch == '-')
130 m_state = CachedCharacterHyphen;
ba379fdc
A
131 else {
132 m_delegate.atomCharacterClassAtom(m_character);
133 m_character = ch;
134 }
14957cd0 135 return;
ba379fdc 136
14957cd0
A
137 case CachedCharacterHyphen:
138 if (ch < m_character) {
ba379fdc 139 m_err = CharacterClassOutOfOrder;
14957cd0
A
140 return;
141 }
142 m_delegate.atomCharacterClassRange(m_character, ch);
143 m_state = Empty;
144 return;
ba379fdc 145
14957cd0
A
146 // See coment in atomBuiltInCharacterClass below.
147 // This too is technically an error, per ECMA-262, and again we
148 // we chose to allow this. Note a subtlely here that while we
149 // diverge from the spec's definition of CharacterRange we do
150 // remain in compliance with the grammar. For example, consider
151 // the expression /[\d-a-z]/. We comply with the grammar in
152 // this case by not allowing a-z to be matched as a range.
153 case AfterCharacterClassHyphen:
154 m_delegate.atomCharacterClassAtom(ch);
155 m_state = Empty;
156 return;
157 }
ba379fdc
A
158 }
159
160 /*
161 * atomBuiltInCharacterClass():
162 *
163 * Adds a built-in character class, called by parseEscape().
164 */
165 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
166 {
14957cd0
A
167 switch (m_state) {
168 case CachedCharacter:
169 // Flush the currently cached character, then fall through.
170 m_delegate.atomCharacterClassAtom(m_character);
171
172 case Empty:
173 case AfterCharacterClass:
174 m_state = AfterCharacterClass;
175 m_delegate.atomCharacterClassBuiltIn(classID, invert);
176 return;
177
178 // If we hit either of these cases, we have an invalid range that
179 // looks something like /[x-\d]/ or /[\d-\d]/.
180 // According to ECMA-262 this should be a syntax error, but
181 // empirical testing shows this to break teh webz. Instead we
182 // comply with to the ECMA-262 grammar, and assume the grammar to
183 // have matched the range correctly, but tweak our interpretation
184 // of CharacterRange. Effectively we implicitly handle the hyphen
185 // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/.
186 case CachedCharacterHyphen:
187 m_delegate.atomCharacterClassAtom(m_character);
188 m_delegate.atomCharacterClassAtom('-');
189 // fall through
190 case AfterCharacterClassHyphen:
191 m_delegate.atomCharacterClassBuiltIn(classID, invert);
192 m_state = Empty;
193 return;
194 }
ba379fdc
A
195 }
196
197 /*
198 * end():
199 *
200 * Called at end of construction.
201 */
202 void end()
203 {
14957cd0
A
204 if (m_state == CachedCharacter)
205 m_delegate.atomCharacterClassAtom(m_character);
206 else if (m_state == CachedCharacterHyphen) {
207 m_delegate.atomCharacterClassAtom(m_character);
208 m_delegate.atomCharacterClassAtom('-');
209 }
ba379fdc
A
210 m_delegate.atomCharacterClassEnd();
211 }
212
213 // parseEscape() should never call these delegate methods when
214 // invoked with inCharacterClass set.
93a37866
A
215 NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { RELEASE_ASSERT_NOT_REACHED(); }
216 NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { RELEASE_ASSERT_NOT_REACHED(); }
ba379fdc
A
217
218 private:
ba379fdc
A
219 Delegate& m_delegate;
220 ErrorCode& m_err;
221 enum CharacterClassConstructionState {
14957cd0
A
222 Empty,
223 CachedCharacter,
224 CachedCharacterHyphen,
225 AfterCharacterClass,
226 AfterCharacterClassHyphen,
ba379fdc
A
227 } m_state;
228 UChar m_character;
229 };
230
93a37866 231 Parser(Delegate& delegate, const String& pattern, unsigned backReferenceLimit)
ba379fdc
A
232 : m_delegate(delegate)
233 , m_backReferenceLimit(backReferenceLimit)
234 , m_err(NoError)
6fe7ccc8 235 , m_data(pattern.getCharacters<CharType>())
14957cd0 236 , m_size(pattern.length())
ba379fdc
A
237 , m_index(0)
238 , m_parenthesesNestingDepth(0)
239 {
240 }
6fe7ccc8 241
ba379fdc
A
242 /*
243 * parseEscape():
244 *
245 * Helper for parseTokens() AND parseCharacterClass().
246 * Unlike the other parser methods, this function does not report tokens
247 * directly to the member delegate (m_delegate), instead tokens are
248 * emitted to the delegate provided as an argument. In the case of atom
249 * escapes, parseTokens() will call parseEscape() passing m_delegate as
250 * an argument, and as such the escape will be reported to the delegate.
251 *
252 * However this method may also be used by parseCharacterClass(), in which
253 * case a CharacterClassParserDelegate will be passed as the delegate that
254 * tokens should be added to. A boolean flag is also provided to indicate
255 * whether that an escape in a CharacterClass is being parsed (some parsing
256 * rules change in this context).
257 *
258 * The boolean value returned by this method indicates whether the token
259 * parsed was an atom (outside of a characted class \b and \B will be
260 * interpreted as assertions).
261 */
262 template<bool inCharacterClass, class EscapeDelegate>
263 bool parseEscape(EscapeDelegate& delegate)
264 {
265 ASSERT(!m_err);
266 ASSERT(peek() == '\\');
267 consume();
268
269 if (atEndOfPattern()) {
270 m_err = EscapeUnterminated;
271 return false;
272 }
273
274 switch (peek()) {
275 // Assertions
276 case 'b':
277 consume();
278 if (inCharacterClass)
279 delegate.atomPatternCharacter('\b');
280 else {
281 delegate.assertionWordBoundary(false);
282 return false;
283 }
284 break;
285 case 'B':
286 consume();
287 if (inCharacterClass)
288 delegate.atomPatternCharacter('B');
289 else {
290 delegate.assertionWordBoundary(true);
291 return false;
292 }
293 break;
294
295 // CharacterClassEscape
296 case 'd':
297 consume();
298 delegate.atomBuiltInCharacterClass(DigitClassID, false);
299 break;
300 case 's':
301 consume();
302 delegate.atomBuiltInCharacterClass(SpaceClassID, false);
303 break;
304 case 'w':
305 consume();
306 delegate.atomBuiltInCharacterClass(WordClassID, false);
307 break;
308 case 'D':
309 consume();
310 delegate.atomBuiltInCharacterClass(DigitClassID, true);
311 break;
312 case 'S':
313 consume();
314 delegate.atomBuiltInCharacterClass(SpaceClassID, true);
315 break;
316 case 'W':
317 consume();
318 delegate.atomBuiltInCharacterClass(WordClassID, true);
319 break;
320
321 // DecimalEscape
322 case '1':
323 case '2':
324 case '3':
325 case '4':
326 case '5':
327 case '6':
328 case '7':
329 case '8':
330 case '9': {
331 // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
332 // First, try to parse this as backreference.
333 if (!inCharacterClass) {
334 ParseState state = saveState();
335
336 unsigned backReference = consumeNumber();
337 if (backReference <= m_backReferenceLimit) {
338 delegate.atomBackReference(backReference);
339 break;
340 }
341
342 restoreState(state);
343 }
344
345 // Not a backreference, and not octal.
346 if (peek() >= '8') {
347 delegate.atomPatternCharacter('\\');
348 break;
349 }
350
351 // Fall-through to handle this as an octal escape.
352 }
353
354 // Octal escape
355 case '0':
356 delegate.atomPatternCharacter(consumeOctal());
357 break;
358
359 // ControlEscape
360 case 'f':
361 consume();
362 delegate.atomPatternCharacter('\f');
363 break;
364 case 'n':
365 consume();
366 delegate.atomPatternCharacter('\n');
367 break;
368 case 'r':
369 consume();
370 delegate.atomPatternCharacter('\r');
371 break;
372 case 't':
373 consume();
374 delegate.atomPatternCharacter('\t');
375 break;
376 case 'v':
377 consume();
378 delegate.atomPatternCharacter('\v');
379 break;
380
381 // ControlLetter
382 case 'c': {
383 ParseState state = saveState();
384 consume();
385 if (!atEndOfPattern()) {
386 int control = consume();
387
388 // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
389 if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
390 delegate.atomPatternCharacter(control & 0x1f);
391 break;
392 }
393 }
394 restoreState(state);
395 delegate.atomPatternCharacter('\\');
396 break;
397 }
398
399 // HexEscape
400 case 'x': {
401 consume();
402 int x = tryConsumeHex(2);
403 if (x == -1)
404 delegate.atomPatternCharacter('x');
405 else
406 delegate.atomPatternCharacter(x);
407 break;
408 }
409
410 // UnicodeEscape
411 case 'u': {
412 consume();
413 int u = tryConsumeHex(4);
414 if (u == -1)
415 delegate.atomPatternCharacter('u');
416 else
417 delegate.atomPatternCharacter(u);
418 break;
419 }
420
421 // IdentityEscape
422 default:
423 delegate.atomPatternCharacter(consume());
424 }
425
426 return true;
427 }
428
429 /*
430 * parseAtomEscape(), parseCharacterClassEscape():
431 *
432 * These methods alias to parseEscape().
433 */
434 bool parseAtomEscape()
435 {
436 return parseEscape<false>(m_delegate);
437 }
438 void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
439 {
440 parseEscape<true>(delegate);
441 }
442
443 /*
444 * parseCharacterClass():
445 *
446 * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
447 * to an instance of CharacterClassParserDelegate, to describe the character class to the
448 * delegate.
449 */
450 void parseCharacterClass()
451 {
452 ASSERT(!m_err);
453 ASSERT(peek() == '[');
454 consume();
455
456 CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);
457
458 characterClassConstructor.begin(tryConsume('^'));
459
460 while (!atEndOfPattern()) {
461 switch (peek()) {
462 case ']':
463 consume();
464 characterClassConstructor.end();
465 return;
466
467 case '\\':
468 parseCharacterClassEscape(characterClassConstructor);
469 break;
470
471 default:
14957cd0 472 characterClassConstructor.atomPatternCharacter(consume(), true);
ba379fdc
A
473 }
474
475 if (m_err)
476 return;
477 }
478
479 m_err = CharacterClassUnmatched;
480 }
481
482 /*
483 * parseParenthesesBegin():
484 *
485 * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
486 */
487 void parseParenthesesBegin()
488 {
489 ASSERT(!m_err);
490 ASSERT(peek() == '(');
491 consume();
492
493 if (tryConsume('?')) {
494 if (atEndOfPattern()) {
495 m_err = ParenthesesTypeInvalid;
496 return;
497 }
498
499 switch (consume()) {
500 case ':':
501 m_delegate.atomParenthesesSubpatternBegin(false);
502 break;
503
504 case '=':
505 m_delegate.atomParentheticalAssertionBegin();
506 break;
507
508 case '!':
509 m_delegate.atomParentheticalAssertionBegin(true);
510 break;
511
512 default:
513 m_err = ParenthesesTypeInvalid;
514 }
515 } else
516 m_delegate.atomParenthesesSubpatternBegin();
517
518 ++m_parenthesesNestingDepth;
519 }
520
521 /*
522 * parseParenthesesEnd():
523 *
524 * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
525 */
526 void parseParenthesesEnd()
527 {
528 ASSERT(!m_err);
529 ASSERT(peek() == ')');
530 consume();
531
532 if (m_parenthesesNestingDepth > 0)
533 m_delegate.atomParenthesesEnd();
534 else
535 m_err = ParenthesesUnmatched;
536
537 --m_parenthesesNestingDepth;
538 }
539
540 /*
541 * parseQuantifier():
542 *
543 * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
544 */
545 void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
546 {
547 ASSERT(!m_err);
548 ASSERT(min <= max);
549
6fe7ccc8
A
550 if (min == UINT_MAX) {
551 m_err = QuantifierTooLarge;
552 return;
553 }
554
ba379fdc
A
555 if (lastTokenWasAnAtom)
556 m_delegate.quantifyAtom(min, max, !tryConsume('?'));
557 else
558 m_err = QuantifierWithoutAtom;
559 }
560
561 /*
562 * parseTokens():
563 *
564 * This method loops over the input pattern reporting tokens to the delegate.
565 * The method returns when a parse error is detected, or the end of the pattern
566 * is reached. One piece of state is tracked around the loop, which is whether
567 * the last token passed to the delegate was an atom (this is necessary to detect
568 * a parse error when a quantifier provided without an atom to quantify).
569 */
570 void parseTokens()
571 {
572 bool lastTokenWasAnAtom = false;
573
574 while (!atEndOfPattern()) {
575 switch (peek()) {
576 case '|':
577 consume();
578 m_delegate.disjunction();
579 lastTokenWasAnAtom = false;
580 break;
581
582 case '(':
583 parseParenthesesBegin();
584 lastTokenWasAnAtom = false;
585 break;
586
587 case ')':
588 parseParenthesesEnd();
589 lastTokenWasAnAtom = true;
590 break;
591
592 case '^':
593 consume();
594 m_delegate.assertionBOL();
595 lastTokenWasAnAtom = false;
596 break;
597
598 case '$':
599 consume();
600 m_delegate.assertionEOL();
601 lastTokenWasAnAtom = false;
602 break;
603
604 case '.':
605 consume();
606 m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);
607 lastTokenWasAnAtom = true;
608 break;
609
610 case '[':
611 parseCharacterClass();
612 lastTokenWasAnAtom = true;
613 break;
614
615 case '\\':
616 lastTokenWasAnAtom = parseAtomEscape();
617 break;
618
619 case '*':
620 consume();
14957cd0 621 parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite);
ba379fdc
A
622 lastTokenWasAnAtom = false;
623 break;
624
625 case '+':
626 consume();
14957cd0 627 parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite);
ba379fdc
A
628 lastTokenWasAnAtom = false;
629 break;
630
631 case '?':
632 consume();
633 parseQuantifier(lastTokenWasAnAtom, 0, 1);
634 lastTokenWasAnAtom = false;
635 break;
636
637 case '{': {
638 ParseState state = saveState();
639
640 consume();
641 if (peekIsDigit()) {
642 unsigned min = consumeNumber();
643 unsigned max = min;
644
645 if (tryConsume(','))
14957cd0 646 max = peekIsDigit() ? consumeNumber() : quantifyInfinite;
ba379fdc
A
647
648 if (tryConsume('}')) {
649 if (min <= max)
650 parseQuantifier(lastTokenWasAnAtom, min, max);
651 else
652 m_err = QuantifierOutOfOrder;
653 lastTokenWasAnAtom = false;
654 break;
655 }
656 }
657
658 restoreState(state);
659 } // if we did not find a complete quantifer, fall through to the default case.
660
661 default:
662 m_delegate.atomPatternCharacter(consume());
663 lastTokenWasAnAtom = true;
664 }
665
666 if (m_err)
667 return;
668 }
669
670 if (m_parenthesesNestingDepth > 0)
671 m_err = MissingParentheses;
672 }
673
674 /*
675 * parse():
676 *
14957cd0 677 * This method calls parseTokens() to parse over the input and converts any
ba379fdc
A
678 * error code to a const char* for a result.
679 */
680 const char* parse()
681 {
ba379fdc
A
682 if (m_size > MAX_PATTERN_SIZE)
683 m_err = PatternTooLarge;
684 else
685 parseTokens();
686 ASSERT(atEndOfPattern() || m_err);
687
ba379fdc
A
688 // The order of this array must match the ErrorCode enum.
689 static const char* errorMessages[NumberOfErrorCodes] = {
690 0, // NoError
14957cd0
A
691 REGEXP_ERROR_PREFIX "regular expression too large",
692 REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier",
693 REGEXP_ERROR_PREFIX "nothing to repeat",
6fe7ccc8 694 REGEXP_ERROR_PREFIX "number too large in {} quantifier",
14957cd0
A
695 REGEXP_ERROR_PREFIX "missing )",
696 REGEXP_ERROR_PREFIX "unmatched parentheses",
697 REGEXP_ERROR_PREFIX "unrecognized character after (?",
698 REGEXP_ERROR_PREFIX "missing terminating ] for character class",
699 REGEXP_ERROR_PREFIX "range out of order in character class",
700 REGEXP_ERROR_PREFIX "\\ at end of pattern"
ba379fdc
A
701 };
702
703 return errorMessages[m_err];
704 }
705
ba379fdc
A
706 // Misc helper functions:
707
708 typedef unsigned ParseState;
709
710 ParseState saveState()
711 {
712 return m_index;
713 }
714
715 void restoreState(ParseState state)
716 {
717 m_index = state;
718 }
719
720 bool atEndOfPattern()
721 {
722 ASSERT(m_index <= m_size);
723 return m_index == m_size;
724 }
725
726 int peek()
727 {
728 ASSERT(m_index < m_size);
729 return m_data[m_index];
730 }
731
732 bool peekIsDigit()
733 {
734 return !atEndOfPattern() && WTF::isASCIIDigit(peek());
735 }
736
737 unsigned peekDigit()
738 {
739 ASSERT(peekIsDigit());
740 return peek() - '0';
741 }
742
743 int consume()
744 {
745 ASSERT(m_index < m_size);
746 return m_data[m_index++];
747 }
748
749 unsigned consumeDigit()
750 {
751 ASSERT(peekIsDigit());
752 return consume() - '0';
753 }
754
755 unsigned consumeNumber()
756 {
757 unsigned n = consumeDigit();
758 // check for overflow.
759 for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) {
760 n = newValue;
761 consume();
762 }
763 return n;
764 }
765
766 unsigned consumeOctal()
767 {
768 ASSERT(WTF::isASCIIOctalDigit(peek()));
769
770 unsigned n = consumeDigit();
771 while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
772 n = n * 8 + consumeDigit();
773 return n;
774 }
775
776 bool tryConsume(UChar ch)
777 {
778 if (atEndOfPattern() || (m_data[m_index] != ch))
779 return false;
780 ++m_index;
781 return true;
782 }
783
784 int tryConsumeHex(int count)
785 {
786 ParseState state = saveState();
787
788 int n = 0;
789 while (count--) {
790 if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
791 restoreState(state);
792 return -1;
793 }
794 n = (n << 4) | WTF::toASCIIHexValue(consume());
795 }
796 return n;
797 }
798
799 Delegate& m_delegate;
800 unsigned m_backReferenceLimit;
801 ErrorCode m_err;
6fe7ccc8 802 const CharType* m_data;
ba379fdc
A
803 unsigned m_size;
804 unsigned m_index;
805 unsigned m_parenthesesNestingDepth;
806
807 // Derived by empirical testing of compile time in PCRE and WREC.
808 static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
809};
810
811/*
812 * Yarr::parse():
813 *
814 * The parse method is passed a pattern to be parsed and a delegate upon which
815 * callbacks will be made to record the parsed tokens forming the regex.
816 * Yarr::parse() returns null on success, or a const C string providing an error
817 * message where a parse error occurs.
818 *
819 * The Delegate must implement the following interface:
820 *
821 * void assertionBOL();
822 * void assertionEOL();
823 * void assertionWordBoundary(bool invert);
824 *
825 * void atomPatternCharacter(UChar ch);
826 * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
827 * void atomCharacterClassBegin(bool invert)
828 * void atomCharacterClassAtom(UChar ch)
829 * void atomCharacterClassRange(UChar begin, UChar end)
830 * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
831 * void atomCharacterClassEnd()
832 * void atomParenthesesSubpatternBegin(bool capture = true);
833 * void atomParentheticalAssertionBegin(bool invert = false);
834 * void atomParenthesesEnd();
835 * void atomBackReference(unsigned subpatternId);
836 *
837 * void quantifyAtom(unsigned min, unsigned max, bool greedy);
838 *
839 * void disjunction();
840 *
ba379fdc
A
841 * The regular expression is described by a sequence of assertion*() and atom*()
842 * callbacks to the delegate, describing the terms in the regular expression.
843 * Following an atom a quantifyAtom() call may occur to indicate that the previous
844 * atom should be quantified. In the case of atoms described across multiple
845 * calls (parentheses and character classes) the call to quantifyAtom() will come
846 * after the call to the atom*End() method, never after atom*Begin().
847 *
848 * Character classes may either be described by a single call to
849 * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
850 * In the latter case, ...Begin() will be called, followed by a sequence of
851 * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
852 *
853 * Sequences of atoms and assertions are broken into alternatives via calls to
854 * disjunction(). Assertions, atoms, and disjunctions emitted between calls to
855 * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
856 * atomParenthesesBegin() is passed a subpatternId. In the case of a regular
857 * capturing subpattern, this will be the subpatternId associated with these
858 * parentheses, and will also by definition be the lowest subpatternId of these
859 * parentheses and of any nested paretheses. The atomParenthesesEnd() method
860 * is passed the subpatternId of the last capturing subexpression nested within
861 * these paretheses. In the case of a capturing subpattern with no nested
862 * capturing subpatterns, the same subpatternId will be passed to the begin and
863 * end functions. In the case of non-capturing subpatterns the subpatternId
864 * passed to the begin method is also the first possible subpatternId that might
865 * be nested within these paretheses. If a set of non-capturing parentheses does
866 * not contain any capturing subpatterns, then the subpatternId passed to begin
867 * will be greater than the subpatternId passed to end.
868 */
869
870template<class Delegate>
93a37866 871const char* parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite)
ba379fdc 872{
6fe7ccc8
A
873 if (pattern.is8Bit())
874 return Parser<Delegate, LChar>(delegate, pattern, backReferenceLimit).parse();
875 return Parser<Delegate, UChar>(delegate, pattern, backReferenceLimit).parse();
ba379fdc
A
876}
877
878} } // namespace JSC::Yarr
879
14957cd0 880#endif // YarrParser_h