]> git.saurik.com Git - apple/javascriptcore.git/blame - yarr/YarrParser.h
JavaScriptCore-7600.1.4.11.8.tar.gz
[apple/javascriptcore.git] / yarr / YarrParser.h
CommitLineData
ba379fdc
A
1/*
2 * Copyright (C) 2009 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
14957cd0
A
26#ifndef YarrParser_h
27#define YarrParser_h
ba379fdc 28
14957cd0 29#include "Yarr.h"
ba379fdc 30#include <wtf/ASCIICType.h>
93a37866 31#include <wtf/text/WTFString.h>
ba379fdc
A
32
33namespace JSC { namespace Yarr {
34
14957cd0
A
35#define REGEXP_ERROR_PREFIX "Invalid regular expression: "
36
ba379fdc
A
// Identifies the built-in character classes the parser can report to a
// delegate. parseEscape() maps \d, \s and \w (and their inverted upper-case
// forms) onto the first three values; parseTokens() reports '.' as an
// inverted NewlineClassID.
enum BuiltInCharacterClassID {
    DigitClassID,   // \d and \D
    SpaceClassID,   // \s and \S
    WordClassID,    // \w and \W
    NewlineClassID, // used (inverted) for '.'
};
43
44// The Parser class should not be used directly - only via the Yarr::parse() method.
6fe7ccc8 45template<class Delegate, typename CharType>
ba379fdc
A
46class Parser {
47private:
48 template<class FriendDelegate>
93a37866 49 friend const char* parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit);
ba379fdc
A
50
// Parse failure reasons. The numeric value of each enumerator is used by
// parse() as an index into its errorMessages table, so the order here and the
// order of that table must be kept in step. NumberOfErrorCodes is not an
// error; it only sizes the table.
enum ErrorCode {
    NoError,                  // parse succeeded (null message)
    PatternTooLarge,          // pattern length exceeds MAX_PATTERN_SIZE
    QuantifierOutOfOrder,     // {min,max} with min > max
    QuantifierWithoutAtom,    // quantifier with nothing to repeat
    QuantifierTooLarge,       // quantifier minimum equal to UINT_MAX
    MissingParentheses,       // pattern ended with groups still open
    ParenthesesUnmatched,     // ')' with no open group
    ParenthesesTypeInvalid,   // "(?" not followed by ':', '=' or '!'
    CharacterClassUnmatched,  // '[' without a terminating ']'
    CharacterClassOutOfOrder, // range whose end is below its start
    EscapeUnterminated,       // '\' as the last character
    NumberOfErrorCodes
};
65
66 /*
67 * CharacterClassParserDelegate:
68 *
69 * The class CharacterClassParserDelegate is used in the parsing of character
70 * classes. This class handles detection of character ranges. This class
71 * implements enough of the delegate interface such that it can be passed to
72 * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused
73 * to perform the parsing of escape characters in character sets.
74 */
75 class CharacterClassParserDelegate {
76 public:
77 CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
78 : m_delegate(delegate)
79 , m_err(err)
14957cd0
A
80 , m_state(Empty)
81 , m_character(0)
ba379fdc
A
82 {
83 }
84
85 /*
86 * begin():
87 *
88 * Called at beginning of construction.
89 */
90 void begin(bool invert)
91 {
92 m_delegate.atomCharacterClassBegin(invert);
93 }
94
95 /*
14957cd0 96 * atomPatternCharacter():
ba379fdc 97 *
14957cd0
A
98 * This method is called either from parseCharacterClass() (for an unescaped
99 * character in a character class), or from parseEscape(). In the former case
100 * the value true will be passed for the argument 'hyphenIsRange', and in this
101 * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/
102 * is different to /[a\-z]/).
ba379fdc 103 */
14957cd0 104 void atomPatternCharacter(UChar ch, bool hyphenIsRange = false)
ba379fdc
A
105 {
106 switch (m_state) {
14957cd0
A
107 case AfterCharacterClass:
108 // Following a builtin character class we need look out for a hyphen.
109 // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
110 // If we see a hyphen following a charater class then unlike usual
111 // we'll report it to the delegate immediately, and put ourself into
112 // a poisoned state. Any following calls to add another character or
113 // character class will result in an error. (A hypen following a
114 // character-class is itself valid, but only at the end of a regex).
115 if (hyphenIsRange && ch == '-') {
116 m_delegate.atomCharacterClassAtom('-');
117 m_state = AfterCharacterClassHyphen;
118 return;
119 }
120 // Otherwise just fall through - cached character so treat this as Empty.
81345200 121 FALLTHROUGH;
14957cd0
A
122
123 case Empty:
ba379fdc 124 m_character = ch;
14957cd0
A
125 m_state = CachedCharacter;
126 return;
ba379fdc 127
14957cd0
A
128 case CachedCharacter:
129 if (hyphenIsRange && ch == '-')
130 m_state = CachedCharacterHyphen;
ba379fdc
A
131 else {
132 m_delegate.atomCharacterClassAtom(m_character);
133 m_character = ch;
134 }
14957cd0 135 return;
ba379fdc 136
14957cd0
A
137 case CachedCharacterHyphen:
138 if (ch < m_character) {
ba379fdc 139 m_err = CharacterClassOutOfOrder;
14957cd0
A
140 return;
141 }
142 m_delegate.atomCharacterClassRange(m_character, ch);
143 m_state = Empty;
144 return;
ba379fdc 145
14957cd0
A
146 // See coment in atomBuiltInCharacterClass below.
147 // This too is technically an error, per ECMA-262, and again we
148 // we chose to allow this. Note a subtlely here that while we
149 // diverge from the spec's definition of CharacterRange we do
150 // remain in compliance with the grammar. For example, consider
151 // the expression /[\d-a-z]/. We comply with the grammar in
152 // this case by not allowing a-z to be matched as a range.
153 case AfterCharacterClassHyphen:
154 m_delegate.atomCharacterClassAtom(ch);
155 m_state = Empty;
156 return;
157 }
ba379fdc
A
158 }
159
160 /*
161 * atomBuiltInCharacterClass():
162 *
163 * Adds a built-in character class, called by parseEscape().
164 */
165 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
166 {
14957cd0
A
167 switch (m_state) {
168 case CachedCharacter:
169 // Flush the currently cached character, then fall through.
170 m_delegate.atomCharacterClassAtom(m_character);
81345200 171 FALLTHROUGH;
14957cd0
A
172 case Empty:
173 case AfterCharacterClass:
174 m_state = AfterCharacterClass;
175 m_delegate.atomCharacterClassBuiltIn(classID, invert);
176 return;
177
178 // If we hit either of these cases, we have an invalid range that
179 // looks something like /[x-\d]/ or /[\d-\d]/.
180 // According to ECMA-262 this should be a syntax error, but
181 // empirical testing shows this to break teh webz. Instead we
182 // comply with to the ECMA-262 grammar, and assume the grammar to
183 // have matched the range correctly, but tweak our interpretation
184 // of CharacterRange. Effectively we implicitly handle the hyphen
185 // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/.
186 case CachedCharacterHyphen:
187 m_delegate.atomCharacterClassAtom(m_character);
188 m_delegate.atomCharacterClassAtom('-');
81345200 189 FALLTHROUGH;
14957cd0
A
190 case AfterCharacterClassHyphen:
191 m_delegate.atomCharacterClassBuiltIn(classID, invert);
192 m_state = Empty;
193 return;
194 }
ba379fdc
A
195 }
196
197 /*
198 * end():
199 *
200 * Called at end of construction.
201 */
202 void end()
203 {
14957cd0
A
204 if (m_state == CachedCharacter)
205 m_delegate.atomCharacterClassAtom(m_character);
206 else if (m_state == CachedCharacterHyphen) {
207 m_delegate.atomCharacterClassAtom(m_character);
208 m_delegate.atomCharacterClassAtom('-');
209 }
ba379fdc
A
210 m_delegate.atomCharacterClassEnd();
211 }
212
213 // parseEscape() should never call these delegate methods when
214 // invoked with inCharacterClass set.
93a37866
A
215 NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { RELEASE_ASSERT_NOT_REACHED(); }
216 NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { RELEASE_ASSERT_NOT_REACHED(); }
ba379fdc
A
217
218 private:
ba379fdc
A
219 Delegate& m_delegate;
220 ErrorCode& m_err;
221 enum CharacterClassConstructionState {
14957cd0
A
222 Empty,
223 CachedCharacter,
224 CachedCharacterHyphen,
225 AfterCharacterClass,
226 AfterCharacterClassHyphen,
ba379fdc
A
227 } m_state;
228 UChar m_character;
229 };
230
93a37866 231 Parser(Delegate& delegate, const String& pattern, unsigned backReferenceLimit)
ba379fdc
A
232 : m_delegate(delegate)
233 , m_backReferenceLimit(backReferenceLimit)
234 , m_err(NoError)
81345200 235 , m_data(pattern.characters<CharType>())
14957cd0 236 , m_size(pattern.length())
ba379fdc
A
237 , m_index(0)
238 , m_parenthesesNestingDepth(0)
239 {
240 }
6fe7ccc8 241
ba379fdc
A
242 /*
243 * parseEscape():
244 *
245 * Helper for parseTokens() AND parseCharacterClass().
246 * Unlike the other parser methods, this function does not report tokens
247 * directly to the member delegate (m_delegate), instead tokens are
248 * emitted to the delegate provided as an argument. In the case of atom
249 * escapes, parseTokens() will call parseEscape() passing m_delegate as
250 * an argument, and as such the escape will be reported to the delegate.
251 *
252 * However this method may also be used by parseCharacterClass(), in which
253 * case a CharacterClassParserDelegate will be passed as the delegate that
254 * tokens should be added to. A boolean flag is also provided to indicate
255 * whether an escape in a CharacterClass is being parsed (some parsing
256 * rules change in this context).
257 *
258 * The boolean value returned by this method indicates whether the token
259 * parsed was an atom (outside of a character class \b and \B will be
260 * interpreted as assertions).
261 */
262 template<bool inCharacterClass, class EscapeDelegate>
263 bool parseEscape(EscapeDelegate& delegate)
264 {
265 ASSERT(!m_err);
266 ASSERT(peek() == '\\');
267 consume();
268
269 if (atEndOfPattern()) {
270 m_err = EscapeUnterminated;
271 return false;
272 }
273
274 switch (peek()) {
275 // Assertions
276 case 'b':
277 consume();
278 if (inCharacterClass)
279 delegate.atomPatternCharacter('\b');
280 else {
281 delegate.assertionWordBoundary(false);
282 return false;
283 }
284 break;
285 case 'B':
286 consume();
287 if (inCharacterClass)
288 delegate.atomPatternCharacter('B');
289 else {
290 delegate.assertionWordBoundary(true);
291 return false;
292 }
293 break;
294
295 // CharacterClassEscape
296 case 'd':
297 consume();
298 delegate.atomBuiltInCharacterClass(DigitClassID, false);
299 break;
300 case 's':
301 consume();
302 delegate.atomBuiltInCharacterClass(SpaceClassID, false);
303 break;
304 case 'w':
305 consume();
306 delegate.atomBuiltInCharacterClass(WordClassID, false);
307 break;
308 case 'D':
309 consume();
310 delegate.atomBuiltInCharacterClass(DigitClassID, true);
311 break;
312 case 'S':
313 consume();
314 delegate.atomBuiltInCharacterClass(SpaceClassID, true);
315 break;
316 case 'W':
317 consume();
318 delegate.atomBuiltInCharacterClass(WordClassID, true);
319 break;
320
321 // DecimalEscape
322 case '1':
323 case '2':
324 case '3':
325 case '4':
326 case '5':
327 case '6':
328 case '7':
329 case '8':
330 case '9': {
331 // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
332 // First, try to parse this as backreference.
333 if (!inCharacterClass) {
334 ParseState state = saveState();
335
336 unsigned backReference = consumeNumber();
337 if (backReference <= m_backReferenceLimit) {
338 delegate.atomBackReference(backReference);
339 break;
340 }
341
342 restoreState(state);
343 }
344
345 // Not a backreference, and not octal.
346 if (peek() >= '8') {
347 delegate.atomPatternCharacter('\\');
348 break;
349 }
350
351 // Fall-through to handle this as an octal escape.
81345200 352 FALLTHROUGH;
ba379fdc
A
353 }
354
355 // Octal escape
356 case '0':
357 delegate.atomPatternCharacter(consumeOctal());
358 break;
359
360 // ControlEscape
361 case 'f':
362 consume();
363 delegate.atomPatternCharacter('\f');
364 break;
365 case 'n':
366 consume();
367 delegate.atomPatternCharacter('\n');
368 break;
369 case 'r':
370 consume();
371 delegate.atomPatternCharacter('\r');
372 break;
373 case 't':
374 consume();
375 delegate.atomPatternCharacter('\t');
376 break;
377 case 'v':
378 consume();
379 delegate.atomPatternCharacter('\v');
380 break;
381
382 // ControlLetter
383 case 'c': {
384 ParseState state = saveState();
385 consume();
386 if (!atEndOfPattern()) {
387 int control = consume();
388
389 // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
390 if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
391 delegate.atomPatternCharacter(control & 0x1f);
392 break;
393 }
394 }
395 restoreState(state);
396 delegate.atomPatternCharacter('\\');
397 break;
398 }
399
400 // HexEscape
401 case 'x': {
402 consume();
403 int x = tryConsumeHex(2);
404 if (x == -1)
405 delegate.atomPatternCharacter('x');
406 else
407 delegate.atomPatternCharacter(x);
408 break;
409 }
410
411 // UnicodeEscape
412 case 'u': {
413 consume();
414 int u = tryConsumeHex(4);
415 if (u == -1)
416 delegate.atomPatternCharacter('u');
417 else
418 delegate.atomPatternCharacter(u);
419 break;
420 }
421
422 // IdentityEscape
423 default:
424 delegate.atomPatternCharacter(consume());
425 }
426
427 return true;
428 }
429
430 /*
431 * parseAtomEscape(), parseCharacterClassEscape():
432 *
433 * These methods alias to parseEscape().
434 */
435 bool parseAtomEscape()
436 {
437 return parseEscape<false>(m_delegate);
438 }
439 void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
440 {
441 parseEscape<true>(delegate);
442 }
443
444 /*
445 * parseCharacterClass():
446 *
447 * Helper for parseTokens(); calls directly and indirectly (via parseCharacterClassEscape)
448 * to an instance of CharacterClassParserDelegate, to describe the character class to the
449 * delegate.
450 */
451 void parseCharacterClass()
452 {
453 ASSERT(!m_err);
454 ASSERT(peek() == '[');
455 consume();
456
457 CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);
458
459 characterClassConstructor.begin(tryConsume('^'));
460
461 while (!atEndOfPattern()) {
462 switch (peek()) {
463 case ']':
464 consume();
465 characterClassConstructor.end();
466 return;
467
468 case '\\':
469 parseCharacterClassEscape(characterClassConstructor);
470 break;
471
472 default:
14957cd0 473 characterClassConstructor.atomPatternCharacter(consume(), true);
ba379fdc
A
474 }
475
476 if (m_err)
477 return;
478 }
479
480 m_err = CharacterClassUnmatched;
481 }
482
483 /*
484 * parseParenthesesBegin():
485 *
486 * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
487 */
488 void parseParenthesesBegin()
489 {
490 ASSERT(!m_err);
491 ASSERT(peek() == '(');
492 consume();
493
494 if (tryConsume('?')) {
495 if (atEndOfPattern()) {
496 m_err = ParenthesesTypeInvalid;
497 return;
498 }
499
500 switch (consume()) {
501 case ':':
502 m_delegate.atomParenthesesSubpatternBegin(false);
503 break;
504
505 case '=':
506 m_delegate.atomParentheticalAssertionBegin();
507 break;
508
509 case '!':
510 m_delegate.atomParentheticalAssertionBegin(true);
511 break;
512
513 default:
514 m_err = ParenthesesTypeInvalid;
515 }
516 } else
517 m_delegate.atomParenthesesSubpatternBegin();
518
519 ++m_parenthesesNestingDepth;
520 }
521
522 /*
523 * parseParenthesesEnd():
524 *
525 * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
526 */
527 void parseParenthesesEnd()
528 {
529 ASSERT(!m_err);
530 ASSERT(peek() == ')');
531 consume();
532
533 if (m_parenthesesNestingDepth > 0)
534 m_delegate.atomParenthesesEnd();
535 else
536 m_err = ParenthesesUnmatched;
537
538 --m_parenthesesNestingDepth;
539 }
540
541 /*
542 * parseQuantifier():
543 *
544 * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
545 */
546 void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
547 {
548 ASSERT(!m_err);
549 ASSERT(min <= max);
550
6fe7ccc8
A
551 if (min == UINT_MAX) {
552 m_err = QuantifierTooLarge;
553 return;
554 }
555
ba379fdc
A
556 if (lastTokenWasAnAtom)
557 m_delegate.quantifyAtom(min, max, !tryConsume('?'));
558 else
559 m_err = QuantifierWithoutAtom;
560 }
561
562 /*
563 * parseTokens():
564 *
565 * This method loops over the input pattern reporting tokens to the delegate.
566 * The method returns when a parse error is detected, or the end of the pattern
567 * is reached. One piece of state is tracked around the loop, which is whether
568 * the last token passed to the delegate was an atom (this is necessary to detect
569 * a parse error when a quantifier provided without an atom to quantify).
570 */
571 void parseTokens()
572 {
573 bool lastTokenWasAnAtom = false;
574
575 while (!atEndOfPattern()) {
576 switch (peek()) {
577 case '|':
578 consume();
579 m_delegate.disjunction();
580 lastTokenWasAnAtom = false;
581 break;
582
583 case '(':
584 parseParenthesesBegin();
585 lastTokenWasAnAtom = false;
586 break;
587
588 case ')':
589 parseParenthesesEnd();
590 lastTokenWasAnAtom = true;
591 break;
592
593 case '^':
594 consume();
595 m_delegate.assertionBOL();
596 lastTokenWasAnAtom = false;
597 break;
598
599 case '$':
600 consume();
601 m_delegate.assertionEOL();
602 lastTokenWasAnAtom = false;
603 break;
604
605 case '.':
606 consume();
607 m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);
608 lastTokenWasAnAtom = true;
609 break;
610
611 case '[':
612 parseCharacterClass();
613 lastTokenWasAnAtom = true;
614 break;
615
616 case '\\':
617 lastTokenWasAnAtom = parseAtomEscape();
618 break;
619
620 case '*':
621 consume();
14957cd0 622 parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite);
ba379fdc
A
623 lastTokenWasAnAtom = false;
624 break;
625
626 case '+':
627 consume();
14957cd0 628 parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite);
ba379fdc
A
629 lastTokenWasAnAtom = false;
630 break;
631
632 case '?':
633 consume();
634 parseQuantifier(lastTokenWasAnAtom, 0, 1);
635 lastTokenWasAnAtom = false;
636 break;
637
638 case '{': {
639 ParseState state = saveState();
640
641 consume();
642 if (peekIsDigit()) {
643 unsigned min = consumeNumber();
644 unsigned max = min;
645
646 if (tryConsume(','))
14957cd0 647 max = peekIsDigit() ? consumeNumber() : quantifyInfinite;
ba379fdc
A
648
649 if (tryConsume('}')) {
650 if (min <= max)
651 parseQuantifier(lastTokenWasAnAtom, min, max);
652 else
653 m_err = QuantifierOutOfOrder;
654 lastTokenWasAnAtom = false;
655 break;
656 }
657 }
658
659 restoreState(state);
81345200
A
660 }
661 // if we did not find a complete quantifer, fall through to the default case.
662 FALLTHROUGH;
ba379fdc
A
663
664 default:
665 m_delegate.atomPatternCharacter(consume());
666 lastTokenWasAnAtom = true;
667 }
668
669 if (m_err)
670 return;
671 }
672
673 if (m_parenthesesNestingDepth > 0)
674 m_err = MissingParentheses;
675 }
676
677 /*
678 * parse():
679 *
14957cd0 680 * This method calls parseTokens() to parse over the input and converts any
ba379fdc
A
681 * error code to a const char* for a result.
682 */
683 const char* parse()
684 {
ba379fdc
A
685 if (m_size > MAX_PATTERN_SIZE)
686 m_err = PatternTooLarge;
687 else
688 parseTokens();
689 ASSERT(atEndOfPattern() || m_err);
690
ba379fdc
A
691 // The order of this array must match the ErrorCode enum.
692 static const char* errorMessages[NumberOfErrorCodes] = {
693 0, // NoError
14957cd0
A
694 REGEXP_ERROR_PREFIX "regular expression too large",
695 REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier",
696 REGEXP_ERROR_PREFIX "nothing to repeat",
6fe7ccc8 697 REGEXP_ERROR_PREFIX "number too large in {} quantifier",
14957cd0
A
698 REGEXP_ERROR_PREFIX "missing )",
699 REGEXP_ERROR_PREFIX "unmatched parentheses",
700 REGEXP_ERROR_PREFIX "unrecognized character after (?",
701 REGEXP_ERROR_PREFIX "missing terminating ] for character class",
702 REGEXP_ERROR_PREFIX "range out of order in character class",
703 REGEXP_ERROR_PREFIX "\\ at end of pattern"
ba379fdc
A
704 };
705
706 return errorMessages[m_err];
707 }
708
ba379fdc
A
709 // Misc helper functions:
710
711 typedef unsigned ParseState;
712
713 ParseState saveState()
714 {
715 return m_index;
716 }
717
718 void restoreState(ParseState state)
719 {
720 m_index = state;
721 }
722
723 bool atEndOfPattern()
724 {
725 ASSERT(m_index <= m_size);
726 return m_index == m_size;
727 }
728
729 int peek()
730 {
731 ASSERT(m_index < m_size);
732 return m_data[m_index];
733 }
734
735 bool peekIsDigit()
736 {
737 return !atEndOfPattern() && WTF::isASCIIDigit(peek());
738 }
739
740 unsigned peekDigit()
741 {
742 ASSERT(peekIsDigit());
743 return peek() - '0';
744 }
745
746 int consume()
747 {
748 ASSERT(m_index < m_size);
749 return m_data[m_index++];
750 }
751
752 unsigned consumeDigit()
753 {
754 ASSERT(peekIsDigit());
755 return consume() - '0';
756 }
757
758 unsigned consumeNumber()
759 {
760 unsigned n = consumeDigit();
761 // check for overflow.
762 for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) {
763 n = newValue;
764 consume();
765 }
766 return n;
767 }
768
769 unsigned consumeOctal()
770 {
771 ASSERT(WTF::isASCIIOctalDigit(peek()));
772
773 unsigned n = consumeDigit();
774 while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
775 n = n * 8 + consumeDigit();
776 return n;
777 }
778
779 bool tryConsume(UChar ch)
780 {
781 if (atEndOfPattern() || (m_data[m_index] != ch))
782 return false;
783 ++m_index;
784 return true;
785 }
786
787 int tryConsumeHex(int count)
788 {
789 ParseState state = saveState();
790
791 int n = 0;
792 while (count--) {
793 if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
794 restoreState(state);
795 return -1;
796 }
797 n = (n << 4) | WTF::toASCIIHexValue(consume());
798 }
799 return n;
800 }
801
802 Delegate& m_delegate;
803 unsigned m_backReferenceLimit;
804 ErrorCode m_err;
6fe7ccc8 805 const CharType* m_data;
ba379fdc
A
806 unsigned m_size;
807 unsigned m_index;
808 unsigned m_parenthesesNestingDepth;
809
810 // Derived by empirical testing of compile time in PCRE and WREC.
811 static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
812};
813
814/*
815 * Yarr::parse():
816 *
817 * The parse method is passed a pattern to be parsed and a delegate upon which
818 * callbacks will be made to record the parsed tokens forming the regex.
819 * Yarr::parse() returns null on success, or a const C string providing an error
820 * message where a parse error occurs.
821 *
822 * The Delegate must implement the following interface:
823 *
824 * void assertionBOL();
825 * void assertionEOL();
826 * void assertionWordBoundary(bool invert);
827 *
828 * void atomPatternCharacter(UChar ch);
829 * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
830 * void atomCharacterClassBegin(bool invert)
831 * void atomCharacterClassAtom(UChar ch)
832 * void atomCharacterClassRange(UChar begin, UChar end)
833 * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
834 * void atomCharacterClassEnd()
835 * void atomParenthesesSubpatternBegin(bool capture = true);
836 * void atomParentheticalAssertionBegin(bool invert = false);
837 * void atomParenthesesEnd();
838 * void atomBackReference(unsigned subpatternId);
839 *
840 * void quantifyAtom(unsigned min, unsigned max, bool greedy);
841 *
842 * void disjunction();
843 *
ba379fdc
A
844 * The regular expression is described by a sequence of assertion*() and atom*()
845 * callbacks to the delegate, describing the terms in the regular expression.
846 * Following an atom a quantifyAtom() call may occur to indicate that the previous
847 * atom should be quantified. In the case of atoms described across multiple
848 * calls (parentheses and character classes) the call to quantifyAtom() will come
849 * after the call to the atom*End() method, never after atom*Begin().
850 *
851 * Character classes may either be described by a single call to
852 * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
853 * In the latter case, ...Begin() will be called, followed by a sequence of
854 * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
855 *
856 * Sequences of atoms and assertions are broken into alternatives via calls to
857 * disjunction(). Assertions, atoms, and disjunctions emitted between calls to
858 * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
859 * atomParenthesesBegin() is passed a subpatternId. In the case of a regular
860 * capturing subpattern, this will be the subpatternId associated with these
861 * parentheses, and will also by definition be the lowest subpatternId of these
862 * parentheses and of any nested parentheses. The atomParenthesesEnd() method
863 * is passed the subpatternId of the last capturing subexpression nested within
864 * these parentheses. In the case of a capturing subpattern with no nested
865 * capturing subpatterns, the same subpatternId will be passed to the begin and
866 * end functions. In the case of non-capturing subpatterns the subpatternId
867 * passed to the begin method is also the first possible subpatternId that might
868 * be nested within these parentheses. If a set of non-capturing parentheses does
869 * not contain any capturing subpatterns, then the subpatternId passed to begin
870 * will be greater than the subpatternId passed to end.
871 */
872
873template<class Delegate>
93a37866 874const char* parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite)
ba379fdc 875{
6fe7ccc8
A
876 if (pattern.is8Bit())
877 return Parser<Delegate, LChar>(delegate, pattern, backReferenceLimit).parse();
878 return Parser<Delegate, UChar>(delegate, pattern, backReferenceLimit).parse();
ba379fdc
A
879}
880
881} } // namespace JSC::Yarr
882
14957cd0 883#endif // YarrParser_h