2 * Copyright (C) 2009 Apple Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 #include <wtf/ASCIICType.h>
31 #include <wtf/text/WTFString.h>
33 namespace JSC
{ namespace Yarr
{
35 #define REGEXP_ERROR_PREFIX "Invalid regular expression: "
37 enum BuiltInCharacterClassID
{
44 // The Parser class should not be used directly - only via the Yarr::parse() method.
45 template<class Delegate
, typename CharType
>
48 template<class FriendDelegate
>
49 friend const char* parse(FriendDelegate
&, const String
& pattern
, unsigned backReferenceLimit
);
55 QuantifierWithoutAtom
,
59 ParenthesesTypeInvalid
,
60 CharacterClassUnmatched
,
61 CharacterClassOutOfOrder
,
67 * CharacterClassParserDelegate:
69 * The class CharacterClassParserDelegate is used in the parsing of character
70 * classes. This class handles detection of character ranges. This class
71 * implements enough of the delegate interface such that it can be passed to
72 * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused
73 * to perform the parsing of escape characters in character sets.
75 class CharacterClassParserDelegate
{
77 CharacterClassParserDelegate(Delegate
& delegate
, ErrorCode
& err
)
78 : m_delegate(delegate
)
88 * Called at beginning of construction.
90 void begin(bool invert
)
92 m_delegate
.atomCharacterClassBegin(invert
);
96 * atomPatternCharacter():
98 * This method is called either from parseCharacterClass() (for an unescaped
99 * character in a character class), or from parseEscape(). In the former case
100 * the value true will be passed for the argument 'hyphenIsRange', and in this
101 * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/
102 * is different to /[a\-z]/).
104 void atomPatternCharacter(UChar ch
, bool hyphenIsRange
= false)
107 case AfterCharacterClass
:
108 // Following a builtin character class we need look out for a hyphen.
109 // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
110 // If we see a hyphen following a charater class then unlike usual
111 // we'll report it to the delegate immediately, and put ourself into
112 // a poisoned state. Any following calls to add another character or
113 // character class will result in an error. (A hypen following a
114 // character-class is itself valid, but only at the end of a regex).
115 if (hyphenIsRange
&& ch
== '-') {
116 m_delegate
.atomCharacterClassAtom('-');
117 m_state
= AfterCharacterClassHyphen
;
120 // Otherwise just fall through - cached character so treat this as Empty.
125 m_state
= CachedCharacter
;
128 case CachedCharacter
:
129 if (hyphenIsRange
&& ch
== '-')
130 m_state
= CachedCharacterHyphen
;
132 m_delegate
.atomCharacterClassAtom(m_character
);
137 case CachedCharacterHyphen
:
138 if (ch
< m_character
) {
139 m_err
= CharacterClassOutOfOrder
;
142 m_delegate
.atomCharacterClassRange(m_character
, ch
);
146 // See coment in atomBuiltInCharacterClass below.
147 // This too is technically an error, per ECMA-262, and again we
148 // we chose to allow this. Note a subtlely here that while we
149 // diverge from the spec's definition of CharacterRange we do
150 // remain in compliance with the grammar. For example, consider
151 // the expression /[\d-a-z]/. We comply with the grammar in
152 // this case by not allowing a-z to be matched as a range.
153 case AfterCharacterClassHyphen
:
154 m_delegate
.atomCharacterClassAtom(ch
);
161 * atomBuiltInCharacterClass():
163 * Adds a built-in character class, called by parseEscape().
165 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID
, bool invert
)
168 case CachedCharacter
:
169 // Flush the currently cached character, then fall through.
170 m_delegate
.atomCharacterClassAtom(m_character
);
173 case AfterCharacterClass
:
174 m_state
= AfterCharacterClass
;
175 m_delegate
.atomCharacterClassBuiltIn(classID
, invert
);
178 // If we hit either of these cases, we have an invalid range that
179 // looks something like /[x-\d]/ or /[\d-\d]/.
180 // According to ECMA-262 this should be a syntax error, but
181 // empirical testing shows this to break teh webz. Instead we
182 // comply with to the ECMA-262 grammar, and assume the grammar to
183 // have matched the range correctly, but tweak our interpretation
184 // of CharacterRange. Effectively we implicitly handle the hyphen
185 // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/.
186 case CachedCharacterHyphen
:
187 m_delegate
.atomCharacterClassAtom(m_character
);
188 m_delegate
.atomCharacterClassAtom('-');
190 case AfterCharacterClassHyphen
:
191 m_delegate
.atomCharacterClassBuiltIn(classID
, invert
);
200 * Called at end of construction.
204 if (m_state
== CachedCharacter
)
205 m_delegate
.atomCharacterClassAtom(m_character
);
206 else if (m_state
== CachedCharacterHyphen
) {
207 m_delegate
.atomCharacterClassAtom(m_character
);
208 m_delegate
.atomCharacterClassAtom('-');
210 m_delegate
.atomCharacterClassEnd();
213 // parseEscape() should never call these delegate methods when
214 // invoked with inCharacterClass set.
215 NO_RETURN_DUE_TO_ASSERT
void assertionWordBoundary(bool) { RELEASE_ASSERT_NOT_REACHED(); }
216 NO_RETURN_DUE_TO_ASSERT
void atomBackReference(unsigned) { RELEASE_ASSERT_NOT_REACHED(); }
219 Delegate
& m_delegate
;
221 enum CharacterClassConstructionState
{
224 CachedCharacterHyphen
,
226 AfterCharacterClassHyphen
,
231 Parser(Delegate
& delegate
, const String
& pattern
, unsigned backReferenceLimit
)
232 : m_delegate(delegate
)
233 , m_backReferenceLimit(backReferenceLimit
)
235 , m_data(pattern
.characters
<CharType
>())
236 , m_size(pattern
.length())
238 , m_parenthesesNestingDepth(0)
245 * Helper for parseTokens() AND parseCharacterClass().
246 * Unlike the other parser methods, this function does not report tokens
247 * directly to the member delegate (m_delegate), instead tokens are
248 * emitted to the delegate provided as an argument. In the case of atom
249 * escapes, parseTokens() will call parseEscape() passing m_delegate as
250 * an argument, and as such the escape will be reported to the delegate.
252 * However this method may also be used by parseCharacterClass(), in which
253 * case a CharacterClassParserDelegate will be passed as the delegate that
254 * tokens should be added to. A boolean flag is also provided to indicate
255 * whether that an escape in a CharacterClass is being parsed (some parsing
256 * rules change in this context).
258 * The boolean value returned by this method indicates whether the token
259 * parsed was an atom (outside of a characted class \b and \B will be
260 * interpreted as assertions).
262 template<bool inCharacterClass
, class EscapeDelegate
>
263 bool parseEscape(EscapeDelegate
& delegate
)
266 ASSERT(peek() == '\\');
269 if (atEndOfPattern()) {
270 m_err
= EscapeUnterminated
;
278 if (inCharacterClass
)
279 delegate
.atomPatternCharacter('\b');
281 delegate
.assertionWordBoundary(false);
287 if (inCharacterClass
)
288 delegate
.atomPatternCharacter('B');
290 delegate
.assertionWordBoundary(true);
295 // CharacterClassEscape
298 delegate
.atomBuiltInCharacterClass(DigitClassID
, false);
302 delegate
.atomBuiltInCharacterClass(SpaceClassID
, false);
306 delegate
.atomBuiltInCharacterClass(WordClassID
, false);
310 delegate
.atomBuiltInCharacterClass(DigitClassID
, true);
314 delegate
.atomBuiltInCharacterClass(SpaceClassID
, true);
318 delegate
.atomBuiltInCharacterClass(WordClassID
, true);
331 // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
332 // First, try to parse this as backreference.
333 if (!inCharacterClass
) {
334 ParseState state
= saveState();
336 unsigned backReference
= consumeNumber();
337 if (backReference
<= m_backReferenceLimit
) {
338 delegate
.atomBackReference(backReference
);
345 // Not a backreference, and not octal.
347 delegate
.atomPatternCharacter('\\');
351 // Fall-through to handle this as an octal escape.
357 delegate
.atomPatternCharacter(consumeOctal());
363 delegate
.atomPatternCharacter('\f');
367 delegate
.atomPatternCharacter('\n');
371 delegate
.atomPatternCharacter('\r');
375 delegate
.atomPatternCharacter('\t');
379 delegate
.atomPatternCharacter('\v');
384 ParseState state
= saveState();
386 if (!atEndOfPattern()) {
387 int control
= consume();
389 // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
390 if (inCharacterClass
? WTF::isASCIIAlphanumeric(control
) || (control
== '_') : WTF::isASCIIAlpha(control
)) {
391 delegate
.atomPatternCharacter(control
& 0x1f);
396 delegate
.atomPatternCharacter('\\');
403 int x
= tryConsumeHex(2);
405 delegate
.atomPatternCharacter('x');
407 delegate
.atomPatternCharacter(x
);
414 int u
= tryConsumeHex(4);
416 delegate
.atomPatternCharacter('u');
418 delegate
.atomPatternCharacter(u
);
424 delegate
.atomPatternCharacter(consume());
431 * parseAtomEscape(), parseCharacterClassEscape():
433 * These methods alias to parseEscape().
435 bool parseAtomEscape()
437 return parseEscape
<false>(m_delegate
);
439 void parseCharacterClassEscape(CharacterClassParserDelegate
& delegate
)
441 parseEscape
<true>(delegate
);
445 * parseCharacterClass():
447 * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
448 * to an instance of CharacterClassParserDelegate, to describe the character class to the
451 void parseCharacterClass()
454 ASSERT(peek() == '[');
457 CharacterClassParserDelegate
characterClassConstructor(m_delegate
, m_err
);
459 characterClassConstructor
.begin(tryConsume('^'));
461 while (!atEndOfPattern()) {
465 characterClassConstructor
.end();
469 parseCharacterClassEscape(characterClassConstructor
);
473 characterClassConstructor
.atomPatternCharacter(consume(), true);
480 m_err
= CharacterClassUnmatched
;
484 * parseParenthesesBegin():
486 * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
488 void parseParenthesesBegin()
491 ASSERT(peek() == '(');
494 if (tryConsume('?')) {
495 if (atEndOfPattern()) {
496 m_err
= ParenthesesTypeInvalid
;
502 m_delegate
.atomParenthesesSubpatternBegin(false);
506 m_delegate
.atomParentheticalAssertionBegin();
510 m_delegate
.atomParentheticalAssertionBegin(true);
514 m_err
= ParenthesesTypeInvalid
;
517 m_delegate
.atomParenthesesSubpatternBegin();
519 ++m_parenthesesNestingDepth
;
523 * parseParenthesesEnd():
525 * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
527 void parseParenthesesEnd()
530 ASSERT(peek() == ')');
533 if (m_parenthesesNestingDepth
> 0)
534 m_delegate
.atomParenthesesEnd();
536 m_err
= ParenthesesUnmatched
;
538 --m_parenthesesNestingDepth
;
544 * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
546 void parseQuantifier(bool lastTokenWasAnAtom
, unsigned min
, unsigned max
)
551 if (min
== UINT_MAX
) {
552 m_err
= QuantifierTooLarge
;
556 if (lastTokenWasAnAtom
)
557 m_delegate
.quantifyAtom(min
, max
, !tryConsume('?'));
559 m_err
= QuantifierWithoutAtom
;
565 * This method loops over the input pattern reporting tokens to the delegate.
566 * The method returns when a parse error is detected, or the end of the pattern
567 * is reached. One piece of state is tracked around the loop, which is whether
568 * the last token passed to the delegate was an atom (this is necessary to detect
569 * a parse error when a quantifier provided without an atom to quantify).
573 bool lastTokenWasAnAtom
= false;
575 while (!atEndOfPattern()) {
579 m_delegate
.disjunction();
580 lastTokenWasAnAtom
= false;
584 parseParenthesesBegin();
585 lastTokenWasAnAtom
= false;
589 parseParenthesesEnd();
590 lastTokenWasAnAtom
= true;
595 m_delegate
.assertionBOL();
596 lastTokenWasAnAtom
= false;
601 m_delegate
.assertionEOL();
602 lastTokenWasAnAtom
= false;
607 m_delegate
.atomBuiltInCharacterClass(NewlineClassID
, true);
608 lastTokenWasAnAtom
= true;
612 parseCharacterClass();
613 lastTokenWasAnAtom
= true;
617 lastTokenWasAnAtom
= parseAtomEscape();
622 parseQuantifier(lastTokenWasAnAtom
, 0, quantifyInfinite
);
623 lastTokenWasAnAtom
= false;
628 parseQuantifier(lastTokenWasAnAtom
, 1, quantifyInfinite
);
629 lastTokenWasAnAtom
= false;
634 parseQuantifier(lastTokenWasAnAtom
, 0, 1);
635 lastTokenWasAnAtom
= false;
639 ParseState state
= saveState();
643 unsigned min
= consumeNumber();
647 max
= peekIsDigit() ? consumeNumber() : quantifyInfinite
;
649 if (tryConsume('}')) {
651 parseQuantifier(lastTokenWasAnAtom
, min
, max
);
653 m_err
= QuantifierOutOfOrder
;
654 lastTokenWasAnAtom
= false;
661 // if we did not find a complete quantifer, fall through to the default case.
665 m_delegate
.atomPatternCharacter(consume());
666 lastTokenWasAnAtom
= true;
673 if (m_parenthesesNestingDepth
> 0)
674 m_err
= MissingParentheses
;
680 * This method calls parseTokens() to parse over the input and converts any
681 * error code to a const char* for a result.
685 if (m_size
> MAX_PATTERN_SIZE
)
686 m_err
= PatternTooLarge
;
689 ASSERT(atEndOfPattern() || m_err
);
691 // The order of this array must match the ErrorCode enum.
692 static const char* errorMessages
[NumberOfErrorCodes
] = {
694 REGEXP_ERROR_PREFIX
"regular expression too large",
695 REGEXP_ERROR_PREFIX
"numbers out of order in {} quantifier",
696 REGEXP_ERROR_PREFIX
"nothing to repeat",
697 REGEXP_ERROR_PREFIX
"number too large in {} quantifier",
698 REGEXP_ERROR_PREFIX
"missing )",
699 REGEXP_ERROR_PREFIX
"unmatched parentheses",
700 REGEXP_ERROR_PREFIX
"unrecognized character after (?",
701 REGEXP_ERROR_PREFIX
"missing terminating ] for character class",
702 REGEXP_ERROR_PREFIX
"range out of order in character class",
703 REGEXP_ERROR_PREFIX
"\\ at end of pattern"
706 return errorMessages
[m_err
];
709 // Misc helper functions:
711 typedef unsigned ParseState
;
713 ParseState
saveState()
718 void restoreState(ParseState state
)
723 bool atEndOfPattern()
725 ASSERT(m_index
<= m_size
);
726 return m_index
== m_size
;
731 ASSERT(m_index
< m_size
);
732 return m_data
[m_index
];
737 return !atEndOfPattern() && WTF::isASCIIDigit(peek());
742 ASSERT(peekIsDigit());
748 ASSERT(m_index
< m_size
);
749 return m_data
[m_index
++];
752 unsigned consumeDigit()
754 ASSERT(peekIsDigit());
755 return consume() - '0';
758 unsigned consumeNumber()
760 unsigned n
= consumeDigit();
761 // check for overflow.
762 for (unsigned newValue
; peekIsDigit() && ((newValue
= n
* 10 + peekDigit()) >= n
); ) {
769 unsigned consumeOctal()
771 ASSERT(WTF::isASCIIOctalDigit(peek()));
773 unsigned n
= consumeDigit();
774 while (n
< 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
775 n
= n
* 8 + consumeDigit();
779 bool tryConsume(UChar ch
)
781 if (atEndOfPattern() || (m_data
[m_index
] != ch
))
787 int tryConsumeHex(int count
)
789 ParseState state
= saveState();
793 if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
797 n
= (n
<< 4) | WTF::toASCIIHexValue(consume());
802 Delegate
& m_delegate
;
803 unsigned m_backReferenceLimit
;
805 const CharType
* m_data
;
808 unsigned m_parenthesesNestingDepth
;
810 // Derived by empirical testing of compile time in PCRE and WREC.
811 static const unsigned MAX_PATTERN_SIZE
= 1024 * 1024;
817 * The parse method is passed a pattern to be parsed and a delegate upon which
818 * callbacks will be made to record the parsed tokens forming the regex.
819 * Yarr::parse() returns null on success, or a const C string providing an error
820 * message where a parse error occurs.
822 * The Delegate must implement the following interface:
824 * void assertionBOL();
825 * void assertionEOL();
826 * void assertionWordBoundary(bool invert);
828 * void atomPatternCharacter(UChar ch);
829 * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
830 * void atomCharacterClassBegin(bool invert)
831 * void atomCharacterClassAtom(UChar ch)
832 * void atomCharacterClassRange(UChar begin, UChar end)
833 * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
834 * void atomCharacterClassEnd()
835 * void atomParenthesesSubpatternBegin(bool capture = true);
836 * void atomParentheticalAssertionBegin(bool invert = false);
837 * void atomParenthesesEnd();
838 * void atomBackReference(unsigned subpatternId);
840 * void quantifyAtom(unsigned min, unsigned max, bool greedy);
842 * void disjunction();
844 * The regular expression is described by a sequence of assertion*() and atom*()
845 * callbacks to the delegate, describing the terms in the regular expression.
846 * Following an atom a quantifyAtom() call may occur to indicate that the previous
847 * atom should be quantified. In the case of atoms described across multiple
848 * calls (parentheses and character classes) the call to quantifyAtom() will come
849 * after the call to the atom*End() method, never after atom*Begin().
851 * Character classes may either be described by a single call to
852 * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
853 * In the latter case, ...Begin() will be called, followed by a sequence of
854 * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
856 * Sequences of atoms and assertions are broken into alternatives via calls to
857 * disjunction(). Assertions, atoms, and disjunctions emitted between calls to
858 * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
859 * atomParenthesesBegin() is passed a subpatternId. In the case of a regular
860 * capturing subpattern, this will be the subpatternId associated with these
861 * parentheses, and will also by definition be the lowest subpatternId of these
862 * parentheses and of any nested paretheses. The atomParenthesesEnd() method
863 * is passed the subpatternId of the last capturing subexpression nested within
864 * these paretheses. In the case of a capturing subpattern with no nested
865 * capturing subpatterns, the same subpatternId will be passed to the begin and
866 * end functions. In the case of non-capturing subpatterns the subpatternId
867 * passed to the begin method is also the first possible subpatternId that might
868 * be nested within these paretheses. If a set of non-capturing parentheses does
869 * not contain any capturing subpatterns, then the subpatternId passed to begin
870 * will be greater than the subpatternId passed to end.
873 template<class Delegate
>
874 const char* parse(Delegate
& delegate
, const String
& pattern
, unsigned backReferenceLimit
= quantifyInfinite
)
876 if (pattern
.is8Bit())
877 return Parser
<Delegate
, LChar
>(delegate
, pattern
, backReferenceLimit
).parse();
878 return Parser
<Delegate
, UChar
>(delegate
, pattern
, backReferenceLimit
).parse();
881 } } // namespace JSC::Yarr
883 #endif // YarrParser_h