2 * Copyright (C) 2009 Apple Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 #include <runtime/UString.h>
31 #include <wtf/ASCIICType.h>
32 #include <wtf/unicode/Unicode.h>
34 namespace JSC
{ namespace Yarr
{
36 #define REGEXP_ERROR_PREFIX "Invalid regular expression: "
38 enum BuiltInCharacterClassID
{
45 // The Parser class should not be used directly - only via the Yarr::parse() method.
46 template<class Delegate
, typename CharType
>
49 template<class FriendDelegate
>
50 friend const char* parse(FriendDelegate
& delegate
, const UString
& pattern
, unsigned backReferenceLimit
);
56 QuantifierWithoutAtom
,
60 ParenthesesTypeInvalid
,
61 CharacterClassUnmatched
,
62 CharacterClassOutOfOrder
,
68 * CharacterClassParserDelegate:
70 * The class CharacterClassParserDelegate is used in the parsing of character
71 * classes. This class handles detection of character ranges. This class
72 * implements enough of the delegate interface such that it can be passed to
73 * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused
74 * to perform the parsing of escape characters in character sets.
76 class CharacterClassParserDelegate
{
78 CharacterClassParserDelegate(Delegate
& delegate
, ErrorCode
& err
)
79 : m_delegate(delegate
)
89 * Called at beginning of construction.
91 void begin(bool invert
)
93 m_delegate
.atomCharacterClassBegin(invert
);
97 * atomPatternCharacter():
99 * This method is called either from parseCharacterClass() (for an unescaped
100 * character in a character class), or from parseEscape(). In the former case
101 * the value true will be passed for the argument 'hyphenIsRange', and in this
102 * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/
103 * is different to /[a\-z]/).
105 void atomPatternCharacter(UChar ch
, bool hyphenIsRange
= false)
108 case AfterCharacterClass
:
109 // Following a builtin character class we need look out for a hyphen.
110 // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
111 // If we see a hyphen following a charater class then unlike usual
112 // we'll report it to the delegate immediately, and put ourself into
113 // a poisoned state. Any following calls to add another character or
114 // character class will result in an error. (A hypen following a
115 // character-class is itself valid, but only at the end of a regex).
116 if (hyphenIsRange
&& ch
== '-') {
117 m_delegate
.atomCharacterClassAtom('-');
118 m_state
= AfterCharacterClassHyphen
;
121 // Otherwise just fall through - cached character so treat this as Empty.
125 m_state
= CachedCharacter
;
128 case CachedCharacter
:
129 if (hyphenIsRange
&& ch
== '-')
130 m_state
= CachedCharacterHyphen
;
132 m_delegate
.atomCharacterClassAtom(m_character
);
137 case CachedCharacterHyphen
:
138 if (ch
< m_character
) {
139 m_err
= CharacterClassOutOfOrder
;
142 m_delegate
.atomCharacterClassRange(m_character
, ch
);
146 // See coment in atomBuiltInCharacterClass below.
147 // This too is technically an error, per ECMA-262, and again we
148 // we chose to allow this. Note a subtlely here that while we
149 // diverge from the spec's definition of CharacterRange we do
150 // remain in compliance with the grammar. For example, consider
151 // the expression /[\d-a-z]/. We comply with the grammar in
152 // this case by not allowing a-z to be matched as a range.
153 case AfterCharacterClassHyphen
:
154 m_delegate
.atomCharacterClassAtom(ch
);
161 * atomBuiltInCharacterClass():
163 * Adds a built-in character class, called by parseEscape().
165 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID
, bool invert
)
168 case CachedCharacter
:
169 // Flush the currently cached character, then fall through.
170 m_delegate
.atomCharacterClassAtom(m_character
);
173 case AfterCharacterClass
:
174 m_state
= AfterCharacterClass
;
175 m_delegate
.atomCharacterClassBuiltIn(classID
, invert
);
178 // If we hit either of these cases, we have an invalid range that
179 // looks something like /[x-\d]/ or /[\d-\d]/.
180 // According to ECMA-262 this should be a syntax error, but
181 // empirical testing shows this to break teh webz. Instead we
182 // comply with to the ECMA-262 grammar, and assume the grammar to
183 // have matched the range correctly, but tweak our interpretation
184 // of CharacterRange. Effectively we implicitly handle the hyphen
185 // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/.
186 case CachedCharacterHyphen
:
187 m_delegate
.atomCharacterClassAtom(m_character
);
188 m_delegate
.atomCharacterClassAtom('-');
190 case AfterCharacterClassHyphen
:
191 m_delegate
.atomCharacterClassBuiltIn(classID
, invert
);
200 * Called at end of construction.
204 if (m_state
== CachedCharacter
)
205 m_delegate
.atomCharacterClassAtom(m_character
);
206 else if (m_state
== CachedCharacterHyphen
) {
207 m_delegate
.atomCharacterClassAtom(m_character
);
208 m_delegate
.atomCharacterClassAtom('-');
210 m_delegate
.atomCharacterClassEnd();
213 // parseEscape() should never call these delegate methods when
214 // invoked with inCharacterClass set.
215 NO_RETURN_DUE_TO_ASSERT
void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); }
216 NO_RETURN_DUE_TO_ASSERT
void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }
219 Delegate
& m_delegate
;
221 enum CharacterClassConstructionState
{
224 CachedCharacterHyphen
,
226 AfterCharacterClassHyphen
,
231 Parser(Delegate
& delegate
, const UString
& pattern
, unsigned backReferenceLimit
)
232 : m_delegate(delegate
)
233 , m_backReferenceLimit(backReferenceLimit
)
235 , m_data(pattern
.getCharacters
<CharType
>())
236 , m_size(pattern
.length())
238 , m_parenthesesNestingDepth(0)
245 * Helper for parseTokens() AND parseCharacterClass().
246 * Unlike the other parser methods, this function does not report tokens
247 * directly to the member delegate (m_delegate), instead tokens are
248 * emitted to the delegate provided as an argument. In the case of atom
249 * escapes, parseTokens() will call parseEscape() passing m_delegate as
250 * an argument, and as such the escape will be reported to the delegate.
252 * However this method may also be used by parseCharacterClass(), in which
253 * case a CharacterClassParserDelegate will be passed as the delegate that
254 * tokens should be added to. A boolean flag is also provided to indicate
255 * whether that an escape in a CharacterClass is being parsed (some parsing
256 * rules change in this context).
258 * The boolean value returned by this method indicates whether the token
259 * parsed was an atom (outside of a characted class \b and \B will be
260 * interpreted as assertions).
262 template<bool inCharacterClass
, class EscapeDelegate
>
263 bool parseEscape(EscapeDelegate
& delegate
)
266 ASSERT(peek() == '\\');
269 if (atEndOfPattern()) {
270 m_err
= EscapeUnterminated
;
278 if (inCharacterClass
)
279 delegate
.atomPatternCharacter('\b');
281 delegate
.assertionWordBoundary(false);
287 if (inCharacterClass
)
288 delegate
.atomPatternCharacter('B');
290 delegate
.assertionWordBoundary(true);
295 // CharacterClassEscape
298 delegate
.atomBuiltInCharacterClass(DigitClassID
, false);
302 delegate
.atomBuiltInCharacterClass(SpaceClassID
, false);
306 delegate
.atomBuiltInCharacterClass(WordClassID
, false);
310 delegate
.atomBuiltInCharacterClass(DigitClassID
, true);
314 delegate
.atomBuiltInCharacterClass(SpaceClassID
, true);
318 delegate
.atomBuiltInCharacterClass(WordClassID
, true);
331 // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
332 // First, try to parse this as backreference.
333 if (!inCharacterClass
) {
334 ParseState state
= saveState();
336 unsigned backReference
= consumeNumber();
337 if (backReference
<= m_backReferenceLimit
) {
338 delegate
.atomBackReference(backReference
);
345 // Not a backreference, and not octal.
347 delegate
.atomPatternCharacter('\\');
351 // Fall-through to handle this as an octal escape.
356 delegate
.atomPatternCharacter(consumeOctal());
362 delegate
.atomPatternCharacter('\f');
366 delegate
.atomPatternCharacter('\n');
370 delegate
.atomPatternCharacter('\r');
374 delegate
.atomPatternCharacter('\t');
378 delegate
.atomPatternCharacter('\v');
383 ParseState state
= saveState();
385 if (!atEndOfPattern()) {
386 int control
= consume();
388 // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
389 if (inCharacterClass
? WTF::isASCIIAlphanumeric(control
) || (control
== '_') : WTF::isASCIIAlpha(control
)) {
390 delegate
.atomPatternCharacter(control
& 0x1f);
395 delegate
.atomPatternCharacter('\\');
402 int x
= tryConsumeHex(2);
404 delegate
.atomPatternCharacter('x');
406 delegate
.atomPatternCharacter(x
);
413 int u
= tryConsumeHex(4);
415 delegate
.atomPatternCharacter('u');
417 delegate
.atomPatternCharacter(u
);
423 delegate
.atomPatternCharacter(consume());
430 * parseAtomEscape(), parseCharacterClassEscape():
432 * These methods alias to parseEscape().
434 bool parseAtomEscape()
436 return parseEscape
<false>(m_delegate
);
438 void parseCharacterClassEscape(CharacterClassParserDelegate
& delegate
)
440 parseEscape
<true>(delegate
);
444 * parseCharacterClass():
446 * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
447 * to an instance of CharacterClassParserDelegate, to describe the character class to the
450 void parseCharacterClass()
453 ASSERT(peek() == '[');
456 CharacterClassParserDelegate
characterClassConstructor(m_delegate
, m_err
);
458 characterClassConstructor
.begin(tryConsume('^'));
460 while (!atEndOfPattern()) {
464 characterClassConstructor
.end();
468 parseCharacterClassEscape(characterClassConstructor
);
472 characterClassConstructor
.atomPatternCharacter(consume(), true);
479 m_err
= CharacterClassUnmatched
;
483 * parseParenthesesBegin():
485 * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
487 void parseParenthesesBegin()
490 ASSERT(peek() == '(');
493 if (tryConsume('?')) {
494 if (atEndOfPattern()) {
495 m_err
= ParenthesesTypeInvalid
;
501 m_delegate
.atomParenthesesSubpatternBegin(false);
505 m_delegate
.atomParentheticalAssertionBegin();
509 m_delegate
.atomParentheticalAssertionBegin(true);
513 m_err
= ParenthesesTypeInvalid
;
516 m_delegate
.atomParenthesesSubpatternBegin();
518 ++m_parenthesesNestingDepth
;
522 * parseParenthesesEnd():
524 * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
526 void parseParenthesesEnd()
529 ASSERT(peek() == ')');
532 if (m_parenthesesNestingDepth
> 0)
533 m_delegate
.atomParenthesesEnd();
535 m_err
= ParenthesesUnmatched
;
537 --m_parenthesesNestingDepth
;
543 * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
545 void parseQuantifier(bool lastTokenWasAnAtom
, unsigned min
, unsigned max
)
550 if (min
== UINT_MAX
) {
551 m_err
= QuantifierTooLarge
;
555 if (lastTokenWasAnAtom
)
556 m_delegate
.quantifyAtom(min
, max
, !tryConsume('?'));
558 m_err
= QuantifierWithoutAtom
;
564 * This method loops over the input pattern reporting tokens to the delegate.
565 * The method returns when a parse error is detected, or the end of the pattern
566 * is reached. One piece of state is tracked around the loop, which is whether
567 * the last token passed to the delegate was an atom (this is necessary to detect
568 * a parse error when a quantifier provided without an atom to quantify).
572 bool lastTokenWasAnAtom
= false;
574 while (!atEndOfPattern()) {
578 m_delegate
.disjunction();
579 lastTokenWasAnAtom
= false;
583 parseParenthesesBegin();
584 lastTokenWasAnAtom
= false;
588 parseParenthesesEnd();
589 lastTokenWasAnAtom
= true;
594 m_delegate
.assertionBOL();
595 lastTokenWasAnAtom
= false;
600 m_delegate
.assertionEOL();
601 lastTokenWasAnAtom
= false;
606 m_delegate
.atomBuiltInCharacterClass(NewlineClassID
, true);
607 lastTokenWasAnAtom
= true;
611 parseCharacterClass();
612 lastTokenWasAnAtom
= true;
616 lastTokenWasAnAtom
= parseAtomEscape();
621 parseQuantifier(lastTokenWasAnAtom
, 0, quantifyInfinite
);
622 lastTokenWasAnAtom
= false;
627 parseQuantifier(lastTokenWasAnAtom
, 1, quantifyInfinite
);
628 lastTokenWasAnAtom
= false;
633 parseQuantifier(lastTokenWasAnAtom
, 0, 1);
634 lastTokenWasAnAtom
= false;
638 ParseState state
= saveState();
642 unsigned min
= consumeNumber();
646 max
= peekIsDigit() ? consumeNumber() : quantifyInfinite
;
648 if (tryConsume('}')) {
650 parseQuantifier(lastTokenWasAnAtom
, min
, max
);
652 m_err
= QuantifierOutOfOrder
;
653 lastTokenWasAnAtom
= false;
659 } // if we did not find a complete quantifer, fall through to the default case.
662 m_delegate
.atomPatternCharacter(consume());
663 lastTokenWasAnAtom
= true;
670 if (m_parenthesesNestingDepth
> 0)
671 m_err
= MissingParentheses
;
677 * This method calls parseTokens() to parse over the input and converts any
678 * error code to a const char* for a result.
682 if (m_size
> MAX_PATTERN_SIZE
)
683 m_err
= PatternTooLarge
;
686 ASSERT(atEndOfPattern() || m_err
);
688 // The order of this array must match the ErrorCode enum.
689 static const char* errorMessages
[NumberOfErrorCodes
] = {
691 REGEXP_ERROR_PREFIX
"regular expression too large",
692 REGEXP_ERROR_PREFIX
"numbers out of order in {} quantifier",
693 REGEXP_ERROR_PREFIX
"nothing to repeat",
694 REGEXP_ERROR_PREFIX
"number too large in {} quantifier",
695 REGEXP_ERROR_PREFIX
"missing )",
696 REGEXP_ERROR_PREFIX
"unmatched parentheses",
697 REGEXP_ERROR_PREFIX
"unrecognized character after (?",
698 REGEXP_ERROR_PREFIX
"missing terminating ] for character class",
699 REGEXP_ERROR_PREFIX
"range out of order in character class",
700 REGEXP_ERROR_PREFIX
"\\ at end of pattern"
703 return errorMessages
[m_err
];
706 // Misc helper functions:
708 typedef unsigned ParseState
;
710 ParseState
saveState()
715 void restoreState(ParseState state
)
720 bool atEndOfPattern()
722 ASSERT(m_index
<= m_size
);
723 return m_index
== m_size
;
728 ASSERT(m_index
< m_size
);
729 return m_data
[m_index
];
734 return !atEndOfPattern() && WTF::isASCIIDigit(peek());
739 ASSERT(peekIsDigit());
745 ASSERT(m_index
< m_size
);
746 return m_data
[m_index
++];
749 unsigned consumeDigit()
751 ASSERT(peekIsDigit());
752 return consume() - '0';
755 unsigned consumeNumber()
757 unsigned n
= consumeDigit();
758 // check for overflow.
759 for (unsigned newValue
; peekIsDigit() && ((newValue
= n
* 10 + peekDigit()) >= n
); ) {
766 unsigned consumeOctal()
768 ASSERT(WTF::isASCIIOctalDigit(peek()));
770 unsigned n
= consumeDigit();
771 while (n
< 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
772 n
= n
* 8 + consumeDigit();
776 bool tryConsume(UChar ch
)
778 if (atEndOfPattern() || (m_data
[m_index
] != ch
))
784 int tryConsumeHex(int count
)
786 ParseState state
= saveState();
790 if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
794 n
= (n
<< 4) | WTF::toASCIIHexValue(consume());
799 Delegate
& m_delegate
;
800 unsigned m_backReferenceLimit
;
802 const CharType
* m_data
;
805 unsigned m_parenthesesNestingDepth
;
807 // Derived by empirical testing of compile time in PCRE and WREC.
808 static const unsigned MAX_PATTERN_SIZE
= 1024 * 1024;
814 * The parse method is passed a pattern to be parsed and a delegate upon which
815 * callbacks will be made to record the parsed tokens forming the regex.
816 * Yarr::parse() returns null on success, or a const C string providing an error
817 * message where a parse error occurs.
819 * The Delegate must implement the following interface:
821 * void assertionBOL();
822 * void assertionEOL();
823 * void assertionWordBoundary(bool invert);
825 * void atomPatternCharacter(UChar ch);
826 * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
827 * void atomCharacterClassBegin(bool invert)
828 * void atomCharacterClassAtom(UChar ch)
829 * void atomCharacterClassRange(UChar begin, UChar end)
830 * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
831 * void atomCharacterClassEnd()
832 * void atomParenthesesSubpatternBegin(bool capture = true);
833 * void atomParentheticalAssertionBegin(bool invert = false);
834 * void atomParenthesesEnd();
835 * void atomBackReference(unsigned subpatternId);
837 * void quantifyAtom(unsigned min, unsigned max, bool greedy);
839 * void disjunction();
841 * The regular expression is described by a sequence of assertion*() and atom*()
842 * callbacks to the delegate, describing the terms in the regular expression.
843 * Following an atom a quantifyAtom() call may occur to indicate that the previous
844 * atom should be quantified. In the case of atoms described across multiple
845 * calls (parentheses and character classes) the call to quantifyAtom() will come
846 * after the call to the atom*End() method, never after atom*Begin().
848 * Character classes may either be described by a single call to
849 * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
850 * In the latter case, ...Begin() will be called, followed by a sequence of
851 * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
853 * Sequences of atoms and assertions are broken into alternatives via calls to
854 * disjunction(). Assertions, atoms, and disjunctions emitted between calls to
855 * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
856 * atomParenthesesBegin() is passed a subpatternId. In the case of a regular
857 * capturing subpattern, this will be the subpatternId associated with these
858 * parentheses, and will also by definition be the lowest subpatternId of these
859 * parentheses and of any nested paretheses. The atomParenthesesEnd() method
860 * is passed the subpatternId of the last capturing subexpression nested within
861 * these paretheses. In the case of a capturing subpattern with no nested
862 * capturing subpatterns, the same subpatternId will be passed to the begin and
863 * end functions. In the case of non-capturing subpatterns the subpatternId
864 * passed to the begin method is also the first possible subpatternId that might
865 * be nested within these paretheses. If a set of non-capturing parentheses does
866 * not contain any capturing subpatterns, then the subpatternId passed to begin
867 * will be greater than the subpatternId passed to end.
870 template<class Delegate
>
871 const char* parse(Delegate
& delegate
, const UString
& pattern
, unsigned backReferenceLimit
= quantifyInfinite
)
873 if (pattern
.is8Bit())
874 return Parser
<Delegate
, LChar
>(delegate
, pattern
, backReferenceLimit
).parse();
875 return Parser
<Delegate
, UChar
>(delegate
, pattern
, backReferenceLimit
).parse();
878 } } // namespace JSC::Yarr
880 #endif // YarrParser_h