2 * Copyright (C) 2009 Apple Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 #include <runtime/UString.h>
31 #include <wtf/ASCIICType.h>
32 #include <wtf/unicode/Unicode.h>
34 namespace JSC
{ namespace Yarr
{
36 #define REGEXP_ERROR_PREFIX "Invalid regular expression: "
38 enum BuiltInCharacterClassID
{
45 // The Parser class should not be used directly - only via the Yarr::parse() method.
46 template<class Delegate
>
49 template<class FriendDelegate
>
50 friend const char* parse(FriendDelegate
& delegate
, const UString
& pattern
, unsigned backReferenceLimit
);
56 QuantifierWithoutAtom
,
59 ParenthesesTypeInvalid
,
60 CharacterClassUnmatched
,
61 CharacterClassOutOfOrder
,
67 * CharacterClassParserDelegate:
69 * The class CharacterClassParserDelegate is used in the parsing of character
70 * classes. This class handles detection of character ranges. This class
71 * implements enough of the delegate interface such that it can be passed to
72 * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused
73 * to perform the parsing of escape characters in character sets.
75 class CharacterClassParserDelegate
{
77 CharacterClassParserDelegate(Delegate
& delegate
, ErrorCode
& err
)
78 : m_delegate(delegate
)
88 * Called at beginning of construction.
90 void begin(bool invert
)
92 m_delegate
.atomCharacterClassBegin(invert
);
96 * atomPatternCharacter():
98 * This method is called either from parseCharacterClass() (for an unescaped
99 * character in a character class), or from parseEscape(). In the former case
100 * the value true will be passed for the argument 'hyphenIsRange', and in this
101 * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/
102 * is different to /[a\-z]/).
104 void atomPatternCharacter(UChar ch
, bool hyphenIsRange
= false)
107 case AfterCharacterClass
:
108 // Following a builtin character class we need look out for a hyphen.
109 // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
110 // If we see a hyphen following a charater class then unlike usual
111 // we'll report it to the delegate immediately, and put ourself into
112 // a poisoned state. Any following calls to add another character or
113 // character class will result in an error. (A hypen following a
114 // character-class is itself valid, but only at the end of a regex).
115 if (hyphenIsRange
&& ch
== '-') {
116 m_delegate
.atomCharacterClassAtom('-');
117 m_state
= AfterCharacterClassHyphen
;
120 // Otherwise just fall through - cached character so treat this as Empty.
124 m_state
= CachedCharacter
;
127 case CachedCharacter
:
128 if (hyphenIsRange
&& ch
== '-')
129 m_state
= CachedCharacterHyphen
;
131 m_delegate
.atomCharacterClassAtom(m_character
);
136 case CachedCharacterHyphen
:
137 if (ch
< m_character
) {
138 m_err
= CharacterClassOutOfOrder
;
141 m_delegate
.atomCharacterClassRange(m_character
, ch
);
145 // See coment in atomBuiltInCharacterClass below.
146 // This too is technically an error, per ECMA-262, and again we
147 // we chose to allow this. Note a subtlely here that while we
148 // diverge from the spec's definition of CharacterRange we do
149 // remain in compliance with the grammar. For example, consider
150 // the expression /[\d-a-z]/. We comply with the grammar in
151 // this case by not allowing a-z to be matched as a range.
152 case AfterCharacterClassHyphen
:
153 m_delegate
.atomCharacterClassAtom(ch
);
160 * atomBuiltInCharacterClass():
162 * Adds a built-in character class, called by parseEscape().
164 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID
, bool invert
)
167 case CachedCharacter
:
168 // Flush the currently cached character, then fall through.
169 m_delegate
.atomCharacterClassAtom(m_character
);
172 case AfterCharacterClass
:
173 m_state
= AfterCharacterClass
;
174 m_delegate
.atomCharacterClassBuiltIn(classID
, invert
);
177 // If we hit either of these cases, we have an invalid range that
178 // looks something like /[x-\d]/ or /[\d-\d]/.
179 // According to ECMA-262 this should be a syntax error, but
180 // empirical testing shows this to break teh webz. Instead we
181 // comply with to the ECMA-262 grammar, and assume the grammar to
182 // have matched the range correctly, but tweak our interpretation
183 // of CharacterRange. Effectively we implicitly handle the hyphen
184 // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/.
185 case CachedCharacterHyphen
:
186 m_delegate
.atomCharacterClassAtom(m_character
);
187 m_delegate
.atomCharacterClassAtom('-');
189 case AfterCharacterClassHyphen
:
190 m_delegate
.atomCharacterClassBuiltIn(classID
, invert
);
199 * Called at end of construction.
203 if (m_state
== CachedCharacter
)
204 m_delegate
.atomCharacterClassAtom(m_character
);
205 else if (m_state
== CachedCharacterHyphen
) {
206 m_delegate
.atomCharacterClassAtom(m_character
);
207 m_delegate
.atomCharacterClassAtom('-');
209 m_delegate
.atomCharacterClassEnd();
212 // parseEscape() should never call these delegate methods when
213 // invoked with inCharacterClass set.
214 void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); }
215 void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }
218 Delegate
& m_delegate
;
220 enum CharacterClassConstructionState
{
223 CachedCharacterHyphen
,
225 AfterCharacterClassHyphen
,
230 Parser(Delegate
& delegate
, const UString
& pattern
, unsigned backReferenceLimit
)
231 : m_delegate(delegate
)
232 , m_backReferenceLimit(backReferenceLimit
)
234 , m_data(pattern
.characters())
235 , m_size(pattern
.length())
237 , m_parenthesesNestingDepth(0)
244 * Helper for parseTokens() AND parseCharacterClass().
245 * Unlike the other parser methods, this function does not report tokens
246 * directly to the member delegate (m_delegate), instead tokens are
247 * emitted to the delegate provided as an argument. In the case of atom
248 * escapes, parseTokens() will call parseEscape() passing m_delegate as
249 * an argument, and as such the escape will be reported to the delegate.
251 * However this method may also be used by parseCharacterClass(), in which
252 * case a CharacterClassParserDelegate will be passed as the delegate that
253 * tokens should be added to. A boolean flag is also provided to indicate
254 * whether that an escape in a CharacterClass is being parsed (some parsing
255 * rules change in this context).
257 * The boolean value returned by this method indicates whether the token
258 * parsed was an atom (outside of a characted class \b and \B will be
259 * interpreted as assertions).
261 template<bool inCharacterClass
, class EscapeDelegate
>
262 bool parseEscape(EscapeDelegate
& delegate
)
265 ASSERT(peek() == '\\');
268 if (atEndOfPattern()) {
269 m_err
= EscapeUnterminated
;
277 if (inCharacterClass
)
278 delegate
.atomPatternCharacter('\b');
280 delegate
.assertionWordBoundary(false);
286 if (inCharacterClass
)
287 delegate
.atomPatternCharacter('B');
289 delegate
.assertionWordBoundary(true);
294 // CharacterClassEscape
297 delegate
.atomBuiltInCharacterClass(DigitClassID
, false);
301 delegate
.atomBuiltInCharacterClass(SpaceClassID
, false);
305 delegate
.atomBuiltInCharacterClass(WordClassID
, false);
309 delegate
.atomBuiltInCharacterClass(DigitClassID
, true);
313 delegate
.atomBuiltInCharacterClass(SpaceClassID
, true);
317 delegate
.atomBuiltInCharacterClass(WordClassID
, true);
330 // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
331 // First, try to parse this as backreference.
332 if (!inCharacterClass
) {
333 ParseState state
= saveState();
335 unsigned backReference
= consumeNumber();
336 if (backReference
<= m_backReferenceLimit
) {
337 delegate
.atomBackReference(backReference
);
344 // Not a backreference, and not octal.
346 delegate
.atomPatternCharacter('\\');
350 // Fall-through to handle this as an octal escape.
355 delegate
.atomPatternCharacter(consumeOctal());
361 delegate
.atomPatternCharacter('\f');
365 delegate
.atomPatternCharacter('\n');
369 delegate
.atomPatternCharacter('\r');
373 delegate
.atomPatternCharacter('\t');
377 delegate
.atomPatternCharacter('\v');
382 ParseState state
= saveState();
384 if (!atEndOfPattern()) {
385 int control
= consume();
387 // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
388 if (inCharacterClass
? WTF::isASCIIAlphanumeric(control
) || (control
== '_') : WTF::isASCIIAlpha(control
)) {
389 delegate
.atomPatternCharacter(control
& 0x1f);
394 delegate
.atomPatternCharacter('\\');
401 int x
= tryConsumeHex(2);
403 delegate
.atomPatternCharacter('x');
405 delegate
.atomPatternCharacter(x
);
412 int u
= tryConsumeHex(4);
414 delegate
.atomPatternCharacter('u');
416 delegate
.atomPatternCharacter(u
);
422 delegate
.atomPatternCharacter(consume());
429 * parseAtomEscape(), parseCharacterClassEscape():
431 * These methods alias to parseEscape().
433 bool parseAtomEscape()
435 return parseEscape
<false>(m_delegate
);
437 void parseCharacterClassEscape(CharacterClassParserDelegate
& delegate
)
439 parseEscape
<true>(delegate
);
443 * parseCharacterClass():
445 * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
446 * to an instance of CharacterClassParserDelegate, to describe the character class to the
449 void parseCharacterClass()
452 ASSERT(peek() == '[');
455 CharacterClassParserDelegate
characterClassConstructor(m_delegate
, m_err
);
457 characterClassConstructor
.begin(tryConsume('^'));
459 while (!atEndOfPattern()) {
463 characterClassConstructor
.end();
467 parseCharacterClassEscape(characterClassConstructor
);
471 characterClassConstructor
.atomPatternCharacter(consume(), true);
478 m_err
= CharacterClassUnmatched
;
482 * parseParenthesesBegin():
484 * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
486 void parseParenthesesBegin()
489 ASSERT(peek() == '(');
492 if (tryConsume('?')) {
493 if (atEndOfPattern()) {
494 m_err
= ParenthesesTypeInvalid
;
500 m_delegate
.atomParenthesesSubpatternBegin(false);
504 m_delegate
.atomParentheticalAssertionBegin();
508 m_delegate
.atomParentheticalAssertionBegin(true);
512 m_err
= ParenthesesTypeInvalid
;
515 m_delegate
.atomParenthesesSubpatternBegin();
517 ++m_parenthesesNestingDepth
;
521 * parseParenthesesEnd():
523 * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
525 void parseParenthesesEnd()
528 ASSERT(peek() == ')');
531 if (m_parenthesesNestingDepth
> 0)
532 m_delegate
.atomParenthesesEnd();
534 m_err
= ParenthesesUnmatched
;
536 --m_parenthesesNestingDepth
;
542 * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
544 void parseQuantifier(bool lastTokenWasAnAtom
, unsigned min
, unsigned max
)
549 if (lastTokenWasAnAtom
)
550 m_delegate
.quantifyAtom(min
, max
, !tryConsume('?'));
552 m_err
= QuantifierWithoutAtom
;
558 * This method loops over the input pattern reporting tokens to the delegate.
559 * The method returns when a parse error is detected, or the end of the pattern
560 * is reached. One piece of state is tracked around the loop, which is whether
561 * the last token passed to the delegate was an atom (this is necessary to detect
562 * a parse error when a quantifier provided without an atom to quantify).
566 bool lastTokenWasAnAtom
= false;
568 while (!atEndOfPattern()) {
572 m_delegate
.disjunction();
573 lastTokenWasAnAtom
= false;
577 parseParenthesesBegin();
578 lastTokenWasAnAtom
= false;
582 parseParenthesesEnd();
583 lastTokenWasAnAtom
= true;
588 m_delegate
.assertionBOL();
589 lastTokenWasAnAtom
= false;
594 m_delegate
.assertionEOL();
595 lastTokenWasAnAtom
= false;
600 m_delegate
.atomBuiltInCharacterClass(NewlineClassID
, true);
601 lastTokenWasAnAtom
= true;
605 parseCharacterClass();
606 lastTokenWasAnAtom
= true;
610 lastTokenWasAnAtom
= parseAtomEscape();
615 parseQuantifier(lastTokenWasAnAtom
, 0, quantifyInfinite
);
616 lastTokenWasAnAtom
= false;
621 parseQuantifier(lastTokenWasAnAtom
, 1, quantifyInfinite
);
622 lastTokenWasAnAtom
= false;
627 parseQuantifier(lastTokenWasAnAtom
, 0, 1);
628 lastTokenWasAnAtom
= false;
632 ParseState state
= saveState();
636 unsigned min
= consumeNumber();
640 max
= peekIsDigit() ? consumeNumber() : quantifyInfinite
;
642 if (tryConsume('}')) {
644 parseQuantifier(lastTokenWasAnAtom
, min
, max
);
646 m_err
= QuantifierOutOfOrder
;
647 lastTokenWasAnAtom
= false;
653 } // if we did not find a complete quantifer, fall through to the default case.
656 m_delegate
.atomPatternCharacter(consume());
657 lastTokenWasAnAtom
= true;
664 if (m_parenthesesNestingDepth
> 0)
665 m_err
= MissingParentheses
;
671 * This method calls parseTokens() to parse over the input and converts any
672 * error code to a const char* for a result.
676 if (m_size
> MAX_PATTERN_SIZE
)
677 m_err
= PatternTooLarge
;
680 ASSERT(atEndOfPattern() || m_err
);
682 // The order of this array must match the ErrorCode enum.
683 static const char* errorMessages
[NumberOfErrorCodes
] = {
685 REGEXP_ERROR_PREFIX
"regular expression too large",
686 REGEXP_ERROR_PREFIX
"numbers out of order in {} quantifier",
687 REGEXP_ERROR_PREFIX
"nothing to repeat",
688 REGEXP_ERROR_PREFIX
"missing )",
689 REGEXP_ERROR_PREFIX
"unmatched parentheses",
690 REGEXP_ERROR_PREFIX
"unrecognized character after (?",
691 REGEXP_ERROR_PREFIX
"missing terminating ] for character class",
692 REGEXP_ERROR_PREFIX
"range out of order in character class",
693 REGEXP_ERROR_PREFIX
"\\ at end of pattern"
696 return errorMessages
[m_err
];
700 // Misc helper functions:
702 typedef unsigned ParseState
;
704 ParseState
saveState()
709 void restoreState(ParseState state
)
714 bool atEndOfPattern()
716 ASSERT(m_index
<= m_size
);
717 return m_index
== m_size
;
722 ASSERT(m_index
< m_size
);
723 return m_data
[m_index
];
728 return !atEndOfPattern() && WTF::isASCIIDigit(peek());
733 ASSERT(peekIsDigit());
739 ASSERT(m_index
< m_size
);
740 return m_data
[m_index
++];
743 unsigned consumeDigit()
745 ASSERT(peekIsDigit());
746 return consume() - '0';
749 unsigned consumeNumber()
751 unsigned n
= consumeDigit();
752 // check for overflow.
753 for (unsigned newValue
; peekIsDigit() && ((newValue
= n
* 10 + peekDigit()) >= n
); ) {
760 unsigned consumeOctal()
762 ASSERT(WTF::isASCIIOctalDigit(peek()));
764 unsigned n
= consumeDigit();
765 while (n
< 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
766 n
= n
* 8 + consumeDigit();
770 bool tryConsume(UChar ch
)
772 if (atEndOfPattern() || (m_data
[m_index
] != ch
))
778 int tryConsumeHex(int count
)
780 ParseState state
= saveState();
784 if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
788 n
= (n
<< 4) | WTF::toASCIIHexValue(consume());
793 Delegate
& m_delegate
;
794 unsigned m_backReferenceLimit
;
799 unsigned m_parenthesesNestingDepth
;
801 // Derived by empirical testing of compile time in PCRE and WREC.
802 static const unsigned MAX_PATTERN_SIZE
= 1024 * 1024;
808 * The parse method is passed a pattern to be parsed and a delegate upon which
809 * callbacks will be made to record the parsed tokens forming the regex.
810 * Yarr::parse() returns null on success, or a const C string providing an error
811 * message where a parse error occurs.
813 * The Delegate must implement the following interface:
815 * void assertionBOL();
816 * void assertionEOL();
817 * void assertionWordBoundary(bool invert);
819 * void atomPatternCharacter(UChar ch);
820 * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
821 * void atomCharacterClassBegin(bool invert)
822 * void atomCharacterClassAtom(UChar ch)
823 * void atomCharacterClassRange(UChar begin, UChar end)
824 * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
825 * void atomCharacterClassEnd()
826 * void atomParenthesesSubpatternBegin(bool capture = true);
827 * void atomParentheticalAssertionBegin(bool invert = false);
828 * void atomParenthesesEnd();
829 * void atomBackReference(unsigned subpatternId);
831 * void quantifyAtom(unsigned min, unsigned max, bool greedy);
833 * void disjunction();
835 * The regular expression is described by a sequence of assertion*() and atom*()
836 * callbacks to the delegate, describing the terms in the regular expression.
837 * Following an atom a quantifyAtom() call may occur to indicate that the previous
838 * atom should be quantified. In the case of atoms described across multiple
839 * calls (parentheses and character classes) the call to quantifyAtom() will come
840 * after the call to the atom*End() method, never after atom*Begin().
842 * Character classes may either be described by a single call to
843 * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
844 * In the latter case, ...Begin() will be called, followed by a sequence of
845 * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
847 * Sequences of atoms and assertions are broken into alternatives via calls to
848 * disjunction(). Assertions, atoms, and disjunctions emitted between calls to
849 * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
850 * atomParenthesesBegin() is passed a subpatternId. In the case of a regular
851 * capturing subpattern, this will be the subpatternId associated with these
852 * parentheses, and will also by definition be the lowest subpatternId of these
853 * parentheses and of any nested paretheses. The atomParenthesesEnd() method
854 * is passed the subpatternId of the last capturing subexpression nested within
855 * these paretheses. In the case of a capturing subpattern with no nested
856 * capturing subpatterns, the same subpatternId will be passed to the begin and
857 * end functions. In the case of non-capturing subpatterns the subpatternId
858 * passed to the begin method is also the first possible subpatternId that might
859 * be nested within these paretheses. If a set of non-capturing parentheses does
860 * not contain any capturing subpatterns, then the subpatternId passed to begin
861 * will be greater than the subpatternId passed to end.
864 template<class Delegate
>
865 const char* parse(Delegate
& delegate
, const UString
& pattern
, unsigned backReferenceLimit
= quantifyInfinite
)
867 return Parser
<Delegate
>(delegate
, pattern
, backReferenceLimit
).parse();
870 } } // namespace JSC::Yarr
872 #endif // YarrParser_h