2 * Copyright (C) 2009 Apple Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 #include <wtf/ASCIICType.h>
34 #include <wtf/unicode/Unicode.h>
36 namespace JSC
{ namespace Yarr
{
38 enum BuiltInCharacterClassID
{
45 // The Parser class should not be used directly - only via the Yarr::parse() method.
46 template<class Delegate
>
49 template<class FriendDelegate
>
50 friend const char* parse(FriendDelegate
& delegate
, const UString
& pattern
, unsigned backReferenceLimit
);
56 QuantifierWithoutAtom
,
59 ParenthesesTypeInvalid
,
60 CharacterClassUnmatched
,
61 CharacterClassOutOfOrder
,
67 * CharacterClassParserDelegate:
69 * The class CharacterClassParserDelegate is used in the parsing of character
70 * classes. This class handles detection of character ranges. This class
71 * implements enough of the delegate interface such that it can be passed to
72 * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused
73 * to perform the parsing of escape characters in character sets.
75 class CharacterClassParserDelegate
{
77 CharacterClassParserDelegate(Delegate
& delegate
, ErrorCode
& err
)
78 : m_delegate(delegate
)
87 * Called at beginning of construction.
89 void begin(bool invert
)
91 m_delegate
.atomCharacterClassBegin(invert
);
95 * atomPatternCharacterUnescaped():
97 * This method is called directly from parseCharacterClass(), to report a new
98 * pattern character token. This method differs from atomPatternCharacter(),
99 * which will be called from parseEscape(), since a hypen provided via this
100 * method may be indicating a character range, but a hyphen parsed by
101 * parseEscape() cannot be interpreted as doing so.
103 void atomPatternCharacterUnescaped(UChar ch
)
108 m_state
= cachedCharacter
;
111 case cachedCharacter
:
113 m_state
= cachedCharacterHyphen
;
115 m_delegate
.atomCharacterClassAtom(m_character
);
120 case cachedCharacterHyphen
:
121 if (ch
>= m_character
)
122 m_delegate
.atomCharacterClassRange(m_character
, ch
);
124 m_err
= CharacterClassOutOfOrder
;
130 * atomPatternCharacter():
132 * Adds a pattern character, called by parseEscape(), as such will not
133 * interpret a hyphen as indicating a character range.
135 void atomPatternCharacter(UChar ch
)
137 // Flush if a character is already pending to prevent the
138 // hyphen from begin interpreted as indicating a range.
139 if((ch
== '-') && (m_state
== cachedCharacter
))
142 atomPatternCharacterUnescaped(ch
);
146 * atomBuiltInCharacterClass():
148 * Adds a built-in character class, called by parseEscape().
150 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID
, bool invert
)
153 m_delegate
.atomCharacterClassBuiltIn(classID
, invert
);
159 * Called at end of construction.
164 m_delegate
.atomCharacterClassEnd();
167 // parseEscape() should never call these delegate methods when
168 // invoked with inCharacterClass set.
169 void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); }
170 void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }
175 if (m_state
!= empty
) // either cachedCharacter or cachedCharacterHyphen
176 m_delegate
.atomCharacterClassAtom(m_character
);
177 if (m_state
== cachedCharacterHyphen
)
178 m_delegate
.atomCharacterClassAtom('-');
182 Delegate
& m_delegate
;
184 enum CharacterClassConstructionState
{
187 cachedCharacterHyphen
,
192 Parser(Delegate
& delegate
, const UString
& pattern
, unsigned backReferenceLimit
)
193 : m_delegate(delegate
)
194 , m_backReferenceLimit(backReferenceLimit
)
196 , m_data(pattern
.data())
197 , m_size(pattern
.size())
199 , m_parenthesesNestingDepth(0)
206 * Helper for parseTokens() AND parseCharacterClass().
207 * Unlike the other parser methods, this function does not report tokens
208 * directly to the member delegate (m_delegate), instead tokens are
209 * emitted to the delegate provided as an argument. In the case of atom
210 * escapes, parseTokens() will call parseEscape() passing m_delegate as
211 * an argument, and as such the escape will be reported to the delegate.
213 * However this method may also be used by parseCharacterClass(), in which
214 * case a CharacterClassParserDelegate will be passed as the delegate that
215 * tokens should be added to. A boolean flag is also provided to indicate
216 * whether that an escape in a CharacterClass is being parsed (some parsing
217 * rules change in this context).
219 * The boolean value returned by this method indicates whether the token
220 * parsed was an atom (outside of a characted class \b and \B will be
221 * interpreted as assertions).
223 template<bool inCharacterClass
, class EscapeDelegate
>
224 bool parseEscape(EscapeDelegate
& delegate
)
227 ASSERT(peek() == '\\');
230 if (atEndOfPattern()) {
231 m_err
= EscapeUnterminated
;
239 if (inCharacterClass
)
240 delegate
.atomPatternCharacter('\b');
242 delegate
.assertionWordBoundary(false);
248 if (inCharacterClass
)
249 delegate
.atomPatternCharacter('B');
251 delegate
.assertionWordBoundary(true);
256 // CharacterClassEscape
259 delegate
.atomBuiltInCharacterClass(DigitClassID
, false);
263 delegate
.atomBuiltInCharacterClass(SpaceClassID
, false);
267 delegate
.atomBuiltInCharacterClass(WordClassID
, false);
271 delegate
.atomBuiltInCharacterClass(DigitClassID
, true);
275 delegate
.atomBuiltInCharacterClass(SpaceClassID
, true);
279 delegate
.atomBuiltInCharacterClass(WordClassID
, true);
292 // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
293 // First, try to parse this as backreference.
294 if (!inCharacterClass
) {
295 ParseState state
= saveState();
297 unsigned backReference
= consumeNumber();
298 if (backReference
<= m_backReferenceLimit
) {
299 delegate
.atomBackReference(backReference
);
306 // Not a backreference, and not octal.
308 delegate
.atomPatternCharacter('\\');
312 // Fall-through to handle this as an octal escape.
317 delegate
.atomPatternCharacter(consumeOctal());
323 delegate
.atomPatternCharacter('\f');
327 delegate
.atomPatternCharacter('\n');
331 delegate
.atomPatternCharacter('\r');
335 delegate
.atomPatternCharacter('\t');
339 delegate
.atomPatternCharacter('\v');
344 ParseState state
= saveState();
346 if (!atEndOfPattern()) {
347 int control
= consume();
349 // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
350 if (inCharacterClass
? WTF::isASCIIAlphanumeric(control
) || (control
== '_') : WTF::isASCIIAlpha(control
)) {
351 delegate
.atomPatternCharacter(control
& 0x1f);
356 delegate
.atomPatternCharacter('\\');
363 int x
= tryConsumeHex(2);
365 delegate
.atomPatternCharacter('x');
367 delegate
.atomPatternCharacter(x
);
374 int u
= tryConsumeHex(4);
376 delegate
.atomPatternCharacter('u');
378 delegate
.atomPatternCharacter(u
);
384 delegate
.atomPatternCharacter(consume());
391 * parseAtomEscape(), parseCharacterClassEscape():
393 * These methods alias to parseEscape().
395 bool parseAtomEscape()
397 return parseEscape
<false>(m_delegate
);
399 void parseCharacterClassEscape(CharacterClassParserDelegate
& delegate
)
401 parseEscape
<true>(delegate
);
405 * parseCharacterClass():
407 * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
408 * to an instance of CharacterClassParserDelegate, to describe the character class to the
411 void parseCharacterClass()
414 ASSERT(peek() == '[');
417 CharacterClassParserDelegate
characterClassConstructor(m_delegate
, m_err
);
419 characterClassConstructor
.begin(tryConsume('^'));
421 while (!atEndOfPattern()) {
425 characterClassConstructor
.end();
429 parseCharacterClassEscape(characterClassConstructor
);
433 characterClassConstructor
.atomPatternCharacterUnescaped(consume());
440 m_err
= CharacterClassUnmatched
;
444 * parseParenthesesBegin():
446 * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
448 void parseParenthesesBegin()
451 ASSERT(peek() == '(');
454 if (tryConsume('?')) {
455 if (atEndOfPattern()) {
456 m_err
= ParenthesesTypeInvalid
;
462 m_delegate
.atomParenthesesSubpatternBegin(false);
466 m_delegate
.atomParentheticalAssertionBegin();
470 m_delegate
.atomParentheticalAssertionBegin(true);
474 m_err
= ParenthesesTypeInvalid
;
477 m_delegate
.atomParenthesesSubpatternBegin();
479 ++m_parenthesesNestingDepth
;
483 * parseParenthesesEnd():
485 * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
487 void parseParenthesesEnd()
490 ASSERT(peek() == ')');
493 if (m_parenthesesNestingDepth
> 0)
494 m_delegate
.atomParenthesesEnd();
496 m_err
= ParenthesesUnmatched
;
498 --m_parenthesesNestingDepth
;
504 * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
506 void parseQuantifier(bool lastTokenWasAnAtom
, unsigned min
, unsigned max
)
511 if (lastTokenWasAnAtom
)
512 m_delegate
.quantifyAtom(min
, max
, !tryConsume('?'));
514 m_err
= QuantifierWithoutAtom
;
520 * This method loops over the input pattern reporting tokens to the delegate.
521 * The method returns when a parse error is detected, or the end of the pattern
522 * is reached. One piece of state is tracked around the loop, which is whether
523 * the last token passed to the delegate was an atom (this is necessary to detect
524 * a parse error when a quantifier provided without an atom to quantify).
528 bool lastTokenWasAnAtom
= false;
530 while (!atEndOfPattern()) {
534 m_delegate
.disjunction();
535 lastTokenWasAnAtom
= false;
539 parseParenthesesBegin();
540 lastTokenWasAnAtom
= false;
544 parseParenthesesEnd();
545 lastTokenWasAnAtom
= true;
550 m_delegate
.assertionBOL();
551 lastTokenWasAnAtom
= false;
556 m_delegate
.assertionEOL();
557 lastTokenWasAnAtom
= false;
562 m_delegate
.atomBuiltInCharacterClass(NewlineClassID
, true);
563 lastTokenWasAnAtom
= true;
567 parseCharacterClass();
568 lastTokenWasAnAtom
= true;
572 lastTokenWasAnAtom
= parseAtomEscape();
577 parseQuantifier(lastTokenWasAnAtom
, 0, UINT_MAX
);
578 lastTokenWasAnAtom
= false;
583 parseQuantifier(lastTokenWasAnAtom
, 1, UINT_MAX
);
584 lastTokenWasAnAtom
= false;
589 parseQuantifier(lastTokenWasAnAtom
, 0, 1);
590 lastTokenWasAnAtom
= false;
594 ParseState state
= saveState();
598 unsigned min
= consumeNumber();
602 max
= peekIsDigit() ? consumeNumber() : UINT_MAX
;
604 if (tryConsume('}')) {
606 parseQuantifier(lastTokenWasAnAtom
, min
, max
);
608 m_err
= QuantifierOutOfOrder
;
609 lastTokenWasAnAtom
= false;
615 } // if we did not find a complete quantifer, fall through to the default case.
618 m_delegate
.atomPatternCharacter(consume());
619 lastTokenWasAnAtom
= true;
626 if (m_parenthesesNestingDepth
> 0)
627 m_err
= MissingParentheses
;
633 * This method calls regexBegin(), calls parseTokens() to parse over the input
634 * patterns, calls regexEnd() or regexError() as appropriate, and converts any
635 * error code to a const char* for a result.
639 m_delegate
.regexBegin();
641 if (m_size
> MAX_PATTERN_SIZE
)
642 m_err
= PatternTooLarge
;
645 ASSERT(atEndOfPattern() || m_err
);
648 m_delegate
.regexError();
650 m_delegate
.regexEnd();
652 // The order of this array must match the ErrorCode enum.
653 static const char* errorMessages
[NumberOfErrorCodes
] = {
655 "regular expression too large",
656 "numbers out of order in {} quantifier",
659 "unmatched parentheses",
660 "unrecognized character after (?",
661 "missing terminating ] for character class",
662 "range out of order in character class",
663 "\\ at end of pattern"
666 return errorMessages
[m_err
];
670 // Misc helper functions:
672 typedef unsigned ParseState
;
674 ParseState
saveState()
679 void restoreState(ParseState state
)
684 bool atEndOfPattern()
686 ASSERT(m_index
<= m_size
);
687 return m_index
== m_size
;
692 ASSERT(m_index
< m_size
);
693 return m_data
[m_index
];
698 return !atEndOfPattern() && WTF::isASCIIDigit(peek());
703 ASSERT(peekIsDigit());
709 ASSERT(m_index
< m_size
);
710 return m_data
[m_index
++];
713 unsigned consumeDigit()
715 ASSERT(peekIsDigit());
716 return consume() - '0';
719 unsigned consumeNumber()
721 unsigned n
= consumeDigit();
722 // check for overflow.
723 for (unsigned newValue
; peekIsDigit() && ((newValue
= n
* 10 + peekDigit()) >= n
); ) {
730 unsigned consumeOctal()
732 ASSERT(WTF::isASCIIOctalDigit(peek()));
734 unsigned n
= consumeDigit();
735 while (n
< 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
736 n
= n
* 8 + consumeDigit();
740 bool tryConsume(UChar ch
)
742 if (atEndOfPattern() || (m_data
[m_index
] != ch
))
748 int tryConsumeHex(int count
)
750 ParseState state
= saveState();
754 if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
758 n
= (n
<< 4) | WTF::toASCIIHexValue(consume());
763 Delegate
& m_delegate
;
764 unsigned m_backReferenceLimit
;
769 unsigned m_parenthesesNestingDepth
;
771 // Derived by empirical testing of compile time in PCRE and WREC.
772 static const unsigned MAX_PATTERN_SIZE
= 1024 * 1024;
778 * The parse method is passed a pattern to be parsed and a delegate upon which
779 * callbacks will be made to record the parsed tokens forming the regex.
780 * Yarr::parse() returns null on success, or a const C string providing an error
781 * message where a parse error occurs.
783 * The Delegate must implement the following interface:
785 * void assertionBOL();
786 * void assertionEOL();
787 * void assertionWordBoundary(bool invert);
789 * void atomPatternCharacter(UChar ch);
790 * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
791 * void atomCharacterClassBegin(bool invert)
792 * void atomCharacterClassAtom(UChar ch)
793 * void atomCharacterClassRange(UChar begin, UChar end)
794 * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
795 * void atomCharacterClassEnd()
796 * void atomParenthesesSubpatternBegin(bool capture = true);
797 * void atomParentheticalAssertionBegin(bool invert = false);
798 * void atomParenthesesEnd();
799 * void atomBackReference(unsigned subpatternId);
801 * void quantifyAtom(unsigned min, unsigned max, bool greedy);
803 * void disjunction();
809 * Before any call recording tokens are made, regexBegin() will be called on the
810 * delegate once. Once parsing is complete either regexEnd() or regexError() will
811 * be called, as appropriate.
813 * The regular expression is described by a sequence of assertion*() and atom*()
814 * callbacks to the delegate, describing the terms in the regular expression.
815 * Following an atom a quantifyAtom() call may occur to indicate that the previous
816 * atom should be quantified. In the case of atoms described across multiple
817 * calls (parentheses and character classes) the call to quantifyAtom() will come
818 * after the call to the atom*End() method, never after atom*Begin().
820 * Character classes may either be described by a single call to
821 * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
822 * In the latter case, ...Begin() will be called, followed by a sequence of
823 * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
825 * Sequences of atoms and assertions are broken into alternatives via calls to
826 * disjunction(). Assertions, atoms, and disjunctions emitted between calls to
827 * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
828 * atomParenthesesBegin() is passed a subpatternId. In the case of a regular
829 * capturing subpattern, this will be the subpatternId associated with these
830 * parentheses, and will also by definition be the lowest subpatternId of these
831 * parentheses and of any nested paretheses. The atomParenthesesEnd() method
832 * is passed the subpatternId of the last capturing subexpression nested within
833 * these paretheses. In the case of a capturing subpattern with no nested
834 * capturing subpatterns, the same subpatternId will be passed to the begin and
835 * end functions. In the case of non-capturing subpatterns the subpatternId
836 * passed to the begin method is also the first possible subpatternId that might
837 * be nested within these paretheses. If a set of non-capturing parentheses does
838 * not contain any capturing subpatterns, then the subpatternId passed to begin
839 * will be greater than the subpatternId passed to end.
842 template<class Delegate
>
843 const char* parse(Delegate
& delegate
, const UString
& pattern
, unsigned backReferenceLimit
= UINT_MAX
)
845 return Parser
<Delegate
>(delegate
, pattern
, backReferenceLimit
).parse();
848 } } // namespace JSC::Yarr
852 #endif // RegexParser_h