2 * Copyright (C) 2009 Apple Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 #include <wtf/Platform.h>
34 #include <wtf/ASCIICType.h>
35 #include <wtf/unicode/Unicode.h>
38 namespace JSC
{ namespace Yarr
{
40 enum BuiltInCharacterClassID
{
47 // The Parser class should not be used directly - only via the Yarr::parse() method.
48 template<class Delegate
>
51 template<class FriendDelegate
>
52 friend const char* parse(FriendDelegate
& delegate
, const UString
& pattern
, unsigned backReferenceLimit
);
58 QuantifierWithoutAtom
,
61 ParenthesesTypeInvalid
,
62 CharacterClassUnmatched
,
63 CharacterClassOutOfOrder
,
69 * CharacterClassParserDelegate:
71 * The class CharacterClassParserDelegate is used in the parsing of character
72 * classes. This class handles detection of character ranges. This class
73 * implements enough of the delegate interface such that it can be passed to
74 * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused
75 * to perform the parsing of escape characters in character sets.
77 class CharacterClassParserDelegate
{
79 CharacterClassParserDelegate(Delegate
& delegate
, ErrorCode
& err
)
80 : m_delegate(delegate
)
89 * Called at beginning of construction.
91 void begin(bool invert
)
93 m_delegate
.atomCharacterClassBegin(invert
);
97 * atomPatternCharacterUnescaped():
99 * This method is called directly from parseCharacterClass(), to report a new
100 * pattern character token. This method differs from atomPatternCharacter(),
101 * which will be called from parseEscape(), since a hypen provided via this
102 * method may be indicating a character range, but a hyphen parsed by
103 * parseEscape() cannot be interpreted as doing so.
105 void atomPatternCharacterUnescaped(UChar ch
)
110 m_state
= cachedCharacter
;
113 case cachedCharacter
:
115 m_state
= cachedCharacterHyphen
;
117 m_delegate
.atomCharacterClassAtom(m_character
);
122 case cachedCharacterHyphen
:
123 if (ch
>= m_character
)
124 m_delegate
.atomCharacterClassRange(m_character
, ch
);
126 m_err
= CharacterClassOutOfOrder
;
132 * atomPatternCharacter():
134 * Adds a pattern character, called by parseEscape(), as such will not
135 * interpret a hyphen as indicating a character range.
137 void atomPatternCharacter(UChar ch
)
139 // Flush if a character is already pending to prevent the
140 // hyphen from begin interpreted as indicating a range.
141 if((ch
== '-') && (m_state
== cachedCharacter
))
144 atomPatternCharacterUnescaped(ch
);
148 * atomBuiltInCharacterClass():
150 * Adds a built-in character class, called by parseEscape().
152 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID
, bool invert
)
155 m_delegate
.atomCharacterClassBuiltIn(classID
, invert
);
161 * Called at end of construction.
166 m_delegate
.atomCharacterClassEnd();
169 // parseEscape() should never call these delegate methods when
170 // invoked with inCharacterClass set.
171 void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); }
172 void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }
177 if (m_state
!= empty
) // either cachedCharacter or cachedCharacterHyphen
178 m_delegate
.atomCharacterClassAtom(m_character
);
179 if (m_state
== cachedCharacterHyphen
)
180 m_delegate
.atomCharacterClassAtom('-');
184 Delegate
& m_delegate
;
186 enum CharacterClassConstructionState
{
189 cachedCharacterHyphen
,
194 Parser(Delegate
& delegate
, const UString
& pattern
, unsigned backReferenceLimit
)
195 : m_delegate(delegate
)
196 , m_backReferenceLimit(backReferenceLimit
)
198 , m_data(pattern
.data())
199 , m_size(pattern
.size())
201 , m_parenthesesNestingDepth(0)
208 * Helper for parseTokens() AND parseCharacterClass().
209 * Unlike the other parser methods, this function does not report tokens
210 * directly to the member delegate (m_delegate), instead tokens are
211 * emitted to the delegate provided as an argument. In the case of atom
212 * escapes, parseTokens() will call parseEscape() passing m_delegate as
213 * an argument, and as such the escape will be reported to the delegate.
215 * However this method may also be used by parseCharacterClass(), in which
216 * case a CharacterClassParserDelegate will be passed as the delegate that
217 * tokens should be added to. A boolean flag is also provided to indicate
218 * whether that an escape in a CharacterClass is being parsed (some parsing
219 * rules change in this context).
221 * The boolean value returned by this method indicates whether the token
222 * parsed was an atom (outside of a characted class \b and \B will be
223 * interpreted as assertions).
225 template<bool inCharacterClass
, class EscapeDelegate
>
226 bool parseEscape(EscapeDelegate
& delegate
)
229 ASSERT(peek() == '\\');
232 if (atEndOfPattern()) {
233 m_err
= EscapeUnterminated
;
241 if (inCharacterClass
)
242 delegate
.atomPatternCharacter('\b');
244 delegate
.assertionWordBoundary(false);
250 if (inCharacterClass
)
251 delegate
.atomPatternCharacter('B');
253 delegate
.assertionWordBoundary(true);
258 // CharacterClassEscape
261 delegate
.atomBuiltInCharacterClass(DigitClassID
, false);
265 delegate
.atomBuiltInCharacterClass(SpaceClassID
, false);
269 delegate
.atomBuiltInCharacterClass(WordClassID
, false);
273 delegate
.atomBuiltInCharacterClass(DigitClassID
, true);
277 delegate
.atomBuiltInCharacterClass(SpaceClassID
, true);
281 delegate
.atomBuiltInCharacterClass(WordClassID
, true);
294 // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
295 // First, try to parse this as backreference.
296 if (!inCharacterClass
) {
297 ParseState state
= saveState();
299 unsigned backReference
= consumeNumber();
300 if (backReference
<= m_backReferenceLimit
) {
301 delegate
.atomBackReference(backReference
);
308 // Not a backreference, and not octal.
310 delegate
.atomPatternCharacter('\\');
314 // Fall-through to handle this as an octal escape.
319 delegate
.atomPatternCharacter(consumeOctal());
325 delegate
.atomPatternCharacter('\f');
329 delegate
.atomPatternCharacter('\n');
333 delegate
.atomPatternCharacter('\r');
337 delegate
.atomPatternCharacter('\t');
341 delegate
.atomPatternCharacter('\v');
346 ParseState state
= saveState();
348 if (!atEndOfPattern()) {
349 int control
= consume();
351 // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
352 if (inCharacterClass
? WTF::isASCIIAlphanumeric(control
) || (control
== '_') : WTF::isASCIIAlpha(control
)) {
353 delegate
.atomPatternCharacter(control
& 0x1f);
358 delegate
.atomPatternCharacter('\\');
365 int x
= tryConsumeHex(2);
367 delegate
.atomPatternCharacter('x');
369 delegate
.atomPatternCharacter(x
);
376 int u
= tryConsumeHex(4);
378 delegate
.atomPatternCharacter('u');
380 delegate
.atomPatternCharacter(u
);
386 delegate
.atomPatternCharacter(consume());
393 * parseAtomEscape(), parseCharacterClassEscape():
395 * These methods alias to parseEscape().
397 bool parseAtomEscape()
399 return parseEscape
<false>(m_delegate
);
401 void parseCharacterClassEscape(CharacterClassParserDelegate
& delegate
)
403 parseEscape
<true>(delegate
);
407 * parseCharacterClass():
409 * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
410 * to an instance of CharacterClassParserDelegate, to describe the character class to the
413 void parseCharacterClass()
416 ASSERT(peek() == '[');
419 CharacterClassParserDelegate
characterClassConstructor(m_delegate
, m_err
);
421 characterClassConstructor
.begin(tryConsume('^'));
423 while (!atEndOfPattern()) {
427 characterClassConstructor
.end();
431 parseCharacterClassEscape(characterClassConstructor
);
435 characterClassConstructor
.atomPatternCharacterUnescaped(consume());
442 m_err
= CharacterClassUnmatched
;
446 * parseParenthesesBegin():
448 * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
450 void parseParenthesesBegin()
453 ASSERT(peek() == '(');
456 if (tryConsume('?')) {
457 if (atEndOfPattern()) {
458 m_err
= ParenthesesTypeInvalid
;
464 m_delegate
.atomParenthesesSubpatternBegin(false);
468 m_delegate
.atomParentheticalAssertionBegin();
472 m_delegate
.atomParentheticalAssertionBegin(true);
476 m_err
= ParenthesesTypeInvalid
;
479 m_delegate
.atomParenthesesSubpatternBegin();
481 ++m_parenthesesNestingDepth
;
485 * parseParenthesesEnd():
487 * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
489 void parseParenthesesEnd()
492 ASSERT(peek() == ')');
495 if (m_parenthesesNestingDepth
> 0)
496 m_delegate
.atomParenthesesEnd();
498 m_err
= ParenthesesUnmatched
;
500 --m_parenthesesNestingDepth
;
506 * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
508 void parseQuantifier(bool lastTokenWasAnAtom
, unsigned min
, unsigned max
)
513 if (lastTokenWasAnAtom
)
514 m_delegate
.quantifyAtom(min
, max
, !tryConsume('?'));
516 m_err
= QuantifierWithoutAtom
;
522 * This method loops over the input pattern reporting tokens to the delegate.
523 * The method returns when a parse error is detected, or the end of the pattern
524 * is reached. One piece of state is tracked around the loop, which is whether
525 * the last token passed to the delegate was an atom (this is necessary to detect
526 * a parse error when a quantifier provided without an atom to quantify).
530 bool lastTokenWasAnAtom
= false;
532 while (!atEndOfPattern()) {
536 m_delegate
.disjunction();
537 lastTokenWasAnAtom
= false;
541 parseParenthesesBegin();
542 lastTokenWasAnAtom
= false;
546 parseParenthesesEnd();
547 lastTokenWasAnAtom
= true;
552 m_delegate
.assertionBOL();
553 lastTokenWasAnAtom
= false;
558 m_delegate
.assertionEOL();
559 lastTokenWasAnAtom
= false;
564 m_delegate
.atomBuiltInCharacterClass(NewlineClassID
, true);
565 lastTokenWasAnAtom
= true;
569 parseCharacterClass();
570 lastTokenWasAnAtom
= true;
574 lastTokenWasAnAtom
= parseAtomEscape();
579 parseQuantifier(lastTokenWasAnAtom
, 0, UINT_MAX
);
580 lastTokenWasAnAtom
= false;
585 parseQuantifier(lastTokenWasAnAtom
, 1, UINT_MAX
);
586 lastTokenWasAnAtom
= false;
591 parseQuantifier(lastTokenWasAnAtom
, 0, 1);
592 lastTokenWasAnAtom
= false;
596 ParseState state
= saveState();
600 unsigned min
= consumeNumber();
604 max
= peekIsDigit() ? consumeNumber() : UINT_MAX
;
606 if (tryConsume('}')) {
608 parseQuantifier(lastTokenWasAnAtom
, min
, max
);
610 m_err
= QuantifierOutOfOrder
;
611 lastTokenWasAnAtom
= false;
617 } // if we did not find a complete quantifer, fall through to the default case.
620 m_delegate
.atomPatternCharacter(consume());
621 lastTokenWasAnAtom
= true;
628 if (m_parenthesesNestingDepth
> 0)
629 m_err
= MissingParentheses
;
635 * This method calls regexBegin(), calls parseTokens() to parse over the input
636 * patterns, calls regexEnd() or regexError() as appropriate, and converts any
637 * error code to a const char* for a result.
641 m_delegate
.regexBegin();
643 if (m_size
> MAX_PATTERN_SIZE
)
644 m_err
= PatternTooLarge
;
647 ASSERT(atEndOfPattern() || m_err
);
650 m_delegate
.regexError();
652 m_delegate
.regexEnd();
654 // The order of this array must match the ErrorCode enum.
655 static const char* errorMessages
[NumberOfErrorCodes
] = {
657 "regular expression too large",
658 "numbers out of order in {} quantifier",
661 "unmatched parentheses",
662 "unrecognized character after (?",
663 "missing terminating ] for character class",
664 "range out of order in character class",
665 "\\ at end of pattern"
668 return errorMessages
[m_err
];
672 // Misc helper functions:
674 typedef unsigned ParseState
;
676 ParseState
saveState()
681 void restoreState(ParseState state
)
686 bool atEndOfPattern()
688 ASSERT(m_index
<= m_size
);
689 return m_index
== m_size
;
694 ASSERT(m_index
< m_size
);
695 return m_data
[m_index
];
700 return !atEndOfPattern() && WTF::isASCIIDigit(peek());
705 ASSERT(peekIsDigit());
711 ASSERT(m_index
< m_size
);
712 return m_data
[m_index
++];
715 unsigned consumeDigit()
717 ASSERT(peekIsDigit());
718 return consume() - '0';
721 unsigned consumeNumber()
723 unsigned n
= consumeDigit();
724 // check for overflow.
725 for (unsigned newValue
; peekIsDigit() && ((newValue
= n
* 10 + peekDigit()) >= n
); ) {
732 unsigned consumeOctal()
734 ASSERT(WTF::isASCIIOctalDigit(peek()));
736 unsigned n
= consumeDigit();
737 while (n
< 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
738 n
= n
* 8 + consumeDigit();
742 bool tryConsume(UChar ch
)
744 if (atEndOfPattern() || (m_data
[m_index
] != ch
))
750 int tryConsumeHex(int count
)
752 ParseState state
= saveState();
756 if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
760 n
= (n
<< 4) | WTF::toASCIIHexValue(consume());
765 Delegate
& m_delegate
;
766 unsigned m_backReferenceLimit
;
771 unsigned m_parenthesesNestingDepth
;
773 // Derived by empirical testing of compile time in PCRE and WREC.
774 static const unsigned MAX_PATTERN_SIZE
= 1024 * 1024;
780 * The parse method is passed a pattern to be parsed and a delegate upon which
781 * callbacks will be made to record the parsed tokens forming the regex.
782 * Yarr::parse() returns null on success, or a const C string providing an error
783 * message where a parse error occurs.
785 * The Delegate must implement the following interface:
787 * void assertionBOL();
788 * void assertionEOL();
789 * void assertionWordBoundary(bool invert);
791 * void atomPatternCharacter(UChar ch);
792 * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
793 * void atomCharacterClassBegin(bool invert)
794 * void atomCharacterClassAtom(UChar ch)
795 * void atomCharacterClassRange(UChar begin, UChar end)
796 * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
797 * void atomCharacterClassEnd()
798 * void atomParenthesesSubpatternBegin(bool capture = true);
799 * void atomParentheticalAssertionBegin(bool invert = false);
800 * void atomParenthesesEnd();
801 * void atomBackReference(unsigned subpatternId);
803 * void quantifyAtom(unsigned min, unsigned max, bool greedy);
805 * void disjunction();
811 * Before any call recording tokens are made, regexBegin() will be called on the
812 * delegate once. Once parsing is complete either regexEnd() or regexError() will
813 * be called, as appropriate.
815 * The regular expression is described by a sequence of assertion*() and atom*()
816 * callbacks to the delegate, describing the terms in the regular expression.
817 * Following an atom a quantifyAtom() call may occur to indicate that the previous
818 * atom should be quantified. In the case of atoms described across multiple
819 * calls (parentheses and character classes) the call to quantifyAtom() will come
820 * after the call to the atom*End() method, never after atom*Begin().
822 * Character classes may either be described by a single call to
823 * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
824 * In the latter case, ...Begin() will be called, followed by a sequence of
825 * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
827 * Sequences of atoms and assertions are broken into alternatives via calls to
828 * disjunction(). Assertions, atoms, and disjunctions emitted between calls to
829 * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
830 * atomParenthesesBegin() is passed a subpatternId. In the case of a regular
831 * capturing subpattern, this will be the subpatternId associated with these
832 * parentheses, and will also by definition be the lowest subpatternId of these
833 * parentheses and of any nested paretheses. The atomParenthesesEnd() method
834 * is passed the subpatternId of the last capturing subexpression nested within
835 * these paretheses. In the case of a capturing subpattern with no nested
836 * capturing subpatterns, the same subpatternId will be passed to the begin and
837 * end functions. In the case of non-capturing subpatterns the subpatternId
838 * passed to the begin method is also the first possible subpatternId that might
839 * be nested within these paretheses. If a set of non-capturing parentheses does
840 * not contain any capturing subpatterns, then the subpatternId passed to begin
841 * will be greater than the subpatternId passed to end.
844 template<class Delegate
>
845 const char* parse(Delegate
& delegate
, const UString
& pattern
, unsigned backReferenceLimit
= UINT_MAX
)
847 return Parser
<Delegate
>(delegate
, pattern
, backReferenceLimit
).parse();
850 } } // namespace JSC::Yarr
854 #endif // RegexParser_h