2 * Copyright (C) 2008 Apple Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 #include "WRECParser.h"
31 #include "CharacterClassConstructor.h"
32 #include "WRECFunctors.h"
36 namespace JSC
{ namespace WREC
{
38 // These error messages match the error messages used by PCRE.
39 const char* Parser::QuantifierOutOfOrder
= "numbers out of order in {} quantifier";
40 const char* Parser::QuantifierWithoutAtom
= "nothing to repeat";
41 const char* Parser::ParenthesesUnmatched
= "unmatched parentheses";
42 const char* Parser::ParenthesesTypeInvalid
= "unrecognized character after (?";
43 const char* Parser::ParenthesesNotSupported
= ""; // Not a user-visible syntax error -- just signals a syntax that WREC doesn't support yet.
44 const char* Parser::CharacterClassUnmatched
= "missing terminating ] for character class";
45 const char* Parser::CharacterClassOutOfOrder
= "range out of order in character class";
46 const char* Parser::EscapeUnterminated
= "\\ at end of pattern";
48 class PatternCharacterSequence
{
49 typedef Generator::JumpList JumpList
;
52 PatternCharacterSequence(Generator
& generator
, JumpList
& failures
)
53 : m_generator(generator
)
54 , m_failures(failures
)
58 size_t size() { return m_sequence
.size(); }
62 m_sequence
.append(ch
);
67 if (!m_sequence
.size())
70 m_generator
.generatePatternCharacterSequence(m_failures
, m_sequence
.begin(), m_sequence
.size());
74 void flush(const Quantifier
& quantifier
)
76 if (!m_sequence
.size())
79 m_generator
.generatePatternCharacterSequence(m_failures
, m_sequence
.begin(), m_sequence
.size() - 1);
81 switch (quantifier
.type
) {
82 case Quantifier::None
:
83 case Quantifier::Error
:
87 case Quantifier::Greedy
: {
88 GeneratePatternCharacterFunctor
functor(m_sequence
.last());
89 m_generator
.generateGreedyQuantifier(m_failures
, functor
, quantifier
.min
, quantifier
.max
);
93 case Quantifier::NonGreedy
: {
94 GeneratePatternCharacterFunctor
functor(m_sequence
.last());
95 m_generator
.generateNonGreedyQuantifier(m_failures
, functor
, quantifier
.min
, quantifier
.max
);
104 Generator
& m_generator
;
105 JumpList
& m_failures
;
106 Vector
<int, 8> m_sequence
;
109 ALWAYS_INLINE Quantifier
Parser::consumeGreedyQuantifier()
114 return Quantifier(Quantifier::Greedy
, 0, 1);
118 return Quantifier(Quantifier::Greedy
, 0);
122 return Quantifier(Quantifier::Greedy
, 1);
125 SavedState
state(*this);
128 // Accept: {n}, {n,}, {n,m}.
129 // Reject: {n,m} where n > m.
130 // Ignore: Anything else, such as {n, m}.
132 if (!peekIsDigit()) {
137 unsigned min
= consumeNumber();
142 max
= peekIsDigit() ? consumeNumber() : Quantifier::Infinity
;
152 setError(QuantifierOutOfOrder
);
153 return Quantifier(Quantifier::Error
);
156 return Quantifier(Quantifier::Greedy
, min
, max
);
160 return Quantifier(); // No quantifier.
164 Quantifier
Parser::consumeQuantifier()
166 Quantifier q
= consumeGreedyQuantifier();
168 if ((q
.type
== Quantifier::Greedy
) && (peek() == '?')) {
170 q
.type
= Quantifier::NonGreedy
;
176 bool Parser::parseCharacterClassQuantifier(JumpList
& failures
, const CharacterClass
& charClass
, bool invert
)
178 Quantifier q
= consumeQuantifier();
181 case Quantifier::None
: {
182 m_generator
.generateCharacterClass(failures
, charClass
, invert
);
186 case Quantifier::Greedy
: {
187 GenerateCharacterClassFunctor
functor(&charClass
, invert
);
188 m_generator
.generateGreedyQuantifier(failures
, functor
, q
.min
, q
.max
);
192 case Quantifier::NonGreedy
: {
193 GenerateCharacterClassFunctor
functor(&charClass
, invert
);
194 m_generator
.generateNonGreedyQuantifier(failures
, functor
, q
.min
, q
.max
);
198 case Quantifier::Error
:
205 bool Parser::parseBackreferenceQuantifier(JumpList
& failures
, unsigned subpatternId
)
207 Quantifier q
= consumeQuantifier();
210 case Quantifier::None
: {
211 m_generator
.generateBackreference(failures
, subpatternId
);
215 case Quantifier::Greedy
:
216 case Quantifier::NonGreedy
:
217 m_generator
.generateBackreferenceQuantifier(failures
, q
.type
, subpatternId
, q
.min
, q
.max
);
220 case Quantifier::Error
:
227 bool Parser::parseParentheses(JumpList
& failures
)
229 ParenthesesType type
= consumeParenthesesType();
231 // FIXME: WREC originally failed to backtrack correctly in cases such as
232 // "c".match(/(.*)c/). Now, most parentheses handling is disabled. For
233 // unsupported parentheses, we fall back on PCRE.
236 case Generator::Assertion
: {
237 m_generator
.generateParenthesesAssertion(failures
);
239 if (consume() != ')') {
240 setError(ParenthesesUnmatched
);
244 Quantifier quantifier
= consumeQuantifier();
245 if (quantifier
.type
!= Quantifier::None
&& quantifier
.min
== 0) {
246 setError(ParenthesesNotSupported
);
252 case Generator::InvertedAssertion
: {
253 m_generator
.generateParenthesesInvertedAssertion(failures
);
255 if (consume() != ')') {
256 setError(ParenthesesUnmatched
);
260 Quantifier quantifier
= consumeQuantifier();
261 if (quantifier
.type
!= Quantifier::None
&& quantifier
.min
== 0) {
262 setError(ParenthesesNotSupported
);
269 setError(ParenthesesNotSupported
);
274 bool Parser::parseCharacterClass(JumpList
& failures
)
282 CharacterClassConstructor
constructor(m_ignoreCase
);
285 while ((ch
= peek()) != ']') {
288 setError(CharacterClassUnmatched
);
293 Escape escape
= consumeEscape(true);
295 switch (escape
.type()) {
296 case Escape::PatternCharacter
: {
297 int character
= PatternCharacterEscape::cast(escape
).character();
298 if (character
== '-')
299 constructor
.flushBeforeEscapedHyphen();
300 constructor
.put(character
);
303 case Escape::CharacterClass
: {
304 const CharacterClassEscape
& characterClassEscape
= CharacterClassEscape::cast(escape
);
305 ASSERT(!characterClassEscape
.invert());
306 constructor
.append(characterClassEscape
.characterClass());
311 case Escape::Backreference
:
312 case Escape::WordBoundaryAssertion
: {
313 ASSERT_NOT_REACHED();
327 // lazily catch reversed ranges ([z-a])in character classes
328 if (constructor
.isUpsideDown()) {
329 setError(CharacterClassOutOfOrder
);
334 CharacterClass charClass
= constructor
.charClass();
335 return parseCharacterClassQuantifier(failures
, charClass
, invert
);
338 bool Parser::parseNonCharacterEscape(JumpList
& failures
, const Escape
& escape
)
340 switch (escape
.type()) {
341 case Escape::PatternCharacter
:
342 ASSERT_NOT_REACHED();
345 case Escape::CharacterClass
:
346 return parseCharacterClassQuantifier(failures
, CharacterClassEscape::cast(escape
).characterClass(), CharacterClassEscape::cast(escape
).invert());
348 case Escape::Backreference
:
349 return parseBackreferenceQuantifier(failures
, BackreferenceEscape::cast(escape
).subpatternId());
351 case Escape::WordBoundaryAssertion
:
352 m_generator
.generateAssertionWordBoundary(failures
, WordBoundaryAssertionEscape::cast(escape
).invert());
359 ASSERT_NOT_REACHED();
363 Escape
Parser::consumeEscape(bool inCharacterClass
)
367 setError(EscapeUnterminated
);
368 return Escape(Escape::Error
);
373 if (inCharacterClass
)
374 return PatternCharacterEscape('\b');
375 return WordBoundaryAssertionEscape(false); // do not invert
378 if (inCharacterClass
)
379 return PatternCharacterEscape('B');
380 return WordBoundaryAssertionEscape(true); // invert
382 // CharacterClassEscape
385 return CharacterClassEscape(CharacterClass::digits(), false);
388 return CharacterClassEscape(CharacterClass::spaces(), false);
391 return CharacterClassEscape(CharacterClass::wordchar(), false);
394 return inCharacterClass
395 ? CharacterClassEscape(CharacterClass::nondigits(), false)
396 : CharacterClassEscape(CharacterClass::digits(), true);
399 return inCharacterClass
400 ? CharacterClassEscape(CharacterClass::nonspaces(), false)
401 : CharacterClassEscape(CharacterClass::spaces(), true);
404 return inCharacterClass
405 ? CharacterClassEscape(CharacterClass::nonwordchar(), false)
406 : CharacterClassEscape(CharacterClass::wordchar(), true);
418 if (peekDigit() > m_numSubpatterns
|| inCharacterClass
) {
419 // To match Firefox, we parse an invalid backreference in the range [1-7]
420 // as an octal escape.
421 return peekDigit() > 7 ? PatternCharacterEscape('\\') : PatternCharacterEscape(consumeOctal());
426 unsigned newValue
= value
* 10 + peekDigit();
427 if (newValue
> m_numSubpatterns
)
431 } while (peekIsDigit());
433 return BackreferenceEscape(value
);
439 return PatternCharacterEscape(consumeOctal());
444 return PatternCharacterEscape('\f');
447 return PatternCharacterEscape('\n');
450 return PatternCharacterEscape('\r');
453 return PatternCharacterEscape('\t');
456 return PatternCharacterEscape('\v');
460 SavedState
state(*this);
463 int control
= consume();
464 // To match Firefox, inside a character class, we also accept numbers
465 // and '_' as control characters.
466 if ((!inCharacterClass
&& !isASCIIAlpha(control
)) || (!isASCIIAlphanumeric(control
) && control
!= '_')) {
468 return PatternCharacterEscape('\\');
470 return PatternCharacterEscape(control
& 31);
477 SavedState
state(*this);
478 int x
= consumeHex(2);
481 return PatternCharacterEscape('x');
483 return PatternCharacterEscape(x
);
490 SavedState
state(*this);
491 int x
= consumeHex(4);
494 return PatternCharacterEscape('u');
496 return PatternCharacterEscape(x
);
501 return PatternCharacterEscape(consume());
505 void Parser::parseAlternative(JumpList
& failures
)
507 PatternCharacterSequence
sequence(m_generator
, failures
);
521 Quantifier q
= consumeQuantifier();
523 if (q
.type
== Quantifier::None
) {
524 sequence
.append(consume());
528 if (q
.type
== Quantifier::Error
)
531 if (!sequence
.size()) {
532 setError(QuantifierWithoutAtom
);
544 m_generator
.generateAssertionBOL(failures
);
551 m_generator
.generateAssertionEOL(failures
);
558 if (!parseCharacterClassQuantifier(failures
, CharacterClass::newline(), true))
566 if (!parseCharacterClass(failures
))
574 if (!parseParentheses(failures
))
581 Escape escape
= consumeEscape(false);
582 if (escape
.type() == Escape::PatternCharacter
) {
583 sequence
.append(PatternCharacterEscape::cast(escape
).character());
588 if (!parseNonCharacterEscape(failures
, escape
))
594 sequence
.append(consume());
603 void Parser::parseDisjunction(JumpList
& failures
)
605 parseAlternative(failures
);
612 m_generator
.terminateAlternative(successes
, failures
);
613 parseAlternative(failures
);
614 } while (peek() == '|');
616 m_generator
.terminateDisjunction(successes
);
619 Generator::ParenthesesType
Parser::consumeParenthesesType()
622 return Generator::Capturing
;
627 return Generator::NonCapturing
;
630 return Generator::Assertion
;
633 return Generator::InvertedAssertion
;
636 setError(ParenthesesTypeInvalid
);
637 return Generator::Error
;
641 } } // namespace JSC::WREC
643 #endif // ENABLE(WREC)