]>
Commit | Line | Data |
---|---|---|
9dae56ea A |
1 | /* |
2 | * Copyright (C) 2008 Apple Inc. All rights reserved. | |
3 | * | |
4 | * Redistribution and use in source and binary forms, with or without | |
5 | * modification, are permitted provided that the following conditions | |
6 | * are met: | |
7 | * 1. Redistributions of source code must retain the above copyright | |
8 | * notice, this list of conditions and the following disclaimer. | |
9 | * 2. Redistributions in binary form must reproduce the above copyright | |
10 | * notice, this list of conditions and the following disclaimer in the | |
11 | * documentation and/or other materials provided with the distribution. | |
12 | * | |
13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY | |
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR | |
17 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
18 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
20 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
21 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
23 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
24 | */ | |
25 | ||
26 | #include "config.h" | |
27 | #include "WRECParser.h" | |
28 | ||
29 | #if ENABLE(WREC) | |
30 | ||
31 | #include "CharacterClassConstructor.h" | |
32 | #include "WRECFunctors.h" | |
33 | ||
34 | using namespace WTF; | |
35 | ||
36 | namespace JSC { namespace WREC { | |
37 | ||
38 | // These error messages match the error messages used by PCRE. | |
39 | const char* Parser::QuantifierOutOfOrder = "numbers out of order in {} quantifier"; | |
40 | const char* Parser::QuantifierWithoutAtom = "nothing to repeat"; | |
41 | const char* Parser::ParenthesesUnmatched = "unmatched parentheses"; | |
42 | const char* Parser::ParenthesesTypeInvalid = "unrecognized character after (?"; | |
43 | const char* Parser::ParenthesesNotSupported = ""; // Not a user-visible syntax error -- just signals a syntax that WREC doesn't support yet. | |
44 | const char* Parser::CharacterClassUnmatched = "missing terminating ] for character class"; | |
45 | const char* Parser::CharacterClassOutOfOrder = "range out of order in character class"; | |
46 | const char* Parser::EscapeUnterminated = "\\ at end of pattern"; | |
47 | ||
48 | class PatternCharacterSequence { | |
49 | typedef Generator::JumpList JumpList; | |
50 | ||
51 | public: | |
52 | PatternCharacterSequence(Generator& generator, JumpList& failures) | |
53 | : m_generator(generator) | |
54 | , m_failures(failures) | |
55 | { | |
56 | } | |
57 | ||
58 | size_t size() { return m_sequence.size(); } | |
59 | ||
60 | void append(int ch) | |
61 | { | |
62 | m_sequence.append(ch); | |
63 | } | |
64 | ||
65 | void flush() | |
66 | { | |
67 | if (!m_sequence.size()) | |
68 | return; | |
69 | ||
70 | m_generator.generatePatternCharacterSequence(m_failures, m_sequence.begin(), m_sequence.size()); | |
71 | m_sequence.clear(); | |
72 | } | |
73 | ||
74 | void flush(const Quantifier& quantifier) | |
75 | { | |
76 | if (!m_sequence.size()) | |
77 | return; | |
78 | ||
79 | m_generator.generatePatternCharacterSequence(m_failures, m_sequence.begin(), m_sequence.size() - 1); | |
80 | ||
81 | switch (quantifier.type) { | |
82 | case Quantifier::None: | |
83 | case Quantifier::Error: | |
84 | ASSERT_NOT_REACHED(); | |
85 | break; | |
86 | ||
87 | case Quantifier::Greedy: { | |
88 | GeneratePatternCharacterFunctor functor(m_sequence.last()); | |
89 | m_generator.generateGreedyQuantifier(m_failures, functor, quantifier.min, quantifier.max); | |
90 | break; | |
91 | } | |
92 | ||
93 | case Quantifier::NonGreedy: { | |
94 | GeneratePatternCharacterFunctor functor(m_sequence.last()); | |
95 | m_generator.generateNonGreedyQuantifier(m_failures, functor, quantifier.min, quantifier.max); | |
96 | break; | |
97 | } | |
98 | } | |
99 | ||
100 | m_sequence.clear(); | |
101 | } | |
102 | ||
103 | private: | |
104 | Generator& m_generator; | |
105 | JumpList& m_failures; | |
106 | Vector<int, 8> m_sequence; | |
107 | }; | |
108 | ||
109 | ALWAYS_INLINE Quantifier Parser::consumeGreedyQuantifier() | |
110 | { | |
111 | switch (peek()) { | |
112 | case '?': | |
113 | consume(); | |
114 | return Quantifier(Quantifier::Greedy, 0, 1); | |
115 | ||
116 | case '*': | |
117 | consume(); | |
118 | return Quantifier(Quantifier::Greedy, 0); | |
119 | ||
120 | case '+': | |
121 | consume(); | |
122 | return Quantifier(Quantifier::Greedy, 1); | |
123 | ||
124 | case '{': { | |
125 | SavedState state(*this); | |
126 | consume(); | |
127 | ||
128 | // Accept: {n}, {n,}, {n,m}. | |
129 | // Reject: {n,m} where n > m. | |
130 | // Ignore: Anything else, such as {n, m}. | |
131 | ||
132 | if (!peekIsDigit()) { | |
133 | state.restore(); | |
134 | return Quantifier(); | |
135 | } | |
136 | ||
137 | unsigned min = consumeNumber(); | |
138 | unsigned max = min; | |
139 | ||
140 | if (peek() == ',') { | |
141 | consume(); | |
142 | max = peekIsDigit() ? consumeNumber() : Quantifier::Infinity; | |
143 | } | |
144 | ||
145 | if (peek() != '}') { | |
146 | state.restore(); | |
147 | return Quantifier(); | |
148 | } | |
149 | consume(); | |
150 | ||
151 | if (min > max) { | |
152 | setError(QuantifierOutOfOrder); | |
153 | return Quantifier(Quantifier::Error); | |
154 | } | |
155 | ||
156 | return Quantifier(Quantifier::Greedy, min, max); | |
157 | } | |
158 | ||
159 | default: | |
160 | return Quantifier(); // No quantifier. | |
161 | } | |
162 | } | |
163 | ||
164 | Quantifier Parser::consumeQuantifier() | |
165 | { | |
166 | Quantifier q = consumeGreedyQuantifier(); | |
167 | ||
168 | if ((q.type == Quantifier::Greedy) && (peek() == '?')) { | |
169 | consume(); | |
170 | q.type = Quantifier::NonGreedy; | |
171 | } | |
172 | ||
173 | return q; | |
174 | } | |
175 | ||
176 | bool Parser::parseCharacterClassQuantifier(JumpList& failures, const CharacterClass& charClass, bool invert) | |
177 | { | |
178 | Quantifier q = consumeQuantifier(); | |
179 | ||
180 | switch (q.type) { | |
181 | case Quantifier::None: { | |
182 | m_generator.generateCharacterClass(failures, charClass, invert); | |
183 | break; | |
184 | } | |
185 | ||
186 | case Quantifier::Greedy: { | |
187 | GenerateCharacterClassFunctor functor(&charClass, invert); | |
188 | m_generator.generateGreedyQuantifier(failures, functor, q.min, q.max); | |
189 | break; | |
190 | } | |
191 | ||
192 | case Quantifier::NonGreedy: { | |
193 | GenerateCharacterClassFunctor functor(&charClass, invert); | |
194 | m_generator.generateNonGreedyQuantifier(failures, functor, q.min, q.max); | |
195 | break; | |
196 | } | |
197 | ||
198 | case Quantifier::Error: | |
199 | return false; | |
200 | } | |
201 | ||
202 | return true; | |
203 | } | |
204 | ||
205 | bool Parser::parseBackreferenceQuantifier(JumpList& failures, unsigned subpatternId) | |
206 | { | |
207 | Quantifier q = consumeQuantifier(); | |
208 | ||
209 | switch (q.type) { | |
210 | case Quantifier::None: { | |
211 | m_generator.generateBackreference(failures, subpatternId); | |
212 | break; | |
213 | } | |
214 | ||
215 | case Quantifier::Greedy: | |
216 | case Quantifier::NonGreedy: | |
217 | m_generator.generateBackreferenceQuantifier(failures, q.type, subpatternId, q.min, q.max); | |
218 | return true; | |
219 | ||
220 | case Quantifier::Error: | |
221 | return false; | |
222 | } | |
223 | ||
224 | return true; | |
225 | } | |
226 | ||
227 | bool Parser::parseParentheses(JumpList& failures) | |
228 | { | |
229 | ParenthesesType type = consumeParenthesesType(); | |
230 | ||
231 | // FIXME: WREC originally failed to backtrack correctly in cases such as | |
232 | // "c".match(/(.*)c/). Now, most parentheses handling is disabled. For | |
233 | // unsupported parentheses, we fall back on PCRE. | |
234 | ||
235 | switch (type) { | |
236 | case Generator::Assertion: { | |
237 | m_generator.generateParenthesesAssertion(failures); | |
238 | ||
239 | if (consume() != ')') { | |
240 | setError(ParenthesesUnmatched); | |
241 | return false; | |
242 | } | |
243 | ||
244 | Quantifier quantifier = consumeQuantifier(); | |
245 | if (quantifier.type != Quantifier::None && quantifier.min == 0) { | |
246 | setError(ParenthesesNotSupported); | |
247 | return false; | |
248 | } | |
249 | ||
250 | return true; | |
251 | } | |
252 | case Generator::InvertedAssertion: { | |
253 | m_generator.generateParenthesesInvertedAssertion(failures); | |
254 | ||
255 | if (consume() != ')') { | |
256 | setError(ParenthesesUnmatched); | |
257 | return false; | |
258 | } | |
259 | ||
260 | Quantifier quantifier = consumeQuantifier(); | |
261 | if (quantifier.type != Quantifier::None && quantifier.min == 0) { | |
262 | setError(ParenthesesNotSupported); | |
263 | return false; | |
264 | } | |
265 | ||
266 | return true; | |
267 | } | |
268 | default: | |
269 | setError(ParenthesesNotSupported); | |
270 | return false; | |
271 | } | |
272 | } | |
273 | ||
274 | bool Parser::parseCharacterClass(JumpList& failures) | |
275 | { | |
276 | bool invert = false; | |
277 | if (peek() == '^') { | |
278 | consume(); | |
279 | invert = true; | |
280 | } | |
281 | ||
282 | CharacterClassConstructor constructor(m_ignoreCase); | |
283 | ||
284 | int ch; | |
285 | while ((ch = peek()) != ']') { | |
286 | switch (ch) { | |
287 | case EndOfPattern: | |
288 | setError(CharacterClassUnmatched); | |
289 | return false; | |
290 | ||
291 | case '\\': { | |
292 | consume(); | |
293 | Escape escape = consumeEscape(true); | |
294 | ||
295 | switch (escape.type()) { | |
296 | case Escape::PatternCharacter: { | |
297 | int character = PatternCharacterEscape::cast(escape).character(); | |
298 | if (character == '-') | |
299 | constructor.flushBeforeEscapedHyphen(); | |
300 | constructor.put(character); | |
301 | break; | |
302 | } | |
303 | case Escape::CharacterClass: { | |
304 | const CharacterClassEscape& characterClassEscape = CharacterClassEscape::cast(escape); | |
305 | ASSERT(!characterClassEscape.invert()); | |
306 | constructor.append(characterClassEscape.characterClass()); | |
307 | break; | |
308 | } | |
309 | case Escape::Error: | |
310 | return false; | |
311 | case Escape::Backreference: | |
312 | case Escape::WordBoundaryAssertion: { | |
313 | ASSERT_NOT_REACHED(); | |
314 | break; | |
315 | } | |
316 | } | |
317 | break; | |
318 | } | |
319 | ||
320 | default: | |
321 | consume(); | |
322 | constructor.put(ch); | |
323 | } | |
324 | } | |
325 | consume(); | |
326 | ||
327 | // lazily catch reversed ranges ([z-a])in character classes | |
328 | if (constructor.isUpsideDown()) { | |
329 | setError(CharacterClassOutOfOrder); | |
330 | return false; | |
331 | } | |
332 | ||
333 | constructor.flush(); | |
334 | CharacterClass charClass = constructor.charClass(); | |
335 | return parseCharacterClassQuantifier(failures, charClass, invert); | |
336 | } | |
337 | ||
338 | bool Parser::parseNonCharacterEscape(JumpList& failures, const Escape& escape) | |
339 | { | |
340 | switch (escape.type()) { | |
341 | case Escape::PatternCharacter: | |
342 | ASSERT_NOT_REACHED(); | |
343 | return false; | |
344 | ||
345 | case Escape::CharacterClass: | |
346 | return parseCharacterClassQuantifier(failures, CharacterClassEscape::cast(escape).characterClass(), CharacterClassEscape::cast(escape).invert()); | |
347 | ||
348 | case Escape::Backreference: | |
349 | return parseBackreferenceQuantifier(failures, BackreferenceEscape::cast(escape).subpatternId()); | |
350 | ||
351 | case Escape::WordBoundaryAssertion: | |
352 | m_generator.generateAssertionWordBoundary(failures, WordBoundaryAssertionEscape::cast(escape).invert()); | |
353 | return true; | |
354 | ||
355 | case Escape::Error: | |
356 | return false; | |
357 | } | |
358 | ||
359 | ASSERT_NOT_REACHED(); | |
360 | return false; | |
361 | } | |
362 | ||
363 | Escape Parser::consumeEscape(bool inCharacterClass) | |
364 | { | |
365 | switch (peek()) { | |
366 | case EndOfPattern: | |
367 | setError(EscapeUnterminated); | |
368 | return Escape(Escape::Error); | |
369 | ||
370 | // Assertions | |
371 | case 'b': | |
372 | consume(); | |
373 | if (inCharacterClass) | |
374 | return PatternCharacterEscape('\b'); | |
375 | return WordBoundaryAssertionEscape(false); // do not invert | |
376 | case 'B': | |
377 | consume(); | |
378 | if (inCharacterClass) | |
379 | return PatternCharacterEscape('B'); | |
380 | return WordBoundaryAssertionEscape(true); // invert | |
381 | ||
382 | // CharacterClassEscape | |
383 | case 'd': | |
384 | consume(); | |
385 | return CharacterClassEscape(CharacterClass::digits(), false); | |
386 | case 's': | |
387 | consume(); | |
388 | return CharacterClassEscape(CharacterClass::spaces(), false); | |
389 | case 'w': | |
390 | consume(); | |
391 | return CharacterClassEscape(CharacterClass::wordchar(), false); | |
392 | case 'D': | |
393 | consume(); | |
394 | return inCharacterClass | |
395 | ? CharacterClassEscape(CharacterClass::nondigits(), false) | |
396 | : CharacterClassEscape(CharacterClass::digits(), true); | |
397 | case 'S': | |
398 | consume(); | |
399 | return inCharacterClass | |
400 | ? CharacterClassEscape(CharacterClass::nonspaces(), false) | |
401 | : CharacterClassEscape(CharacterClass::spaces(), true); | |
402 | case 'W': | |
403 | consume(); | |
404 | return inCharacterClass | |
405 | ? CharacterClassEscape(CharacterClass::nonwordchar(), false) | |
406 | : CharacterClassEscape(CharacterClass::wordchar(), true); | |
407 | ||
408 | // DecimalEscape | |
409 | case '1': | |
410 | case '2': | |
411 | case '3': | |
412 | case '4': | |
413 | case '5': | |
414 | case '6': | |
415 | case '7': | |
416 | case '8': | |
417 | case '9': { | |
418 | if (peekDigit() > m_numSubpatterns || inCharacterClass) { | |
419 | // To match Firefox, we parse an invalid backreference in the range [1-7] | |
420 | // as an octal escape. | |
421 | return peekDigit() > 7 ? PatternCharacterEscape('\\') : PatternCharacterEscape(consumeOctal()); | |
422 | } | |
423 | ||
424 | int value = 0; | |
425 | do { | |
426 | unsigned newValue = value * 10 + peekDigit(); | |
427 | if (newValue > m_numSubpatterns) | |
428 | break; | |
429 | value = newValue; | |
430 | consume(); | |
431 | } while (peekIsDigit()); | |
432 | ||
433 | return BackreferenceEscape(value); | |
434 | } | |
435 | ||
436 | // Octal escape | |
437 | case '0': | |
438 | consume(); | |
439 | return PatternCharacterEscape(consumeOctal()); | |
440 | ||
441 | // ControlEscape | |
442 | case 'f': | |
443 | consume(); | |
444 | return PatternCharacterEscape('\f'); | |
445 | case 'n': | |
446 | consume(); | |
447 | return PatternCharacterEscape('\n'); | |
448 | case 'r': | |
449 | consume(); | |
450 | return PatternCharacterEscape('\r'); | |
451 | case 't': | |
452 | consume(); | |
453 | return PatternCharacterEscape('\t'); | |
454 | case 'v': | |
455 | consume(); | |
456 | return PatternCharacterEscape('\v'); | |
457 | ||
458 | // ControlLetter | |
459 | case 'c': { | |
460 | SavedState state(*this); | |
461 | consume(); | |
462 | ||
463 | int control = consume(); | |
464 | // To match Firefox, inside a character class, we also accept numbers | |
465 | // and '_' as control characters. | |
466 | if ((!inCharacterClass && !isASCIIAlpha(control)) || (!isASCIIAlphanumeric(control) && control != '_')) { | |
467 | state.restore(); | |
468 | return PatternCharacterEscape('\\'); | |
469 | } | |
470 | return PatternCharacterEscape(control & 31); | |
471 | } | |
472 | ||
473 | // HexEscape | |
474 | case 'x': { | |
475 | consume(); | |
476 | ||
477 | SavedState state(*this); | |
478 | int x = consumeHex(2); | |
479 | if (x == -1) { | |
480 | state.restore(); | |
481 | return PatternCharacterEscape('x'); | |
482 | } | |
483 | return PatternCharacterEscape(x); | |
484 | } | |
485 | ||
486 | // UnicodeEscape | |
487 | case 'u': { | |
488 | consume(); | |
489 | ||
490 | SavedState state(*this); | |
491 | int x = consumeHex(4); | |
492 | if (x == -1) { | |
493 | state.restore(); | |
494 | return PatternCharacterEscape('u'); | |
495 | } | |
496 | return PatternCharacterEscape(x); | |
497 | } | |
498 | ||
499 | // IdentityEscape | |
500 | default: | |
501 | return PatternCharacterEscape(consume()); | |
502 | } | |
503 | } | |
504 | ||
505 | void Parser::parseAlternative(JumpList& failures) | |
506 | { | |
507 | PatternCharacterSequence sequence(m_generator, failures); | |
508 | ||
509 | while (1) { | |
510 | switch (peek()) { | |
511 | case EndOfPattern: | |
512 | case '|': | |
513 | case ')': | |
514 | sequence.flush(); | |
515 | return; | |
516 | ||
517 | case '*': | |
518 | case '+': | |
519 | case '?': | |
520 | case '{': { | |
521 | Quantifier q = consumeQuantifier(); | |
522 | ||
523 | if (q.type == Quantifier::None) { | |
524 | sequence.append(consume()); | |
525 | continue; | |
526 | } | |
527 | ||
528 | if (q.type == Quantifier::Error) | |
529 | return; | |
530 | ||
531 | if (!sequence.size()) { | |
532 | setError(QuantifierWithoutAtom); | |
533 | return; | |
534 | } | |
535 | ||
536 | sequence.flush(q); | |
537 | continue; | |
538 | } | |
539 | ||
540 | case '^': | |
541 | consume(); | |
542 | ||
543 | sequence.flush(); | |
544 | m_generator.generateAssertionBOL(failures); | |
545 | continue; | |
546 | ||
547 | case '$': | |
548 | consume(); | |
549 | ||
550 | sequence.flush(); | |
551 | m_generator.generateAssertionEOL(failures); | |
552 | continue; | |
553 | ||
554 | case '.': | |
555 | consume(); | |
556 | ||
557 | sequence.flush(); | |
558 | if (!parseCharacterClassQuantifier(failures, CharacterClass::newline(), true)) | |
559 | return; | |
560 | continue; | |
561 | ||
562 | case '[': | |
563 | consume(); | |
564 | ||
565 | sequence.flush(); | |
566 | if (!parseCharacterClass(failures)) | |
567 | return; | |
568 | continue; | |
569 | ||
570 | case '(': | |
571 | consume(); | |
572 | ||
573 | sequence.flush(); | |
574 | if (!parseParentheses(failures)) | |
575 | return; | |
576 | continue; | |
577 | ||
578 | case '\\': { | |
579 | consume(); | |
580 | ||
581 | Escape escape = consumeEscape(false); | |
582 | if (escape.type() == Escape::PatternCharacter) { | |
583 | sequence.append(PatternCharacterEscape::cast(escape).character()); | |
584 | continue; | |
585 | } | |
586 | ||
587 | sequence.flush(); | |
588 | if (!parseNonCharacterEscape(failures, escape)) | |
589 | return; | |
590 | continue; | |
591 | } | |
592 | ||
593 | default: | |
594 | sequence.append(consume()); | |
595 | continue; | |
596 | } | |
597 | } | |
598 | } | |
599 | ||
600 | /* | |
601 | TOS holds index. | |
602 | */ | |
603 | void Parser::parseDisjunction(JumpList& failures) | |
604 | { | |
605 | parseAlternative(failures); | |
606 | if (peek() != '|') | |
607 | return; | |
608 | ||
609 | JumpList successes; | |
610 | do { | |
611 | consume(); | |
612 | m_generator.terminateAlternative(successes, failures); | |
613 | parseAlternative(failures); | |
614 | } while (peek() == '|'); | |
615 | ||
616 | m_generator.terminateDisjunction(successes); | |
617 | } | |
618 | ||
619 | Generator::ParenthesesType Parser::consumeParenthesesType() | |
620 | { | |
621 | if (peek() != '?') | |
622 | return Generator::Capturing; | |
623 | consume(); | |
624 | ||
625 | switch (consume()) { | |
626 | case ':': | |
627 | return Generator::NonCapturing; | |
628 | ||
629 | case '=': | |
630 | return Generator::Assertion; | |
631 | ||
632 | case '!': | |
633 | return Generator::InvertedAssertion; | |
634 | ||
635 | default: | |
636 | setError(ParenthesesTypeInvalid); | |
637 | return Generator::Error; | |
638 | } | |
639 | } | |
640 | ||
641 | } } // namespace JSC::WREC | |
642 | ||
643 | #endif // ENABLE(WREC) |