]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | * Copyright (C) 2009 Apple Inc. All rights reserved. | |
3 | * | |
4 | * Redistribution and use in source and binary forms, with or without | |
5 | * modification, are permitted provided that the following conditions | |
6 | * are met: | |
7 | * 1. Redistributions of source code must retain the above copyright | |
8 | * notice, this list of conditions and the following disclaimer. | |
9 | * 2. Redistributions in binary form must reproduce the above copyright | |
10 | * notice, this list of conditions and the following disclaimer in the | |
11 | * documentation and/or other materials provided with the distribution. | |
12 | * | |
13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY | |
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR | |
17 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
18 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
20 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
21 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
23 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
24 | */ | |
25 | ||
26 | #ifndef YarrParser_h | |
27 | #define YarrParser_h | |
28 | ||
29 | #include "Yarr.h" | |
30 | #include <wtf/ASCIICType.h> | |
31 | #include <wtf/text/WTFString.h> | |
32 | #include <wtf/unicode/Unicode.h> | |
33 | ||
34 | namespace JSC { namespace Yarr { | |
35 | ||
36 | #define REGEXP_ERROR_PREFIX "Invalid regular expression: " | |
37 | ||
// Identifies the predefined character classes that the parser reports to a
// delegate. The parser emits the first three for the \d/\D, \s/\S and \w/\W
// escapes, and emits NewlineClassID (inverted) for the '.' atom.
enum BuiltInCharacterClassID {
    DigitClassID = 0, // \d and \D
    SpaceClassID,     // \s and \S
    WordClassID,      // \w and \W
    NewlineClassID    // '.' is reported as the inverse of this class
};
44 | ||
45 | // The Parser class should not be used directly - only via the Yarr::parse() method. | |
46 | template<class Delegate, typename CharType> | |
47 | class Parser { | |
48 | private: | |
49 | template<class FriendDelegate> | |
50 | friend const char* parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit); | |
51 | ||
// Parse failure categories. parse() uses the numeric value as an index into
// its table of error messages, so the enumerator order must stay in sync with
// that table. NumberOfErrorCodes is a count, not an error.
enum ErrorCode {
    NoError = 0,              // parsing succeeded
    PatternTooLarge,          // pattern longer than MAX_PATTERN_SIZE
    QuantifierOutOfOrder,     // {min,max} with min > max
    QuantifierWithoutAtom,    // quantifier with nothing to repeat
    QuantifierTooLarge,       // {n} count too large to represent
    MissingParentheses,       // pattern ended with a group still open
    ParenthesesUnmatched,     // ')' without a matching '('
    ParenthesesTypeInvalid,   // unrecognized character after "(?"
    CharacterClassUnmatched,  // '[' without a terminating ']'
    CharacterClassOutOfOrder, // range such as [z-a]
    EscapeUnterminated,       // '\' as the last character of the pattern
    NumberOfErrorCodes
};
66 | ||
67 | /* | |
68 | * CharacterClassParserDelegate: | |
69 | * | |
70 | * The class CharacterClassParserDelegate is used in the parsing of character | |
71 | * classes. This class handles detection of character ranges. This class | |
72 | * implements enough of the delegate interface such that it can be passed to | |
73 | * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused | |
74 | * to perform the parsing of escape characters in character sets. | |
75 | */ | |
76 | class CharacterClassParserDelegate { | |
77 | public: | |
78 | CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) | |
79 | : m_delegate(delegate) | |
80 | , m_err(err) | |
81 | , m_state(Empty) | |
82 | , m_character(0) | |
83 | { | |
84 | } | |
85 | ||
86 | /* | |
87 | * begin(): | |
88 | * | |
89 | * Called at beginning of construction. | |
90 | */ | |
91 | void begin(bool invert) | |
92 | { | |
93 | m_delegate.atomCharacterClassBegin(invert); | |
94 | } | |
95 | ||
96 | /* | |
97 | * atomPatternCharacter(): | |
98 | * | |
99 | * This method is called either from parseCharacterClass() (for an unescaped | |
100 | * character in a character class), or from parseEscape(). In the former case | |
101 | * the value true will be passed for the argument 'hyphenIsRange', and in this | |
102 | * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/ | |
103 | * is different to /[a\-z]/). | |
104 | */ | |
105 | void atomPatternCharacter(UChar ch, bool hyphenIsRange = false) | |
106 | { | |
107 | switch (m_state) { | |
108 | case AfterCharacterClass: | |
109 | // Following a builtin character class we need look out for a hyphen. | |
110 | // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/. | |
111 | // If we see a hyphen following a charater class then unlike usual | |
112 | // we'll report it to the delegate immediately, and put ourself into | |
113 | // a poisoned state. Any following calls to add another character or | |
114 | // character class will result in an error. (A hypen following a | |
115 | // character-class is itself valid, but only at the end of a regex). | |
116 | if (hyphenIsRange && ch == '-') { | |
117 | m_delegate.atomCharacterClassAtom('-'); | |
118 | m_state = AfterCharacterClassHyphen; | |
119 | return; | |
120 | } | |
121 | // Otherwise just fall through - cached character so treat this as Empty. | |
122 | ||
123 | case Empty: | |
124 | m_character = ch; | |
125 | m_state = CachedCharacter; | |
126 | return; | |
127 | ||
128 | case CachedCharacter: | |
129 | if (hyphenIsRange && ch == '-') | |
130 | m_state = CachedCharacterHyphen; | |
131 | else { | |
132 | m_delegate.atomCharacterClassAtom(m_character); | |
133 | m_character = ch; | |
134 | } | |
135 | return; | |
136 | ||
137 | case CachedCharacterHyphen: | |
138 | if (ch < m_character) { | |
139 | m_err = CharacterClassOutOfOrder; | |
140 | return; | |
141 | } | |
142 | m_delegate.atomCharacterClassRange(m_character, ch); | |
143 | m_state = Empty; | |
144 | return; | |
145 | ||
146 | // See coment in atomBuiltInCharacterClass below. | |
147 | // This too is technically an error, per ECMA-262, and again we | |
148 | // we chose to allow this. Note a subtlely here that while we | |
149 | // diverge from the spec's definition of CharacterRange we do | |
150 | // remain in compliance with the grammar. For example, consider | |
151 | // the expression /[\d-a-z]/. We comply with the grammar in | |
152 | // this case by not allowing a-z to be matched as a range. | |
153 | case AfterCharacterClassHyphen: | |
154 | m_delegate.atomCharacterClassAtom(ch); | |
155 | m_state = Empty; | |
156 | return; | |
157 | } | |
158 | } | |
159 | ||
160 | /* | |
161 | * atomBuiltInCharacterClass(): | |
162 | * | |
163 | * Adds a built-in character class, called by parseEscape(). | |
164 | */ | |
165 | void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) | |
166 | { | |
167 | switch (m_state) { | |
168 | case CachedCharacter: | |
169 | // Flush the currently cached character, then fall through. | |
170 | m_delegate.atomCharacterClassAtom(m_character); | |
171 | ||
172 | case Empty: | |
173 | case AfterCharacterClass: | |
174 | m_state = AfterCharacterClass; | |
175 | m_delegate.atomCharacterClassBuiltIn(classID, invert); | |
176 | return; | |
177 | ||
178 | // If we hit either of these cases, we have an invalid range that | |
179 | // looks something like /[x-\d]/ or /[\d-\d]/. | |
180 | // According to ECMA-262 this should be a syntax error, but | |
181 | // empirical testing shows this to break teh webz. Instead we | |
182 | // comply with to the ECMA-262 grammar, and assume the grammar to | |
183 | // have matched the range correctly, but tweak our interpretation | |
184 | // of CharacterRange. Effectively we implicitly handle the hyphen | |
185 | // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/. | |
186 | case CachedCharacterHyphen: | |
187 | m_delegate.atomCharacterClassAtom(m_character); | |
188 | m_delegate.atomCharacterClassAtom('-'); | |
189 | // fall through | |
190 | case AfterCharacterClassHyphen: | |
191 | m_delegate.atomCharacterClassBuiltIn(classID, invert); | |
192 | m_state = Empty; | |
193 | return; | |
194 | } | |
195 | } | |
196 | ||
197 | /* | |
198 | * end(): | |
199 | * | |
200 | * Called at end of construction. | |
201 | */ | |
202 | void end() | |
203 | { | |
204 | if (m_state == CachedCharacter) | |
205 | m_delegate.atomCharacterClassAtom(m_character); | |
206 | else if (m_state == CachedCharacterHyphen) { | |
207 | m_delegate.atomCharacterClassAtom(m_character); | |
208 | m_delegate.atomCharacterClassAtom('-'); | |
209 | } | |
210 | m_delegate.atomCharacterClassEnd(); | |
211 | } | |
212 | ||
213 | // parseEscape() should never call these delegate methods when | |
214 | // invoked with inCharacterClass set. | |
215 | NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { RELEASE_ASSERT_NOT_REACHED(); } | |
216 | NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { RELEASE_ASSERT_NOT_REACHED(); } | |
217 | ||
218 | private: | |
219 | Delegate& m_delegate; | |
220 | ErrorCode& m_err; | |
221 | enum CharacterClassConstructionState { | |
222 | Empty, | |
223 | CachedCharacter, | |
224 | CachedCharacterHyphen, | |
225 | AfterCharacterClass, | |
226 | AfterCharacterClassHyphen, | |
227 | } m_state; | |
228 | UChar m_character; | |
229 | }; | |
230 | ||
231 | Parser(Delegate& delegate, const String& pattern, unsigned backReferenceLimit) | |
232 | : m_delegate(delegate) | |
233 | , m_backReferenceLimit(backReferenceLimit) | |
234 | , m_err(NoError) | |
235 | , m_data(pattern.getCharacters<CharType>()) | |
236 | , m_size(pattern.length()) | |
237 | , m_index(0) | |
238 | , m_parenthesesNestingDepth(0) | |
239 | { | |
240 | } | |
241 | ||
242 | /* | |
243 | * parseEscape(): | |
244 | * | |
245 | * Helper for parseTokens() AND parseCharacterClass(). | |
246 | * Unlike the other parser methods, this function does not report tokens | |
247 | * directly to the member delegate (m_delegate), instead tokens are | |
248 | * emitted to the delegate provided as an argument. In the case of atom | |
249 | * escapes, parseTokens() will call parseEscape() passing m_delegate as | |
250 | * an argument, and as such the escape will be reported to the delegate. | |
251 | * | |
252 | * However this method may also be used by parseCharacterClass(), in which | |
253 | * case a CharacterClassParserDelegate will be passed as the delegate that | |
254 | * tokens should be added to. A boolean flag is also provided to indicate | |
255 | * whether that an escape in a CharacterClass is being parsed (some parsing | |
256 | * rules change in this context). | |
257 | * | |
258 | * The boolean value returned by this method indicates whether the token | |
259 | * parsed was an atom (outside of a characted class \b and \B will be | |
260 | * interpreted as assertions). | |
261 | */ | |
262 | template<bool inCharacterClass, class EscapeDelegate> | |
263 | bool parseEscape(EscapeDelegate& delegate) | |
264 | { | |
265 | ASSERT(!m_err); | |
266 | ASSERT(peek() == '\\'); | |
267 | consume(); | |
268 | ||
269 | if (atEndOfPattern()) { | |
270 | m_err = EscapeUnterminated; | |
271 | return false; | |
272 | } | |
273 | ||
274 | switch (peek()) { | |
275 | // Assertions | |
276 | case 'b': | |
277 | consume(); | |
278 | if (inCharacterClass) | |
279 | delegate.atomPatternCharacter('\b'); | |
280 | else { | |
281 | delegate.assertionWordBoundary(false); | |
282 | return false; | |
283 | } | |
284 | break; | |
285 | case 'B': | |
286 | consume(); | |
287 | if (inCharacterClass) | |
288 | delegate.atomPatternCharacter('B'); | |
289 | else { | |
290 | delegate.assertionWordBoundary(true); | |
291 | return false; | |
292 | } | |
293 | break; | |
294 | ||
295 | // CharacterClassEscape | |
296 | case 'd': | |
297 | consume(); | |
298 | delegate.atomBuiltInCharacterClass(DigitClassID, false); | |
299 | break; | |
300 | case 's': | |
301 | consume(); | |
302 | delegate.atomBuiltInCharacterClass(SpaceClassID, false); | |
303 | break; | |
304 | case 'w': | |
305 | consume(); | |
306 | delegate.atomBuiltInCharacterClass(WordClassID, false); | |
307 | break; | |
308 | case 'D': | |
309 | consume(); | |
310 | delegate.atomBuiltInCharacterClass(DigitClassID, true); | |
311 | break; | |
312 | case 'S': | |
313 | consume(); | |
314 | delegate.atomBuiltInCharacterClass(SpaceClassID, true); | |
315 | break; | |
316 | case 'W': | |
317 | consume(); | |
318 | delegate.atomBuiltInCharacterClass(WordClassID, true); | |
319 | break; | |
320 | ||
321 | // DecimalEscape | |
322 | case '1': | |
323 | case '2': | |
324 | case '3': | |
325 | case '4': | |
326 | case '5': | |
327 | case '6': | |
328 | case '7': | |
329 | case '8': | |
330 | case '9': { | |
331 | // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape. | |
332 | // First, try to parse this as backreference. | |
333 | if (!inCharacterClass) { | |
334 | ParseState state = saveState(); | |
335 | ||
336 | unsigned backReference = consumeNumber(); | |
337 | if (backReference <= m_backReferenceLimit) { | |
338 | delegate.atomBackReference(backReference); | |
339 | break; | |
340 | } | |
341 | ||
342 | restoreState(state); | |
343 | } | |
344 | ||
345 | // Not a backreference, and not octal. | |
346 | if (peek() >= '8') { | |
347 | delegate.atomPatternCharacter('\\'); | |
348 | break; | |
349 | } | |
350 | ||
351 | // Fall-through to handle this as an octal escape. | |
352 | } | |
353 | ||
354 | // Octal escape | |
355 | case '0': | |
356 | delegate.atomPatternCharacter(consumeOctal()); | |
357 | break; | |
358 | ||
359 | // ControlEscape | |
360 | case 'f': | |
361 | consume(); | |
362 | delegate.atomPatternCharacter('\f'); | |
363 | break; | |
364 | case 'n': | |
365 | consume(); | |
366 | delegate.atomPatternCharacter('\n'); | |
367 | break; | |
368 | case 'r': | |
369 | consume(); | |
370 | delegate.atomPatternCharacter('\r'); | |
371 | break; | |
372 | case 't': | |
373 | consume(); | |
374 | delegate.atomPatternCharacter('\t'); | |
375 | break; | |
376 | case 'v': | |
377 | consume(); | |
378 | delegate.atomPatternCharacter('\v'); | |
379 | break; | |
380 | ||
381 | // ControlLetter | |
382 | case 'c': { | |
383 | ParseState state = saveState(); | |
384 | consume(); | |
385 | if (!atEndOfPattern()) { | |
386 | int control = consume(); | |
387 | ||
388 | // To match Firefox, inside a character class, we also accept numbers and '_' as control characters. | |
389 | if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) { | |
390 | delegate.atomPatternCharacter(control & 0x1f); | |
391 | break; | |
392 | } | |
393 | } | |
394 | restoreState(state); | |
395 | delegate.atomPatternCharacter('\\'); | |
396 | break; | |
397 | } | |
398 | ||
399 | // HexEscape | |
400 | case 'x': { | |
401 | consume(); | |
402 | int x = tryConsumeHex(2); | |
403 | if (x == -1) | |
404 | delegate.atomPatternCharacter('x'); | |
405 | else | |
406 | delegate.atomPatternCharacter(x); | |
407 | break; | |
408 | } | |
409 | ||
410 | // UnicodeEscape | |
411 | case 'u': { | |
412 | consume(); | |
413 | int u = tryConsumeHex(4); | |
414 | if (u == -1) | |
415 | delegate.atomPatternCharacter('u'); | |
416 | else | |
417 | delegate.atomPatternCharacter(u); | |
418 | break; | |
419 | } | |
420 | ||
421 | // IdentityEscape | |
422 | default: | |
423 | delegate.atomPatternCharacter(consume()); | |
424 | } | |
425 | ||
426 | return true; | |
427 | } | |
428 | ||
429 | /* | |
430 | * parseAtomEscape(), parseCharacterClassEscape(): | |
431 | * | |
432 | * These methods alias to parseEscape(). | |
433 | */ | |
434 | bool parseAtomEscape() | |
435 | { | |
436 | return parseEscape<false>(m_delegate); | |
437 | } | |
438 | void parseCharacterClassEscape(CharacterClassParserDelegate& delegate) | |
439 | { | |
440 | parseEscape<true>(delegate); | |
441 | } | |
442 | ||
443 | /* | |
444 | * parseCharacterClass(): | |
445 | * | |
446 | * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape) | |
447 | * to an instance of CharacterClassParserDelegate, to describe the character class to the | |
448 | * delegate. | |
449 | */ | |
450 | void parseCharacterClass() | |
451 | { | |
452 | ASSERT(!m_err); | |
453 | ASSERT(peek() == '['); | |
454 | consume(); | |
455 | ||
456 | CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err); | |
457 | ||
458 | characterClassConstructor.begin(tryConsume('^')); | |
459 | ||
460 | while (!atEndOfPattern()) { | |
461 | switch (peek()) { | |
462 | case ']': | |
463 | consume(); | |
464 | characterClassConstructor.end(); | |
465 | return; | |
466 | ||
467 | case '\\': | |
468 | parseCharacterClassEscape(characterClassConstructor); | |
469 | break; | |
470 | ||
471 | default: | |
472 | characterClassConstructor.atomPatternCharacter(consume(), true); | |
473 | } | |
474 | ||
475 | if (m_err) | |
476 | return; | |
477 | } | |
478 | ||
479 | m_err = CharacterClassUnmatched; | |
480 | } | |
481 | ||
482 | /* | |
483 | * parseParenthesesBegin(): | |
484 | * | |
485 | * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns. | |
486 | */ | |
487 | void parseParenthesesBegin() | |
488 | { | |
489 | ASSERT(!m_err); | |
490 | ASSERT(peek() == '('); | |
491 | consume(); | |
492 | ||
493 | if (tryConsume('?')) { | |
494 | if (atEndOfPattern()) { | |
495 | m_err = ParenthesesTypeInvalid; | |
496 | return; | |
497 | } | |
498 | ||
499 | switch (consume()) { | |
500 | case ':': | |
501 | m_delegate.atomParenthesesSubpatternBegin(false); | |
502 | break; | |
503 | ||
504 | case '=': | |
505 | m_delegate.atomParentheticalAssertionBegin(); | |
506 | break; | |
507 | ||
508 | case '!': | |
509 | m_delegate.atomParentheticalAssertionBegin(true); | |
510 | break; | |
511 | ||
512 | default: | |
513 | m_err = ParenthesesTypeInvalid; | |
514 | } | |
515 | } else | |
516 | m_delegate.atomParenthesesSubpatternBegin(); | |
517 | ||
518 | ++m_parenthesesNestingDepth; | |
519 | } | |
520 | ||
521 | /* | |
522 | * parseParenthesesEnd(): | |
523 | * | |
524 | * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses). | |
525 | */ | |
526 | void parseParenthesesEnd() | |
527 | { | |
528 | ASSERT(!m_err); | |
529 | ASSERT(peek() == ')'); | |
530 | consume(); | |
531 | ||
532 | if (m_parenthesesNestingDepth > 0) | |
533 | m_delegate.atomParenthesesEnd(); | |
534 | else | |
535 | m_err = ParenthesesUnmatched; | |
536 | ||
537 | --m_parenthesesNestingDepth; | |
538 | } | |
539 | ||
540 | /* | |
541 | * parseQuantifier(): | |
542 | * | |
543 | * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers. | |
544 | */ | |
545 | void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) | |
546 | { | |
547 | ASSERT(!m_err); | |
548 | ASSERT(min <= max); | |
549 | ||
550 | if (min == UINT_MAX) { | |
551 | m_err = QuantifierTooLarge; | |
552 | return; | |
553 | } | |
554 | ||
555 | if (lastTokenWasAnAtom) | |
556 | m_delegate.quantifyAtom(min, max, !tryConsume('?')); | |
557 | else | |
558 | m_err = QuantifierWithoutAtom; | |
559 | } | |
560 | ||
561 | /* | |
562 | * parseTokens(): | |
563 | * | |
564 | * This method loops over the input pattern reporting tokens to the delegate. | |
565 | * The method returns when a parse error is detected, or the end of the pattern | |
566 | * is reached. One piece of state is tracked around the loop, which is whether | |
567 | * the last token passed to the delegate was an atom (this is necessary to detect | |
568 | * a parse error when a quantifier provided without an atom to quantify). | |
569 | */ | |
570 | void parseTokens() | |
571 | { | |
572 | bool lastTokenWasAnAtom = false; | |
573 | ||
574 | while (!atEndOfPattern()) { | |
575 | switch (peek()) { | |
576 | case '|': | |
577 | consume(); | |
578 | m_delegate.disjunction(); | |
579 | lastTokenWasAnAtom = false; | |
580 | break; | |
581 | ||
582 | case '(': | |
583 | parseParenthesesBegin(); | |
584 | lastTokenWasAnAtom = false; | |
585 | break; | |
586 | ||
587 | case ')': | |
588 | parseParenthesesEnd(); | |
589 | lastTokenWasAnAtom = true; | |
590 | break; | |
591 | ||
592 | case '^': | |
593 | consume(); | |
594 | m_delegate.assertionBOL(); | |
595 | lastTokenWasAnAtom = false; | |
596 | break; | |
597 | ||
598 | case '$': | |
599 | consume(); | |
600 | m_delegate.assertionEOL(); | |
601 | lastTokenWasAnAtom = false; | |
602 | break; | |
603 | ||
604 | case '.': | |
605 | consume(); | |
606 | m_delegate.atomBuiltInCharacterClass(NewlineClassID, true); | |
607 | lastTokenWasAnAtom = true; | |
608 | break; | |
609 | ||
610 | case '[': | |
611 | parseCharacterClass(); | |
612 | lastTokenWasAnAtom = true; | |
613 | break; | |
614 | ||
615 | case '\\': | |
616 | lastTokenWasAnAtom = parseAtomEscape(); | |
617 | break; | |
618 | ||
619 | case '*': | |
620 | consume(); | |
621 | parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite); | |
622 | lastTokenWasAnAtom = false; | |
623 | break; | |
624 | ||
625 | case '+': | |
626 | consume(); | |
627 | parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite); | |
628 | lastTokenWasAnAtom = false; | |
629 | break; | |
630 | ||
631 | case '?': | |
632 | consume(); | |
633 | parseQuantifier(lastTokenWasAnAtom, 0, 1); | |
634 | lastTokenWasAnAtom = false; | |
635 | break; | |
636 | ||
637 | case '{': { | |
638 | ParseState state = saveState(); | |
639 | ||
640 | consume(); | |
641 | if (peekIsDigit()) { | |
642 | unsigned min = consumeNumber(); | |
643 | unsigned max = min; | |
644 | ||
645 | if (tryConsume(',')) | |
646 | max = peekIsDigit() ? consumeNumber() : quantifyInfinite; | |
647 | ||
648 | if (tryConsume('}')) { | |
649 | if (min <= max) | |
650 | parseQuantifier(lastTokenWasAnAtom, min, max); | |
651 | else | |
652 | m_err = QuantifierOutOfOrder; | |
653 | lastTokenWasAnAtom = false; | |
654 | break; | |
655 | } | |
656 | } | |
657 | ||
658 | restoreState(state); | |
659 | } // if we did not find a complete quantifer, fall through to the default case. | |
660 | ||
661 | default: | |
662 | m_delegate.atomPatternCharacter(consume()); | |
663 | lastTokenWasAnAtom = true; | |
664 | } | |
665 | ||
666 | if (m_err) | |
667 | return; | |
668 | } | |
669 | ||
670 | if (m_parenthesesNestingDepth > 0) | |
671 | m_err = MissingParentheses; | |
672 | } | |
673 | ||
674 | /* | |
675 | * parse(): | |
676 | * | |
677 | * This method calls parseTokens() to parse over the input and converts any | |
678 | * error code to a const char* for a result. | |
679 | */ | |
680 | const char* parse() | |
681 | { | |
682 | if (m_size > MAX_PATTERN_SIZE) | |
683 | m_err = PatternTooLarge; | |
684 | else | |
685 | parseTokens(); | |
686 | ASSERT(atEndOfPattern() || m_err); | |
687 | ||
688 | // The order of this array must match the ErrorCode enum. | |
689 | static const char* errorMessages[NumberOfErrorCodes] = { | |
690 | 0, // NoError | |
691 | REGEXP_ERROR_PREFIX "regular expression too large", | |
692 | REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier", | |
693 | REGEXP_ERROR_PREFIX "nothing to repeat", | |
694 | REGEXP_ERROR_PREFIX "number too large in {} quantifier", | |
695 | REGEXP_ERROR_PREFIX "missing )", | |
696 | REGEXP_ERROR_PREFIX "unmatched parentheses", | |
697 | REGEXP_ERROR_PREFIX "unrecognized character after (?", | |
698 | REGEXP_ERROR_PREFIX "missing terminating ] for character class", | |
699 | REGEXP_ERROR_PREFIX "range out of order in character class", | |
700 | REGEXP_ERROR_PREFIX "\\ at end of pattern" | |
701 | }; | |
702 | ||
703 | return errorMessages[m_err]; | |
704 | } | |
705 | ||
706 | // Misc helper functions: | |
707 | ||
708 | typedef unsigned ParseState; | |
709 | ||
710 | ParseState saveState() | |
711 | { | |
712 | return m_index; | |
713 | } | |
714 | ||
715 | void restoreState(ParseState state) | |
716 | { | |
717 | m_index = state; | |
718 | } | |
719 | ||
720 | bool atEndOfPattern() | |
721 | { | |
722 | ASSERT(m_index <= m_size); | |
723 | return m_index == m_size; | |
724 | } | |
725 | ||
726 | int peek() | |
727 | { | |
728 | ASSERT(m_index < m_size); | |
729 | return m_data[m_index]; | |
730 | } | |
731 | ||
732 | bool peekIsDigit() | |
733 | { | |
734 | return !atEndOfPattern() && WTF::isASCIIDigit(peek()); | |
735 | } | |
736 | ||
737 | unsigned peekDigit() | |
738 | { | |
739 | ASSERT(peekIsDigit()); | |
740 | return peek() - '0'; | |
741 | } | |
742 | ||
743 | int consume() | |
744 | { | |
745 | ASSERT(m_index < m_size); | |
746 | return m_data[m_index++]; | |
747 | } | |
748 | ||
749 | unsigned consumeDigit() | |
750 | { | |
751 | ASSERT(peekIsDigit()); | |
752 | return consume() - '0'; | |
753 | } | |
754 | ||
755 | unsigned consumeNumber() | |
756 | { | |
757 | unsigned n = consumeDigit(); | |
758 | // check for overflow. | |
759 | for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) { | |
760 | n = newValue; | |
761 | consume(); | |
762 | } | |
763 | return n; | |
764 | } | |
765 | ||
766 | unsigned consumeOctal() | |
767 | { | |
768 | ASSERT(WTF::isASCIIOctalDigit(peek())); | |
769 | ||
770 | unsigned n = consumeDigit(); | |
771 | while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek())) | |
772 | n = n * 8 + consumeDigit(); | |
773 | return n; | |
774 | } | |
775 | ||
776 | bool tryConsume(UChar ch) | |
777 | { | |
778 | if (atEndOfPattern() || (m_data[m_index] != ch)) | |
779 | return false; | |
780 | ++m_index; | |
781 | return true; | |
782 | } | |
783 | ||
784 | int tryConsumeHex(int count) | |
785 | { | |
786 | ParseState state = saveState(); | |
787 | ||
788 | int n = 0; | |
789 | while (count--) { | |
790 | if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) { | |
791 | restoreState(state); | |
792 | return -1; | |
793 | } | |
794 | n = (n << 4) | WTF::toASCIIHexValue(consume()); | |
795 | } | |
796 | return n; | |
797 | } | |
798 | ||
799 | Delegate& m_delegate; | |
800 | unsigned m_backReferenceLimit; | |
801 | ErrorCode m_err; | |
802 | const CharType* m_data; | |
803 | unsigned m_size; | |
804 | unsigned m_index; | |
805 | unsigned m_parenthesesNestingDepth; | |
806 | ||
807 | // Derived by empirical testing of compile time in PCRE and WREC. | |
808 | static const unsigned MAX_PATTERN_SIZE = 1024 * 1024; | |
809 | }; | |
810 | ||
811 | /* | |
812 | * Yarr::parse(): | |
813 | * | |
814 | * The parse method is passed a pattern to be parsed and a delegate upon which | |
815 | * callbacks will be made to record the parsed tokens forming the regex. | |
816 | * Yarr::parse() returns null on success, or a const C string providing an error | |
817 | * message where a parse error occurs. | |
818 | * | |
819 | * The Delegate must implement the following interface: | |
820 | * | |
821 | * void assertionBOL(); | |
822 | * void assertionEOL(); | |
823 | * void assertionWordBoundary(bool invert); | |
824 | * | |
825 | * void atomPatternCharacter(UChar ch); | |
826 | * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); | |
827 | * void atomCharacterClassBegin(bool invert) | |
828 | * void atomCharacterClassAtom(UChar ch) | |
829 | * void atomCharacterClassRange(UChar begin, UChar end) | |
830 | * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) | |
831 | * void atomCharacterClassEnd() | |
832 | * void atomParenthesesSubpatternBegin(bool capture = true); | |
833 | * void atomParentheticalAssertionBegin(bool invert = false); | |
834 | * void atomParenthesesEnd(); | |
835 | * void atomBackReference(unsigned subpatternId); | |
836 | * | |
837 | * void quantifyAtom(unsigned min, unsigned max, bool greedy); | |
838 | * | |
839 | * void disjunction(); | |
840 | * | |
841 | * The regular expression is described by a sequence of assertion*() and atom*() | |
842 | * callbacks to the delegate, describing the terms in the regular expression. | |
843 | * Following an atom a quantifyAtom() call may occur to indicate that the previous | |
844 | * atom should be quantified. In the case of atoms described across multiple | |
845 | * calls (parentheses and character classes) the call to quantifyAtom() will come | |
846 | * after the call to the atom*End() method, never after atom*Begin(). | |
847 | * | |
848 | * Character classes may either be described by a single call to | |
849 | * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls. | |
850 | * In the latter case, ...Begin() will be called, followed by a sequence of | |
851 | * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End(). | |
852 | * | |
853 | * Sequences of atoms and assertions are broken into alternatives via calls to | |
854 | * disjunction(). Assertions, atoms, and disjunctions emitted between calls to | |
855 | * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern. | |
856 | * atomParenthesesBegin() is passed a subpatternId. In the case of a regular | |
857 | * capturing subpattern, this will be the subpatternId associated with these | |
858 | * parentheses, and will also by definition be the lowest subpatternId of these | |
859 | * parentheses and of any nested paretheses. The atomParenthesesEnd() method | |
860 | * is passed the subpatternId of the last capturing subexpression nested within | |
861 | * these paretheses. In the case of a capturing subpattern with no nested | |
862 | * capturing subpatterns, the same subpatternId will be passed to the begin and | |
863 | * end functions. In the case of non-capturing subpatterns the subpatternId | |
864 | * passed to the begin method is also the first possible subpatternId that might | |
865 | * be nested within these paretheses. If a set of non-capturing parentheses does | |
866 | * not contain any capturing subpatterns, then the subpatternId passed to begin | |
867 | * will be greater than the subpatternId passed to end. | |
868 | */ | |
869 | ||
870 | template<class Delegate> | |
871 | const char* parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite) | |
872 | { | |
873 | if (pattern.is8Bit()) | |
874 | return Parser<Delegate, LChar>(delegate, pattern, backReferenceLimit).parse(); | |
875 | return Parser<Delegate, UChar>(delegate, pattern, backReferenceLimit).parse(); | |
876 | } | |
877 | ||
878 | } } // namespace JSC::Yarr | |
879 | ||
880 | #endif // YarrParser_h |