]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | * Copyright (C) 2009 Apple Inc. All rights reserved. | |
3 | * | |
4 | * Redistribution and use in source and binary forms, with or without | |
5 | * modification, are permitted provided that the following conditions | |
6 | * are met: | |
7 | * 1. Redistributions of source code must retain the above copyright | |
8 | * notice, this list of conditions and the following disclaimer. | |
9 | * 2. Redistributions in binary form must reproduce the above copyright | |
10 | * notice, this list of conditions and the following disclaimer in the | |
11 | * documentation and/or other materials provided with the distribution. | |
12 | * | |
13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY | |
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR | |
17 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
18 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
20 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
21 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
23 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
24 | */ | |
25 | ||
26 | #ifndef YarrParser_h | |
27 | #define YarrParser_h | |
28 | ||
29 | #include <runtime/UString.h> | |
30 | #include "Yarr.h" | |
31 | #include <wtf/ASCIICType.h> | |
32 | #include <wtf/unicode/Unicode.h> | |
33 | ||
34 | namespace JSC { namespace Yarr { | |
35 | ||
36 | #define REGEXP_ERROR_PREFIX "Invalid regular expression: " | |
37 | ||
// Identifies the built-in character classes that the parser reports to a
// delegate via atomBuiltInCharacterClass() / atomCharacterClassBuiltIn().
enum BuiltInCharacterClassID {
    DigitClassID,   // reported for \d and (inverted) \D
    SpaceClassID,   // reported for \s and (inverted) \S
    WordClassID,    // reported for \w and (inverted) \W
    NewlineClassID  // reported (inverted) for '.'
};
44 | ||
45 | // The Parser class should not be used directly - only via the Yarr::parse() method. | |
46 | template<class Delegate> | |
47 | class Parser { | |
48 | private: | |
49 | template<class FriendDelegate> | |
50 | friend const char* parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit); | |
51 | ||
// Reasons a parse can fail. The enumerator order must stay in step with the
// errorMessages table in parse(), which is indexed by these values.
enum ErrorCode {
    NoError,                  // parsing succeeded (so far)
    PatternTooLarge,          // pattern is longer than MAX_PATTERN_SIZE
    QuantifierOutOfOrder,     // {min,max} with min > max
    QuantifierWithoutAtom,    // quantifier with nothing preceding it to repeat
    MissingParentheses,       // end of pattern reached with '(' still open
    ParenthesesUnmatched,     // ')' seen with no '(' open
    ParenthesesTypeInvalid,   // "(?" not followed by ':', '=' or '!'
    CharacterClassUnmatched,  // '[' without a terminating ']'
    CharacterClassOutOfOrder, // range whose end is below its start
    EscapeUnterminated,       // '\' as the final character of the pattern
    NumberOfErrorCodes        // count of the codes above; sizes the message table
};
65 | ||
66 | /* | |
67 | * CharacterClassParserDelegate: | |
68 | * | |
69 | * The class CharacterClassParserDelegate is used in the parsing of character | |
70 | * classes. This class handles detection of character ranges. This class | |
71 | * implements enough of the delegate interface such that it can be passed to | |
72 | * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused | |
73 | * to perform the parsing of escape characters in character sets. | |
74 | */ | |
75 | class CharacterClassParserDelegate { | |
76 | public: | |
77 | CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) | |
78 | : m_delegate(delegate) | |
79 | , m_err(err) | |
80 | , m_state(Empty) | |
81 | , m_character(0) | |
82 | { | |
83 | } | |
84 | ||
85 | /* | |
86 | * begin(): | |
87 | * | |
88 | * Called at beginning of construction. | |
89 | */ | |
90 | void begin(bool invert) | |
91 | { | |
92 | m_delegate.atomCharacterClassBegin(invert); | |
93 | } | |
94 | ||
95 | /* | |
96 | * atomPatternCharacter(): | |
97 | * | |
98 | * This method is called either from parseCharacterClass() (for an unescaped | |
99 | * character in a character class), or from parseEscape(). In the former case | |
100 | * the value true will be passed for the argument 'hyphenIsRange', and in this | |
101 | * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/ | |
102 | * is different to /[a\-z]/). | |
103 | */ | |
104 | void atomPatternCharacter(UChar ch, bool hyphenIsRange = false) | |
105 | { | |
106 | switch (m_state) { | |
107 | case AfterCharacterClass: | |
108 | // Following a builtin character class we need look out for a hyphen. | |
109 | // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/. | |
110 | // If we see a hyphen following a charater class then unlike usual | |
111 | // we'll report it to the delegate immediately, and put ourself into | |
112 | // a poisoned state. Any following calls to add another character or | |
113 | // character class will result in an error. (A hypen following a | |
114 | // character-class is itself valid, but only at the end of a regex). | |
115 | if (hyphenIsRange && ch == '-') { | |
116 | m_delegate.atomCharacterClassAtom('-'); | |
117 | m_state = AfterCharacterClassHyphen; | |
118 | return; | |
119 | } | |
120 | // Otherwise just fall through - cached character so treat this as Empty. | |
121 | ||
122 | case Empty: | |
123 | m_character = ch; | |
124 | m_state = CachedCharacter; | |
125 | return; | |
126 | ||
127 | case CachedCharacter: | |
128 | if (hyphenIsRange && ch == '-') | |
129 | m_state = CachedCharacterHyphen; | |
130 | else { | |
131 | m_delegate.atomCharacterClassAtom(m_character); | |
132 | m_character = ch; | |
133 | } | |
134 | return; | |
135 | ||
136 | case CachedCharacterHyphen: | |
137 | if (ch < m_character) { | |
138 | m_err = CharacterClassOutOfOrder; | |
139 | return; | |
140 | } | |
141 | m_delegate.atomCharacterClassRange(m_character, ch); | |
142 | m_state = Empty; | |
143 | return; | |
144 | ||
145 | // See coment in atomBuiltInCharacterClass below. | |
146 | // This too is technically an error, per ECMA-262, and again we | |
147 | // we chose to allow this. Note a subtlely here that while we | |
148 | // diverge from the spec's definition of CharacterRange we do | |
149 | // remain in compliance with the grammar. For example, consider | |
150 | // the expression /[\d-a-z]/. We comply with the grammar in | |
151 | // this case by not allowing a-z to be matched as a range. | |
152 | case AfterCharacterClassHyphen: | |
153 | m_delegate.atomCharacterClassAtom(ch); | |
154 | m_state = Empty; | |
155 | return; | |
156 | } | |
157 | } | |
158 | ||
159 | /* | |
160 | * atomBuiltInCharacterClass(): | |
161 | * | |
162 | * Adds a built-in character class, called by parseEscape(). | |
163 | */ | |
164 | void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) | |
165 | { | |
166 | switch (m_state) { | |
167 | case CachedCharacter: | |
168 | // Flush the currently cached character, then fall through. | |
169 | m_delegate.atomCharacterClassAtom(m_character); | |
170 | ||
171 | case Empty: | |
172 | case AfterCharacterClass: | |
173 | m_state = AfterCharacterClass; | |
174 | m_delegate.atomCharacterClassBuiltIn(classID, invert); | |
175 | return; | |
176 | ||
177 | // If we hit either of these cases, we have an invalid range that | |
178 | // looks something like /[x-\d]/ or /[\d-\d]/. | |
179 | // According to ECMA-262 this should be a syntax error, but | |
180 | // empirical testing shows this to break teh webz. Instead we | |
181 | // comply with to the ECMA-262 grammar, and assume the grammar to | |
182 | // have matched the range correctly, but tweak our interpretation | |
183 | // of CharacterRange. Effectively we implicitly handle the hyphen | |
184 | // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/. | |
185 | case CachedCharacterHyphen: | |
186 | m_delegate.atomCharacterClassAtom(m_character); | |
187 | m_delegate.atomCharacterClassAtom('-'); | |
188 | // fall through | |
189 | case AfterCharacterClassHyphen: | |
190 | m_delegate.atomCharacterClassBuiltIn(classID, invert); | |
191 | m_state = Empty; | |
192 | return; | |
193 | } | |
194 | } | |
195 | ||
196 | /* | |
197 | * end(): | |
198 | * | |
199 | * Called at end of construction. | |
200 | */ | |
201 | void end() | |
202 | { | |
203 | if (m_state == CachedCharacter) | |
204 | m_delegate.atomCharacterClassAtom(m_character); | |
205 | else if (m_state == CachedCharacterHyphen) { | |
206 | m_delegate.atomCharacterClassAtom(m_character); | |
207 | m_delegate.atomCharacterClassAtom('-'); | |
208 | } | |
209 | m_delegate.atomCharacterClassEnd(); | |
210 | } | |
211 | ||
212 | // parseEscape() should never call these delegate methods when | |
213 | // invoked with inCharacterClass set. | |
214 | void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); } | |
215 | void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); } | |
216 | ||
217 | private: | |
218 | Delegate& m_delegate; | |
219 | ErrorCode& m_err; | |
220 | enum CharacterClassConstructionState { | |
221 | Empty, | |
222 | CachedCharacter, | |
223 | CachedCharacterHyphen, | |
224 | AfterCharacterClass, | |
225 | AfterCharacterClassHyphen, | |
226 | } m_state; | |
227 | UChar m_character; | |
228 | }; | |
229 | ||
230 | Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit) | |
231 | : m_delegate(delegate) | |
232 | , m_backReferenceLimit(backReferenceLimit) | |
233 | , m_err(NoError) | |
234 | , m_data(pattern.characters()) | |
235 | , m_size(pattern.length()) | |
236 | , m_index(0) | |
237 | , m_parenthesesNestingDepth(0) | |
238 | { | |
239 | } | |
240 | ||
241 | /* | |
242 | * parseEscape(): | |
243 | * | |
244 | * Helper for parseTokens() AND parseCharacterClass(). | |
245 | * Unlike the other parser methods, this function does not report tokens | |
246 | * directly to the member delegate (m_delegate), instead tokens are | |
247 | * emitted to the delegate provided as an argument. In the case of atom | |
248 | * escapes, parseTokens() will call parseEscape() passing m_delegate as | |
249 | * an argument, and as such the escape will be reported to the delegate. | |
250 | * | |
251 | * However this method may also be used by parseCharacterClass(), in which | |
252 | * case a CharacterClassParserDelegate will be passed as the delegate that | |
253 | * tokens should be added to. A boolean flag is also provided to indicate | |
254 | * whether that an escape in a CharacterClass is being parsed (some parsing | |
255 | * rules change in this context). | |
256 | * | |
257 | * The boolean value returned by this method indicates whether the token | |
258 | * parsed was an atom (outside of a characted class \b and \B will be | |
259 | * interpreted as assertions). | |
260 | */ | |
261 | template<bool inCharacterClass, class EscapeDelegate> | |
262 | bool parseEscape(EscapeDelegate& delegate) | |
263 | { | |
264 | ASSERT(!m_err); | |
265 | ASSERT(peek() == '\\'); | |
266 | consume(); | |
267 | ||
268 | if (atEndOfPattern()) { | |
269 | m_err = EscapeUnterminated; | |
270 | return false; | |
271 | } | |
272 | ||
273 | switch (peek()) { | |
274 | // Assertions | |
275 | case 'b': | |
276 | consume(); | |
277 | if (inCharacterClass) | |
278 | delegate.atomPatternCharacter('\b'); | |
279 | else { | |
280 | delegate.assertionWordBoundary(false); | |
281 | return false; | |
282 | } | |
283 | break; | |
284 | case 'B': | |
285 | consume(); | |
286 | if (inCharacterClass) | |
287 | delegate.atomPatternCharacter('B'); | |
288 | else { | |
289 | delegate.assertionWordBoundary(true); | |
290 | return false; | |
291 | } | |
292 | break; | |
293 | ||
294 | // CharacterClassEscape | |
295 | case 'd': | |
296 | consume(); | |
297 | delegate.atomBuiltInCharacterClass(DigitClassID, false); | |
298 | break; | |
299 | case 's': | |
300 | consume(); | |
301 | delegate.atomBuiltInCharacterClass(SpaceClassID, false); | |
302 | break; | |
303 | case 'w': | |
304 | consume(); | |
305 | delegate.atomBuiltInCharacterClass(WordClassID, false); | |
306 | break; | |
307 | case 'D': | |
308 | consume(); | |
309 | delegate.atomBuiltInCharacterClass(DigitClassID, true); | |
310 | break; | |
311 | case 'S': | |
312 | consume(); | |
313 | delegate.atomBuiltInCharacterClass(SpaceClassID, true); | |
314 | break; | |
315 | case 'W': | |
316 | consume(); | |
317 | delegate.atomBuiltInCharacterClass(WordClassID, true); | |
318 | break; | |
319 | ||
320 | // DecimalEscape | |
321 | case '1': | |
322 | case '2': | |
323 | case '3': | |
324 | case '4': | |
325 | case '5': | |
326 | case '6': | |
327 | case '7': | |
328 | case '8': | |
329 | case '9': { | |
330 | // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape. | |
331 | // First, try to parse this as backreference. | |
332 | if (!inCharacterClass) { | |
333 | ParseState state = saveState(); | |
334 | ||
335 | unsigned backReference = consumeNumber(); | |
336 | if (backReference <= m_backReferenceLimit) { | |
337 | delegate.atomBackReference(backReference); | |
338 | break; | |
339 | } | |
340 | ||
341 | restoreState(state); | |
342 | } | |
343 | ||
344 | // Not a backreference, and not octal. | |
345 | if (peek() >= '8') { | |
346 | delegate.atomPatternCharacter('\\'); | |
347 | break; | |
348 | } | |
349 | ||
350 | // Fall-through to handle this as an octal escape. | |
351 | } | |
352 | ||
353 | // Octal escape | |
354 | case '0': | |
355 | delegate.atomPatternCharacter(consumeOctal()); | |
356 | break; | |
357 | ||
358 | // ControlEscape | |
359 | case 'f': | |
360 | consume(); | |
361 | delegate.atomPatternCharacter('\f'); | |
362 | break; | |
363 | case 'n': | |
364 | consume(); | |
365 | delegate.atomPatternCharacter('\n'); | |
366 | break; | |
367 | case 'r': | |
368 | consume(); | |
369 | delegate.atomPatternCharacter('\r'); | |
370 | break; | |
371 | case 't': | |
372 | consume(); | |
373 | delegate.atomPatternCharacter('\t'); | |
374 | break; | |
375 | case 'v': | |
376 | consume(); | |
377 | delegate.atomPatternCharacter('\v'); | |
378 | break; | |
379 | ||
380 | // ControlLetter | |
381 | case 'c': { | |
382 | ParseState state = saveState(); | |
383 | consume(); | |
384 | if (!atEndOfPattern()) { | |
385 | int control = consume(); | |
386 | ||
387 | // To match Firefox, inside a character class, we also accept numbers and '_' as control characters. | |
388 | if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) { | |
389 | delegate.atomPatternCharacter(control & 0x1f); | |
390 | break; | |
391 | } | |
392 | } | |
393 | restoreState(state); | |
394 | delegate.atomPatternCharacter('\\'); | |
395 | break; | |
396 | } | |
397 | ||
398 | // HexEscape | |
399 | case 'x': { | |
400 | consume(); | |
401 | int x = tryConsumeHex(2); | |
402 | if (x == -1) | |
403 | delegate.atomPatternCharacter('x'); | |
404 | else | |
405 | delegate.atomPatternCharacter(x); | |
406 | break; | |
407 | } | |
408 | ||
409 | // UnicodeEscape | |
410 | case 'u': { | |
411 | consume(); | |
412 | int u = tryConsumeHex(4); | |
413 | if (u == -1) | |
414 | delegate.atomPatternCharacter('u'); | |
415 | else | |
416 | delegate.atomPatternCharacter(u); | |
417 | break; | |
418 | } | |
419 | ||
420 | // IdentityEscape | |
421 | default: | |
422 | delegate.atomPatternCharacter(consume()); | |
423 | } | |
424 | ||
425 | return true; | |
426 | } | |
427 | ||
428 | /* | |
429 | * parseAtomEscape(), parseCharacterClassEscape(): | |
430 | * | |
431 | * These methods alias to parseEscape(). | |
432 | */ | |
433 | bool parseAtomEscape() | |
434 | { | |
435 | return parseEscape<false>(m_delegate); | |
436 | } | |
437 | void parseCharacterClassEscape(CharacterClassParserDelegate& delegate) | |
438 | { | |
439 | parseEscape<true>(delegate); | |
440 | } | |
441 | ||
442 | /* | |
443 | * parseCharacterClass(): | |
444 | * | |
445 | * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape) | |
446 | * to an instance of CharacterClassParserDelegate, to describe the character class to the | |
447 | * delegate. | |
448 | */ | |
449 | void parseCharacterClass() | |
450 | { | |
451 | ASSERT(!m_err); | |
452 | ASSERT(peek() == '['); | |
453 | consume(); | |
454 | ||
455 | CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err); | |
456 | ||
457 | characterClassConstructor.begin(tryConsume('^')); | |
458 | ||
459 | while (!atEndOfPattern()) { | |
460 | switch (peek()) { | |
461 | case ']': | |
462 | consume(); | |
463 | characterClassConstructor.end(); | |
464 | return; | |
465 | ||
466 | case '\\': | |
467 | parseCharacterClassEscape(characterClassConstructor); | |
468 | break; | |
469 | ||
470 | default: | |
471 | characterClassConstructor.atomPatternCharacter(consume(), true); | |
472 | } | |
473 | ||
474 | if (m_err) | |
475 | return; | |
476 | } | |
477 | ||
478 | m_err = CharacterClassUnmatched; | |
479 | } | |
480 | ||
481 | /* | |
482 | * parseParenthesesBegin(): | |
483 | * | |
484 | * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns. | |
485 | */ | |
486 | void parseParenthesesBegin() | |
487 | { | |
488 | ASSERT(!m_err); | |
489 | ASSERT(peek() == '('); | |
490 | consume(); | |
491 | ||
492 | if (tryConsume('?')) { | |
493 | if (atEndOfPattern()) { | |
494 | m_err = ParenthesesTypeInvalid; | |
495 | return; | |
496 | } | |
497 | ||
498 | switch (consume()) { | |
499 | case ':': | |
500 | m_delegate.atomParenthesesSubpatternBegin(false); | |
501 | break; | |
502 | ||
503 | case '=': | |
504 | m_delegate.atomParentheticalAssertionBegin(); | |
505 | break; | |
506 | ||
507 | case '!': | |
508 | m_delegate.atomParentheticalAssertionBegin(true); | |
509 | break; | |
510 | ||
511 | default: | |
512 | m_err = ParenthesesTypeInvalid; | |
513 | } | |
514 | } else | |
515 | m_delegate.atomParenthesesSubpatternBegin(); | |
516 | ||
517 | ++m_parenthesesNestingDepth; | |
518 | } | |
519 | ||
520 | /* | |
521 | * parseParenthesesEnd(): | |
522 | * | |
523 | * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses). | |
524 | */ | |
525 | void parseParenthesesEnd() | |
526 | { | |
527 | ASSERT(!m_err); | |
528 | ASSERT(peek() == ')'); | |
529 | consume(); | |
530 | ||
531 | if (m_parenthesesNestingDepth > 0) | |
532 | m_delegate.atomParenthesesEnd(); | |
533 | else | |
534 | m_err = ParenthesesUnmatched; | |
535 | ||
536 | --m_parenthesesNestingDepth; | |
537 | } | |
538 | ||
539 | /* | |
540 | * parseQuantifier(): | |
541 | * | |
542 | * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers. | |
543 | */ | |
544 | void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) | |
545 | { | |
546 | ASSERT(!m_err); | |
547 | ASSERT(min <= max); | |
548 | ||
549 | if (lastTokenWasAnAtom) | |
550 | m_delegate.quantifyAtom(min, max, !tryConsume('?')); | |
551 | else | |
552 | m_err = QuantifierWithoutAtom; | |
553 | } | |
554 | ||
555 | /* | |
556 | * parseTokens(): | |
557 | * | |
558 | * This method loops over the input pattern reporting tokens to the delegate. | |
559 | * The method returns when a parse error is detected, or the end of the pattern | |
560 | * is reached. One piece of state is tracked around the loop, which is whether | |
561 | * the last token passed to the delegate was an atom (this is necessary to detect | |
562 | * a parse error when a quantifier provided without an atom to quantify). | |
563 | */ | |
564 | void parseTokens() | |
565 | { | |
566 | bool lastTokenWasAnAtom = false; | |
567 | ||
568 | while (!atEndOfPattern()) { | |
569 | switch (peek()) { | |
570 | case '|': | |
571 | consume(); | |
572 | m_delegate.disjunction(); | |
573 | lastTokenWasAnAtom = false; | |
574 | break; | |
575 | ||
576 | case '(': | |
577 | parseParenthesesBegin(); | |
578 | lastTokenWasAnAtom = false; | |
579 | break; | |
580 | ||
581 | case ')': | |
582 | parseParenthesesEnd(); | |
583 | lastTokenWasAnAtom = true; | |
584 | break; | |
585 | ||
586 | case '^': | |
587 | consume(); | |
588 | m_delegate.assertionBOL(); | |
589 | lastTokenWasAnAtom = false; | |
590 | break; | |
591 | ||
592 | case '$': | |
593 | consume(); | |
594 | m_delegate.assertionEOL(); | |
595 | lastTokenWasAnAtom = false; | |
596 | break; | |
597 | ||
598 | case '.': | |
599 | consume(); | |
600 | m_delegate.atomBuiltInCharacterClass(NewlineClassID, true); | |
601 | lastTokenWasAnAtom = true; | |
602 | break; | |
603 | ||
604 | case '[': | |
605 | parseCharacterClass(); | |
606 | lastTokenWasAnAtom = true; | |
607 | break; | |
608 | ||
609 | case '\\': | |
610 | lastTokenWasAnAtom = parseAtomEscape(); | |
611 | break; | |
612 | ||
613 | case '*': | |
614 | consume(); | |
615 | parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite); | |
616 | lastTokenWasAnAtom = false; | |
617 | break; | |
618 | ||
619 | case '+': | |
620 | consume(); | |
621 | parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite); | |
622 | lastTokenWasAnAtom = false; | |
623 | break; | |
624 | ||
625 | case '?': | |
626 | consume(); | |
627 | parseQuantifier(lastTokenWasAnAtom, 0, 1); | |
628 | lastTokenWasAnAtom = false; | |
629 | break; | |
630 | ||
631 | case '{': { | |
632 | ParseState state = saveState(); | |
633 | ||
634 | consume(); | |
635 | if (peekIsDigit()) { | |
636 | unsigned min = consumeNumber(); | |
637 | unsigned max = min; | |
638 | ||
639 | if (tryConsume(',')) | |
640 | max = peekIsDigit() ? consumeNumber() : quantifyInfinite; | |
641 | ||
642 | if (tryConsume('}')) { | |
643 | if (min <= max) | |
644 | parseQuantifier(lastTokenWasAnAtom, min, max); | |
645 | else | |
646 | m_err = QuantifierOutOfOrder; | |
647 | lastTokenWasAnAtom = false; | |
648 | break; | |
649 | } | |
650 | } | |
651 | ||
652 | restoreState(state); | |
653 | } // if we did not find a complete quantifer, fall through to the default case. | |
654 | ||
655 | default: | |
656 | m_delegate.atomPatternCharacter(consume()); | |
657 | lastTokenWasAnAtom = true; | |
658 | } | |
659 | ||
660 | if (m_err) | |
661 | return; | |
662 | } | |
663 | ||
664 | if (m_parenthesesNestingDepth > 0) | |
665 | m_err = MissingParentheses; | |
666 | } | |
667 | ||
668 | /* | |
669 | * parse(): | |
670 | * | |
671 | * This method calls parseTokens() to parse over the input and converts any | |
672 | * error code to a const char* for a result. | |
673 | */ | |
674 | const char* parse() | |
675 | { | |
676 | if (m_size > MAX_PATTERN_SIZE) | |
677 | m_err = PatternTooLarge; | |
678 | else | |
679 | parseTokens(); | |
680 | ASSERT(atEndOfPattern() || m_err); | |
681 | ||
682 | // The order of this array must match the ErrorCode enum. | |
683 | static const char* errorMessages[NumberOfErrorCodes] = { | |
684 | 0, // NoError | |
685 | REGEXP_ERROR_PREFIX "regular expression too large", | |
686 | REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier", | |
687 | REGEXP_ERROR_PREFIX "nothing to repeat", | |
688 | REGEXP_ERROR_PREFIX "missing )", | |
689 | REGEXP_ERROR_PREFIX "unmatched parentheses", | |
690 | REGEXP_ERROR_PREFIX "unrecognized character after (?", | |
691 | REGEXP_ERROR_PREFIX "missing terminating ] for character class", | |
692 | REGEXP_ERROR_PREFIX "range out of order in character class", | |
693 | REGEXP_ERROR_PREFIX "\\ at end of pattern" | |
694 | }; | |
695 | ||
696 | return errorMessages[m_err]; | |
697 | } | |
698 | ||
699 | ||
700 | // Misc helper functions: | |
701 | ||
702 | typedef unsigned ParseState; | |
703 | ||
704 | ParseState saveState() | |
705 | { | |
706 | return m_index; | |
707 | } | |
708 | ||
709 | void restoreState(ParseState state) | |
710 | { | |
711 | m_index = state; | |
712 | } | |
713 | ||
714 | bool atEndOfPattern() | |
715 | { | |
716 | ASSERT(m_index <= m_size); | |
717 | return m_index == m_size; | |
718 | } | |
719 | ||
720 | int peek() | |
721 | { | |
722 | ASSERT(m_index < m_size); | |
723 | return m_data[m_index]; | |
724 | } | |
725 | ||
726 | bool peekIsDigit() | |
727 | { | |
728 | return !atEndOfPattern() && WTF::isASCIIDigit(peek()); | |
729 | } | |
730 | ||
731 | unsigned peekDigit() | |
732 | { | |
733 | ASSERT(peekIsDigit()); | |
734 | return peek() - '0'; | |
735 | } | |
736 | ||
737 | int consume() | |
738 | { | |
739 | ASSERT(m_index < m_size); | |
740 | return m_data[m_index++]; | |
741 | } | |
742 | ||
743 | unsigned consumeDigit() | |
744 | { | |
745 | ASSERT(peekIsDigit()); | |
746 | return consume() - '0'; | |
747 | } | |
748 | ||
749 | unsigned consumeNumber() | |
750 | { | |
751 | unsigned n = consumeDigit(); | |
752 | // check for overflow. | |
753 | for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) { | |
754 | n = newValue; | |
755 | consume(); | |
756 | } | |
757 | return n; | |
758 | } | |
759 | ||
760 | unsigned consumeOctal() | |
761 | { | |
762 | ASSERT(WTF::isASCIIOctalDigit(peek())); | |
763 | ||
764 | unsigned n = consumeDigit(); | |
765 | while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek())) | |
766 | n = n * 8 + consumeDigit(); | |
767 | return n; | |
768 | } | |
769 | ||
770 | bool tryConsume(UChar ch) | |
771 | { | |
772 | if (atEndOfPattern() || (m_data[m_index] != ch)) | |
773 | return false; | |
774 | ++m_index; | |
775 | return true; | |
776 | } | |
777 | ||
778 | int tryConsumeHex(int count) | |
779 | { | |
780 | ParseState state = saveState(); | |
781 | ||
782 | int n = 0; | |
783 | while (count--) { | |
784 | if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) { | |
785 | restoreState(state); | |
786 | return -1; | |
787 | } | |
788 | n = (n << 4) | WTF::toASCIIHexValue(consume()); | |
789 | } | |
790 | return n; | |
791 | } | |
792 | ||
793 | Delegate& m_delegate; | |
794 | unsigned m_backReferenceLimit; | |
795 | ErrorCode m_err; | |
796 | const UChar* m_data; | |
797 | unsigned m_size; | |
798 | unsigned m_index; | |
799 | unsigned m_parenthesesNestingDepth; | |
800 | ||
801 | // Derived by empirical testing of compile time in PCRE and WREC. | |
802 | static const unsigned MAX_PATTERN_SIZE = 1024 * 1024; | |
803 | }; | |
804 | ||
805 | /* | |
806 | * Yarr::parse(): | |
807 | * | |
808 | * The parse method is passed a pattern to be parsed and a delegate upon which | |
809 | * callbacks will be made to record the parsed tokens forming the regex. | |
810 | * Yarr::parse() returns null on success, or a const C string providing an error | |
811 | * message where a parse error occurs. | |
812 | * | |
813 | * The Delegate must implement the following interface: | |
814 | * | |
815 | * void assertionBOL(); | |
816 | * void assertionEOL(); | |
817 | * void assertionWordBoundary(bool invert); | |
818 | * | |
819 | * void atomPatternCharacter(UChar ch); | |
820 | * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); | |
821 | * void atomCharacterClassBegin(bool invert) | |
822 | * void atomCharacterClassAtom(UChar ch) | |
823 | * void atomCharacterClassRange(UChar begin, UChar end) | |
824 | * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) | |
825 | * void atomCharacterClassEnd() | |
826 | * void atomParenthesesSubpatternBegin(bool capture = true); | |
827 | * void atomParentheticalAssertionBegin(bool invert = false); | |
828 | * void atomParenthesesEnd(); | |
829 | * void atomBackReference(unsigned subpatternId); | |
830 | * | |
831 | * void quantifyAtom(unsigned min, unsigned max, bool greedy); | |
832 | * | |
833 | * void disjunction(); | |
834 | * | |
835 | * The regular expression is described by a sequence of assertion*() and atom*() | |
836 | * callbacks to the delegate, describing the terms in the regular expression. | |
837 | * Following an atom a quantifyAtom() call may occur to indicate that the previous | |
838 | * atom should be quantified. In the case of atoms described across multiple | |
839 | * calls (parentheses and character classes) the call to quantifyAtom() will come | |
840 | * after the call to the atom*End() method, never after atom*Begin(). | |
841 | * | |
842 | * Character classes may either be described by a single call to | |
843 | * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls. | |
844 | * In the latter case, ...Begin() will be called, followed by a sequence of | |
845 | * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End(). | |
846 | * | |
847 | * Sequences of atoms and assertions are broken into alternatives via calls to | |
848 | * disjunction(). Assertions, atoms, and disjunctions emitted between calls to | |
849 | * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern. | |
850 | * atomParenthesesBegin() is passed a subpatternId. In the case of a regular | |
851 | * capturing subpattern, this will be the subpatternId associated with these | |
852 | * parentheses, and will also by definition be the lowest subpatternId of these | |
853 | * parentheses and of any nested parentheses. The atomParenthesesEnd() method | |
854 | * is passed the subpatternId of the last capturing subexpression nested within | |
855 | * these parentheses. In the case of a capturing subpattern with no nested | |
856 | * capturing subpatterns, the same subpatternId will be passed to the begin and | |
857 | * end functions. In the case of non-capturing subpatterns the subpatternId | |
858 | * passed to the begin method is also the first possible subpatternId that might | |
859 | * be nested within these parentheses. If a set of non-capturing parentheses does | |
860 | * not contain any capturing subpatterns, then the subpatternId passed to begin | |
861 | * will be greater than the subpatternId passed to end. | |
862 | */ | |
863 | ||
864 | template<class Delegate> | |
865 | const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = quantifyInfinite) | |
866 | { | |
867 | return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse(); | |
868 | } | |
869 | ||
870 | } } // namespace JSC::Yarr | |
871 | ||
872 | #endif // YarrParser_h |