]>
Commit | Line | Data |
---|---|---|
ba379fdc A |
1 | /* |
2 | * Copyright (C) 2009 Apple Inc. All rights reserved. | |
3 | * | |
4 | * Redistribution and use in source and binary forms, with or without | |
5 | * modification, are permitted provided that the following conditions | |
6 | * are met: | |
7 | * 1. Redistributions of source code must retain the above copyright | |
8 | * notice, this list of conditions and the following disclaimer. | |
9 | * 2. Redistributions in binary form must reproduce the above copyright | |
10 | * notice, this list of conditions and the following disclaimer in the | |
11 | * documentation and/or other materials provided with the distribution. | |
12 | * | |
13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY | |
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR | |
17 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
18 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
20 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
21 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
23 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
24 | */ | |
25 | ||
26 | #ifndef RegexParser_h | |
27 | #define RegexParser_h | |
28 | ||
ba379fdc A |
29 | #if ENABLE(YARR) |
30 | ||
31 | #include <UString.h> | |
4e4e5a6f | 32 | #include <limits.h> |
ba379fdc A |
33 | #include <wtf/ASCIICType.h> |
34 | #include <wtf/unicode/Unicode.h> | |
ba379fdc A |
35 | |
36 | namespace JSC { namespace Yarr { | |
37 | ||
// Identifiers for the predefined character classes the parser can report.
// Each one is always reported together with a separate 'invert' flag.
enum BuiltInCharacterClassID {
    DigitClassID,   // reported for \d (and, inverted, for \D)
    SpaceClassID,   // reported for \s (and, inverted, for \S)
    WordClassID,    // reported for \w (and, inverted, for \W)
    NewlineClassID  // reported inverted for '.'
};
44 | ||
45 | // The Parser class should not be used directly - only via the Yarr::parse() method. | |
46 | template<class Delegate> | |
47 | class Parser { | |
48 | private: | |
49 | template<class FriendDelegate> | |
50 | friend const char* parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit); | |
51 | ||
// Parse failure reasons. The numeric value of each enumerator is used as an
// index into the errorMessages table in parse(), so the order here must stay
// in step with that table.
enum ErrorCode {
    NoError,                  // parsing succeeded; parse() returns null
    PatternTooLarge,          // "regular expression too large"
    QuantifierOutOfOrder,     // "numbers out of order in {} quantifier"
    QuantifierWithoutAtom,    // "nothing to repeat"
    MissingParentheses,       // "missing )"
    ParenthesesUnmatched,     // "unmatched parentheses"
    ParenthesesTypeInvalid,   // "unrecognized character after (?"
    CharacterClassUnmatched,  // "missing terminating ] for character class"
    CharacterClassOutOfOrder, // "range out of order in character class"
    EscapeUnterminated,       // "\ at end of pattern"
    NumberOfErrorCodes        // count of the codes above; sizes the message table
};
65 | ||
66 | /* | |
67 | * CharacterClassParserDelegate: | |
68 | * | |
69 | * The class CharacterClassParserDelegate is used in the parsing of character | |
70 | * classes. This class handles detection of character ranges. This class | |
71 | * implements enough of the delegate interface such that it can be passed to | |
72 | * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused | |
73 | * to perform the parsing of escape characters in character sets. | |
74 | */ | |
75 | class CharacterClassParserDelegate { | |
76 | public: | |
77 | CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) | |
78 | : m_delegate(delegate) | |
79 | , m_err(err) | |
80 | , m_state(empty) | |
81 | { | |
82 | } | |
83 | ||
84 | /* | |
85 | * begin(): | |
86 | * | |
87 | * Called at beginning of construction. | |
88 | */ | |
89 | void begin(bool invert) | |
90 | { | |
91 | m_delegate.atomCharacterClassBegin(invert); | |
92 | } | |
93 | ||
94 | /* | |
95 | * atomPatternCharacterUnescaped(): | |
96 | * | |
97 | * This method is called directly from parseCharacterClass(), to report a new | |
98 | * pattern character token. This method differs from atomPatternCharacter(), | |
99 | * which will be called from parseEscape(), since a hypen provided via this | |
100 | * method may be indicating a character range, but a hyphen parsed by | |
101 | * parseEscape() cannot be interpreted as doing so. | |
102 | */ | |
103 | void atomPatternCharacterUnescaped(UChar ch) | |
104 | { | |
105 | switch (m_state) { | |
106 | case empty: | |
107 | m_character = ch; | |
108 | m_state = cachedCharacter; | |
109 | break; | |
110 | ||
111 | case cachedCharacter: | |
112 | if (ch == '-') | |
113 | m_state = cachedCharacterHyphen; | |
114 | else { | |
115 | m_delegate.atomCharacterClassAtom(m_character); | |
116 | m_character = ch; | |
117 | } | |
118 | break; | |
119 | ||
120 | case cachedCharacterHyphen: | |
121 | if (ch >= m_character) | |
122 | m_delegate.atomCharacterClassRange(m_character, ch); | |
123 | else | |
124 | m_err = CharacterClassOutOfOrder; | |
125 | m_state = empty; | |
126 | } | |
127 | } | |
128 | ||
129 | /* | |
130 | * atomPatternCharacter(): | |
131 | * | |
132 | * Adds a pattern character, called by parseEscape(), as such will not | |
133 | * interpret a hyphen as indicating a character range. | |
134 | */ | |
135 | void atomPatternCharacter(UChar ch) | |
136 | { | |
137 | // Flush if a character is already pending to prevent the | |
138 | // hyphen from begin interpreted as indicating a range. | |
139 | if((ch == '-') && (m_state == cachedCharacter)) | |
140 | flush(); | |
141 | ||
142 | atomPatternCharacterUnescaped(ch); | |
143 | } | |
144 | ||
145 | /* | |
146 | * atomBuiltInCharacterClass(): | |
147 | * | |
148 | * Adds a built-in character class, called by parseEscape(). | |
149 | */ | |
150 | void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) | |
151 | { | |
152 | flush(); | |
153 | m_delegate.atomCharacterClassBuiltIn(classID, invert); | |
154 | } | |
155 | ||
156 | /* | |
157 | * end(): | |
158 | * | |
159 | * Called at end of construction. | |
160 | */ | |
161 | void end() | |
162 | { | |
163 | flush(); | |
164 | m_delegate.atomCharacterClassEnd(); | |
165 | } | |
166 | ||
167 | // parseEscape() should never call these delegate methods when | |
168 | // invoked with inCharacterClass set. | |
169 | void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); } | |
170 | void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); } | |
171 | ||
172 | private: | |
173 | void flush() | |
174 | { | |
175 | if (m_state != empty) // either cachedCharacter or cachedCharacterHyphen | |
176 | m_delegate.atomCharacterClassAtom(m_character); | |
177 | if (m_state == cachedCharacterHyphen) | |
178 | m_delegate.atomCharacterClassAtom('-'); | |
179 | m_state = empty; | |
180 | } | |
181 | ||
182 | Delegate& m_delegate; | |
183 | ErrorCode& m_err; | |
184 | enum CharacterClassConstructionState { | |
185 | empty, | |
186 | cachedCharacter, | |
187 | cachedCharacterHyphen, | |
188 | } m_state; | |
189 | UChar m_character; | |
190 | }; | |
191 | ||
192 | Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit) | |
193 | : m_delegate(delegate) | |
194 | , m_backReferenceLimit(backReferenceLimit) | |
195 | , m_err(NoError) | |
196 | , m_data(pattern.data()) | |
197 | , m_size(pattern.size()) | |
198 | , m_index(0) | |
199 | , m_parenthesesNestingDepth(0) | |
200 | { | |
201 | } | |
202 | ||
203 | /* | |
204 | * parseEscape(): | |
205 | * | |
206 | * Helper for parseTokens() AND parseCharacterClass(). | |
207 | * Unlike the other parser methods, this function does not report tokens | |
208 | * directly to the member delegate (m_delegate), instead tokens are | |
209 | * emitted to the delegate provided as an argument. In the case of atom | |
210 | * escapes, parseTokens() will call parseEscape() passing m_delegate as | |
211 | * an argument, and as such the escape will be reported to the delegate. | |
212 | * | |
213 | * However this method may also be used by parseCharacterClass(), in which | |
214 | * case a CharacterClassParserDelegate will be passed as the delegate that | |
215 | * tokens should be added to. A boolean flag is also provided to indicate | |
216 | * whether that an escape in a CharacterClass is being parsed (some parsing | |
217 | * rules change in this context). | |
218 | * | |
219 | * The boolean value returned by this method indicates whether the token | |
220 | * parsed was an atom (outside of a characted class \b and \B will be | |
221 | * interpreted as assertions). | |
222 | */ | |
223 | template<bool inCharacterClass, class EscapeDelegate> | |
224 | bool parseEscape(EscapeDelegate& delegate) | |
225 | { | |
226 | ASSERT(!m_err); | |
227 | ASSERT(peek() == '\\'); | |
228 | consume(); | |
229 | ||
230 | if (atEndOfPattern()) { | |
231 | m_err = EscapeUnterminated; | |
232 | return false; | |
233 | } | |
234 | ||
235 | switch (peek()) { | |
236 | // Assertions | |
237 | case 'b': | |
238 | consume(); | |
239 | if (inCharacterClass) | |
240 | delegate.atomPatternCharacter('\b'); | |
241 | else { | |
242 | delegate.assertionWordBoundary(false); | |
243 | return false; | |
244 | } | |
245 | break; | |
246 | case 'B': | |
247 | consume(); | |
248 | if (inCharacterClass) | |
249 | delegate.atomPatternCharacter('B'); | |
250 | else { | |
251 | delegate.assertionWordBoundary(true); | |
252 | return false; | |
253 | } | |
254 | break; | |
255 | ||
256 | // CharacterClassEscape | |
257 | case 'd': | |
258 | consume(); | |
259 | delegate.atomBuiltInCharacterClass(DigitClassID, false); | |
260 | break; | |
261 | case 's': | |
262 | consume(); | |
263 | delegate.atomBuiltInCharacterClass(SpaceClassID, false); | |
264 | break; | |
265 | case 'w': | |
266 | consume(); | |
267 | delegate.atomBuiltInCharacterClass(WordClassID, false); | |
268 | break; | |
269 | case 'D': | |
270 | consume(); | |
271 | delegate.atomBuiltInCharacterClass(DigitClassID, true); | |
272 | break; | |
273 | case 'S': | |
274 | consume(); | |
275 | delegate.atomBuiltInCharacterClass(SpaceClassID, true); | |
276 | break; | |
277 | case 'W': | |
278 | consume(); | |
279 | delegate.atomBuiltInCharacterClass(WordClassID, true); | |
280 | break; | |
281 | ||
282 | // DecimalEscape | |
283 | case '1': | |
284 | case '2': | |
285 | case '3': | |
286 | case '4': | |
287 | case '5': | |
288 | case '6': | |
289 | case '7': | |
290 | case '8': | |
291 | case '9': { | |
292 | // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape. | |
293 | // First, try to parse this as backreference. | |
294 | if (!inCharacterClass) { | |
295 | ParseState state = saveState(); | |
296 | ||
297 | unsigned backReference = consumeNumber(); | |
298 | if (backReference <= m_backReferenceLimit) { | |
299 | delegate.atomBackReference(backReference); | |
300 | break; | |
301 | } | |
302 | ||
303 | restoreState(state); | |
304 | } | |
305 | ||
306 | // Not a backreference, and not octal. | |
307 | if (peek() >= '8') { | |
308 | delegate.atomPatternCharacter('\\'); | |
309 | break; | |
310 | } | |
311 | ||
312 | // Fall-through to handle this as an octal escape. | |
313 | } | |
314 | ||
315 | // Octal escape | |
316 | case '0': | |
317 | delegate.atomPatternCharacter(consumeOctal()); | |
318 | break; | |
319 | ||
320 | // ControlEscape | |
321 | case 'f': | |
322 | consume(); | |
323 | delegate.atomPatternCharacter('\f'); | |
324 | break; | |
325 | case 'n': | |
326 | consume(); | |
327 | delegate.atomPatternCharacter('\n'); | |
328 | break; | |
329 | case 'r': | |
330 | consume(); | |
331 | delegate.atomPatternCharacter('\r'); | |
332 | break; | |
333 | case 't': | |
334 | consume(); | |
335 | delegate.atomPatternCharacter('\t'); | |
336 | break; | |
337 | case 'v': | |
338 | consume(); | |
339 | delegate.atomPatternCharacter('\v'); | |
340 | break; | |
341 | ||
342 | // ControlLetter | |
343 | case 'c': { | |
344 | ParseState state = saveState(); | |
345 | consume(); | |
346 | if (!atEndOfPattern()) { | |
347 | int control = consume(); | |
348 | ||
349 | // To match Firefox, inside a character class, we also accept numbers and '_' as control characters. | |
350 | if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) { | |
351 | delegate.atomPatternCharacter(control & 0x1f); | |
352 | break; | |
353 | } | |
354 | } | |
355 | restoreState(state); | |
356 | delegate.atomPatternCharacter('\\'); | |
357 | break; | |
358 | } | |
359 | ||
360 | // HexEscape | |
361 | case 'x': { | |
362 | consume(); | |
363 | int x = tryConsumeHex(2); | |
364 | if (x == -1) | |
365 | delegate.atomPatternCharacter('x'); | |
366 | else | |
367 | delegate.atomPatternCharacter(x); | |
368 | break; | |
369 | } | |
370 | ||
371 | // UnicodeEscape | |
372 | case 'u': { | |
373 | consume(); | |
374 | int u = tryConsumeHex(4); | |
375 | if (u == -1) | |
376 | delegate.atomPatternCharacter('u'); | |
377 | else | |
378 | delegate.atomPatternCharacter(u); | |
379 | break; | |
380 | } | |
381 | ||
382 | // IdentityEscape | |
383 | default: | |
384 | delegate.atomPatternCharacter(consume()); | |
385 | } | |
386 | ||
387 | return true; | |
388 | } | |
389 | ||
390 | /* | |
391 | * parseAtomEscape(), parseCharacterClassEscape(): | |
392 | * | |
393 | * These methods alias to parseEscape(). | |
394 | */ | |
395 | bool parseAtomEscape() | |
396 | { | |
397 | return parseEscape<false>(m_delegate); | |
398 | } | |
399 | void parseCharacterClassEscape(CharacterClassParserDelegate& delegate) | |
400 | { | |
401 | parseEscape<true>(delegate); | |
402 | } | |
403 | ||
404 | /* | |
405 | * parseCharacterClass(): | |
406 | * | |
407 | * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape) | |
408 | * to an instance of CharacterClassParserDelegate, to describe the character class to the | |
409 | * delegate. | |
410 | */ | |
411 | void parseCharacterClass() | |
412 | { | |
413 | ASSERT(!m_err); | |
414 | ASSERT(peek() == '['); | |
415 | consume(); | |
416 | ||
417 | CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err); | |
418 | ||
419 | characterClassConstructor.begin(tryConsume('^')); | |
420 | ||
421 | while (!atEndOfPattern()) { | |
422 | switch (peek()) { | |
423 | case ']': | |
424 | consume(); | |
425 | characterClassConstructor.end(); | |
426 | return; | |
427 | ||
428 | case '\\': | |
429 | parseCharacterClassEscape(characterClassConstructor); | |
430 | break; | |
431 | ||
432 | default: | |
433 | characterClassConstructor.atomPatternCharacterUnescaped(consume()); | |
434 | } | |
435 | ||
436 | if (m_err) | |
437 | return; | |
438 | } | |
439 | ||
440 | m_err = CharacterClassUnmatched; | |
441 | } | |
442 | ||
443 | /* | |
444 | * parseParenthesesBegin(): | |
445 | * | |
446 | * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns. | |
447 | */ | |
448 | void parseParenthesesBegin() | |
449 | { | |
450 | ASSERT(!m_err); | |
451 | ASSERT(peek() == '('); | |
452 | consume(); | |
453 | ||
454 | if (tryConsume('?')) { | |
455 | if (atEndOfPattern()) { | |
456 | m_err = ParenthesesTypeInvalid; | |
457 | return; | |
458 | } | |
459 | ||
460 | switch (consume()) { | |
461 | case ':': | |
462 | m_delegate.atomParenthesesSubpatternBegin(false); | |
463 | break; | |
464 | ||
465 | case '=': | |
466 | m_delegate.atomParentheticalAssertionBegin(); | |
467 | break; | |
468 | ||
469 | case '!': | |
470 | m_delegate.atomParentheticalAssertionBegin(true); | |
471 | break; | |
472 | ||
473 | default: | |
474 | m_err = ParenthesesTypeInvalid; | |
475 | } | |
476 | } else | |
477 | m_delegate.atomParenthesesSubpatternBegin(); | |
478 | ||
479 | ++m_parenthesesNestingDepth; | |
480 | } | |
481 | ||
482 | /* | |
483 | * parseParenthesesEnd(): | |
484 | * | |
485 | * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses). | |
486 | */ | |
487 | void parseParenthesesEnd() | |
488 | { | |
489 | ASSERT(!m_err); | |
490 | ASSERT(peek() == ')'); | |
491 | consume(); | |
492 | ||
493 | if (m_parenthesesNestingDepth > 0) | |
494 | m_delegate.atomParenthesesEnd(); | |
495 | else | |
496 | m_err = ParenthesesUnmatched; | |
497 | ||
498 | --m_parenthesesNestingDepth; | |
499 | } | |
500 | ||
501 | /* | |
502 | * parseQuantifier(): | |
503 | * | |
504 | * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers. | |
505 | */ | |
506 | void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) | |
507 | { | |
508 | ASSERT(!m_err); | |
509 | ASSERT(min <= max); | |
510 | ||
511 | if (lastTokenWasAnAtom) | |
512 | m_delegate.quantifyAtom(min, max, !tryConsume('?')); | |
513 | else | |
514 | m_err = QuantifierWithoutAtom; | |
515 | } | |
516 | ||
517 | /* | |
518 | * parseTokens(): | |
519 | * | |
520 | * This method loops over the input pattern reporting tokens to the delegate. | |
521 | * The method returns when a parse error is detected, or the end of the pattern | |
522 | * is reached. One piece of state is tracked around the loop, which is whether | |
523 | * the last token passed to the delegate was an atom (this is necessary to detect | |
524 | * a parse error when a quantifier provided without an atom to quantify). | |
525 | */ | |
526 | void parseTokens() | |
527 | { | |
528 | bool lastTokenWasAnAtom = false; | |
529 | ||
530 | while (!atEndOfPattern()) { | |
531 | switch (peek()) { | |
532 | case '|': | |
533 | consume(); | |
534 | m_delegate.disjunction(); | |
535 | lastTokenWasAnAtom = false; | |
536 | break; | |
537 | ||
538 | case '(': | |
539 | parseParenthesesBegin(); | |
540 | lastTokenWasAnAtom = false; | |
541 | break; | |
542 | ||
543 | case ')': | |
544 | parseParenthesesEnd(); | |
545 | lastTokenWasAnAtom = true; | |
546 | break; | |
547 | ||
548 | case '^': | |
549 | consume(); | |
550 | m_delegate.assertionBOL(); | |
551 | lastTokenWasAnAtom = false; | |
552 | break; | |
553 | ||
554 | case '$': | |
555 | consume(); | |
556 | m_delegate.assertionEOL(); | |
557 | lastTokenWasAnAtom = false; | |
558 | break; | |
559 | ||
560 | case '.': | |
561 | consume(); | |
562 | m_delegate.atomBuiltInCharacterClass(NewlineClassID, true); | |
563 | lastTokenWasAnAtom = true; | |
564 | break; | |
565 | ||
566 | case '[': | |
567 | parseCharacterClass(); | |
568 | lastTokenWasAnAtom = true; | |
569 | break; | |
570 | ||
571 | case '\\': | |
572 | lastTokenWasAnAtom = parseAtomEscape(); | |
573 | break; | |
574 | ||
575 | case '*': | |
576 | consume(); | |
577 | parseQuantifier(lastTokenWasAnAtom, 0, UINT_MAX); | |
578 | lastTokenWasAnAtom = false; | |
579 | break; | |
580 | ||
581 | case '+': | |
582 | consume(); | |
583 | parseQuantifier(lastTokenWasAnAtom, 1, UINT_MAX); | |
584 | lastTokenWasAnAtom = false; | |
585 | break; | |
586 | ||
587 | case '?': | |
588 | consume(); | |
589 | parseQuantifier(lastTokenWasAnAtom, 0, 1); | |
590 | lastTokenWasAnAtom = false; | |
591 | break; | |
592 | ||
593 | case '{': { | |
594 | ParseState state = saveState(); | |
595 | ||
596 | consume(); | |
597 | if (peekIsDigit()) { | |
598 | unsigned min = consumeNumber(); | |
599 | unsigned max = min; | |
600 | ||
601 | if (tryConsume(',')) | |
602 | max = peekIsDigit() ? consumeNumber() : UINT_MAX; | |
603 | ||
604 | if (tryConsume('}')) { | |
605 | if (min <= max) | |
606 | parseQuantifier(lastTokenWasAnAtom, min, max); | |
607 | else | |
608 | m_err = QuantifierOutOfOrder; | |
609 | lastTokenWasAnAtom = false; | |
610 | break; | |
611 | } | |
612 | } | |
613 | ||
614 | restoreState(state); | |
615 | } // if we did not find a complete quantifer, fall through to the default case. | |
616 | ||
617 | default: | |
618 | m_delegate.atomPatternCharacter(consume()); | |
619 | lastTokenWasAnAtom = true; | |
620 | } | |
621 | ||
622 | if (m_err) | |
623 | return; | |
624 | } | |
625 | ||
626 | if (m_parenthesesNestingDepth > 0) | |
627 | m_err = MissingParentheses; | |
628 | } | |
629 | ||
630 | /* | |
631 | * parse(): | |
632 | * | |
633 | * This method calls regexBegin(), calls parseTokens() to parse over the input | |
634 | * patterns, calls regexEnd() or regexError() as appropriate, and converts any | |
635 | * error code to a const char* for a result. | |
636 | */ | |
637 | const char* parse() | |
638 | { | |
639 | m_delegate.regexBegin(); | |
640 | ||
641 | if (m_size > MAX_PATTERN_SIZE) | |
642 | m_err = PatternTooLarge; | |
643 | else | |
644 | parseTokens(); | |
645 | ASSERT(atEndOfPattern() || m_err); | |
646 | ||
647 | if (m_err) | |
648 | m_delegate.regexError(); | |
649 | else | |
650 | m_delegate.regexEnd(); | |
651 | ||
652 | // The order of this array must match the ErrorCode enum. | |
653 | static const char* errorMessages[NumberOfErrorCodes] = { | |
654 | 0, // NoError | |
655 | "regular expression too large", | |
656 | "numbers out of order in {} quantifier", | |
657 | "nothing to repeat", | |
658 | "missing )", | |
659 | "unmatched parentheses", | |
660 | "unrecognized character after (?", | |
661 | "missing terminating ] for character class", | |
662 | "range out of order in character class", | |
663 | "\\ at end of pattern" | |
664 | }; | |
665 | ||
666 | return errorMessages[m_err]; | |
667 | } | |
668 | ||
669 | ||
670 | // Misc helper functions: | |
671 | ||
672 | typedef unsigned ParseState; | |
673 | ||
674 | ParseState saveState() | |
675 | { | |
676 | return m_index; | |
677 | } | |
678 | ||
679 | void restoreState(ParseState state) | |
680 | { | |
681 | m_index = state; | |
682 | } | |
683 | ||
684 | bool atEndOfPattern() | |
685 | { | |
686 | ASSERT(m_index <= m_size); | |
687 | return m_index == m_size; | |
688 | } | |
689 | ||
690 | int peek() | |
691 | { | |
692 | ASSERT(m_index < m_size); | |
693 | return m_data[m_index]; | |
694 | } | |
695 | ||
696 | bool peekIsDigit() | |
697 | { | |
698 | return !atEndOfPattern() && WTF::isASCIIDigit(peek()); | |
699 | } | |
700 | ||
701 | unsigned peekDigit() | |
702 | { | |
703 | ASSERT(peekIsDigit()); | |
704 | return peek() - '0'; | |
705 | } | |
706 | ||
707 | int consume() | |
708 | { | |
709 | ASSERT(m_index < m_size); | |
710 | return m_data[m_index++]; | |
711 | } | |
712 | ||
713 | unsigned consumeDigit() | |
714 | { | |
715 | ASSERT(peekIsDigit()); | |
716 | return consume() - '0'; | |
717 | } | |
718 | ||
719 | unsigned consumeNumber() | |
720 | { | |
721 | unsigned n = consumeDigit(); | |
722 | // check for overflow. | |
723 | for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) { | |
724 | n = newValue; | |
725 | consume(); | |
726 | } | |
727 | return n; | |
728 | } | |
729 | ||
730 | unsigned consumeOctal() | |
731 | { | |
732 | ASSERT(WTF::isASCIIOctalDigit(peek())); | |
733 | ||
734 | unsigned n = consumeDigit(); | |
735 | while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek())) | |
736 | n = n * 8 + consumeDigit(); | |
737 | return n; | |
738 | } | |
739 | ||
740 | bool tryConsume(UChar ch) | |
741 | { | |
742 | if (atEndOfPattern() || (m_data[m_index] != ch)) | |
743 | return false; | |
744 | ++m_index; | |
745 | return true; | |
746 | } | |
747 | ||
748 | int tryConsumeHex(int count) | |
749 | { | |
750 | ParseState state = saveState(); | |
751 | ||
752 | int n = 0; | |
753 | while (count--) { | |
754 | if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) { | |
755 | restoreState(state); | |
756 | return -1; | |
757 | } | |
758 | n = (n << 4) | WTF::toASCIIHexValue(consume()); | |
759 | } | |
760 | return n; | |
761 | } | |
762 | ||
763 | Delegate& m_delegate; | |
764 | unsigned m_backReferenceLimit; | |
765 | ErrorCode m_err; | |
766 | const UChar* m_data; | |
767 | unsigned m_size; | |
768 | unsigned m_index; | |
769 | unsigned m_parenthesesNestingDepth; | |
770 | ||
771 | // Derived by empirical testing of compile time in PCRE and WREC. | |
772 | static const unsigned MAX_PATTERN_SIZE = 1024 * 1024; | |
773 | }; | |
774 | ||
775 | /* | |
776 | * Yarr::parse(): | |
777 | * | |
778 | * The parse method is passed a pattern to be parsed and a delegate upon which | |
779 | * callbacks will be made to record the parsed tokens forming the regex. | |
780 | * Yarr::parse() returns null on success, or a const C string providing an error | |
781 | * message where a parse error occurs. | |
782 | * | |
783 | * The Delegate must implement the following interface: | |
784 | * | |
785 | * void assertionBOL(); | |
786 | * void assertionEOL(); | |
787 | * void assertionWordBoundary(bool invert); | |
788 | * | |
789 | * void atomPatternCharacter(UChar ch); | |
790 | * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); | |
791 | * void atomCharacterClassBegin(bool invert) | |
792 | * void atomCharacterClassAtom(UChar ch) | |
793 | * void atomCharacterClassRange(UChar begin, UChar end) | |
794 | * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) | |
795 | * void atomCharacterClassEnd() | |
796 | * void atomParenthesesSubpatternBegin(bool capture = true); | |
797 | * void atomParentheticalAssertionBegin(bool invert = false); | |
798 | * void atomParenthesesEnd(); | |
799 | * void atomBackReference(unsigned subpatternId); | |
800 | * | |
801 | * void quantifyAtom(unsigned min, unsigned max, bool greedy); | |
802 | * | |
803 | * void disjunction(); | |
804 | * | |
805 | * void regexBegin(); | |
806 | * void regexEnd(); | |
807 | * void regexError(); | |
808 | * | |
809 | * Before any call recording tokens are made, regexBegin() will be called on the | |
810 | * delegate once. Once parsing is complete either regexEnd() or regexError() will | |
811 | * be called, as appropriate. | |
812 | * | |
813 | * The regular expression is described by a sequence of assertion*() and atom*() | |
814 | * callbacks to the delegate, describing the terms in the regular expression. | |
815 | * Following an atom a quantifyAtom() call may occur to indicate that the previous | |
816 | * atom should be quantified. In the case of atoms described across multiple | |
817 | * calls (parentheses and character classes) the call to quantifyAtom() will come | |
818 | * after the call to the atom*End() method, never after atom*Begin(). | |
819 | * | |
820 | * Character classes may either be described by a single call to | |
821 | * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls. | |
822 | * In the latter case, ...Begin() will be called, followed by a sequence of | |
823 | * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End(). | |
824 | * | |
825 | * Sequences of atoms and assertions are broken into alternatives via calls to | |
826 | * disjunction(). Assertions, atoms, and disjunctions emitted between calls to | |
827 | * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern. | |
828 | * atomParenthesesBegin() is passed a subpatternId. In the case of a regular | |
829 | * capturing subpattern, this will be the subpatternId associated with these | |
830 | * parentheses, and will also by definition be the lowest subpatternId of these | |
831 | * parentheses and of any nested parentheses. The atomParenthesesEnd() method | |
832 | * is passed the subpatternId of the last capturing subexpression nested within | |
833 | * these parentheses. In the case of a capturing subpattern with no nested | |
834 | * capturing subpatterns, the same subpatternId will be passed to the begin and | |
835 | * end functions. In the case of non-capturing subpatterns the subpatternId | |
836 | * passed to the begin method is also the first possible subpatternId that might | |
837 | * be nested within these parentheses. If a set of non-capturing parentheses does | |
838 | * not contain any capturing subpatterns, then the subpatternId passed to begin | |
839 | * will be greater than the subpatternId passed to end. | |
840 | */ | |
841 | ||
842 | template<class Delegate> | |
843 | const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = UINT_MAX) | |
844 | { | |
845 | return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse(); | |
846 | } | |
847 | ||
848 | } } // namespace JSC::Yarr | |
849 | ||
850 | #endif | |
851 | ||
852 | #endif // RegexParser_h |