]>
Commit | Line | Data |
---|---|---|
ba379fdc A |
1 | /* |
2 | * Copyright (C) 2009 Apple Inc. All rights reserved. | |
3 | * | |
4 | * Redistribution and use in source and binary forms, with or without | |
5 | * modification, are permitted provided that the following conditions | |
6 | * are met: | |
7 | * 1. Redistributions of source code must retain the above copyright | |
8 | * notice, this list of conditions and the following disclaimer. | |
9 | * 2. Redistributions in binary form must reproduce the above copyright | |
10 | * notice, this list of conditions and the following disclaimer in the | |
11 | * documentation and/or other materials provided with the distribution. | |
12 | * | |
13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY | |
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR | |
17 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
18 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
20 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
21 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
23 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
24 | */ | |
25 | ||
14957cd0 A |
26 | #ifndef YarrParser_h |
27 | #define YarrParser_h | |
ba379fdc | 28 | |
14957cd0 | 29 | #include "Yarr.h" |
ba379fdc | 30 | #include <wtf/ASCIICType.h> |
93a37866 | 31 | #include <wtf/text/WTFString.h> |
ba379fdc A |
32 | |
33 | namespace JSC { namespace Yarr { | |
34 | ||
14957cd0 A |
35 | #define REGEXP_ERROR_PREFIX "Invalid regular expression: " |
36 | ||
ba379fdc A |
// Identifies the predefined character classes the parser can report to a delegate.
enum BuiltInCharacterClassID {
    DigitClassID,   // reported for \d (and, inverted, \D)
    SpaceClassID,   // reported for \s (and, inverted, \S)
    WordClassID,    // reported for \w (and, inverted, \W)
    NewlineClassID, // reported inverted for '.'
};
43 | ||
44 | // The Parser class should not be used directly - only via the Yarr::parse() method. | |
6fe7ccc8 | 45 | template<class Delegate, typename CharType> |
ba379fdc A |
46 | class Parser { |
47 | private: | |
48 | template<class FriendDelegate> | |
93a37866 | 49 | friend const char* parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit); |
ba379fdc A |
50 | |
// Parse failure reasons. The enumerator order must match the errorMessages
// table in parse(), which is indexed by these values.
enum ErrorCode {
    NoError,                  // parsing succeeded
    PatternTooLarge,          // pattern longer than MAX_PATTERN_SIZE
    QuantifierOutOfOrder,     // {min,max} with min > max
    QuantifierWithoutAtom,    // quantifier with no preceding atom
    QuantifierTooLarge,       // quantifier minimum equal to UINT_MAX
    MissingParentheses,       // pattern ended with groups still open
    ParenthesesUnmatched,     // ')' with no open group
    ParenthesesTypeInvalid,   // unrecognized character after "(?"
    CharacterClassUnmatched,  // pattern ended inside [...]
    CharacterClassOutOfOrder, // range whose end precedes its start
    EscapeUnterminated,       // '\' as the last character
    NumberOfErrorCodes        // count of the above; sizes errorMessages
};
65 | ||
66 | /* | |
67 | * CharacterClassParserDelegate: | |
68 | * | |
69 | * The class CharacterClassParserDelegate is used in the parsing of character | |
70 | * classes. This class handles detection of character ranges. This class | |
71 | * implements enough of the delegate interface such that it can be passed to | |
72 | * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused | |
73 | * to perform the parsing of escape characters in character sets. | |
74 | */ | |
75 | class CharacterClassParserDelegate { | |
76 | public: | |
77 | CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) | |
78 | : m_delegate(delegate) | |
79 | , m_err(err) | |
14957cd0 A |
80 | , m_state(Empty) |
81 | , m_character(0) | |
ba379fdc A |
82 | { |
83 | } | |
84 | ||
85 | /* | |
86 | * begin(): | |
87 | * | |
88 | * Called at beginning of construction. | |
89 | */ | |
90 | void begin(bool invert) | |
91 | { | |
92 | m_delegate.atomCharacterClassBegin(invert); | |
93 | } | |
94 | ||
95 | /* | |
14957cd0 | 96 | * atomPatternCharacter(): |
ba379fdc | 97 | * |
14957cd0 A |
98 | * This method is called either from parseCharacterClass() (for an unescaped |
99 | * character in a character class), or from parseEscape(). In the former case | |
100 | * the value true will be passed for the argument 'hyphenIsRange', and in this | |
101 | * mode we will allow a hyphen to be treated as indicating a range (i.e. /[a-z]/ | |
102 | * is different to /[a\-z]/). | |
ba379fdc | 103 | */ |
14957cd0 | 104 | void atomPatternCharacter(UChar ch, bool hyphenIsRange = false) |
ba379fdc A |
105 | { |
106 | switch (m_state) { | |
14957cd0 A |
107 | case AfterCharacterClass: |
108 | // Following a builtin character class we need look out for a hyphen. | |
109 | // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/. | |
110 | // If we see a hyphen following a charater class then unlike usual | |
111 | // we'll report it to the delegate immediately, and put ourself into | |
112 | // a poisoned state. Any following calls to add another character or | |
113 | // character class will result in an error. (A hypen following a | |
114 | // character-class is itself valid, but only at the end of a regex). | |
115 | if (hyphenIsRange && ch == '-') { | |
116 | m_delegate.atomCharacterClassAtom('-'); | |
117 | m_state = AfterCharacterClassHyphen; | |
118 | return; | |
119 | } | |
120 | // Otherwise just fall through - cached character so treat this as Empty. | |
81345200 | 121 | FALLTHROUGH; |
14957cd0 A |
122 | |
123 | case Empty: | |
ba379fdc | 124 | m_character = ch; |
14957cd0 A |
125 | m_state = CachedCharacter; |
126 | return; | |
ba379fdc | 127 | |
14957cd0 A |
128 | case CachedCharacter: |
129 | if (hyphenIsRange && ch == '-') | |
130 | m_state = CachedCharacterHyphen; | |
ba379fdc A |
131 | else { |
132 | m_delegate.atomCharacterClassAtom(m_character); | |
133 | m_character = ch; | |
134 | } | |
14957cd0 | 135 | return; |
ba379fdc | 136 | |
14957cd0 A |
137 | case CachedCharacterHyphen: |
138 | if (ch < m_character) { | |
ba379fdc | 139 | m_err = CharacterClassOutOfOrder; |
14957cd0 A |
140 | return; |
141 | } | |
142 | m_delegate.atomCharacterClassRange(m_character, ch); | |
143 | m_state = Empty; | |
144 | return; | |
ba379fdc | 145 | |
14957cd0 A |
146 | // See coment in atomBuiltInCharacterClass below. |
147 | // This too is technically an error, per ECMA-262, and again we | |
148 | // we chose to allow this. Note a subtlely here that while we | |
149 | // diverge from the spec's definition of CharacterRange we do | |
150 | // remain in compliance with the grammar. For example, consider | |
151 | // the expression /[\d-a-z]/. We comply with the grammar in | |
152 | // this case by not allowing a-z to be matched as a range. | |
153 | case AfterCharacterClassHyphen: | |
154 | m_delegate.atomCharacterClassAtom(ch); | |
155 | m_state = Empty; | |
156 | return; | |
157 | } | |
ba379fdc A |
158 | } |
159 | ||
160 | /* | |
161 | * atomBuiltInCharacterClass(): | |
162 | * | |
163 | * Adds a built-in character class, called by parseEscape(). | |
164 | */ | |
165 | void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) | |
166 | { | |
14957cd0 A |
167 | switch (m_state) { |
168 | case CachedCharacter: | |
169 | // Flush the currently cached character, then fall through. | |
170 | m_delegate.atomCharacterClassAtom(m_character); | |
81345200 | 171 | FALLTHROUGH; |
14957cd0 A |
172 | case Empty: |
173 | case AfterCharacterClass: | |
174 | m_state = AfterCharacterClass; | |
175 | m_delegate.atomCharacterClassBuiltIn(classID, invert); | |
176 | return; | |
177 | ||
178 | // If we hit either of these cases, we have an invalid range that | |
179 | // looks something like /[x-\d]/ or /[\d-\d]/. | |
180 | // According to ECMA-262 this should be a syntax error, but | |
181 | // empirical testing shows this to break teh webz. Instead we | |
182 | // comply with to the ECMA-262 grammar, and assume the grammar to | |
183 | // have matched the range correctly, but tweak our interpretation | |
184 | // of CharacterRange. Effectively we implicitly handle the hyphen | |
185 | // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/. | |
186 | case CachedCharacterHyphen: | |
187 | m_delegate.atomCharacterClassAtom(m_character); | |
188 | m_delegate.atomCharacterClassAtom('-'); | |
81345200 | 189 | FALLTHROUGH; |
14957cd0 A |
190 | case AfterCharacterClassHyphen: |
191 | m_delegate.atomCharacterClassBuiltIn(classID, invert); | |
192 | m_state = Empty; | |
193 | return; | |
194 | } | |
ba379fdc A |
195 | } |
196 | ||
197 | /* | |
198 | * end(): | |
199 | * | |
200 | * Called at end of construction. | |
201 | */ | |
202 | void end() | |
203 | { | |
14957cd0 A |
204 | if (m_state == CachedCharacter) |
205 | m_delegate.atomCharacterClassAtom(m_character); | |
206 | else if (m_state == CachedCharacterHyphen) { | |
207 | m_delegate.atomCharacterClassAtom(m_character); | |
208 | m_delegate.atomCharacterClassAtom('-'); | |
209 | } | |
ba379fdc A |
210 | m_delegate.atomCharacterClassEnd(); |
211 | } | |
212 | ||
213 | // parseEscape() should never call these delegate methods when | |
214 | // invoked with inCharacterClass set. | |
93a37866 A |
215 | NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { RELEASE_ASSERT_NOT_REACHED(); } |
216 | NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { RELEASE_ASSERT_NOT_REACHED(); } | |
ba379fdc A |
217 | |
218 | private: | |
ba379fdc A |
219 | Delegate& m_delegate; |
220 | ErrorCode& m_err; | |
221 | enum CharacterClassConstructionState { | |
14957cd0 A |
222 | Empty, |
223 | CachedCharacter, | |
224 | CachedCharacterHyphen, | |
225 | AfterCharacterClass, | |
226 | AfterCharacterClassHyphen, | |
ba379fdc A |
227 | } m_state; |
228 | UChar m_character; | |
229 | }; | |
230 | ||
93a37866 | 231 | Parser(Delegate& delegate, const String& pattern, unsigned backReferenceLimit) |
ba379fdc A |
232 | : m_delegate(delegate) |
233 | , m_backReferenceLimit(backReferenceLimit) | |
234 | , m_err(NoError) | |
81345200 | 235 | , m_data(pattern.characters<CharType>()) |
14957cd0 | 236 | , m_size(pattern.length()) |
ba379fdc A |
237 | , m_index(0) |
238 | , m_parenthesesNestingDepth(0) | |
239 | { | |
240 | } | |
6fe7ccc8 | 241 | |
ba379fdc A |
242 | /* |
243 | * parseEscape(): | |
244 | * | |
245 | * Helper for parseTokens() AND parseCharacterClass(). | |
246 | * Unlike the other parser methods, this function does not report tokens | |
247 | * directly to the member delegate (m_delegate), instead tokens are | |
248 | * emitted to the delegate provided as an argument. In the case of atom | |
249 | * escapes, parseTokens() will call parseEscape() passing m_delegate as | |
250 | * an argument, and as such the escape will be reported to the delegate. | |
251 | * | |
252 | * However this method may also be used by parseCharacterClass(), in which | |
253 | * case a CharacterClassParserDelegate will be passed as the delegate that | |
254 | * tokens should be added to. A boolean flag is also provided to indicate | |
255 | * whether an escape in a CharacterClass is being parsed (some parsing | |
256 | * rules change in this context). | |
257 | * | |
258 | * The boolean value returned by this method indicates whether the token | |
259 | * parsed was an atom (outside of a character class \b and \B will be | |
260 | * interpreted as assertions). | |
261 | */ | |
262 | template<bool inCharacterClass, class EscapeDelegate> | |
263 | bool parseEscape(EscapeDelegate& delegate) | |
264 | { | |
265 | ASSERT(!m_err); | |
266 | ASSERT(peek() == '\\'); | |
267 | consume(); | |
268 | ||
269 | if (atEndOfPattern()) { | |
270 | m_err = EscapeUnterminated; | |
271 | return false; | |
272 | } | |
273 | ||
274 | switch (peek()) { | |
275 | // Assertions | |
276 | case 'b': | |
277 | consume(); | |
278 | if (inCharacterClass) | |
279 | delegate.atomPatternCharacter('\b'); | |
280 | else { | |
281 | delegate.assertionWordBoundary(false); | |
282 | return false; | |
283 | } | |
284 | break; | |
285 | case 'B': | |
286 | consume(); | |
287 | if (inCharacterClass) | |
288 | delegate.atomPatternCharacter('B'); | |
289 | else { | |
290 | delegate.assertionWordBoundary(true); | |
291 | return false; | |
292 | } | |
293 | break; | |
294 | ||
295 | // CharacterClassEscape | |
296 | case 'd': | |
297 | consume(); | |
298 | delegate.atomBuiltInCharacterClass(DigitClassID, false); | |
299 | break; | |
300 | case 's': | |
301 | consume(); | |
302 | delegate.atomBuiltInCharacterClass(SpaceClassID, false); | |
303 | break; | |
304 | case 'w': | |
305 | consume(); | |
306 | delegate.atomBuiltInCharacterClass(WordClassID, false); | |
307 | break; | |
308 | case 'D': | |
309 | consume(); | |
310 | delegate.atomBuiltInCharacterClass(DigitClassID, true); | |
311 | break; | |
312 | case 'S': | |
313 | consume(); | |
314 | delegate.atomBuiltInCharacterClass(SpaceClassID, true); | |
315 | break; | |
316 | case 'W': | |
317 | consume(); | |
318 | delegate.atomBuiltInCharacterClass(WordClassID, true); | |
319 | break; | |
320 | ||
321 | // DecimalEscape | |
322 | case '1': | |
323 | case '2': | |
324 | case '3': | |
325 | case '4': | |
326 | case '5': | |
327 | case '6': | |
328 | case '7': | |
329 | case '8': | |
330 | case '9': { | |
331 | // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape. | |
332 | // First, try to parse this as backreference. | |
333 | if (!inCharacterClass) { | |
334 | ParseState state = saveState(); | |
335 | ||
336 | unsigned backReference = consumeNumber(); | |
337 | if (backReference <= m_backReferenceLimit) { | |
338 | delegate.atomBackReference(backReference); | |
339 | break; | |
340 | } | |
341 | ||
342 | restoreState(state); | |
343 | } | |
344 | ||
345 | // Not a backreference, and not octal. | |
346 | if (peek() >= '8') { | |
347 | delegate.atomPatternCharacter('\\'); | |
348 | break; | |
349 | } | |
350 | ||
351 | // Fall-through to handle this as an octal escape. | |
81345200 | 352 | FALLTHROUGH; |
ba379fdc A |
353 | } |
354 | ||
355 | // Octal escape | |
356 | case '0': | |
357 | delegate.atomPatternCharacter(consumeOctal()); | |
358 | break; | |
359 | ||
360 | // ControlEscape | |
361 | case 'f': | |
362 | consume(); | |
363 | delegate.atomPatternCharacter('\f'); | |
364 | break; | |
365 | case 'n': | |
366 | consume(); | |
367 | delegate.atomPatternCharacter('\n'); | |
368 | break; | |
369 | case 'r': | |
370 | consume(); | |
371 | delegate.atomPatternCharacter('\r'); | |
372 | break; | |
373 | case 't': | |
374 | consume(); | |
375 | delegate.atomPatternCharacter('\t'); | |
376 | break; | |
377 | case 'v': | |
378 | consume(); | |
379 | delegate.atomPatternCharacter('\v'); | |
380 | break; | |
381 | ||
382 | // ControlLetter | |
383 | case 'c': { | |
384 | ParseState state = saveState(); | |
385 | consume(); | |
386 | if (!atEndOfPattern()) { | |
387 | int control = consume(); | |
388 | ||
389 | // To match Firefox, inside a character class, we also accept numbers and '_' as control characters. | |
390 | if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) { | |
391 | delegate.atomPatternCharacter(control & 0x1f); | |
392 | break; | |
393 | } | |
394 | } | |
395 | restoreState(state); | |
396 | delegate.atomPatternCharacter('\\'); | |
397 | break; | |
398 | } | |
399 | ||
400 | // HexEscape | |
401 | case 'x': { | |
402 | consume(); | |
403 | int x = tryConsumeHex(2); | |
404 | if (x == -1) | |
405 | delegate.atomPatternCharacter('x'); | |
406 | else | |
407 | delegate.atomPatternCharacter(x); | |
408 | break; | |
409 | } | |
410 | ||
411 | // UnicodeEscape | |
412 | case 'u': { | |
413 | consume(); | |
414 | int u = tryConsumeHex(4); | |
415 | if (u == -1) | |
416 | delegate.atomPatternCharacter('u'); | |
417 | else | |
418 | delegate.atomPatternCharacter(u); | |
419 | break; | |
420 | } | |
421 | ||
422 | // IdentityEscape | |
423 | default: | |
424 | delegate.atomPatternCharacter(consume()); | |
425 | } | |
426 | ||
427 | return true; | |
428 | } | |
429 | ||
430 | /* | |
431 | * parseAtomEscape(), parseCharacterClassEscape(): | |
432 | * | |
433 | * These methods alias to parseEscape(). | |
434 | */ | |
435 | bool parseAtomEscape() | |
436 | { | |
437 | return parseEscape<false>(m_delegate); | |
438 | } | |
439 | void parseCharacterClassEscape(CharacterClassParserDelegate& delegate) | |
440 | { | |
441 | parseEscape<true>(delegate); | |
442 | } | |
443 | ||
444 | /* | |
445 | * parseCharacterClass(): | |
446 | * | |
447 | * Helper for parseTokens(); calls directly and indirectly (via parseCharacterClassEscape) | |
448 | * to an instance of CharacterClassParserDelegate, to describe the character class to the | |
449 | * delegate. | |
450 | */ | |
451 | void parseCharacterClass() | |
452 | { | |
453 | ASSERT(!m_err); | |
454 | ASSERT(peek() == '['); | |
455 | consume(); | |
456 | ||
457 | CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err); | |
458 | ||
459 | characterClassConstructor.begin(tryConsume('^')); | |
460 | ||
461 | while (!atEndOfPattern()) { | |
462 | switch (peek()) { | |
463 | case ']': | |
464 | consume(); | |
465 | characterClassConstructor.end(); | |
466 | return; | |
467 | ||
468 | case '\\': | |
469 | parseCharacterClassEscape(characterClassConstructor); | |
470 | break; | |
471 | ||
472 | default: | |
14957cd0 | 473 | characterClassConstructor.atomPatternCharacter(consume(), true); |
ba379fdc A |
474 | } |
475 | ||
476 | if (m_err) | |
477 | return; | |
478 | } | |
479 | ||
480 | m_err = CharacterClassUnmatched; | |
481 | } | |
482 | ||
483 | /* | |
484 | * parseParenthesesBegin(): | |
485 | * | |
486 | * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns. | |
487 | */ | |
488 | void parseParenthesesBegin() | |
489 | { | |
490 | ASSERT(!m_err); | |
491 | ASSERT(peek() == '('); | |
492 | consume(); | |
493 | ||
494 | if (tryConsume('?')) { | |
495 | if (atEndOfPattern()) { | |
496 | m_err = ParenthesesTypeInvalid; | |
497 | return; | |
498 | } | |
499 | ||
500 | switch (consume()) { | |
501 | case ':': | |
502 | m_delegate.atomParenthesesSubpatternBegin(false); | |
503 | break; | |
504 | ||
505 | case '=': | |
506 | m_delegate.atomParentheticalAssertionBegin(); | |
507 | break; | |
508 | ||
509 | case '!': | |
510 | m_delegate.atomParentheticalAssertionBegin(true); | |
511 | break; | |
512 | ||
513 | default: | |
514 | m_err = ParenthesesTypeInvalid; | |
515 | } | |
516 | } else | |
517 | m_delegate.atomParenthesesSubpatternBegin(); | |
518 | ||
519 | ++m_parenthesesNestingDepth; | |
520 | } | |
521 | ||
522 | /* | |
523 | * parseParenthesesEnd(): | |
524 | * | |
525 | * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses). | |
526 | */ | |
527 | void parseParenthesesEnd() | |
528 | { | |
529 | ASSERT(!m_err); | |
530 | ASSERT(peek() == ')'); | |
531 | consume(); | |
532 | ||
533 | if (m_parenthesesNestingDepth > 0) | |
534 | m_delegate.atomParenthesesEnd(); | |
535 | else | |
536 | m_err = ParenthesesUnmatched; | |
537 | ||
538 | --m_parenthesesNestingDepth; | |
539 | } | |
540 | ||
541 | /* | |
542 | * parseQuantifier(): | |
543 | * | |
544 | * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers. | |
545 | */ | |
546 | void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) | |
547 | { | |
548 | ASSERT(!m_err); | |
549 | ASSERT(min <= max); | |
550 | ||
6fe7ccc8 A |
551 | if (min == UINT_MAX) { |
552 | m_err = QuantifierTooLarge; | |
553 | return; | |
554 | } | |
555 | ||
ba379fdc A |
556 | if (lastTokenWasAnAtom) |
557 | m_delegate.quantifyAtom(min, max, !tryConsume('?')); | |
558 | else | |
559 | m_err = QuantifierWithoutAtom; | |
560 | } | |
561 | ||
562 | /* | |
563 | * parseTokens(): | |
564 | * | |
565 | * This method loops over the input pattern reporting tokens to the delegate. | |
566 | * The method returns when a parse error is detected, or the end of the pattern | |
567 | * is reached. One piece of state is tracked around the loop, which is whether | |
568 | * the last token passed to the delegate was an atom (this is necessary to detect | |
569 | * a parse error when a quantifier provided without an atom to quantify). | |
570 | */ | |
571 | void parseTokens() | |
572 | { | |
573 | bool lastTokenWasAnAtom = false; | |
574 | ||
575 | while (!atEndOfPattern()) { | |
576 | switch (peek()) { | |
577 | case '|': | |
578 | consume(); | |
579 | m_delegate.disjunction(); | |
580 | lastTokenWasAnAtom = false; | |
581 | break; | |
582 | ||
583 | case '(': | |
584 | parseParenthesesBegin(); | |
585 | lastTokenWasAnAtom = false; | |
586 | break; | |
587 | ||
588 | case ')': | |
589 | parseParenthesesEnd(); | |
590 | lastTokenWasAnAtom = true; | |
591 | break; | |
592 | ||
593 | case '^': | |
594 | consume(); | |
595 | m_delegate.assertionBOL(); | |
596 | lastTokenWasAnAtom = false; | |
597 | break; | |
598 | ||
599 | case '$': | |
600 | consume(); | |
601 | m_delegate.assertionEOL(); | |
602 | lastTokenWasAnAtom = false; | |
603 | break; | |
604 | ||
605 | case '.': | |
606 | consume(); | |
607 | m_delegate.atomBuiltInCharacterClass(NewlineClassID, true); | |
608 | lastTokenWasAnAtom = true; | |
609 | break; | |
610 | ||
611 | case '[': | |
612 | parseCharacterClass(); | |
613 | lastTokenWasAnAtom = true; | |
614 | break; | |
615 | ||
616 | case '\\': | |
617 | lastTokenWasAnAtom = parseAtomEscape(); | |
618 | break; | |
619 | ||
620 | case '*': | |
621 | consume(); | |
14957cd0 | 622 | parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite); |
ba379fdc A |
623 | lastTokenWasAnAtom = false; |
624 | break; | |
625 | ||
626 | case '+': | |
627 | consume(); | |
14957cd0 | 628 | parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite); |
ba379fdc A |
629 | lastTokenWasAnAtom = false; |
630 | break; | |
631 | ||
632 | case '?': | |
633 | consume(); | |
634 | parseQuantifier(lastTokenWasAnAtom, 0, 1); | |
635 | lastTokenWasAnAtom = false; | |
636 | break; | |
637 | ||
638 | case '{': { | |
639 | ParseState state = saveState(); | |
640 | ||
641 | consume(); | |
642 | if (peekIsDigit()) { | |
643 | unsigned min = consumeNumber(); | |
644 | unsigned max = min; | |
645 | ||
646 | if (tryConsume(',')) | |
14957cd0 | 647 | max = peekIsDigit() ? consumeNumber() : quantifyInfinite; |
ba379fdc A |
648 | |
649 | if (tryConsume('}')) { | |
650 | if (min <= max) | |
651 | parseQuantifier(lastTokenWasAnAtom, min, max); | |
652 | else | |
653 | m_err = QuantifierOutOfOrder; | |
654 | lastTokenWasAnAtom = false; | |
655 | break; | |
656 | } | |
657 | } | |
658 | ||
659 | restoreState(state); | |
81345200 A |
660 | } |
661 | // if we did not find a complete quantifer, fall through to the default case. | |
662 | FALLTHROUGH; | |
ba379fdc A |
663 | |
664 | default: | |
665 | m_delegate.atomPatternCharacter(consume()); | |
666 | lastTokenWasAnAtom = true; | |
667 | } | |
668 | ||
669 | if (m_err) | |
670 | return; | |
671 | } | |
672 | ||
673 | if (m_parenthesesNestingDepth > 0) | |
674 | m_err = MissingParentheses; | |
675 | } | |
676 | ||
677 | /* | |
678 | * parse(): | |
679 | * | |
14957cd0 | 680 | * This method calls parseTokens() to parse over the input and converts any |
ba379fdc A |
681 | * error code to a const char* for a result. |
682 | */ | |
683 | const char* parse() | |
684 | { | |
ba379fdc A |
685 | if (m_size > MAX_PATTERN_SIZE) |
686 | m_err = PatternTooLarge; | |
687 | else | |
688 | parseTokens(); | |
689 | ASSERT(atEndOfPattern() || m_err); | |
690 | ||
ba379fdc A |
691 | // The order of this array must match the ErrorCode enum. |
692 | static const char* errorMessages[NumberOfErrorCodes] = { | |
693 | 0, // NoError | |
14957cd0 A |
694 | REGEXP_ERROR_PREFIX "regular expression too large", |
695 | REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier", | |
696 | REGEXP_ERROR_PREFIX "nothing to repeat", | |
6fe7ccc8 | 697 | REGEXP_ERROR_PREFIX "number too large in {} quantifier", |
14957cd0 A |
698 | REGEXP_ERROR_PREFIX "missing )", |
699 | REGEXP_ERROR_PREFIX "unmatched parentheses", | |
700 | REGEXP_ERROR_PREFIX "unrecognized character after (?", | |
701 | REGEXP_ERROR_PREFIX "missing terminating ] for character class", | |
702 | REGEXP_ERROR_PREFIX "range out of order in character class", | |
703 | REGEXP_ERROR_PREFIX "\\ at end of pattern" | |
ba379fdc A |
704 | }; |
705 | ||
706 | return errorMessages[m_err]; | |
707 | } | |
708 | ||
ba379fdc A |
709 | // Misc helper functions: |
710 | ||
711 | typedef unsigned ParseState; | |
712 | ||
713 | ParseState saveState() | |
714 | { | |
715 | return m_index; | |
716 | } | |
717 | ||
718 | void restoreState(ParseState state) | |
719 | { | |
720 | m_index = state; | |
721 | } | |
722 | ||
723 | bool atEndOfPattern() | |
724 | { | |
725 | ASSERT(m_index <= m_size); | |
726 | return m_index == m_size; | |
727 | } | |
728 | ||
729 | int peek() | |
730 | { | |
731 | ASSERT(m_index < m_size); | |
732 | return m_data[m_index]; | |
733 | } | |
734 | ||
735 | bool peekIsDigit() | |
736 | { | |
737 | return !atEndOfPattern() && WTF::isASCIIDigit(peek()); | |
738 | } | |
739 | ||
740 | unsigned peekDigit() | |
741 | { | |
742 | ASSERT(peekIsDigit()); | |
743 | return peek() - '0'; | |
744 | } | |
745 | ||
746 | int consume() | |
747 | { | |
748 | ASSERT(m_index < m_size); | |
749 | return m_data[m_index++]; | |
750 | } | |
751 | ||
752 | unsigned consumeDigit() | |
753 | { | |
754 | ASSERT(peekIsDigit()); | |
755 | return consume() - '0'; | |
756 | } | |
757 | ||
758 | unsigned consumeNumber() | |
759 | { | |
760 | unsigned n = consumeDigit(); | |
761 | // check for overflow. | |
762 | for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) { | |
763 | n = newValue; | |
764 | consume(); | |
765 | } | |
766 | return n; | |
767 | } | |
768 | ||
769 | unsigned consumeOctal() | |
770 | { | |
771 | ASSERT(WTF::isASCIIOctalDigit(peek())); | |
772 | ||
773 | unsigned n = consumeDigit(); | |
774 | while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek())) | |
775 | n = n * 8 + consumeDigit(); | |
776 | return n; | |
777 | } | |
778 | ||
779 | bool tryConsume(UChar ch) | |
780 | { | |
781 | if (atEndOfPattern() || (m_data[m_index] != ch)) | |
782 | return false; | |
783 | ++m_index; | |
784 | return true; | |
785 | } | |
786 | ||
787 | int tryConsumeHex(int count) | |
788 | { | |
789 | ParseState state = saveState(); | |
790 | ||
791 | int n = 0; | |
792 | while (count--) { | |
793 | if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) { | |
794 | restoreState(state); | |
795 | return -1; | |
796 | } | |
797 | n = (n << 4) | WTF::toASCIIHexValue(consume()); | |
798 | } | |
799 | return n; | |
800 | } | |
801 | ||
802 | Delegate& m_delegate; | |
803 | unsigned m_backReferenceLimit; | |
804 | ErrorCode m_err; | |
6fe7ccc8 | 805 | const CharType* m_data; |
ba379fdc A |
806 | unsigned m_size; |
807 | unsigned m_index; | |
808 | unsigned m_parenthesesNestingDepth; | |
809 | ||
810 | // Derived by empirical testing of compile time in PCRE and WREC. | |
811 | static const unsigned MAX_PATTERN_SIZE = 1024 * 1024; | |
812 | }; | |
813 | ||
814 | /* | |
815 | * Yarr::parse(): | |
816 | * | |
817 | * The parse method is passed a pattern to be parsed and a delegate upon which | |
818 | * callbacks will be made to record the parsed tokens forming the regex. | |
819 | * Yarr::parse() returns null on success, or a const C string providing an error | |
820 | * message where a parse error occurs. | |
821 | * | |
822 | * The Delegate must implement the following interface: | |
823 | * | |
824 | * void assertionBOL(); | |
825 | * void assertionEOL(); | |
826 | * void assertionWordBoundary(bool invert); | |
827 | * | |
828 | * void atomPatternCharacter(UChar ch); | |
829 | * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); | |
830 | * void atomCharacterClassBegin(bool invert) | |
831 | * void atomCharacterClassAtom(UChar ch) | |
832 | * void atomCharacterClassRange(UChar begin, UChar end) | |
833 | * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) | |
834 | * void atomCharacterClassEnd() | |
835 | * void atomParenthesesSubpatternBegin(bool capture = true); | |
836 | * void atomParentheticalAssertionBegin(bool invert = false); | |
837 | * void atomParenthesesEnd(); | |
838 | * void atomBackReference(unsigned subpatternId); | |
839 | * | |
840 | * void quantifyAtom(unsigned min, unsigned max, bool greedy); | |
841 | * | |
842 | * void disjunction(); | |
843 | * | |
ba379fdc A |
844 | * The regular expression is described by a sequence of assertion*() and atom*() |
845 | * callbacks to the delegate, describing the terms in the regular expression. | |
846 | * Following an atom a quantifyAtom() call may occur to indicate that the previous | |
847 | * atom should be quantified. In the case of atoms described across multiple | |
848 | * calls (parentheses and character classes) the call to quantifyAtom() will come | |
849 | * after the call to the atom*End() method, never after atom*Begin(). | |
850 | * | |
851 | * Character classes may either be described by a single call to | |
852 | * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls. | |
853 | * In the latter case, ...Begin() will be called, followed by a sequence of | |
854 | * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End(). | |
855 | * | |
856 | * Sequences of atoms and assertions are broken into alternatives via calls to | |
857 | * disjunction(). Assertions, atoms, and disjunctions emitted between calls to | |
858 | * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern. | |
859 | * atomParenthesesBegin() is passed a subpatternId. In the case of a regular | |
860 | * capturing subpattern, this will be the subpatternId associated with these | |
861 | * parentheses, and will also by definition be the lowest subpatternId of these | |
862 | * parentheses and of any nested parentheses. The atomParenthesesEnd() method | |
863 | * is passed the subpatternId of the last capturing subexpression nested within | |
864 | * these parentheses. In the case of a capturing subpattern with no nested | |
865 | * capturing subpatterns, the same subpatternId will be passed to the begin and | |
866 | * end functions. In the case of non-capturing subpatterns the subpatternId | |
867 | * passed to the begin method is also the first possible subpatternId that might | |
868 | * be nested within these parentheses. If a set of non-capturing parentheses does | |
869 | * not contain any capturing subpatterns, then the subpatternId passed to begin | |
870 | * will be greater than the subpatternId passed to end. | |
871 | */ | |
872 | ||
873 | template<class Delegate> | |
93a37866 | 874 | const char* parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite) |
ba379fdc | 875 | { |
6fe7ccc8 A |
876 | if (pattern.is8Bit()) |
877 | return Parser<Delegate, LChar>(delegate, pattern, backReferenceLimit).parse(); | |
878 | return Parser<Delegate, UChar>(delegate, pattern, backReferenceLimit).parse(); | |
ba379fdc A |
879 | } |
880 | ||
881 | } } // namespace JSC::Yarr | |
882 | ||
14957cd0 | 883 | #endif // YarrParser_h |