]>
Commit | Line | Data |
---|---|---|
ba379fdc A |
1 | /* |
2 | * Copyright (C) 2009 Apple Inc. All rights reserved. | |
3 | * | |
4 | * Redistribution and use in source and binary forms, with or without | |
5 | * modification, are permitted provided that the following conditions | |
6 | * are met: | |
7 | * 1. Redistributions of source code must retain the above copyright | |
8 | * notice, this list of conditions and the following disclaimer. | |
9 | * 2. Redistributions in binary form must reproduce the above copyright | |
10 | * notice, this list of conditions and the following disclaimer in the | |
11 | * documentation and/or other materials provided with the distribution. | |
12 | * | |
13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY | |
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR | |
17 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
18 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
20 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
21 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
f9bf01c6 | 23 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
ba379fdc A |
24 | */ |
25 | ||
26 | #include "config.h" | |
27 | #include "RegexInterpreter.h" | |
28 | ||
29 | #include "RegexCompiler.h" | |
30 | #include "RegexPattern.h" | |
31 | ||
32 | #ifndef NDEBUG | |
33 | #include <stdio.h> | |
34 | #endif | |
35 | ||
36 | #if ENABLE(YARR) | |
37 | ||
38 | using namespace WTF; | |
39 | ||
40 | namespace JSC { namespace Yarr { | |
41 | ||
42 | class Interpreter { | |
43 | public: | |
44 | struct ParenthesesDisjunctionContext; | |
45 | ||
46 | struct BackTrackInfoPatternCharacter { | |
47 | uintptr_t matchAmount; | |
48 | }; | |
49 | struct BackTrackInfoCharacterClass { | |
50 | uintptr_t matchAmount; | |
51 | }; | |
52 | struct BackTrackInfoBackReference { | |
53 | uintptr_t begin; // Not really needed for greedy quantifiers. | |
54 | uintptr_t matchAmount; // Not really needed for fixed quantifiers. | |
55 | }; | |
56 | struct BackTrackInfoAlternative { | |
57 | uintptr_t offset; | |
58 | }; | |
59 | struct BackTrackInfoParentheticalAssertion { | |
60 | uintptr_t begin; | |
61 | }; | |
62 | struct BackTrackInfoParenthesesOnce { | |
63 | uintptr_t inParentheses; | |
64 | }; | |
65 | struct BackTrackInfoParentheses { | |
66 | uintptr_t matchAmount; | |
67 | ParenthesesDisjunctionContext* lastContext; | |
68 | uintptr_t prevBegin; | |
69 | uintptr_t prevEnd; | |
70 | }; | |
71 | ||
72 | static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context) | |
73 | { | |
74 | context->next = backTrack->lastContext; | |
75 | backTrack->lastContext = context; | |
76 | ++backTrack->matchAmount; | |
77 | } | |
78 | ||
79 | static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack) | |
80 | { | |
81 | ASSERT(backTrack->matchAmount); | |
82 | ASSERT(backTrack->lastContext); | |
83 | backTrack->lastContext = backTrack->lastContext->next; | |
84 | --backTrack->matchAmount; | |
85 | } | |
86 | ||
87 | struct DisjunctionContext | |
88 | { | |
89 | DisjunctionContext() | |
90 | : term(0) | |
91 | { | |
92 | } | |
f9bf01c6 | 93 | |
ba379fdc A |
94 | void* operator new(size_t, void* where) |
95 | { | |
96 | return where; | |
97 | } | |
98 | ||
99 | int term; | |
100 | unsigned matchBegin; | |
101 | unsigned matchEnd; | |
102 | uintptr_t frame[1]; | |
103 | }; | |
104 | ||
105 | DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction) | |
106 | { | |
107 | return new(malloc(sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) DisjunctionContext(); | |
108 | } | |
109 | ||
110 | void freeDisjunctionContext(DisjunctionContext* context) | |
111 | { | |
112 | free(context); | |
113 | } | |
114 | ||
115 | struct ParenthesesDisjunctionContext | |
116 | { | |
117 | ParenthesesDisjunctionContext(int* output, ByteTerm& term) | |
118 | : next(0) | |
119 | { | |
120 | unsigned firstSubpatternId = term.atom.subpatternId; | |
121 | unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns; | |
122 | ||
123 | for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) { | |
124 | subpatternBackup[i] = output[(firstSubpatternId << 1) + i]; | |
125 | output[(firstSubpatternId << 1) + i] = -1; | |
126 | } | |
f9bf01c6 | 127 | |
ba379fdc A |
128 | new(getDisjunctionContext(term)) DisjunctionContext(); |
129 | } | |
130 | ||
131 | void* operator new(size_t, void* where) | |
132 | { | |
133 | return where; | |
134 | } | |
135 | ||
136 | void restoreOutput(int* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns) | |
137 | { | |
138 | for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) | |
139 | output[(firstSubpatternId << 1) + i] = subpatternBackup[i]; | |
140 | } | |
f9bf01c6 | 141 | |
ba379fdc A |
142 | DisjunctionContext* getDisjunctionContext(ByteTerm& term) |
143 | { | |
144 | return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1])); | |
145 | } | |
146 | ||
147 | ParenthesesDisjunctionContext* next; | |
148 | int subpatternBackup[1]; | |
149 | }; | |
150 | ||
151 | ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term) | |
152 | { | |
153 | return new(malloc(sizeof(ParenthesesDisjunctionContext) + (((term.atom.parenthesesDisjunction->m_numSubpatterns << 1) - 1) * sizeof(int)) + sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) ParenthesesDisjunctionContext(output, term); | |
154 | } | |
155 | ||
156 | void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context) | |
157 | { | |
158 | free(context); | |
159 | } | |
160 | ||
161 | class InputStream { | |
162 | public: | |
163 | InputStream(const UChar* input, unsigned start, unsigned length) | |
164 | : input(input) | |
165 | , pos(start) | |
166 | , length(length) | |
167 | { | |
168 | } | |
169 | ||
170 | void next() | |
171 | { | |
172 | ++pos; | |
173 | } | |
174 | ||
175 | void rewind(unsigned amount) | |
176 | { | |
177 | ASSERT(pos >= amount); | |
178 | pos -= amount; | |
179 | } | |
180 | ||
181 | int read() | |
182 | { | |
183 | ASSERT(pos < length); | |
184 | if (pos < length) | |
185 | return input[pos]; | |
186 | return -1; | |
187 | } | |
188 | ||
189 | int readChecked(int position) | |
190 | { | |
191 | ASSERT(position < 0); | |
192 | ASSERT((unsigned)-position <= pos); | |
193 | unsigned p = pos + position; | |
194 | ASSERT(p < length); | |
195 | return input[p]; | |
196 | } | |
197 | ||
198 | int reread(unsigned from) | |
199 | { | |
200 | ASSERT(from < length); | |
201 | return input[from]; | |
202 | } | |
203 | ||
204 | int prev() | |
205 | { | |
206 | ASSERT(!(pos > length)); | |
207 | if (pos && length) | |
208 | return input[pos - 1]; | |
209 | return -1; | |
210 | } | |
f9bf01c6 | 211 | |
ba379fdc A |
212 | unsigned getPos() |
213 | { | |
214 | return pos; | |
215 | } | |
216 | ||
217 | void setPos(unsigned p) | |
218 | { | |
219 | pos = p; | |
220 | } | |
f9bf01c6 | 221 | |
ba379fdc A |
222 | bool atStart() |
223 | { | |
224 | return pos == 0; | |
225 | } | |
226 | ||
227 | bool atEnd() | |
228 | { | |
229 | return pos == length; | |
230 | } | |
231 | ||
232 | bool checkInput(int count) | |
233 | { | |
234 | if ((pos + count) <= length) { | |
235 | pos += count; | |
236 | return true; | |
237 | } else | |
238 | return false; | |
239 | } | |
240 | ||
241 | void uncheckInput(int count) | |
242 | { | |
243 | pos -= count; | |
244 | } | |
245 | ||
246 | bool atStart(int position) | |
247 | { | |
248 | return (pos + position) == 0; | |
249 | } | |
250 | ||
251 | bool atEnd(int position) | |
252 | { | |
253 | return (pos + position) == length; | |
254 | } | |
255 | ||
256 | private: | |
257 | const UChar* input; | |
258 | unsigned pos; | |
259 | unsigned length; | |
260 | }; | |
261 | ||
262 | bool testCharacterClass(CharacterClass* characterClass, int ch) | |
263 | { | |
264 | if (ch & 0xFF80) { | |
265 | for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i) | |
266 | if (ch == characterClass->m_matchesUnicode[i]) | |
267 | return true; | |
268 | for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i) | |
269 | if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end)) | |
270 | return true; | |
271 | } else { | |
272 | for (unsigned i = 0; i < characterClass->m_matches.size(); ++i) | |
273 | if (ch == characterClass->m_matches[i]) | |
274 | return true; | |
275 | for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i) | |
276 | if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end)) | |
277 | return true; | |
278 | } | |
279 | ||
280 | return false; | |
281 | } | |
282 | ||
ba379fdc A |
283 | bool checkCharacter(int testChar, int inputPosition) |
284 | { | |
285 | return testChar == input.readChecked(inputPosition); | |
286 | } | |
287 | ||
288 | bool checkCasedCharacter(int loChar, int hiChar, int inputPosition) | |
289 | { | |
290 | int ch = input.readChecked(inputPosition); | |
291 | return (loChar == ch) || (hiChar == ch); | |
292 | } | |
293 | ||
ba379fdc A |
294 | bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition) |
295 | { | |
296 | bool match = testCharacterClass(characterClass, input.readChecked(inputPosition)); | |
297 | return invert ? !match : match; | |
298 | } | |
299 | ||
300 | bool tryConsumeBackReference(int matchBegin, int matchEnd, int inputOffset) | |
301 | { | |
302 | int matchSize = matchEnd - matchBegin; | |
303 | ||
304 | if (!input.checkInput(matchSize)) | |
305 | return false; | |
306 | ||
307 | for (int i = 0; i < matchSize; ++i) { | |
308 | if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) { | |
309 | input.uncheckInput(matchSize); | |
310 | return false; | |
311 | } | |
312 | } | |
f9bf01c6 | 313 | |
ba379fdc A |
314 | return true; |
315 | } | |
316 | ||
317 | bool matchAssertionBOL(ByteTerm& term) | |
318 | { | |
319 | return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition - 1))); | |
320 | } | |
321 | ||
322 | bool matchAssertionEOL(ByteTerm& term) | |
323 | { | |
324 | if (term.inputPosition) | |
325 | return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition))); | |
326 | else | |
327 | return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read())); | |
328 | } | |
329 | ||
330 | bool matchAssertionWordBoundary(ByteTerm& term) | |
331 | { | |
332 | bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition - 1)); | |
333 | bool readIsWordchar; | |
334 | if (term.inputPosition) | |
335 | readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition)); | |
336 | else | |
337 | readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read()); | |
338 | ||
339 | bool wordBoundary = prevIsWordchar != readIsWordchar; | |
340 | return term.invert() ? !wordBoundary : wordBoundary; | |
341 | } | |
342 | ||
343 | bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context) | |
344 | { | |
345 | BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); | |
346 | ||
347 | switch (term.atom.quantityType) { | |
348 | case QuantifierFixedCount: | |
349 | break; | |
350 | ||
351 | case QuantifierGreedy: | |
352 | if (backTrack->matchAmount) { | |
353 | --backTrack->matchAmount; | |
354 | input.uncheckInput(1); | |
355 | return true; | |
356 | } | |
357 | break; | |
358 | ||
359 | case QuantifierNonGreedy: | |
360 | if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { | |
361 | ++backTrack->matchAmount; | |
362 | if (checkCharacter(term.atom.patternCharacter, term.inputPosition - 1)) | |
363 | return true; | |
364 | } | |
365 | input.uncheckInput(backTrack->matchAmount); | |
366 | break; | |
367 | } | |
368 | ||
369 | return false; | |
370 | } | |
371 | ||
372 | bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context) | |
373 | { | |
374 | BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); | |
375 | ||
376 | switch (term.atom.quantityType) { | |
377 | case QuantifierFixedCount: | |
378 | break; | |
379 | ||
380 | case QuantifierGreedy: | |
381 | if (backTrack->matchAmount) { | |
382 | --backTrack->matchAmount; | |
383 | input.uncheckInput(1); | |
384 | return true; | |
385 | } | |
386 | break; | |
387 | ||
388 | case QuantifierNonGreedy: | |
389 | if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { | |
390 | ++backTrack->matchAmount; | |
391 | if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition - 1)) | |
392 | return true; | |
393 | } | |
394 | input.uncheckInput(backTrack->matchAmount); | |
395 | break; | |
396 | } | |
397 | ||
398 | return false; | |
399 | } | |
400 | ||
401 | bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context) | |
402 | { | |
403 | ASSERT(term.type == ByteTerm::TypeCharacterClass); | |
404 | BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); | |
405 | ||
406 | switch (term.atom.quantityType) { | |
407 | case QuantifierFixedCount: { | |
408 | for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { | |
409 | if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + matchAmount)) | |
410 | return false; | |
411 | } | |
412 | return true; | |
413 | } | |
414 | ||
415 | case QuantifierGreedy: { | |
416 | unsigned matchAmount = 0; | |
417 | while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) { | |
418 | if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) { | |
419 | input.uncheckInput(1); | |
420 | break; | |
421 | } | |
422 | ++matchAmount; | |
423 | } | |
424 | backTrack->matchAmount = matchAmount; | |
425 | ||
426 | return true; | |
427 | } | |
428 | ||
429 | case QuantifierNonGreedy: | |
430 | backTrack->matchAmount = 0; | |
431 | return true; | |
432 | } | |
433 | ||
434 | ASSERT_NOT_REACHED(); | |
435 | return false; | |
436 | } | |
437 | ||
438 | bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context) | |
439 | { | |
440 | ASSERT(term.type == ByteTerm::TypeCharacterClass); | |
441 | BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); | |
442 | ||
443 | switch (term.atom.quantityType) { | |
444 | case QuantifierFixedCount: | |
445 | break; | |
446 | ||
447 | case QuantifierGreedy: | |
448 | if (backTrack->matchAmount) { | |
449 | --backTrack->matchAmount; | |
450 | input.uncheckInput(1); | |
451 | return true; | |
452 | } | |
453 | break; | |
454 | ||
455 | case QuantifierNonGreedy: | |
456 | if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { | |
457 | ++backTrack->matchAmount; | |
458 | if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) | |
459 | return true; | |
460 | } | |
461 | input.uncheckInput(backTrack->matchAmount); | |
462 | break; | |
463 | } | |
464 | ||
465 | return false; | |
466 | } | |
467 | ||
468 | bool matchBackReference(ByteTerm& term, DisjunctionContext* context) | |
469 | { | |
470 | ASSERT(term.type == ByteTerm::TypeBackReference); | |
471 | BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation); | |
472 | ||
473 | int matchBegin = output[(term.atom.subpatternId << 1)]; | |
474 | int matchEnd = output[(term.atom.subpatternId << 1) + 1]; | |
475 | ASSERT((matchBegin == -1) == (matchEnd == -1)); | |
476 | ASSERT(matchBegin <= matchEnd); | |
477 | ||
478 | if (matchBegin == matchEnd) | |
479 | return true; | |
480 | ||
481 | switch (term.atom.quantityType) { | |
482 | case QuantifierFixedCount: { | |
483 | backTrack->begin = input.getPos(); | |
484 | for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { | |
485 | if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { | |
486 | input.setPos(backTrack->begin); | |
487 | return false; | |
488 | } | |
489 | } | |
490 | return true; | |
491 | } | |
492 | ||
493 | case QuantifierGreedy: { | |
494 | unsigned matchAmount = 0; | |
495 | while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) | |
496 | ++matchAmount; | |
497 | backTrack->matchAmount = matchAmount; | |
498 | return true; | |
499 | } | |
500 | ||
501 | case QuantifierNonGreedy: | |
502 | backTrack->begin = input.getPos(); | |
503 | backTrack->matchAmount = 0; | |
504 | return true; | |
505 | } | |
506 | ||
507 | ASSERT_NOT_REACHED(); | |
508 | return false; | |
509 | } | |
510 | ||
511 | bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context) | |
512 | { | |
513 | ASSERT(term.type == ByteTerm::TypeBackReference); | |
514 | BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation); | |
515 | ||
516 | int matchBegin = output[(term.atom.subpatternId << 1)]; | |
517 | int matchEnd = output[(term.atom.subpatternId << 1) + 1]; | |
518 | ASSERT((matchBegin == -1) == (matchEnd == -1)); | |
519 | ASSERT(matchBegin <= matchEnd); | |
520 | ||
521 | if (matchBegin == matchEnd) | |
522 | return false; | |
523 | ||
524 | switch (term.atom.quantityType) { | |
525 | case QuantifierFixedCount: | |
526 | // for quantityCount == 1, could rewind. | |
527 | input.setPos(backTrack->begin); | |
528 | break; | |
529 | ||
530 | case QuantifierGreedy: | |
531 | if (backTrack->matchAmount) { | |
532 | --backTrack->matchAmount; | |
533 | input.rewind(matchEnd - matchBegin); | |
534 | return true; | |
535 | } | |
536 | break; | |
537 | ||
538 | case QuantifierNonGreedy: | |
539 | if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { | |
540 | ++backTrack->matchAmount; | |
541 | return true; | |
542 | } else | |
543 | input.setPos(backTrack->begin); | |
544 | break; | |
545 | } | |
546 | ||
547 | return false; | |
548 | } | |
549 | ||
550 | void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context) | |
551 | { | |
552 | if (term.capture()) { | |
553 | unsigned subpatternId = term.atom.subpatternId; | |
554 | output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition; | |
555 | output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition; | |
556 | } | |
557 | } | |
558 | void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context) | |
559 | { | |
560 | unsigned firstSubpatternId = term.atom.subpatternId; | |
561 | unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns; | |
562 | context->restoreOutput(output, firstSubpatternId, count); | |
563 | } | |
564 | void resetAssertionMatches(ByteTerm& term) | |
565 | { | |
566 | unsigned firstSubpatternId = term.atom.subpatternId; | |
567 | unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns; | |
568 | for (unsigned i = 0; i < (count << 1); ++i) | |
569 | output[(firstSubpatternId << 1) + i] = -1; | |
570 | } | |
571 | bool parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack) | |
572 | { | |
573 | while (backTrack->matchAmount) { | |
574 | ParenthesesDisjunctionContext* context = backTrack->lastContext; | |
575 | ||
576 | if (matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true)) | |
577 | return true; | |
f9bf01c6 | 578 | |
ba379fdc | 579 | resetMatches(term, context); |
ba379fdc | 580 | popParenthesesDisjunctionContext(backTrack); |
f9bf01c6 | 581 | freeParenthesesDisjunctionContext(context); |
ba379fdc A |
582 | } |
583 | ||
584 | return false; | |
585 | } | |
586 | ||
587 | bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) | |
588 | { | |
589 | ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); | |
590 | ASSERT(term.atom.quantityCount == 1); | |
591 | ||
592 | BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); | |
593 | ||
594 | switch (term.atom.quantityType) { | |
595 | case QuantifierGreedy: { | |
596 | // set this speculatively; if we get to the parens end this will be true. | |
597 | backTrack->inParentheses = 1; | |
598 | break; | |
599 | } | |
600 | case QuantifierNonGreedy: { | |
601 | backTrack->inParentheses = 0; | |
602 | context->term += term.atom.parenthesesWidth; | |
603 | return true; | |
604 | } | |
605 | case QuantifierFixedCount: | |
606 | break; | |
607 | } | |
608 | ||
609 | if (term.capture()) { | |
610 | unsigned subpatternId = term.atom.subpatternId; | |
611 | output[(subpatternId << 1)] = input.getPos() + term.inputPosition; | |
612 | } | |
613 | ||
614 | return true; | |
615 | } | |
616 | ||
617 | bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext*) | |
618 | { | |
619 | ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); | |
620 | ASSERT(term.atom.quantityCount == 1); | |
621 | ||
622 | if (term.capture()) { | |
623 | unsigned subpatternId = term.atom.subpatternId; | |
624 | output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition; | |
625 | } | |
626 | return true; | |
627 | } | |
628 | ||
629 | bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) | |
630 | { | |
631 | ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); | |
632 | ASSERT(term.atom.quantityCount == 1); | |
633 | ||
634 | BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); | |
635 | ||
636 | if (term.capture()) { | |
637 | unsigned subpatternId = term.atom.subpatternId; | |
638 | output[(subpatternId << 1)] = -1; | |
639 | output[(subpatternId << 1) + 1] = -1; | |
640 | } | |
641 | ||
642 | switch (term.atom.quantityType) { | |
643 | case QuantifierGreedy: | |
644 | // if we backtrack to this point, there is another chance - try matching nothing. | |
645 | ASSERT(backTrack->inParentheses); | |
646 | backTrack->inParentheses = 0; | |
647 | context->term += term.atom.parenthesesWidth; | |
648 | return true; | |
649 | case QuantifierNonGreedy: | |
650 | ASSERT(backTrack->inParentheses); | |
651 | case QuantifierFixedCount: | |
652 | break; | |
653 | } | |
654 | ||
655 | return false; | |
656 | } | |
657 | ||
658 | bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) | |
659 | { | |
660 | ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); | |
661 | ASSERT(term.atom.quantityCount == 1); | |
662 | ||
663 | BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); | |
664 | ||
665 | switch (term.atom.quantityType) { | |
666 | case QuantifierGreedy: | |
667 | if (!backTrack->inParentheses) { | |
668 | context->term -= term.atom.parenthesesWidth; | |
669 | return false; | |
670 | } | |
671 | case QuantifierNonGreedy: | |
672 | if (!backTrack->inParentheses) { | |
673 | // now try to match the parens; set this speculatively. | |
674 | backTrack->inParentheses = 1; | |
675 | if (term.capture()) { | |
676 | unsigned subpatternId = term.atom.subpatternId; | |
677 | output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition; | |
678 | } | |
679 | context->term -= term.atom.parenthesesWidth; | |
680 | return true; | |
681 | } | |
682 | case QuantifierFixedCount: | |
683 | break; | |
684 | } | |
685 | ||
686 | return false; | |
687 | } | |
688 | ||
689 | bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) | |
690 | { | |
691 | ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); | |
692 | ASSERT(term.atom.quantityCount == 1); | |
693 | ||
694 | BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); | |
695 | ||
696 | backTrack->begin = input.getPos(); | |
697 | return true; | |
698 | } | |
699 | ||
700 | bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) | |
701 | { | |
702 | ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); | |
703 | ASSERT(term.atom.quantityCount == 1); | |
704 | ||
705 | BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); | |
706 | ||
707 | input.setPos(backTrack->begin); | |
708 | ||
709 | // We've reached the end of the parens; if they are inverted, this is failure. | |
710 | if (term.invert()) { | |
711 | context->term -= term.atom.parenthesesWidth; | |
712 | return false; | |
713 | } | |
714 | ||
715 | return true; | |
716 | } | |
717 | ||
718 | bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) | |
719 | { | |
720 | ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); | |
721 | ASSERT(term.atom.quantityCount == 1); | |
722 | ||
723 | // We've failed to match parens; if they are inverted, this is win! | |
724 | if (term.invert()) { | |
725 | context->term += term.atom.parenthesesWidth; | |
726 | return true; | |
727 | } | |
728 | ||
729 | return false; | |
730 | } | |
731 | ||
732 | bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) | |
733 | { | |
734 | ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); | |
735 | ASSERT(term.atom.quantityCount == 1); | |
736 | ||
737 | BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); | |
738 | ||
739 | input.setPos(backTrack->begin); | |
740 | ||
741 | context->term -= term.atom.parenthesesWidth; | |
742 | return false; | |
743 | } | |
744 | ||
745 | bool matchParentheses(ByteTerm& term, DisjunctionContext* context) | |
746 | { | |
747 | ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); | |
748 | ||
749 | BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation); | |
750 | ||
751 | unsigned subpatternId = term.atom.subpatternId; | |
752 | ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; | |
753 | ||
754 | backTrack->prevBegin = output[(subpatternId << 1)]; | |
755 | backTrack->prevEnd = output[(subpatternId << 1) + 1]; | |
756 | ||
757 | backTrack->matchAmount = 0; | |
758 | backTrack->lastContext = 0; | |
759 | ||
760 | switch (term.atom.quantityType) { | |
761 | case QuantifierFixedCount: { | |
762 | // While we haven't yet reached our fixed limit, | |
763 | while (backTrack->matchAmount < term.atom.quantityCount) { | |
764 | // Try to do a match, and it it succeeds, add it to the list. | |
765 | ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); | |
766 | if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term))) | |
767 | appendParenthesesDisjunctionContext(backTrack, context); | |
768 | else { | |
769 | // The match failed; try to find an alternate point to carry on from. | |
770 | resetMatches(term, context); | |
771 | freeParenthesesDisjunctionContext(context); | |
772 | if (!parenthesesDoBacktrack(term, backTrack)) | |
773 | return false; | |
774 | } | |
775 | } | |
776 | ||
777 | ASSERT(backTrack->matchAmount == term.atom.quantityCount); | |
778 | ParenthesesDisjunctionContext* context = backTrack->lastContext; | |
779 | recordParenthesesMatch(term, context); | |
780 | return true; | |
781 | } | |
782 | ||
783 | case QuantifierGreedy: { | |
784 | while (backTrack->matchAmount < term.atom.quantityCount) { | |
785 | ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); | |
786 | if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) | |
787 | appendParenthesesDisjunctionContext(backTrack, context); | |
788 | else { | |
789 | resetMatches(term, context); | |
790 | freeParenthesesDisjunctionContext(context); | |
791 | break; | |
792 | } | |
793 | } | |
794 | ||
795 | if (backTrack->matchAmount) { | |
796 | ParenthesesDisjunctionContext* context = backTrack->lastContext; | |
797 | recordParenthesesMatch(term, context); | |
798 | } | |
799 | return true; | |
800 | } | |
801 | ||
802 | case QuantifierNonGreedy: | |
803 | return true; | |
804 | } | |
805 | ||
806 | ASSERT_NOT_REACHED(); | |
807 | return false; | |
808 | } | |
809 | ||
810 | // Rules for backtracking differ depending on whether this is greedy or non-greedy. | |
811 | // | |
812 | // Greedy matches never should try just adding more - you should already have done | |
813 | // the 'more' cases. Always backtrack, at least a leetle bit. However cases where | |
814 | // you backtrack an item off the list needs checking, since we'll never have matched | |
815 | // the one less case. Tracking forwards, still add as much as possible. | |
816 | // | |
817 | // Non-greedy, we've already done the one less case, so don't match on popping. | |
818 | // We haven't done the one more case, so always try to add that. | |
819 | // | |
820 | bool backtrackParentheses(ByteTerm& term, DisjunctionContext* context) | |
821 | { | |
822 | ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); | |
823 | ||
824 | BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation); | |
825 | ||
826 | if (term.capture()) { | |
827 | unsigned subpatternId = term.atom.subpatternId; | |
828 | output[(subpatternId << 1)] = backTrack->prevBegin; | |
829 | output[(subpatternId << 1) + 1] = backTrack->prevEnd; | |
830 | } | |
831 | ||
832 | ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; | |
833 | ||
834 | switch (term.atom.quantityType) { | |
835 | case QuantifierFixedCount: { | |
836 | ASSERT(backTrack->matchAmount == term.atom.quantityCount); | |
837 | ||
838 | ParenthesesDisjunctionContext* context = 0; | |
839 | ||
840 | if (!parenthesesDoBacktrack(term, backTrack)) | |
841 | return false; | |
842 | ||
843 | // While we haven't yet reached our fixed limit, | |
844 | while (backTrack->matchAmount < term.atom.quantityCount) { | |
845 | // Try to do a match, and it it succeeds, add it to the list. | |
846 | context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); | |
847 | if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term))) | |
848 | appendParenthesesDisjunctionContext(backTrack, context); | |
849 | else { | |
850 | // The match failed; try to find an alternate point to carry on from. | |
851 | resetMatches(term, context); | |
852 | freeParenthesesDisjunctionContext(context); | |
853 | if (!parenthesesDoBacktrack(term, backTrack)) | |
854 | return false; | |
855 | } | |
856 | } | |
857 | ||
858 | ASSERT(backTrack->matchAmount == term.atom.quantityCount); | |
859 | context = backTrack->lastContext; | |
860 | recordParenthesesMatch(term, context); | |
861 | return true; | |
862 | } | |
863 | ||
864 | case QuantifierGreedy: { | |
865 | if (!backTrack->matchAmount) | |
866 | return false; | |
867 | ||
868 | ParenthesesDisjunctionContext* context = backTrack->lastContext; | |
869 | if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) { | |
870 | while (backTrack->matchAmount < term.atom.quantityCount) { | |
871 | ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); | |
872 | if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) | |
873 | appendParenthesesDisjunctionContext(backTrack, context); | |
874 | else { | |
875 | resetMatches(term, context); | |
876 | freeParenthesesDisjunctionContext(context); | |
877 | break; | |
878 | } | |
879 | } | |
880 | } else { | |
881 | resetMatches(term, context); | |
ba379fdc | 882 | popParenthesesDisjunctionContext(backTrack); |
f9bf01c6 | 883 | freeParenthesesDisjunctionContext(context); |
ba379fdc A |
884 | } |
885 | ||
886 | if (backTrack->matchAmount) { | |
887 | ParenthesesDisjunctionContext* context = backTrack->lastContext; | |
888 | recordParenthesesMatch(term, context); | |
889 | } | |
890 | return true; | |
891 | } | |
892 | ||
893 | case QuantifierNonGreedy: { | |
894 | // If we've not reached the limit, try to add one more match. | |
895 | if (backTrack->matchAmount < term.atom.quantityCount) { | |
896 | ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); | |
897 | if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) { | |
898 | appendParenthesesDisjunctionContext(backTrack, context); | |
899 | recordParenthesesMatch(term, context); | |
900 | return true; | |
901 | } else { | |
902 | resetMatches(term, context); | |
903 | freeParenthesesDisjunctionContext(context); | |
904 | } | |
905 | } | |
906 | ||
907 | // Nope - okay backtrack looking for an alternative. | |
908 | while (backTrack->matchAmount) { | |
909 | ParenthesesDisjunctionContext* context = backTrack->lastContext; | |
910 | if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) { | |
911 | // successful backtrack! we're back in the game! | |
912 | if (backTrack->matchAmount) { | |
913 | context = backTrack->lastContext; | |
914 | recordParenthesesMatch(term, context); | |
915 | } | |
916 | return true; | |
917 | } | |
f9bf01c6 | 918 | |
ba379fdc A |
919 | // pop a match off the stack |
920 | resetMatches(term, context); | |
ba379fdc | 921 | popParenthesesDisjunctionContext(backTrack); |
f9bf01c6 | 922 | freeParenthesesDisjunctionContext(context); |
ba379fdc A |
923 | } |
924 | ||
925 | return false; | |
926 | } | |
927 | } | |
928 | ||
929 | ASSERT_NOT_REACHED(); | |
930 | return false; | |
931 | } | |
932 | ||
933 | #define MATCH_NEXT() { ++context->term; goto matchAgain; } | |
934 | #define BACKTRACK() { --context->term; goto backtrack; } | |
935 | #define currentTerm() (disjunction->terms[context->term]) | |
936 | bool matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) | |
937 | { | |
938 | if (btrack) | |
939 | BACKTRACK(); | |
940 | ||
941 | context->matchBegin = input.getPos(); | |
942 | context->term = 0; | |
943 | ||
944 | matchAgain: | |
945 | ASSERT(context->term < static_cast<int>(disjunction->terms.size())); | |
946 | ||
947 | switch (currentTerm().type) { | |
948 | case ByteTerm::TypeSubpatternBegin: | |
949 | MATCH_NEXT(); | |
950 | case ByteTerm::TypeSubpatternEnd: | |
951 | context->matchEnd = input.getPos(); | |
952 | return true; | |
953 | ||
954 | case ByteTerm::TypeBodyAlternativeBegin: | |
955 | MATCH_NEXT(); | |
956 | case ByteTerm::TypeBodyAlternativeDisjunction: | |
957 | case ByteTerm::TypeBodyAlternativeEnd: | |
958 | context->matchEnd = input.getPos(); | |
959 | return true; | |
960 | ||
961 | case ByteTerm::TypeAlternativeBegin: | |
962 | MATCH_NEXT(); | |
963 | case ByteTerm::TypeAlternativeDisjunction: | |
964 | case ByteTerm::TypeAlternativeEnd: { | |
965 | int offset = currentTerm().alternative.end; | |
966 | BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation); | |
967 | backTrack->offset = offset; | |
968 | context->term += offset; | |
969 | MATCH_NEXT(); | |
970 | } | |
971 | ||
972 | case ByteTerm::TypeAssertionBOL: | |
973 | if (matchAssertionBOL(currentTerm())) | |
974 | MATCH_NEXT(); | |
975 | BACKTRACK(); | |
976 | case ByteTerm::TypeAssertionEOL: | |
977 | if (matchAssertionEOL(currentTerm())) | |
978 | MATCH_NEXT(); | |
979 | BACKTRACK(); | |
980 | case ByteTerm::TypeAssertionWordBoundary: | |
981 | if (matchAssertionWordBoundary(currentTerm())) | |
982 | MATCH_NEXT(); | |
983 | BACKTRACK(); | |
984 | ||
985 | case ByteTerm::TypePatternCharacterOnce: | |
986 | case ByteTerm::TypePatternCharacterFixed: { | |
987 | for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { | |
988 | if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + matchAmount)) | |
989 | BACKTRACK(); | |
990 | } | |
991 | MATCH_NEXT(); | |
992 | } | |
993 | case ByteTerm::TypePatternCharacterGreedy: { | |
994 | BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); | |
995 | unsigned matchAmount = 0; | |
996 | while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { | |
997 | if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 1)) { | |
998 | input.uncheckInput(1); | |
999 | break; | |
1000 | } | |
1001 | ++matchAmount; | |
1002 | } | |
1003 | backTrack->matchAmount = matchAmount; | |
1004 | ||
1005 | MATCH_NEXT(); | |
1006 | } | |
1007 | case ByteTerm::TypePatternCharacterNonGreedy: { | |
1008 | BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); | |
1009 | backTrack->matchAmount = 0; | |
1010 | MATCH_NEXT(); | |
1011 | } | |
1012 | ||
1013 | case ByteTerm::TypePatternCasedCharacterOnce: | |
1014 | case ByteTerm::TypePatternCasedCharacterFixed: { | |
1015 | for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { | |
1016 | if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + matchAmount)) | |
1017 | BACKTRACK(); | |
1018 | } | |
1019 | MATCH_NEXT(); | |
1020 | } | |
1021 | case ByteTerm::TypePatternCasedCharacterGreedy: { | |
1022 | BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); | |
1023 | unsigned matchAmount = 0; | |
1024 | while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { | |
1025 | if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - 1)) { | |
1026 | input.uncheckInput(1); | |
1027 | break; | |
1028 | } | |
1029 | ++matchAmount; | |
1030 | } | |
1031 | backTrack->matchAmount = matchAmount; | |
1032 | ||
1033 | MATCH_NEXT(); | |
1034 | } | |
1035 | case ByteTerm::TypePatternCasedCharacterNonGreedy: { | |
1036 | BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); | |
1037 | backTrack->matchAmount = 0; | |
1038 | MATCH_NEXT(); | |
1039 | } | |
1040 | ||
1041 | case ByteTerm::TypeCharacterClass: | |
1042 | if (matchCharacterClass(currentTerm(), context)) | |
1043 | MATCH_NEXT(); | |
1044 | BACKTRACK(); | |
1045 | case ByteTerm::TypeBackReference: | |
1046 | if (matchBackReference(currentTerm(), context)) | |
1047 | MATCH_NEXT(); | |
1048 | BACKTRACK(); | |
1049 | case ByteTerm::TypeParenthesesSubpattern: | |
1050 | if (matchParentheses(currentTerm(), context)) | |
1051 | MATCH_NEXT(); | |
1052 | BACKTRACK(); | |
1053 | case ByteTerm::TypeParenthesesSubpatternOnceBegin: | |
1054 | if (matchParenthesesOnceBegin(currentTerm(), context)) | |
1055 | MATCH_NEXT(); | |
1056 | BACKTRACK(); | |
1057 | case ByteTerm::TypeParenthesesSubpatternOnceEnd: | |
1058 | if (matchParenthesesOnceEnd(currentTerm(), context)) | |
1059 | MATCH_NEXT(); | |
1060 | BACKTRACK(); | |
1061 | case ByteTerm::TypeParentheticalAssertionBegin: | |
1062 | if (matchParentheticalAssertionBegin(currentTerm(), context)) | |
1063 | MATCH_NEXT(); | |
1064 | BACKTRACK(); | |
1065 | case ByteTerm::TypeParentheticalAssertionEnd: | |
1066 | if (matchParentheticalAssertionEnd(currentTerm(), context)) | |
1067 | MATCH_NEXT(); | |
1068 | BACKTRACK(); | |
1069 | ||
1070 | case ByteTerm::TypeCheckInput: | |
1071 | if (input.checkInput(currentTerm().checkInputCount)) | |
1072 | MATCH_NEXT(); | |
1073 | BACKTRACK(); | |
1074 | } | |
1075 | ||
1076 | // We should never fall-through to here. | |
1077 | ASSERT_NOT_REACHED(); | |
1078 | ||
1079 | backtrack: | |
1080 | ASSERT(context->term < static_cast<int>(disjunction->terms.size())); | |
1081 | ||
1082 | switch (currentTerm().type) { | |
1083 | case ByteTerm::TypeSubpatternBegin: | |
1084 | return false; | |
1085 | case ByteTerm::TypeSubpatternEnd: | |
1086 | ASSERT_NOT_REACHED(); | |
1087 | ||
1088 | case ByteTerm::TypeBodyAlternativeBegin: | |
1089 | case ByteTerm::TypeBodyAlternativeDisjunction: { | |
1090 | int offset = currentTerm().alternative.next; | |
1091 | context->term += offset; | |
1092 | if (offset > 0) | |
1093 | MATCH_NEXT(); | |
1094 | ||
1095 | if (input.atEnd()) | |
1096 | return false; | |
1097 | ||
1098 | input.next(); | |
1099 | context->matchBegin = input.getPos(); | |
1100 | MATCH_NEXT(); | |
1101 | } | |
1102 | case ByteTerm::TypeBodyAlternativeEnd: | |
1103 | ASSERT_NOT_REACHED(); | |
1104 | ||
1105 | case ByteTerm::TypeAlternativeBegin: | |
1106 | case ByteTerm::TypeAlternativeDisjunction: { | |
1107 | int offset = currentTerm().alternative.next; | |
1108 | context->term += offset; | |
1109 | if (offset > 0) | |
1110 | MATCH_NEXT(); | |
1111 | BACKTRACK(); | |
1112 | } | |
1113 | case ByteTerm::TypeAlternativeEnd: { | |
1114 | // We should never backtrack back into an alternative of the main body of the regex. | |
1115 | BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation); | |
1116 | unsigned offset = backTrack->offset; | |
1117 | context->term -= offset; | |
1118 | BACKTRACK(); | |
1119 | } | |
1120 | ||
1121 | case ByteTerm::TypeAssertionBOL: | |
1122 | case ByteTerm::TypeAssertionEOL: | |
1123 | case ByteTerm::TypeAssertionWordBoundary: | |
1124 | BACKTRACK(); | |
1125 | ||
1126 | case ByteTerm::TypePatternCharacterOnce: | |
1127 | case ByteTerm::TypePatternCharacterFixed: | |
1128 | case ByteTerm::TypePatternCharacterGreedy: | |
1129 | case ByteTerm::TypePatternCharacterNonGreedy: | |
1130 | if (backtrackPatternCharacter(currentTerm(), context)) | |
1131 | MATCH_NEXT(); | |
1132 | BACKTRACK(); | |
1133 | case ByteTerm::TypePatternCasedCharacterOnce: | |
1134 | case ByteTerm::TypePatternCasedCharacterFixed: | |
1135 | case ByteTerm::TypePatternCasedCharacterGreedy: | |
1136 | case ByteTerm::TypePatternCasedCharacterNonGreedy: | |
1137 | if (backtrackPatternCasedCharacter(currentTerm(), context)) | |
1138 | MATCH_NEXT(); | |
1139 | BACKTRACK(); | |
1140 | case ByteTerm::TypeCharacterClass: | |
1141 | if (backtrackCharacterClass(currentTerm(), context)) | |
1142 | MATCH_NEXT(); | |
1143 | BACKTRACK(); | |
1144 | case ByteTerm::TypeBackReference: | |
1145 | if (backtrackBackReference(currentTerm(), context)) | |
1146 | MATCH_NEXT(); | |
1147 | BACKTRACK(); | |
1148 | case ByteTerm::TypeParenthesesSubpattern: | |
1149 | if (backtrackParentheses(currentTerm(), context)) | |
1150 | MATCH_NEXT(); | |
1151 | BACKTRACK(); | |
1152 | case ByteTerm::TypeParenthesesSubpatternOnceBegin: | |
1153 | if (backtrackParenthesesOnceBegin(currentTerm(), context)) | |
1154 | MATCH_NEXT(); | |
1155 | BACKTRACK(); | |
1156 | case ByteTerm::TypeParenthesesSubpatternOnceEnd: | |
1157 | if (backtrackParenthesesOnceEnd(currentTerm(), context)) | |
1158 | MATCH_NEXT(); | |
1159 | BACKTRACK(); | |
1160 | case ByteTerm::TypeParentheticalAssertionBegin: | |
1161 | if (backtrackParentheticalAssertionBegin(currentTerm(), context)) | |
1162 | MATCH_NEXT(); | |
1163 | BACKTRACK(); | |
1164 | case ByteTerm::TypeParentheticalAssertionEnd: | |
1165 | if (backtrackParentheticalAssertionEnd(currentTerm(), context)) | |
1166 | MATCH_NEXT(); | |
1167 | BACKTRACK(); | |
1168 | ||
1169 | case ByteTerm::TypeCheckInput: | |
1170 | input.uncheckInput(currentTerm().checkInputCount); | |
1171 | BACKTRACK(); | |
1172 | } | |
1173 | ||
1174 | ASSERT_NOT_REACHED(); | |
1175 | return false; | |
1176 | } | |
1177 | ||
1178 | bool matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) | |
1179 | { | |
1180 | if (matchDisjunction(disjunction, context, btrack)) { | |
1181 | while (context->matchBegin == context->matchEnd) { | |
1182 | if (!matchDisjunction(disjunction, context, true)) | |
1183 | return false; | |
1184 | } | |
1185 | return true; | |
1186 | } | |
1187 | ||
1188 | return false; | |
1189 | } | |
1190 | ||
1191 | int interpret() | |
1192 | { | |
1193 | for (unsigned i = 0; i < ((pattern->m_body->m_numSubpatterns + 1) << 1); ++i) | |
1194 | output[i] = -1; | |
1195 | ||
1196 | DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get()); | |
1197 | ||
1198 | if (matchDisjunction(pattern->m_body.get(), context)) { | |
1199 | output[0] = context->matchBegin; | |
1200 | output[1] = context->matchEnd; | |
1201 | } | |
1202 | ||
1203 | freeDisjunctionContext(context); | |
1204 | ||
1205 | return output[0]; | |
1206 | } | |
1207 | ||
1208 | Interpreter(BytecodePattern* pattern, int* output, const UChar* inputChar, unsigned start, unsigned length) | |
1209 | : pattern(pattern) | |
1210 | , output(output) | |
1211 | , input(inputChar, start, length) | |
1212 | { | |
1213 | } | |
1214 | ||
1215 | private: | |
1216 | BytecodePattern *pattern; | |
1217 | int* output; | |
1218 | InputStream input; | |
1219 | }; | |
1220 | ||
1221 | ||
1222 | ||
1223 | class ByteCompiler { | |
1224 | struct ParenthesesStackEntry { | |
1225 | unsigned beginTerm; | |
1226 | unsigned savedAlternativeIndex; | |
1227 | ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/) | |
1228 | : beginTerm(beginTerm) | |
1229 | , savedAlternativeIndex(savedAlternativeIndex) | |
1230 | { | |
1231 | } | |
1232 | }; | |
1233 | ||
1234 | public: | |
1235 | ByteCompiler(RegexPattern& pattern) | |
1236 | : m_pattern(pattern) | |
1237 | { | |
f9bf01c6 A |
1238 | m_bodyDisjunction = 0; |
1239 | m_currentAlternativeIndex = 0; | |
ba379fdc | 1240 | } |
f9bf01c6 | 1241 | |
ba379fdc A |
1242 | BytecodePattern* compile() |
1243 | { | |
1244 | regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize); | |
1245 | emitDisjunction(m_pattern.m_body); | |
1246 | regexEnd(); | |
1247 | ||
f9bf01c6 | 1248 | return new BytecodePattern(m_bodyDisjunction, m_allParenthesesInfo, m_pattern); |
ba379fdc | 1249 | } |
f9bf01c6 | 1250 | |
ba379fdc A |
1251 | void checkInput(unsigned count) |
1252 | { | |
f9bf01c6 | 1253 | m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count)); |
ba379fdc A |
1254 | } |
1255 | ||
1256 | void assertionBOL(int inputPosition) | |
1257 | { | |
f9bf01c6 | 1258 | m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition)); |
ba379fdc A |
1259 | } |
1260 | ||
1261 | void assertionEOL(int inputPosition) | |
1262 | { | |
f9bf01c6 | 1263 | m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition)); |
ba379fdc A |
1264 | } |
1265 | ||
1266 | void assertionWordBoundary(bool invert, int inputPosition) | |
1267 | { | |
f9bf01c6 | 1268 | m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition)); |
ba379fdc A |
1269 | } |
1270 | ||
1271 | void atomPatternCharacter(UChar ch, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) | |
1272 | { | |
1273 | if (m_pattern.m_ignoreCase) { | |
1274 | UChar lo = Unicode::toLower(ch); | |
1275 | UChar hi = Unicode::toUpper(ch); | |
f9bf01c6 | 1276 | |
ba379fdc | 1277 | if (lo != hi) { |
f9bf01c6 | 1278 | m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType)); |
ba379fdc A |
1279 | return; |
1280 | } | |
1281 | } | |
1282 | ||
f9bf01c6 | 1283 | m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType)); |
ba379fdc | 1284 | } |
f9bf01c6 | 1285 | |
ba379fdc A |
1286 | void atomCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) |
1287 | { | |
f9bf01c6 | 1288 | m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition)); |
ba379fdc | 1289 | |
f9bf01c6 A |
1290 | m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount; |
1291 | m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; | |
1292 | m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; | |
ba379fdc A |
1293 | } |
1294 | ||
1295 | void atomBackReference(unsigned subpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) | |
1296 | { | |
1297 | ASSERT(subpatternId); | |
1298 | ||
f9bf01c6 | 1299 | m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition)); |
ba379fdc | 1300 | |
f9bf01c6 A |
1301 | m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount; |
1302 | m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; | |
1303 | m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; | |
ba379fdc A |
1304 | } |
1305 | ||
1306 | void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) | |
1307 | { | |
f9bf01c6 | 1308 | int beginTerm = m_bodyDisjunction->terms.size(); |
ba379fdc | 1309 | |
f9bf01c6 A |
1310 | m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, inputPosition)); |
1311 | m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; | |
1312 | m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); | |
1313 | m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; | |
ba379fdc | 1314 | |
f9bf01c6 A |
1315 | m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); |
1316 | m_currentAlternativeIndex = beginTerm + 1; | |
ba379fdc A |
1317 | } |
1318 | ||
1319 | void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation) | |
1320 | { | |
f9bf01c6 | 1321 | int beginTerm = m_bodyDisjunction->terms.size(); |
ba379fdc | 1322 | |
f9bf01c6 A |
1323 | m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, invert, 0)); |
1324 | m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; | |
1325 | m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); | |
1326 | m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; | |
ba379fdc | 1327 | |
f9bf01c6 A |
1328 | m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); |
1329 | m_currentAlternativeIndex = beginTerm + 1; | |
ba379fdc A |
1330 | } |
1331 | ||
1332 | unsigned popParenthesesStack() | |
1333 | { | |
1334 | ASSERT(m_parenthesesStack.size()); | |
1335 | int stackEnd = m_parenthesesStack.size() - 1; | |
1336 | unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm; | |
f9bf01c6 | 1337 | m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex; |
ba379fdc A |
1338 | m_parenthesesStack.shrink(stackEnd); |
1339 | ||
f9bf01c6 A |
1340 | ASSERT(beginTerm < m_bodyDisjunction->terms.size()); |
1341 | ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size()); | |
1342 | ||
ba379fdc A |
1343 | return beginTerm; |
1344 | } | |
1345 | ||
1346 | #ifndef NDEBUG | |
1347 | void dumpDisjunction(ByteDisjunction* disjunction) | |
1348 | { | |
1349 | printf("ByteDisjunction(%p):\n\t", disjunction); | |
1350 | for (unsigned i = 0; i < disjunction->terms.size(); ++i) | |
1351 | printf("{ %d } ", disjunction->terms[i].type); | |
1352 | printf("\n"); | |
1353 | } | |
1354 | #endif | |
1355 | ||
1356 | void closeAlternative(int beginTerm) | |
1357 | { | |
1358 | int origBeginTerm = beginTerm; | |
f9bf01c6 A |
1359 | ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin); |
1360 | int endIndex = m_bodyDisjunction->terms.size(); | |
ba379fdc | 1361 | |
f9bf01c6 | 1362 | unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; |
ba379fdc | 1363 | |
f9bf01c6 A |
1364 | if (!m_bodyDisjunction->terms[beginTerm].alternative.next) |
1365 | m_bodyDisjunction->terms.remove(beginTerm); | |
ba379fdc | 1366 | else { |
f9bf01c6 A |
1367 | while (m_bodyDisjunction->terms[beginTerm].alternative.next) { |
1368 | beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; | |
1369 | ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction); | |
1370 | m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; | |
1371 | m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; | |
ba379fdc | 1372 | } |
ba379fdc | 1373 | |
f9bf01c6 A |
1374 | m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; |
1375 | ||
1376 | m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd()); | |
1377 | m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; | |
ba379fdc A |
1378 | } |
1379 | } | |
1380 | ||
1381 | void closeBodyAlternative() | |
1382 | { | |
1383 | int beginTerm = 0; | |
1384 | int origBeginTerm = 0; | |
f9bf01c6 A |
1385 | ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin); |
1386 | int endIndex = m_bodyDisjunction->terms.size(); | |
ba379fdc | 1387 | |
f9bf01c6 | 1388 | unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; |
ba379fdc | 1389 | |
f9bf01c6 A |
1390 | while (m_bodyDisjunction->terms[beginTerm].alternative.next) { |
1391 | beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; | |
1392 | ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction); | |
1393 | m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; | |
1394 | m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; | |
ba379fdc | 1395 | } |
ba379fdc | 1396 | |
f9bf01c6 A |
1397 | m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; |
1398 | ||
1399 | m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd()); | |
1400 | m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; | |
ba379fdc A |
1401 | } |
1402 | ||
1403 | void atomParenthesesEnd(bool doInline, unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0) | |
1404 | { | |
1405 | unsigned beginTerm = popParenthesesStack(); | |
1406 | closeAlternative(beginTerm + 1); | |
f9bf01c6 | 1407 | unsigned endTerm = m_bodyDisjunction->terms.size(); |
ba379fdc | 1408 | |
f9bf01c6 A |
1409 | bool isAssertion = m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin; |
1410 | bool invertOrCapture = m_bodyDisjunction->terms[beginTerm].invertOrCapture; | |
1411 | unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; | |
ba379fdc | 1412 | |
f9bf01c6 A |
1413 | m_bodyDisjunction->terms.append(ByteTerm(isAssertion ? ByteTerm::TypeParentheticalAssertionEnd : ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, invertOrCapture, inputPosition)); |
1414 | m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; | |
1415 | m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; | |
1416 | m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; | |
ba379fdc A |
1417 | |
1418 | if (doInline) { | |
f9bf01c6 A |
1419 | m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; |
1420 | m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; | |
1421 | m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; | |
1422 | m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; | |
ba379fdc | 1423 | } else { |
f9bf01c6 | 1424 | ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm]; |
ba379fdc A |
1425 | ASSERT(parenthesesBegin.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); |
1426 | ||
1427 | bool invertOrCapture = parenthesesBegin.invertOrCapture; | |
1428 | unsigned subpatternId = parenthesesBegin.atom.subpatternId; | |
1429 | ||
1430 | unsigned numSubpatterns = lastSubpatternId - subpatternId + 1; | |
1431 | ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize); | |
1432 | ||
1433 | parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin()); | |
1434 | for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses) | |
f9bf01c6 | 1435 | parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]); |
ba379fdc A |
1436 | parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd()); |
1437 | ||
f9bf01c6 | 1438 | m_bodyDisjunction->terms.shrink(beginTerm); |
ba379fdc A |
1439 | |
1440 | m_allParenthesesInfo.append(parenthesesDisjunction); | |
f9bf01c6 | 1441 | m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, invertOrCapture, inputPosition)); |
ba379fdc | 1442 | |
f9bf01c6 A |
1443 | m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; |
1444 | m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; | |
1445 | m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; | |
ba379fdc A |
1446 | } |
1447 | } | |
1448 | ||
1449 | void regexBegin(unsigned numSubpatterns, unsigned callFrameSize) | |
1450 | { | |
f9bf01c6 A |
1451 | m_bodyDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize); |
1452 | m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin()); | |
1453 | m_bodyDisjunction->terms[0].frameLocation = 0; | |
1454 | m_currentAlternativeIndex = 0; | |
ba379fdc A |
1455 | } |
1456 | ||
1457 | void regexEnd() | |
1458 | { | |
1459 | closeBodyAlternative(); | |
1460 | } | |
1461 | ||
f9bf01c6 | 1462 | void alternativeBodyDisjunction() |
ba379fdc | 1463 | { |
f9bf01c6 A |
1464 | int newAlternativeIndex = m_bodyDisjunction->terms.size(); |
1465 | m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; | |
1466 | m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction()); | |
ba379fdc | 1467 | |
f9bf01c6 | 1468 | m_currentAlternativeIndex = newAlternativeIndex; |
ba379fdc A |
1469 | } |
1470 | ||
f9bf01c6 | 1471 | void alternativeDisjunction() |
ba379fdc | 1472 | { |
f9bf01c6 A |
1473 | int newAlternativeIndex = m_bodyDisjunction->terms.size(); |
1474 | m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; | |
1475 | m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction()); | |
ba379fdc | 1476 | |
f9bf01c6 | 1477 | m_currentAlternativeIndex = newAlternativeIndex; |
ba379fdc A |
1478 | } |
1479 | ||
1480 | void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0) | |
1481 | { | |
1482 | for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { | |
1483 | unsigned currentCountAlreadyChecked = inputCountAlreadyChecked; | |
f9bf01c6 | 1484 | |
ba379fdc A |
1485 | if (alt) { |
1486 | if (disjunction == m_pattern.m_body) | |
f9bf01c6 | 1487 | alternativeBodyDisjunction(); |
ba379fdc | 1488 | else |
f9bf01c6 | 1489 | alternativeDisjunction(); |
ba379fdc A |
1490 | } |
1491 | ||
1492 | PatternAlternative* alternative = disjunction->m_alternatives[alt]; | |
1493 | unsigned minimumSize = alternative->m_minimumSize; | |
1494 | ||
1495 | ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked); | |
1496 | unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked; | |
1497 | if (countToCheck) | |
1498 | checkInput(countToCheck); | |
1499 | currentCountAlreadyChecked += countToCheck; | |
1500 | ||
1501 | for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { | |
1502 | PatternTerm& term = alternative->m_terms[i]; | |
1503 | ||
1504 | switch (term.type) { | |
1505 | case PatternTerm::TypeAssertionBOL: | |
1506 | assertionBOL(term.inputPosition - currentCountAlreadyChecked); | |
1507 | break; | |
1508 | ||
1509 | case PatternTerm::TypeAssertionEOL: | |
1510 | assertionEOL(term.inputPosition - currentCountAlreadyChecked); | |
1511 | break; | |
1512 | ||
1513 | case PatternTerm::TypeAssertionWordBoundary: | |
1514 | assertionWordBoundary(term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked); | |
1515 | break; | |
1516 | ||
1517 | case PatternTerm::TypePatternCharacter: | |
1518 | atomPatternCharacter(term.patternCharacter, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); | |
1519 | break; | |
1520 | ||
1521 | case PatternTerm::TypeCharacterClass: | |
1522 | atomCharacterClass(term.characterClass, term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); | |
1523 | break; | |
1524 | ||
1525 | case PatternTerm::TypeBackReference: | |
1526 | atomBackReference(term.subpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); | |
1527 | break; | |
1528 | ||
1529 | case PatternTerm::TypeForwardReference: | |
1530 | break; | |
1531 | ||
1532 | case PatternTerm::TypeParenthesesSubpattern: { | |
1533 | unsigned disjunctionAlreadyCheckedCount = 0; | |
1534 | if ((term.quantityCount == 1) && !term.parentheses.isCopy) { | |
1535 | if (term.quantityType == QuantifierFixedCount) { | |
1536 | disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize; | |
1537 | unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; | |
1538 | atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation); | |
1539 | emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, term.parentheses.disjunction->m_minimumSize); | |
1540 | atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); | |
1541 | } else { | |
1542 | unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; | |
1543 | atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + RegexStackSpaceForBackTrackInfoParenthesesOnce); | |
1544 | emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); | |
1545 | atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); | |
1546 | } | |
1547 | } else { | |
1548 | unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; | |
1549 | atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0); | |
1550 | emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); | |
1551 | atomParenthesesEnd(false, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); | |
1552 | } | |
1553 | break; | |
1554 | } | |
1555 | ||
1556 | case PatternTerm::TypeParentheticalAssertion: { | |
4e4e5a6f | 1557 | unsigned alternativeFrameLocation = term.frameLocation + RegexStackSpaceForBackTrackInfoParentheticalAssertion; |
f9bf01c6 | 1558 | |
ba379fdc A |
1559 | atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invertOrCapture, term.frameLocation, alternativeFrameLocation); |
1560 | emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); | |
1561 | atomParenthesesEnd(true, term.parentheses.lastSubpatternId, 0, term.frameLocation, term.quantityCount, term.quantityType); | |
1562 | break; | |
1563 | } | |
1564 | } | |
1565 | } | |
1566 | } | |
1567 | } | |
1568 | ||
1569 | private: | |
1570 | RegexPattern& m_pattern; | |
f9bf01c6 A |
1571 | ByteDisjunction* m_bodyDisjunction; |
1572 | unsigned m_currentAlternativeIndex; | |
ba379fdc A |
1573 | Vector<ParenthesesStackEntry> m_parenthesesStack; |
1574 | Vector<ByteDisjunction*> m_allParenthesesInfo; | |
1575 | }; | |
1576 | ||
1577 | ||
1578 | BytecodePattern* byteCompileRegex(const UString& patternString, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline) | |
1579 | { | |
1580 | RegexPattern pattern(ignoreCase, multiline); | |
1581 | ||
1582 | if ((error = compileRegex(patternString, pattern))) | |
1583 | return 0; | |
1584 | ||
1585 | numSubpatterns = pattern.m_numSubpatterns; | |
1586 | ||
1587 | return ByteCompiler(pattern).compile(); | |
1588 | } | |
1589 | ||
1590 | int interpretRegex(BytecodePattern* regex, const UChar* input, unsigned start, unsigned length, int* output) | |
1591 | { | |
1592 | return Interpreter(regex, output, input, start, length).interpret(); | |
1593 | } | |
1594 | ||
1595 | ||
1596 | COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (RegexStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoPatternCharacter); | |
1597 | COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (RegexStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoCharacterClass); | |
1598 | COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (RegexStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoBackReference); | |
1599 | COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (RegexStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoAlternative); | |
1600 | COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (RegexStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheticalAssertion); | |
1601 | COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (RegexStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParenthesesOnce); | |
1602 | COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (RegexStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheses); | |
1603 | ||
1604 | ||
1605 | } } | |
1606 | ||
1607 | #endif |