]>
Commit | Line | Data |
---|---|---|
ba379fdc A |
1 | /* |
2 | * Copyright (C) 2009 Apple Inc. All rights reserved. | |
3 | * | |
4 | * Redistribution and use in source and binary forms, with or without | |
5 | * modification, are permitted provided that the following conditions | |
6 | * are met: | |
7 | * 1. Redistributions of source code must retain the above copyright | |
8 | * notice, this list of conditions and the following disclaimer. | |
9 | * 2. Redistributions in binary form must reproduce the above copyright | |
10 | * notice, this list of conditions and the following disclaimer in the | |
11 | * documentation and/or other materials provided with the distribution. | |
12 | * | |
13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY | |
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR | |
17 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
18 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
20 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
21 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
f9bf01c6 | 23 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
ba379fdc A |
24 | */ |
25 | ||
26 | #include "config.h" | |
27 | #include "RegexInterpreter.h" | |
28 | ||
29 | #include "RegexCompiler.h" | |
30 | #include "RegexPattern.h" | |
31 | ||
32 | #ifndef NDEBUG | |
33 | #include <stdio.h> | |
34 | #endif | |
35 | ||
36 | #if ENABLE(YARR) | |
37 | ||
38 | using namespace WTF; | |
39 | ||
40 | namespace JSC { namespace Yarr { | |
41 | ||
42 | class Interpreter { | |
43 | public: | |
44 | struct ParenthesesDisjunctionContext; | |
45 | ||
46 | struct BackTrackInfoPatternCharacter { | |
47 | uintptr_t matchAmount; | |
48 | }; | |
49 | struct BackTrackInfoCharacterClass { | |
50 | uintptr_t matchAmount; | |
51 | }; | |
52 | struct BackTrackInfoBackReference { | |
53 | uintptr_t begin; // Not really needed for greedy quantifiers. | |
54 | uintptr_t matchAmount; // Not really needed for fixed quantifiers. | |
55 | }; | |
56 | struct BackTrackInfoAlternative { | |
57 | uintptr_t offset; | |
58 | }; | |
59 | struct BackTrackInfoParentheticalAssertion { | |
60 | uintptr_t begin; | |
61 | }; | |
62 | struct BackTrackInfoParenthesesOnce { | |
63 | uintptr_t inParentheses; | |
64 | }; | |
65 | struct BackTrackInfoParentheses { | |
66 | uintptr_t matchAmount; | |
67 | ParenthesesDisjunctionContext* lastContext; | |
68 | uintptr_t prevBegin; | |
69 | uintptr_t prevEnd; | |
70 | }; | |
71 | ||
72 | static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context) | |
73 | { | |
74 | context->next = backTrack->lastContext; | |
75 | backTrack->lastContext = context; | |
76 | ++backTrack->matchAmount; | |
77 | } | |
78 | ||
79 | static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack) | |
80 | { | |
81 | ASSERT(backTrack->matchAmount); | |
82 | ASSERT(backTrack->lastContext); | |
83 | backTrack->lastContext = backTrack->lastContext->next; | |
84 | --backTrack->matchAmount; | |
85 | } | |
86 | ||
87 | struct DisjunctionContext | |
88 | { | |
89 | DisjunctionContext() | |
90 | : term(0) | |
91 | { | |
92 | } | |
f9bf01c6 | 93 | |
ba379fdc A |
94 | void* operator new(size_t, void* where) |
95 | { | |
96 | return where; | |
97 | } | |
98 | ||
99 | int term; | |
100 | unsigned matchBegin; | |
101 | unsigned matchEnd; | |
102 | uintptr_t frame[1]; | |
103 | }; | |
104 | ||
105 | DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction) | |
106 | { | |
107 | return new(malloc(sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) DisjunctionContext(); | |
108 | } | |
109 | ||
110 | void freeDisjunctionContext(DisjunctionContext* context) | |
111 | { | |
112 | free(context); | |
113 | } | |
114 | ||
115 | struct ParenthesesDisjunctionContext | |
116 | { | |
117 | ParenthesesDisjunctionContext(int* output, ByteTerm& term) | |
118 | : next(0) | |
119 | { | |
120 | unsigned firstSubpatternId = term.atom.subpatternId; | |
121 | unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns; | |
122 | ||
123 | for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) { | |
124 | subpatternBackup[i] = output[(firstSubpatternId << 1) + i]; | |
125 | output[(firstSubpatternId << 1) + i] = -1; | |
126 | } | |
f9bf01c6 | 127 | |
ba379fdc A |
128 | new(getDisjunctionContext(term)) DisjunctionContext(); |
129 | } | |
130 | ||
131 | void* operator new(size_t, void* where) | |
132 | { | |
133 | return where; | |
134 | } | |
135 | ||
136 | void restoreOutput(int* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns) | |
137 | { | |
138 | for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) | |
139 | output[(firstSubpatternId << 1) + i] = subpatternBackup[i]; | |
140 | } | |
f9bf01c6 | 141 | |
ba379fdc A |
142 | DisjunctionContext* getDisjunctionContext(ByteTerm& term) |
143 | { | |
144 | return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1])); | |
145 | } | |
146 | ||
147 | ParenthesesDisjunctionContext* next; | |
148 | int subpatternBackup[1]; | |
149 | }; | |
150 | ||
151 | ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term) | |
152 | { | |
153 | return new(malloc(sizeof(ParenthesesDisjunctionContext) + (((term.atom.parenthesesDisjunction->m_numSubpatterns << 1) - 1) * sizeof(int)) + sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) ParenthesesDisjunctionContext(output, term); | |
154 | } | |
155 | ||
156 | void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context) | |
157 | { | |
158 | free(context); | |
159 | } | |
160 | ||
161 | class InputStream { | |
162 | public: | |
163 | InputStream(const UChar* input, unsigned start, unsigned length) | |
164 | : input(input) | |
165 | , pos(start) | |
166 | , length(length) | |
167 | { | |
168 | } | |
169 | ||
170 | void next() | |
171 | { | |
172 | ++pos; | |
173 | } | |
174 | ||
175 | void rewind(unsigned amount) | |
176 | { | |
177 | ASSERT(pos >= amount); | |
178 | pos -= amount; | |
179 | } | |
180 | ||
181 | int read() | |
182 | { | |
183 | ASSERT(pos < length); | |
184 | if (pos < length) | |
185 | return input[pos]; | |
186 | return -1; | |
187 | } | |
188 | ||
189 | int readChecked(int position) | |
190 | { | |
191 | ASSERT(position < 0); | |
192 | ASSERT((unsigned)-position <= pos); | |
193 | unsigned p = pos + position; | |
194 | ASSERT(p < length); | |
195 | return input[p]; | |
196 | } | |
197 | ||
198 | int reread(unsigned from) | |
199 | { | |
200 | ASSERT(from < length); | |
201 | return input[from]; | |
202 | } | |
203 | ||
204 | int prev() | |
205 | { | |
206 | ASSERT(!(pos > length)); | |
207 | if (pos && length) | |
208 | return input[pos - 1]; | |
209 | return -1; | |
210 | } | |
f9bf01c6 | 211 | |
ba379fdc A |
212 | unsigned getPos() |
213 | { | |
214 | return pos; | |
215 | } | |
216 | ||
217 | void setPos(unsigned p) | |
218 | { | |
219 | pos = p; | |
220 | } | |
f9bf01c6 | 221 | |
ba379fdc A |
222 | bool atStart() |
223 | { | |
224 | return pos == 0; | |
225 | } | |
226 | ||
227 | bool atEnd() | |
228 | { | |
229 | return pos == length; | |
230 | } | |
231 | ||
232 | bool checkInput(int count) | |
233 | { | |
234 | if ((pos + count) <= length) { | |
235 | pos += count; | |
236 | return true; | |
237 | } else | |
238 | return false; | |
239 | } | |
240 | ||
241 | void uncheckInput(int count) | |
242 | { | |
243 | pos -= count; | |
244 | } | |
245 | ||
246 | bool atStart(int position) | |
247 | { | |
248 | return (pos + position) == 0; | |
249 | } | |
250 | ||
251 | bool atEnd(int position) | |
252 | { | |
253 | return (pos + position) == length; | |
254 | } | |
255 | ||
256 | private: | |
257 | const UChar* input; | |
258 | unsigned pos; | |
259 | unsigned length; | |
260 | }; | |
261 | ||
262 | bool testCharacterClass(CharacterClass* characterClass, int ch) | |
263 | { | |
264 | if (ch & 0xFF80) { | |
265 | for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i) | |
266 | if (ch == characterClass->m_matchesUnicode[i]) | |
267 | return true; | |
268 | for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i) | |
269 | if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end)) | |
270 | return true; | |
271 | } else { | |
272 | for (unsigned i = 0; i < characterClass->m_matches.size(); ++i) | |
273 | if (ch == characterClass->m_matches[i]) | |
274 | return true; | |
275 | for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i) | |
276 | if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end)) | |
277 | return true; | |
278 | } | |
279 | ||
280 | return false; | |
281 | } | |
282 | ||
283 | bool tryConsumeCharacter(int testChar) | |
284 | { | |
285 | if (input.atEnd()) | |
286 | return false; | |
f9bf01c6 | 287 | |
ba379fdc A |
288 | int ch = input.read(); |
289 | ||
290 | if (pattern->m_ignoreCase ? ((Unicode::toLower(testChar) == ch) || (Unicode::toUpper(testChar) == ch)) : (testChar == ch)) { | |
291 | input.next(); | |
292 | return true; | |
293 | } | |
294 | return false; | |
295 | } | |
296 | ||
297 | bool checkCharacter(int testChar, int inputPosition) | |
298 | { | |
299 | return testChar == input.readChecked(inputPosition); | |
300 | } | |
301 | ||
302 | bool checkCasedCharacter(int loChar, int hiChar, int inputPosition) | |
303 | { | |
304 | int ch = input.readChecked(inputPosition); | |
305 | return (loChar == ch) || (hiChar == ch); | |
306 | } | |
307 | ||
308 | bool tryConsumeCharacterClass(CharacterClass* characterClass, bool invert) | |
309 | { | |
310 | if (input.atEnd()) | |
311 | return false; | |
312 | ||
313 | bool match = testCharacterClass(characterClass, input.read()); | |
314 | ||
315 | if (invert) | |
316 | match = !match; | |
317 | ||
318 | if (match) { | |
319 | input.next(); | |
320 | return true; | |
321 | } | |
322 | return false; | |
323 | } | |
324 | ||
325 | bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition) | |
326 | { | |
327 | bool match = testCharacterClass(characterClass, input.readChecked(inputPosition)); | |
328 | return invert ? !match : match; | |
329 | } | |
330 | ||
331 | bool tryConsumeBackReference(int matchBegin, int matchEnd, int inputOffset) | |
332 | { | |
333 | int matchSize = matchEnd - matchBegin; | |
334 | ||
335 | if (!input.checkInput(matchSize)) | |
336 | return false; | |
337 | ||
338 | for (int i = 0; i < matchSize; ++i) { | |
339 | if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) { | |
340 | input.uncheckInput(matchSize); | |
341 | return false; | |
342 | } | |
343 | } | |
f9bf01c6 | 344 | |
ba379fdc A |
345 | return true; |
346 | } | |
347 | ||
348 | bool matchAssertionBOL(ByteTerm& term) | |
349 | { | |
350 | return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition - 1))); | |
351 | } | |
352 | ||
353 | bool matchAssertionEOL(ByteTerm& term) | |
354 | { | |
355 | if (term.inputPosition) | |
356 | return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition))); | |
357 | else | |
358 | return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read())); | |
359 | } | |
360 | ||
361 | bool matchAssertionWordBoundary(ByteTerm& term) | |
362 | { | |
363 | bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition - 1)); | |
364 | bool readIsWordchar; | |
365 | if (term.inputPosition) | |
366 | readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition)); | |
367 | else | |
368 | readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read()); | |
369 | ||
370 | bool wordBoundary = prevIsWordchar != readIsWordchar; | |
371 | return term.invert() ? !wordBoundary : wordBoundary; | |
372 | } | |
373 | ||
374 | bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context) | |
375 | { | |
376 | BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); | |
377 | ||
378 | switch (term.atom.quantityType) { | |
379 | case QuantifierFixedCount: | |
380 | break; | |
381 | ||
382 | case QuantifierGreedy: | |
383 | if (backTrack->matchAmount) { | |
384 | --backTrack->matchAmount; | |
385 | input.uncheckInput(1); | |
386 | return true; | |
387 | } | |
388 | break; | |
389 | ||
390 | case QuantifierNonGreedy: | |
391 | if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { | |
392 | ++backTrack->matchAmount; | |
393 | if (checkCharacter(term.atom.patternCharacter, term.inputPosition - 1)) | |
394 | return true; | |
395 | } | |
396 | input.uncheckInput(backTrack->matchAmount); | |
397 | break; | |
398 | } | |
399 | ||
400 | return false; | |
401 | } | |
402 | ||
403 | bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context) | |
404 | { | |
405 | BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); | |
406 | ||
407 | switch (term.atom.quantityType) { | |
408 | case QuantifierFixedCount: | |
409 | break; | |
410 | ||
411 | case QuantifierGreedy: | |
412 | if (backTrack->matchAmount) { | |
413 | --backTrack->matchAmount; | |
414 | input.uncheckInput(1); | |
415 | return true; | |
416 | } | |
417 | break; | |
418 | ||
419 | case QuantifierNonGreedy: | |
420 | if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { | |
421 | ++backTrack->matchAmount; | |
422 | if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition - 1)) | |
423 | return true; | |
424 | } | |
425 | input.uncheckInput(backTrack->matchAmount); | |
426 | break; | |
427 | } | |
428 | ||
429 | return false; | |
430 | } | |
431 | ||
432 | bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context) | |
433 | { | |
434 | ASSERT(term.type == ByteTerm::TypeCharacterClass); | |
435 | BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); | |
436 | ||
437 | switch (term.atom.quantityType) { | |
438 | case QuantifierFixedCount: { | |
439 | for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { | |
440 | if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + matchAmount)) | |
441 | return false; | |
442 | } | |
443 | return true; | |
444 | } | |
445 | ||
446 | case QuantifierGreedy: { | |
447 | unsigned matchAmount = 0; | |
448 | while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) { | |
449 | if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) { | |
450 | input.uncheckInput(1); | |
451 | break; | |
452 | } | |
453 | ++matchAmount; | |
454 | } | |
455 | backTrack->matchAmount = matchAmount; | |
456 | ||
457 | return true; | |
458 | } | |
459 | ||
460 | case QuantifierNonGreedy: | |
461 | backTrack->matchAmount = 0; | |
462 | return true; | |
463 | } | |
464 | ||
465 | ASSERT_NOT_REACHED(); | |
466 | return false; | |
467 | } | |
468 | ||
469 | bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context) | |
470 | { | |
471 | ASSERT(term.type == ByteTerm::TypeCharacterClass); | |
472 | BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); | |
473 | ||
474 | switch (term.atom.quantityType) { | |
475 | case QuantifierFixedCount: | |
476 | break; | |
477 | ||
478 | case QuantifierGreedy: | |
479 | if (backTrack->matchAmount) { | |
480 | --backTrack->matchAmount; | |
481 | input.uncheckInput(1); | |
482 | return true; | |
483 | } | |
484 | break; | |
485 | ||
486 | case QuantifierNonGreedy: | |
487 | if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { | |
488 | ++backTrack->matchAmount; | |
489 | if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) | |
490 | return true; | |
491 | } | |
492 | input.uncheckInput(backTrack->matchAmount); | |
493 | break; | |
494 | } | |
495 | ||
496 | return false; | |
497 | } | |
498 | ||
499 | bool matchBackReference(ByteTerm& term, DisjunctionContext* context) | |
500 | { | |
501 | ASSERT(term.type == ByteTerm::TypeBackReference); | |
502 | BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation); | |
503 | ||
504 | int matchBegin = output[(term.atom.subpatternId << 1)]; | |
505 | int matchEnd = output[(term.atom.subpatternId << 1) + 1]; | |
506 | ASSERT((matchBegin == -1) == (matchEnd == -1)); | |
507 | ASSERT(matchBegin <= matchEnd); | |
508 | ||
509 | if (matchBegin == matchEnd) | |
510 | return true; | |
511 | ||
512 | switch (term.atom.quantityType) { | |
513 | case QuantifierFixedCount: { | |
514 | backTrack->begin = input.getPos(); | |
515 | for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { | |
516 | if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { | |
517 | input.setPos(backTrack->begin); | |
518 | return false; | |
519 | } | |
520 | } | |
521 | return true; | |
522 | } | |
523 | ||
524 | case QuantifierGreedy: { | |
525 | unsigned matchAmount = 0; | |
526 | while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) | |
527 | ++matchAmount; | |
528 | backTrack->matchAmount = matchAmount; | |
529 | return true; | |
530 | } | |
531 | ||
532 | case QuantifierNonGreedy: | |
533 | backTrack->begin = input.getPos(); | |
534 | backTrack->matchAmount = 0; | |
535 | return true; | |
536 | } | |
537 | ||
538 | ASSERT_NOT_REACHED(); | |
539 | return false; | |
540 | } | |
541 | ||
542 | bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context) | |
543 | { | |
544 | ASSERT(term.type == ByteTerm::TypeBackReference); | |
545 | BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation); | |
546 | ||
547 | int matchBegin = output[(term.atom.subpatternId << 1)]; | |
548 | int matchEnd = output[(term.atom.subpatternId << 1) + 1]; | |
549 | ASSERT((matchBegin == -1) == (matchEnd == -1)); | |
550 | ASSERT(matchBegin <= matchEnd); | |
551 | ||
552 | if (matchBegin == matchEnd) | |
553 | return false; | |
554 | ||
555 | switch (term.atom.quantityType) { | |
556 | case QuantifierFixedCount: | |
557 | // for quantityCount == 1, could rewind. | |
558 | input.setPos(backTrack->begin); | |
559 | break; | |
560 | ||
561 | case QuantifierGreedy: | |
562 | if (backTrack->matchAmount) { | |
563 | --backTrack->matchAmount; | |
564 | input.rewind(matchEnd - matchBegin); | |
565 | return true; | |
566 | } | |
567 | break; | |
568 | ||
569 | case QuantifierNonGreedy: | |
570 | if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { | |
571 | ++backTrack->matchAmount; | |
572 | return true; | |
573 | } else | |
574 | input.setPos(backTrack->begin); | |
575 | break; | |
576 | } | |
577 | ||
578 | return false; | |
579 | } | |
580 | ||
581 | void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context) | |
582 | { | |
583 | if (term.capture()) { | |
584 | unsigned subpatternId = term.atom.subpatternId; | |
585 | output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition; | |
586 | output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition; | |
587 | } | |
588 | } | |
589 | void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context) | |
590 | { | |
591 | unsigned firstSubpatternId = term.atom.subpatternId; | |
592 | unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns; | |
593 | context->restoreOutput(output, firstSubpatternId, count); | |
594 | } | |
595 | void resetAssertionMatches(ByteTerm& term) | |
596 | { | |
597 | unsigned firstSubpatternId = term.atom.subpatternId; | |
598 | unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns; | |
599 | for (unsigned i = 0; i < (count << 1); ++i) | |
600 | output[(firstSubpatternId << 1) + i] = -1; | |
601 | } | |
602 | bool parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack) | |
603 | { | |
604 | while (backTrack->matchAmount) { | |
605 | ParenthesesDisjunctionContext* context = backTrack->lastContext; | |
606 | ||
607 | if (matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true)) | |
608 | return true; | |
f9bf01c6 | 609 | |
ba379fdc | 610 | resetMatches(term, context); |
ba379fdc | 611 | popParenthesesDisjunctionContext(backTrack); |
f9bf01c6 | 612 | freeParenthesesDisjunctionContext(context); |
ba379fdc A |
613 | } |
614 | ||
615 | return false; | |
616 | } | |
617 | ||
618 | bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) | |
619 | { | |
620 | ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); | |
621 | ASSERT(term.atom.quantityCount == 1); | |
622 | ||
623 | BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); | |
624 | ||
625 | switch (term.atom.quantityType) { | |
626 | case QuantifierGreedy: { | |
627 | // set this speculatively; if we get to the parens end this will be true. | |
628 | backTrack->inParentheses = 1; | |
629 | break; | |
630 | } | |
631 | case QuantifierNonGreedy: { | |
632 | backTrack->inParentheses = 0; | |
633 | context->term += term.atom.parenthesesWidth; | |
634 | return true; | |
635 | } | |
636 | case QuantifierFixedCount: | |
637 | break; | |
638 | } | |
639 | ||
640 | if (term.capture()) { | |
641 | unsigned subpatternId = term.atom.subpatternId; | |
642 | output[(subpatternId << 1)] = input.getPos() + term.inputPosition; | |
643 | } | |
644 | ||
645 | return true; | |
646 | } | |
647 | ||
648 | bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext*) | |
649 | { | |
650 | ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); | |
651 | ASSERT(term.atom.quantityCount == 1); | |
652 | ||
653 | if (term.capture()) { | |
654 | unsigned subpatternId = term.atom.subpatternId; | |
655 | output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition; | |
656 | } | |
657 | return true; | |
658 | } | |
659 | ||
660 | bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) | |
661 | { | |
662 | ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); | |
663 | ASSERT(term.atom.quantityCount == 1); | |
664 | ||
665 | BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); | |
666 | ||
667 | if (term.capture()) { | |
668 | unsigned subpatternId = term.atom.subpatternId; | |
669 | output[(subpatternId << 1)] = -1; | |
670 | output[(subpatternId << 1) + 1] = -1; | |
671 | } | |
672 | ||
673 | switch (term.atom.quantityType) { | |
674 | case QuantifierGreedy: | |
675 | // if we backtrack to this point, there is another chance - try matching nothing. | |
676 | ASSERT(backTrack->inParentheses); | |
677 | backTrack->inParentheses = 0; | |
678 | context->term += term.atom.parenthesesWidth; | |
679 | return true; | |
680 | case QuantifierNonGreedy: | |
681 | ASSERT(backTrack->inParentheses); | |
682 | case QuantifierFixedCount: | |
683 | break; | |
684 | } | |
685 | ||
686 | return false; | |
687 | } | |
688 | ||
689 | bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) | |
690 | { | |
691 | ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); | |
692 | ASSERT(term.atom.quantityCount == 1); | |
693 | ||
694 | BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); | |
695 | ||
696 | switch (term.atom.quantityType) { | |
697 | case QuantifierGreedy: | |
698 | if (!backTrack->inParentheses) { | |
699 | context->term -= term.atom.parenthesesWidth; | |
700 | return false; | |
701 | } | |
702 | case QuantifierNonGreedy: | |
703 | if (!backTrack->inParentheses) { | |
704 | // now try to match the parens; set this speculatively. | |
705 | backTrack->inParentheses = 1; | |
706 | if (term.capture()) { | |
707 | unsigned subpatternId = term.atom.subpatternId; | |
708 | output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition; | |
709 | } | |
710 | context->term -= term.atom.parenthesesWidth; | |
711 | return true; | |
712 | } | |
713 | case QuantifierFixedCount: | |
714 | break; | |
715 | } | |
716 | ||
717 | return false; | |
718 | } | |
719 | ||
720 | bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) | |
721 | { | |
722 | ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); | |
723 | ASSERT(term.atom.quantityCount == 1); | |
724 | ||
725 | BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); | |
726 | ||
727 | backTrack->begin = input.getPos(); | |
728 | return true; | |
729 | } | |
730 | ||
731 | bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) | |
732 | { | |
733 | ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); | |
734 | ASSERT(term.atom.quantityCount == 1); | |
735 | ||
736 | BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); | |
737 | ||
738 | input.setPos(backTrack->begin); | |
739 | ||
740 | // We've reached the end of the parens; if they are inverted, this is failure. | |
741 | if (term.invert()) { | |
742 | context->term -= term.atom.parenthesesWidth; | |
743 | return false; | |
744 | } | |
745 | ||
746 | return true; | |
747 | } | |
748 | ||
749 | bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) | |
750 | { | |
751 | ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); | |
752 | ASSERT(term.atom.quantityCount == 1); | |
753 | ||
754 | // We've failed to match parens; if they are inverted, this is win! | |
755 | if (term.invert()) { | |
756 | context->term += term.atom.parenthesesWidth; | |
757 | return true; | |
758 | } | |
759 | ||
760 | return false; | |
761 | } | |
762 | ||
763 | bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) | |
764 | { | |
765 | ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); | |
766 | ASSERT(term.atom.quantityCount == 1); | |
767 | ||
768 | BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); | |
769 | ||
770 | input.setPos(backTrack->begin); | |
771 | ||
772 | context->term -= term.atom.parenthesesWidth; | |
773 | return false; | |
774 | } | |
775 | ||
776 | bool matchParentheses(ByteTerm& term, DisjunctionContext* context) | |
777 | { | |
778 | ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); | |
779 | ||
780 | BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation); | |
781 | ||
782 | unsigned subpatternId = term.atom.subpatternId; | |
783 | ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; | |
784 | ||
785 | backTrack->prevBegin = output[(subpatternId << 1)]; | |
786 | backTrack->prevEnd = output[(subpatternId << 1) + 1]; | |
787 | ||
788 | backTrack->matchAmount = 0; | |
789 | backTrack->lastContext = 0; | |
790 | ||
791 | switch (term.atom.quantityType) { | |
792 | case QuantifierFixedCount: { | |
793 | // While we haven't yet reached our fixed limit, | |
794 | while (backTrack->matchAmount < term.atom.quantityCount) { | |
795 | // Try to do a match, and it it succeeds, add it to the list. | |
796 | ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); | |
797 | if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term))) | |
798 | appendParenthesesDisjunctionContext(backTrack, context); | |
799 | else { | |
800 | // The match failed; try to find an alternate point to carry on from. | |
801 | resetMatches(term, context); | |
802 | freeParenthesesDisjunctionContext(context); | |
803 | if (!parenthesesDoBacktrack(term, backTrack)) | |
804 | return false; | |
805 | } | |
806 | } | |
807 | ||
808 | ASSERT(backTrack->matchAmount == term.atom.quantityCount); | |
809 | ParenthesesDisjunctionContext* context = backTrack->lastContext; | |
810 | recordParenthesesMatch(term, context); | |
811 | return true; | |
812 | } | |
813 | ||
814 | case QuantifierGreedy: { | |
815 | while (backTrack->matchAmount < term.atom.quantityCount) { | |
816 | ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); | |
817 | if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) | |
818 | appendParenthesesDisjunctionContext(backTrack, context); | |
819 | else { | |
820 | resetMatches(term, context); | |
821 | freeParenthesesDisjunctionContext(context); | |
822 | break; | |
823 | } | |
824 | } | |
825 | ||
826 | if (backTrack->matchAmount) { | |
827 | ParenthesesDisjunctionContext* context = backTrack->lastContext; | |
828 | recordParenthesesMatch(term, context); | |
829 | } | |
830 | return true; | |
831 | } | |
832 | ||
833 | case QuantifierNonGreedy: | |
834 | return true; | |
835 | } | |
836 | ||
837 | ASSERT_NOT_REACHED(); | |
838 | return false; | |
839 | } | |
840 | ||
841 | // Rules for backtracking differ depending on whether this is greedy or non-greedy. | |
842 | // | |
843 | // Greedy matches never should try just adding more - you should already have done | |
844 | // the 'more' cases. Always backtrack, at least a leetle bit. However cases where | |
845 | // you backtrack an item off the list needs checking, since we'll never have matched | |
846 | // the one less case. Tracking forwards, still add as much as possible. | |
847 | // | |
848 | // Non-greedy, we've already done the one less case, so don't match on popping. | |
849 | // We haven't done the one more case, so always try to add that. | |
850 | // | |
851 | bool backtrackParentheses(ByteTerm& term, DisjunctionContext* context) | |
852 | { | |
853 | ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); | |
854 | ||
855 | BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation); | |
856 | ||
857 | if (term.capture()) { | |
858 | unsigned subpatternId = term.atom.subpatternId; | |
859 | output[(subpatternId << 1)] = backTrack->prevBegin; | |
860 | output[(subpatternId << 1) + 1] = backTrack->prevEnd; | |
861 | } | |
862 | ||
863 | ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; | |
864 | ||
865 | switch (term.atom.quantityType) { | |
866 | case QuantifierFixedCount: { | |
867 | ASSERT(backTrack->matchAmount == term.atom.quantityCount); | |
868 | ||
869 | ParenthesesDisjunctionContext* context = 0; | |
870 | ||
871 | if (!parenthesesDoBacktrack(term, backTrack)) | |
872 | return false; | |
873 | ||
874 | // While we haven't yet reached our fixed limit, | |
875 | while (backTrack->matchAmount < term.atom.quantityCount) { | |
876 | // Try to do a match, and it it succeeds, add it to the list. | |
877 | context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); | |
878 | if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term))) | |
879 | appendParenthesesDisjunctionContext(backTrack, context); | |
880 | else { | |
881 | // The match failed; try to find an alternate point to carry on from. | |
882 | resetMatches(term, context); | |
883 | freeParenthesesDisjunctionContext(context); | |
884 | if (!parenthesesDoBacktrack(term, backTrack)) | |
885 | return false; | |
886 | } | |
887 | } | |
888 | ||
889 | ASSERT(backTrack->matchAmount == term.atom.quantityCount); | |
890 | context = backTrack->lastContext; | |
891 | recordParenthesesMatch(term, context); | |
892 | return true; | |
893 | } | |
894 | ||
895 | case QuantifierGreedy: { | |
896 | if (!backTrack->matchAmount) | |
897 | return false; | |
898 | ||
899 | ParenthesesDisjunctionContext* context = backTrack->lastContext; | |
900 | if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) { | |
901 | while (backTrack->matchAmount < term.atom.quantityCount) { | |
902 | ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); | |
903 | if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) | |
904 | appendParenthesesDisjunctionContext(backTrack, context); | |
905 | else { | |
906 | resetMatches(term, context); | |
907 | freeParenthesesDisjunctionContext(context); | |
908 | break; | |
909 | } | |
910 | } | |
911 | } else { | |
912 | resetMatches(term, context); | |
ba379fdc | 913 | popParenthesesDisjunctionContext(backTrack); |
f9bf01c6 | 914 | freeParenthesesDisjunctionContext(context); |
ba379fdc A |
915 | } |
916 | ||
917 | if (backTrack->matchAmount) { | |
918 | ParenthesesDisjunctionContext* context = backTrack->lastContext; | |
919 | recordParenthesesMatch(term, context); | |
920 | } | |
921 | return true; | |
922 | } | |
923 | ||
924 | case QuantifierNonGreedy: { | |
925 | // If we've not reached the limit, try to add one more match. | |
926 | if (backTrack->matchAmount < term.atom.quantityCount) { | |
927 | ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); | |
928 | if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) { | |
929 | appendParenthesesDisjunctionContext(backTrack, context); | |
930 | recordParenthesesMatch(term, context); | |
931 | return true; | |
932 | } else { | |
933 | resetMatches(term, context); | |
934 | freeParenthesesDisjunctionContext(context); | |
935 | } | |
936 | } | |
937 | ||
938 | // Nope - okay backtrack looking for an alternative. | |
939 | while (backTrack->matchAmount) { | |
940 | ParenthesesDisjunctionContext* context = backTrack->lastContext; | |
941 | if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) { | |
942 | // successful backtrack! we're back in the game! | |
943 | if (backTrack->matchAmount) { | |
944 | context = backTrack->lastContext; | |
945 | recordParenthesesMatch(term, context); | |
946 | } | |
947 | return true; | |
948 | } | |
f9bf01c6 | 949 | |
ba379fdc A |
950 | // pop a match off the stack |
951 | resetMatches(term, context); | |
ba379fdc | 952 | popParenthesesDisjunctionContext(backTrack); |
f9bf01c6 | 953 | freeParenthesesDisjunctionContext(context); |
ba379fdc A |
954 | } |
955 | ||
956 | return false; | |
957 | } | |
958 | } | |
959 | ||
960 | ASSERT_NOT_REACHED(); | |
961 | return false; | |
962 | } | |
963 | ||
964 | #define MATCH_NEXT() { ++context->term; goto matchAgain; } | |
965 | #define BACKTRACK() { --context->term; goto backtrack; } | |
966 | #define currentTerm() (disjunction->terms[context->term]) | |
967 | bool matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) | |
968 | { | |
969 | if (btrack) | |
970 | BACKTRACK(); | |
971 | ||
972 | context->matchBegin = input.getPos(); | |
973 | context->term = 0; | |
974 | ||
975 | matchAgain: | |
976 | ASSERT(context->term < static_cast<int>(disjunction->terms.size())); | |
977 | ||
978 | switch (currentTerm().type) { | |
979 | case ByteTerm::TypeSubpatternBegin: | |
980 | MATCH_NEXT(); | |
981 | case ByteTerm::TypeSubpatternEnd: | |
982 | context->matchEnd = input.getPos(); | |
983 | return true; | |
984 | ||
985 | case ByteTerm::TypeBodyAlternativeBegin: | |
986 | MATCH_NEXT(); | |
987 | case ByteTerm::TypeBodyAlternativeDisjunction: | |
988 | case ByteTerm::TypeBodyAlternativeEnd: | |
989 | context->matchEnd = input.getPos(); | |
990 | return true; | |
991 | ||
992 | case ByteTerm::TypeAlternativeBegin: | |
993 | MATCH_NEXT(); | |
994 | case ByteTerm::TypeAlternativeDisjunction: | |
995 | case ByteTerm::TypeAlternativeEnd: { | |
996 | int offset = currentTerm().alternative.end; | |
997 | BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation); | |
998 | backTrack->offset = offset; | |
999 | context->term += offset; | |
1000 | MATCH_NEXT(); | |
1001 | } | |
1002 | ||
1003 | case ByteTerm::TypeAssertionBOL: | |
1004 | if (matchAssertionBOL(currentTerm())) | |
1005 | MATCH_NEXT(); | |
1006 | BACKTRACK(); | |
1007 | case ByteTerm::TypeAssertionEOL: | |
1008 | if (matchAssertionEOL(currentTerm())) | |
1009 | MATCH_NEXT(); | |
1010 | BACKTRACK(); | |
1011 | case ByteTerm::TypeAssertionWordBoundary: | |
1012 | if (matchAssertionWordBoundary(currentTerm())) | |
1013 | MATCH_NEXT(); | |
1014 | BACKTRACK(); | |
1015 | ||
1016 | case ByteTerm::TypePatternCharacterOnce: | |
1017 | case ByteTerm::TypePatternCharacterFixed: { | |
1018 | for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { | |
1019 | if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + matchAmount)) | |
1020 | BACKTRACK(); | |
1021 | } | |
1022 | MATCH_NEXT(); | |
1023 | } | |
1024 | case ByteTerm::TypePatternCharacterGreedy: { | |
1025 | BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); | |
1026 | unsigned matchAmount = 0; | |
1027 | while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { | |
1028 | if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 1)) { | |
1029 | input.uncheckInput(1); | |
1030 | break; | |
1031 | } | |
1032 | ++matchAmount; | |
1033 | } | |
1034 | backTrack->matchAmount = matchAmount; | |
1035 | ||
1036 | MATCH_NEXT(); | |
1037 | } | |
1038 | case ByteTerm::TypePatternCharacterNonGreedy: { | |
1039 | BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); | |
1040 | backTrack->matchAmount = 0; | |
1041 | MATCH_NEXT(); | |
1042 | } | |
1043 | ||
1044 | case ByteTerm::TypePatternCasedCharacterOnce: | |
1045 | case ByteTerm::TypePatternCasedCharacterFixed: { | |
1046 | for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { | |
1047 | if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + matchAmount)) | |
1048 | BACKTRACK(); | |
1049 | } | |
1050 | MATCH_NEXT(); | |
1051 | } | |
1052 | case ByteTerm::TypePatternCasedCharacterGreedy: { | |
1053 | BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); | |
1054 | unsigned matchAmount = 0; | |
1055 | while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { | |
1056 | if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - 1)) { | |
1057 | input.uncheckInput(1); | |
1058 | break; | |
1059 | } | |
1060 | ++matchAmount; | |
1061 | } | |
1062 | backTrack->matchAmount = matchAmount; | |
1063 | ||
1064 | MATCH_NEXT(); | |
1065 | } | |
1066 | case ByteTerm::TypePatternCasedCharacterNonGreedy: { | |
1067 | BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); | |
1068 | backTrack->matchAmount = 0; | |
1069 | MATCH_NEXT(); | |
1070 | } | |
1071 | ||
1072 | case ByteTerm::TypeCharacterClass: | |
1073 | if (matchCharacterClass(currentTerm(), context)) | |
1074 | MATCH_NEXT(); | |
1075 | BACKTRACK(); | |
1076 | case ByteTerm::TypeBackReference: | |
1077 | if (matchBackReference(currentTerm(), context)) | |
1078 | MATCH_NEXT(); | |
1079 | BACKTRACK(); | |
1080 | case ByteTerm::TypeParenthesesSubpattern: | |
1081 | if (matchParentheses(currentTerm(), context)) | |
1082 | MATCH_NEXT(); | |
1083 | BACKTRACK(); | |
1084 | case ByteTerm::TypeParenthesesSubpatternOnceBegin: | |
1085 | if (matchParenthesesOnceBegin(currentTerm(), context)) | |
1086 | MATCH_NEXT(); | |
1087 | BACKTRACK(); | |
1088 | case ByteTerm::TypeParenthesesSubpatternOnceEnd: | |
1089 | if (matchParenthesesOnceEnd(currentTerm(), context)) | |
1090 | MATCH_NEXT(); | |
1091 | BACKTRACK(); | |
1092 | case ByteTerm::TypeParentheticalAssertionBegin: | |
1093 | if (matchParentheticalAssertionBegin(currentTerm(), context)) | |
1094 | MATCH_NEXT(); | |
1095 | BACKTRACK(); | |
1096 | case ByteTerm::TypeParentheticalAssertionEnd: | |
1097 | if (matchParentheticalAssertionEnd(currentTerm(), context)) | |
1098 | MATCH_NEXT(); | |
1099 | BACKTRACK(); | |
1100 | ||
1101 | case ByteTerm::TypeCheckInput: | |
1102 | if (input.checkInput(currentTerm().checkInputCount)) | |
1103 | MATCH_NEXT(); | |
1104 | BACKTRACK(); | |
1105 | } | |
1106 | ||
1107 | // We should never fall-through to here. | |
1108 | ASSERT_NOT_REACHED(); | |
1109 | ||
1110 | backtrack: | |
1111 | ASSERT(context->term < static_cast<int>(disjunction->terms.size())); | |
1112 | ||
1113 | switch (currentTerm().type) { | |
1114 | case ByteTerm::TypeSubpatternBegin: | |
1115 | return false; | |
1116 | case ByteTerm::TypeSubpatternEnd: | |
1117 | ASSERT_NOT_REACHED(); | |
1118 | ||
1119 | case ByteTerm::TypeBodyAlternativeBegin: | |
1120 | case ByteTerm::TypeBodyAlternativeDisjunction: { | |
1121 | int offset = currentTerm().alternative.next; | |
1122 | context->term += offset; | |
1123 | if (offset > 0) | |
1124 | MATCH_NEXT(); | |
1125 | ||
1126 | if (input.atEnd()) | |
1127 | return false; | |
1128 | ||
1129 | input.next(); | |
1130 | context->matchBegin = input.getPos(); | |
1131 | MATCH_NEXT(); | |
1132 | } | |
1133 | case ByteTerm::TypeBodyAlternativeEnd: | |
1134 | ASSERT_NOT_REACHED(); | |
1135 | ||
1136 | case ByteTerm::TypeAlternativeBegin: | |
1137 | case ByteTerm::TypeAlternativeDisjunction: { | |
1138 | int offset = currentTerm().alternative.next; | |
1139 | context->term += offset; | |
1140 | if (offset > 0) | |
1141 | MATCH_NEXT(); | |
1142 | BACKTRACK(); | |
1143 | } | |
1144 | case ByteTerm::TypeAlternativeEnd: { | |
1145 | // We should never backtrack back into an alternative of the main body of the regex. | |
1146 | BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation); | |
1147 | unsigned offset = backTrack->offset; | |
1148 | context->term -= offset; | |
1149 | BACKTRACK(); | |
1150 | } | |
1151 | ||
1152 | case ByteTerm::TypeAssertionBOL: | |
1153 | case ByteTerm::TypeAssertionEOL: | |
1154 | case ByteTerm::TypeAssertionWordBoundary: | |
1155 | BACKTRACK(); | |
1156 | ||
1157 | case ByteTerm::TypePatternCharacterOnce: | |
1158 | case ByteTerm::TypePatternCharacterFixed: | |
1159 | case ByteTerm::TypePatternCharacterGreedy: | |
1160 | case ByteTerm::TypePatternCharacterNonGreedy: | |
1161 | if (backtrackPatternCharacter(currentTerm(), context)) | |
1162 | MATCH_NEXT(); | |
1163 | BACKTRACK(); | |
1164 | case ByteTerm::TypePatternCasedCharacterOnce: | |
1165 | case ByteTerm::TypePatternCasedCharacterFixed: | |
1166 | case ByteTerm::TypePatternCasedCharacterGreedy: | |
1167 | case ByteTerm::TypePatternCasedCharacterNonGreedy: | |
1168 | if (backtrackPatternCasedCharacter(currentTerm(), context)) | |
1169 | MATCH_NEXT(); | |
1170 | BACKTRACK(); | |
1171 | case ByteTerm::TypeCharacterClass: | |
1172 | if (backtrackCharacterClass(currentTerm(), context)) | |
1173 | MATCH_NEXT(); | |
1174 | BACKTRACK(); | |
1175 | case ByteTerm::TypeBackReference: | |
1176 | if (backtrackBackReference(currentTerm(), context)) | |
1177 | MATCH_NEXT(); | |
1178 | BACKTRACK(); | |
1179 | case ByteTerm::TypeParenthesesSubpattern: | |
1180 | if (backtrackParentheses(currentTerm(), context)) | |
1181 | MATCH_NEXT(); | |
1182 | BACKTRACK(); | |
1183 | case ByteTerm::TypeParenthesesSubpatternOnceBegin: | |
1184 | if (backtrackParenthesesOnceBegin(currentTerm(), context)) | |
1185 | MATCH_NEXT(); | |
1186 | BACKTRACK(); | |
1187 | case ByteTerm::TypeParenthesesSubpatternOnceEnd: | |
1188 | if (backtrackParenthesesOnceEnd(currentTerm(), context)) | |
1189 | MATCH_NEXT(); | |
1190 | BACKTRACK(); | |
1191 | case ByteTerm::TypeParentheticalAssertionBegin: | |
1192 | if (backtrackParentheticalAssertionBegin(currentTerm(), context)) | |
1193 | MATCH_NEXT(); | |
1194 | BACKTRACK(); | |
1195 | case ByteTerm::TypeParentheticalAssertionEnd: | |
1196 | if (backtrackParentheticalAssertionEnd(currentTerm(), context)) | |
1197 | MATCH_NEXT(); | |
1198 | BACKTRACK(); | |
1199 | ||
1200 | case ByteTerm::TypeCheckInput: | |
1201 | input.uncheckInput(currentTerm().checkInputCount); | |
1202 | BACKTRACK(); | |
1203 | } | |
1204 | ||
1205 | ASSERT_NOT_REACHED(); | |
1206 | return false; | |
1207 | } | |
1208 | ||
1209 | bool matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) | |
1210 | { | |
1211 | if (matchDisjunction(disjunction, context, btrack)) { | |
1212 | while (context->matchBegin == context->matchEnd) { | |
1213 | if (!matchDisjunction(disjunction, context, true)) | |
1214 | return false; | |
1215 | } | |
1216 | return true; | |
1217 | } | |
1218 | ||
1219 | return false; | |
1220 | } | |
1221 | ||
1222 | int interpret() | |
1223 | { | |
1224 | for (unsigned i = 0; i < ((pattern->m_body->m_numSubpatterns + 1) << 1); ++i) | |
1225 | output[i] = -1; | |
1226 | ||
1227 | DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get()); | |
1228 | ||
1229 | if (matchDisjunction(pattern->m_body.get(), context)) { | |
1230 | output[0] = context->matchBegin; | |
1231 | output[1] = context->matchEnd; | |
1232 | } | |
1233 | ||
1234 | freeDisjunctionContext(context); | |
1235 | ||
1236 | return output[0]; | |
1237 | } | |
1238 | ||
1239 | Interpreter(BytecodePattern* pattern, int* output, const UChar* inputChar, unsigned start, unsigned length) | |
1240 | : pattern(pattern) | |
1241 | , output(output) | |
1242 | , input(inputChar, start, length) | |
1243 | { | |
1244 | } | |
1245 | ||
1246 | private: | |
1247 | BytecodePattern *pattern; | |
1248 | int* output; | |
1249 | InputStream input; | |
1250 | }; | |
1251 | ||
1252 | ||
1253 | ||
1254 | class ByteCompiler { | |
1255 | struct ParenthesesStackEntry { | |
1256 | unsigned beginTerm; | |
1257 | unsigned savedAlternativeIndex; | |
1258 | ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/) | |
1259 | : beginTerm(beginTerm) | |
1260 | , savedAlternativeIndex(savedAlternativeIndex) | |
1261 | { | |
1262 | } | |
1263 | }; | |
1264 | ||
1265 | public: | |
1266 | ByteCompiler(RegexPattern& pattern) | |
1267 | : m_pattern(pattern) | |
1268 | { | |
f9bf01c6 A |
1269 | m_bodyDisjunction = 0; |
1270 | m_currentAlternativeIndex = 0; | |
ba379fdc | 1271 | } |
f9bf01c6 | 1272 | |
ba379fdc A |
1273 | BytecodePattern* compile() |
1274 | { | |
1275 | regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize); | |
1276 | emitDisjunction(m_pattern.m_body); | |
1277 | regexEnd(); | |
1278 | ||
f9bf01c6 | 1279 | return new BytecodePattern(m_bodyDisjunction, m_allParenthesesInfo, m_pattern); |
ba379fdc | 1280 | } |
f9bf01c6 | 1281 | |
ba379fdc A |
1282 | void checkInput(unsigned count) |
1283 | { | |
f9bf01c6 | 1284 | m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count)); |
ba379fdc A |
1285 | } |
1286 | ||
1287 | void assertionBOL(int inputPosition) | |
1288 | { | |
f9bf01c6 | 1289 | m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition)); |
ba379fdc A |
1290 | } |
1291 | ||
1292 | void assertionEOL(int inputPosition) | |
1293 | { | |
f9bf01c6 | 1294 | m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition)); |
ba379fdc A |
1295 | } |
1296 | ||
1297 | void assertionWordBoundary(bool invert, int inputPosition) | |
1298 | { | |
f9bf01c6 | 1299 | m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition)); |
ba379fdc A |
1300 | } |
1301 | ||
1302 | void atomPatternCharacter(UChar ch, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) | |
1303 | { | |
1304 | if (m_pattern.m_ignoreCase) { | |
1305 | UChar lo = Unicode::toLower(ch); | |
1306 | UChar hi = Unicode::toUpper(ch); | |
f9bf01c6 | 1307 | |
ba379fdc | 1308 | if (lo != hi) { |
f9bf01c6 | 1309 | m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType)); |
ba379fdc A |
1310 | return; |
1311 | } | |
1312 | } | |
1313 | ||
f9bf01c6 | 1314 | m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType)); |
ba379fdc | 1315 | } |
f9bf01c6 | 1316 | |
ba379fdc A |
1317 | void atomCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) |
1318 | { | |
f9bf01c6 | 1319 | m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition)); |
ba379fdc | 1320 | |
f9bf01c6 A |
1321 | m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount; |
1322 | m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; | |
1323 | m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; | |
ba379fdc A |
1324 | } |
1325 | ||
1326 | void atomBackReference(unsigned subpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) | |
1327 | { | |
1328 | ASSERT(subpatternId); | |
1329 | ||
f9bf01c6 | 1330 | m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition)); |
ba379fdc | 1331 | |
f9bf01c6 A |
1332 | m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount; |
1333 | m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; | |
1334 | m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; | |
ba379fdc A |
1335 | } |
1336 | ||
1337 | void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) | |
1338 | { | |
f9bf01c6 | 1339 | int beginTerm = m_bodyDisjunction->terms.size(); |
ba379fdc | 1340 | |
f9bf01c6 A |
1341 | m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, inputPosition)); |
1342 | m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; | |
1343 | m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); | |
1344 | m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; | |
ba379fdc | 1345 | |
f9bf01c6 A |
1346 | m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); |
1347 | m_currentAlternativeIndex = beginTerm + 1; | |
ba379fdc A |
1348 | } |
1349 | ||
1350 | void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation) | |
1351 | { | |
f9bf01c6 | 1352 | int beginTerm = m_bodyDisjunction->terms.size(); |
ba379fdc | 1353 | |
f9bf01c6 A |
1354 | m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, invert, 0)); |
1355 | m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; | |
1356 | m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); | |
1357 | m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; | |
ba379fdc | 1358 | |
f9bf01c6 A |
1359 | m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); |
1360 | m_currentAlternativeIndex = beginTerm + 1; | |
ba379fdc A |
1361 | } |
1362 | ||
1363 | unsigned popParenthesesStack() | |
1364 | { | |
1365 | ASSERT(m_parenthesesStack.size()); | |
1366 | int stackEnd = m_parenthesesStack.size() - 1; | |
1367 | unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm; | |
f9bf01c6 | 1368 | m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex; |
ba379fdc A |
1369 | m_parenthesesStack.shrink(stackEnd); |
1370 | ||
f9bf01c6 A |
1371 | ASSERT(beginTerm < m_bodyDisjunction->terms.size()); |
1372 | ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size()); | |
1373 | ||
ba379fdc A |
1374 | return beginTerm; |
1375 | } | |
1376 | ||
1377 | #ifndef NDEBUG | |
1378 | void dumpDisjunction(ByteDisjunction* disjunction) | |
1379 | { | |
1380 | printf("ByteDisjunction(%p):\n\t", disjunction); | |
1381 | for (unsigned i = 0; i < disjunction->terms.size(); ++i) | |
1382 | printf("{ %d } ", disjunction->terms[i].type); | |
1383 | printf("\n"); | |
1384 | } | |
1385 | #endif | |
1386 | ||
1387 | void closeAlternative(int beginTerm) | |
1388 | { | |
1389 | int origBeginTerm = beginTerm; | |
f9bf01c6 A |
1390 | ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin); |
1391 | int endIndex = m_bodyDisjunction->terms.size(); | |
ba379fdc | 1392 | |
f9bf01c6 | 1393 | unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; |
ba379fdc | 1394 | |
f9bf01c6 A |
1395 | if (!m_bodyDisjunction->terms[beginTerm].alternative.next) |
1396 | m_bodyDisjunction->terms.remove(beginTerm); | |
ba379fdc | 1397 | else { |
f9bf01c6 A |
1398 | while (m_bodyDisjunction->terms[beginTerm].alternative.next) { |
1399 | beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; | |
1400 | ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction); | |
1401 | m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; | |
1402 | m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; | |
ba379fdc | 1403 | } |
ba379fdc | 1404 | |
f9bf01c6 A |
1405 | m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; |
1406 | ||
1407 | m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd()); | |
1408 | m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; | |
ba379fdc A |
1409 | } |
1410 | } | |
1411 | ||
1412 | void closeBodyAlternative() | |
1413 | { | |
1414 | int beginTerm = 0; | |
1415 | int origBeginTerm = 0; | |
f9bf01c6 A |
1416 | ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin); |
1417 | int endIndex = m_bodyDisjunction->terms.size(); | |
ba379fdc | 1418 | |
f9bf01c6 | 1419 | unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; |
ba379fdc | 1420 | |
f9bf01c6 A |
1421 | while (m_bodyDisjunction->terms[beginTerm].alternative.next) { |
1422 | beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; | |
1423 | ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction); | |
1424 | m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; | |
1425 | m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; | |
ba379fdc | 1426 | } |
ba379fdc | 1427 | |
f9bf01c6 A |
1428 | m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; |
1429 | ||
1430 | m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd()); | |
1431 | m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; | |
ba379fdc A |
1432 | } |
1433 | ||
1434 | void atomParenthesesEnd(bool doInline, unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0) | |
1435 | { | |
1436 | unsigned beginTerm = popParenthesesStack(); | |
1437 | closeAlternative(beginTerm + 1); | |
f9bf01c6 | 1438 | unsigned endTerm = m_bodyDisjunction->terms.size(); |
ba379fdc | 1439 | |
f9bf01c6 A |
1440 | bool isAssertion = m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin; |
1441 | bool invertOrCapture = m_bodyDisjunction->terms[beginTerm].invertOrCapture; | |
1442 | unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; | |
ba379fdc | 1443 | |
f9bf01c6 A |
1444 | m_bodyDisjunction->terms.append(ByteTerm(isAssertion ? ByteTerm::TypeParentheticalAssertionEnd : ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, invertOrCapture, inputPosition)); |
1445 | m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; | |
1446 | m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; | |
1447 | m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; | |
ba379fdc A |
1448 | |
1449 | if (doInline) { | |
f9bf01c6 A |
1450 | m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; |
1451 | m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; | |
1452 | m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; | |
1453 | m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; | |
ba379fdc | 1454 | } else { |
f9bf01c6 | 1455 | ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm]; |
ba379fdc A |
1456 | ASSERT(parenthesesBegin.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); |
1457 | ||
1458 | bool invertOrCapture = parenthesesBegin.invertOrCapture; | |
1459 | unsigned subpatternId = parenthesesBegin.atom.subpatternId; | |
1460 | ||
1461 | unsigned numSubpatterns = lastSubpatternId - subpatternId + 1; | |
1462 | ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize); | |
1463 | ||
1464 | parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin()); | |
1465 | for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses) | |
f9bf01c6 | 1466 | parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]); |
ba379fdc A |
1467 | parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd()); |
1468 | ||
f9bf01c6 | 1469 | m_bodyDisjunction->terms.shrink(beginTerm); |
ba379fdc A |
1470 | |
1471 | m_allParenthesesInfo.append(parenthesesDisjunction); | |
f9bf01c6 | 1472 | m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, invertOrCapture, inputPosition)); |
ba379fdc | 1473 | |
f9bf01c6 A |
1474 | m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; |
1475 | m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; | |
1476 | m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; | |
ba379fdc A |
1477 | } |
1478 | } | |
1479 | ||
1480 | void regexBegin(unsigned numSubpatterns, unsigned callFrameSize) | |
1481 | { | |
f9bf01c6 A |
1482 | m_bodyDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize); |
1483 | m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin()); | |
1484 | m_bodyDisjunction->terms[0].frameLocation = 0; | |
1485 | m_currentAlternativeIndex = 0; | |
ba379fdc A |
1486 | } |
1487 | ||
1488 | void regexEnd() | |
1489 | { | |
1490 | closeBodyAlternative(); | |
1491 | } | |
1492 | ||
f9bf01c6 | 1493 | void alternativeBodyDisjunction() |
ba379fdc | 1494 | { |
f9bf01c6 A |
1495 | int newAlternativeIndex = m_bodyDisjunction->terms.size(); |
1496 | m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; | |
1497 | m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction()); | |
ba379fdc | 1498 | |
f9bf01c6 | 1499 | m_currentAlternativeIndex = newAlternativeIndex; |
ba379fdc A |
1500 | } |
1501 | ||
f9bf01c6 | 1502 | void alternativeDisjunction() |
ba379fdc | 1503 | { |
f9bf01c6 A |
1504 | int newAlternativeIndex = m_bodyDisjunction->terms.size(); |
1505 | m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; | |
1506 | m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction()); | |
ba379fdc | 1507 | |
f9bf01c6 | 1508 | m_currentAlternativeIndex = newAlternativeIndex; |
ba379fdc A |
1509 | } |
1510 | ||
1511 | void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0) | |
1512 | { | |
1513 | for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { | |
1514 | unsigned currentCountAlreadyChecked = inputCountAlreadyChecked; | |
f9bf01c6 | 1515 | |
ba379fdc A |
1516 | if (alt) { |
1517 | if (disjunction == m_pattern.m_body) | |
f9bf01c6 | 1518 | alternativeBodyDisjunction(); |
ba379fdc | 1519 | else |
f9bf01c6 | 1520 | alternativeDisjunction(); |
ba379fdc A |
1521 | } |
1522 | ||
1523 | PatternAlternative* alternative = disjunction->m_alternatives[alt]; | |
1524 | unsigned minimumSize = alternative->m_minimumSize; | |
1525 | ||
1526 | ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked); | |
1527 | unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked; | |
1528 | if (countToCheck) | |
1529 | checkInput(countToCheck); | |
1530 | currentCountAlreadyChecked += countToCheck; | |
1531 | ||
1532 | for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { | |
1533 | PatternTerm& term = alternative->m_terms[i]; | |
1534 | ||
1535 | switch (term.type) { | |
1536 | case PatternTerm::TypeAssertionBOL: | |
1537 | assertionBOL(term.inputPosition - currentCountAlreadyChecked); | |
1538 | break; | |
1539 | ||
1540 | case PatternTerm::TypeAssertionEOL: | |
1541 | assertionEOL(term.inputPosition - currentCountAlreadyChecked); | |
1542 | break; | |
1543 | ||
1544 | case PatternTerm::TypeAssertionWordBoundary: | |
1545 | assertionWordBoundary(term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked); | |
1546 | break; | |
1547 | ||
1548 | case PatternTerm::TypePatternCharacter: | |
1549 | atomPatternCharacter(term.patternCharacter, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); | |
1550 | break; | |
1551 | ||
1552 | case PatternTerm::TypeCharacterClass: | |
1553 | atomCharacterClass(term.characterClass, term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); | |
1554 | break; | |
1555 | ||
1556 | case PatternTerm::TypeBackReference: | |
1557 | atomBackReference(term.subpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); | |
1558 | break; | |
1559 | ||
1560 | case PatternTerm::TypeForwardReference: | |
1561 | break; | |
1562 | ||
1563 | case PatternTerm::TypeParenthesesSubpattern: { | |
1564 | unsigned disjunctionAlreadyCheckedCount = 0; | |
1565 | if ((term.quantityCount == 1) && !term.parentheses.isCopy) { | |
1566 | if (term.quantityType == QuantifierFixedCount) { | |
1567 | disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize; | |
1568 | unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; | |
1569 | atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation); | |
1570 | emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, term.parentheses.disjunction->m_minimumSize); | |
1571 | atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); | |
1572 | } else { | |
1573 | unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; | |
1574 | atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + RegexStackSpaceForBackTrackInfoParenthesesOnce); | |
1575 | emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); | |
1576 | atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); | |
1577 | } | |
1578 | } else { | |
1579 | unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; | |
1580 | atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0); | |
1581 | emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); | |
1582 | atomParenthesesEnd(false, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); | |
1583 | } | |
1584 | break; | |
1585 | } | |
1586 | ||
1587 | case PatternTerm::TypeParentheticalAssertion: { | |
1588 | unsigned alternativeFrameLocation = term.inputPosition + RegexStackSpaceForBackTrackInfoParentheticalAssertion; | |
f9bf01c6 | 1589 | |
ba379fdc A |
1590 | atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invertOrCapture, term.frameLocation, alternativeFrameLocation); |
1591 | emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); | |
1592 | atomParenthesesEnd(true, term.parentheses.lastSubpatternId, 0, term.frameLocation, term.quantityCount, term.quantityType); | |
1593 | break; | |
1594 | } | |
1595 | } | |
1596 | } | |
1597 | } | |
1598 | } | |
1599 | ||
1600 | private: | |
1601 | RegexPattern& m_pattern; | |
f9bf01c6 A |
1602 | ByteDisjunction* m_bodyDisjunction; |
1603 | unsigned m_currentAlternativeIndex; | |
ba379fdc A |
1604 | Vector<ParenthesesStackEntry> m_parenthesesStack; |
1605 | Vector<ByteDisjunction*> m_allParenthesesInfo; | |
1606 | }; | |
1607 | ||
1608 | ||
1609 | BytecodePattern* byteCompileRegex(const UString& patternString, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline) | |
1610 | { | |
1611 | RegexPattern pattern(ignoreCase, multiline); | |
1612 | ||
1613 | if ((error = compileRegex(patternString, pattern))) | |
1614 | return 0; | |
1615 | ||
1616 | numSubpatterns = pattern.m_numSubpatterns; | |
1617 | ||
1618 | return ByteCompiler(pattern).compile(); | |
1619 | } | |
1620 | ||
1621 | int interpretRegex(BytecodePattern* regex, const UChar* input, unsigned start, unsigned length, int* output) | |
1622 | { | |
1623 | return Interpreter(regex, output, input, start, length).interpret(); | |
1624 | } | |
1625 | ||
1626 | ||
1627 | COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (RegexStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoPatternCharacter); | |
1628 | COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (RegexStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoCharacterClass); | |
1629 | COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (RegexStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoBackReference); | |
1630 | COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (RegexStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoAlternative); | |
1631 | COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (RegexStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheticalAssertion); | |
1632 | COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (RegexStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParenthesesOnce); | |
1633 | COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (RegexStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheses); | |
1634 | ||
1635 | ||
1636 | } } | |
1637 | ||
1638 | #endif |