]> git.saurik.com Git - apple/javascriptcore.git/blob - yarr/RegexInterpreter.cpp
JavaScriptCore-576.tar.gz
[apple/javascriptcore.git] / yarr / RegexInterpreter.cpp
1 /*
2 * Copyright (C) 2009 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26 #include "config.h"
27 #include "RegexInterpreter.h"
28
29 #include "RegexCompiler.h"
30 #include "RegexPattern.h"
31
32 #ifndef NDEBUG
33 #include <stdio.h>
34 #endif
35
36 #if ENABLE(YARR)
37
38 using namespace WTF;
39
40 namespace JSC { namespace Yarr {
41
42 class Interpreter {
43 public:
44 struct ParenthesesDisjunctionContext;
45
46 struct BackTrackInfoPatternCharacter {
47 uintptr_t matchAmount;
48 };
49 struct BackTrackInfoCharacterClass {
50 uintptr_t matchAmount;
51 };
52 struct BackTrackInfoBackReference {
53 uintptr_t begin; // Not really needed for greedy quantifiers.
54 uintptr_t matchAmount; // Not really needed for fixed quantifiers.
55 };
56 struct BackTrackInfoAlternative {
57 uintptr_t offset;
58 };
59 struct BackTrackInfoParentheticalAssertion {
60 uintptr_t begin;
61 };
62 struct BackTrackInfoParenthesesOnce {
63 uintptr_t inParentheses;
64 };
65 struct BackTrackInfoParentheses {
66 uintptr_t matchAmount;
67 ParenthesesDisjunctionContext* lastContext;
68 uintptr_t prevBegin;
69 uintptr_t prevEnd;
70 };
71
72 static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context)
73 {
74 context->next = backTrack->lastContext;
75 backTrack->lastContext = context;
76 ++backTrack->matchAmount;
77 }
78
79 static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack)
80 {
81 ASSERT(backTrack->matchAmount);
82 ASSERT(backTrack->lastContext);
83 backTrack->lastContext = backTrack->lastContext->next;
84 --backTrack->matchAmount;
85 }
86
87 struct DisjunctionContext
88 {
89 DisjunctionContext()
90 : term(0)
91 {
92 }
93
94 void* operator new(size_t, void* where)
95 {
96 return where;
97 }
98
99 int term;
100 unsigned matchBegin;
101 unsigned matchEnd;
102 uintptr_t frame[1];
103 };
104
105 DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction)
106 {
107 return new(malloc(sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) DisjunctionContext();
108 }
109
110 void freeDisjunctionContext(DisjunctionContext* context)
111 {
112 free(context);
113 }
114
115 struct ParenthesesDisjunctionContext
116 {
117 ParenthesesDisjunctionContext(int* output, ByteTerm& term)
118 : next(0)
119 {
120 unsigned firstSubpatternId = term.atom.subpatternId;
121 unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns;
122
123 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) {
124 subpatternBackup[i] = output[(firstSubpatternId << 1) + i];
125 output[(firstSubpatternId << 1) + i] = -1;
126 }
127
128 new(getDisjunctionContext(term)) DisjunctionContext();
129 }
130
131 void* operator new(size_t, void* where)
132 {
133 return where;
134 }
135
136 void restoreOutput(int* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns)
137 {
138 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i)
139 output[(firstSubpatternId << 1) + i] = subpatternBackup[i];
140 }
141
142 DisjunctionContext* getDisjunctionContext(ByteTerm& term)
143 {
144 return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1]));
145 }
146
147 ParenthesesDisjunctionContext* next;
148 int subpatternBackup[1];
149 };
150
151 ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term)
152 {
153 return new(malloc(sizeof(ParenthesesDisjunctionContext) + (((term.atom.parenthesesDisjunction->m_numSubpatterns << 1) - 1) * sizeof(int)) + sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) ParenthesesDisjunctionContext(output, term);
154 }
155
156 void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context)
157 {
158 free(context);
159 }
160
161 class InputStream {
162 public:
163 InputStream(const UChar* input, unsigned start, unsigned length)
164 : input(input)
165 , pos(start)
166 , length(length)
167 {
168 }
169
170 void next()
171 {
172 ++pos;
173 }
174
175 void rewind(unsigned amount)
176 {
177 ASSERT(pos >= amount);
178 pos -= amount;
179 }
180
181 int read()
182 {
183 ASSERT(pos < length);
184 if (pos < length)
185 return input[pos];
186 return -1;
187 }
188
189 int readChecked(int position)
190 {
191 ASSERT(position < 0);
192 ASSERT((unsigned)-position <= pos);
193 unsigned p = pos + position;
194 ASSERT(p < length);
195 return input[p];
196 }
197
198 int reread(unsigned from)
199 {
200 ASSERT(from < length);
201 return input[from];
202 }
203
204 int prev()
205 {
206 ASSERT(!(pos > length));
207 if (pos && length)
208 return input[pos - 1];
209 return -1;
210 }
211
212 unsigned getPos()
213 {
214 return pos;
215 }
216
217 void setPos(unsigned p)
218 {
219 pos = p;
220 }
221
222 bool atStart()
223 {
224 return pos == 0;
225 }
226
227 bool atEnd()
228 {
229 return pos == length;
230 }
231
232 bool checkInput(int count)
233 {
234 if ((pos + count) <= length) {
235 pos += count;
236 return true;
237 } else
238 return false;
239 }
240
241 void uncheckInput(int count)
242 {
243 pos -= count;
244 }
245
246 bool atStart(int position)
247 {
248 return (pos + position) == 0;
249 }
250
251 bool atEnd(int position)
252 {
253 return (pos + position) == length;
254 }
255
256 private:
257 const UChar* input;
258 unsigned pos;
259 unsigned length;
260 };
261
262 bool testCharacterClass(CharacterClass* characterClass, int ch)
263 {
264 if (ch & 0xFF80) {
265 for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i)
266 if (ch == characterClass->m_matchesUnicode[i])
267 return true;
268 for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i)
269 if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end))
270 return true;
271 } else {
272 for (unsigned i = 0; i < characterClass->m_matches.size(); ++i)
273 if (ch == characterClass->m_matches[i])
274 return true;
275 for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i)
276 if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end))
277 return true;
278 }
279
280 return false;
281 }
282
283 bool tryConsumeCharacter(int testChar)
284 {
285 if (input.atEnd())
286 return false;
287
288 int ch = input.read();
289
290 if (pattern->m_ignoreCase ? ((Unicode::toLower(testChar) == ch) || (Unicode::toUpper(testChar) == ch)) : (testChar == ch)) {
291 input.next();
292 return true;
293 }
294 return false;
295 }
296
297 bool checkCharacter(int testChar, int inputPosition)
298 {
299 return testChar == input.readChecked(inputPosition);
300 }
301
302 bool checkCasedCharacter(int loChar, int hiChar, int inputPosition)
303 {
304 int ch = input.readChecked(inputPosition);
305 return (loChar == ch) || (hiChar == ch);
306 }
307
308 bool tryConsumeCharacterClass(CharacterClass* characterClass, bool invert)
309 {
310 if (input.atEnd())
311 return false;
312
313 bool match = testCharacterClass(characterClass, input.read());
314
315 if (invert)
316 match = !match;
317
318 if (match) {
319 input.next();
320 return true;
321 }
322 return false;
323 }
324
325 bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition)
326 {
327 bool match = testCharacterClass(characterClass, input.readChecked(inputPosition));
328 return invert ? !match : match;
329 }
330
331 bool tryConsumeBackReference(int matchBegin, int matchEnd, int inputOffset)
332 {
333 int matchSize = matchEnd - matchBegin;
334
335 if (!input.checkInput(matchSize))
336 return false;
337
338 for (int i = 0; i < matchSize; ++i) {
339 if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) {
340 input.uncheckInput(matchSize);
341 return false;
342 }
343 }
344
345 return true;
346 }
347
348 bool matchAssertionBOL(ByteTerm& term)
349 {
350 return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition - 1)));
351 }
352
353 bool matchAssertionEOL(ByteTerm& term)
354 {
355 if (term.inputPosition)
356 return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition)));
357 else
358 return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read()));
359 }
360
361 bool matchAssertionWordBoundary(ByteTerm& term)
362 {
363 bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition - 1));
364 bool readIsWordchar;
365 if (term.inputPosition)
366 readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition));
367 else
368 readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read());
369
370 bool wordBoundary = prevIsWordchar != readIsWordchar;
371 return term.invert() ? !wordBoundary : wordBoundary;
372 }
373
374 bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context)
375 {
376 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
377
378 switch (term.atom.quantityType) {
379 case QuantifierFixedCount:
380 break;
381
382 case QuantifierGreedy:
383 if (backTrack->matchAmount) {
384 --backTrack->matchAmount;
385 input.uncheckInput(1);
386 return true;
387 }
388 break;
389
390 case QuantifierNonGreedy:
391 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
392 ++backTrack->matchAmount;
393 if (checkCharacter(term.atom.patternCharacter, term.inputPosition - 1))
394 return true;
395 }
396 input.uncheckInput(backTrack->matchAmount);
397 break;
398 }
399
400 return false;
401 }
402
403 bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context)
404 {
405 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
406
407 switch (term.atom.quantityType) {
408 case QuantifierFixedCount:
409 break;
410
411 case QuantifierGreedy:
412 if (backTrack->matchAmount) {
413 --backTrack->matchAmount;
414 input.uncheckInput(1);
415 return true;
416 }
417 break;
418
419 case QuantifierNonGreedy:
420 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
421 ++backTrack->matchAmount;
422 if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition - 1))
423 return true;
424 }
425 input.uncheckInput(backTrack->matchAmount);
426 break;
427 }
428
429 return false;
430 }
431
432 bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context)
433 {
434 ASSERT(term.type == ByteTerm::TypeCharacterClass);
435 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
436
437 switch (term.atom.quantityType) {
438 case QuantifierFixedCount: {
439 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
440 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + matchAmount))
441 return false;
442 }
443 return true;
444 }
445
446 case QuantifierGreedy: {
447 unsigned matchAmount = 0;
448 while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
449 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) {
450 input.uncheckInput(1);
451 break;
452 }
453 ++matchAmount;
454 }
455 backTrack->matchAmount = matchAmount;
456
457 return true;
458 }
459
460 case QuantifierNonGreedy:
461 backTrack->matchAmount = 0;
462 return true;
463 }
464
465 ASSERT_NOT_REACHED();
466 return false;
467 }
468
469 bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context)
470 {
471 ASSERT(term.type == ByteTerm::TypeCharacterClass);
472 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
473
474 switch (term.atom.quantityType) {
475 case QuantifierFixedCount:
476 break;
477
478 case QuantifierGreedy:
479 if (backTrack->matchAmount) {
480 --backTrack->matchAmount;
481 input.uncheckInput(1);
482 return true;
483 }
484 break;
485
486 case QuantifierNonGreedy:
487 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
488 ++backTrack->matchAmount;
489 if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1))
490 return true;
491 }
492 input.uncheckInput(backTrack->matchAmount);
493 break;
494 }
495
496 return false;
497 }
498
499 bool matchBackReference(ByteTerm& term, DisjunctionContext* context)
500 {
501 ASSERT(term.type == ByteTerm::TypeBackReference);
502 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
503
504 int matchBegin = output[(term.atom.subpatternId << 1)];
505 int matchEnd = output[(term.atom.subpatternId << 1) + 1];
506 ASSERT((matchBegin == -1) == (matchEnd == -1));
507 ASSERT(matchBegin <= matchEnd);
508
509 if (matchBegin == matchEnd)
510 return true;
511
512 switch (term.atom.quantityType) {
513 case QuantifierFixedCount: {
514 backTrack->begin = input.getPos();
515 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
516 if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
517 input.setPos(backTrack->begin);
518 return false;
519 }
520 }
521 return true;
522 }
523
524 case QuantifierGreedy: {
525 unsigned matchAmount = 0;
526 while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition))
527 ++matchAmount;
528 backTrack->matchAmount = matchAmount;
529 return true;
530 }
531
532 case QuantifierNonGreedy:
533 backTrack->begin = input.getPos();
534 backTrack->matchAmount = 0;
535 return true;
536 }
537
538 ASSERT_NOT_REACHED();
539 return false;
540 }
541
542 bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context)
543 {
544 ASSERT(term.type == ByteTerm::TypeBackReference);
545 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
546
547 int matchBegin = output[(term.atom.subpatternId << 1)];
548 int matchEnd = output[(term.atom.subpatternId << 1) + 1];
549 ASSERT((matchBegin == -1) == (matchEnd == -1));
550 ASSERT(matchBegin <= matchEnd);
551
552 if (matchBegin == matchEnd)
553 return false;
554
555 switch (term.atom.quantityType) {
556 case QuantifierFixedCount:
557 // for quantityCount == 1, could rewind.
558 input.setPos(backTrack->begin);
559 break;
560
561 case QuantifierGreedy:
562 if (backTrack->matchAmount) {
563 --backTrack->matchAmount;
564 input.rewind(matchEnd - matchBegin);
565 return true;
566 }
567 break;
568
569 case QuantifierNonGreedy:
570 if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
571 ++backTrack->matchAmount;
572 return true;
573 } else
574 input.setPos(backTrack->begin);
575 break;
576 }
577
578 return false;
579 }
580
581 void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context)
582 {
583 if (term.capture()) {
584 unsigned subpatternId = term.atom.subpatternId;
585 output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition;
586 output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition;
587 }
588 }
589 void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context)
590 {
591 unsigned firstSubpatternId = term.atom.subpatternId;
592 unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
593 context->restoreOutput(output, firstSubpatternId, count);
594 }
595 void resetAssertionMatches(ByteTerm& term)
596 {
597 unsigned firstSubpatternId = term.atom.subpatternId;
598 unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
599 for (unsigned i = 0; i < (count << 1); ++i)
600 output[(firstSubpatternId << 1) + i] = -1;
601 }
602 bool parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
603 {
604 while (backTrack->matchAmount) {
605 ParenthesesDisjunctionContext* context = backTrack->lastContext;
606
607 if (matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true))
608 return true;
609
610 resetMatches(term, context);
611 popParenthesesDisjunctionContext(backTrack);
612 freeParenthesesDisjunctionContext(context);
613 }
614
615 return false;
616 }
617
618 bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
619 {
620 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
621 ASSERT(term.atom.quantityCount == 1);
622
623 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
624
625 switch (term.atom.quantityType) {
626 case QuantifierGreedy: {
627 // set this speculatively; if we get to the parens end this will be true.
628 backTrack->inParentheses = 1;
629 break;
630 }
631 case QuantifierNonGreedy: {
632 backTrack->inParentheses = 0;
633 context->term += term.atom.parenthesesWidth;
634 return true;
635 }
636 case QuantifierFixedCount:
637 break;
638 }
639
640 if (term.capture()) {
641 unsigned subpatternId = term.atom.subpatternId;
642 output[(subpatternId << 1)] = input.getPos() + term.inputPosition;
643 }
644
645 return true;
646 }
647
648 bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext*)
649 {
650 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
651 ASSERT(term.atom.quantityCount == 1);
652
653 if (term.capture()) {
654 unsigned subpatternId = term.atom.subpatternId;
655 output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition;
656 }
657 return true;
658 }
659
660 bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
661 {
662 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
663 ASSERT(term.atom.quantityCount == 1);
664
665 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
666
667 if (term.capture()) {
668 unsigned subpatternId = term.atom.subpatternId;
669 output[(subpatternId << 1)] = -1;
670 output[(subpatternId << 1) + 1] = -1;
671 }
672
673 switch (term.atom.quantityType) {
674 case QuantifierGreedy:
675 // if we backtrack to this point, there is another chance - try matching nothing.
676 ASSERT(backTrack->inParentheses);
677 backTrack->inParentheses = 0;
678 context->term += term.atom.parenthesesWidth;
679 return true;
680 case QuantifierNonGreedy:
681 ASSERT(backTrack->inParentheses);
682 case QuantifierFixedCount:
683 break;
684 }
685
686 return false;
687 }
688
689 bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
690 {
691 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
692 ASSERT(term.atom.quantityCount == 1);
693
694 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
695
696 switch (term.atom.quantityType) {
697 case QuantifierGreedy:
698 if (!backTrack->inParentheses) {
699 context->term -= term.atom.parenthesesWidth;
700 return false;
701 }
702 case QuantifierNonGreedy:
703 if (!backTrack->inParentheses) {
704 // now try to match the parens; set this speculatively.
705 backTrack->inParentheses = 1;
706 if (term.capture()) {
707 unsigned subpatternId = term.atom.subpatternId;
708 output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition;
709 }
710 context->term -= term.atom.parenthesesWidth;
711 return true;
712 }
713 case QuantifierFixedCount:
714 break;
715 }
716
717 return false;
718 }
719
720 bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
721 {
722 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
723 ASSERT(term.atom.quantityCount == 1);
724
725 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
726
727 backTrack->begin = input.getPos();
728 return true;
729 }
730
731 bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
732 {
733 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
734 ASSERT(term.atom.quantityCount == 1);
735
736 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
737
738 input.setPos(backTrack->begin);
739
740 // We've reached the end of the parens; if they are inverted, this is failure.
741 if (term.invert()) {
742 context->term -= term.atom.parenthesesWidth;
743 return false;
744 }
745
746 return true;
747 }
748
749 bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
750 {
751 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
752 ASSERT(term.atom.quantityCount == 1);
753
754 // We've failed to match parens; if they are inverted, this is win!
755 if (term.invert()) {
756 context->term += term.atom.parenthesesWidth;
757 return true;
758 }
759
760 return false;
761 }
762
763 bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
764 {
765 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
766 ASSERT(term.atom.quantityCount == 1);
767
768 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
769
770 input.setPos(backTrack->begin);
771
772 context->term -= term.atom.parenthesesWidth;
773 return false;
774 }
775
776 bool matchParentheses(ByteTerm& term, DisjunctionContext* context)
777 {
778 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
779
780 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
781
782 unsigned subpatternId = term.atom.subpatternId;
783 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
784
785 backTrack->prevBegin = output[(subpatternId << 1)];
786 backTrack->prevEnd = output[(subpatternId << 1) + 1];
787
788 backTrack->matchAmount = 0;
789 backTrack->lastContext = 0;
790
791 switch (term.atom.quantityType) {
792 case QuantifierFixedCount: {
793 // While we haven't yet reached our fixed limit,
794 while (backTrack->matchAmount < term.atom.quantityCount) {
795 // Try to do a match, and it it succeeds, add it to the list.
796 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
797 if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
798 appendParenthesesDisjunctionContext(backTrack, context);
799 else {
800 // The match failed; try to find an alternate point to carry on from.
801 resetMatches(term, context);
802 freeParenthesesDisjunctionContext(context);
803 if (!parenthesesDoBacktrack(term, backTrack))
804 return false;
805 }
806 }
807
808 ASSERT(backTrack->matchAmount == term.atom.quantityCount);
809 ParenthesesDisjunctionContext* context = backTrack->lastContext;
810 recordParenthesesMatch(term, context);
811 return true;
812 }
813
814 case QuantifierGreedy: {
815 while (backTrack->matchAmount < term.atom.quantityCount) {
816 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
817 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
818 appendParenthesesDisjunctionContext(backTrack, context);
819 else {
820 resetMatches(term, context);
821 freeParenthesesDisjunctionContext(context);
822 break;
823 }
824 }
825
826 if (backTrack->matchAmount) {
827 ParenthesesDisjunctionContext* context = backTrack->lastContext;
828 recordParenthesesMatch(term, context);
829 }
830 return true;
831 }
832
833 case QuantifierNonGreedy:
834 return true;
835 }
836
837 ASSERT_NOT_REACHED();
838 return false;
839 }
840
841 // Rules for backtracking differ depending on whether this is greedy or non-greedy.
842 //
843 // Greedy matches never should try just adding more - you should already have done
844 // the 'more' cases. Always backtrack, at least a leetle bit. However cases where
845 // you backtrack an item off the list needs checking, since we'll never have matched
846 // the one less case. Tracking forwards, still add as much as possible.
847 //
848 // Non-greedy, we've already done the one less case, so don't match on popping.
849 // We haven't done the one more case, so always try to add that.
850 //
851 bool backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
852 {
853 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
854
855 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
856
857 if (term.capture()) {
858 unsigned subpatternId = term.atom.subpatternId;
859 output[(subpatternId << 1)] = backTrack->prevBegin;
860 output[(subpatternId << 1) + 1] = backTrack->prevEnd;
861 }
862
863 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
864
865 switch (term.atom.quantityType) {
866 case QuantifierFixedCount: {
867 ASSERT(backTrack->matchAmount == term.atom.quantityCount);
868
869 ParenthesesDisjunctionContext* context = 0;
870
871 if (!parenthesesDoBacktrack(term, backTrack))
872 return false;
873
874 // While we haven't yet reached our fixed limit,
875 while (backTrack->matchAmount < term.atom.quantityCount) {
876 // Try to do a match, and it it succeeds, add it to the list.
877 context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
878 if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
879 appendParenthesesDisjunctionContext(backTrack, context);
880 else {
881 // The match failed; try to find an alternate point to carry on from.
882 resetMatches(term, context);
883 freeParenthesesDisjunctionContext(context);
884 if (!parenthesesDoBacktrack(term, backTrack))
885 return false;
886 }
887 }
888
889 ASSERT(backTrack->matchAmount == term.atom.quantityCount);
890 context = backTrack->lastContext;
891 recordParenthesesMatch(term, context);
892 return true;
893 }
894
895 case QuantifierGreedy: {
896 if (!backTrack->matchAmount)
897 return false;
898
899 ParenthesesDisjunctionContext* context = backTrack->lastContext;
900 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) {
901 while (backTrack->matchAmount < term.atom.quantityCount) {
902 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
903 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
904 appendParenthesesDisjunctionContext(backTrack, context);
905 else {
906 resetMatches(term, context);
907 freeParenthesesDisjunctionContext(context);
908 break;
909 }
910 }
911 } else {
912 resetMatches(term, context);
913 popParenthesesDisjunctionContext(backTrack);
914 freeParenthesesDisjunctionContext(context);
915 }
916
917 if (backTrack->matchAmount) {
918 ParenthesesDisjunctionContext* context = backTrack->lastContext;
919 recordParenthesesMatch(term, context);
920 }
921 return true;
922 }
923
924 case QuantifierNonGreedy: {
925 // If we've not reached the limit, try to add one more match.
926 if (backTrack->matchAmount < term.atom.quantityCount) {
927 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
928 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) {
929 appendParenthesesDisjunctionContext(backTrack, context);
930 recordParenthesesMatch(term, context);
931 return true;
932 } else {
933 resetMatches(term, context);
934 freeParenthesesDisjunctionContext(context);
935 }
936 }
937
938 // Nope - okay backtrack looking for an alternative.
939 while (backTrack->matchAmount) {
940 ParenthesesDisjunctionContext* context = backTrack->lastContext;
941 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) {
942 // successful backtrack! we're back in the game!
943 if (backTrack->matchAmount) {
944 context = backTrack->lastContext;
945 recordParenthesesMatch(term, context);
946 }
947 return true;
948 }
949
950 // pop a match off the stack
951 resetMatches(term, context);
952 popParenthesesDisjunctionContext(backTrack);
953 freeParenthesesDisjunctionContext(context);
954 }
955
956 return false;
957 }
958 }
959
960 ASSERT_NOT_REACHED();
961 return false;
962 }
963
964 #define MATCH_NEXT() { ++context->term; goto matchAgain; }
965 #define BACKTRACK() { --context->term; goto backtrack; }
966 #define currentTerm() (disjunction->terms[context->term])
967 bool matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
968 {
969 if (btrack)
970 BACKTRACK();
971
972 context->matchBegin = input.getPos();
973 context->term = 0;
974
975 matchAgain:
976 ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
977
978 switch (currentTerm().type) {
979 case ByteTerm::TypeSubpatternBegin:
980 MATCH_NEXT();
981 case ByteTerm::TypeSubpatternEnd:
982 context->matchEnd = input.getPos();
983 return true;
984
985 case ByteTerm::TypeBodyAlternativeBegin:
986 MATCH_NEXT();
987 case ByteTerm::TypeBodyAlternativeDisjunction:
988 case ByteTerm::TypeBodyAlternativeEnd:
989 context->matchEnd = input.getPos();
990 return true;
991
992 case ByteTerm::TypeAlternativeBegin:
993 MATCH_NEXT();
994 case ByteTerm::TypeAlternativeDisjunction:
995 case ByteTerm::TypeAlternativeEnd: {
996 int offset = currentTerm().alternative.end;
997 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
998 backTrack->offset = offset;
999 context->term += offset;
1000 MATCH_NEXT();
1001 }
1002
1003 case ByteTerm::TypeAssertionBOL:
1004 if (matchAssertionBOL(currentTerm()))
1005 MATCH_NEXT();
1006 BACKTRACK();
1007 case ByteTerm::TypeAssertionEOL:
1008 if (matchAssertionEOL(currentTerm()))
1009 MATCH_NEXT();
1010 BACKTRACK();
1011 case ByteTerm::TypeAssertionWordBoundary:
1012 if (matchAssertionWordBoundary(currentTerm()))
1013 MATCH_NEXT();
1014 BACKTRACK();
1015
1016 case ByteTerm::TypePatternCharacterOnce:
1017 case ByteTerm::TypePatternCharacterFixed: {
1018 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
1019 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + matchAmount))
1020 BACKTRACK();
1021 }
1022 MATCH_NEXT();
1023 }
1024 case ByteTerm::TypePatternCharacterGreedy: {
1025 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1026 unsigned matchAmount = 0;
1027 while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
1028 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 1)) {
1029 input.uncheckInput(1);
1030 break;
1031 }
1032 ++matchAmount;
1033 }
1034 backTrack->matchAmount = matchAmount;
1035
1036 MATCH_NEXT();
1037 }
1038 case ByteTerm::TypePatternCharacterNonGreedy: {
1039 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1040 backTrack->matchAmount = 0;
1041 MATCH_NEXT();
1042 }
1043
1044 case ByteTerm::TypePatternCasedCharacterOnce:
1045 case ByteTerm::TypePatternCasedCharacterFixed: {
1046 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
1047 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + matchAmount))
1048 BACKTRACK();
1049 }
1050 MATCH_NEXT();
1051 }
1052 case ByteTerm::TypePatternCasedCharacterGreedy: {
1053 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1054 unsigned matchAmount = 0;
1055 while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
1056 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - 1)) {
1057 input.uncheckInput(1);
1058 break;
1059 }
1060 ++matchAmount;
1061 }
1062 backTrack->matchAmount = matchAmount;
1063
1064 MATCH_NEXT();
1065 }
1066 case ByteTerm::TypePatternCasedCharacterNonGreedy: {
1067 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1068 backTrack->matchAmount = 0;
1069 MATCH_NEXT();
1070 }
1071
1072 case ByteTerm::TypeCharacterClass:
1073 if (matchCharacterClass(currentTerm(), context))
1074 MATCH_NEXT();
1075 BACKTRACK();
1076 case ByteTerm::TypeBackReference:
1077 if (matchBackReference(currentTerm(), context))
1078 MATCH_NEXT();
1079 BACKTRACK();
1080 case ByteTerm::TypeParenthesesSubpattern:
1081 if (matchParentheses(currentTerm(), context))
1082 MATCH_NEXT();
1083 BACKTRACK();
1084 case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1085 if (matchParenthesesOnceBegin(currentTerm(), context))
1086 MATCH_NEXT();
1087 BACKTRACK();
1088 case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1089 if (matchParenthesesOnceEnd(currentTerm(), context))
1090 MATCH_NEXT();
1091 BACKTRACK();
1092 case ByteTerm::TypeParentheticalAssertionBegin:
1093 if (matchParentheticalAssertionBegin(currentTerm(), context))
1094 MATCH_NEXT();
1095 BACKTRACK();
1096 case ByteTerm::TypeParentheticalAssertionEnd:
1097 if (matchParentheticalAssertionEnd(currentTerm(), context))
1098 MATCH_NEXT();
1099 BACKTRACK();
1100
1101 case ByteTerm::TypeCheckInput:
1102 if (input.checkInput(currentTerm().checkInputCount))
1103 MATCH_NEXT();
1104 BACKTRACK();
1105 }
1106
1107 // We should never fall-through to here.
1108 ASSERT_NOT_REACHED();
1109
1110 backtrack:
1111 ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
1112
1113 switch (currentTerm().type) {
1114 case ByteTerm::TypeSubpatternBegin:
1115 return false;
1116 case ByteTerm::TypeSubpatternEnd:
1117 ASSERT_NOT_REACHED();
1118
1119 case ByteTerm::TypeBodyAlternativeBegin:
1120 case ByteTerm::TypeBodyAlternativeDisjunction: {
1121 int offset = currentTerm().alternative.next;
1122 context->term += offset;
1123 if (offset > 0)
1124 MATCH_NEXT();
1125
1126 if (input.atEnd())
1127 return false;
1128
1129 input.next();
1130 context->matchBegin = input.getPos();
1131 MATCH_NEXT();
1132 }
1133 case ByteTerm::TypeBodyAlternativeEnd:
1134 ASSERT_NOT_REACHED();
1135
1136 case ByteTerm::TypeAlternativeBegin:
1137 case ByteTerm::TypeAlternativeDisjunction: {
1138 int offset = currentTerm().alternative.next;
1139 context->term += offset;
1140 if (offset > 0)
1141 MATCH_NEXT();
1142 BACKTRACK();
1143 }
1144 case ByteTerm::TypeAlternativeEnd: {
1145 // We should never backtrack back into an alternative of the main body of the regex.
1146 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
1147 unsigned offset = backTrack->offset;
1148 context->term -= offset;
1149 BACKTRACK();
1150 }
1151
1152 case ByteTerm::TypeAssertionBOL:
1153 case ByteTerm::TypeAssertionEOL:
1154 case ByteTerm::TypeAssertionWordBoundary:
1155 BACKTRACK();
1156
1157 case ByteTerm::TypePatternCharacterOnce:
1158 case ByteTerm::TypePatternCharacterFixed:
1159 case ByteTerm::TypePatternCharacterGreedy:
1160 case ByteTerm::TypePatternCharacterNonGreedy:
1161 if (backtrackPatternCharacter(currentTerm(), context))
1162 MATCH_NEXT();
1163 BACKTRACK();
1164 case ByteTerm::TypePatternCasedCharacterOnce:
1165 case ByteTerm::TypePatternCasedCharacterFixed:
1166 case ByteTerm::TypePatternCasedCharacterGreedy:
1167 case ByteTerm::TypePatternCasedCharacterNonGreedy:
1168 if (backtrackPatternCasedCharacter(currentTerm(), context))
1169 MATCH_NEXT();
1170 BACKTRACK();
1171 case ByteTerm::TypeCharacterClass:
1172 if (backtrackCharacterClass(currentTerm(), context))
1173 MATCH_NEXT();
1174 BACKTRACK();
1175 case ByteTerm::TypeBackReference:
1176 if (backtrackBackReference(currentTerm(), context))
1177 MATCH_NEXT();
1178 BACKTRACK();
1179 case ByteTerm::TypeParenthesesSubpattern:
1180 if (backtrackParentheses(currentTerm(), context))
1181 MATCH_NEXT();
1182 BACKTRACK();
1183 case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1184 if (backtrackParenthesesOnceBegin(currentTerm(), context))
1185 MATCH_NEXT();
1186 BACKTRACK();
1187 case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1188 if (backtrackParenthesesOnceEnd(currentTerm(), context))
1189 MATCH_NEXT();
1190 BACKTRACK();
1191 case ByteTerm::TypeParentheticalAssertionBegin:
1192 if (backtrackParentheticalAssertionBegin(currentTerm(), context))
1193 MATCH_NEXT();
1194 BACKTRACK();
1195 case ByteTerm::TypeParentheticalAssertionEnd:
1196 if (backtrackParentheticalAssertionEnd(currentTerm(), context))
1197 MATCH_NEXT();
1198 BACKTRACK();
1199
1200 case ByteTerm::TypeCheckInput:
1201 input.uncheckInput(currentTerm().checkInputCount);
1202 BACKTRACK();
1203 }
1204
1205 ASSERT_NOT_REACHED();
1206 return false;
1207 }
1208
1209 bool matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
1210 {
1211 if (matchDisjunction(disjunction, context, btrack)) {
1212 while (context->matchBegin == context->matchEnd) {
1213 if (!matchDisjunction(disjunction, context, true))
1214 return false;
1215 }
1216 return true;
1217 }
1218
1219 return false;
1220 }
1221
1222 int interpret()
1223 {
1224 for (unsigned i = 0; i < ((pattern->m_body->m_numSubpatterns + 1) << 1); ++i)
1225 output[i] = -1;
1226
1227 DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
1228
1229 if (matchDisjunction(pattern->m_body.get(), context)) {
1230 output[0] = context->matchBegin;
1231 output[1] = context->matchEnd;
1232 }
1233
1234 freeDisjunctionContext(context);
1235
1236 return output[0];
1237 }
1238
1239 Interpreter(BytecodePattern* pattern, int* output, const UChar* inputChar, unsigned start, unsigned length)
1240 : pattern(pattern)
1241 , output(output)
1242 , input(inputChar, start, length)
1243 {
1244 }
1245
1246 private:
1247 BytecodePattern *pattern;
1248 int* output;
1249 InputStream input;
1250 };
1251
1252
1253
1254 class ByteCompiler {
1255 struct ParenthesesStackEntry {
1256 unsigned beginTerm;
1257 unsigned savedAlternativeIndex;
1258 ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/)
1259 : beginTerm(beginTerm)
1260 , savedAlternativeIndex(savedAlternativeIndex)
1261 {
1262 }
1263 };
1264
1265 public:
1266 ByteCompiler(RegexPattern& pattern)
1267 : m_pattern(pattern)
1268 {
1269 m_bodyDisjunction = 0;
1270 m_currentAlternativeIndex = 0;
1271 }
1272
1273 BytecodePattern* compile()
1274 {
1275 regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize);
1276 emitDisjunction(m_pattern.m_body);
1277 regexEnd();
1278
1279 return new BytecodePattern(m_bodyDisjunction, m_allParenthesesInfo, m_pattern);
1280 }
1281
1282 void checkInput(unsigned count)
1283 {
1284 m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count));
1285 }
1286
1287 void assertionBOL(int inputPosition)
1288 {
1289 m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition));
1290 }
1291
1292 void assertionEOL(int inputPosition)
1293 {
1294 m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition));
1295 }
1296
1297 void assertionWordBoundary(bool invert, int inputPosition)
1298 {
1299 m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition));
1300 }
1301
1302 void atomPatternCharacter(UChar ch, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1303 {
1304 if (m_pattern.m_ignoreCase) {
1305 UChar lo = Unicode::toLower(ch);
1306 UChar hi = Unicode::toUpper(ch);
1307
1308 if (lo != hi) {
1309 m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType));
1310 return;
1311 }
1312 }
1313
1314 m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType));
1315 }
1316
1317 void atomCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1318 {
1319 m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition));
1320
1321 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount;
1322 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1323 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1324 }
1325
1326 void atomBackReference(unsigned subpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1327 {
1328 ASSERT(subpatternId);
1329
1330 m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition));
1331
1332 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount;
1333 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1334 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1335 }
1336
1337 void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1338 {
1339 int beginTerm = m_bodyDisjunction->terms.size();
1340
1341 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, inputPosition));
1342 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1343 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1344 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1345
1346 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1347 m_currentAlternativeIndex = beginTerm + 1;
1348 }
1349
1350 void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation)
1351 {
1352 int beginTerm = m_bodyDisjunction->terms.size();
1353
1354 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, invert, 0));
1355 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1356 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1357 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1358
1359 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1360 m_currentAlternativeIndex = beginTerm + 1;
1361 }
1362
1363 unsigned popParenthesesStack()
1364 {
1365 ASSERT(m_parenthesesStack.size());
1366 int stackEnd = m_parenthesesStack.size() - 1;
1367 unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm;
1368 m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex;
1369 m_parenthesesStack.shrink(stackEnd);
1370
1371 ASSERT(beginTerm < m_bodyDisjunction->terms.size());
1372 ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size());
1373
1374 return beginTerm;
1375 }
1376
1377 #ifndef NDEBUG
1378 void dumpDisjunction(ByteDisjunction* disjunction)
1379 {
1380 printf("ByteDisjunction(%p):\n\t", disjunction);
1381 for (unsigned i = 0; i < disjunction->terms.size(); ++i)
1382 printf("{ %d } ", disjunction->terms[i].type);
1383 printf("\n");
1384 }
1385 #endif
1386
1387 void closeAlternative(int beginTerm)
1388 {
1389 int origBeginTerm = beginTerm;
1390 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin);
1391 int endIndex = m_bodyDisjunction->terms.size();
1392
1393 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1394
1395 if (!m_bodyDisjunction->terms[beginTerm].alternative.next)
1396 m_bodyDisjunction->terms.remove(beginTerm);
1397 else {
1398 while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1399 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1400 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction);
1401 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1402 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1403 }
1404
1405 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1406
1407 m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd());
1408 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1409 }
1410 }
1411
1412 void closeBodyAlternative()
1413 {
1414 int beginTerm = 0;
1415 int origBeginTerm = 0;
1416 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin);
1417 int endIndex = m_bodyDisjunction->terms.size();
1418
1419 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1420
1421 while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1422 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1423 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction);
1424 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1425 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1426 }
1427
1428 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1429
1430 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd());
1431 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1432 }
1433
1434 void atomParenthesesEnd(bool doInline, unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0)
1435 {
1436 unsigned beginTerm = popParenthesesStack();
1437 closeAlternative(beginTerm + 1);
1438 unsigned endTerm = m_bodyDisjunction->terms.size();
1439
1440 bool isAssertion = m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin;
1441 bool invertOrCapture = m_bodyDisjunction->terms[beginTerm].invertOrCapture;
1442 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1443
1444 m_bodyDisjunction->terms.append(ByteTerm(isAssertion ? ByteTerm::TypeParentheticalAssertionEnd : ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, invertOrCapture, inputPosition));
1445 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1446 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1447 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1448
1449 if (doInline) {
1450 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
1451 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1452 m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount;
1453 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1454 } else {
1455 ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm];
1456 ASSERT(parenthesesBegin.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
1457
1458 bool invertOrCapture = parenthesesBegin.invertOrCapture;
1459 unsigned subpatternId = parenthesesBegin.atom.subpatternId;
1460
1461 unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
1462 ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize);
1463
1464 parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin());
1465 for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses)
1466 parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]);
1467 parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd());
1468
1469 m_bodyDisjunction->terms.shrink(beginTerm);
1470
1471 m_allParenthesesInfo.append(parenthesesDisjunction);
1472 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, invertOrCapture, inputPosition));
1473
1474 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
1475 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1476 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1477 }
1478 }
1479
1480 void regexBegin(unsigned numSubpatterns, unsigned callFrameSize)
1481 {
1482 m_bodyDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize);
1483 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin());
1484 m_bodyDisjunction->terms[0].frameLocation = 0;
1485 m_currentAlternativeIndex = 0;
1486 }
1487
1488 void regexEnd()
1489 {
1490 closeBodyAlternative();
1491 }
1492
1493 void alternativeBodyDisjunction()
1494 {
1495 int newAlternativeIndex = m_bodyDisjunction->terms.size();
1496 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
1497 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction());
1498
1499 m_currentAlternativeIndex = newAlternativeIndex;
1500 }
1501
1502 void alternativeDisjunction()
1503 {
1504 int newAlternativeIndex = m_bodyDisjunction->terms.size();
1505 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
1506 m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction());
1507
1508 m_currentAlternativeIndex = newAlternativeIndex;
1509 }
1510
1511 void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0)
1512 {
1513 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
1514 unsigned currentCountAlreadyChecked = inputCountAlreadyChecked;
1515
1516 if (alt) {
1517 if (disjunction == m_pattern.m_body)
1518 alternativeBodyDisjunction();
1519 else
1520 alternativeDisjunction();
1521 }
1522
1523 PatternAlternative* alternative = disjunction->m_alternatives[alt];
1524 unsigned minimumSize = alternative->m_minimumSize;
1525
1526 ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked);
1527 unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked;
1528 if (countToCheck)
1529 checkInput(countToCheck);
1530 currentCountAlreadyChecked += countToCheck;
1531
1532 for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
1533 PatternTerm& term = alternative->m_terms[i];
1534
1535 switch (term.type) {
1536 case PatternTerm::TypeAssertionBOL:
1537 assertionBOL(term.inputPosition - currentCountAlreadyChecked);
1538 break;
1539
1540 case PatternTerm::TypeAssertionEOL:
1541 assertionEOL(term.inputPosition - currentCountAlreadyChecked);
1542 break;
1543
1544 case PatternTerm::TypeAssertionWordBoundary:
1545 assertionWordBoundary(term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked);
1546 break;
1547
1548 case PatternTerm::TypePatternCharacter:
1549 atomPatternCharacter(term.patternCharacter, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
1550 break;
1551
1552 case PatternTerm::TypeCharacterClass:
1553 atomCharacterClass(term.characterClass, term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
1554 break;
1555
1556 case PatternTerm::TypeBackReference:
1557 atomBackReference(term.subpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
1558 break;
1559
1560 case PatternTerm::TypeForwardReference:
1561 break;
1562
1563 case PatternTerm::TypeParenthesesSubpattern: {
1564 unsigned disjunctionAlreadyCheckedCount = 0;
1565 if ((term.quantityCount == 1) && !term.parentheses.isCopy) {
1566 if (term.quantityType == QuantifierFixedCount) {
1567 disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize;
1568 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1569 atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation);
1570 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, term.parentheses.disjunction->m_minimumSize);
1571 atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
1572 } else {
1573 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1574 atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + RegexStackSpaceForBackTrackInfoParenthesesOnce);
1575 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
1576 atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
1577 }
1578 } else {
1579 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1580 atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0);
1581 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
1582 atomParenthesesEnd(false, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
1583 }
1584 break;
1585 }
1586
1587 case PatternTerm::TypeParentheticalAssertion: {
1588 unsigned alternativeFrameLocation = term.inputPosition + RegexStackSpaceForBackTrackInfoParentheticalAssertion;
1589
1590 atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invertOrCapture, term.frameLocation, alternativeFrameLocation);
1591 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
1592 atomParenthesesEnd(true, term.parentheses.lastSubpatternId, 0, term.frameLocation, term.quantityCount, term.quantityType);
1593 break;
1594 }
1595 }
1596 }
1597 }
1598 }
1599
1600 private:
1601 RegexPattern& m_pattern;
1602 ByteDisjunction* m_bodyDisjunction;
1603 unsigned m_currentAlternativeIndex;
1604 Vector<ParenthesesStackEntry> m_parenthesesStack;
1605 Vector<ByteDisjunction*> m_allParenthesesInfo;
1606 };
1607
1608
1609 BytecodePattern* byteCompileRegex(const UString& patternString, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline)
1610 {
1611 RegexPattern pattern(ignoreCase, multiline);
1612
1613 if ((error = compileRegex(patternString, pattern)))
1614 return 0;
1615
1616 numSubpatterns = pattern.m_numSubpatterns;
1617
1618 return ByteCompiler(pattern).compile();
1619 }
1620
1621 int interpretRegex(BytecodePattern* regex, const UChar* input, unsigned start, unsigned length, int* output)
1622 {
1623 return Interpreter(regex, output, input, start, length).interpret();
1624 }
1625
1626
1627 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (RegexStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoPatternCharacter);
1628 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (RegexStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoCharacterClass);
1629 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (RegexStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoBackReference);
1630 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (RegexStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoAlternative);
1631 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (RegexStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheticalAssertion);
1632 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (RegexStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParenthesesOnce);
1633 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (RegexStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheses);
1634
1635
1636 } }
1637
1638 #endif