]> git.saurik.com Git - apple/javascriptcore.git/blob - yarr/RegexInterpreter.cpp
JavaScriptCore-721.26.tar.gz
[apple/javascriptcore.git] / yarr / RegexInterpreter.cpp
1 /*
2 * Copyright (C) 2009 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26 #include "config.h"
27 #include "RegexInterpreter.h"
28
29 #include "RegexCompiler.h"
30 #include "RegexPattern.h"
31
32 #ifndef NDEBUG
33 #include <stdio.h>
34 #endif
35
36 #if ENABLE(YARR)
37
38 using namespace WTF;
39
40 namespace JSC { namespace Yarr {
41
42 class Interpreter {
43 public:
44 struct ParenthesesDisjunctionContext;
45
46 struct BackTrackInfoPatternCharacter {
47 uintptr_t matchAmount;
48 };
49 struct BackTrackInfoCharacterClass {
50 uintptr_t matchAmount;
51 };
52 struct BackTrackInfoBackReference {
53 uintptr_t begin; // Not really needed for greedy quantifiers.
54 uintptr_t matchAmount; // Not really needed for fixed quantifiers.
55 };
56 struct BackTrackInfoAlternative {
57 uintptr_t offset;
58 };
59 struct BackTrackInfoParentheticalAssertion {
60 uintptr_t begin;
61 };
62 struct BackTrackInfoParenthesesOnce {
63 uintptr_t inParentheses;
64 };
65 struct BackTrackInfoParentheses {
66 uintptr_t matchAmount;
67 ParenthesesDisjunctionContext* lastContext;
68 uintptr_t prevBegin;
69 uintptr_t prevEnd;
70 };
71
72 static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context)
73 {
74 context->next = backTrack->lastContext;
75 backTrack->lastContext = context;
76 ++backTrack->matchAmount;
77 }
78
79 static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack)
80 {
81 ASSERT(backTrack->matchAmount);
82 ASSERT(backTrack->lastContext);
83 backTrack->lastContext = backTrack->lastContext->next;
84 --backTrack->matchAmount;
85 }
86
87 struct DisjunctionContext
88 {
89 DisjunctionContext()
90 : term(0)
91 {
92 }
93
94 void* operator new(size_t, void* where)
95 {
96 return where;
97 }
98
99 int term;
100 unsigned matchBegin;
101 unsigned matchEnd;
102 uintptr_t frame[1];
103 };
104
105 DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction)
106 {
107 return new(malloc(sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) DisjunctionContext();
108 }
109
110 void freeDisjunctionContext(DisjunctionContext* context)
111 {
112 free(context);
113 }
114
115 struct ParenthesesDisjunctionContext
116 {
117 ParenthesesDisjunctionContext(int* output, ByteTerm& term)
118 : next(0)
119 {
120 unsigned firstSubpatternId = term.atom.subpatternId;
121 unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns;
122
123 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) {
124 subpatternBackup[i] = output[(firstSubpatternId << 1) + i];
125 output[(firstSubpatternId << 1) + i] = -1;
126 }
127
128 new(getDisjunctionContext(term)) DisjunctionContext();
129 }
130
131 void* operator new(size_t, void* where)
132 {
133 return where;
134 }
135
136 void restoreOutput(int* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns)
137 {
138 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i)
139 output[(firstSubpatternId << 1) + i] = subpatternBackup[i];
140 }
141
142 DisjunctionContext* getDisjunctionContext(ByteTerm& term)
143 {
144 return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1]));
145 }
146
147 ParenthesesDisjunctionContext* next;
148 int subpatternBackup[1];
149 };
150
151 ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term)
152 {
153 return new(malloc(sizeof(ParenthesesDisjunctionContext) + (((term.atom.parenthesesDisjunction->m_numSubpatterns << 1) - 1) * sizeof(int)) + sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) ParenthesesDisjunctionContext(output, term);
154 }
155
156 void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context)
157 {
158 free(context);
159 }
160
161 class InputStream {
162 public:
163 InputStream(const UChar* input, unsigned start, unsigned length)
164 : input(input)
165 , pos(start)
166 , length(length)
167 {
168 }
169
170 void next()
171 {
172 ++pos;
173 }
174
175 void rewind(unsigned amount)
176 {
177 ASSERT(pos >= amount);
178 pos -= amount;
179 }
180
181 int read()
182 {
183 ASSERT(pos < length);
184 if (pos < length)
185 return input[pos];
186 return -1;
187 }
188
189 int readChecked(int position)
190 {
191 ASSERT(position < 0);
192 ASSERT((unsigned)-position <= pos);
193 unsigned p = pos + position;
194 ASSERT(p < length);
195 return input[p];
196 }
197
198 int reread(unsigned from)
199 {
200 ASSERT(from < length);
201 return input[from];
202 }
203
204 int prev()
205 {
206 ASSERT(!(pos > length));
207 if (pos && length)
208 return input[pos - 1];
209 return -1;
210 }
211
212 unsigned getPos()
213 {
214 return pos;
215 }
216
217 void setPos(unsigned p)
218 {
219 pos = p;
220 }
221
222 bool atStart()
223 {
224 return pos == 0;
225 }
226
227 bool atEnd()
228 {
229 return pos == length;
230 }
231
232 bool checkInput(int count)
233 {
234 if ((pos + count) <= length) {
235 pos += count;
236 return true;
237 } else
238 return false;
239 }
240
241 void uncheckInput(int count)
242 {
243 pos -= count;
244 }
245
246 bool atStart(int position)
247 {
248 return (pos + position) == 0;
249 }
250
251 bool atEnd(int position)
252 {
253 return (pos + position) == length;
254 }
255
256 private:
257 const UChar* input;
258 unsigned pos;
259 unsigned length;
260 };
261
262 bool testCharacterClass(CharacterClass* characterClass, int ch)
263 {
264 if (ch & 0xFF80) {
265 for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i)
266 if (ch == characterClass->m_matchesUnicode[i])
267 return true;
268 for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i)
269 if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end))
270 return true;
271 } else {
272 for (unsigned i = 0; i < characterClass->m_matches.size(); ++i)
273 if (ch == characterClass->m_matches[i])
274 return true;
275 for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i)
276 if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end))
277 return true;
278 }
279
280 return false;
281 }
282
283 bool checkCharacter(int testChar, int inputPosition)
284 {
285 return testChar == input.readChecked(inputPosition);
286 }
287
288 bool checkCasedCharacter(int loChar, int hiChar, int inputPosition)
289 {
290 int ch = input.readChecked(inputPosition);
291 return (loChar == ch) || (hiChar == ch);
292 }
293
294 bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition)
295 {
296 bool match = testCharacterClass(characterClass, input.readChecked(inputPosition));
297 return invert ? !match : match;
298 }
299
300 bool tryConsumeBackReference(int matchBegin, int matchEnd, int inputOffset)
301 {
302 int matchSize = matchEnd - matchBegin;
303
304 if (!input.checkInput(matchSize))
305 return false;
306
307 for (int i = 0; i < matchSize; ++i) {
308 if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) {
309 input.uncheckInput(matchSize);
310 return false;
311 }
312 }
313
314 return true;
315 }
316
317 bool matchAssertionBOL(ByteTerm& term)
318 {
319 return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition - 1)));
320 }
321
322 bool matchAssertionEOL(ByteTerm& term)
323 {
324 if (term.inputPosition)
325 return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition)));
326 else
327 return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read()));
328 }
329
330 bool matchAssertionWordBoundary(ByteTerm& term)
331 {
332 bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition - 1));
333 bool readIsWordchar;
334 if (term.inputPosition)
335 readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition));
336 else
337 readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read());
338
339 bool wordBoundary = prevIsWordchar != readIsWordchar;
340 return term.invert() ? !wordBoundary : wordBoundary;
341 }
342
343 bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context)
344 {
345 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
346
347 switch (term.atom.quantityType) {
348 case QuantifierFixedCount:
349 break;
350
351 case QuantifierGreedy:
352 if (backTrack->matchAmount) {
353 --backTrack->matchAmount;
354 input.uncheckInput(1);
355 return true;
356 }
357 break;
358
359 case QuantifierNonGreedy:
360 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
361 ++backTrack->matchAmount;
362 if (checkCharacter(term.atom.patternCharacter, term.inputPosition - 1))
363 return true;
364 }
365 input.uncheckInput(backTrack->matchAmount);
366 break;
367 }
368
369 return false;
370 }
371
372 bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context)
373 {
374 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
375
376 switch (term.atom.quantityType) {
377 case QuantifierFixedCount:
378 break;
379
380 case QuantifierGreedy:
381 if (backTrack->matchAmount) {
382 --backTrack->matchAmount;
383 input.uncheckInput(1);
384 return true;
385 }
386 break;
387
388 case QuantifierNonGreedy:
389 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
390 ++backTrack->matchAmount;
391 if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition - 1))
392 return true;
393 }
394 input.uncheckInput(backTrack->matchAmount);
395 break;
396 }
397
398 return false;
399 }
400
401 bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context)
402 {
403 ASSERT(term.type == ByteTerm::TypeCharacterClass);
404 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
405
406 switch (term.atom.quantityType) {
407 case QuantifierFixedCount: {
408 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
409 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + matchAmount))
410 return false;
411 }
412 return true;
413 }
414
415 case QuantifierGreedy: {
416 unsigned matchAmount = 0;
417 while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
418 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) {
419 input.uncheckInput(1);
420 break;
421 }
422 ++matchAmount;
423 }
424 backTrack->matchAmount = matchAmount;
425
426 return true;
427 }
428
429 case QuantifierNonGreedy:
430 backTrack->matchAmount = 0;
431 return true;
432 }
433
434 ASSERT_NOT_REACHED();
435 return false;
436 }
437
438 bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context)
439 {
440 ASSERT(term.type == ByteTerm::TypeCharacterClass);
441 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
442
443 switch (term.atom.quantityType) {
444 case QuantifierFixedCount:
445 break;
446
447 case QuantifierGreedy:
448 if (backTrack->matchAmount) {
449 --backTrack->matchAmount;
450 input.uncheckInput(1);
451 return true;
452 }
453 break;
454
455 case QuantifierNonGreedy:
456 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
457 ++backTrack->matchAmount;
458 if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1))
459 return true;
460 }
461 input.uncheckInput(backTrack->matchAmount);
462 break;
463 }
464
465 return false;
466 }
467
468 bool matchBackReference(ByteTerm& term, DisjunctionContext* context)
469 {
470 ASSERT(term.type == ByteTerm::TypeBackReference);
471 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
472
473 int matchBegin = output[(term.atom.subpatternId << 1)];
474 int matchEnd = output[(term.atom.subpatternId << 1) + 1];
475 ASSERT((matchBegin == -1) == (matchEnd == -1));
476 ASSERT(matchBegin <= matchEnd);
477
478 if (matchBegin == matchEnd)
479 return true;
480
481 switch (term.atom.quantityType) {
482 case QuantifierFixedCount: {
483 backTrack->begin = input.getPos();
484 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
485 if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
486 input.setPos(backTrack->begin);
487 return false;
488 }
489 }
490 return true;
491 }
492
493 case QuantifierGreedy: {
494 unsigned matchAmount = 0;
495 while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition))
496 ++matchAmount;
497 backTrack->matchAmount = matchAmount;
498 return true;
499 }
500
501 case QuantifierNonGreedy:
502 backTrack->begin = input.getPos();
503 backTrack->matchAmount = 0;
504 return true;
505 }
506
507 ASSERT_NOT_REACHED();
508 return false;
509 }
510
511 bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context)
512 {
513 ASSERT(term.type == ByteTerm::TypeBackReference);
514 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
515
516 int matchBegin = output[(term.atom.subpatternId << 1)];
517 int matchEnd = output[(term.atom.subpatternId << 1) + 1];
518 ASSERT((matchBegin == -1) == (matchEnd == -1));
519 ASSERT(matchBegin <= matchEnd);
520
521 if (matchBegin == matchEnd)
522 return false;
523
524 switch (term.atom.quantityType) {
525 case QuantifierFixedCount:
526 // for quantityCount == 1, could rewind.
527 input.setPos(backTrack->begin);
528 break;
529
530 case QuantifierGreedy:
531 if (backTrack->matchAmount) {
532 --backTrack->matchAmount;
533 input.rewind(matchEnd - matchBegin);
534 return true;
535 }
536 break;
537
538 case QuantifierNonGreedy:
539 if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
540 ++backTrack->matchAmount;
541 return true;
542 } else
543 input.setPos(backTrack->begin);
544 break;
545 }
546
547 return false;
548 }
549
550 void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context)
551 {
552 if (term.capture()) {
553 unsigned subpatternId = term.atom.subpatternId;
554 output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition;
555 output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition;
556 }
557 }
558 void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context)
559 {
560 unsigned firstSubpatternId = term.atom.subpatternId;
561 unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
562 context->restoreOutput(output, firstSubpatternId, count);
563 }
564 void resetAssertionMatches(ByteTerm& term)
565 {
566 unsigned firstSubpatternId = term.atom.subpatternId;
567 unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
568 for (unsigned i = 0; i < (count << 1); ++i)
569 output[(firstSubpatternId << 1) + i] = -1;
570 }
571 bool parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
572 {
573 while (backTrack->matchAmount) {
574 ParenthesesDisjunctionContext* context = backTrack->lastContext;
575
576 if (matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true))
577 return true;
578
579 resetMatches(term, context);
580 popParenthesesDisjunctionContext(backTrack);
581 freeParenthesesDisjunctionContext(context);
582 }
583
584 return false;
585 }
586
587 bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
588 {
589 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
590 ASSERT(term.atom.quantityCount == 1);
591
592 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
593
594 switch (term.atom.quantityType) {
595 case QuantifierGreedy: {
596 // set this speculatively; if we get to the parens end this will be true.
597 backTrack->inParentheses = 1;
598 break;
599 }
600 case QuantifierNonGreedy: {
601 backTrack->inParentheses = 0;
602 context->term += term.atom.parenthesesWidth;
603 return true;
604 }
605 case QuantifierFixedCount:
606 break;
607 }
608
609 if (term.capture()) {
610 unsigned subpatternId = term.atom.subpatternId;
611 output[(subpatternId << 1)] = input.getPos() + term.inputPosition;
612 }
613
614 return true;
615 }
616
617 bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext*)
618 {
619 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
620 ASSERT(term.atom.quantityCount == 1);
621
622 if (term.capture()) {
623 unsigned subpatternId = term.atom.subpatternId;
624 output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition;
625 }
626 return true;
627 }
628
629 bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
630 {
631 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
632 ASSERT(term.atom.quantityCount == 1);
633
634 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
635
636 if (term.capture()) {
637 unsigned subpatternId = term.atom.subpatternId;
638 output[(subpatternId << 1)] = -1;
639 output[(subpatternId << 1) + 1] = -1;
640 }
641
642 switch (term.atom.quantityType) {
643 case QuantifierGreedy:
644 // if we backtrack to this point, there is another chance - try matching nothing.
645 ASSERT(backTrack->inParentheses);
646 backTrack->inParentheses = 0;
647 context->term += term.atom.parenthesesWidth;
648 return true;
649 case QuantifierNonGreedy:
650 ASSERT(backTrack->inParentheses);
651 case QuantifierFixedCount:
652 break;
653 }
654
655 return false;
656 }
657
658 bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
659 {
660 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
661 ASSERT(term.atom.quantityCount == 1);
662
663 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
664
665 switch (term.atom.quantityType) {
666 case QuantifierGreedy:
667 if (!backTrack->inParentheses) {
668 context->term -= term.atom.parenthesesWidth;
669 return false;
670 }
671 case QuantifierNonGreedy:
672 if (!backTrack->inParentheses) {
673 // now try to match the parens; set this speculatively.
674 backTrack->inParentheses = 1;
675 if (term.capture()) {
676 unsigned subpatternId = term.atom.subpatternId;
677 output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition;
678 }
679 context->term -= term.atom.parenthesesWidth;
680 return true;
681 }
682 case QuantifierFixedCount:
683 break;
684 }
685
686 return false;
687 }
688
689 bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
690 {
691 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
692 ASSERT(term.atom.quantityCount == 1);
693
694 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
695
696 backTrack->begin = input.getPos();
697 return true;
698 }
699
700 bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
701 {
702 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
703 ASSERT(term.atom.quantityCount == 1);
704
705 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
706
707 input.setPos(backTrack->begin);
708
709 // We've reached the end of the parens; if they are inverted, this is failure.
710 if (term.invert()) {
711 context->term -= term.atom.parenthesesWidth;
712 return false;
713 }
714
715 return true;
716 }
717
718 bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
719 {
720 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
721 ASSERT(term.atom.quantityCount == 1);
722
723 // We've failed to match parens; if they are inverted, this is win!
724 if (term.invert()) {
725 context->term += term.atom.parenthesesWidth;
726 return true;
727 }
728
729 return false;
730 }
731
732 bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
733 {
734 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
735 ASSERT(term.atom.quantityCount == 1);
736
737 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
738
739 input.setPos(backTrack->begin);
740
741 context->term -= term.atom.parenthesesWidth;
742 return false;
743 }
744
745 bool matchParentheses(ByteTerm& term, DisjunctionContext* context)
746 {
747 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
748
749 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
750
751 unsigned subpatternId = term.atom.subpatternId;
752 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
753
754 backTrack->prevBegin = output[(subpatternId << 1)];
755 backTrack->prevEnd = output[(subpatternId << 1) + 1];
756
757 backTrack->matchAmount = 0;
758 backTrack->lastContext = 0;
759
760 switch (term.atom.quantityType) {
761 case QuantifierFixedCount: {
762 // While we haven't yet reached our fixed limit,
763 while (backTrack->matchAmount < term.atom.quantityCount) {
764 // Try to do a match, and it it succeeds, add it to the list.
765 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
766 if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
767 appendParenthesesDisjunctionContext(backTrack, context);
768 else {
769 // The match failed; try to find an alternate point to carry on from.
770 resetMatches(term, context);
771 freeParenthesesDisjunctionContext(context);
772 if (!parenthesesDoBacktrack(term, backTrack))
773 return false;
774 }
775 }
776
777 ASSERT(backTrack->matchAmount == term.atom.quantityCount);
778 ParenthesesDisjunctionContext* context = backTrack->lastContext;
779 recordParenthesesMatch(term, context);
780 return true;
781 }
782
783 case QuantifierGreedy: {
784 while (backTrack->matchAmount < term.atom.quantityCount) {
785 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
786 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
787 appendParenthesesDisjunctionContext(backTrack, context);
788 else {
789 resetMatches(term, context);
790 freeParenthesesDisjunctionContext(context);
791 break;
792 }
793 }
794
795 if (backTrack->matchAmount) {
796 ParenthesesDisjunctionContext* context = backTrack->lastContext;
797 recordParenthesesMatch(term, context);
798 }
799 return true;
800 }
801
802 case QuantifierNonGreedy:
803 return true;
804 }
805
806 ASSERT_NOT_REACHED();
807 return false;
808 }
809
810 // Rules for backtracking differ depending on whether this is greedy or non-greedy.
811 //
812 // Greedy matches never should try just adding more - you should already have done
813 // the 'more' cases. Always backtrack, at least a leetle bit. However cases where
814 // you backtrack an item off the list needs checking, since we'll never have matched
815 // the one less case. Tracking forwards, still add as much as possible.
816 //
817 // Non-greedy, we've already done the one less case, so don't match on popping.
818 // We haven't done the one more case, so always try to add that.
819 //
820 bool backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
821 {
822 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
823
824 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
825
826 if (term.capture()) {
827 unsigned subpatternId = term.atom.subpatternId;
828 output[(subpatternId << 1)] = backTrack->prevBegin;
829 output[(subpatternId << 1) + 1] = backTrack->prevEnd;
830 }
831
832 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
833
834 switch (term.atom.quantityType) {
835 case QuantifierFixedCount: {
836 ASSERT(backTrack->matchAmount == term.atom.quantityCount);
837
838 ParenthesesDisjunctionContext* context = 0;
839
840 if (!parenthesesDoBacktrack(term, backTrack))
841 return false;
842
843 // While we haven't yet reached our fixed limit,
844 while (backTrack->matchAmount < term.atom.quantityCount) {
845 // Try to do a match, and it it succeeds, add it to the list.
846 context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
847 if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
848 appendParenthesesDisjunctionContext(backTrack, context);
849 else {
850 // The match failed; try to find an alternate point to carry on from.
851 resetMatches(term, context);
852 freeParenthesesDisjunctionContext(context);
853 if (!parenthesesDoBacktrack(term, backTrack))
854 return false;
855 }
856 }
857
858 ASSERT(backTrack->matchAmount == term.atom.quantityCount);
859 context = backTrack->lastContext;
860 recordParenthesesMatch(term, context);
861 return true;
862 }
863
864 case QuantifierGreedy: {
865 if (!backTrack->matchAmount)
866 return false;
867
868 ParenthesesDisjunctionContext* context = backTrack->lastContext;
869 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) {
870 while (backTrack->matchAmount < term.atom.quantityCount) {
871 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
872 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
873 appendParenthesesDisjunctionContext(backTrack, context);
874 else {
875 resetMatches(term, context);
876 freeParenthesesDisjunctionContext(context);
877 break;
878 }
879 }
880 } else {
881 resetMatches(term, context);
882 popParenthesesDisjunctionContext(backTrack);
883 freeParenthesesDisjunctionContext(context);
884 }
885
886 if (backTrack->matchAmount) {
887 ParenthesesDisjunctionContext* context = backTrack->lastContext;
888 recordParenthesesMatch(term, context);
889 }
890 return true;
891 }
892
893 case QuantifierNonGreedy: {
894 // If we've not reached the limit, try to add one more match.
895 if (backTrack->matchAmount < term.atom.quantityCount) {
896 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
897 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) {
898 appendParenthesesDisjunctionContext(backTrack, context);
899 recordParenthesesMatch(term, context);
900 return true;
901 } else {
902 resetMatches(term, context);
903 freeParenthesesDisjunctionContext(context);
904 }
905 }
906
907 // Nope - okay backtrack looking for an alternative.
908 while (backTrack->matchAmount) {
909 ParenthesesDisjunctionContext* context = backTrack->lastContext;
910 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) {
911 // successful backtrack! we're back in the game!
912 if (backTrack->matchAmount) {
913 context = backTrack->lastContext;
914 recordParenthesesMatch(term, context);
915 }
916 return true;
917 }
918
919 // pop a match off the stack
920 resetMatches(term, context);
921 popParenthesesDisjunctionContext(backTrack);
922 freeParenthesesDisjunctionContext(context);
923 }
924
925 return false;
926 }
927 }
928
929 ASSERT_NOT_REACHED();
930 return false;
931 }
932
933 #define MATCH_NEXT() { ++context->term; goto matchAgain; }
934 #define BACKTRACK() { --context->term; goto backtrack; }
935 #define currentTerm() (disjunction->terms[context->term])
936 bool matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
937 {
938 if (btrack)
939 BACKTRACK();
940
941 context->matchBegin = input.getPos();
942 context->term = 0;
943
944 matchAgain:
945 ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
946
947 switch (currentTerm().type) {
948 case ByteTerm::TypeSubpatternBegin:
949 MATCH_NEXT();
950 case ByteTerm::TypeSubpatternEnd:
951 context->matchEnd = input.getPos();
952 return true;
953
954 case ByteTerm::TypeBodyAlternativeBegin:
955 MATCH_NEXT();
956 case ByteTerm::TypeBodyAlternativeDisjunction:
957 case ByteTerm::TypeBodyAlternativeEnd:
958 context->matchEnd = input.getPos();
959 return true;
960
961 case ByteTerm::TypeAlternativeBegin:
962 MATCH_NEXT();
963 case ByteTerm::TypeAlternativeDisjunction:
964 case ByteTerm::TypeAlternativeEnd: {
965 int offset = currentTerm().alternative.end;
966 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
967 backTrack->offset = offset;
968 context->term += offset;
969 MATCH_NEXT();
970 }
971
972 case ByteTerm::TypeAssertionBOL:
973 if (matchAssertionBOL(currentTerm()))
974 MATCH_NEXT();
975 BACKTRACK();
976 case ByteTerm::TypeAssertionEOL:
977 if (matchAssertionEOL(currentTerm()))
978 MATCH_NEXT();
979 BACKTRACK();
980 case ByteTerm::TypeAssertionWordBoundary:
981 if (matchAssertionWordBoundary(currentTerm()))
982 MATCH_NEXT();
983 BACKTRACK();
984
985 case ByteTerm::TypePatternCharacterOnce:
986 case ByteTerm::TypePatternCharacterFixed: {
987 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
988 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + matchAmount))
989 BACKTRACK();
990 }
991 MATCH_NEXT();
992 }
993 case ByteTerm::TypePatternCharacterGreedy: {
994 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
995 unsigned matchAmount = 0;
996 while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
997 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 1)) {
998 input.uncheckInput(1);
999 break;
1000 }
1001 ++matchAmount;
1002 }
1003 backTrack->matchAmount = matchAmount;
1004
1005 MATCH_NEXT();
1006 }
1007 case ByteTerm::TypePatternCharacterNonGreedy: {
1008 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1009 backTrack->matchAmount = 0;
1010 MATCH_NEXT();
1011 }
1012
1013 case ByteTerm::TypePatternCasedCharacterOnce:
1014 case ByteTerm::TypePatternCasedCharacterFixed: {
1015 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
1016 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + matchAmount))
1017 BACKTRACK();
1018 }
1019 MATCH_NEXT();
1020 }
1021 case ByteTerm::TypePatternCasedCharacterGreedy: {
1022 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1023 unsigned matchAmount = 0;
1024 while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
1025 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - 1)) {
1026 input.uncheckInput(1);
1027 break;
1028 }
1029 ++matchAmount;
1030 }
1031 backTrack->matchAmount = matchAmount;
1032
1033 MATCH_NEXT();
1034 }
1035 case ByteTerm::TypePatternCasedCharacterNonGreedy: {
1036 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1037 backTrack->matchAmount = 0;
1038 MATCH_NEXT();
1039 }
1040
1041 case ByteTerm::TypeCharacterClass:
1042 if (matchCharacterClass(currentTerm(), context))
1043 MATCH_NEXT();
1044 BACKTRACK();
1045 case ByteTerm::TypeBackReference:
1046 if (matchBackReference(currentTerm(), context))
1047 MATCH_NEXT();
1048 BACKTRACK();
1049 case ByteTerm::TypeParenthesesSubpattern:
1050 if (matchParentheses(currentTerm(), context))
1051 MATCH_NEXT();
1052 BACKTRACK();
1053 case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1054 if (matchParenthesesOnceBegin(currentTerm(), context))
1055 MATCH_NEXT();
1056 BACKTRACK();
1057 case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1058 if (matchParenthesesOnceEnd(currentTerm(), context))
1059 MATCH_NEXT();
1060 BACKTRACK();
1061 case ByteTerm::TypeParentheticalAssertionBegin:
1062 if (matchParentheticalAssertionBegin(currentTerm(), context))
1063 MATCH_NEXT();
1064 BACKTRACK();
1065 case ByteTerm::TypeParentheticalAssertionEnd:
1066 if (matchParentheticalAssertionEnd(currentTerm(), context))
1067 MATCH_NEXT();
1068 BACKTRACK();
1069
1070 case ByteTerm::TypeCheckInput:
1071 if (input.checkInput(currentTerm().checkInputCount))
1072 MATCH_NEXT();
1073 BACKTRACK();
1074 }
1075
1076 // We should never fall-through to here.
1077 ASSERT_NOT_REACHED();
1078
1079 backtrack:
1080 ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
1081
1082 switch (currentTerm().type) {
1083 case ByteTerm::TypeSubpatternBegin:
1084 return false;
1085 case ByteTerm::TypeSubpatternEnd:
1086 ASSERT_NOT_REACHED();
1087
1088 case ByteTerm::TypeBodyAlternativeBegin:
1089 case ByteTerm::TypeBodyAlternativeDisjunction: {
1090 int offset = currentTerm().alternative.next;
1091 context->term += offset;
1092 if (offset > 0)
1093 MATCH_NEXT();
1094
1095 if (input.atEnd())
1096 return false;
1097
1098 input.next();
1099 context->matchBegin = input.getPos();
1100 MATCH_NEXT();
1101 }
1102 case ByteTerm::TypeBodyAlternativeEnd:
1103 ASSERT_NOT_REACHED();
1104
1105 case ByteTerm::TypeAlternativeBegin:
1106 case ByteTerm::TypeAlternativeDisjunction: {
1107 int offset = currentTerm().alternative.next;
1108 context->term += offset;
1109 if (offset > 0)
1110 MATCH_NEXT();
1111 BACKTRACK();
1112 }
1113 case ByteTerm::TypeAlternativeEnd: {
1114 // We should never backtrack back into an alternative of the main body of the regex.
1115 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
1116 unsigned offset = backTrack->offset;
1117 context->term -= offset;
1118 BACKTRACK();
1119 }
1120
1121 case ByteTerm::TypeAssertionBOL:
1122 case ByteTerm::TypeAssertionEOL:
1123 case ByteTerm::TypeAssertionWordBoundary:
1124 BACKTRACK();
1125
1126 case ByteTerm::TypePatternCharacterOnce:
1127 case ByteTerm::TypePatternCharacterFixed:
1128 case ByteTerm::TypePatternCharacterGreedy:
1129 case ByteTerm::TypePatternCharacterNonGreedy:
1130 if (backtrackPatternCharacter(currentTerm(), context))
1131 MATCH_NEXT();
1132 BACKTRACK();
1133 case ByteTerm::TypePatternCasedCharacterOnce:
1134 case ByteTerm::TypePatternCasedCharacterFixed:
1135 case ByteTerm::TypePatternCasedCharacterGreedy:
1136 case ByteTerm::TypePatternCasedCharacterNonGreedy:
1137 if (backtrackPatternCasedCharacter(currentTerm(), context))
1138 MATCH_NEXT();
1139 BACKTRACK();
1140 case ByteTerm::TypeCharacterClass:
1141 if (backtrackCharacterClass(currentTerm(), context))
1142 MATCH_NEXT();
1143 BACKTRACK();
1144 case ByteTerm::TypeBackReference:
1145 if (backtrackBackReference(currentTerm(), context))
1146 MATCH_NEXT();
1147 BACKTRACK();
1148 case ByteTerm::TypeParenthesesSubpattern:
1149 if (backtrackParentheses(currentTerm(), context))
1150 MATCH_NEXT();
1151 BACKTRACK();
1152 case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1153 if (backtrackParenthesesOnceBegin(currentTerm(), context))
1154 MATCH_NEXT();
1155 BACKTRACK();
1156 case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1157 if (backtrackParenthesesOnceEnd(currentTerm(), context))
1158 MATCH_NEXT();
1159 BACKTRACK();
1160 case ByteTerm::TypeParentheticalAssertionBegin:
1161 if (backtrackParentheticalAssertionBegin(currentTerm(), context))
1162 MATCH_NEXT();
1163 BACKTRACK();
1164 case ByteTerm::TypeParentheticalAssertionEnd:
1165 if (backtrackParentheticalAssertionEnd(currentTerm(), context))
1166 MATCH_NEXT();
1167 BACKTRACK();
1168
1169 case ByteTerm::TypeCheckInput:
1170 input.uncheckInput(currentTerm().checkInputCount);
1171 BACKTRACK();
1172 }
1173
1174 ASSERT_NOT_REACHED();
1175 return false;
1176 }
1177
1178 bool matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
1179 {
1180 if (matchDisjunction(disjunction, context, btrack)) {
1181 while (context->matchBegin == context->matchEnd) {
1182 if (!matchDisjunction(disjunction, context, true))
1183 return false;
1184 }
1185 return true;
1186 }
1187
1188 return false;
1189 }
1190
1191 int interpret()
1192 {
1193 for (unsigned i = 0; i < ((pattern->m_body->m_numSubpatterns + 1) << 1); ++i)
1194 output[i] = -1;
1195
1196 DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
1197
1198 if (matchDisjunction(pattern->m_body.get(), context)) {
1199 output[0] = context->matchBegin;
1200 output[1] = context->matchEnd;
1201 }
1202
1203 freeDisjunctionContext(context);
1204
1205 return output[0];
1206 }
1207
1208 Interpreter(BytecodePattern* pattern, int* output, const UChar* inputChar, unsigned start, unsigned length)
1209 : pattern(pattern)
1210 , output(output)
1211 , input(inputChar, start, length)
1212 {
1213 }
1214
1215 private:
1216 BytecodePattern *pattern;
1217 int* output;
1218 InputStream input;
1219 };
1220
1221
1222
1223 class ByteCompiler {
1224 struct ParenthesesStackEntry {
1225 unsigned beginTerm;
1226 unsigned savedAlternativeIndex;
1227 ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/)
1228 : beginTerm(beginTerm)
1229 , savedAlternativeIndex(savedAlternativeIndex)
1230 {
1231 }
1232 };
1233
1234 public:
1235 ByteCompiler(RegexPattern& pattern)
1236 : m_pattern(pattern)
1237 {
1238 m_bodyDisjunction = 0;
1239 m_currentAlternativeIndex = 0;
1240 }
1241
1242 BytecodePattern* compile()
1243 {
1244 regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize);
1245 emitDisjunction(m_pattern.m_body);
1246 regexEnd();
1247
1248 return new BytecodePattern(m_bodyDisjunction, m_allParenthesesInfo, m_pattern);
1249 }
1250
1251 void checkInput(unsigned count)
1252 {
1253 m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count));
1254 }
1255
1256 void assertionBOL(int inputPosition)
1257 {
1258 m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition));
1259 }
1260
1261 void assertionEOL(int inputPosition)
1262 {
1263 m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition));
1264 }
1265
1266 void assertionWordBoundary(bool invert, int inputPosition)
1267 {
1268 m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition));
1269 }
1270
1271 void atomPatternCharacter(UChar ch, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1272 {
1273 if (m_pattern.m_ignoreCase) {
1274 UChar lo = Unicode::toLower(ch);
1275 UChar hi = Unicode::toUpper(ch);
1276
1277 if (lo != hi) {
1278 m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType));
1279 return;
1280 }
1281 }
1282
1283 m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType));
1284 }
1285
1286 void atomCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1287 {
1288 m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition));
1289
1290 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount;
1291 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1292 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1293 }
1294
1295 void atomBackReference(unsigned subpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1296 {
1297 ASSERT(subpatternId);
1298
1299 m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition));
1300
1301 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount;
1302 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1303 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1304 }
1305
1306 void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1307 {
1308 int beginTerm = m_bodyDisjunction->terms.size();
1309
1310 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, inputPosition));
1311 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1312 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1313 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1314
1315 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1316 m_currentAlternativeIndex = beginTerm + 1;
1317 }
1318
1319 void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation)
1320 {
1321 int beginTerm = m_bodyDisjunction->terms.size();
1322
1323 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, invert, 0));
1324 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1325 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1326 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1327
1328 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1329 m_currentAlternativeIndex = beginTerm + 1;
1330 }
1331
1332 unsigned popParenthesesStack()
1333 {
1334 ASSERT(m_parenthesesStack.size());
1335 int stackEnd = m_parenthesesStack.size() - 1;
1336 unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm;
1337 m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex;
1338 m_parenthesesStack.shrink(stackEnd);
1339
1340 ASSERT(beginTerm < m_bodyDisjunction->terms.size());
1341 ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size());
1342
1343 return beginTerm;
1344 }
1345
1346 #ifndef NDEBUG
1347 void dumpDisjunction(ByteDisjunction* disjunction)
1348 {
1349 printf("ByteDisjunction(%p):\n\t", disjunction);
1350 for (unsigned i = 0; i < disjunction->terms.size(); ++i)
1351 printf("{ %d } ", disjunction->terms[i].type);
1352 printf("\n");
1353 }
1354 #endif
1355
1356 void closeAlternative(int beginTerm)
1357 {
1358 int origBeginTerm = beginTerm;
1359 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin);
1360 int endIndex = m_bodyDisjunction->terms.size();
1361
1362 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1363
1364 if (!m_bodyDisjunction->terms[beginTerm].alternative.next)
1365 m_bodyDisjunction->terms.remove(beginTerm);
1366 else {
1367 while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1368 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1369 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction);
1370 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1371 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1372 }
1373
1374 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1375
1376 m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd());
1377 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1378 }
1379 }
1380
1381 void closeBodyAlternative()
1382 {
1383 int beginTerm = 0;
1384 int origBeginTerm = 0;
1385 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin);
1386 int endIndex = m_bodyDisjunction->terms.size();
1387
1388 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1389
1390 while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1391 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1392 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction);
1393 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1394 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1395 }
1396
1397 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1398
1399 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd());
1400 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1401 }
1402
1403 void atomParenthesesEnd(bool doInline, unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0)
1404 {
1405 unsigned beginTerm = popParenthesesStack();
1406 closeAlternative(beginTerm + 1);
1407 unsigned endTerm = m_bodyDisjunction->terms.size();
1408
1409 bool isAssertion = m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin;
1410 bool invertOrCapture = m_bodyDisjunction->terms[beginTerm].invertOrCapture;
1411 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1412
1413 m_bodyDisjunction->terms.append(ByteTerm(isAssertion ? ByteTerm::TypeParentheticalAssertionEnd : ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, invertOrCapture, inputPosition));
1414 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1415 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1416 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1417
1418 if (doInline) {
1419 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
1420 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1421 m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount;
1422 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1423 } else {
1424 ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm];
1425 ASSERT(parenthesesBegin.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
1426
1427 bool invertOrCapture = parenthesesBegin.invertOrCapture;
1428 unsigned subpatternId = parenthesesBegin.atom.subpatternId;
1429
1430 unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
1431 ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize);
1432
1433 parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin());
1434 for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses)
1435 parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]);
1436 parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd());
1437
1438 m_bodyDisjunction->terms.shrink(beginTerm);
1439
1440 m_allParenthesesInfo.append(parenthesesDisjunction);
1441 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, invertOrCapture, inputPosition));
1442
1443 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
1444 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1445 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1446 }
1447 }
1448
1449 void regexBegin(unsigned numSubpatterns, unsigned callFrameSize)
1450 {
1451 m_bodyDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize);
1452 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin());
1453 m_bodyDisjunction->terms[0].frameLocation = 0;
1454 m_currentAlternativeIndex = 0;
1455 }
1456
1457 void regexEnd()
1458 {
1459 closeBodyAlternative();
1460 }
1461
1462 void alternativeBodyDisjunction()
1463 {
1464 int newAlternativeIndex = m_bodyDisjunction->terms.size();
1465 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
1466 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction());
1467
1468 m_currentAlternativeIndex = newAlternativeIndex;
1469 }
1470
1471 void alternativeDisjunction()
1472 {
1473 int newAlternativeIndex = m_bodyDisjunction->terms.size();
1474 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
1475 m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction());
1476
1477 m_currentAlternativeIndex = newAlternativeIndex;
1478 }
1479
1480 void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0)
1481 {
1482 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
1483 unsigned currentCountAlreadyChecked = inputCountAlreadyChecked;
1484
1485 if (alt) {
1486 if (disjunction == m_pattern.m_body)
1487 alternativeBodyDisjunction();
1488 else
1489 alternativeDisjunction();
1490 }
1491
1492 PatternAlternative* alternative = disjunction->m_alternatives[alt];
1493 unsigned minimumSize = alternative->m_minimumSize;
1494
1495 ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked);
1496 unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked;
1497 if (countToCheck)
1498 checkInput(countToCheck);
1499 currentCountAlreadyChecked += countToCheck;
1500
1501 for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
1502 PatternTerm& term = alternative->m_terms[i];
1503
1504 switch (term.type) {
1505 case PatternTerm::TypeAssertionBOL:
1506 assertionBOL(term.inputPosition - currentCountAlreadyChecked);
1507 break;
1508
1509 case PatternTerm::TypeAssertionEOL:
1510 assertionEOL(term.inputPosition - currentCountAlreadyChecked);
1511 break;
1512
1513 case PatternTerm::TypeAssertionWordBoundary:
1514 assertionWordBoundary(term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked);
1515 break;
1516
1517 case PatternTerm::TypePatternCharacter:
1518 atomPatternCharacter(term.patternCharacter, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
1519 break;
1520
1521 case PatternTerm::TypeCharacterClass:
1522 atomCharacterClass(term.characterClass, term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
1523 break;
1524
1525 case PatternTerm::TypeBackReference:
1526 atomBackReference(term.subpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
1527 break;
1528
1529 case PatternTerm::TypeForwardReference:
1530 break;
1531
1532 case PatternTerm::TypeParenthesesSubpattern: {
1533 unsigned disjunctionAlreadyCheckedCount = 0;
1534 if ((term.quantityCount == 1) && !term.parentheses.isCopy) {
1535 if (term.quantityType == QuantifierFixedCount) {
1536 disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize;
1537 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1538 atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation);
1539 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, term.parentheses.disjunction->m_minimumSize);
1540 atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
1541 } else {
1542 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1543 atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + RegexStackSpaceForBackTrackInfoParenthesesOnce);
1544 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
1545 atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
1546 }
1547 } else {
1548 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1549 atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0);
1550 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
1551 atomParenthesesEnd(false, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
1552 }
1553 break;
1554 }
1555
1556 case PatternTerm::TypeParentheticalAssertion: {
1557 unsigned alternativeFrameLocation = term.frameLocation + RegexStackSpaceForBackTrackInfoParentheticalAssertion;
1558
1559 atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invertOrCapture, term.frameLocation, alternativeFrameLocation);
1560 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
1561 atomParenthesesEnd(true, term.parentheses.lastSubpatternId, 0, term.frameLocation, term.quantityCount, term.quantityType);
1562 break;
1563 }
1564 }
1565 }
1566 }
1567 }
1568
1569 private:
1570 RegexPattern& m_pattern;
1571 ByteDisjunction* m_bodyDisjunction;
1572 unsigned m_currentAlternativeIndex;
1573 Vector<ParenthesesStackEntry> m_parenthesesStack;
1574 Vector<ByteDisjunction*> m_allParenthesesInfo;
1575 };
1576
1577
1578 BytecodePattern* byteCompileRegex(const UString& patternString, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline)
1579 {
1580 RegexPattern pattern(ignoreCase, multiline);
1581
1582 if ((error = compileRegex(patternString, pattern)))
1583 return 0;
1584
1585 numSubpatterns = pattern.m_numSubpatterns;
1586
1587 return ByteCompiler(pattern).compile();
1588 }
1589
1590 int interpretRegex(BytecodePattern* regex, const UChar* input, unsigned start, unsigned length, int* output)
1591 {
1592 return Interpreter(regex, output, input, start, length).interpret();
1593 }
1594
1595
1596 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (RegexStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoPatternCharacter);
1597 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (RegexStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoCharacterClass);
1598 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (RegexStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoBackReference);
1599 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (RegexStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoAlternative);
1600 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (RegexStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheticalAssertion);
1601 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (RegexStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParenthesesOnce);
1602 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (RegexStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheses);
1603
1604
1605 } }
1606
1607 #endif