]> git.saurik.com Git - apple/javascriptcore.git/blob - yarr/YarrInterpreter.cpp
JavaScriptCore-1218.tar.gz
[apple/javascriptcore.git] / yarr / YarrInterpreter.cpp
1 /*
2 * Copyright (C) 2009 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27 #include "config.h"
28 #include "YarrInterpreter.h"
29
30 #include "Yarr.h"
31 #include "YarrCanonicalizeUCS2.h"
32 #include <wtf/BumpPointerAllocator.h>
33 #include <wtf/DataLog.h>
34 #include <wtf/text/CString.h>
35 #include <wtf/text/WTFString.h>
36
37 #ifndef NDEBUG
38 #include <stdio.h>
39 #endif
40
41 using namespace WTF;
42
43 namespace JSC { namespace Yarr {
44
45 template<typename CharType>
46 class Interpreter {
47 public:
48 struct ParenthesesDisjunctionContext;
49
50 struct BackTrackInfoPatternCharacter {
51 uintptr_t matchAmount;
52 };
53 struct BackTrackInfoCharacterClass {
54 uintptr_t matchAmount;
55 };
56 struct BackTrackInfoBackReference {
57 uintptr_t begin; // Not really needed for greedy quantifiers.
58 uintptr_t matchAmount; // Not really needed for fixed quantifiers.
59 };
60 struct BackTrackInfoAlternative {
61 uintptr_t offset;
62 };
63 struct BackTrackInfoParentheticalAssertion {
64 uintptr_t begin;
65 };
66 struct BackTrackInfoParenthesesOnce {
67 uintptr_t begin;
68 };
69 struct BackTrackInfoParenthesesTerminal {
70 uintptr_t begin;
71 };
72 struct BackTrackInfoParentheses {
73 uintptr_t matchAmount;
74 ParenthesesDisjunctionContext* lastContext;
75 };
76
77 static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context)
78 {
79 context->next = backTrack->lastContext;
80 backTrack->lastContext = context;
81 ++backTrack->matchAmount;
82 }
83
84 static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack)
85 {
86 RELEASE_ASSERT(backTrack->matchAmount);
87 RELEASE_ASSERT(backTrack->lastContext);
88 backTrack->lastContext = backTrack->lastContext->next;
89 --backTrack->matchAmount;
90 }
91
92 struct DisjunctionContext
93 {
94 DisjunctionContext()
95 : term(0)
96 {
97 }
98
99 void* operator new(size_t, void* where)
100 {
101 return where;
102 }
103
104 int term;
105 unsigned matchBegin;
106 unsigned matchEnd;
107 uintptr_t frame[1];
108 };
109
110 DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction)
111 {
112 size_t size = sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t);
113 allocatorPool = allocatorPool->ensureCapacity(size);
114 RELEASE_ASSERT(allocatorPool);
115 return new (allocatorPool->alloc(size)) DisjunctionContext();
116 }
117
118 void freeDisjunctionContext(DisjunctionContext* context)
119 {
120 allocatorPool = allocatorPool->dealloc(context);
121 }
122
123 struct ParenthesesDisjunctionContext
124 {
125 ParenthesesDisjunctionContext(unsigned* output, ByteTerm& term)
126 : next(0)
127 {
128 unsigned firstSubpatternId = term.atom.subpatternId;
129 unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns;
130
131 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) {
132 subpatternBackup[i] = output[(firstSubpatternId << 1) + i];
133 output[(firstSubpatternId << 1) + i] = offsetNoMatch;
134 }
135
136 new (getDisjunctionContext(term)) DisjunctionContext();
137 }
138
139 void* operator new(size_t, void* where)
140 {
141 return where;
142 }
143
144 void restoreOutput(unsigned* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns)
145 {
146 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i)
147 output[(firstSubpatternId << 1) + i] = subpatternBackup[i];
148 }
149
150 DisjunctionContext* getDisjunctionContext(ByteTerm& term)
151 {
152 return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1]));
153 }
154
155 ParenthesesDisjunctionContext* next;
156 unsigned subpatternBackup[1];
157 };
158
159 ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, unsigned* output, ByteTerm& term)
160 {
161 size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(unsigned) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(unsigned) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t);
162 allocatorPool = allocatorPool->ensureCapacity(size);
163 RELEASE_ASSERT(allocatorPool);
164 return new (allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term);
165 }
166
167 void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context)
168 {
169 allocatorPool = allocatorPool->dealloc(context);
170 }
171
172 class InputStream {
173 public:
174 InputStream(const CharType* input, unsigned start, unsigned length)
175 : input(input)
176 , pos(start)
177 , length(length)
178 {
179 }
180
181 void next()
182 {
183 ++pos;
184 }
185
186 void rewind(unsigned amount)
187 {
188 ASSERT(pos >= amount);
189 pos -= amount;
190 }
191
192 int read()
193 {
194 ASSERT(pos < length);
195 if (pos < length)
196 return input[pos];
197 return -1;
198 }
199
200 int readPair()
201 {
202 ASSERT(pos + 1 < length);
203 return input[pos] | input[pos + 1] << 16;
204 }
205
206 int readChecked(unsigned negativePositionOffest)
207 {
208 RELEASE_ASSERT(pos >= negativePositionOffest);
209 unsigned p = pos - negativePositionOffest;
210 ASSERT(p < length);
211 return input[p];
212 }
213
214 int reread(unsigned from)
215 {
216 ASSERT(from < length);
217 return input[from];
218 }
219
220 int prev()
221 {
222 ASSERT(!(pos > length));
223 if (pos && length)
224 return input[pos - 1];
225 return -1;
226 }
227
228 unsigned getPos()
229 {
230 return pos;
231 }
232
233 void setPos(unsigned p)
234 {
235 pos = p;
236 }
237
238 bool atStart()
239 {
240 return pos == 0;
241 }
242
243 bool atEnd()
244 {
245 return pos == length;
246 }
247
248 unsigned end()
249 {
250 return length;
251 }
252
253 bool checkInput(unsigned count)
254 {
255 if (((pos + count) <= length) && ((pos + count) >= pos)) {
256 pos += count;
257 return true;
258 }
259 return false;
260 }
261
262 void uncheckInput(unsigned count)
263 {
264 RELEASE_ASSERT(pos >= count);
265 pos -= count;
266 }
267
268 bool atStart(unsigned negativePositionOffest)
269 {
270 return pos == negativePositionOffest;
271 }
272
273 bool atEnd(unsigned negativePositionOffest)
274 {
275 RELEASE_ASSERT(pos >= negativePositionOffest);
276 return (pos - negativePositionOffest) == length;
277 }
278
279 bool isAvailableInput(unsigned offset)
280 {
281 return (((pos + offset) <= length) && ((pos + offset) >= pos));
282 }
283
284 private:
285 const CharType* input;
286 unsigned pos;
287 unsigned length;
288 };
289
290 bool testCharacterClass(CharacterClass* characterClass, int ch)
291 {
292 if (ch & 0xFF80) {
293 for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i)
294 if (ch == characterClass->m_matchesUnicode[i])
295 return true;
296 for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i)
297 if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end))
298 return true;
299 } else {
300 for (unsigned i = 0; i < characterClass->m_matches.size(); ++i)
301 if (ch == characterClass->m_matches[i])
302 return true;
303 for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i)
304 if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end))
305 return true;
306 }
307
308 return false;
309 }
310
311 bool checkCharacter(int testChar, unsigned negativeInputOffset)
312 {
313 return testChar == input.readChecked(negativeInputOffset);
314 }
315
316 bool checkCasedCharacter(int loChar, int hiChar, unsigned negativeInputOffset)
317 {
318 int ch = input.readChecked(negativeInputOffset);
319 return (loChar == ch) || (hiChar == ch);
320 }
321
322 bool checkCharacterClass(CharacterClass* characterClass, bool invert, unsigned negativeInputOffset)
323 {
324 bool match = testCharacterClass(characterClass, input.readChecked(negativeInputOffset));
325 return invert ? !match : match;
326 }
327
328 bool tryConsumeBackReference(int matchBegin, int matchEnd, unsigned negativeInputOffset)
329 {
330 unsigned matchSize = (unsigned)(matchEnd - matchBegin);
331
332 if (!input.checkInput(matchSize))
333 return false;
334
335 if (pattern->m_ignoreCase) {
336 for (unsigned i = 0; i < matchSize; ++i) {
337 int oldCh = input.reread(matchBegin + i);
338 int ch = input.readChecked(negativeInputOffset + matchSize - i);
339
340 if (oldCh == ch)
341 continue;
342
343 // The definition for canonicalize (see ES 5.1, 15.10.2.8) means that
344 // unicode values are never allowed to match against ascii ones.
345 if (isASCII(oldCh) || isASCII(ch)) {
346 if (toASCIIUpper(oldCh) == toASCIIUpper(ch))
347 continue;
348 } else if (areCanonicallyEquivalent(oldCh, ch))
349 continue;
350
351 input.uncheckInput(matchSize);
352 return false;
353 }
354 } else {
355 for (unsigned i = 0; i < matchSize; ++i) {
356 if (!checkCharacter(input.reread(matchBegin + i), negativeInputOffset + matchSize - i)) {
357 input.uncheckInput(matchSize);
358 return false;
359 }
360 }
361 }
362
363 return true;
364 }
365
366 bool matchAssertionBOL(ByteTerm& term)
367 {
368 return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition + 1)));
369 }
370
371 bool matchAssertionEOL(ByteTerm& term)
372 {
373 if (term.inputPosition)
374 return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition)));
375
376 return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read()));
377 }
378
379 bool matchAssertionWordBoundary(ByteTerm& term)
380 {
381 bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition + 1));
382 bool readIsWordchar;
383 if (term.inputPosition)
384 readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition));
385 else
386 readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read());
387
388 bool wordBoundary = prevIsWordchar != readIsWordchar;
389 return term.invert() ? !wordBoundary : wordBoundary;
390 }
391
392 bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context)
393 {
394 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
395
396 switch (term.atom.quantityType) {
397 case QuantifierFixedCount:
398 break;
399
400 case QuantifierGreedy:
401 if (backTrack->matchAmount) {
402 --backTrack->matchAmount;
403 input.uncheckInput(1);
404 return true;
405 }
406 break;
407
408 case QuantifierNonGreedy:
409 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
410 ++backTrack->matchAmount;
411 if (checkCharacter(term.atom.patternCharacter, term.inputPosition + 1))
412 return true;
413 }
414 input.uncheckInput(backTrack->matchAmount);
415 break;
416 }
417
418 return false;
419 }
420
421 bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context)
422 {
423 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
424
425 switch (term.atom.quantityType) {
426 case QuantifierFixedCount:
427 break;
428
429 case QuantifierGreedy:
430 if (backTrack->matchAmount) {
431 --backTrack->matchAmount;
432 input.uncheckInput(1);
433 return true;
434 }
435 break;
436
437 case QuantifierNonGreedy:
438 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
439 ++backTrack->matchAmount;
440 if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition + 1))
441 return true;
442 }
443 input.uncheckInput(backTrack->matchAmount);
444 break;
445 }
446
447 return false;
448 }
449
450 bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context)
451 {
452 ASSERT(term.type == ByteTerm::TypeCharacterClass);
453 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
454
455 switch (term.atom.quantityType) {
456 case QuantifierFixedCount: {
457 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
458 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - matchAmount))
459 return false;
460 }
461 return true;
462 }
463
464 case QuantifierGreedy: {
465 unsigned matchAmount = 0;
466 while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
467 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) {
468 input.uncheckInput(1);
469 break;
470 }
471 ++matchAmount;
472 }
473 backTrack->matchAmount = matchAmount;
474
475 return true;
476 }
477
478 case QuantifierNonGreedy:
479 backTrack->matchAmount = 0;
480 return true;
481 }
482
483 RELEASE_ASSERT_NOT_REACHED();
484 return false;
485 }
486
487 bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context)
488 {
489 ASSERT(term.type == ByteTerm::TypeCharacterClass);
490 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
491
492 switch (term.atom.quantityType) {
493 case QuantifierFixedCount:
494 break;
495
496 case QuantifierGreedy:
497 if (backTrack->matchAmount) {
498 --backTrack->matchAmount;
499 input.uncheckInput(1);
500 return true;
501 }
502 break;
503
504 case QuantifierNonGreedy:
505 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
506 ++backTrack->matchAmount;
507 if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1))
508 return true;
509 }
510 input.uncheckInput(backTrack->matchAmount);
511 break;
512 }
513
514 return false;
515 }
516
517 bool matchBackReference(ByteTerm& term, DisjunctionContext* context)
518 {
519 ASSERT(term.type == ByteTerm::TypeBackReference);
520 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
521
522 unsigned matchBegin = output[(term.atom.subpatternId << 1)];
523 unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1];
524
525 // If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that.
526 // In this case the result of match is empty string like when it references to a parentheses with zero-width match.
527 // Eg.: /(a\1)/
528 if (matchEnd == offsetNoMatch)
529 return true;
530
531 if (matchBegin == offsetNoMatch)
532 return true;
533
534 ASSERT(matchBegin <= matchEnd);
535
536 if (matchBegin == matchEnd)
537 return true;
538
539 switch (term.atom.quantityType) {
540 case QuantifierFixedCount: {
541 backTrack->begin = input.getPos();
542 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
543 if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
544 input.setPos(backTrack->begin);
545 return false;
546 }
547 }
548 return true;
549 }
550
551 case QuantifierGreedy: {
552 unsigned matchAmount = 0;
553 while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition))
554 ++matchAmount;
555 backTrack->matchAmount = matchAmount;
556 return true;
557 }
558
559 case QuantifierNonGreedy:
560 backTrack->begin = input.getPos();
561 backTrack->matchAmount = 0;
562 return true;
563 }
564
565 RELEASE_ASSERT_NOT_REACHED();
566 return false;
567 }
568
569 bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context)
570 {
571 ASSERT(term.type == ByteTerm::TypeBackReference);
572 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
573
574 unsigned matchBegin = output[(term.atom.subpatternId << 1)];
575 unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1];
576
577 if (matchBegin == offsetNoMatch)
578 return false;
579
580 ASSERT(matchBegin <= matchEnd);
581
582 if (matchBegin == matchEnd)
583 return false;
584
585 switch (term.atom.quantityType) {
586 case QuantifierFixedCount:
587 // for quantityCount == 1, could rewind.
588 input.setPos(backTrack->begin);
589 break;
590
591 case QuantifierGreedy:
592 if (backTrack->matchAmount) {
593 --backTrack->matchAmount;
594 input.rewind(matchEnd - matchBegin);
595 return true;
596 }
597 break;
598
599 case QuantifierNonGreedy:
600 if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
601 ++backTrack->matchAmount;
602 return true;
603 }
604 input.setPos(backTrack->begin);
605 break;
606 }
607
608 return false;
609 }
610
611 void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context)
612 {
613 if (term.capture()) {
614 unsigned subpatternId = term.atom.subpatternId;
615 output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition;
616 output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition;
617 }
618 }
619 void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context)
620 {
621 unsigned firstSubpatternId = term.atom.subpatternId;
622 unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
623 context->restoreOutput(output, firstSubpatternId, count);
624 }
625 JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
626 {
627 while (backTrack->matchAmount) {
628 ParenthesesDisjunctionContext* context = backTrack->lastContext;
629
630 JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true);
631 if (result == JSRegExpMatch)
632 return JSRegExpMatch;
633
634 resetMatches(term, context);
635 popParenthesesDisjunctionContext(backTrack);
636 freeParenthesesDisjunctionContext(context);
637
638 if (result != JSRegExpNoMatch)
639 return result;
640 }
641
642 return JSRegExpNoMatch;
643 }
644
645 bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
646 {
647 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
648 ASSERT(term.atom.quantityCount == 1);
649
650 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
651
652 switch (term.atom.quantityType) {
653 case QuantifierGreedy: {
654 // set this speculatively; if we get to the parens end this will be true.
655 backTrack->begin = input.getPos();
656 break;
657 }
658 case QuantifierNonGreedy: {
659 backTrack->begin = notFound;
660 context->term += term.atom.parenthesesWidth;
661 return true;
662 }
663 case QuantifierFixedCount:
664 break;
665 }
666
667 if (term.capture()) {
668 unsigned subpatternId = term.atom.subpatternId;
669 output[(subpatternId << 1)] = input.getPos() - term.inputPosition;
670 }
671
672 return true;
673 }
674
675 bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
676 {
677 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
678 ASSERT(term.atom.quantityCount == 1);
679
680 if (term.capture()) {
681 unsigned subpatternId = term.atom.subpatternId;
682 output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition;
683 }
684
685 if (term.atom.quantityType == QuantifierFixedCount)
686 return true;
687
688 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
689 return backTrack->begin != input.getPos();
690 }
691
692 bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
693 {
694 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
695 ASSERT(term.atom.quantityCount == 1);
696
697 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
698
699 if (term.capture()) {
700 unsigned subpatternId = term.atom.subpatternId;
701 output[(subpatternId << 1)] = offsetNoMatch;
702 output[(subpatternId << 1) + 1] = offsetNoMatch;
703 }
704
705 switch (term.atom.quantityType) {
706 case QuantifierGreedy:
707 // if we backtrack to this point, there is another chance - try matching nothing.
708 ASSERT(backTrack->begin != notFound);
709 backTrack->begin = notFound;
710 context->term += term.atom.parenthesesWidth;
711 return true;
712 case QuantifierNonGreedy:
713 ASSERT(backTrack->begin != notFound);
714 case QuantifierFixedCount:
715 break;
716 }
717
718 return false;
719 }
720
721 bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
722 {
723 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
724 ASSERT(term.atom.quantityCount == 1);
725
726 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
727
728 switch (term.atom.quantityType) {
729 case QuantifierGreedy:
730 if (backTrack->begin == notFound) {
731 context->term -= term.atom.parenthesesWidth;
732 return false;
733 }
734 case QuantifierNonGreedy:
735 if (backTrack->begin == notFound) {
736 backTrack->begin = input.getPos();
737 if (term.capture()) {
738 // Technically this access to inputPosition should be accessing the begin term's
739 // inputPosition, but for repeats other than fixed these values should be
740 // the same anyway! (We don't pre-check for greedy or non-greedy matches.)
741 ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
742 ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition);
743 unsigned subpatternId = term.atom.subpatternId;
744 output[subpatternId << 1] = input.getPos() + term.inputPosition;
745 }
746 context->term -= term.atom.parenthesesWidth;
747 return true;
748 }
749 case QuantifierFixedCount:
750 break;
751 }
752
753 return false;
754 }
755
756 bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
757 {
758 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
759 ASSERT(term.atom.quantityType == QuantifierGreedy);
760 ASSERT(term.atom.quantityCount == quantifyInfinite);
761 ASSERT(!term.capture());
762
763 BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
764 backTrack->begin = input.getPos();
765 return true;
766 }
767
768 bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context)
769 {
770 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalEnd);
771
772 BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
773 // Empty match is a failed match.
774 if (backTrack->begin == input.getPos())
775 return false;
776
777 // Successful match! Okay, what's next? - loop around and try to match moar!
778 context->term -= (term.atom.parenthesesWidth + 1);
779 return true;
780 }
781
782 bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
783 {
784 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
785 ASSERT(term.atom.quantityType == QuantifierGreedy);
786 ASSERT(term.atom.quantityCount == quantifyInfinite);
787 ASSERT(!term.capture());
788
789 // If we backtrack to this point, we have failed to match this iteration of the parens.
790 // Since this is greedy / zero minimum a failed is also accepted as a match!
791 context->term += term.atom.parenthesesWidth;
792 return true;
793 }
794
795 bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*)
796 {
797 // 'Terminal' parentheses are at the end of the regex, and as such a match past end
798 // should always be returned as a successful match - we should never backtrack to here.
799 RELEASE_ASSERT_NOT_REACHED();
800 return false;
801 }
802
803 bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
804 {
805 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
806 ASSERT(term.atom.quantityCount == 1);
807
808 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
809
810 backTrack->begin = input.getPos();
811 return true;
812 }
813
814 bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
815 {
816 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
817 ASSERT(term.atom.quantityCount == 1);
818
819 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
820
821 input.setPos(backTrack->begin);
822
823 // We've reached the end of the parens; if they are inverted, this is failure.
824 if (term.invert()) {
825 context->term -= term.atom.parenthesesWidth;
826 return false;
827 }
828
829 return true;
830 }
831
832 bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
833 {
834 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
835 ASSERT(term.atom.quantityCount == 1);
836
837 // We've failed to match parens; if they are inverted, this is win!
838 if (term.invert()) {
839 context->term += term.atom.parenthesesWidth;
840 return true;
841 }
842
843 return false;
844 }
845
846 bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
847 {
848 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
849 ASSERT(term.atom.quantityCount == 1);
850
851 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
852
853 input.setPos(backTrack->begin);
854
855 context->term -= term.atom.parenthesesWidth;
856 return false;
857 }
858
859 JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context)
860 {
861 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
862
863 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
864 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
865
866 backTrack->matchAmount = 0;
867 backTrack->lastContext = 0;
868
869 switch (term.atom.quantityType) {
870 case QuantifierFixedCount: {
871 // While we haven't yet reached our fixed limit,
872 while (backTrack->matchAmount < term.atom.quantityCount) {
873 // Try to do a match, and it it succeeds, add it to the list.
874 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
875 JSRegExpResult result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
876 if (result == JSRegExpMatch)
877 appendParenthesesDisjunctionContext(backTrack, context);
878 else {
879 // The match failed; try to find an alternate point to carry on from.
880 resetMatches(term, context);
881 freeParenthesesDisjunctionContext(context);
882
883 if (result != JSRegExpNoMatch)
884 return result;
885 JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
886 if (backtrackResult != JSRegExpMatch)
887 return backtrackResult;
888 }
889 }
890
891 ASSERT(backTrack->matchAmount == term.atom.quantityCount);
892 ParenthesesDisjunctionContext* context = backTrack->lastContext;
893 recordParenthesesMatch(term, context);
894 return JSRegExpMatch;
895 }
896
897 case QuantifierGreedy: {
898 while (backTrack->matchAmount < term.atom.quantityCount) {
899 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
900 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
901 if (result == JSRegExpMatch)
902 appendParenthesesDisjunctionContext(backTrack, context);
903 else {
904 resetMatches(term, context);
905 freeParenthesesDisjunctionContext(context);
906
907 if (result != JSRegExpNoMatch)
908 return result;
909
910 break;
911 }
912 }
913
914 if (backTrack->matchAmount) {
915 ParenthesesDisjunctionContext* context = backTrack->lastContext;
916 recordParenthesesMatch(term, context);
917 }
918 return JSRegExpMatch;
919 }
920
921 case QuantifierNonGreedy:
922 return JSRegExpMatch;
923 }
924
925 RELEASE_ASSERT_NOT_REACHED();
926 return JSRegExpErrorNoMatch;
927 }
928
929 // Rules for backtracking differ depending on whether this is greedy or non-greedy.
930 //
931 // Greedy matches never should try just adding more - you should already have done
932 // the 'more' cases. Always backtrack, at least a leetle bit. However cases where
933 // you backtrack an item off the list needs checking, since we'll never have matched
934 // the one less case. Tracking forwards, still add as much as possible.
935 //
936 // Non-greedy, we've already done the one less case, so don't match on popping.
937 // We haven't done the one more case, so always try to add that.
938 //
939 JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
940 {
941 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
942
943 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
944 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
945
946 switch (term.atom.quantityType) {
947 case QuantifierFixedCount: {
948 ASSERT(backTrack->matchAmount == term.atom.quantityCount);
949
950 ParenthesesDisjunctionContext* context = 0;
951 JSRegExpResult result = parenthesesDoBacktrack(term, backTrack);
952
953 if (result != JSRegExpMatch)
954 return result;
955
956 // While we haven't yet reached our fixed limit,
957 while (backTrack->matchAmount < term.atom.quantityCount) {
958 // Try to do a match, and it it succeeds, add it to the list.
959 context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
960 result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
961
962 if (result == JSRegExpMatch)
963 appendParenthesesDisjunctionContext(backTrack, context);
964 else {
965 // The match failed; try to find an alternate point to carry on from.
966 resetMatches(term, context);
967 freeParenthesesDisjunctionContext(context);
968
969 if (result != JSRegExpNoMatch)
970 return result;
971 JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
972 if (backtrackResult != JSRegExpMatch)
973 return backtrackResult;
974 }
975 }
976
977 ASSERT(backTrack->matchAmount == term.atom.quantityCount);
978 context = backTrack->lastContext;
979 recordParenthesesMatch(term, context);
980 return JSRegExpMatch;
981 }
982
983 case QuantifierGreedy: {
984 if (!backTrack->matchAmount)
985 return JSRegExpNoMatch;
986
987 ParenthesesDisjunctionContext* context = backTrack->lastContext;
988 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
989 if (result == JSRegExpMatch) {
990 while (backTrack->matchAmount < term.atom.quantityCount) {
991 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
992 JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
993 if (parenthesesResult == JSRegExpMatch)
994 appendParenthesesDisjunctionContext(backTrack, context);
995 else {
996 resetMatches(term, context);
997 freeParenthesesDisjunctionContext(context);
998
999 if (parenthesesResult != JSRegExpNoMatch)
1000 return parenthesesResult;
1001
1002 break;
1003 }
1004 }
1005 } else {
1006 resetMatches(term, context);
1007 popParenthesesDisjunctionContext(backTrack);
1008 freeParenthesesDisjunctionContext(context);
1009
1010 if (result != JSRegExpNoMatch)
1011 return result;
1012 }
1013
1014 if (backTrack->matchAmount) {
1015 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1016 recordParenthesesMatch(term, context);
1017 }
1018 return JSRegExpMatch;
1019 }
1020
1021 case QuantifierNonGreedy: {
1022 // If we've not reached the limit, try to add one more match.
1023 if (backTrack->matchAmount < term.atom.quantityCount) {
1024 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
1025 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
1026 if (result == JSRegExpMatch) {
1027 appendParenthesesDisjunctionContext(backTrack, context);
1028 recordParenthesesMatch(term, context);
1029 return JSRegExpMatch;
1030 }
1031
1032 resetMatches(term, context);
1033 freeParenthesesDisjunctionContext(context);
1034
1035 if (result != JSRegExpNoMatch)
1036 return result;
1037 }
1038
1039 // Nope - okay backtrack looking for an alternative.
1040 while (backTrack->matchAmount) {
1041 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1042 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
1043 if (result == JSRegExpMatch) {
1044 // successful backtrack! we're back in the game!
1045 if (backTrack->matchAmount) {
1046 context = backTrack->lastContext;
1047 recordParenthesesMatch(term, context);
1048 }
1049 return JSRegExpMatch;
1050 }
1051
1052 // pop a match off the stack
1053 resetMatches(term, context);
1054 popParenthesesDisjunctionContext(backTrack);
1055 freeParenthesesDisjunctionContext(context);
1056
1057 if (result != JSRegExpNoMatch)
1058 return result;
1059 }
1060
1061 return JSRegExpNoMatch;
1062 }
1063 }
1064
1065 RELEASE_ASSERT_NOT_REACHED();
1066 return JSRegExpErrorNoMatch;
1067 }
1068
1069 bool matchDotStarEnclosure(ByteTerm& term, DisjunctionContext* context)
1070 {
1071 UNUSED_PARAM(term);
1072 unsigned matchBegin = context->matchBegin;
1073
1074 if (matchBegin) {
1075 for (matchBegin--; true; matchBegin--) {
1076 if (testCharacterClass(pattern->newlineCharacterClass, input.reread(matchBegin))) {
1077 ++matchBegin;
1078 break;
1079 }
1080
1081 if (!matchBegin)
1082 break;
1083 }
1084 }
1085
1086 unsigned matchEnd = input.getPos();
1087
1088 for (; (matchEnd != input.end())
1089 && (!testCharacterClass(pattern->newlineCharacterClass, input.reread(matchEnd))); matchEnd++) { }
1090
1091 if (((matchBegin && term.anchors.m_bol)
1092 || ((matchEnd != input.end()) && term.anchors.m_eol))
1093 && !pattern->m_multiline)
1094 return false;
1095
1096 context->matchBegin = matchBegin;
1097 context->matchEnd = matchEnd;
1098 return true;
1099 }
1100
1101 #define MATCH_NEXT() { ++context->term; goto matchAgain; }
1102 #define BACKTRACK() { --context->term; goto backtrack; }
1103 #define currentTerm() (disjunction->terms[context->term])
1104 JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
1105 {
1106 if (!--remainingMatchCount)
1107 return JSRegExpErrorHitLimit;
1108
1109 if (btrack)
1110 BACKTRACK();
1111
1112 context->matchBegin = input.getPos();
1113 context->term = 0;
1114
1115 matchAgain:
1116 ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
1117
1118 switch (currentTerm().type) {
1119 case ByteTerm::TypeSubpatternBegin:
1120 MATCH_NEXT();
1121 case ByteTerm::TypeSubpatternEnd:
1122 context->matchEnd = input.getPos();
1123 return JSRegExpMatch;
1124
1125 case ByteTerm::TypeBodyAlternativeBegin:
1126 MATCH_NEXT();
1127 case ByteTerm::TypeBodyAlternativeDisjunction:
1128 case ByteTerm::TypeBodyAlternativeEnd:
1129 context->matchEnd = input.getPos();
1130 return JSRegExpMatch;
1131
1132 case ByteTerm::TypeAlternativeBegin:
1133 MATCH_NEXT();
1134 case ByteTerm::TypeAlternativeDisjunction:
1135 case ByteTerm::TypeAlternativeEnd: {
1136 int offset = currentTerm().alternative.end;
1137 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
1138 backTrack->offset = offset;
1139 context->term += offset;
1140 MATCH_NEXT();
1141 }
1142
1143 case ByteTerm::TypeAssertionBOL:
1144 if (matchAssertionBOL(currentTerm()))
1145 MATCH_NEXT();
1146 BACKTRACK();
1147 case ByteTerm::TypeAssertionEOL:
1148 if (matchAssertionEOL(currentTerm()))
1149 MATCH_NEXT();
1150 BACKTRACK();
1151 case ByteTerm::TypeAssertionWordBoundary:
1152 if (matchAssertionWordBoundary(currentTerm()))
1153 MATCH_NEXT();
1154 BACKTRACK();
1155
1156 case ByteTerm::TypePatternCharacterOnce:
1157 case ByteTerm::TypePatternCharacterFixed: {
1158 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
1159 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - matchAmount))
1160 BACKTRACK();
1161 }
1162 MATCH_NEXT();
1163 }
1164 case ByteTerm::TypePatternCharacterGreedy: {
1165 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1166 unsigned matchAmount = 0;
1167 while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
1168 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + 1)) {
1169 input.uncheckInput(1);
1170 break;
1171 }
1172 ++matchAmount;
1173 }
1174 backTrack->matchAmount = matchAmount;
1175
1176 MATCH_NEXT();
1177 }
1178 case ByteTerm::TypePatternCharacterNonGreedy: {
1179 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1180 backTrack->matchAmount = 0;
1181 MATCH_NEXT();
1182 }
1183
1184 case ByteTerm::TypePatternCasedCharacterOnce:
1185 case ByteTerm::TypePatternCasedCharacterFixed: {
1186 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
1187 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount))
1188 BACKTRACK();
1189 }
1190 MATCH_NEXT();
1191 }
1192 case ByteTerm::TypePatternCasedCharacterGreedy: {
1193 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1194 unsigned matchAmount = 0;
1195 while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
1196 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + 1)) {
1197 input.uncheckInput(1);
1198 break;
1199 }
1200 ++matchAmount;
1201 }
1202 backTrack->matchAmount = matchAmount;
1203
1204 MATCH_NEXT();
1205 }
1206 case ByteTerm::TypePatternCasedCharacterNonGreedy: {
1207 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1208 backTrack->matchAmount = 0;
1209 MATCH_NEXT();
1210 }
1211
1212 case ByteTerm::TypeCharacterClass:
1213 if (matchCharacterClass(currentTerm(), context))
1214 MATCH_NEXT();
1215 BACKTRACK();
1216 case ByteTerm::TypeBackReference:
1217 if (matchBackReference(currentTerm(), context))
1218 MATCH_NEXT();
1219 BACKTRACK();
1220 case ByteTerm::TypeParenthesesSubpattern: {
1221 JSRegExpResult result = matchParentheses(currentTerm(), context);
1222
1223 if (result == JSRegExpMatch) {
1224 MATCH_NEXT();
1225 } else if (result != JSRegExpNoMatch)
1226 return result;
1227
1228 BACKTRACK();
1229 }
1230 case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1231 if (matchParenthesesOnceBegin(currentTerm(), context))
1232 MATCH_NEXT();
1233 BACKTRACK();
1234 case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1235 if (matchParenthesesOnceEnd(currentTerm(), context))
1236 MATCH_NEXT();
1237 BACKTRACK();
1238 case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
1239 if (matchParenthesesTerminalBegin(currentTerm(), context))
1240 MATCH_NEXT();
1241 BACKTRACK();
1242 case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
1243 if (matchParenthesesTerminalEnd(currentTerm(), context))
1244 MATCH_NEXT();
1245 BACKTRACK();
1246 case ByteTerm::TypeParentheticalAssertionBegin:
1247 if (matchParentheticalAssertionBegin(currentTerm(), context))
1248 MATCH_NEXT();
1249 BACKTRACK();
1250 case ByteTerm::TypeParentheticalAssertionEnd:
1251 if (matchParentheticalAssertionEnd(currentTerm(), context))
1252 MATCH_NEXT();
1253 BACKTRACK();
1254
1255 case ByteTerm::TypeCheckInput:
1256 if (input.checkInput(currentTerm().checkInputCount))
1257 MATCH_NEXT();
1258 BACKTRACK();
1259
1260 case ByteTerm::TypeUncheckInput:
1261 input.uncheckInput(currentTerm().checkInputCount);
1262 MATCH_NEXT();
1263
1264 case ByteTerm::TypeDotStarEnclosure:
1265 if (matchDotStarEnclosure(currentTerm(), context))
1266 return JSRegExpMatch;
1267 BACKTRACK();
1268 }
1269
1270 // We should never fall-through to here.
1271 RELEASE_ASSERT_NOT_REACHED();
1272
1273 backtrack:
1274 ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
1275
1276 switch (currentTerm().type) {
1277 case ByteTerm::TypeSubpatternBegin:
1278 return JSRegExpNoMatch;
1279 case ByteTerm::TypeSubpatternEnd:
1280 RELEASE_ASSERT_NOT_REACHED();
1281
1282 case ByteTerm::TypeBodyAlternativeBegin:
1283 case ByteTerm::TypeBodyAlternativeDisjunction: {
1284 int offset = currentTerm().alternative.next;
1285 context->term += offset;
1286 if (offset > 0)
1287 MATCH_NEXT();
1288
1289 if (input.atEnd())
1290 return JSRegExpNoMatch;
1291
1292 input.next();
1293
1294 context->matchBegin = input.getPos();
1295
1296 if (currentTerm().alternative.onceThrough)
1297 context->term += currentTerm().alternative.next;
1298
1299 MATCH_NEXT();
1300 }
1301 case ByteTerm::TypeBodyAlternativeEnd:
1302 RELEASE_ASSERT_NOT_REACHED();
1303
1304 case ByteTerm::TypeAlternativeBegin:
1305 case ByteTerm::TypeAlternativeDisjunction: {
1306 int offset = currentTerm().alternative.next;
1307 context->term += offset;
1308 if (offset > 0)
1309 MATCH_NEXT();
1310 BACKTRACK();
1311 }
1312 case ByteTerm::TypeAlternativeEnd: {
1313 // We should never backtrack back into an alternative of the main body of the regex.
1314 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
1315 unsigned offset = backTrack->offset;
1316 context->term -= offset;
1317 BACKTRACK();
1318 }
1319
1320 case ByteTerm::TypeAssertionBOL:
1321 case ByteTerm::TypeAssertionEOL:
1322 case ByteTerm::TypeAssertionWordBoundary:
1323 BACKTRACK();
1324
1325 case ByteTerm::TypePatternCharacterOnce:
1326 case ByteTerm::TypePatternCharacterFixed:
1327 case ByteTerm::TypePatternCharacterGreedy:
1328 case ByteTerm::TypePatternCharacterNonGreedy:
1329 if (backtrackPatternCharacter(currentTerm(), context))
1330 MATCH_NEXT();
1331 BACKTRACK();
1332 case ByteTerm::TypePatternCasedCharacterOnce:
1333 case ByteTerm::TypePatternCasedCharacterFixed:
1334 case ByteTerm::TypePatternCasedCharacterGreedy:
1335 case ByteTerm::TypePatternCasedCharacterNonGreedy:
1336 if (backtrackPatternCasedCharacter(currentTerm(), context))
1337 MATCH_NEXT();
1338 BACKTRACK();
1339 case ByteTerm::TypeCharacterClass:
1340 if (backtrackCharacterClass(currentTerm(), context))
1341 MATCH_NEXT();
1342 BACKTRACK();
1343 case ByteTerm::TypeBackReference:
1344 if (backtrackBackReference(currentTerm(), context))
1345 MATCH_NEXT();
1346 BACKTRACK();
1347 case ByteTerm::TypeParenthesesSubpattern: {
1348 JSRegExpResult result = backtrackParentheses(currentTerm(), context);
1349
1350 if (result == JSRegExpMatch) {
1351 MATCH_NEXT();
1352 } else if (result != JSRegExpNoMatch)
1353 return result;
1354
1355 BACKTRACK();
1356 }
1357 case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1358 if (backtrackParenthesesOnceBegin(currentTerm(), context))
1359 MATCH_NEXT();
1360 BACKTRACK();
1361 case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1362 if (backtrackParenthesesOnceEnd(currentTerm(), context))
1363 MATCH_NEXT();
1364 BACKTRACK();
1365 case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
1366 if (backtrackParenthesesTerminalBegin(currentTerm(), context))
1367 MATCH_NEXT();
1368 BACKTRACK();
1369 case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
1370 if (backtrackParenthesesTerminalEnd(currentTerm(), context))
1371 MATCH_NEXT();
1372 BACKTRACK();
1373 case ByteTerm::TypeParentheticalAssertionBegin:
1374 if (backtrackParentheticalAssertionBegin(currentTerm(), context))
1375 MATCH_NEXT();
1376 BACKTRACK();
1377 case ByteTerm::TypeParentheticalAssertionEnd:
1378 if (backtrackParentheticalAssertionEnd(currentTerm(), context))
1379 MATCH_NEXT();
1380 BACKTRACK();
1381
1382 case ByteTerm::TypeCheckInput:
1383 input.uncheckInput(currentTerm().checkInputCount);
1384 BACKTRACK();
1385
1386 case ByteTerm::TypeUncheckInput:
1387 input.checkInput(currentTerm().checkInputCount);
1388 BACKTRACK();
1389
1390 case ByteTerm::TypeDotStarEnclosure:
1391 RELEASE_ASSERT_NOT_REACHED();
1392 }
1393
1394 RELEASE_ASSERT_NOT_REACHED();
1395 return JSRegExpErrorNoMatch;
1396 }
1397
1398 JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
1399 {
1400 JSRegExpResult result = matchDisjunction(disjunction, context, btrack);
1401
1402 if (result == JSRegExpMatch) {
1403 while (context->matchBegin == context->matchEnd) {
1404 result = matchDisjunction(disjunction, context, true);
1405 if (result != JSRegExpMatch)
1406 return result;
1407 }
1408 return JSRegExpMatch;
1409 }
1410
1411 return result;
1412 }
1413
1414 unsigned interpret()
1415 {
1416 if (!input.isAvailableInput(0))
1417 return offsetNoMatch;
1418
1419 for (unsigned i = 0; i < pattern->m_body->m_numSubpatterns + 1; ++i)
1420 output[i << 1] = offsetNoMatch;
1421
1422 allocatorPool = pattern->m_allocator->startAllocator();
1423 RELEASE_ASSERT(allocatorPool);
1424
1425 DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
1426
1427 JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false);
1428 if (result == JSRegExpMatch) {
1429 output[0] = context->matchBegin;
1430 output[1] = context->matchEnd;
1431 }
1432
1433 freeDisjunctionContext(context);
1434
1435 pattern->m_allocator->stopAllocator();
1436
1437 ASSERT((result == JSRegExpMatch) == (output[0] != offsetNoMatch));
1438 return output[0];
1439 }
1440
1441 Interpreter(BytecodePattern* pattern, unsigned* output, const CharType* input, unsigned length, unsigned start)
1442 : pattern(pattern)
1443 , output(output)
1444 , input(input, start, length)
1445 , allocatorPool(0)
1446 , remainingMatchCount(matchLimit)
1447 {
1448 }
1449
1450 private:
1451 BytecodePattern* pattern;
1452 unsigned* output;
1453 InputStream input;
1454 BumpPointerPool* allocatorPool;
1455 unsigned remainingMatchCount;
1456 };
1457
1458 class ByteCompiler {
1459 struct ParenthesesStackEntry {
1460 unsigned beginTerm;
1461 unsigned savedAlternativeIndex;
1462 ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/)
1463 : beginTerm(beginTerm)
1464 , savedAlternativeIndex(savedAlternativeIndex)
1465 {
1466 }
1467 };
1468
1469 public:
1470 ByteCompiler(YarrPattern& pattern)
1471 : m_pattern(pattern)
1472 {
1473 m_currentAlternativeIndex = 0;
1474 }
1475
1476 PassOwnPtr<BytecodePattern> compile(BumpPointerAllocator* allocator)
1477 {
1478 regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough());
1479 emitDisjunction(m_pattern.m_body);
1480 regexEnd();
1481
1482 return adoptPtr(new BytecodePattern(m_bodyDisjunction.release(), m_allParenthesesInfo, m_pattern, allocator));
1483 }
1484
1485 void checkInput(unsigned count)
1486 {
1487 m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count));
1488 }
1489
1490 void uncheckInput(unsigned count)
1491 {
1492 m_bodyDisjunction->terms.append(ByteTerm::UncheckInput(count));
1493 }
1494
1495 void assertionBOL(unsigned inputPosition)
1496 {
1497 m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition));
1498 }
1499
1500 void assertionEOL(unsigned inputPosition)
1501 {
1502 m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition));
1503 }
1504
1505 void assertionWordBoundary(bool invert, unsigned inputPosition)
1506 {
1507 m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition));
1508 }
1509
1510 void atomPatternCharacter(UChar ch, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
1511 {
1512 if (m_pattern.m_ignoreCase) {
1513 UChar lo = Unicode::toLower(ch);
1514 UChar hi = Unicode::toUpper(ch);
1515
1516 if (lo != hi) {
1517 m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType));
1518 return;
1519 }
1520 }
1521
1522 m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType));
1523 }
1524
1525 void atomCharacterClass(CharacterClass* characterClass, bool invert, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
1526 {
1527 m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition));
1528
1529 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount.unsafeGet();
1530 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1531 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1532 }
1533
1534 void atomBackReference(unsigned subpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
1535 {
1536 ASSERT(subpatternId);
1537
1538 m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition));
1539
1540 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount.unsafeGet();
1541 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1542 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1543 }
1544
1545 void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1546 {
1547 int beginTerm = m_bodyDisjunction->terms.size();
1548
1549 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
1550 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1551 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1552 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1553
1554 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1555 m_currentAlternativeIndex = beginTerm + 1;
1556 }
1557
1558 void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1559 {
1560 int beginTerm = m_bodyDisjunction->terms.size();
1561
1562 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalBegin, subpatternId, capture, false, inputPosition));
1563 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1564 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1565 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1566
1567 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1568 m_currentAlternativeIndex = beginTerm + 1;
1569 }
1570
1571 void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1572 {
1573 // Errrk! - this is a little crazy, we initially generate as a TypeParenthesesSubpatternOnceBegin,
1574 // then fix this up at the end! - simplifying this should make it much clearer.
1575 // https://bugs.webkit.org/show_bug.cgi?id=50136
1576
1577 int beginTerm = m_bodyDisjunction->terms.size();
1578
1579 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
1580 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1581 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1582 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1583
1584 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1585 m_currentAlternativeIndex = beginTerm + 1;
1586 }
1587
1588 void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation)
1589 {
1590 int beginTerm = m_bodyDisjunction->terms.size();
1591
1592 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, false, invert, 0));
1593 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1594 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1595 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1596
1597 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1598 m_currentAlternativeIndex = beginTerm + 1;
1599 }
1600
1601 void atomParentheticalAssertionEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
1602 {
1603 unsigned beginTerm = popParenthesesStack();
1604 closeAlternative(beginTerm + 1);
1605 unsigned endTerm = m_bodyDisjunction->terms.size();
1606
1607 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin);
1608
1609 bool invert = m_bodyDisjunction->terms[beginTerm].invert();
1610 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1611
1612 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionEnd, subpatternId, false, invert, inputPosition));
1613 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1614 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1615 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1616
1617 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet();
1618 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1619 m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet();
1620 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1621 }
1622
1623 void assertionDotStarEnclosure(bool bolAnchored, bool eolAnchored)
1624 {
1625 m_bodyDisjunction->terms.append(ByteTerm::DotStarEnclosure(bolAnchored, eolAnchored));
1626 }
1627
1628 unsigned popParenthesesStack()
1629 {
1630 ASSERT(m_parenthesesStack.size());
1631 int stackEnd = m_parenthesesStack.size() - 1;
1632 unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm;
1633 m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex;
1634 m_parenthesesStack.shrink(stackEnd);
1635
1636 ASSERT(beginTerm < m_bodyDisjunction->terms.size());
1637 ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size());
1638
1639 return beginTerm;
1640 }
1641
1642 #ifndef NDEBUG
1643 void dumpDisjunction(ByteDisjunction* disjunction)
1644 {
1645 dataLogF("ByteDisjunction(%p):\n\t", disjunction);
1646 for (unsigned i = 0; i < disjunction->terms.size(); ++i)
1647 dataLogF("{ %d } ", disjunction->terms[i].type);
1648 dataLogF("\n");
1649 }
1650 #endif
1651
1652 void closeAlternative(int beginTerm)
1653 {
1654 int origBeginTerm = beginTerm;
1655 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin);
1656 int endIndex = m_bodyDisjunction->terms.size();
1657
1658 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1659
1660 if (!m_bodyDisjunction->terms[beginTerm].alternative.next)
1661 m_bodyDisjunction->terms.remove(beginTerm);
1662 else {
1663 while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1664 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1665 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction);
1666 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1667 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1668 }
1669
1670 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1671
1672 m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd());
1673 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1674 }
1675 }
1676
1677 void closeBodyAlternative()
1678 {
1679 int beginTerm = 0;
1680 int origBeginTerm = 0;
1681 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin);
1682 int endIndex = m_bodyDisjunction->terms.size();
1683
1684 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1685
1686 while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1687 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1688 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction);
1689 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1690 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1691 }
1692
1693 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1694
1695 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd());
1696 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1697 }
1698
1699 void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0)
1700 {
1701 unsigned beginTerm = popParenthesesStack();
1702 closeAlternative(beginTerm + 1);
1703 unsigned endTerm = m_bodyDisjunction->terms.size();
1704
1705 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
1706
1707 ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm];
1708
1709 bool capture = parenthesesBegin.capture();
1710 unsigned subpatternId = parenthesesBegin.atom.subpatternId;
1711
1712 unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
1713 OwnPtr<ByteDisjunction> parenthesesDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize));
1714
1715 unsigned firstTermInParentheses = beginTerm + 1;
1716 parenthesesDisjunction->terms.reserveInitialCapacity(endTerm - firstTermInParentheses + 2);
1717
1718 parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin());
1719 for (unsigned termInParentheses = firstTermInParentheses; termInParentheses < endTerm; ++termInParentheses)
1720 parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]);
1721 parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd());
1722
1723 m_bodyDisjunction->terms.shrink(beginTerm);
1724
1725 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, inputPosition));
1726 m_allParenthesesInfo.append(parenthesesDisjunction.release());
1727
1728 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet();
1729 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1730 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1731 }
1732
1733 void atomParenthesesOnceEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
1734 {
1735 unsigned beginTerm = popParenthesesStack();
1736 closeAlternative(beginTerm + 1);
1737 unsigned endTerm = m_bodyDisjunction->terms.size();
1738
1739 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
1740
1741 bool capture = m_bodyDisjunction->terms[beginTerm].capture();
1742 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1743
1744 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, capture, false, inputPosition));
1745 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1746 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1747 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1748
1749 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet();
1750 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1751 m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet();
1752 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1753 }
1754
1755 void atomParenthesesTerminalEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
1756 {
1757 unsigned beginTerm = popParenthesesStack();
1758 closeAlternative(beginTerm + 1);
1759 unsigned endTerm = m_bodyDisjunction->terms.size();
1760
1761 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
1762
1763 bool capture = m_bodyDisjunction->terms[beginTerm].capture();
1764 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1765
1766 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalEnd, subpatternId, capture, false, inputPosition));
1767 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1768 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1769 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1770
1771 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet();
1772 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1773 m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet();
1774 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1775 }
1776
1777 void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough)
1778 {
1779 m_bodyDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize));
1780 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough));
1781 m_bodyDisjunction->terms[0].frameLocation = 0;
1782 m_currentAlternativeIndex = 0;
1783 }
1784
1785 void regexEnd()
1786 {
1787 closeBodyAlternative();
1788 }
1789
1790 void alternativeBodyDisjunction(bool onceThrough)
1791 {
1792 int newAlternativeIndex = m_bodyDisjunction->terms.size();
1793 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
1794 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough));
1795
1796 m_currentAlternativeIndex = newAlternativeIndex;
1797 }
1798
1799 void alternativeDisjunction()
1800 {
1801 int newAlternativeIndex = m_bodyDisjunction->terms.size();
1802 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
1803 m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction());
1804
1805 m_currentAlternativeIndex = newAlternativeIndex;
1806 }
1807
1808 void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0)
1809 {
1810 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
1811 unsigned currentCountAlreadyChecked = inputCountAlreadyChecked;
1812
1813 PatternAlternative* alternative = disjunction->m_alternatives[alt].get();
1814
1815 if (alt) {
1816 if (disjunction == m_pattern.m_body)
1817 alternativeBodyDisjunction(alternative->onceThrough());
1818 else
1819 alternativeDisjunction();
1820 }
1821
1822 unsigned minimumSize = alternative->m_minimumSize;
1823 ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked);
1824 unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked;
1825
1826 if (countToCheck) {
1827 checkInput(countToCheck);
1828 currentCountAlreadyChecked += countToCheck;
1829 }
1830
1831 for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
1832 PatternTerm& term = alternative->m_terms[i];
1833
1834 switch (term.type) {
1835 case PatternTerm::TypeAssertionBOL:
1836 assertionBOL(currentCountAlreadyChecked - term.inputPosition);
1837 break;
1838
1839 case PatternTerm::TypeAssertionEOL:
1840 assertionEOL(currentCountAlreadyChecked - term.inputPosition);
1841 break;
1842
1843 case PatternTerm::TypeAssertionWordBoundary:
1844 assertionWordBoundary(term.invert(), currentCountAlreadyChecked - term.inputPosition);
1845 break;
1846
1847 case PatternTerm::TypePatternCharacter:
1848 atomPatternCharacter(term.patternCharacter, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType);
1849 break;
1850
1851 case PatternTerm::TypeCharacterClass:
1852 atomCharacterClass(term.characterClass, term.invert(), currentCountAlreadyChecked- term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType);
1853 break;
1854
1855 case PatternTerm::TypeBackReference:
1856 atomBackReference(term.backReferenceSubpatternId, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType);
1857 break;
1858
1859 case PatternTerm::TypeForwardReference:
1860 break;
1861
1862 case PatternTerm::TypeParenthesesSubpattern: {
1863 unsigned disjunctionAlreadyCheckedCount = 0;
1864 if (term.quantityCount == 1 && !term.parentheses.isCopy) {
1865 unsigned alternativeFrameLocation = term.frameLocation;
1866 // For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame.
1867 if (term.quantityType == QuantifierFixedCount)
1868 disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize;
1869 else
1870 alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
1871 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1872 atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, alternativeFrameLocation);
1873 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
1874 atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType);
1875 } else if (term.parentheses.isTerminal) {
1876 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1877 atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesOnce);
1878 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
1879 atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType);
1880 } else {
1881 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1882 atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, 0);
1883 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
1884 atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
1885 }
1886 break;
1887 }
1888
1889 case PatternTerm::TypeParentheticalAssertion: {
1890 unsigned alternativeFrameLocation = term.frameLocation + YarrStackSpaceForBackTrackInfoParentheticalAssertion;
1891
1892 ASSERT(currentCountAlreadyChecked >= static_cast<unsigned>(term.inputPosition));
1893 unsigned positiveInputOffset = currentCountAlreadyChecked - static_cast<unsigned>(term.inputPosition);
1894 unsigned uncheckAmount = 0;
1895 if (positiveInputOffset > term.parentheses.disjunction->m_minimumSize) {
1896 uncheckAmount = positiveInputOffset - term.parentheses.disjunction->m_minimumSize;
1897 uncheckInput(uncheckAmount);
1898 currentCountAlreadyChecked -= uncheckAmount;
1899 }
1900
1901 atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invert(), term.frameLocation, alternativeFrameLocation);
1902 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset - uncheckAmount);
1903 atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityCount, term.quantityType);
1904 if (uncheckAmount) {
1905 checkInput(uncheckAmount);
1906 currentCountAlreadyChecked += uncheckAmount;
1907 }
1908 break;
1909 }
1910
1911 case PatternTerm::TypeDotStarEnclosure:
1912 assertionDotStarEnclosure(term.anchors.bolAnchor, term.anchors.eolAnchor);
1913 break;
1914 }
1915 }
1916 }
1917 }
1918
1919 private:
1920 YarrPattern& m_pattern;
1921 OwnPtr<ByteDisjunction> m_bodyDisjunction;
1922 unsigned m_currentAlternativeIndex;
1923 Vector<ParenthesesStackEntry> m_parenthesesStack;
1924 Vector<OwnPtr<ByteDisjunction> > m_allParenthesesInfo;
1925 };
1926
1927 PassOwnPtr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator)
1928 {
1929 return ByteCompiler(pattern).compile(allocator);
1930 }
1931
1932 unsigned interpret(BytecodePattern* bytecode, const String& input, unsigned start, unsigned* output)
1933 {
1934 if (input.is8Bit())
1935 return Interpreter<LChar>(bytecode, output, input.characters8(), input.length(), start).interpret();
1936 return Interpreter<UChar>(bytecode, output, input.characters16(), input.length(), start).interpret();
1937 }
1938
1939 unsigned interpret(BytecodePattern* bytecode, const LChar* input, unsigned length, unsigned start, unsigned* output)
1940 {
1941 return Interpreter<LChar>(bytecode, output, input, length, start).interpret();
1942 }
1943
1944 unsigned interpret(BytecodePattern* bytecode, const UChar* input, unsigned length, unsigned start, unsigned* output)
1945 {
1946 return Interpreter<UChar>(bytecode, output, input, length, start).interpret();
1947 }
1948
1949 // These should be the same for both UChar & LChar.
1950 COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter);
1951 COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass);
1952 COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference);
1953 COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative);
1954 COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion);
1955 COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce);
1956 COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheses) == (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses);
1957
1958
1959 } }