]> git.saurik.com Git - apple/javascriptcore.git/blob - yarr/YarrInterpreter.cpp
JavaScriptCore-7601.1.46.3.tar.gz
[apple/javascriptcore.git] / yarr / YarrInterpreter.cpp
1 /*
2 * Copyright (C) 2009 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27 #include "config.h"
28 #include "YarrInterpreter.h"
29
30 #include "Yarr.h"
31 #include "YarrCanonicalizeUCS2.h"
32 #include <wtf/BumpPointerAllocator.h>
33 #include <wtf/DataLog.h>
34 #include <wtf/text/CString.h>
35 #include <wtf/text/WTFString.h>
36
37 using namespace WTF;
38
39 namespace JSC { namespace Yarr {
40
41 template<typename CharType>
42 class Interpreter {
43 public:
44 struct ParenthesesDisjunctionContext;
45
46 struct BackTrackInfoPatternCharacter {
47 uintptr_t matchAmount;
48 };
49 struct BackTrackInfoCharacterClass {
50 uintptr_t matchAmount;
51 };
52 struct BackTrackInfoBackReference {
53 uintptr_t begin; // Not really needed for greedy quantifiers.
54 uintptr_t matchAmount; // Not really needed for fixed quantifiers.
55 };
56 struct BackTrackInfoAlternative {
57 uintptr_t offset;
58 };
59 struct BackTrackInfoParentheticalAssertion {
60 uintptr_t begin;
61 };
62 struct BackTrackInfoParenthesesOnce {
63 uintptr_t begin;
64 };
65 struct BackTrackInfoParenthesesTerminal {
66 uintptr_t begin;
67 };
68 struct BackTrackInfoParentheses {
69 uintptr_t matchAmount;
70 ParenthesesDisjunctionContext* lastContext;
71 };
72
73 static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context)
74 {
75 context->next = backTrack->lastContext;
76 backTrack->lastContext = context;
77 ++backTrack->matchAmount;
78 }
79
80 static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack)
81 {
82 RELEASE_ASSERT(backTrack->matchAmount);
83 RELEASE_ASSERT(backTrack->lastContext);
84 backTrack->lastContext = backTrack->lastContext->next;
85 --backTrack->matchAmount;
86 }
87
88 struct DisjunctionContext
89 {
90 DisjunctionContext()
91 : term(0)
92 {
93 }
94
95 void* operator new(size_t, void* where)
96 {
97 return where;
98 }
99
100 int term;
101 unsigned matchBegin;
102 unsigned matchEnd;
103 uintptr_t frame[1];
104 };
105
106 DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction)
107 {
108 size_t size = sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t);
109 allocatorPool = allocatorPool->ensureCapacity(size);
110 RELEASE_ASSERT(allocatorPool);
111 return new (allocatorPool->alloc(size)) DisjunctionContext();
112 }
113
114 void freeDisjunctionContext(DisjunctionContext* context)
115 {
116 allocatorPool = allocatorPool->dealloc(context);
117 }
118
119 struct ParenthesesDisjunctionContext
120 {
121 ParenthesesDisjunctionContext(unsigned* output, ByteTerm& term)
122 : next(0)
123 {
124 unsigned firstSubpatternId = term.atom.subpatternId;
125 unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns;
126
127 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) {
128 subpatternBackup[i] = output[(firstSubpatternId << 1) + i];
129 output[(firstSubpatternId << 1) + i] = offsetNoMatch;
130 }
131
132 new (getDisjunctionContext(term)) DisjunctionContext();
133 }
134
135 void* operator new(size_t, void* where)
136 {
137 return where;
138 }
139
140 void restoreOutput(unsigned* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns)
141 {
142 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i)
143 output[(firstSubpatternId << 1) + i] = subpatternBackup[i];
144 }
145
146 DisjunctionContext* getDisjunctionContext(ByteTerm& term)
147 {
148 return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1]));
149 }
150
151 ParenthesesDisjunctionContext* next;
152 unsigned subpatternBackup[1];
153 };
154
155 ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, unsigned* output, ByteTerm& term)
156 {
157 size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(unsigned) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(unsigned) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + static_cast<size_t>(disjunction->m_frameSize) * sizeof(uintptr_t);
158 allocatorPool = allocatorPool->ensureCapacity(size);
159 RELEASE_ASSERT(allocatorPool);
160 return new (allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term);
161 }
162
163 void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context)
164 {
165 allocatorPool = allocatorPool->dealloc(context);
166 }
167
168 class InputStream {
169 public:
170 InputStream(const CharType* input, unsigned start, unsigned length)
171 : input(input)
172 , pos(start)
173 , length(length)
174 {
175 }
176
177 void next()
178 {
179 ++pos;
180 }
181
182 void rewind(unsigned amount)
183 {
184 ASSERT(pos >= amount);
185 pos -= amount;
186 }
187
188 int read()
189 {
190 ASSERT(pos < length);
191 if (pos < length)
192 return input[pos];
193 return -1;
194 }
195
196 int readPair()
197 {
198 ASSERT(pos + 1 < length);
199 return input[pos] | input[pos + 1] << 16;
200 }
201
202 int readChecked(unsigned negativePositionOffest)
203 {
204 RELEASE_ASSERT(pos >= negativePositionOffest);
205 unsigned p = pos - negativePositionOffest;
206 ASSERT(p < length);
207 return input[p];
208 }
209
210 int reread(unsigned from)
211 {
212 ASSERT(from < length);
213 return input[from];
214 }
215
216 int prev()
217 {
218 ASSERT(!(pos > length));
219 if (pos && length)
220 return input[pos - 1];
221 return -1;
222 }
223
224 unsigned getPos()
225 {
226 return pos;
227 }
228
229 void setPos(unsigned p)
230 {
231 pos = p;
232 }
233
234 bool atStart()
235 {
236 return pos == 0;
237 }
238
239 bool atEnd()
240 {
241 return pos == length;
242 }
243
244 unsigned end()
245 {
246 return length;
247 }
248
249 bool checkInput(unsigned count)
250 {
251 if (((pos + count) <= length) && ((pos + count) >= pos)) {
252 pos += count;
253 return true;
254 }
255 return false;
256 }
257
258 void uncheckInput(unsigned count)
259 {
260 RELEASE_ASSERT(pos >= count);
261 pos -= count;
262 }
263
264 bool atStart(unsigned negativePositionOffest)
265 {
266 return pos == negativePositionOffest;
267 }
268
269 bool atEnd(unsigned negativePositionOffest)
270 {
271 RELEASE_ASSERT(pos >= negativePositionOffest);
272 return (pos - negativePositionOffest) == length;
273 }
274
275 bool isAvailableInput(unsigned offset)
276 {
277 return (((pos + offset) <= length) && ((pos + offset) >= pos));
278 }
279
280 private:
281 const CharType* input;
282 unsigned pos;
283 unsigned length;
284 };
285
286 bool testCharacterClass(CharacterClass* characterClass, int ch)
287 {
288 if (ch & 0xFF80) {
289 for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i)
290 if (ch == characterClass->m_matchesUnicode[i])
291 return true;
292 for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i)
293 if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end))
294 return true;
295 } else {
296 for (unsigned i = 0; i < characterClass->m_matches.size(); ++i)
297 if (ch == characterClass->m_matches[i])
298 return true;
299 for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i)
300 if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end))
301 return true;
302 }
303
304 return false;
305 }
306
307 bool checkCharacter(int testChar, unsigned negativeInputOffset)
308 {
309 return testChar == input.readChecked(negativeInputOffset);
310 }
311
312 bool checkCasedCharacter(int loChar, int hiChar, unsigned negativeInputOffset)
313 {
314 int ch = input.readChecked(negativeInputOffset);
315 return (loChar == ch) || (hiChar == ch);
316 }
317
318 bool checkCharacterClass(CharacterClass* characterClass, bool invert, unsigned negativeInputOffset)
319 {
320 bool match = testCharacterClass(characterClass, input.readChecked(negativeInputOffset));
321 return invert ? !match : match;
322 }
323
324 bool tryConsumeBackReference(int matchBegin, int matchEnd, unsigned negativeInputOffset)
325 {
326 unsigned matchSize = (unsigned)(matchEnd - matchBegin);
327
328 if (!input.checkInput(matchSize))
329 return false;
330
331 if (pattern->m_ignoreCase) {
332 for (unsigned i = 0; i < matchSize; ++i) {
333 int oldCh = input.reread(matchBegin + i);
334 int ch = input.readChecked(negativeInputOffset + matchSize - i);
335
336 if (oldCh == ch)
337 continue;
338
339 // The definition for canonicalize (see ES 5.1, 15.10.2.8) means that
340 // unicode values are never allowed to match against ascii ones.
341 if (isASCII(oldCh) || isASCII(ch)) {
342 if (toASCIIUpper(oldCh) == toASCIIUpper(ch))
343 continue;
344 } else if (areCanonicallyEquivalent(oldCh, ch))
345 continue;
346
347 input.uncheckInput(matchSize);
348 return false;
349 }
350 } else {
351 for (unsigned i = 0; i < matchSize; ++i) {
352 if (!checkCharacter(input.reread(matchBegin + i), negativeInputOffset + matchSize - i)) {
353 input.uncheckInput(matchSize);
354 return false;
355 }
356 }
357 }
358
359 return true;
360 }
361
362 bool matchAssertionBOL(ByteTerm& term)
363 {
364 return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition + 1)));
365 }
366
367 bool matchAssertionEOL(ByteTerm& term)
368 {
369 if (term.inputPosition)
370 return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition)));
371
372 return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read()));
373 }
374
375 bool matchAssertionWordBoundary(ByteTerm& term)
376 {
377 bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition + 1));
378 bool readIsWordchar;
379 if (term.inputPosition)
380 readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition));
381 else
382 readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read());
383
384 bool wordBoundary = prevIsWordchar != readIsWordchar;
385 return term.invert() ? !wordBoundary : wordBoundary;
386 }
387
388 bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context)
389 {
390 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
391
392 switch (term.atom.quantityType) {
393 case QuantifierFixedCount:
394 break;
395
396 case QuantifierGreedy:
397 if (backTrack->matchAmount) {
398 --backTrack->matchAmount;
399 input.uncheckInput(1);
400 return true;
401 }
402 break;
403
404 case QuantifierNonGreedy:
405 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
406 ++backTrack->matchAmount;
407 if (checkCharacter(term.atom.patternCharacter, term.inputPosition + 1))
408 return true;
409 }
410 input.uncheckInput(backTrack->matchAmount);
411 break;
412 }
413
414 return false;
415 }
416
417 bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context)
418 {
419 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
420
421 switch (term.atom.quantityType) {
422 case QuantifierFixedCount:
423 break;
424
425 case QuantifierGreedy:
426 if (backTrack->matchAmount) {
427 --backTrack->matchAmount;
428 input.uncheckInput(1);
429 return true;
430 }
431 break;
432
433 case QuantifierNonGreedy:
434 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
435 ++backTrack->matchAmount;
436 if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition + 1))
437 return true;
438 }
439 input.uncheckInput(backTrack->matchAmount);
440 break;
441 }
442
443 return false;
444 }
445
446 bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context)
447 {
448 ASSERT(term.type == ByteTerm::TypeCharacterClass);
449 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
450
451 switch (term.atom.quantityType) {
452 case QuantifierFixedCount: {
453 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
454 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - matchAmount))
455 return false;
456 }
457 return true;
458 }
459
460 case QuantifierGreedy: {
461 unsigned matchAmount = 0;
462 while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
463 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) {
464 input.uncheckInput(1);
465 break;
466 }
467 ++matchAmount;
468 }
469 backTrack->matchAmount = matchAmount;
470
471 return true;
472 }
473
474 case QuantifierNonGreedy:
475 backTrack->matchAmount = 0;
476 return true;
477 }
478
479 RELEASE_ASSERT_NOT_REACHED();
480 return false;
481 }
482
483 bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context)
484 {
485 ASSERT(term.type == ByteTerm::TypeCharacterClass);
486 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
487
488 switch (term.atom.quantityType) {
489 case QuantifierFixedCount:
490 break;
491
492 case QuantifierGreedy:
493 if (backTrack->matchAmount) {
494 --backTrack->matchAmount;
495 input.uncheckInput(1);
496 return true;
497 }
498 break;
499
500 case QuantifierNonGreedy:
501 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
502 ++backTrack->matchAmount;
503 if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1))
504 return true;
505 }
506 input.uncheckInput(backTrack->matchAmount);
507 break;
508 }
509
510 return false;
511 }
512
513 bool matchBackReference(ByteTerm& term, DisjunctionContext* context)
514 {
515 ASSERT(term.type == ByteTerm::TypeBackReference);
516 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
517
518 unsigned matchBegin = output[(term.atom.subpatternId << 1)];
519 unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1];
520
521 // If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that.
522 // In this case the result of match is empty string like when it references to a parentheses with zero-width match.
523 // Eg.: /(a\1)/
524 if (matchEnd == offsetNoMatch)
525 return true;
526
527 if (matchBegin == offsetNoMatch)
528 return true;
529
530 ASSERT(matchBegin <= matchEnd);
531
532 if (matchBegin == matchEnd)
533 return true;
534
535 switch (term.atom.quantityType) {
536 case QuantifierFixedCount: {
537 backTrack->begin = input.getPos();
538 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
539 if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
540 input.setPos(backTrack->begin);
541 return false;
542 }
543 }
544 return true;
545 }
546
547 case QuantifierGreedy: {
548 unsigned matchAmount = 0;
549 while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition))
550 ++matchAmount;
551 backTrack->matchAmount = matchAmount;
552 return true;
553 }
554
555 case QuantifierNonGreedy:
556 backTrack->begin = input.getPos();
557 backTrack->matchAmount = 0;
558 return true;
559 }
560
561 RELEASE_ASSERT_NOT_REACHED();
562 return false;
563 }
564
565 bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context)
566 {
567 ASSERT(term.type == ByteTerm::TypeBackReference);
568 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
569
570 unsigned matchBegin = output[(term.atom.subpatternId << 1)];
571 unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1];
572
573 if (matchBegin == offsetNoMatch)
574 return false;
575
576 ASSERT(matchBegin <= matchEnd);
577
578 if (matchBegin == matchEnd)
579 return false;
580
581 switch (term.atom.quantityType) {
582 case QuantifierFixedCount:
583 // for quantityCount == 1, could rewind.
584 input.setPos(backTrack->begin);
585 break;
586
587 case QuantifierGreedy:
588 if (backTrack->matchAmount) {
589 --backTrack->matchAmount;
590 input.rewind(matchEnd - matchBegin);
591 return true;
592 }
593 break;
594
595 case QuantifierNonGreedy:
596 if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
597 ++backTrack->matchAmount;
598 return true;
599 }
600 input.setPos(backTrack->begin);
601 break;
602 }
603
604 return false;
605 }
606
607 void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context)
608 {
609 if (term.capture()) {
610 unsigned subpatternId = term.atom.subpatternId;
611 output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition;
612 output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition;
613 }
614 }
615 void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context)
616 {
617 unsigned firstSubpatternId = term.atom.subpatternId;
618 unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
619 context->restoreOutput(output, firstSubpatternId, count);
620 }
621 JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
622 {
623 while (backTrack->matchAmount) {
624 ParenthesesDisjunctionContext* context = backTrack->lastContext;
625
626 JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true);
627 if (result == JSRegExpMatch)
628 return JSRegExpMatch;
629
630 resetMatches(term, context);
631 popParenthesesDisjunctionContext(backTrack);
632 freeParenthesesDisjunctionContext(context);
633
634 if (result != JSRegExpNoMatch)
635 return result;
636 }
637
638 return JSRegExpNoMatch;
639 }
640
641 bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
642 {
643 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
644 ASSERT(term.atom.quantityCount == 1);
645
646 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
647
648 switch (term.atom.quantityType) {
649 case QuantifierGreedy: {
650 // set this speculatively; if we get to the parens end this will be true.
651 backTrack->begin = input.getPos();
652 break;
653 }
654 case QuantifierNonGreedy: {
655 backTrack->begin = notFound;
656 context->term += term.atom.parenthesesWidth;
657 return true;
658 }
659 case QuantifierFixedCount:
660 break;
661 }
662
663 if (term.capture()) {
664 unsigned subpatternId = term.atom.subpatternId;
665 output[(subpatternId << 1)] = input.getPos() - term.inputPosition;
666 }
667
668 return true;
669 }
670
671 bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
672 {
673 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
674 ASSERT(term.atom.quantityCount == 1);
675
676 if (term.capture()) {
677 unsigned subpatternId = term.atom.subpatternId;
678 output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition;
679 }
680
681 if (term.atom.quantityType == QuantifierFixedCount)
682 return true;
683
684 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
685 return backTrack->begin != input.getPos();
686 }
687
688 bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
689 {
690 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
691 ASSERT(term.atom.quantityCount == 1);
692
693 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
694
695 if (term.capture()) {
696 unsigned subpatternId = term.atom.subpatternId;
697 output[(subpatternId << 1)] = offsetNoMatch;
698 output[(subpatternId << 1) + 1] = offsetNoMatch;
699 }
700
701 switch (term.atom.quantityType) {
702 case QuantifierGreedy:
703 // if we backtrack to this point, there is another chance - try matching nothing.
704 ASSERT(backTrack->begin != notFound);
705 backTrack->begin = notFound;
706 context->term += term.atom.parenthesesWidth;
707 return true;
708 case QuantifierNonGreedy:
709 ASSERT(backTrack->begin != notFound);
710 FALLTHROUGH;
711 case QuantifierFixedCount:
712 break;
713 }
714
715 return false;
716 }
717
718 bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
719 {
720 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
721 ASSERT(term.atom.quantityCount == 1);
722
723 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
724
725 switch (term.atom.quantityType) {
726 case QuantifierGreedy:
727 if (backTrack->begin == notFound) {
728 context->term -= term.atom.parenthesesWidth;
729 return false;
730 }
731 FALLTHROUGH;
732 case QuantifierNonGreedy:
733 if (backTrack->begin == notFound) {
734 backTrack->begin = input.getPos();
735 if (term.capture()) {
736 // Technically this access to inputPosition should be accessing the begin term's
737 // inputPosition, but for repeats other than fixed these values should be
738 // the same anyway! (We don't pre-check for greedy or non-greedy matches.)
739 ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
740 ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition);
741 unsigned subpatternId = term.atom.subpatternId;
742 output[subpatternId << 1] = input.getPos() + term.inputPosition;
743 }
744 context->term -= term.atom.parenthesesWidth;
745 return true;
746 }
747 FALLTHROUGH;
748 case QuantifierFixedCount:
749 break;
750 }
751
752 return false;
753 }
754
755 bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
756 {
757 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
758 ASSERT(term.atom.quantityType == QuantifierGreedy);
759 ASSERT(term.atom.quantityCount == quantifyInfinite);
760 ASSERT(!term.capture());
761
762 BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
763 backTrack->begin = input.getPos();
764 return true;
765 }
766
767 bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context)
768 {
769 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalEnd);
770
771 BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
772 // Empty match is a failed match.
773 if (backTrack->begin == input.getPos())
774 return false;
775
776 // Successful match! Okay, what's next? - loop around and try to match moar!
777 context->term -= (term.atom.parenthesesWidth + 1);
778 return true;
779 }
780
781 bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
782 {
783 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
784 ASSERT(term.atom.quantityType == QuantifierGreedy);
785 ASSERT(term.atom.quantityCount == quantifyInfinite);
786 ASSERT(!term.capture());
787
788 // If we backtrack to this point, we have failed to match this iteration of the parens.
789 // Since this is greedy / zero minimum a failed is also accepted as a match!
790 context->term += term.atom.parenthesesWidth;
791 return true;
792 }
793
794 bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*)
795 {
796 // 'Terminal' parentheses are at the end of the regex, and as such a match past end
797 // should always be returned as a successful match - we should never backtrack to here.
798 RELEASE_ASSERT_NOT_REACHED();
799 return false;
800 }
801
802 bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
803 {
804 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
805 ASSERT(term.atom.quantityCount == 1);
806
807 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
808
809 backTrack->begin = input.getPos();
810 return true;
811 }
812
813 bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
814 {
815 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
816 ASSERT(term.atom.quantityCount == 1);
817
818 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
819
820 input.setPos(backTrack->begin);
821
822 // We've reached the end of the parens; if they are inverted, this is failure.
823 if (term.invert()) {
824 context->term -= term.atom.parenthesesWidth;
825 return false;
826 }
827
828 return true;
829 }
830
831 bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
832 {
833 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
834 ASSERT(term.atom.quantityCount == 1);
835
836 // We've failed to match parens; if they are inverted, this is win!
837 if (term.invert()) {
838 context->term += term.atom.parenthesesWidth;
839 return true;
840 }
841
842 return false;
843 }
844
845 bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
846 {
847 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
848 ASSERT(term.atom.quantityCount == 1);
849
850 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
851
852 input.setPos(backTrack->begin);
853
854 context->term -= term.atom.parenthesesWidth;
855 return false;
856 }
857
858 JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context)
859 {
860 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
861
862 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
863 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
864
865 backTrack->matchAmount = 0;
866 backTrack->lastContext = 0;
867
868 switch (term.atom.quantityType) {
869 case QuantifierFixedCount: {
870 // While we haven't yet reached our fixed limit,
871 while (backTrack->matchAmount < term.atom.quantityCount) {
872 // Try to do a match, and it it succeeds, add it to the list.
873 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
874 JSRegExpResult result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
875 if (result == JSRegExpMatch)
876 appendParenthesesDisjunctionContext(backTrack, context);
877 else {
878 // The match failed; try to find an alternate point to carry on from.
879 resetMatches(term, context);
880 freeParenthesesDisjunctionContext(context);
881
882 if (result != JSRegExpNoMatch)
883 return result;
884 JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
885 if (backtrackResult != JSRegExpMatch)
886 return backtrackResult;
887 }
888 }
889
890 ASSERT(backTrack->matchAmount == term.atom.quantityCount);
891 ParenthesesDisjunctionContext* context = backTrack->lastContext;
892 recordParenthesesMatch(term, context);
893 return JSRegExpMatch;
894 }
895
896 case QuantifierGreedy: {
897 while (backTrack->matchAmount < term.atom.quantityCount) {
898 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
899 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
900 if (result == JSRegExpMatch)
901 appendParenthesesDisjunctionContext(backTrack, context);
902 else {
903 resetMatches(term, context);
904 freeParenthesesDisjunctionContext(context);
905
906 if (result != JSRegExpNoMatch)
907 return result;
908
909 break;
910 }
911 }
912
913 if (backTrack->matchAmount) {
914 ParenthesesDisjunctionContext* context = backTrack->lastContext;
915 recordParenthesesMatch(term, context);
916 }
917 return JSRegExpMatch;
918 }
919
920 case QuantifierNonGreedy:
921 return JSRegExpMatch;
922 }
923
924 RELEASE_ASSERT_NOT_REACHED();
925 return JSRegExpErrorNoMatch;
926 }
927
928 // Rules for backtracking differ depending on whether this is greedy or non-greedy.
929 //
930 // Greedy matches never should try just adding more - you should already have done
931 // the 'more' cases. Always backtrack, at least a leetle bit. However cases where
932 // you backtrack an item off the list needs checking, since we'll never have matched
933 // the one less case. Tracking forwards, still add as much as possible.
934 //
935 // Non-greedy, we've already done the one less case, so don't match on popping.
936 // We haven't done the one more case, so always try to add that.
937 //
938 JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
939 {
940 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
941
942 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
943 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
944
945 switch (term.atom.quantityType) {
946 case QuantifierFixedCount: {
947 ASSERT(backTrack->matchAmount == term.atom.quantityCount);
948
949 ParenthesesDisjunctionContext* context = 0;
950 JSRegExpResult result = parenthesesDoBacktrack(term, backTrack);
951
952 if (result != JSRegExpMatch)
953 return result;
954
955 // While we haven't yet reached our fixed limit,
956 while (backTrack->matchAmount < term.atom.quantityCount) {
957 // Try to do a match, and it it succeeds, add it to the list.
958 context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
959 result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
960
961 if (result == JSRegExpMatch)
962 appendParenthesesDisjunctionContext(backTrack, context);
963 else {
964 // The match failed; try to find an alternate point to carry on from.
965 resetMatches(term, context);
966 freeParenthesesDisjunctionContext(context);
967
968 if (result != JSRegExpNoMatch)
969 return result;
970 JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
971 if (backtrackResult != JSRegExpMatch)
972 return backtrackResult;
973 }
974 }
975
976 ASSERT(backTrack->matchAmount == term.atom.quantityCount);
977 context = backTrack->lastContext;
978 recordParenthesesMatch(term, context);
979 return JSRegExpMatch;
980 }
981
982 case QuantifierGreedy: {
983 if (!backTrack->matchAmount)
984 return JSRegExpNoMatch;
985
986 ParenthesesDisjunctionContext* context = backTrack->lastContext;
987 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
988 if (result == JSRegExpMatch) {
989 while (backTrack->matchAmount < term.atom.quantityCount) {
990 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
991 JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
992 if (parenthesesResult == JSRegExpMatch)
993 appendParenthesesDisjunctionContext(backTrack, context);
994 else {
995 resetMatches(term, context);
996 freeParenthesesDisjunctionContext(context);
997
998 if (parenthesesResult != JSRegExpNoMatch)
999 return parenthesesResult;
1000
1001 break;
1002 }
1003 }
1004 } else {
1005 resetMatches(term, context);
1006 popParenthesesDisjunctionContext(backTrack);
1007 freeParenthesesDisjunctionContext(context);
1008
1009 if (result != JSRegExpNoMatch)
1010 return result;
1011 }
1012
1013 if (backTrack->matchAmount) {
1014 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1015 recordParenthesesMatch(term, context);
1016 }
1017 return JSRegExpMatch;
1018 }
1019
1020 case QuantifierNonGreedy: {
1021 // If we've not reached the limit, try to add one more match.
1022 if (backTrack->matchAmount < term.atom.quantityCount) {
1023 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
1024 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
1025 if (result == JSRegExpMatch) {
1026 appendParenthesesDisjunctionContext(backTrack, context);
1027 recordParenthesesMatch(term, context);
1028 return JSRegExpMatch;
1029 }
1030
1031 resetMatches(term, context);
1032 freeParenthesesDisjunctionContext(context);
1033
1034 if (result != JSRegExpNoMatch)
1035 return result;
1036 }
1037
1038 // Nope - okay backtrack looking for an alternative.
1039 while (backTrack->matchAmount) {
1040 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1041 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
1042 if (result == JSRegExpMatch) {
1043 // successful backtrack! we're back in the game!
1044 if (backTrack->matchAmount) {
1045 context = backTrack->lastContext;
1046 recordParenthesesMatch(term, context);
1047 }
1048 return JSRegExpMatch;
1049 }
1050
1051 // pop a match off the stack
1052 resetMatches(term, context);
1053 popParenthesesDisjunctionContext(backTrack);
1054 freeParenthesesDisjunctionContext(context);
1055
1056 if (result != JSRegExpNoMatch)
1057 return result;
1058 }
1059
1060 return JSRegExpNoMatch;
1061 }
1062 }
1063
1064 RELEASE_ASSERT_NOT_REACHED();
1065 return JSRegExpErrorNoMatch;
1066 }
1067
1068 bool matchDotStarEnclosure(ByteTerm& term, DisjunctionContext* context)
1069 {
1070 UNUSED_PARAM(term);
1071 unsigned matchBegin = context->matchBegin;
1072
1073 if (matchBegin) {
1074 for (matchBegin--; true; matchBegin--) {
1075 if (testCharacterClass(pattern->newlineCharacterClass, input.reread(matchBegin))) {
1076 ++matchBegin;
1077 break;
1078 }
1079
1080 if (!matchBegin)
1081 break;
1082 }
1083 }
1084
1085 unsigned matchEnd = input.getPos();
1086
1087 for (; (matchEnd != input.end())
1088 && (!testCharacterClass(pattern->newlineCharacterClass, input.reread(matchEnd))); matchEnd++) { }
1089
1090 if (((matchBegin && term.anchors.m_bol)
1091 || ((matchEnd != input.end()) && term.anchors.m_eol))
1092 && !pattern->m_multiline)
1093 return false;
1094
1095 context->matchBegin = matchBegin;
1096 context->matchEnd = matchEnd;
1097 return true;
1098 }
1099
1100 #define MATCH_NEXT() { ++context->term; goto matchAgain; }
1101 #define BACKTRACK() { --context->term; goto backtrack; }
1102 #define currentTerm() (disjunction->terms[context->term])
1103 JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
1104 {
1105 if (!--remainingMatchCount)
1106 return JSRegExpErrorHitLimit;
1107
1108 if (btrack)
1109 BACKTRACK();
1110
1111 context->matchBegin = input.getPos();
1112 context->term = 0;
1113
1114 matchAgain:
1115 ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
1116
1117 switch (currentTerm().type) {
1118 case ByteTerm::TypeSubpatternBegin:
1119 MATCH_NEXT();
1120 case ByteTerm::TypeSubpatternEnd:
1121 context->matchEnd = input.getPos();
1122 return JSRegExpMatch;
1123
1124 case ByteTerm::TypeBodyAlternativeBegin:
1125 MATCH_NEXT();
1126 case ByteTerm::TypeBodyAlternativeDisjunction:
1127 case ByteTerm::TypeBodyAlternativeEnd:
1128 context->matchEnd = input.getPos();
1129 return JSRegExpMatch;
1130
1131 case ByteTerm::TypeAlternativeBegin:
1132 MATCH_NEXT();
1133 case ByteTerm::TypeAlternativeDisjunction:
1134 case ByteTerm::TypeAlternativeEnd: {
1135 int offset = currentTerm().alternative.end;
1136 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
1137 backTrack->offset = offset;
1138 context->term += offset;
1139 MATCH_NEXT();
1140 }
1141
1142 case ByteTerm::TypeAssertionBOL:
1143 if (matchAssertionBOL(currentTerm()))
1144 MATCH_NEXT();
1145 BACKTRACK();
1146 case ByteTerm::TypeAssertionEOL:
1147 if (matchAssertionEOL(currentTerm()))
1148 MATCH_NEXT();
1149 BACKTRACK();
1150 case ByteTerm::TypeAssertionWordBoundary:
1151 if (matchAssertionWordBoundary(currentTerm()))
1152 MATCH_NEXT();
1153 BACKTRACK();
1154
1155 case ByteTerm::TypePatternCharacterOnce:
1156 case ByteTerm::TypePatternCharacterFixed: {
1157 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
1158 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - matchAmount))
1159 BACKTRACK();
1160 }
1161 MATCH_NEXT();
1162 }
1163 case ByteTerm::TypePatternCharacterGreedy: {
1164 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1165 unsigned matchAmount = 0;
1166 while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
1167 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + 1)) {
1168 input.uncheckInput(1);
1169 break;
1170 }
1171 ++matchAmount;
1172 }
1173 backTrack->matchAmount = matchAmount;
1174
1175 MATCH_NEXT();
1176 }
1177 case ByteTerm::TypePatternCharacterNonGreedy: {
1178 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1179 backTrack->matchAmount = 0;
1180 MATCH_NEXT();
1181 }
1182
1183 case ByteTerm::TypePatternCasedCharacterOnce:
1184 case ByteTerm::TypePatternCasedCharacterFixed: {
1185 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
1186 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount))
1187 BACKTRACK();
1188 }
1189 MATCH_NEXT();
1190 }
1191 case ByteTerm::TypePatternCasedCharacterGreedy: {
1192 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1193 unsigned matchAmount = 0;
1194 while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
1195 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + 1)) {
1196 input.uncheckInput(1);
1197 break;
1198 }
1199 ++matchAmount;
1200 }
1201 backTrack->matchAmount = matchAmount;
1202
1203 MATCH_NEXT();
1204 }
1205 case ByteTerm::TypePatternCasedCharacterNonGreedy: {
1206 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1207 backTrack->matchAmount = 0;
1208 MATCH_NEXT();
1209 }
1210
1211 case ByteTerm::TypeCharacterClass:
1212 if (matchCharacterClass(currentTerm(), context))
1213 MATCH_NEXT();
1214 BACKTRACK();
1215 case ByteTerm::TypeBackReference:
1216 if (matchBackReference(currentTerm(), context))
1217 MATCH_NEXT();
1218 BACKTRACK();
1219 case ByteTerm::TypeParenthesesSubpattern: {
1220 JSRegExpResult result = matchParentheses(currentTerm(), context);
1221
1222 if (result == JSRegExpMatch) {
1223 MATCH_NEXT();
1224 } else if (result != JSRegExpNoMatch)
1225 return result;
1226
1227 BACKTRACK();
1228 }
1229 case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1230 if (matchParenthesesOnceBegin(currentTerm(), context))
1231 MATCH_NEXT();
1232 BACKTRACK();
1233 case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1234 if (matchParenthesesOnceEnd(currentTerm(), context))
1235 MATCH_NEXT();
1236 BACKTRACK();
1237 case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
1238 if (matchParenthesesTerminalBegin(currentTerm(), context))
1239 MATCH_NEXT();
1240 BACKTRACK();
1241 case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
1242 if (matchParenthesesTerminalEnd(currentTerm(), context))
1243 MATCH_NEXT();
1244 BACKTRACK();
1245 case ByteTerm::TypeParentheticalAssertionBegin:
1246 if (matchParentheticalAssertionBegin(currentTerm(), context))
1247 MATCH_NEXT();
1248 BACKTRACK();
1249 case ByteTerm::TypeParentheticalAssertionEnd:
1250 if (matchParentheticalAssertionEnd(currentTerm(), context))
1251 MATCH_NEXT();
1252 BACKTRACK();
1253
1254 case ByteTerm::TypeCheckInput:
1255 if (input.checkInput(currentTerm().checkInputCount))
1256 MATCH_NEXT();
1257 BACKTRACK();
1258
1259 case ByteTerm::TypeUncheckInput:
1260 input.uncheckInput(currentTerm().checkInputCount);
1261 MATCH_NEXT();
1262
1263 case ByteTerm::TypeDotStarEnclosure:
1264 if (matchDotStarEnclosure(currentTerm(), context))
1265 return JSRegExpMatch;
1266 BACKTRACK();
1267 }
1268
1269 // We should never fall-through to here.
1270 RELEASE_ASSERT_NOT_REACHED();
1271
1272 backtrack:
1273 ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
1274
1275 switch (currentTerm().type) {
1276 case ByteTerm::TypeSubpatternBegin:
1277 return JSRegExpNoMatch;
1278 case ByteTerm::TypeSubpatternEnd:
1279 RELEASE_ASSERT_NOT_REACHED();
1280
1281 case ByteTerm::TypeBodyAlternativeBegin:
1282 case ByteTerm::TypeBodyAlternativeDisjunction: {
1283 int offset = currentTerm().alternative.next;
1284 context->term += offset;
1285 if (offset > 0)
1286 MATCH_NEXT();
1287
1288 if (input.atEnd())
1289 return JSRegExpNoMatch;
1290
1291 input.next();
1292
1293 context->matchBegin = input.getPos();
1294
1295 if (currentTerm().alternative.onceThrough)
1296 context->term += currentTerm().alternative.next;
1297
1298 MATCH_NEXT();
1299 }
1300 case ByteTerm::TypeBodyAlternativeEnd:
1301 RELEASE_ASSERT_NOT_REACHED();
1302
1303 case ByteTerm::TypeAlternativeBegin:
1304 case ByteTerm::TypeAlternativeDisjunction: {
1305 int offset = currentTerm().alternative.next;
1306 context->term += offset;
1307 if (offset > 0)
1308 MATCH_NEXT();
1309 BACKTRACK();
1310 }
1311 case ByteTerm::TypeAlternativeEnd: {
1312 // We should never backtrack back into an alternative of the main body of the regex.
1313 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
1314 unsigned offset = backTrack->offset;
1315 context->term -= offset;
1316 BACKTRACK();
1317 }
1318
1319 case ByteTerm::TypeAssertionBOL:
1320 case ByteTerm::TypeAssertionEOL:
1321 case ByteTerm::TypeAssertionWordBoundary:
1322 BACKTRACK();
1323
1324 case ByteTerm::TypePatternCharacterOnce:
1325 case ByteTerm::TypePatternCharacterFixed:
1326 case ByteTerm::TypePatternCharacterGreedy:
1327 case ByteTerm::TypePatternCharacterNonGreedy:
1328 if (backtrackPatternCharacter(currentTerm(), context))
1329 MATCH_NEXT();
1330 BACKTRACK();
1331 case ByteTerm::TypePatternCasedCharacterOnce:
1332 case ByteTerm::TypePatternCasedCharacterFixed:
1333 case ByteTerm::TypePatternCasedCharacterGreedy:
1334 case ByteTerm::TypePatternCasedCharacterNonGreedy:
1335 if (backtrackPatternCasedCharacter(currentTerm(), context))
1336 MATCH_NEXT();
1337 BACKTRACK();
1338 case ByteTerm::TypeCharacterClass:
1339 if (backtrackCharacterClass(currentTerm(), context))
1340 MATCH_NEXT();
1341 BACKTRACK();
1342 case ByteTerm::TypeBackReference:
1343 if (backtrackBackReference(currentTerm(), context))
1344 MATCH_NEXT();
1345 BACKTRACK();
1346 case ByteTerm::TypeParenthesesSubpattern: {
1347 JSRegExpResult result = backtrackParentheses(currentTerm(), context);
1348
1349 if (result == JSRegExpMatch) {
1350 MATCH_NEXT();
1351 } else if (result != JSRegExpNoMatch)
1352 return result;
1353
1354 BACKTRACK();
1355 }
1356 case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1357 if (backtrackParenthesesOnceBegin(currentTerm(), context))
1358 MATCH_NEXT();
1359 BACKTRACK();
1360 case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1361 if (backtrackParenthesesOnceEnd(currentTerm(), context))
1362 MATCH_NEXT();
1363 BACKTRACK();
1364 case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
1365 if (backtrackParenthesesTerminalBegin(currentTerm(), context))
1366 MATCH_NEXT();
1367 BACKTRACK();
1368 case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
1369 if (backtrackParenthesesTerminalEnd(currentTerm(), context))
1370 MATCH_NEXT();
1371 BACKTRACK();
1372 case ByteTerm::TypeParentheticalAssertionBegin:
1373 if (backtrackParentheticalAssertionBegin(currentTerm(), context))
1374 MATCH_NEXT();
1375 BACKTRACK();
1376 case ByteTerm::TypeParentheticalAssertionEnd:
1377 if (backtrackParentheticalAssertionEnd(currentTerm(), context))
1378 MATCH_NEXT();
1379 BACKTRACK();
1380
1381 case ByteTerm::TypeCheckInput:
1382 input.uncheckInput(currentTerm().checkInputCount);
1383 BACKTRACK();
1384
1385 case ByteTerm::TypeUncheckInput:
1386 input.checkInput(currentTerm().checkInputCount);
1387 BACKTRACK();
1388
1389 case ByteTerm::TypeDotStarEnclosure:
1390 RELEASE_ASSERT_NOT_REACHED();
1391 }
1392
1393 RELEASE_ASSERT_NOT_REACHED();
1394 return JSRegExpErrorNoMatch;
1395 }
1396
1397 JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
1398 {
1399 JSRegExpResult result = matchDisjunction(disjunction, context, btrack);
1400
1401 if (result == JSRegExpMatch) {
1402 while (context->matchBegin == context->matchEnd) {
1403 result = matchDisjunction(disjunction, context, true);
1404 if (result != JSRegExpMatch)
1405 return result;
1406 }
1407 return JSRegExpMatch;
1408 }
1409
1410 return result;
1411 }
1412
1413 unsigned interpret()
1414 {
1415 if (!input.isAvailableInput(0))
1416 return offsetNoMatch;
1417
1418 for (unsigned i = 0; i < pattern->m_body->m_numSubpatterns + 1; ++i)
1419 output[i << 1] = offsetNoMatch;
1420
1421 allocatorPool = pattern->m_allocator->startAllocator();
1422 RELEASE_ASSERT(allocatorPool);
1423
1424 DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
1425
1426 JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false);
1427 if (result == JSRegExpMatch) {
1428 output[0] = context->matchBegin;
1429 output[1] = context->matchEnd;
1430 }
1431
1432 freeDisjunctionContext(context);
1433
1434 pattern->m_allocator->stopAllocator();
1435
1436 ASSERT((result == JSRegExpMatch) == (output[0] != offsetNoMatch));
1437 return output[0];
1438 }
1439
1440 Interpreter(BytecodePattern* pattern, unsigned* output, const CharType* input, unsigned length, unsigned start)
1441 : pattern(pattern)
1442 , output(output)
1443 , input(input, start, length)
1444 , allocatorPool(0)
1445 , remainingMatchCount(matchLimit)
1446 {
1447 }
1448
1449 private:
1450 BytecodePattern* pattern;
1451 unsigned* output;
1452 InputStream input;
1453 BumpPointerPool* allocatorPool;
1454 unsigned remainingMatchCount;
1455 };
1456
1457 class ByteCompiler {
1458 struct ParenthesesStackEntry {
1459 unsigned beginTerm;
1460 unsigned savedAlternativeIndex;
1461 ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/)
1462 : beginTerm(beginTerm)
1463 , savedAlternativeIndex(savedAlternativeIndex)
1464 {
1465 }
1466 };
1467
1468 public:
1469 ByteCompiler(YarrPattern& pattern)
1470 : m_pattern(pattern)
1471 {
1472 m_currentAlternativeIndex = 0;
1473 }
1474
1475 std::unique_ptr<BytecodePattern> compile(BumpPointerAllocator* allocator)
1476 {
1477 regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough());
1478 emitDisjunction(m_pattern.m_body);
1479 regexEnd();
1480
1481 return std::make_unique<BytecodePattern>(WTF::move(m_bodyDisjunction), m_allParenthesesInfo, m_pattern, allocator);
1482 }
1483
1484 void checkInput(unsigned count)
1485 {
1486 m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count));
1487 }
1488
1489 void uncheckInput(unsigned count)
1490 {
1491 m_bodyDisjunction->terms.append(ByteTerm::UncheckInput(count));
1492 }
1493
1494 void assertionBOL(unsigned inputPosition)
1495 {
1496 m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition));
1497 }
1498
1499 void assertionEOL(unsigned inputPosition)
1500 {
1501 m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition));
1502 }
1503
1504 void assertionWordBoundary(bool invert, unsigned inputPosition)
1505 {
1506 m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition));
1507 }
1508
1509 void atomPatternCharacter(UChar ch, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
1510 {
1511 if (m_pattern.m_ignoreCase) {
1512 ASSERT(u_tolower(ch) <= 0xFFFF);
1513 ASSERT(u_toupper(ch) <= 0xFFFF);
1514
1515 UChar lo = u_tolower(ch);
1516 UChar hi = u_toupper(ch);
1517
1518 if (lo != hi) {
1519 m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType));
1520 return;
1521 }
1522 }
1523
1524 m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType));
1525 }
1526
1527 void atomCharacterClass(CharacterClass* characterClass, bool invert, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
1528 {
1529 m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition));
1530
1531 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount.unsafeGet();
1532 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1533 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1534 }
1535
1536 void atomBackReference(unsigned subpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
1537 {
1538 ASSERT(subpatternId);
1539
1540 m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition));
1541
1542 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount.unsafeGet();
1543 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1544 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1545 }
1546
1547 void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1548 {
1549 int beginTerm = m_bodyDisjunction->terms.size();
1550
1551 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
1552 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1553 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1554 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1555
1556 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1557 m_currentAlternativeIndex = beginTerm + 1;
1558 }
1559
1560 void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1561 {
1562 int beginTerm = m_bodyDisjunction->terms.size();
1563
1564 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalBegin, subpatternId, capture, false, inputPosition));
1565 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1566 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1567 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1568
1569 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1570 m_currentAlternativeIndex = beginTerm + 1;
1571 }
1572
1573 void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1574 {
1575 // Errrk! - this is a little crazy, we initially generate as a TypeParenthesesSubpatternOnceBegin,
1576 // then fix this up at the end! - simplifying this should make it much clearer.
1577 // https://bugs.webkit.org/show_bug.cgi?id=50136
1578
1579 int beginTerm = m_bodyDisjunction->terms.size();
1580
1581 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
1582 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1583 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1584 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1585
1586 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1587 m_currentAlternativeIndex = beginTerm + 1;
1588 }
1589
1590 void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation)
1591 {
1592 int beginTerm = m_bodyDisjunction->terms.size();
1593
1594 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, false, invert, 0));
1595 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1596 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1597 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1598
1599 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1600 m_currentAlternativeIndex = beginTerm + 1;
1601 }
1602
1603 void atomParentheticalAssertionEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
1604 {
1605 unsigned beginTerm = popParenthesesStack();
1606 closeAlternative(beginTerm + 1);
1607 unsigned endTerm = m_bodyDisjunction->terms.size();
1608
1609 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin);
1610
1611 bool invert = m_bodyDisjunction->terms[beginTerm].invert();
1612 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1613
1614 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionEnd, subpatternId, false, invert, inputPosition));
1615 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1616 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1617 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1618
1619 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet();
1620 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1621 m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet();
1622 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1623 }
1624
1625 void assertionDotStarEnclosure(bool bolAnchored, bool eolAnchored)
1626 {
1627 m_bodyDisjunction->terms.append(ByteTerm::DotStarEnclosure(bolAnchored, eolAnchored));
1628 }
1629
1630 unsigned popParenthesesStack()
1631 {
1632 ASSERT(m_parenthesesStack.size());
1633 int stackEnd = m_parenthesesStack.size() - 1;
1634 unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm;
1635 m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex;
1636 m_parenthesesStack.shrink(stackEnd);
1637
1638 ASSERT(beginTerm < m_bodyDisjunction->terms.size());
1639 ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size());
1640
1641 return beginTerm;
1642 }
1643
1644 #ifndef NDEBUG
1645 void dumpDisjunction(ByteDisjunction* disjunction)
1646 {
1647 dataLogF("ByteDisjunction(%p):\n\t", disjunction);
1648 for (unsigned i = 0; i < disjunction->terms.size(); ++i)
1649 dataLogF("{ %d } ", disjunction->terms[i].type);
1650 dataLogF("\n");
1651 }
1652 #endif
1653
1654 void closeAlternative(int beginTerm)
1655 {
1656 int origBeginTerm = beginTerm;
1657 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin);
1658 int endIndex = m_bodyDisjunction->terms.size();
1659
1660 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1661
1662 if (!m_bodyDisjunction->terms[beginTerm].alternative.next)
1663 m_bodyDisjunction->terms.remove(beginTerm);
1664 else {
1665 while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1666 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1667 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction);
1668 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1669 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1670 }
1671
1672 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1673
1674 m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd());
1675 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1676 }
1677 }
1678
1679 void closeBodyAlternative()
1680 {
1681 int beginTerm = 0;
1682 int origBeginTerm = 0;
1683 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin);
1684 int endIndex = m_bodyDisjunction->terms.size();
1685
1686 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1687
1688 while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1689 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1690 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction);
1691 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1692 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1693 }
1694
1695 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1696
1697 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd());
1698 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1699 }
1700
1701 void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0)
1702 {
1703 unsigned beginTerm = popParenthesesStack();
1704 closeAlternative(beginTerm + 1);
1705 unsigned endTerm = m_bodyDisjunction->terms.size();
1706
1707 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
1708
1709 ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm];
1710
1711 bool capture = parenthesesBegin.capture();
1712 unsigned subpatternId = parenthesesBegin.atom.subpatternId;
1713
1714 unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
1715 auto parenthesesDisjunction = std::make_unique<ByteDisjunction>(numSubpatterns, callFrameSize);
1716
1717 unsigned firstTermInParentheses = beginTerm + 1;
1718 parenthesesDisjunction->terms.reserveInitialCapacity(endTerm - firstTermInParentheses + 2);
1719
1720 parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin());
1721 for (unsigned termInParentheses = firstTermInParentheses; termInParentheses < endTerm; ++termInParentheses)
1722 parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]);
1723 parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd());
1724
1725 m_bodyDisjunction->terms.shrink(beginTerm);
1726
1727 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, inputPosition));
1728 m_allParenthesesInfo.append(WTF::move(parenthesesDisjunction));
1729
1730 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet();
1731 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1732 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1733 }
1734
1735 void atomParenthesesOnceEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
1736 {
1737 unsigned beginTerm = popParenthesesStack();
1738 closeAlternative(beginTerm + 1);
1739 unsigned endTerm = m_bodyDisjunction->terms.size();
1740
1741 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
1742
1743 bool capture = m_bodyDisjunction->terms[beginTerm].capture();
1744 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1745
1746 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, capture, false, inputPosition));
1747 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1748 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1749 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1750
1751 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet();
1752 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1753 m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet();
1754 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1755 }
1756
1757 void atomParenthesesTerminalEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
1758 {
1759 unsigned beginTerm = popParenthesesStack();
1760 closeAlternative(beginTerm + 1);
1761 unsigned endTerm = m_bodyDisjunction->terms.size();
1762
1763 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
1764
1765 bool capture = m_bodyDisjunction->terms[beginTerm].capture();
1766 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1767
1768 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalEnd, subpatternId, capture, false, inputPosition));
1769 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1770 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1771 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1772
1773 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet();
1774 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1775 m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet();
1776 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1777 }
1778
1779 void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough)
1780 {
1781 m_bodyDisjunction = std::make_unique<ByteDisjunction>(numSubpatterns, callFrameSize);
1782 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough));
1783 m_bodyDisjunction->terms[0].frameLocation = 0;
1784 m_currentAlternativeIndex = 0;
1785 }
1786
1787 void regexEnd()
1788 {
1789 closeBodyAlternative();
1790 }
1791
1792 void alternativeBodyDisjunction(bool onceThrough)
1793 {
1794 int newAlternativeIndex = m_bodyDisjunction->terms.size();
1795 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
1796 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough));
1797
1798 m_currentAlternativeIndex = newAlternativeIndex;
1799 }
1800
1801 void alternativeDisjunction()
1802 {
1803 int newAlternativeIndex = m_bodyDisjunction->terms.size();
1804 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
1805 m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction());
1806
1807 m_currentAlternativeIndex = newAlternativeIndex;
1808 }
1809
1810 void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0)
1811 {
1812 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
1813 unsigned currentCountAlreadyChecked = inputCountAlreadyChecked;
1814
1815 PatternAlternative* alternative = disjunction->m_alternatives[alt].get();
1816
1817 if (alt) {
1818 if (disjunction == m_pattern.m_body)
1819 alternativeBodyDisjunction(alternative->onceThrough());
1820 else
1821 alternativeDisjunction();
1822 }
1823
1824 unsigned minimumSize = alternative->m_minimumSize;
1825 ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked);
1826 unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked;
1827
1828 if (countToCheck) {
1829 checkInput(countToCheck);
1830 currentCountAlreadyChecked += countToCheck;
1831 }
1832
1833 for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
1834 PatternTerm& term = alternative->m_terms[i];
1835
1836 switch (term.type) {
1837 case PatternTerm::TypeAssertionBOL:
1838 assertionBOL(currentCountAlreadyChecked - term.inputPosition);
1839 break;
1840
1841 case PatternTerm::TypeAssertionEOL:
1842 assertionEOL(currentCountAlreadyChecked - term.inputPosition);
1843 break;
1844
1845 case PatternTerm::TypeAssertionWordBoundary:
1846 assertionWordBoundary(term.invert(), currentCountAlreadyChecked - term.inputPosition);
1847 break;
1848
1849 case PatternTerm::TypePatternCharacter:
1850 atomPatternCharacter(term.patternCharacter, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType);
1851 break;
1852
1853 case PatternTerm::TypeCharacterClass:
1854 atomCharacterClass(term.characterClass, term.invert(), currentCountAlreadyChecked- term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType);
1855 break;
1856
1857 case PatternTerm::TypeBackReference:
1858 atomBackReference(term.backReferenceSubpatternId, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType);
1859 break;
1860
1861 case PatternTerm::TypeForwardReference:
1862 break;
1863
1864 case PatternTerm::TypeParenthesesSubpattern: {
1865 unsigned disjunctionAlreadyCheckedCount = 0;
1866 if (term.quantityCount == 1 && !term.parentheses.isCopy) {
1867 unsigned alternativeFrameLocation = term.frameLocation;
1868 // For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame.
1869 if (term.quantityType == QuantifierFixedCount)
1870 disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize;
1871 else
1872 alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
1873 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1874 atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, alternativeFrameLocation);
1875 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
1876 atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType);
1877 } else if (term.parentheses.isTerminal) {
1878 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1879 atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesOnce);
1880 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
1881 atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType);
1882 } else {
1883 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1884 atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, 0);
1885 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
1886 atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
1887 }
1888 break;
1889 }
1890
1891 case PatternTerm::TypeParentheticalAssertion: {
1892 unsigned alternativeFrameLocation = term.frameLocation + YarrStackSpaceForBackTrackInfoParentheticalAssertion;
1893
1894 ASSERT(currentCountAlreadyChecked >= static_cast<unsigned>(term.inputPosition));
1895 unsigned positiveInputOffset = currentCountAlreadyChecked - static_cast<unsigned>(term.inputPosition);
1896 unsigned uncheckAmount = 0;
1897 if (positiveInputOffset > term.parentheses.disjunction->m_minimumSize) {
1898 uncheckAmount = positiveInputOffset - term.parentheses.disjunction->m_minimumSize;
1899 uncheckInput(uncheckAmount);
1900 currentCountAlreadyChecked -= uncheckAmount;
1901 }
1902
1903 atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invert(), term.frameLocation, alternativeFrameLocation);
1904 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset - uncheckAmount);
1905 atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityCount, term.quantityType);
1906 if (uncheckAmount) {
1907 checkInput(uncheckAmount);
1908 currentCountAlreadyChecked += uncheckAmount;
1909 }
1910 break;
1911 }
1912
1913 case PatternTerm::TypeDotStarEnclosure:
1914 assertionDotStarEnclosure(term.anchors.bolAnchor, term.anchors.eolAnchor);
1915 break;
1916 }
1917 }
1918 }
1919 }
1920
1921 private:
1922 YarrPattern& m_pattern;
1923 std::unique_ptr<ByteDisjunction> m_bodyDisjunction;
1924 unsigned m_currentAlternativeIndex;
1925 Vector<ParenthesesStackEntry> m_parenthesesStack;
1926 Vector<std::unique_ptr<ByteDisjunction>> m_allParenthesesInfo;
1927 };
1928
1929 std::unique_ptr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator)
1930 {
1931 return ByteCompiler(pattern).compile(allocator);
1932 }
1933
1934 unsigned interpret(BytecodePattern* bytecode, const String& input, unsigned start, unsigned* output)
1935 {
1936 if (input.is8Bit())
1937 return Interpreter<LChar>(bytecode, output, input.characters8(), input.length(), start).interpret();
1938 return Interpreter<UChar>(bytecode, output, input.characters16(), input.length(), start).interpret();
1939 }
1940
1941 unsigned interpret(BytecodePattern* bytecode, const LChar* input, unsigned length, unsigned start, unsigned* output)
1942 {
1943 return Interpreter<LChar>(bytecode, output, input, length, start).interpret();
1944 }
1945
1946 unsigned interpret(BytecodePattern* bytecode, const UChar* input, unsigned length, unsigned start, unsigned* output)
1947 {
1948 return Interpreter<UChar>(bytecode, output, input, length, start).interpret();
1949 }
1950
1951 // These should be the same for both UChar & LChar.
1952 COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter);
1953 COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass);
1954 COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference);
1955 COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative);
1956 COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion);
1957 COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce);
1958 COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheses) == (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses);
1959
1960
1961 } }