]> git.saurik.com Git - apple/javascriptcore.git/blame_incremental - yarr/YarrInterpreter.cpp
JavaScriptCore-1097.3.3.tar.gz
[apple/javascriptcore.git] / yarr / YarrInterpreter.cpp
... / ...
CommitLineData
1/*
2 * Copyright (C) 2009 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "config.h"
28#include "YarrInterpreter.h"
29
30#include "UString.h"
31#include "Yarr.h"
32#include "YarrCanonicalizeUCS2.h"
33#include <wtf/BumpPointerAllocator.h>
34#include <wtf/DataLog.h>
35#include <wtf/text/CString.h>
36
37#ifndef NDEBUG
38#include <stdio.h>
39#endif
40
41using namespace WTF;
42
43namespace JSC { namespace Yarr {
44
45template<typename CharType>
46class Interpreter {
47public:
48 struct ParenthesesDisjunctionContext;
49
50 struct BackTrackInfoPatternCharacter {
51 uintptr_t matchAmount;
52 };
53 struct BackTrackInfoCharacterClass {
54 uintptr_t matchAmount;
55 };
56 struct BackTrackInfoBackReference {
57 uintptr_t begin; // Not really needed for greedy quantifiers.
58 uintptr_t matchAmount; // Not really needed for fixed quantifiers.
59 };
60 struct BackTrackInfoAlternative {
61 uintptr_t offset;
62 };
63 struct BackTrackInfoParentheticalAssertion {
64 uintptr_t begin;
65 };
66 struct BackTrackInfoParenthesesOnce {
67 uintptr_t begin;
68 };
69 struct BackTrackInfoParenthesesTerminal {
70 uintptr_t begin;
71 };
72 struct BackTrackInfoParentheses {
73 uintptr_t matchAmount;
74 ParenthesesDisjunctionContext* lastContext;
75 };
76
77 static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context)
78 {
79 context->next = backTrack->lastContext;
80 backTrack->lastContext = context;
81 ++backTrack->matchAmount;
82 }
83
84 static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack)
85 {
86 ASSERT(backTrack->matchAmount);
87 ASSERT(backTrack->lastContext);
88 backTrack->lastContext = backTrack->lastContext->next;
89 --backTrack->matchAmount;
90 }
91
92 struct DisjunctionContext
93 {
94 DisjunctionContext()
95 : term(0)
96 {
97 }
98
99 void* operator new(size_t, void* where)
100 {
101 return where;
102 }
103
104 int term;
105 unsigned matchBegin;
106 unsigned matchEnd;
107 uintptr_t frame[1];
108 };
109
110 DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction)
111 {
112 size_t size = sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t);
113 allocatorPool = allocatorPool->ensureCapacity(size);
114 if (!allocatorPool)
115 CRASH();
116 return new (allocatorPool->alloc(size)) DisjunctionContext();
117 }
118
119 void freeDisjunctionContext(DisjunctionContext* context)
120 {
121 allocatorPool = allocatorPool->dealloc(context);
122 }
123
124 struct ParenthesesDisjunctionContext
125 {
126 ParenthesesDisjunctionContext(unsigned* output, ByteTerm& term)
127 : next(0)
128 {
129 unsigned firstSubpatternId = term.atom.subpatternId;
130 unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns;
131
132 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) {
133 subpatternBackup[i] = output[(firstSubpatternId << 1) + i];
134 output[(firstSubpatternId << 1) + i] = offsetNoMatch;
135 }
136
137 new (getDisjunctionContext(term)) DisjunctionContext();
138 }
139
140 void* operator new(size_t, void* where)
141 {
142 return where;
143 }
144
145 void restoreOutput(unsigned* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns)
146 {
147 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i)
148 output[(firstSubpatternId << 1) + i] = subpatternBackup[i];
149 }
150
151 DisjunctionContext* getDisjunctionContext(ByteTerm& term)
152 {
153 return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1]));
154 }
155
156 ParenthesesDisjunctionContext* next;
157 unsigned subpatternBackup[1];
158 };
159
160 ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, unsigned* output, ByteTerm& term)
161 {
162 size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(unsigned) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(unsigned) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t);
163 allocatorPool = allocatorPool->ensureCapacity(size);
164 if (!allocatorPool)
165 CRASH();
166 return new (allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term);
167 }
168
169 void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context)
170 {
171 allocatorPool = allocatorPool->dealloc(context);
172 }
173
174 class InputStream {
175 public:
176 InputStream(const CharType* input, unsigned start, unsigned length)
177 : input(input)
178 , pos(start)
179 , length(length)
180 {
181 }
182
183 void next()
184 {
185 ++pos;
186 }
187
188 void rewind(unsigned amount)
189 {
190 ASSERT(pos >= amount);
191 pos -= amount;
192 }
193
194 int read()
195 {
196 ASSERT(pos < length);
197 if (pos < length)
198 return input[pos];
199 return -1;
200 }
201
202 int readPair()
203 {
204 ASSERT(pos + 1 < length);
205 return input[pos] | input[pos + 1] << 16;
206 }
207
208 int readChecked(unsigned negativePositionOffest)
209 {
210 if (pos < negativePositionOffest)
211 CRASH();
212 unsigned p = pos - negativePositionOffest;
213 ASSERT(p < length);
214 return input[p];
215 }
216
217 int reread(unsigned from)
218 {
219 ASSERT(from < length);
220 return input[from];
221 }
222
223 int prev()
224 {
225 ASSERT(!(pos > length));
226 if (pos && length)
227 return input[pos - 1];
228 return -1;
229 }
230
231 unsigned getPos()
232 {
233 return pos;
234 }
235
236 void setPos(unsigned p)
237 {
238 pos = p;
239 }
240
241 bool atStart()
242 {
243 return pos == 0;
244 }
245
246 bool atEnd()
247 {
248 return pos == length;
249 }
250
251 unsigned end()
252 {
253 return length;
254 }
255
256 bool checkInput(unsigned count)
257 {
258 if (((pos + count) <= length) && ((pos + count) >= pos)) {
259 pos += count;
260 return true;
261 }
262 return false;
263 }
264
265 void uncheckInput(unsigned count)
266 {
267 if (pos < count)
268 CRASH();
269 pos -= count;
270 }
271
272 bool atStart(unsigned negativePositionOffest)
273 {
274 return pos == negativePositionOffest;
275 }
276
277 bool atEnd(unsigned negativePositionOffest)
278 {
279 if (pos < negativePositionOffest)
280 CRASH();
281 return (pos - negativePositionOffest) == length;
282 }
283
284 bool isAvailableInput(unsigned offset)
285 {
286 return (((pos + offset) <= length) && ((pos + offset) >= pos));
287 }
288
289 private:
290 const CharType* input;
291 unsigned pos;
292 unsigned length;
293 };
294
295 bool testCharacterClass(CharacterClass* characterClass, int ch)
296 {
297 if (ch & 0xFF80) {
298 for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i)
299 if (ch == characterClass->m_matchesUnicode[i])
300 return true;
301 for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i)
302 if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end))
303 return true;
304 } else {
305 for (unsigned i = 0; i < characterClass->m_matches.size(); ++i)
306 if (ch == characterClass->m_matches[i])
307 return true;
308 for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i)
309 if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end))
310 return true;
311 }
312
313 return false;
314 }
315
316 bool checkCharacter(int testChar, unsigned negativeInputOffset)
317 {
318 return testChar == input.readChecked(negativeInputOffset);
319 }
320
321 bool checkCasedCharacter(int loChar, int hiChar, unsigned negativeInputOffset)
322 {
323 int ch = input.readChecked(negativeInputOffset);
324 return (loChar == ch) || (hiChar == ch);
325 }
326
327 bool checkCharacterClass(CharacterClass* characterClass, bool invert, unsigned negativeInputOffset)
328 {
329 bool match = testCharacterClass(characterClass, input.readChecked(negativeInputOffset));
330 return invert ? !match : match;
331 }
332
333 bool tryConsumeBackReference(int matchBegin, int matchEnd, unsigned negativeInputOffset)
334 {
335 unsigned matchSize = (unsigned)(matchEnd - matchBegin);
336
337 if (!input.checkInput(matchSize))
338 return false;
339
340 if (pattern->m_ignoreCase) {
341 for (unsigned i = 0; i < matchSize; ++i) {
342 int oldCh = input.reread(matchBegin + i);
343 int ch = input.readChecked(negativeInputOffset + matchSize - i);
344
345 if (oldCh == ch)
346 continue;
347
348 // The definition for canonicalize (see ES 5.1, 15.10.2.8) means that
349 // unicode values are never allowed to match against ascii ones.
350 if (isASCII(oldCh) || isASCII(ch)) {
351 if (toASCIIUpper(oldCh) == toASCIIUpper(ch))
352 continue;
353 } else if (areCanonicallyEquivalent(oldCh, ch))
354 continue;
355
356 input.uncheckInput(matchSize);
357 return false;
358 }
359 } else {
360 for (unsigned i = 0; i < matchSize; ++i) {
361 if (!checkCharacter(input.reread(matchBegin + i), negativeInputOffset + matchSize - i)) {
362 input.uncheckInput(matchSize);
363 return false;
364 }
365 }
366 }
367
368 return true;
369 }
370
371 bool matchAssertionBOL(ByteTerm& term)
372 {
373 return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition + 1)));
374 }
375
376 bool matchAssertionEOL(ByteTerm& term)
377 {
378 if (term.inputPosition)
379 return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition)));
380
381 return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read()));
382 }
383
384 bool matchAssertionWordBoundary(ByteTerm& term)
385 {
386 bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition + 1));
387 bool readIsWordchar;
388 if (term.inputPosition)
389 readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition));
390 else
391 readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read());
392
393 bool wordBoundary = prevIsWordchar != readIsWordchar;
394 return term.invert() ? !wordBoundary : wordBoundary;
395 }
396
397 bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context)
398 {
399 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
400
401 switch (term.atom.quantityType) {
402 case QuantifierFixedCount:
403 break;
404
405 case QuantifierGreedy:
406 if (backTrack->matchAmount) {
407 --backTrack->matchAmount;
408 input.uncheckInput(1);
409 return true;
410 }
411 break;
412
413 case QuantifierNonGreedy:
414 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
415 ++backTrack->matchAmount;
416 if (checkCharacter(term.atom.patternCharacter, term.inputPosition + 1))
417 return true;
418 }
419 input.uncheckInput(backTrack->matchAmount);
420 break;
421 }
422
423 return false;
424 }
425
426 bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context)
427 {
428 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
429
430 switch (term.atom.quantityType) {
431 case QuantifierFixedCount:
432 break;
433
434 case QuantifierGreedy:
435 if (backTrack->matchAmount) {
436 --backTrack->matchAmount;
437 input.uncheckInput(1);
438 return true;
439 }
440 break;
441
442 case QuantifierNonGreedy:
443 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
444 ++backTrack->matchAmount;
445 if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition + 1))
446 return true;
447 }
448 input.uncheckInput(backTrack->matchAmount);
449 break;
450 }
451
452 return false;
453 }
454
455 bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context)
456 {
457 ASSERT(term.type == ByteTerm::TypeCharacterClass);
458 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
459
460 switch (term.atom.quantityType) {
461 case QuantifierFixedCount: {
462 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
463 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - matchAmount))
464 return false;
465 }
466 return true;
467 }
468
469 case QuantifierGreedy: {
470 unsigned matchAmount = 0;
471 while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
472 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) {
473 input.uncheckInput(1);
474 break;
475 }
476 ++matchAmount;
477 }
478 backTrack->matchAmount = matchAmount;
479
480 return true;
481 }
482
483 case QuantifierNonGreedy:
484 backTrack->matchAmount = 0;
485 return true;
486 }
487
488 ASSERT_NOT_REACHED();
489 return false;
490 }
491
492 bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context)
493 {
494 ASSERT(term.type == ByteTerm::TypeCharacterClass);
495 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
496
497 switch (term.atom.quantityType) {
498 case QuantifierFixedCount:
499 break;
500
501 case QuantifierGreedy:
502 if (backTrack->matchAmount) {
503 --backTrack->matchAmount;
504 input.uncheckInput(1);
505 return true;
506 }
507 break;
508
509 case QuantifierNonGreedy:
510 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
511 ++backTrack->matchAmount;
512 if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1))
513 return true;
514 }
515 input.uncheckInput(backTrack->matchAmount);
516 break;
517 }
518
519 return false;
520 }
521
522 bool matchBackReference(ByteTerm& term, DisjunctionContext* context)
523 {
524 ASSERT(term.type == ByteTerm::TypeBackReference);
525 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
526
527 unsigned matchBegin = output[(term.atom.subpatternId << 1)];
528 unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1];
529
530 // If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that.
531 // In this case the result of match is empty string like when it references to a parentheses with zero-width match.
532 // Eg.: /(a\1)/
533 if (matchEnd == offsetNoMatch)
534 return true;
535
536 if (matchBegin == offsetNoMatch)
537 return true;
538
539 ASSERT(matchBegin <= matchEnd);
540
541 if (matchBegin == matchEnd)
542 return true;
543
544 switch (term.atom.quantityType) {
545 case QuantifierFixedCount: {
546 backTrack->begin = input.getPos();
547 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
548 if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
549 input.setPos(backTrack->begin);
550 return false;
551 }
552 }
553 return true;
554 }
555
556 case QuantifierGreedy: {
557 unsigned matchAmount = 0;
558 while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition))
559 ++matchAmount;
560 backTrack->matchAmount = matchAmount;
561 return true;
562 }
563
564 case QuantifierNonGreedy:
565 backTrack->begin = input.getPos();
566 backTrack->matchAmount = 0;
567 return true;
568 }
569
570 ASSERT_NOT_REACHED();
571 return false;
572 }
573
574 bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context)
575 {
576 ASSERT(term.type == ByteTerm::TypeBackReference);
577 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
578
579 unsigned matchBegin = output[(term.atom.subpatternId << 1)];
580 unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1];
581
582 if (matchBegin == offsetNoMatch)
583 return false;
584
585 ASSERT(matchBegin <= matchEnd);
586
587 if (matchBegin == matchEnd)
588 return false;
589
590 switch (term.atom.quantityType) {
591 case QuantifierFixedCount:
592 // for quantityCount == 1, could rewind.
593 input.setPos(backTrack->begin);
594 break;
595
596 case QuantifierGreedy:
597 if (backTrack->matchAmount) {
598 --backTrack->matchAmount;
599 input.rewind(matchEnd - matchBegin);
600 return true;
601 }
602 break;
603
604 case QuantifierNonGreedy:
605 if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
606 ++backTrack->matchAmount;
607 return true;
608 }
609 input.setPos(backTrack->begin);
610 break;
611 }
612
613 return false;
614 }
615
616 void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context)
617 {
618 if (term.capture()) {
619 unsigned subpatternId = term.atom.subpatternId;
620 output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition;
621 output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition;
622 }
623 }
624 void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context)
625 {
626 unsigned firstSubpatternId = term.atom.subpatternId;
627 unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
628 context->restoreOutput(output, firstSubpatternId, count);
629 }
630 JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
631 {
632 while (backTrack->matchAmount) {
633 ParenthesesDisjunctionContext* context = backTrack->lastContext;
634
635 JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true);
636 if (result == JSRegExpMatch)
637 return JSRegExpMatch;
638
639 resetMatches(term, context);
640 popParenthesesDisjunctionContext(backTrack);
641 freeParenthesesDisjunctionContext(context);
642
643 if (result != JSRegExpNoMatch)
644 return result;
645 }
646
647 return JSRegExpNoMatch;
648 }
649
650 bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
651 {
652 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
653 ASSERT(term.atom.quantityCount == 1);
654
655 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
656
657 switch (term.atom.quantityType) {
658 case QuantifierGreedy: {
659 // set this speculatively; if we get to the parens end this will be true.
660 backTrack->begin = input.getPos();
661 break;
662 }
663 case QuantifierNonGreedy: {
664 backTrack->begin = notFound;
665 context->term += term.atom.parenthesesWidth;
666 return true;
667 }
668 case QuantifierFixedCount:
669 break;
670 }
671
672 if (term.capture()) {
673 unsigned subpatternId = term.atom.subpatternId;
674 output[(subpatternId << 1)] = input.getPos() - term.inputPosition;
675 }
676
677 return true;
678 }
679
680 bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
681 {
682 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
683 ASSERT(term.atom.quantityCount == 1);
684
685 if (term.capture()) {
686 unsigned subpatternId = term.atom.subpatternId;
687 output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition;
688 }
689
690 if (term.atom.quantityType == QuantifierFixedCount)
691 return true;
692
693 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
694 return backTrack->begin != input.getPos();
695 }
696
697 bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
698 {
699 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
700 ASSERT(term.atom.quantityCount == 1);
701
702 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
703
704 if (term.capture()) {
705 unsigned subpatternId = term.atom.subpatternId;
706 output[(subpatternId << 1)] = offsetNoMatch;
707 output[(subpatternId << 1) + 1] = offsetNoMatch;
708 }
709
710 switch (term.atom.quantityType) {
711 case QuantifierGreedy:
712 // if we backtrack to this point, there is another chance - try matching nothing.
713 ASSERT(backTrack->begin != notFound);
714 backTrack->begin = notFound;
715 context->term += term.atom.parenthesesWidth;
716 return true;
717 case QuantifierNonGreedy:
718 ASSERT(backTrack->begin != notFound);
719 case QuantifierFixedCount:
720 break;
721 }
722
723 return false;
724 }
725
726 bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
727 {
728 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
729 ASSERT(term.atom.quantityCount == 1);
730
731 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
732
733 switch (term.atom.quantityType) {
734 case QuantifierGreedy:
735 if (backTrack->begin == notFound) {
736 context->term -= term.atom.parenthesesWidth;
737 return false;
738 }
739 case QuantifierNonGreedy:
740 if (backTrack->begin == notFound) {
741 backTrack->begin = input.getPos();
742 if (term.capture()) {
743 // Technically this access to inputPosition should be accessing the begin term's
744 // inputPosition, but for repeats other than fixed these values should be
745 // the same anyway! (We don't pre-check for greedy or non-greedy matches.)
746 ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
747 ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition);
748 unsigned subpatternId = term.atom.subpatternId;
749 output[subpatternId << 1] = input.getPos() + term.inputPosition;
750 }
751 context->term -= term.atom.parenthesesWidth;
752 return true;
753 }
754 case QuantifierFixedCount:
755 break;
756 }
757
758 return false;
759 }
760
761 bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
762 {
763 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
764 ASSERT(term.atom.quantityType == QuantifierGreedy);
765 ASSERT(term.atom.quantityCount == quantifyInfinite);
766 ASSERT(!term.capture());
767
768 BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
769 backTrack->begin = input.getPos();
770 return true;
771 }
772
773 bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context)
774 {
775 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalEnd);
776
777 BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
778 // Empty match is a failed match.
779 if (backTrack->begin == input.getPos())
780 return false;
781
782 // Successful match! Okay, what's next? - loop around and try to match moar!
783 context->term -= (term.atom.parenthesesWidth + 1);
784 return true;
785 }
786
787 bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
788 {
789 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
790 ASSERT(term.atom.quantityType == QuantifierGreedy);
791 ASSERT(term.atom.quantityCount == quantifyInfinite);
792 ASSERT(!term.capture());
793
794 // If we backtrack to this point, we have failed to match this iteration of the parens.
795 // Since this is greedy / zero minimum a failed is also accepted as a match!
796 context->term += term.atom.parenthesesWidth;
797 return true;
798 }
799
800 bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*)
801 {
802 // 'Terminal' parentheses are at the end of the regex, and as such a match past end
803 // should always be returned as a successful match - we should never backtrack to here.
804 ASSERT_NOT_REACHED();
805 return false;
806 }
807
808 bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
809 {
810 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
811 ASSERT(term.atom.quantityCount == 1);
812
813 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
814
815 backTrack->begin = input.getPos();
816 return true;
817 }
818
819 bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
820 {
821 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
822 ASSERT(term.atom.quantityCount == 1);
823
824 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
825
826 input.setPos(backTrack->begin);
827
828 // We've reached the end of the parens; if they are inverted, this is failure.
829 if (term.invert()) {
830 context->term -= term.atom.parenthesesWidth;
831 return false;
832 }
833
834 return true;
835 }
836
837 bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
838 {
839 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
840 ASSERT(term.atom.quantityCount == 1);
841
842 // We've failed to match parens; if they are inverted, this is win!
843 if (term.invert()) {
844 context->term += term.atom.parenthesesWidth;
845 return true;
846 }
847
848 return false;
849 }
850
851 bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
852 {
853 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
854 ASSERT(term.atom.quantityCount == 1);
855
856 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
857
858 input.setPos(backTrack->begin);
859
860 context->term -= term.atom.parenthesesWidth;
861 return false;
862 }
863
864 JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context)
865 {
866 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
867
868 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
869 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
870
871 backTrack->matchAmount = 0;
872 backTrack->lastContext = 0;
873
874 switch (term.atom.quantityType) {
875 case QuantifierFixedCount: {
876 // While we haven't yet reached our fixed limit,
877 while (backTrack->matchAmount < term.atom.quantityCount) {
878 // Try to do a match, and it it succeeds, add it to the list.
879 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
880 JSRegExpResult result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
881 if (result == JSRegExpMatch)
882 appendParenthesesDisjunctionContext(backTrack, context);
883 else {
884 // The match failed; try to find an alternate point to carry on from.
885 resetMatches(term, context);
886 freeParenthesesDisjunctionContext(context);
887
888 if (result != JSRegExpNoMatch)
889 return result;
890 JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
891 if (backtrackResult != JSRegExpMatch)
892 return backtrackResult;
893 }
894 }
895
896 ASSERT(backTrack->matchAmount == term.atom.quantityCount);
897 ParenthesesDisjunctionContext* context = backTrack->lastContext;
898 recordParenthesesMatch(term, context);
899 return JSRegExpMatch;
900 }
901
902 case QuantifierGreedy: {
903 while (backTrack->matchAmount < term.atom.quantityCount) {
904 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
905 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
906 if (result == JSRegExpMatch)
907 appendParenthesesDisjunctionContext(backTrack, context);
908 else {
909 resetMatches(term, context);
910 freeParenthesesDisjunctionContext(context);
911
912 if (result != JSRegExpNoMatch)
913 return result;
914
915 break;
916 }
917 }
918
919 if (backTrack->matchAmount) {
920 ParenthesesDisjunctionContext* context = backTrack->lastContext;
921 recordParenthesesMatch(term, context);
922 }
923 return JSRegExpMatch;
924 }
925
926 case QuantifierNonGreedy:
927 return JSRegExpMatch;
928 }
929
930 ASSERT_NOT_REACHED();
931 return JSRegExpErrorNoMatch;
932 }
933
934 // Rules for backtracking differ depending on whether this is greedy or non-greedy.
935 //
936 // Greedy matches never should try just adding more - you should already have done
937 // the 'more' cases. Always backtrack, at least a leetle bit. However cases where
938 // you backtrack an item off the list needs checking, since we'll never have matched
939 // the one less case. Tracking forwards, still add as much as possible.
940 //
941 // Non-greedy, we've already done the one less case, so don't match on popping.
942 // We haven't done the one more case, so always try to add that.
943 //
944 JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
945 {
946 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
947
948 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
949 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
950
951 switch (term.atom.quantityType) {
952 case QuantifierFixedCount: {
953 ASSERT(backTrack->matchAmount == term.atom.quantityCount);
954
955 ParenthesesDisjunctionContext* context = 0;
956 JSRegExpResult result = parenthesesDoBacktrack(term, backTrack);
957
958 if (result != JSRegExpMatch)
959 return result;
960
961 // While we haven't yet reached our fixed limit,
962 while (backTrack->matchAmount < term.atom.quantityCount) {
963 // Try to do a match, and it it succeeds, add it to the list.
964 context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
965 result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
966
967 if (result == JSRegExpMatch)
968 appendParenthesesDisjunctionContext(backTrack, context);
969 else {
970 // The match failed; try to find an alternate point to carry on from.
971 resetMatches(term, context);
972 freeParenthesesDisjunctionContext(context);
973
974 if (result != JSRegExpNoMatch)
975 return result;
976 JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
977 if (backtrackResult != JSRegExpMatch)
978 return backtrackResult;
979 }
980 }
981
982 ASSERT(backTrack->matchAmount == term.atom.quantityCount);
983 context = backTrack->lastContext;
984 recordParenthesesMatch(term, context);
985 return JSRegExpMatch;
986 }
987
988 case QuantifierGreedy: {
989 if (!backTrack->matchAmount)
990 return JSRegExpNoMatch;
991
992 ParenthesesDisjunctionContext* context = backTrack->lastContext;
993 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
994 if (result == JSRegExpMatch) {
995 while (backTrack->matchAmount < term.atom.quantityCount) {
996 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
997 JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
998 if (parenthesesResult == JSRegExpMatch)
999 appendParenthesesDisjunctionContext(backTrack, context);
1000 else {
1001 resetMatches(term, context);
1002 freeParenthesesDisjunctionContext(context);
1003
1004 if (parenthesesResult != JSRegExpNoMatch)
1005 return parenthesesResult;
1006
1007 break;
1008 }
1009 }
1010 } else {
1011 resetMatches(term, context);
1012 popParenthesesDisjunctionContext(backTrack);
1013 freeParenthesesDisjunctionContext(context);
1014
1015 if (result != JSRegExpNoMatch)
1016 return result;
1017 }
1018
1019 if (backTrack->matchAmount) {
1020 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1021 recordParenthesesMatch(term, context);
1022 }
1023 return JSRegExpMatch;
1024 }
1025
1026 case QuantifierNonGreedy: {
1027 // If we've not reached the limit, try to add one more match.
1028 if (backTrack->matchAmount < term.atom.quantityCount) {
1029 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
1030 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
1031 if (result == JSRegExpMatch) {
1032 appendParenthesesDisjunctionContext(backTrack, context);
1033 recordParenthesesMatch(term, context);
1034 return JSRegExpMatch;
1035 }
1036
1037 resetMatches(term, context);
1038 freeParenthesesDisjunctionContext(context);
1039
1040 if (result != JSRegExpNoMatch)
1041 return result;
1042 }
1043
1044 // Nope - okay backtrack looking for an alternative.
1045 while (backTrack->matchAmount) {
1046 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1047 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
1048 if (result == JSRegExpMatch) {
1049 // successful backtrack! we're back in the game!
1050 if (backTrack->matchAmount) {
1051 context = backTrack->lastContext;
1052 recordParenthesesMatch(term, context);
1053 }
1054 return JSRegExpMatch;
1055 }
1056
1057 // pop a match off the stack
1058 resetMatches(term, context);
1059 popParenthesesDisjunctionContext(backTrack);
1060 freeParenthesesDisjunctionContext(context);
1061
1062 if (result != JSRegExpNoMatch)
1063 return result;
1064 }
1065
1066 return JSRegExpNoMatch;
1067 }
1068 }
1069
1070 ASSERT_NOT_REACHED();
1071 return JSRegExpErrorNoMatch;
1072 }
1073
1074 bool matchDotStarEnclosure(ByteTerm& term, DisjunctionContext* context)
1075 {
1076 UNUSED_PARAM(term);
1077 unsigned matchBegin = context->matchBegin;
1078
1079 if (matchBegin) {
1080 for (matchBegin--; true; matchBegin--) {
1081 if (testCharacterClass(pattern->newlineCharacterClass, input.reread(matchBegin))) {
1082 ++matchBegin;
1083 break;
1084 }
1085
1086 if (!matchBegin)
1087 break;
1088 }
1089 }
1090
1091 unsigned matchEnd = input.getPos();
1092
1093 for (; (matchEnd != input.end())
1094 && (!testCharacterClass(pattern->newlineCharacterClass, input.reread(matchEnd))); matchEnd++) { }
1095
1096 if (((matchBegin && term.anchors.m_bol)
1097 || ((matchEnd != input.end()) && term.anchors.m_eol))
1098 && !pattern->m_multiline)
1099 return false;
1100
1101 context->matchBegin = matchBegin;
1102 context->matchEnd = matchEnd;
1103 return true;
1104 }
1105
1106#define MATCH_NEXT() { ++context->term; goto matchAgain; }
1107#define BACKTRACK() { --context->term; goto backtrack; }
1108#define currentTerm() (disjunction->terms[context->term])
1109 JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
1110 {
1111 if (!--remainingMatchCount)
1112 return JSRegExpErrorHitLimit;
1113
1114 if (btrack)
1115 BACKTRACK();
1116
1117 context->matchBegin = input.getPos();
1118 context->term = 0;
1119
1120 matchAgain:
1121 ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
1122
1123 switch (currentTerm().type) {
1124 case ByteTerm::TypeSubpatternBegin:
1125 MATCH_NEXT();
1126 case ByteTerm::TypeSubpatternEnd:
1127 context->matchEnd = input.getPos();
1128 return JSRegExpMatch;
1129
1130 case ByteTerm::TypeBodyAlternativeBegin:
1131 MATCH_NEXT();
1132 case ByteTerm::TypeBodyAlternativeDisjunction:
1133 case ByteTerm::TypeBodyAlternativeEnd:
1134 context->matchEnd = input.getPos();
1135 return JSRegExpMatch;
1136
1137 case ByteTerm::TypeAlternativeBegin:
1138 MATCH_NEXT();
1139 case ByteTerm::TypeAlternativeDisjunction:
1140 case ByteTerm::TypeAlternativeEnd: {
1141 int offset = currentTerm().alternative.end;
1142 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
1143 backTrack->offset = offset;
1144 context->term += offset;
1145 MATCH_NEXT();
1146 }
1147
1148 case ByteTerm::TypeAssertionBOL:
1149 if (matchAssertionBOL(currentTerm()))
1150 MATCH_NEXT();
1151 BACKTRACK();
1152 case ByteTerm::TypeAssertionEOL:
1153 if (matchAssertionEOL(currentTerm()))
1154 MATCH_NEXT();
1155 BACKTRACK();
1156 case ByteTerm::TypeAssertionWordBoundary:
1157 if (matchAssertionWordBoundary(currentTerm()))
1158 MATCH_NEXT();
1159 BACKTRACK();
1160
1161 case ByteTerm::TypePatternCharacterOnce:
1162 case ByteTerm::TypePatternCharacterFixed: {
1163 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
1164 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - matchAmount))
1165 BACKTRACK();
1166 }
1167 MATCH_NEXT();
1168 }
1169 case ByteTerm::TypePatternCharacterGreedy: {
1170 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1171 unsigned matchAmount = 0;
1172 while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
1173 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + 1)) {
1174 input.uncheckInput(1);
1175 break;
1176 }
1177 ++matchAmount;
1178 }
1179 backTrack->matchAmount = matchAmount;
1180
1181 MATCH_NEXT();
1182 }
1183 case ByteTerm::TypePatternCharacterNonGreedy: {
1184 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1185 backTrack->matchAmount = 0;
1186 MATCH_NEXT();
1187 }
1188
1189 case ByteTerm::TypePatternCasedCharacterOnce:
1190 case ByteTerm::TypePatternCasedCharacterFixed: {
1191 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
1192 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount))
1193 BACKTRACK();
1194 }
1195 MATCH_NEXT();
1196 }
1197 case ByteTerm::TypePatternCasedCharacterGreedy: {
1198 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1199 unsigned matchAmount = 0;
1200 while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
1201 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + 1)) {
1202 input.uncheckInput(1);
1203 break;
1204 }
1205 ++matchAmount;
1206 }
1207 backTrack->matchAmount = matchAmount;
1208
1209 MATCH_NEXT();
1210 }
1211 case ByteTerm::TypePatternCasedCharacterNonGreedy: {
1212 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1213 backTrack->matchAmount = 0;
1214 MATCH_NEXT();
1215 }
1216
1217 case ByteTerm::TypeCharacterClass:
1218 if (matchCharacterClass(currentTerm(), context))
1219 MATCH_NEXT();
1220 BACKTRACK();
1221 case ByteTerm::TypeBackReference:
1222 if (matchBackReference(currentTerm(), context))
1223 MATCH_NEXT();
1224 BACKTRACK();
1225 case ByteTerm::TypeParenthesesSubpattern: {
1226 JSRegExpResult result = matchParentheses(currentTerm(), context);
1227
1228 if (result == JSRegExpMatch) {
1229 MATCH_NEXT();
1230 } else if (result != JSRegExpNoMatch)
1231 return result;
1232
1233 BACKTRACK();
1234 }
1235 case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1236 if (matchParenthesesOnceBegin(currentTerm(), context))
1237 MATCH_NEXT();
1238 BACKTRACK();
1239 case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1240 if (matchParenthesesOnceEnd(currentTerm(), context))
1241 MATCH_NEXT();
1242 BACKTRACK();
1243 case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
1244 if (matchParenthesesTerminalBegin(currentTerm(), context))
1245 MATCH_NEXT();
1246 BACKTRACK();
1247 case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
1248 if (matchParenthesesTerminalEnd(currentTerm(), context))
1249 MATCH_NEXT();
1250 BACKTRACK();
1251 case ByteTerm::TypeParentheticalAssertionBegin:
1252 if (matchParentheticalAssertionBegin(currentTerm(), context))
1253 MATCH_NEXT();
1254 BACKTRACK();
1255 case ByteTerm::TypeParentheticalAssertionEnd:
1256 if (matchParentheticalAssertionEnd(currentTerm(), context))
1257 MATCH_NEXT();
1258 BACKTRACK();
1259
1260 case ByteTerm::TypeCheckInput:
1261 if (input.checkInput(currentTerm().checkInputCount))
1262 MATCH_NEXT();
1263 BACKTRACK();
1264
1265 case ByteTerm::TypeUncheckInput:
1266 input.uncheckInput(currentTerm().checkInputCount);
1267 MATCH_NEXT();
1268
1269 case ByteTerm::TypeDotStarEnclosure:
1270 if (matchDotStarEnclosure(currentTerm(), context))
1271 return JSRegExpMatch;
1272 BACKTRACK();
1273 }
1274
1275 // We should never fall-through to here.
1276 ASSERT_NOT_REACHED();
1277
1278 backtrack:
1279 ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
1280
1281 switch (currentTerm().type) {
1282 case ByteTerm::TypeSubpatternBegin:
1283 return JSRegExpNoMatch;
1284 case ByteTerm::TypeSubpatternEnd:
1285 ASSERT_NOT_REACHED();
1286
1287 case ByteTerm::TypeBodyAlternativeBegin:
1288 case ByteTerm::TypeBodyAlternativeDisjunction: {
1289 int offset = currentTerm().alternative.next;
1290 context->term += offset;
1291 if (offset > 0)
1292 MATCH_NEXT();
1293
1294 if (input.atEnd())
1295 return JSRegExpNoMatch;
1296
1297 input.next();
1298
1299 context->matchBegin = input.getPos();
1300
1301 if (currentTerm().alternative.onceThrough)
1302 context->term += currentTerm().alternative.next;
1303
1304 MATCH_NEXT();
1305 }
1306 case ByteTerm::TypeBodyAlternativeEnd:
1307 ASSERT_NOT_REACHED();
1308
1309 case ByteTerm::TypeAlternativeBegin:
1310 case ByteTerm::TypeAlternativeDisjunction: {
1311 int offset = currentTerm().alternative.next;
1312 context->term += offset;
1313 if (offset > 0)
1314 MATCH_NEXT();
1315 BACKTRACK();
1316 }
1317 case ByteTerm::TypeAlternativeEnd: {
1318 // We should never backtrack back into an alternative of the main body of the regex.
1319 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
1320 unsigned offset = backTrack->offset;
1321 context->term -= offset;
1322 BACKTRACK();
1323 }
1324
1325 case ByteTerm::TypeAssertionBOL:
1326 case ByteTerm::TypeAssertionEOL:
1327 case ByteTerm::TypeAssertionWordBoundary:
1328 BACKTRACK();
1329
1330 case ByteTerm::TypePatternCharacterOnce:
1331 case ByteTerm::TypePatternCharacterFixed:
1332 case ByteTerm::TypePatternCharacterGreedy:
1333 case ByteTerm::TypePatternCharacterNonGreedy:
1334 if (backtrackPatternCharacter(currentTerm(), context))
1335 MATCH_NEXT();
1336 BACKTRACK();
1337 case ByteTerm::TypePatternCasedCharacterOnce:
1338 case ByteTerm::TypePatternCasedCharacterFixed:
1339 case ByteTerm::TypePatternCasedCharacterGreedy:
1340 case ByteTerm::TypePatternCasedCharacterNonGreedy:
1341 if (backtrackPatternCasedCharacter(currentTerm(), context))
1342 MATCH_NEXT();
1343 BACKTRACK();
1344 case ByteTerm::TypeCharacterClass:
1345 if (backtrackCharacterClass(currentTerm(), context))
1346 MATCH_NEXT();
1347 BACKTRACK();
1348 case ByteTerm::TypeBackReference:
1349 if (backtrackBackReference(currentTerm(), context))
1350 MATCH_NEXT();
1351 BACKTRACK();
1352 case ByteTerm::TypeParenthesesSubpattern: {
1353 JSRegExpResult result = backtrackParentheses(currentTerm(), context);
1354
1355 if (result == JSRegExpMatch) {
1356 MATCH_NEXT();
1357 } else if (result != JSRegExpNoMatch)
1358 return result;
1359
1360 BACKTRACK();
1361 }
1362 case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1363 if (backtrackParenthesesOnceBegin(currentTerm(), context))
1364 MATCH_NEXT();
1365 BACKTRACK();
1366 case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1367 if (backtrackParenthesesOnceEnd(currentTerm(), context))
1368 MATCH_NEXT();
1369 BACKTRACK();
1370 case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
1371 if (backtrackParenthesesTerminalBegin(currentTerm(), context))
1372 MATCH_NEXT();
1373 BACKTRACK();
1374 case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
1375 if (backtrackParenthesesTerminalEnd(currentTerm(), context))
1376 MATCH_NEXT();
1377 BACKTRACK();
1378 case ByteTerm::TypeParentheticalAssertionBegin:
1379 if (backtrackParentheticalAssertionBegin(currentTerm(), context))
1380 MATCH_NEXT();
1381 BACKTRACK();
1382 case ByteTerm::TypeParentheticalAssertionEnd:
1383 if (backtrackParentheticalAssertionEnd(currentTerm(), context))
1384 MATCH_NEXT();
1385 BACKTRACK();
1386
1387 case ByteTerm::TypeCheckInput:
1388 input.uncheckInput(currentTerm().checkInputCount);
1389 BACKTRACK();
1390
1391 case ByteTerm::TypeUncheckInput:
1392 input.checkInput(currentTerm().checkInputCount);
1393 BACKTRACK();
1394
1395 case ByteTerm::TypeDotStarEnclosure:
1396 ASSERT_NOT_REACHED();
1397 }
1398
1399 ASSERT_NOT_REACHED();
1400 return JSRegExpErrorNoMatch;
1401 }
1402
1403 JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
1404 {
1405 JSRegExpResult result = matchDisjunction(disjunction, context, btrack);
1406
1407 if (result == JSRegExpMatch) {
1408 while (context->matchBegin == context->matchEnd) {
1409 result = matchDisjunction(disjunction, context, true);
1410 if (result != JSRegExpMatch)
1411 return result;
1412 }
1413 return JSRegExpMatch;
1414 }
1415
1416 return result;
1417 }
1418
1419 unsigned interpret()
1420 {
1421 if (!input.isAvailableInput(0))
1422 return offsetNoMatch;
1423
1424 for (unsigned i = 0; i < pattern->m_body->m_numSubpatterns + 1; ++i)
1425 output[i << 1] = offsetNoMatch;
1426
1427 allocatorPool = pattern->m_allocator->startAllocator();
1428 if (!allocatorPool)
1429 CRASH();
1430
1431 DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
1432
1433 JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false);
1434 if (result == JSRegExpMatch) {
1435 output[0] = context->matchBegin;
1436 output[1] = context->matchEnd;
1437 }
1438
1439 freeDisjunctionContext(context);
1440
1441 pattern->m_allocator->stopAllocator();
1442
1443 ASSERT((result == JSRegExpMatch) == (output[0] != offsetNoMatch));
1444 return output[0];
1445 }
1446
1447 Interpreter(BytecodePattern* pattern, unsigned* output, const CharType* input, unsigned length, unsigned start)
1448 : pattern(pattern)
1449 , output(output)
1450 , input(input, start, length)
1451 , allocatorPool(0)
1452 , remainingMatchCount(matchLimit)
1453 {
1454 }
1455
1456private:
1457 BytecodePattern* pattern;
1458 unsigned* output;
1459 InputStream input;
1460 BumpPointerPool* allocatorPool;
1461 unsigned remainingMatchCount;
1462};
1463
1464
1465
1466class ByteCompiler {
1467 struct ParenthesesStackEntry {
1468 unsigned beginTerm;
1469 unsigned savedAlternativeIndex;
1470 ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/)
1471 : beginTerm(beginTerm)
1472 , savedAlternativeIndex(savedAlternativeIndex)
1473 {
1474 }
1475 };
1476
1477public:
1478 ByteCompiler(YarrPattern& pattern)
1479 : m_pattern(pattern)
1480 {
1481 m_currentAlternativeIndex = 0;
1482 }
1483
1484 PassOwnPtr<BytecodePattern> compile(BumpPointerAllocator* allocator)
1485 {
1486 regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough());
1487 emitDisjunction(m_pattern.m_body);
1488 regexEnd();
1489
1490 return adoptPtr(new BytecodePattern(m_bodyDisjunction.release(), m_allParenthesesInfo, m_pattern, allocator));
1491 }
1492
1493 void checkInput(unsigned count)
1494 {
1495 m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count));
1496 }
1497
1498 void uncheckInput(unsigned count)
1499 {
1500 m_bodyDisjunction->terms.append(ByteTerm::UncheckInput(count));
1501 }
1502
1503 void assertionBOL(unsigned inputPosition)
1504 {
1505 m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition));
1506 }
1507
1508 void assertionEOL(unsigned inputPosition)
1509 {
1510 m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition));
1511 }
1512
1513 void assertionWordBoundary(bool invert, unsigned inputPosition)
1514 {
1515 m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition));
1516 }
1517
1518 void atomPatternCharacter(UChar ch, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
1519 {
1520 if (m_pattern.m_ignoreCase) {
1521 UChar lo = Unicode::toLower(ch);
1522 UChar hi = Unicode::toUpper(ch);
1523
1524 if (lo != hi) {
1525 m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType));
1526 return;
1527 }
1528 }
1529
1530 m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType));
1531 }
1532
1533 void atomCharacterClass(CharacterClass* characterClass, bool invert, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
1534 {
1535 m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition));
1536
1537 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount.unsafeGet();
1538 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1539 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1540 }
1541
1542 void atomBackReference(unsigned subpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
1543 {
1544 ASSERT(subpatternId);
1545
1546 m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition));
1547
1548 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount.unsafeGet();
1549 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1550 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1551 }
1552
1553 void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1554 {
1555 int beginTerm = m_bodyDisjunction->terms.size();
1556
1557 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
1558 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1559 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1560 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1561
1562 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1563 m_currentAlternativeIndex = beginTerm + 1;
1564 }
1565
1566 void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1567 {
1568 int beginTerm = m_bodyDisjunction->terms.size();
1569
1570 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalBegin, subpatternId, capture, false, inputPosition));
1571 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1572 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1573 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1574
1575 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1576 m_currentAlternativeIndex = beginTerm + 1;
1577 }
1578
1579 void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1580 {
1581 // Errrk! - this is a little crazy, we initially generate as a TypeParenthesesSubpatternOnceBegin,
1582 // then fix this up at the end! - simplifying this should make it much clearer.
1583 // https://bugs.webkit.org/show_bug.cgi?id=50136
1584
1585 int beginTerm = m_bodyDisjunction->terms.size();
1586
1587 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
1588 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1589 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1590 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1591
1592 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1593 m_currentAlternativeIndex = beginTerm + 1;
1594 }
1595
1596 void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation)
1597 {
1598 int beginTerm = m_bodyDisjunction->terms.size();
1599
1600 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, false, invert, 0));
1601 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1602 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1603 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1604
1605 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1606 m_currentAlternativeIndex = beginTerm + 1;
1607 }
1608
1609 void atomParentheticalAssertionEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
1610 {
1611 unsigned beginTerm = popParenthesesStack();
1612 closeAlternative(beginTerm + 1);
1613 unsigned endTerm = m_bodyDisjunction->terms.size();
1614
1615 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin);
1616
1617 bool invert = m_bodyDisjunction->terms[beginTerm].invert();
1618 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1619
1620 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionEnd, subpatternId, false, invert, inputPosition));
1621 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1622 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1623 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1624
1625 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet();
1626 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1627 m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet();
1628 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1629 }
1630
1631 void assertionDotStarEnclosure(bool bolAnchored, bool eolAnchored)
1632 {
1633 m_bodyDisjunction->terms.append(ByteTerm::DotStarEnclosure(bolAnchored, eolAnchored));
1634 }
1635
1636 unsigned popParenthesesStack()
1637 {
1638 ASSERT(m_parenthesesStack.size());
1639 int stackEnd = m_parenthesesStack.size() - 1;
1640 unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm;
1641 m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex;
1642 m_parenthesesStack.shrink(stackEnd);
1643
1644 ASSERT(beginTerm < m_bodyDisjunction->terms.size());
1645 ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size());
1646
1647 return beginTerm;
1648 }
1649
1650#ifndef NDEBUG
1651 void dumpDisjunction(ByteDisjunction* disjunction)
1652 {
1653 dataLog("ByteDisjunction(%p):\n\t", disjunction);
1654 for (unsigned i = 0; i < disjunction->terms.size(); ++i)
1655 dataLog("{ %d } ", disjunction->terms[i].type);
1656 dataLog("\n");
1657 }
1658#endif
1659
1660 void closeAlternative(int beginTerm)
1661 {
1662 int origBeginTerm = beginTerm;
1663 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin);
1664 int endIndex = m_bodyDisjunction->terms.size();
1665
1666 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1667
1668 if (!m_bodyDisjunction->terms[beginTerm].alternative.next)
1669 m_bodyDisjunction->terms.remove(beginTerm);
1670 else {
1671 while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1672 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1673 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction);
1674 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1675 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1676 }
1677
1678 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1679
1680 m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd());
1681 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1682 }
1683 }
1684
1685 void closeBodyAlternative()
1686 {
1687 int beginTerm = 0;
1688 int origBeginTerm = 0;
1689 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin);
1690 int endIndex = m_bodyDisjunction->terms.size();
1691
1692 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1693
1694 while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1695 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1696 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction);
1697 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1698 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1699 }
1700
1701 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1702
1703 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd());
1704 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1705 }
1706
1707 void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0)
1708 {
1709 unsigned beginTerm = popParenthesesStack();
1710 closeAlternative(beginTerm + 1);
1711 unsigned endTerm = m_bodyDisjunction->terms.size();
1712
1713 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
1714
1715 ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm];
1716
1717 bool capture = parenthesesBegin.capture();
1718 unsigned subpatternId = parenthesesBegin.atom.subpatternId;
1719
1720 unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
1721 ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize);
1722
1723 parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin());
1724 for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses)
1725 parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]);
1726 parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd());
1727
1728 m_bodyDisjunction->terms.shrink(beginTerm);
1729
1730 m_allParenthesesInfo.append(parenthesesDisjunction);
1731 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, inputPosition));
1732
1733 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet();
1734 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1735 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1736 }
1737
1738 void atomParenthesesOnceEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
1739 {
1740 unsigned beginTerm = popParenthesesStack();
1741 closeAlternative(beginTerm + 1);
1742 unsigned endTerm = m_bodyDisjunction->terms.size();
1743
1744 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
1745
1746 bool capture = m_bodyDisjunction->terms[beginTerm].capture();
1747 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1748
1749 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, capture, false, inputPosition));
1750 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1751 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1752 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1753
1754 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet();
1755 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1756 m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet();
1757 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1758 }
1759
1760 void atomParenthesesTerminalEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
1761 {
1762 unsigned beginTerm = popParenthesesStack();
1763 closeAlternative(beginTerm + 1);
1764 unsigned endTerm = m_bodyDisjunction->terms.size();
1765
1766 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
1767
1768 bool capture = m_bodyDisjunction->terms[beginTerm].capture();
1769 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1770
1771 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalEnd, subpatternId, capture, false, inputPosition));
1772 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1773 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1774 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1775
1776 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet();
1777 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1778 m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet();
1779 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1780 }
1781
1782 void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough)
1783 {
1784 m_bodyDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize));
1785 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough));
1786 m_bodyDisjunction->terms[0].frameLocation = 0;
1787 m_currentAlternativeIndex = 0;
1788 }
1789
1790 void regexEnd()
1791 {
1792 closeBodyAlternative();
1793 }
1794
1795 void alternativeBodyDisjunction(bool onceThrough)
1796 {
1797 int newAlternativeIndex = m_bodyDisjunction->terms.size();
1798 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
1799 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough));
1800
1801 m_currentAlternativeIndex = newAlternativeIndex;
1802 }
1803
1804 void alternativeDisjunction()
1805 {
1806 int newAlternativeIndex = m_bodyDisjunction->terms.size();
1807 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
1808 m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction());
1809
1810 m_currentAlternativeIndex = newAlternativeIndex;
1811 }
1812
1813 void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0)
1814 {
1815 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
1816 unsigned currentCountAlreadyChecked = inputCountAlreadyChecked;
1817
1818 PatternAlternative* alternative = disjunction->m_alternatives[alt];
1819
1820 if (alt) {
1821 if (disjunction == m_pattern.m_body)
1822 alternativeBodyDisjunction(alternative->onceThrough());
1823 else
1824 alternativeDisjunction();
1825 }
1826
1827 unsigned minimumSize = alternative->m_minimumSize;
1828 ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked);
1829 unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked;
1830
1831 if (countToCheck) {
1832 checkInput(countToCheck);
1833 currentCountAlreadyChecked += countToCheck;
1834 }
1835
1836 for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
1837 PatternTerm& term = alternative->m_terms[i];
1838
1839 switch (term.type) {
1840 case PatternTerm::TypeAssertionBOL:
1841 assertionBOL(currentCountAlreadyChecked - term.inputPosition);
1842 break;
1843
1844 case PatternTerm::TypeAssertionEOL:
1845 assertionEOL(currentCountAlreadyChecked - term.inputPosition);
1846 break;
1847
1848 case PatternTerm::TypeAssertionWordBoundary:
1849 assertionWordBoundary(term.invert(), currentCountAlreadyChecked - term.inputPosition);
1850 break;
1851
1852 case PatternTerm::TypePatternCharacter:
1853 atomPatternCharacter(term.patternCharacter, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType);
1854 break;
1855
1856 case PatternTerm::TypeCharacterClass:
1857 atomCharacterClass(term.characterClass, term.invert(), currentCountAlreadyChecked- term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType);
1858 break;
1859
1860 case PatternTerm::TypeBackReference:
1861 atomBackReference(term.backReferenceSubpatternId, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType);
1862 break;
1863
1864 case PatternTerm::TypeForwardReference:
1865 break;
1866
1867 case PatternTerm::TypeParenthesesSubpattern: {
1868 unsigned disjunctionAlreadyCheckedCount = 0;
1869 if (term.quantityCount == 1 && !term.parentheses.isCopy) {
1870 unsigned alternativeFrameLocation = term.frameLocation;
1871 // For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame.
1872 if (term.quantityType == QuantifierFixedCount)
1873 disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize;
1874 else
1875 alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
1876 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1877 atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, alternativeFrameLocation);
1878 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
1879 atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType);
1880 } else if (term.parentheses.isTerminal) {
1881 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1882 atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesOnce);
1883 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
1884 atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType);
1885 } else {
1886 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1887 atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, 0);
1888 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
1889 atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
1890 }
1891 break;
1892 }
1893
1894 case PatternTerm::TypeParentheticalAssertion: {
1895 unsigned alternativeFrameLocation = term.frameLocation + YarrStackSpaceForBackTrackInfoParentheticalAssertion;
1896
1897 ASSERT(currentCountAlreadyChecked >= static_cast<unsigned>(term.inputPosition));
1898 unsigned positiveInputOffset = currentCountAlreadyChecked - static_cast<unsigned>(term.inputPosition);
1899 unsigned uncheckAmount = 0;
1900 if (positiveInputOffset > term.parentheses.disjunction->m_minimumSize) {
1901 uncheckAmount = positiveInputOffset - term.parentheses.disjunction->m_minimumSize;
1902 uncheckInput(uncheckAmount);
1903 currentCountAlreadyChecked -= uncheckAmount;
1904 }
1905
1906 atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invert(), term.frameLocation, alternativeFrameLocation);
1907 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset - uncheckAmount);
1908 atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityCount, term.quantityType);
1909 if (uncheckAmount) {
1910 checkInput(uncheckAmount);
1911 currentCountAlreadyChecked += uncheckAmount;
1912 }
1913 break;
1914 }
1915
1916 case PatternTerm::TypeDotStarEnclosure:
1917 assertionDotStarEnclosure(term.anchors.bolAnchor, term.anchors.eolAnchor);
1918 break;
1919 }
1920 }
1921 }
1922 }
1923
1924private:
1925 YarrPattern& m_pattern;
1926 OwnPtr<ByteDisjunction> m_bodyDisjunction;
1927 unsigned m_currentAlternativeIndex;
1928 Vector<ParenthesesStackEntry> m_parenthesesStack;
1929 Vector<ByteDisjunction*> m_allParenthesesInfo;
1930};
1931
1932PassOwnPtr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator)
1933{
1934 return ByteCompiler(pattern).compile(allocator);
1935}
1936
1937unsigned interpret(BytecodePattern* bytecode, const UString& input, unsigned start, unsigned* output)
1938{
1939 if (input.is8Bit())
1940 return Interpreter<LChar>(bytecode, output, input.characters8(), input.length(), start).interpret();
1941 return Interpreter<UChar>(bytecode, output, input.characters16(), input.length(), start).interpret();
1942}
1943
1944unsigned interpret(BytecodePattern* bytecode, const LChar* input, unsigned length, unsigned start, unsigned* output)
1945{
1946 return Interpreter<LChar>(bytecode, output, input, length, start).interpret();
1947}
1948
1949unsigned interpret(BytecodePattern* bytecode, const UChar* input, unsigned length, unsigned start, unsigned* output)
1950{
1951 return Interpreter<UChar>(bytecode, output, input, length, start).interpret();
1952}
1953
1954// These should be the same for both UChar & LChar.
1955COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter);
1956COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass);
1957COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference);
1958COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative);
1959COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion);
1960COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce);
1961COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheses) == (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses);
1962
1963
1964} }