2 * Copyright (C) 2009 Apple Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 #include "RegexInterpreter.h"
29 #include "RegexCompiler.h"
30 #include "RegexPattern.h"
40 namespace JSC
{ namespace Yarr
{
44 struct ParenthesesDisjunctionContext
;
46 struct BackTrackInfoPatternCharacter
{
47 uintptr_t matchAmount
;
49 struct BackTrackInfoCharacterClass
{
50 uintptr_t matchAmount
;
52 struct BackTrackInfoBackReference
{
53 uintptr_t begin
; // Not really needed for greedy quantifiers.
54 uintptr_t matchAmount
; // Not really needed for fixed quantifiers.
56 struct BackTrackInfoAlternative
{
59 struct BackTrackInfoParentheticalAssertion
{
62 struct BackTrackInfoParenthesesOnce
{
63 uintptr_t inParentheses
;
65 struct BackTrackInfoParentheses
{
66 uintptr_t matchAmount
;
67 ParenthesesDisjunctionContext
* lastContext
;
72 static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses
* backTrack
, ParenthesesDisjunctionContext
* context
)
74 context
->next
= backTrack
->lastContext
;
75 backTrack
->lastContext
= context
;
76 ++backTrack
->matchAmount
;
79 static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses
* backTrack
)
81 ASSERT(backTrack
->matchAmount
);
82 ASSERT(backTrack
->lastContext
);
83 backTrack
->lastContext
= backTrack
->lastContext
->next
;
84 --backTrack
->matchAmount
;
87 struct DisjunctionContext
94 void* operator new(size_t, void* where
)
105 DisjunctionContext
* allocDisjunctionContext(ByteDisjunction
* disjunction
)
107 return new(malloc(sizeof(DisjunctionContext
) + (disjunction
->m_frameSize
- 1) * sizeof(uintptr_t))) DisjunctionContext();
110 void freeDisjunctionContext(DisjunctionContext
* context
)
115 struct ParenthesesDisjunctionContext
117 ParenthesesDisjunctionContext(int* output
, ByteTerm
& term
)
120 unsigned firstSubpatternId
= term
.atom
.subpatternId
;
121 unsigned numNestedSubpatterns
= term
.atom
.parenthesesDisjunction
->m_numSubpatterns
;
123 for (unsigned i
= 0; i
< (numNestedSubpatterns
<< 1); ++i
) {
124 subpatternBackup
[i
] = output
[(firstSubpatternId
<< 1) + i
];
125 output
[(firstSubpatternId
<< 1) + i
] = -1;
128 new(getDisjunctionContext(term
)) DisjunctionContext();
131 void* operator new(size_t, void* where
)
136 void restoreOutput(int* output
, unsigned firstSubpatternId
, unsigned numNestedSubpatterns
)
138 for (unsigned i
= 0; i
< (numNestedSubpatterns
<< 1); ++i
)
139 output
[(firstSubpatternId
<< 1) + i
] = subpatternBackup
[i
];
142 DisjunctionContext
* getDisjunctionContext(ByteTerm
& term
)
144 return reinterpret_cast<DisjunctionContext
*>(&(subpatternBackup
[term
.atom
.parenthesesDisjunction
->m_numSubpatterns
<< 1]));
147 ParenthesesDisjunctionContext
* next
;
148 int subpatternBackup
[1];
151 ParenthesesDisjunctionContext
* allocParenthesesDisjunctionContext(ByteDisjunction
* disjunction
, int* output
, ByteTerm
& term
)
153 return new(malloc(sizeof(ParenthesesDisjunctionContext
) + (((term
.atom
.parenthesesDisjunction
->m_numSubpatterns
<< 1) - 1) * sizeof(int)) + sizeof(DisjunctionContext
) + (disjunction
->m_frameSize
- 1) * sizeof(uintptr_t))) ParenthesesDisjunctionContext(output
, term
);
156 void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext
* context
)
163 InputStream(const UChar
* input
, unsigned start
, unsigned length
)
175 void rewind(unsigned amount
)
177 ASSERT(pos
>= amount
);
183 ASSERT(pos
< length
);
189 int readChecked(int position
)
191 ASSERT(position
< 0);
192 ASSERT((unsigned)-position
<= pos
);
193 unsigned p
= pos
+ position
;
198 int reread(unsigned from
)
200 ASSERT(from
< length
);
206 ASSERT(!(pos
> length
));
208 return input
[pos
- 1];
217 void setPos(unsigned p
)
229 return pos
== length
;
232 bool checkInput(int count
)
234 if ((pos
+ count
) <= length
) {
241 void uncheckInput(int count
)
246 bool atStart(int position
)
248 return (pos
+ position
) == 0;
251 bool atEnd(int position
)
253 return (pos
+ position
) == length
;
262 bool testCharacterClass(CharacterClass
* characterClass
, int ch
)
265 for (unsigned i
= 0; i
< characterClass
->m_matchesUnicode
.size(); ++i
)
266 if (ch
== characterClass
->m_matchesUnicode
[i
])
268 for (unsigned i
= 0; i
< characterClass
->m_rangesUnicode
.size(); ++i
)
269 if ((ch
>= characterClass
->m_rangesUnicode
[i
].begin
) && (ch
<= characterClass
->m_rangesUnicode
[i
].end
))
272 for (unsigned i
= 0; i
< characterClass
->m_matches
.size(); ++i
)
273 if (ch
== characterClass
->m_matches
[i
])
275 for (unsigned i
= 0; i
< characterClass
->m_ranges
.size(); ++i
)
276 if ((ch
>= characterClass
->m_ranges
[i
].begin
) && (ch
<= characterClass
->m_ranges
[i
].end
))
283 bool tryConsumeCharacter(int testChar
)
288 int ch
= input
.read();
290 if (pattern
->m_ignoreCase
? ((Unicode::toLower(testChar
) == ch
) || (Unicode::toUpper(testChar
) == ch
)) : (testChar
== ch
)) {
297 bool checkCharacter(int testChar
, int inputPosition
)
299 return testChar
== input
.readChecked(inputPosition
);
302 bool checkCasedCharacter(int loChar
, int hiChar
, int inputPosition
)
304 int ch
= input
.readChecked(inputPosition
);
305 return (loChar
== ch
) || (hiChar
== ch
);
308 bool tryConsumeCharacterClass(CharacterClass
* characterClass
, bool invert
)
313 bool match
= testCharacterClass(characterClass
, input
.read());
325 bool checkCharacterClass(CharacterClass
* characterClass
, bool invert
, int inputPosition
)
327 bool match
= testCharacterClass(characterClass
, input
.readChecked(inputPosition
));
328 return invert
? !match
: match
;
331 bool tryConsumeBackReference(int matchBegin
, int matchEnd
, int inputOffset
)
333 int matchSize
= matchEnd
- matchBegin
;
335 if (!input
.checkInput(matchSize
))
338 for (int i
= 0; i
< matchSize
; ++i
) {
339 if (!checkCharacter(input
.reread(matchBegin
+ i
), inputOffset
- matchSize
+ i
)) {
340 input
.uncheckInput(matchSize
);
348 bool matchAssertionBOL(ByteTerm
& term
)
350 return (input
.atStart(term
.inputPosition
)) || (pattern
->m_multiline
&& testCharacterClass(pattern
->newlineCharacterClass
, input
.readChecked(term
.inputPosition
- 1)));
353 bool matchAssertionEOL(ByteTerm
& term
)
355 if (term
.inputPosition
)
356 return (input
.atEnd(term
.inputPosition
)) || (pattern
->m_multiline
&& testCharacterClass(pattern
->newlineCharacterClass
, input
.readChecked(term
.inputPosition
)));
358 return (input
.atEnd()) || (pattern
->m_multiline
&& testCharacterClass(pattern
->newlineCharacterClass
, input
.read()));
361 bool matchAssertionWordBoundary(ByteTerm
& term
)
363 bool prevIsWordchar
= !input
.atStart(term
.inputPosition
) && testCharacterClass(pattern
->wordcharCharacterClass
, input
.readChecked(term
.inputPosition
- 1));
365 if (term
.inputPosition
)
366 readIsWordchar
= !input
.atEnd(term
.inputPosition
) && testCharacterClass(pattern
->wordcharCharacterClass
, input
.readChecked(term
.inputPosition
));
368 readIsWordchar
= !input
.atEnd() && testCharacterClass(pattern
->wordcharCharacterClass
, input
.read());
370 bool wordBoundary
= prevIsWordchar
!= readIsWordchar
;
371 return term
.invert() ? !wordBoundary
: wordBoundary
;
374 bool backtrackPatternCharacter(ByteTerm
& term
, DisjunctionContext
* context
)
376 BackTrackInfoPatternCharacter
* backTrack
= reinterpret_cast<BackTrackInfoPatternCharacter
*>(context
->frame
+ term
.frameLocation
);
378 switch (term
.atom
.quantityType
) {
379 case QuantifierFixedCount
:
382 case QuantifierGreedy
:
383 if (backTrack
->matchAmount
) {
384 --backTrack
->matchAmount
;
385 input
.uncheckInput(1);
390 case QuantifierNonGreedy
:
391 if ((backTrack
->matchAmount
< term
.atom
.quantityCount
) && input
.checkInput(1)) {
392 ++backTrack
->matchAmount
;
393 if (checkCharacter(term
.atom
.patternCharacter
, term
.inputPosition
- 1))
396 input
.uncheckInput(backTrack
->matchAmount
);
403 bool backtrackPatternCasedCharacter(ByteTerm
& term
, DisjunctionContext
* context
)
405 BackTrackInfoPatternCharacter
* backTrack
= reinterpret_cast<BackTrackInfoPatternCharacter
*>(context
->frame
+ term
.frameLocation
);
407 switch (term
.atom
.quantityType
) {
408 case QuantifierFixedCount
:
411 case QuantifierGreedy
:
412 if (backTrack
->matchAmount
) {
413 --backTrack
->matchAmount
;
414 input
.uncheckInput(1);
419 case QuantifierNonGreedy
:
420 if ((backTrack
->matchAmount
< term
.atom
.quantityCount
) && input
.checkInput(1)) {
421 ++backTrack
->matchAmount
;
422 if (checkCasedCharacter(term
.atom
.casedCharacter
.lo
, term
.atom
.casedCharacter
.hi
, term
.inputPosition
- 1))
425 input
.uncheckInput(backTrack
->matchAmount
);
432 bool matchCharacterClass(ByteTerm
& term
, DisjunctionContext
* context
)
434 ASSERT(term
.type
== ByteTerm::TypeCharacterClass
);
435 BackTrackInfoPatternCharacter
* backTrack
= reinterpret_cast<BackTrackInfoPatternCharacter
*>(context
->frame
+ term
.frameLocation
);
437 switch (term
.atom
.quantityType
) {
438 case QuantifierFixedCount
: {
439 for (unsigned matchAmount
= 0; matchAmount
< term
.atom
.quantityCount
; ++matchAmount
) {
440 if (!checkCharacterClass(term
.atom
.characterClass
, term
.invert(), term
.inputPosition
+ matchAmount
))
446 case QuantifierGreedy
: {
447 unsigned matchAmount
= 0;
448 while ((matchAmount
< term
.atom
.quantityCount
) && input
.checkInput(1)) {
449 if (!checkCharacterClass(term
.atom
.characterClass
, term
.invert(), term
.inputPosition
- 1)) {
450 input
.uncheckInput(1);
455 backTrack
->matchAmount
= matchAmount
;
460 case QuantifierNonGreedy
:
461 backTrack
->matchAmount
= 0;
465 ASSERT_NOT_REACHED();
469 bool backtrackCharacterClass(ByteTerm
& term
, DisjunctionContext
* context
)
471 ASSERT(term
.type
== ByteTerm::TypeCharacterClass
);
472 BackTrackInfoPatternCharacter
* backTrack
= reinterpret_cast<BackTrackInfoPatternCharacter
*>(context
->frame
+ term
.frameLocation
);
474 switch (term
.atom
.quantityType
) {
475 case QuantifierFixedCount
:
478 case QuantifierGreedy
:
479 if (backTrack
->matchAmount
) {
480 --backTrack
->matchAmount
;
481 input
.uncheckInput(1);
486 case QuantifierNonGreedy
:
487 if ((backTrack
->matchAmount
< term
.atom
.quantityCount
) && input
.checkInput(1)) {
488 ++backTrack
->matchAmount
;
489 if (checkCharacterClass(term
.atom
.characterClass
, term
.invert(), term
.inputPosition
- 1))
492 input
.uncheckInput(backTrack
->matchAmount
);
499 bool matchBackReference(ByteTerm
& term
, DisjunctionContext
* context
)
501 ASSERT(term
.type
== ByteTerm::TypeBackReference
);
502 BackTrackInfoBackReference
* backTrack
= reinterpret_cast<BackTrackInfoBackReference
*>(context
->frame
+ term
.frameLocation
);
504 int matchBegin
= output
[(term
.atom
.subpatternId
<< 1)];
505 int matchEnd
= output
[(term
.atom
.subpatternId
<< 1) + 1];
506 ASSERT((matchBegin
== -1) == (matchEnd
== -1));
507 ASSERT(matchBegin
<= matchEnd
);
509 if (matchBegin
== matchEnd
)
512 switch (term
.atom
.quantityType
) {
513 case QuantifierFixedCount
: {
514 backTrack
->begin
= input
.getPos();
515 for (unsigned matchAmount
= 0; matchAmount
< term
.atom
.quantityCount
; ++matchAmount
) {
516 if (!tryConsumeBackReference(matchBegin
, matchEnd
, term
.inputPosition
)) {
517 input
.setPos(backTrack
->begin
);
524 case QuantifierGreedy
: {
525 unsigned matchAmount
= 0;
526 while ((matchAmount
< term
.atom
.quantityCount
) && tryConsumeBackReference(matchBegin
, matchEnd
, term
.inputPosition
))
528 backTrack
->matchAmount
= matchAmount
;
532 case QuantifierNonGreedy
:
533 backTrack
->begin
= input
.getPos();
534 backTrack
->matchAmount
= 0;
538 ASSERT_NOT_REACHED();
542 bool backtrackBackReference(ByteTerm
& term
, DisjunctionContext
* context
)
544 ASSERT(term
.type
== ByteTerm::TypeBackReference
);
545 BackTrackInfoBackReference
* backTrack
= reinterpret_cast<BackTrackInfoBackReference
*>(context
->frame
+ term
.frameLocation
);
547 int matchBegin
= output
[(term
.atom
.subpatternId
<< 1)];
548 int matchEnd
= output
[(term
.atom
.subpatternId
<< 1) + 1];
549 ASSERT((matchBegin
== -1) == (matchEnd
== -1));
550 ASSERT(matchBegin
<= matchEnd
);
552 if (matchBegin
== matchEnd
)
555 switch (term
.atom
.quantityType
) {
556 case QuantifierFixedCount
:
557 // for quantityCount == 1, could rewind.
558 input
.setPos(backTrack
->begin
);
561 case QuantifierGreedy
:
562 if (backTrack
->matchAmount
) {
563 --backTrack
->matchAmount
;
564 input
.rewind(matchEnd
- matchBegin
);
569 case QuantifierNonGreedy
:
570 if ((backTrack
->matchAmount
< term
.atom
.quantityCount
) && tryConsumeBackReference(matchBegin
, matchEnd
, term
.inputPosition
)) {
571 ++backTrack
->matchAmount
;
574 input
.setPos(backTrack
->begin
);
581 void recordParenthesesMatch(ByteTerm
& term
, ParenthesesDisjunctionContext
* context
)
583 if (term
.capture()) {
584 unsigned subpatternId
= term
.atom
.subpatternId
;
585 output
[(subpatternId
<< 1)] = context
->getDisjunctionContext(term
)->matchBegin
+ term
.inputPosition
;
586 output
[(subpatternId
<< 1) + 1] = context
->getDisjunctionContext(term
)->matchEnd
+ term
.inputPosition
;
589 void resetMatches(ByteTerm
& term
, ParenthesesDisjunctionContext
* context
)
591 unsigned firstSubpatternId
= term
.atom
.subpatternId
;
592 unsigned count
= term
.atom
.parenthesesDisjunction
->m_numSubpatterns
;
593 context
->restoreOutput(output
, firstSubpatternId
, count
);
595 void resetAssertionMatches(ByteTerm
& term
)
597 unsigned firstSubpatternId
= term
.atom
.subpatternId
;
598 unsigned count
= term
.atom
.parenthesesDisjunction
->m_numSubpatterns
;
599 for (unsigned i
= 0; i
< (count
<< 1); ++i
)
600 output
[(firstSubpatternId
<< 1) + i
] = -1;
602 bool parenthesesDoBacktrack(ByteTerm
& term
, BackTrackInfoParentheses
* backTrack
)
604 while (backTrack
->matchAmount
) {
605 ParenthesesDisjunctionContext
* context
= backTrack
->lastContext
;
607 if (matchDisjunction(term
.atom
.parenthesesDisjunction
, context
->getDisjunctionContext(term
), true))
610 resetMatches(term
, context
);
611 popParenthesesDisjunctionContext(backTrack
);
612 freeParenthesesDisjunctionContext(context
);
618 bool matchParenthesesOnceBegin(ByteTerm
& term
, DisjunctionContext
* context
)
620 ASSERT(term
.type
== ByteTerm::TypeParenthesesSubpatternOnceBegin
);
621 ASSERT(term
.atom
.quantityCount
== 1);
623 BackTrackInfoParenthesesOnce
* backTrack
= reinterpret_cast<BackTrackInfoParenthesesOnce
*>(context
->frame
+ term
.frameLocation
);
625 switch (term
.atom
.quantityType
) {
626 case QuantifierGreedy
: {
627 // set this speculatively; if we get to the parens end this will be true.
628 backTrack
->inParentheses
= 1;
631 case QuantifierNonGreedy
: {
632 backTrack
->inParentheses
= 0;
633 context
->term
+= term
.atom
.parenthesesWidth
;
636 case QuantifierFixedCount
:
640 if (term
.capture()) {
641 unsigned subpatternId
= term
.atom
.subpatternId
;
642 output
[(subpatternId
<< 1)] = input
.getPos() + term
.inputPosition
;
648 bool matchParenthesesOnceEnd(ByteTerm
& term
, DisjunctionContext
*)
650 ASSERT(term
.type
== ByteTerm::TypeParenthesesSubpatternOnceEnd
);
651 ASSERT(term
.atom
.quantityCount
== 1);
653 if (term
.capture()) {
654 unsigned subpatternId
= term
.atom
.subpatternId
;
655 output
[(subpatternId
<< 1) + 1] = input
.getPos() + term
.inputPosition
;
660 bool backtrackParenthesesOnceBegin(ByteTerm
& term
, DisjunctionContext
* context
)
662 ASSERT(term
.type
== ByteTerm::TypeParenthesesSubpatternOnceBegin
);
663 ASSERT(term
.atom
.quantityCount
== 1);
665 BackTrackInfoParenthesesOnce
* backTrack
= reinterpret_cast<BackTrackInfoParenthesesOnce
*>(context
->frame
+ term
.frameLocation
);
667 if (term
.capture()) {
668 unsigned subpatternId
= term
.atom
.subpatternId
;
669 output
[(subpatternId
<< 1)] = -1;
670 output
[(subpatternId
<< 1) + 1] = -1;
673 switch (term
.atom
.quantityType
) {
674 case QuantifierGreedy
:
675 // if we backtrack to this point, there is another chance - try matching nothing.
676 ASSERT(backTrack
->inParentheses
);
677 backTrack
->inParentheses
= 0;
678 context
->term
+= term
.atom
.parenthesesWidth
;
680 case QuantifierNonGreedy
:
681 ASSERT(backTrack
->inParentheses
);
682 case QuantifierFixedCount
:
689 bool backtrackParenthesesOnceEnd(ByteTerm
& term
, DisjunctionContext
* context
)
691 ASSERT(term
.type
== ByteTerm::TypeParenthesesSubpatternOnceEnd
);
692 ASSERT(term
.atom
.quantityCount
== 1);
694 BackTrackInfoParenthesesOnce
* backTrack
= reinterpret_cast<BackTrackInfoParenthesesOnce
*>(context
->frame
+ term
.frameLocation
);
696 switch (term
.atom
.quantityType
) {
697 case QuantifierGreedy
:
698 if (!backTrack
->inParentheses
) {
699 context
->term
-= term
.atom
.parenthesesWidth
;
702 case QuantifierNonGreedy
:
703 if (!backTrack
->inParentheses
) {
704 // now try to match the parens; set this speculatively.
705 backTrack
->inParentheses
= 1;
706 if (term
.capture()) {
707 unsigned subpatternId
= term
.atom
.subpatternId
;
708 output
[(subpatternId
<< 1) + 1] = input
.getPos() + term
.inputPosition
;
710 context
->term
-= term
.atom
.parenthesesWidth
;
713 case QuantifierFixedCount
:
720 bool matchParentheticalAssertionBegin(ByteTerm
& term
, DisjunctionContext
* context
)
722 ASSERT(term
.type
== ByteTerm::TypeParentheticalAssertionBegin
);
723 ASSERT(term
.atom
.quantityCount
== 1);
725 BackTrackInfoParentheticalAssertion
* backTrack
= reinterpret_cast<BackTrackInfoParentheticalAssertion
*>(context
->frame
+ term
.frameLocation
);
727 backTrack
->begin
= input
.getPos();
731 bool matchParentheticalAssertionEnd(ByteTerm
& term
, DisjunctionContext
* context
)
733 ASSERT(term
.type
== ByteTerm::TypeParentheticalAssertionEnd
);
734 ASSERT(term
.atom
.quantityCount
== 1);
736 BackTrackInfoParentheticalAssertion
* backTrack
= reinterpret_cast<BackTrackInfoParentheticalAssertion
*>(context
->frame
+ term
.frameLocation
);
738 input
.setPos(backTrack
->begin
);
740 // We've reached the end of the parens; if they are inverted, this is failure.
742 context
->term
-= term
.atom
.parenthesesWidth
;
749 bool backtrackParentheticalAssertionBegin(ByteTerm
& term
, DisjunctionContext
* context
)
751 ASSERT(term
.type
== ByteTerm::TypeParentheticalAssertionBegin
);
752 ASSERT(term
.atom
.quantityCount
== 1);
754 // We've failed to match parens; if they are inverted, this is win!
756 context
->term
+= term
.atom
.parenthesesWidth
;
763 bool backtrackParentheticalAssertionEnd(ByteTerm
& term
, DisjunctionContext
* context
)
765 ASSERT(term
.type
== ByteTerm::TypeParentheticalAssertionEnd
);
766 ASSERT(term
.atom
.quantityCount
== 1);
768 BackTrackInfoParentheticalAssertion
* backTrack
= reinterpret_cast<BackTrackInfoParentheticalAssertion
*>(context
->frame
+ term
.frameLocation
);
770 input
.setPos(backTrack
->begin
);
772 context
->term
-= term
.atom
.parenthesesWidth
;
776 bool matchParentheses(ByteTerm
& term
, DisjunctionContext
* context
)
778 ASSERT(term
.type
== ByteTerm::TypeParenthesesSubpattern
);
780 BackTrackInfoParentheses
* backTrack
= reinterpret_cast<BackTrackInfoParentheses
*>(context
->frame
+ term
.frameLocation
);
782 unsigned subpatternId
= term
.atom
.subpatternId
;
783 ByteDisjunction
* disjunctionBody
= term
.atom
.parenthesesDisjunction
;
785 backTrack
->prevBegin
= output
[(subpatternId
<< 1)];
786 backTrack
->prevEnd
= output
[(subpatternId
<< 1) + 1];
788 backTrack
->matchAmount
= 0;
789 backTrack
->lastContext
= 0;
791 switch (term
.atom
.quantityType
) {
792 case QuantifierFixedCount
: {
793 // While we haven't yet reached our fixed limit,
794 while (backTrack
->matchAmount
< term
.atom
.quantityCount
) {
795 // Try to do a match, and it it succeeds, add it to the list.
796 ParenthesesDisjunctionContext
* context
= allocParenthesesDisjunctionContext(disjunctionBody
, output
, term
);
797 if (matchDisjunction(disjunctionBody
, context
->getDisjunctionContext(term
)))
798 appendParenthesesDisjunctionContext(backTrack
, context
);
800 // The match failed; try to find an alternate point to carry on from.
801 resetMatches(term
, context
);
802 freeParenthesesDisjunctionContext(context
);
803 if (!parenthesesDoBacktrack(term
, backTrack
))
808 ASSERT(backTrack
->matchAmount
== term
.atom
.quantityCount
);
809 ParenthesesDisjunctionContext
* context
= backTrack
->lastContext
;
810 recordParenthesesMatch(term
, context
);
814 case QuantifierGreedy
: {
815 while (backTrack
->matchAmount
< term
.atom
.quantityCount
) {
816 ParenthesesDisjunctionContext
* context
= allocParenthesesDisjunctionContext(disjunctionBody
, output
, term
);
817 if (matchNonZeroDisjunction(disjunctionBody
, context
->getDisjunctionContext(term
)))
818 appendParenthesesDisjunctionContext(backTrack
, context
);
820 resetMatches(term
, context
);
821 freeParenthesesDisjunctionContext(context
);
826 if (backTrack
->matchAmount
) {
827 ParenthesesDisjunctionContext
* context
= backTrack
->lastContext
;
828 recordParenthesesMatch(term
, context
);
833 case QuantifierNonGreedy
:
837 ASSERT_NOT_REACHED();
841 // Rules for backtracking differ depending on whether this is greedy or non-greedy.
843 // Greedy matches never should try just adding more - you should already have done
844 // the 'more' cases. Always backtrack, at least a leetle bit. However cases where
845 // you backtrack an item off the list needs checking, since we'll never have matched
846 // the one less case. Tracking forwards, still add as much as possible.
848 // Non-greedy, we've already done the one less case, so don't match on popping.
849 // We haven't done the one more case, so always try to add that.
851 bool backtrackParentheses(ByteTerm
& term
, DisjunctionContext
* context
)
853 ASSERT(term
.type
== ByteTerm::TypeParenthesesSubpattern
);
855 BackTrackInfoParentheses
* backTrack
= reinterpret_cast<BackTrackInfoParentheses
*>(context
->frame
+ term
.frameLocation
);
857 if (term
.capture()) {
858 unsigned subpatternId
= term
.atom
.subpatternId
;
859 output
[(subpatternId
<< 1)] = backTrack
->prevBegin
;
860 output
[(subpatternId
<< 1) + 1] = backTrack
->prevEnd
;
863 ByteDisjunction
* disjunctionBody
= term
.atom
.parenthesesDisjunction
;
865 switch (term
.atom
.quantityType
) {
866 case QuantifierFixedCount
: {
867 ASSERT(backTrack
->matchAmount
== term
.atom
.quantityCount
);
869 ParenthesesDisjunctionContext
* context
= 0;
871 if (!parenthesesDoBacktrack(term
, backTrack
))
874 // While we haven't yet reached our fixed limit,
875 while (backTrack
->matchAmount
< term
.atom
.quantityCount
) {
876 // Try to do a match, and it it succeeds, add it to the list.
877 context
= allocParenthesesDisjunctionContext(disjunctionBody
, output
, term
);
878 if (matchDisjunction(disjunctionBody
, context
->getDisjunctionContext(term
)))
879 appendParenthesesDisjunctionContext(backTrack
, context
);
881 // The match failed; try to find an alternate point to carry on from.
882 resetMatches(term
, context
);
883 freeParenthesesDisjunctionContext(context
);
884 if (!parenthesesDoBacktrack(term
, backTrack
))
889 ASSERT(backTrack
->matchAmount
== term
.atom
.quantityCount
);
890 context
= backTrack
->lastContext
;
891 recordParenthesesMatch(term
, context
);
895 case QuantifierGreedy
: {
896 if (!backTrack
->matchAmount
)
899 ParenthesesDisjunctionContext
* context
= backTrack
->lastContext
;
900 if (matchNonZeroDisjunction(disjunctionBody
, context
->getDisjunctionContext(term
), true)) {
901 while (backTrack
->matchAmount
< term
.atom
.quantityCount
) {
902 ParenthesesDisjunctionContext
* context
= allocParenthesesDisjunctionContext(disjunctionBody
, output
, term
);
903 if (matchNonZeroDisjunction(disjunctionBody
, context
->getDisjunctionContext(term
)))
904 appendParenthesesDisjunctionContext(backTrack
, context
);
906 resetMatches(term
, context
);
907 freeParenthesesDisjunctionContext(context
);
912 resetMatches(term
, context
);
913 popParenthesesDisjunctionContext(backTrack
);
914 freeParenthesesDisjunctionContext(context
);
917 if (backTrack
->matchAmount
) {
918 ParenthesesDisjunctionContext
* context
= backTrack
->lastContext
;
919 recordParenthesesMatch(term
, context
);
924 case QuantifierNonGreedy
: {
925 // If we've not reached the limit, try to add one more match.
926 if (backTrack
->matchAmount
< term
.atom
.quantityCount
) {
927 ParenthesesDisjunctionContext
* context
= allocParenthesesDisjunctionContext(disjunctionBody
, output
, term
);
928 if (matchNonZeroDisjunction(disjunctionBody
, context
->getDisjunctionContext(term
))) {
929 appendParenthesesDisjunctionContext(backTrack
, context
);
930 recordParenthesesMatch(term
, context
);
933 resetMatches(term
, context
);
934 freeParenthesesDisjunctionContext(context
);
938 // Nope - okay backtrack looking for an alternative.
939 while (backTrack
->matchAmount
) {
940 ParenthesesDisjunctionContext
* context
= backTrack
->lastContext
;
941 if (matchNonZeroDisjunction(disjunctionBody
, context
->getDisjunctionContext(term
), true)) {
942 // successful backtrack! we're back in the game!
943 if (backTrack
->matchAmount
) {
944 context
= backTrack
->lastContext
;
945 recordParenthesesMatch(term
, context
);
950 // pop a match off the stack
951 resetMatches(term
, context
);
952 popParenthesesDisjunctionContext(backTrack
);
953 freeParenthesesDisjunctionContext(context
);
960 ASSERT_NOT_REACHED();
964 #define MATCH_NEXT() { ++context->term; goto matchAgain; }
965 #define BACKTRACK() { --context->term; goto backtrack; }
966 #define currentTerm() (disjunction->terms[context->term])
967 bool matchDisjunction(ByteDisjunction
* disjunction
, DisjunctionContext
* context
, bool btrack
= false)
972 context
->matchBegin
= input
.getPos();
976 ASSERT(context
->term
< static_cast<int>(disjunction
->terms
.size()));
978 switch (currentTerm().type
) {
979 case ByteTerm::TypeSubpatternBegin
:
981 case ByteTerm::TypeSubpatternEnd
:
982 context
->matchEnd
= input
.getPos();
985 case ByteTerm::TypeBodyAlternativeBegin
:
987 case ByteTerm::TypeBodyAlternativeDisjunction
:
988 case ByteTerm::TypeBodyAlternativeEnd
:
989 context
->matchEnd
= input
.getPos();
992 case ByteTerm::TypeAlternativeBegin
:
994 case ByteTerm::TypeAlternativeDisjunction
:
995 case ByteTerm::TypeAlternativeEnd
: {
996 int offset
= currentTerm().alternative
.end
;
997 BackTrackInfoAlternative
* backTrack
= reinterpret_cast<BackTrackInfoAlternative
*>(context
->frame
+ currentTerm().frameLocation
);
998 backTrack
->offset
= offset
;
999 context
->term
+= offset
;
1003 case ByteTerm::TypeAssertionBOL
:
1004 if (matchAssertionBOL(currentTerm()))
1007 case ByteTerm::TypeAssertionEOL
:
1008 if (matchAssertionEOL(currentTerm()))
1011 case ByteTerm::TypeAssertionWordBoundary
:
1012 if (matchAssertionWordBoundary(currentTerm()))
1016 case ByteTerm::TypePatternCharacterOnce
:
1017 case ByteTerm::TypePatternCharacterFixed
: {
1018 for (unsigned matchAmount
= 0; matchAmount
< currentTerm().atom
.quantityCount
; ++matchAmount
) {
1019 if (!checkCharacter(currentTerm().atom
.patternCharacter
, currentTerm().inputPosition
+ matchAmount
))
1024 case ByteTerm::TypePatternCharacterGreedy
: {
1025 BackTrackInfoPatternCharacter
* backTrack
= reinterpret_cast<BackTrackInfoPatternCharacter
*>(context
->frame
+ currentTerm().frameLocation
);
1026 unsigned matchAmount
= 0;
1027 while ((matchAmount
< currentTerm().atom
.quantityCount
) && input
.checkInput(1)) {
1028 if (!checkCharacter(currentTerm().atom
.patternCharacter
, currentTerm().inputPosition
- 1)) {
1029 input
.uncheckInput(1);
1034 backTrack
->matchAmount
= matchAmount
;
1038 case ByteTerm::TypePatternCharacterNonGreedy
: {
1039 BackTrackInfoPatternCharacter
* backTrack
= reinterpret_cast<BackTrackInfoPatternCharacter
*>(context
->frame
+ currentTerm().frameLocation
);
1040 backTrack
->matchAmount
= 0;
1044 case ByteTerm::TypePatternCasedCharacterOnce
:
1045 case ByteTerm::TypePatternCasedCharacterFixed
: {
1046 for (unsigned matchAmount
= 0; matchAmount
< currentTerm().atom
.quantityCount
; ++matchAmount
) {
1047 if (!checkCasedCharacter(currentTerm().atom
.casedCharacter
.lo
, currentTerm().atom
.casedCharacter
.hi
, currentTerm().inputPosition
+ matchAmount
))
1052 case ByteTerm::TypePatternCasedCharacterGreedy
: {
1053 BackTrackInfoPatternCharacter
* backTrack
= reinterpret_cast<BackTrackInfoPatternCharacter
*>(context
->frame
+ currentTerm().frameLocation
);
1054 unsigned matchAmount
= 0;
1055 while ((matchAmount
< currentTerm().atom
.quantityCount
) && input
.checkInput(1)) {
1056 if (!checkCasedCharacter(currentTerm().atom
.casedCharacter
.lo
, currentTerm().atom
.casedCharacter
.hi
, currentTerm().inputPosition
- 1)) {
1057 input
.uncheckInput(1);
1062 backTrack
->matchAmount
= matchAmount
;
1066 case ByteTerm::TypePatternCasedCharacterNonGreedy
: {
1067 BackTrackInfoPatternCharacter
* backTrack
= reinterpret_cast<BackTrackInfoPatternCharacter
*>(context
->frame
+ currentTerm().frameLocation
);
1068 backTrack
->matchAmount
= 0;
1072 case ByteTerm::TypeCharacterClass
:
1073 if (matchCharacterClass(currentTerm(), context
))
1076 case ByteTerm::TypeBackReference
:
1077 if (matchBackReference(currentTerm(), context
))
1080 case ByteTerm::TypeParenthesesSubpattern
:
1081 if (matchParentheses(currentTerm(), context
))
1084 case ByteTerm::TypeParenthesesSubpatternOnceBegin
:
1085 if (matchParenthesesOnceBegin(currentTerm(), context
))
1088 case ByteTerm::TypeParenthesesSubpatternOnceEnd
:
1089 if (matchParenthesesOnceEnd(currentTerm(), context
))
1092 case ByteTerm::TypeParentheticalAssertionBegin
:
1093 if (matchParentheticalAssertionBegin(currentTerm(), context
))
1096 case ByteTerm::TypeParentheticalAssertionEnd
:
1097 if (matchParentheticalAssertionEnd(currentTerm(), context
))
1101 case ByteTerm::TypeCheckInput
:
1102 if (input
.checkInput(currentTerm().checkInputCount
))
1107 // We should never fall-through to here.
1108 ASSERT_NOT_REACHED();
1111 ASSERT(context
->term
< static_cast<int>(disjunction
->terms
.size()));
1113 switch (currentTerm().type
) {
1114 case ByteTerm::TypeSubpatternBegin
:
1116 case ByteTerm::TypeSubpatternEnd
:
1117 ASSERT_NOT_REACHED();
1119 case ByteTerm::TypeBodyAlternativeBegin
:
1120 case ByteTerm::TypeBodyAlternativeDisjunction
: {
1121 int offset
= currentTerm().alternative
.next
;
1122 context
->term
+= offset
;
1130 context
->matchBegin
= input
.getPos();
1133 case ByteTerm::TypeBodyAlternativeEnd
:
1134 ASSERT_NOT_REACHED();
1136 case ByteTerm::TypeAlternativeBegin
:
1137 case ByteTerm::TypeAlternativeDisjunction
: {
1138 int offset
= currentTerm().alternative
.next
;
1139 context
->term
+= offset
;
1144 case ByteTerm::TypeAlternativeEnd
: {
1145 // We should never backtrack back into an alternative of the main body of the regex.
1146 BackTrackInfoAlternative
* backTrack
= reinterpret_cast<BackTrackInfoAlternative
*>(context
->frame
+ currentTerm().frameLocation
);
1147 unsigned offset
= backTrack
->offset
;
1148 context
->term
-= offset
;
1152 case ByteTerm::TypeAssertionBOL
:
1153 case ByteTerm::TypeAssertionEOL
:
1154 case ByteTerm::TypeAssertionWordBoundary
:
1157 case ByteTerm::TypePatternCharacterOnce
:
1158 case ByteTerm::TypePatternCharacterFixed
:
1159 case ByteTerm::TypePatternCharacterGreedy
:
1160 case ByteTerm::TypePatternCharacterNonGreedy
:
1161 if (backtrackPatternCharacter(currentTerm(), context
))
1164 case ByteTerm::TypePatternCasedCharacterOnce
:
1165 case ByteTerm::TypePatternCasedCharacterFixed
:
1166 case ByteTerm::TypePatternCasedCharacterGreedy
:
1167 case ByteTerm::TypePatternCasedCharacterNonGreedy
:
1168 if (backtrackPatternCasedCharacter(currentTerm(), context
))
1171 case ByteTerm::TypeCharacterClass
:
1172 if (backtrackCharacterClass(currentTerm(), context
))
1175 case ByteTerm::TypeBackReference
:
1176 if (backtrackBackReference(currentTerm(), context
))
1179 case ByteTerm::TypeParenthesesSubpattern
:
1180 if (backtrackParentheses(currentTerm(), context
))
1183 case ByteTerm::TypeParenthesesSubpatternOnceBegin
:
1184 if (backtrackParenthesesOnceBegin(currentTerm(), context
))
1187 case ByteTerm::TypeParenthesesSubpatternOnceEnd
:
1188 if (backtrackParenthesesOnceEnd(currentTerm(), context
))
1191 case ByteTerm::TypeParentheticalAssertionBegin
:
1192 if (backtrackParentheticalAssertionBegin(currentTerm(), context
))
1195 case ByteTerm::TypeParentheticalAssertionEnd
:
1196 if (backtrackParentheticalAssertionEnd(currentTerm(), context
))
1200 case ByteTerm::TypeCheckInput
:
1201 input
.uncheckInput(currentTerm().checkInputCount
);
1205 ASSERT_NOT_REACHED();
1209 bool matchNonZeroDisjunction(ByteDisjunction
* disjunction
, DisjunctionContext
* context
, bool btrack
= false)
1211 if (matchDisjunction(disjunction
, context
, btrack
)) {
1212 while (context
->matchBegin
== context
->matchEnd
) {
1213 if (!matchDisjunction(disjunction
, context
, true))
1224 for (unsigned i
= 0; i
< ((pattern
->m_body
->m_numSubpatterns
+ 1) << 1); ++i
)
1227 DisjunctionContext
* context
= allocDisjunctionContext(pattern
->m_body
.get());
1229 if (matchDisjunction(pattern
->m_body
.get(), context
)) {
1230 output
[0] = context
->matchBegin
;
1231 output
[1] = context
->matchEnd
;
1234 freeDisjunctionContext(context
);
1239 Interpreter(BytecodePattern
* pattern
, int* output
, const UChar
* inputChar
, unsigned start
, unsigned length
)
1242 , input(inputChar
, start
, length
)
1247 BytecodePattern
*pattern
;
1254 class ByteCompiler
{
1255 struct ParenthesesStackEntry
{
1257 unsigned savedAlternativeIndex
;
1258 ParenthesesStackEntry(unsigned beginTerm
, unsigned savedAlternativeIndex
/*, unsigned subpatternId, bool capture = false*/)
1259 : beginTerm(beginTerm
)
1260 , savedAlternativeIndex(savedAlternativeIndex
)
1266 ByteCompiler(RegexPattern
& pattern
)
1267 : m_pattern(pattern
)
1269 m_bodyDisjunction
= 0;
1270 m_currentAlternativeIndex
= 0;
1273 BytecodePattern
* compile()
1275 regexBegin(m_pattern
.m_numSubpatterns
, m_pattern
.m_body
->m_callFrameSize
);
1276 emitDisjunction(m_pattern
.m_body
);
1279 return new BytecodePattern(m_bodyDisjunction
, m_allParenthesesInfo
, m_pattern
);
1282 void checkInput(unsigned count
)
1284 m_bodyDisjunction
->terms
.append(ByteTerm::CheckInput(count
));
1287 void assertionBOL(int inputPosition
)
1289 m_bodyDisjunction
->terms
.append(ByteTerm::BOL(inputPosition
));
1292 void assertionEOL(int inputPosition
)
1294 m_bodyDisjunction
->terms
.append(ByteTerm::EOL(inputPosition
));
1297 void assertionWordBoundary(bool invert
, int inputPosition
)
1299 m_bodyDisjunction
->terms
.append(ByteTerm::WordBoundary(invert
, inputPosition
));
1302 void atomPatternCharacter(UChar ch
, int inputPosition
, unsigned frameLocation
, unsigned quantityCount
, QuantifierType quantityType
)
1304 if (m_pattern
.m_ignoreCase
) {
1305 UChar lo
= Unicode::toLower(ch
);
1306 UChar hi
= Unicode::toUpper(ch
);
1309 m_bodyDisjunction
->terms
.append(ByteTerm(lo
, hi
, inputPosition
, frameLocation
, quantityCount
, quantityType
));
1314 m_bodyDisjunction
->terms
.append(ByteTerm(ch
, inputPosition
, frameLocation
, quantityCount
, quantityType
));
1317 void atomCharacterClass(CharacterClass
* characterClass
, bool invert
, int inputPosition
, unsigned frameLocation
, unsigned quantityCount
, QuantifierType quantityType
)
1319 m_bodyDisjunction
->terms
.append(ByteTerm(characterClass
, invert
, inputPosition
));
1321 m_bodyDisjunction
->terms
[m_bodyDisjunction
->terms
.size() - 1].atom
.quantityCount
= quantityCount
;
1322 m_bodyDisjunction
->terms
[m_bodyDisjunction
->terms
.size() - 1].atom
.quantityType
= quantityType
;
1323 m_bodyDisjunction
->terms
[m_bodyDisjunction
->terms
.size() - 1].frameLocation
= frameLocation
;
1326 void atomBackReference(unsigned subpatternId
, int inputPosition
, unsigned frameLocation
, unsigned quantityCount
, QuantifierType quantityType
)
1328 ASSERT(subpatternId
);
1330 m_bodyDisjunction
->terms
.append(ByteTerm::BackReference(subpatternId
, inputPosition
));
1332 m_bodyDisjunction
->terms
[m_bodyDisjunction
->terms
.size() - 1].atom
.quantityCount
= quantityCount
;
1333 m_bodyDisjunction
->terms
[m_bodyDisjunction
->terms
.size() - 1].atom
.quantityType
= quantityType
;
1334 m_bodyDisjunction
->terms
[m_bodyDisjunction
->terms
.size() - 1].frameLocation
= frameLocation
;
1337 void atomParenthesesSubpatternBegin(unsigned subpatternId
, bool capture
, int inputPosition
, unsigned frameLocation
, unsigned alternativeFrameLocation
)
1339 int beginTerm
= m_bodyDisjunction
->terms
.size();
1341 m_bodyDisjunction
->terms
.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin
, subpatternId
, capture
, inputPosition
));
1342 m_bodyDisjunction
->terms
[m_bodyDisjunction
->terms
.size() - 1].frameLocation
= frameLocation
;
1343 m_bodyDisjunction
->terms
.append(ByteTerm::AlternativeBegin());
1344 m_bodyDisjunction
->terms
[m_bodyDisjunction
->terms
.size() - 1].frameLocation
= alternativeFrameLocation
;
1346 m_parenthesesStack
.append(ParenthesesStackEntry(beginTerm
, m_currentAlternativeIndex
));
1347 m_currentAlternativeIndex
= beginTerm
+ 1;
1350 void atomParentheticalAssertionBegin(unsigned subpatternId
, bool invert
, unsigned frameLocation
, unsigned alternativeFrameLocation
)
1352 int beginTerm
= m_bodyDisjunction
->terms
.size();
1354 m_bodyDisjunction
->terms
.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin
, subpatternId
, invert
, 0));
1355 m_bodyDisjunction
->terms
[m_bodyDisjunction
->terms
.size() - 1].frameLocation
= frameLocation
;
1356 m_bodyDisjunction
->terms
.append(ByteTerm::AlternativeBegin());
1357 m_bodyDisjunction
->terms
[m_bodyDisjunction
->terms
.size() - 1].frameLocation
= alternativeFrameLocation
;
1359 m_parenthesesStack
.append(ParenthesesStackEntry(beginTerm
, m_currentAlternativeIndex
));
1360 m_currentAlternativeIndex
= beginTerm
+ 1;
1363 unsigned popParenthesesStack()
1365 ASSERT(m_parenthesesStack
.size());
1366 int stackEnd
= m_parenthesesStack
.size() - 1;
1367 unsigned beginTerm
= m_parenthesesStack
[stackEnd
].beginTerm
;
1368 m_currentAlternativeIndex
= m_parenthesesStack
[stackEnd
].savedAlternativeIndex
;
1369 m_parenthesesStack
.shrink(stackEnd
);
1371 ASSERT(beginTerm
< m_bodyDisjunction
->terms
.size());
1372 ASSERT(m_currentAlternativeIndex
< m_bodyDisjunction
->terms
.size());
1378 void dumpDisjunction(ByteDisjunction
* disjunction
)
1380 printf("ByteDisjunction(%p):\n\t", disjunction
);
1381 for (unsigned i
= 0; i
< disjunction
->terms
.size(); ++i
)
1382 printf("{ %d } ", disjunction
->terms
[i
].type
);
1387 void closeAlternative(int beginTerm
)
1389 int origBeginTerm
= beginTerm
;
1390 ASSERT(m_bodyDisjunction
->terms
[beginTerm
].type
== ByteTerm::TypeAlternativeBegin
);
1391 int endIndex
= m_bodyDisjunction
->terms
.size();
1393 unsigned frameLocation
= m_bodyDisjunction
->terms
[beginTerm
].frameLocation
;
1395 if (!m_bodyDisjunction
->terms
[beginTerm
].alternative
.next
)
1396 m_bodyDisjunction
->terms
.remove(beginTerm
);
1398 while (m_bodyDisjunction
->terms
[beginTerm
].alternative
.next
) {
1399 beginTerm
+= m_bodyDisjunction
->terms
[beginTerm
].alternative
.next
;
1400 ASSERT(m_bodyDisjunction
->terms
[beginTerm
].type
== ByteTerm::TypeAlternativeDisjunction
);
1401 m_bodyDisjunction
->terms
[beginTerm
].alternative
.end
= endIndex
- beginTerm
;
1402 m_bodyDisjunction
->terms
[beginTerm
].frameLocation
= frameLocation
;
1405 m_bodyDisjunction
->terms
[beginTerm
].alternative
.next
= origBeginTerm
- beginTerm
;
1407 m_bodyDisjunction
->terms
.append(ByteTerm::AlternativeEnd());
1408 m_bodyDisjunction
->terms
[endIndex
].frameLocation
= frameLocation
;
1412 void closeBodyAlternative()
1415 int origBeginTerm
= 0;
1416 ASSERT(m_bodyDisjunction
->terms
[beginTerm
].type
== ByteTerm::TypeBodyAlternativeBegin
);
1417 int endIndex
= m_bodyDisjunction
->terms
.size();
1419 unsigned frameLocation
= m_bodyDisjunction
->terms
[beginTerm
].frameLocation
;
1421 while (m_bodyDisjunction
->terms
[beginTerm
].alternative
.next
) {
1422 beginTerm
+= m_bodyDisjunction
->terms
[beginTerm
].alternative
.next
;
1423 ASSERT(m_bodyDisjunction
->terms
[beginTerm
].type
== ByteTerm::TypeBodyAlternativeDisjunction
);
1424 m_bodyDisjunction
->terms
[beginTerm
].alternative
.end
= endIndex
- beginTerm
;
1425 m_bodyDisjunction
->terms
[beginTerm
].frameLocation
= frameLocation
;
1428 m_bodyDisjunction
->terms
[beginTerm
].alternative
.next
= origBeginTerm
- beginTerm
;
1430 m_bodyDisjunction
->terms
.append(ByteTerm::BodyAlternativeEnd());
1431 m_bodyDisjunction
->terms
[endIndex
].frameLocation
= frameLocation
;
1434 void atomParenthesesEnd(bool doInline
, unsigned lastSubpatternId
, int inputPosition
, unsigned frameLocation
, unsigned quantityCount
, QuantifierType quantityType
, unsigned callFrameSize
= 0)
1436 unsigned beginTerm
= popParenthesesStack();
1437 closeAlternative(beginTerm
+ 1);
1438 unsigned endTerm
= m_bodyDisjunction
->terms
.size();
1440 bool isAssertion
= m_bodyDisjunction
->terms
[beginTerm
].type
== ByteTerm::TypeParentheticalAssertionBegin
;
1441 bool invertOrCapture
= m_bodyDisjunction
->terms
[beginTerm
].invertOrCapture
;
1442 unsigned subpatternId
= m_bodyDisjunction
->terms
[beginTerm
].atom
.subpatternId
;
1444 m_bodyDisjunction
->terms
.append(ByteTerm(isAssertion
? ByteTerm::TypeParentheticalAssertionEnd
: ByteTerm::TypeParenthesesSubpatternOnceEnd
, subpatternId
, invertOrCapture
, inputPosition
));
1445 m_bodyDisjunction
->terms
[beginTerm
].atom
.parenthesesWidth
= endTerm
- beginTerm
;
1446 m_bodyDisjunction
->terms
[endTerm
].atom
.parenthesesWidth
= endTerm
- beginTerm
;
1447 m_bodyDisjunction
->terms
[endTerm
].frameLocation
= frameLocation
;
1450 m_bodyDisjunction
->terms
[beginTerm
].atom
.quantityCount
= quantityCount
;
1451 m_bodyDisjunction
->terms
[beginTerm
].atom
.quantityType
= quantityType
;
1452 m_bodyDisjunction
->terms
[endTerm
].atom
.quantityCount
= quantityCount
;
1453 m_bodyDisjunction
->terms
[endTerm
].atom
.quantityType
= quantityType
;
1455 ByteTerm
& parenthesesBegin
= m_bodyDisjunction
->terms
[beginTerm
];
1456 ASSERT(parenthesesBegin
.type
== ByteTerm::TypeParenthesesSubpatternOnceBegin
);
1458 bool invertOrCapture
= parenthesesBegin
.invertOrCapture
;
1459 unsigned subpatternId
= parenthesesBegin
.atom
.subpatternId
;
1461 unsigned numSubpatterns
= lastSubpatternId
- subpatternId
+ 1;
1462 ByteDisjunction
* parenthesesDisjunction
= new ByteDisjunction(numSubpatterns
, callFrameSize
);
1464 parenthesesDisjunction
->terms
.append(ByteTerm::SubpatternBegin());
1465 for (unsigned termInParentheses
= beginTerm
+ 1; termInParentheses
< endTerm
; ++termInParentheses
)
1466 parenthesesDisjunction
->terms
.append(m_bodyDisjunction
->terms
[termInParentheses
]);
1467 parenthesesDisjunction
->terms
.append(ByteTerm::SubpatternEnd());
1469 m_bodyDisjunction
->terms
.shrink(beginTerm
);
1471 m_allParenthesesInfo
.append(parenthesesDisjunction
);
1472 m_bodyDisjunction
->terms
.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern
, subpatternId
, parenthesesDisjunction
, invertOrCapture
, inputPosition
));
1474 m_bodyDisjunction
->terms
[beginTerm
].atom
.quantityCount
= quantityCount
;
1475 m_bodyDisjunction
->terms
[beginTerm
].atom
.quantityType
= quantityType
;
1476 m_bodyDisjunction
->terms
[beginTerm
].frameLocation
= frameLocation
;
1480 void regexBegin(unsigned numSubpatterns
, unsigned callFrameSize
)
1482 m_bodyDisjunction
= new ByteDisjunction(numSubpatterns
, callFrameSize
);
1483 m_bodyDisjunction
->terms
.append(ByteTerm::BodyAlternativeBegin());
1484 m_bodyDisjunction
->terms
[0].frameLocation
= 0;
1485 m_currentAlternativeIndex
= 0;
1490 closeBodyAlternative();
1493 void alternativeBodyDisjunction()
1495 int newAlternativeIndex
= m_bodyDisjunction
->terms
.size();
1496 m_bodyDisjunction
->terms
[m_currentAlternativeIndex
].alternative
.next
= newAlternativeIndex
- m_currentAlternativeIndex
;
1497 m_bodyDisjunction
->terms
.append(ByteTerm::BodyAlternativeDisjunction());
1499 m_currentAlternativeIndex
= newAlternativeIndex
;
1502 void alternativeDisjunction()
1504 int newAlternativeIndex
= m_bodyDisjunction
->terms
.size();
1505 m_bodyDisjunction
->terms
[m_currentAlternativeIndex
].alternative
.next
= newAlternativeIndex
- m_currentAlternativeIndex
;
1506 m_bodyDisjunction
->terms
.append(ByteTerm::AlternativeDisjunction());
1508 m_currentAlternativeIndex
= newAlternativeIndex
;
1511 void emitDisjunction(PatternDisjunction
* disjunction
, unsigned inputCountAlreadyChecked
= 0, unsigned parenthesesInputCountAlreadyChecked
= 0)
1513 for (unsigned alt
= 0; alt
< disjunction
->m_alternatives
.size(); ++alt
) {
1514 unsigned currentCountAlreadyChecked
= inputCountAlreadyChecked
;
1517 if (disjunction
== m_pattern
.m_body
)
1518 alternativeBodyDisjunction();
1520 alternativeDisjunction();
1523 PatternAlternative
* alternative
= disjunction
->m_alternatives
[alt
];
1524 unsigned minimumSize
= alternative
->m_minimumSize
;
1526 ASSERT(minimumSize
>= parenthesesInputCountAlreadyChecked
);
1527 unsigned countToCheck
= minimumSize
- parenthesesInputCountAlreadyChecked
;
1529 checkInput(countToCheck
);
1530 currentCountAlreadyChecked
+= countToCheck
;
1532 for (unsigned i
= 0; i
< alternative
->m_terms
.size(); ++i
) {
1533 PatternTerm
& term
= alternative
->m_terms
[i
];
1535 switch (term
.type
) {
1536 case PatternTerm::TypeAssertionBOL
:
1537 assertionBOL(term
.inputPosition
- currentCountAlreadyChecked
);
1540 case PatternTerm::TypeAssertionEOL
:
1541 assertionEOL(term
.inputPosition
- currentCountAlreadyChecked
);
1544 case PatternTerm::TypeAssertionWordBoundary
:
1545 assertionWordBoundary(term
.invertOrCapture
, term
.inputPosition
- currentCountAlreadyChecked
);
1548 case PatternTerm::TypePatternCharacter
:
1549 atomPatternCharacter(term
.patternCharacter
, term
.inputPosition
- currentCountAlreadyChecked
, term
.frameLocation
, term
.quantityCount
, term
.quantityType
);
1552 case PatternTerm::TypeCharacterClass
:
1553 atomCharacterClass(term
.characterClass
, term
.invertOrCapture
, term
.inputPosition
- currentCountAlreadyChecked
, term
.frameLocation
, term
.quantityCount
, term
.quantityType
);
1556 case PatternTerm::TypeBackReference
:
1557 atomBackReference(term
.subpatternId
, term
.inputPosition
- currentCountAlreadyChecked
, term
.frameLocation
, term
.quantityCount
, term
.quantityType
);
1560 case PatternTerm::TypeForwardReference
:
1563 case PatternTerm::TypeParenthesesSubpattern
: {
1564 unsigned disjunctionAlreadyCheckedCount
= 0;
1565 if ((term
.quantityCount
== 1) && !term
.parentheses
.isCopy
) {
1566 if (term
.quantityType
== QuantifierFixedCount
) {
1567 disjunctionAlreadyCheckedCount
= term
.parentheses
.disjunction
->m_minimumSize
;
1568 unsigned delegateEndInputOffset
= term
.inputPosition
- currentCountAlreadyChecked
;
1569 atomParenthesesSubpatternBegin(term
.parentheses
.subpatternId
, term
.invertOrCapture
, delegateEndInputOffset
- disjunctionAlreadyCheckedCount
, term
.frameLocation
, term
.frameLocation
);
1570 emitDisjunction(term
.parentheses
.disjunction
, currentCountAlreadyChecked
, term
.parentheses
.disjunction
->m_minimumSize
);
1571 atomParenthesesEnd(true, term
.parentheses
.lastSubpatternId
, delegateEndInputOffset
, term
.frameLocation
, term
.quantityCount
, term
.quantityType
, term
.parentheses
.disjunction
->m_callFrameSize
);
1573 unsigned delegateEndInputOffset
= term
.inputPosition
- currentCountAlreadyChecked
;
1574 atomParenthesesSubpatternBegin(term
.parentheses
.subpatternId
, term
.invertOrCapture
, delegateEndInputOffset
- disjunctionAlreadyCheckedCount
, term
.frameLocation
, term
.frameLocation
+ RegexStackSpaceForBackTrackInfoParenthesesOnce
);
1575 emitDisjunction(term
.parentheses
.disjunction
, currentCountAlreadyChecked
, 0);
1576 atomParenthesesEnd(true, term
.parentheses
.lastSubpatternId
, delegateEndInputOffset
, term
.frameLocation
, term
.quantityCount
, term
.quantityType
, term
.parentheses
.disjunction
->m_callFrameSize
);
1579 unsigned delegateEndInputOffset
= term
.inputPosition
- currentCountAlreadyChecked
;
1580 atomParenthesesSubpatternBegin(term
.parentheses
.subpatternId
, term
.invertOrCapture
, delegateEndInputOffset
- disjunctionAlreadyCheckedCount
, term
.frameLocation
, 0);
1581 emitDisjunction(term
.parentheses
.disjunction
, currentCountAlreadyChecked
, 0);
1582 atomParenthesesEnd(false, term
.parentheses
.lastSubpatternId
, delegateEndInputOffset
, term
.frameLocation
, term
.quantityCount
, term
.quantityType
, term
.parentheses
.disjunction
->m_callFrameSize
);
1587 case PatternTerm::TypeParentheticalAssertion
: {
1588 unsigned alternativeFrameLocation
= term
.inputPosition
+ RegexStackSpaceForBackTrackInfoParentheticalAssertion
;
1590 atomParentheticalAssertionBegin(term
.parentheses
.subpatternId
, term
.invertOrCapture
, term
.frameLocation
, alternativeFrameLocation
);
1591 emitDisjunction(term
.parentheses
.disjunction
, currentCountAlreadyChecked
, 0);
1592 atomParenthesesEnd(true, term
.parentheses
.lastSubpatternId
, 0, term
.frameLocation
, term
.quantityCount
, term
.quantityType
);
1601 RegexPattern
& m_pattern
;
1602 ByteDisjunction
* m_bodyDisjunction
;
1603 unsigned m_currentAlternativeIndex
;
1604 Vector
<ParenthesesStackEntry
> m_parenthesesStack
;
1605 Vector
<ByteDisjunction
*> m_allParenthesesInfo
;
1609 BytecodePattern
* byteCompileRegex(const UString
& patternString
, unsigned& numSubpatterns
, const char*& error
, bool ignoreCase
, bool multiline
)
1611 RegexPattern
pattern(ignoreCase
, multiline
);
1613 if ((error
= compileRegex(patternString
, pattern
)))
1616 numSubpatterns
= pattern
.m_numSubpatterns
;
1618 return ByteCompiler(pattern
).compile();
1621 int interpretRegex(BytecodePattern
* regex
, const UChar
* input
, unsigned start
, unsigned length
, int* output
)
1623 return Interpreter(regex
, output
, input
, start
, length
).interpret();
1627 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter
) == (RegexStackSpaceForBackTrackInfoPatternCharacter
* sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoPatternCharacter
);
1628 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass
) == (RegexStackSpaceForBackTrackInfoCharacterClass
* sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoCharacterClass
);
1629 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference
) == (RegexStackSpaceForBackTrackInfoBackReference
* sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoBackReference
);
1630 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative
) == (RegexStackSpaceForBackTrackInfoAlternative
* sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoAlternative
);
1631 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion
) == (RegexStackSpaceForBackTrackInfoParentheticalAssertion
* sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheticalAssertion
);
1632 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce
) == (RegexStackSpaceForBackTrackInfoParenthesesOnce
* sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParenthesesOnce
);
1633 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses
) == (RegexStackSpaceForBackTrackInfoParentheses
* sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheses
);