2 * Copyright (C) 2009 Apple Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 #include "RegexInterpreter.h"
29 #include "RegexCompiler.h"
30 #include "RegexPattern.h"
40 namespace JSC
{ namespace Yarr
{
44 struct ParenthesesDisjunctionContext
;
46 struct BackTrackInfoPatternCharacter
{
47 uintptr_t matchAmount
;
49 struct BackTrackInfoCharacterClass
{
50 uintptr_t matchAmount
;
52 struct BackTrackInfoBackReference
{
53 uintptr_t begin
; // Not really needed for greedy quantifiers.
54 uintptr_t matchAmount
; // Not really needed for fixed quantifiers.
56 struct BackTrackInfoAlternative
{
59 struct BackTrackInfoParentheticalAssertion
{
62 struct BackTrackInfoParenthesesOnce
{
63 uintptr_t inParentheses
;
65 struct BackTrackInfoParentheses
{
66 uintptr_t matchAmount
;
67 ParenthesesDisjunctionContext
* lastContext
;
72 static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses
* backTrack
, ParenthesesDisjunctionContext
* context
)
74 context
->next
= backTrack
->lastContext
;
75 backTrack
->lastContext
= context
;
76 ++backTrack
->matchAmount
;
79 static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses
* backTrack
)
81 ASSERT(backTrack
->matchAmount
);
82 ASSERT(backTrack
->lastContext
);
83 backTrack
->lastContext
= backTrack
->lastContext
->next
;
84 --backTrack
->matchAmount
;
87 struct DisjunctionContext
94 void* operator new(size_t, void* where
)
105 DisjunctionContext
* allocDisjunctionContext(ByteDisjunction
* disjunction
)
107 return new(malloc(sizeof(DisjunctionContext
) + (disjunction
->m_frameSize
- 1) * sizeof(uintptr_t))) DisjunctionContext();
110 void freeDisjunctionContext(DisjunctionContext
* context
)
115 struct ParenthesesDisjunctionContext
117 ParenthesesDisjunctionContext(int* output
, ByteTerm
& term
)
120 unsigned firstSubpatternId
= term
.atom
.subpatternId
;
121 unsigned numNestedSubpatterns
= term
.atom
.parenthesesDisjunction
->m_numSubpatterns
;
123 for (unsigned i
= 0; i
< (numNestedSubpatterns
<< 1); ++i
) {
124 subpatternBackup
[i
] = output
[(firstSubpatternId
<< 1) + i
];
125 output
[(firstSubpatternId
<< 1) + i
] = -1;
128 new(getDisjunctionContext(term
)) DisjunctionContext();
131 void* operator new(size_t, void* where
)
136 void restoreOutput(int* output
, unsigned firstSubpatternId
, unsigned numNestedSubpatterns
)
138 for (unsigned i
= 0; i
< (numNestedSubpatterns
<< 1); ++i
)
139 output
[(firstSubpatternId
<< 1) + i
] = subpatternBackup
[i
];
142 DisjunctionContext
* getDisjunctionContext(ByteTerm
& term
)
144 return reinterpret_cast<DisjunctionContext
*>(&(subpatternBackup
[term
.atom
.parenthesesDisjunction
->m_numSubpatterns
<< 1]));
147 ParenthesesDisjunctionContext
* next
;
148 int subpatternBackup
[1];
151 ParenthesesDisjunctionContext
* allocParenthesesDisjunctionContext(ByteDisjunction
* disjunction
, int* output
, ByteTerm
& term
)
153 return new(malloc(sizeof(ParenthesesDisjunctionContext
) + (((term
.atom
.parenthesesDisjunction
->m_numSubpatterns
<< 1) - 1) * sizeof(int)) + sizeof(DisjunctionContext
) + (disjunction
->m_frameSize
- 1) * sizeof(uintptr_t))) ParenthesesDisjunctionContext(output
, term
);
156 void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext
* context
)
163 InputStream(const UChar
* input
, unsigned start
, unsigned length
)
175 void rewind(unsigned amount
)
177 ASSERT(pos
>= amount
);
183 ASSERT(pos
< length
);
189 int readChecked(int position
)
191 ASSERT(position
< 0);
192 ASSERT((unsigned)-position
<= pos
);
193 unsigned p
= pos
+ position
;
198 int reread(unsigned from
)
200 ASSERT(from
< length
);
206 ASSERT(!(pos
> length
));
208 return input
[pos
- 1];
217 void setPos(unsigned p
)
229 return pos
== length
;
232 bool checkInput(int count
)
234 if ((pos
+ count
) <= length
) {
241 void uncheckInput(int count
)
246 bool atStart(int position
)
248 return (pos
+ position
) == 0;
251 bool atEnd(int position
)
253 return (pos
+ position
) == length
;
262 bool testCharacterClass(CharacterClass
* characterClass
, int ch
)
265 for (unsigned i
= 0; i
< characterClass
->m_matchesUnicode
.size(); ++i
)
266 if (ch
== characterClass
->m_matchesUnicode
[i
])
268 for (unsigned i
= 0; i
< characterClass
->m_rangesUnicode
.size(); ++i
)
269 if ((ch
>= characterClass
->m_rangesUnicode
[i
].begin
) && (ch
<= characterClass
->m_rangesUnicode
[i
].end
))
272 for (unsigned i
= 0; i
< characterClass
->m_matches
.size(); ++i
)
273 if (ch
== characterClass
->m_matches
[i
])
275 for (unsigned i
= 0; i
< characterClass
->m_ranges
.size(); ++i
)
276 if ((ch
>= characterClass
->m_ranges
[i
].begin
) && (ch
<= characterClass
->m_ranges
[i
].end
))
283 bool checkCharacter(int testChar
, int inputPosition
)
285 return testChar
== input
.readChecked(inputPosition
);
288 bool checkCasedCharacter(int loChar
, int hiChar
, int inputPosition
)
290 int ch
= input
.readChecked(inputPosition
);
291 return (loChar
== ch
) || (hiChar
== ch
);
294 bool checkCharacterClass(CharacterClass
* characterClass
, bool invert
, int inputPosition
)
296 bool match
= testCharacterClass(characterClass
, input
.readChecked(inputPosition
));
297 return invert
? !match
: match
;
300 bool tryConsumeBackReference(int matchBegin
, int matchEnd
, int inputOffset
)
302 int matchSize
= matchEnd
- matchBegin
;
304 if (!input
.checkInput(matchSize
))
307 for (int i
= 0; i
< matchSize
; ++i
) {
308 if (!checkCharacter(input
.reread(matchBegin
+ i
), inputOffset
- matchSize
+ i
)) {
309 input
.uncheckInput(matchSize
);
317 bool matchAssertionBOL(ByteTerm
& term
)
319 return (input
.atStart(term
.inputPosition
)) || (pattern
->m_multiline
&& testCharacterClass(pattern
->newlineCharacterClass
, input
.readChecked(term
.inputPosition
- 1)));
322 bool matchAssertionEOL(ByteTerm
& term
)
324 if (term
.inputPosition
)
325 return (input
.atEnd(term
.inputPosition
)) || (pattern
->m_multiline
&& testCharacterClass(pattern
->newlineCharacterClass
, input
.readChecked(term
.inputPosition
)));
327 return (input
.atEnd()) || (pattern
->m_multiline
&& testCharacterClass(pattern
->newlineCharacterClass
, input
.read()));
330 bool matchAssertionWordBoundary(ByteTerm
& term
)
332 bool prevIsWordchar
= !input
.atStart(term
.inputPosition
) && testCharacterClass(pattern
->wordcharCharacterClass
, input
.readChecked(term
.inputPosition
- 1));
334 if (term
.inputPosition
)
335 readIsWordchar
= !input
.atEnd(term
.inputPosition
) && testCharacterClass(pattern
->wordcharCharacterClass
, input
.readChecked(term
.inputPosition
));
337 readIsWordchar
= !input
.atEnd() && testCharacterClass(pattern
->wordcharCharacterClass
, input
.read());
339 bool wordBoundary
= prevIsWordchar
!= readIsWordchar
;
340 return term
.invert() ? !wordBoundary
: wordBoundary
;
343 bool backtrackPatternCharacter(ByteTerm
& term
, DisjunctionContext
* context
)
345 BackTrackInfoPatternCharacter
* backTrack
= reinterpret_cast<BackTrackInfoPatternCharacter
*>(context
->frame
+ term
.frameLocation
);
347 switch (term
.atom
.quantityType
) {
348 case QuantifierFixedCount
:
351 case QuantifierGreedy
:
352 if (backTrack
->matchAmount
) {
353 --backTrack
->matchAmount
;
354 input
.uncheckInput(1);
359 case QuantifierNonGreedy
:
360 if ((backTrack
->matchAmount
< term
.atom
.quantityCount
) && input
.checkInput(1)) {
361 ++backTrack
->matchAmount
;
362 if (checkCharacter(term
.atom
.patternCharacter
, term
.inputPosition
- 1))
365 input
.uncheckInput(backTrack
->matchAmount
);
372 bool backtrackPatternCasedCharacter(ByteTerm
& term
, DisjunctionContext
* context
)
374 BackTrackInfoPatternCharacter
* backTrack
= reinterpret_cast<BackTrackInfoPatternCharacter
*>(context
->frame
+ term
.frameLocation
);
376 switch (term
.atom
.quantityType
) {
377 case QuantifierFixedCount
:
380 case QuantifierGreedy
:
381 if (backTrack
->matchAmount
) {
382 --backTrack
->matchAmount
;
383 input
.uncheckInput(1);
388 case QuantifierNonGreedy
:
389 if ((backTrack
->matchAmount
< term
.atom
.quantityCount
) && input
.checkInput(1)) {
390 ++backTrack
->matchAmount
;
391 if (checkCasedCharacter(term
.atom
.casedCharacter
.lo
, term
.atom
.casedCharacter
.hi
, term
.inputPosition
- 1))
394 input
.uncheckInput(backTrack
->matchAmount
);
401 bool matchCharacterClass(ByteTerm
& term
, DisjunctionContext
* context
)
403 ASSERT(term
.type
== ByteTerm::TypeCharacterClass
);
404 BackTrackInfoPatternCharacter
* backTrack
= reinterpret_cast<BackTrackInfoPatternCharacter
*>(context
->frame
+ term
.frameLocation
);
406 switch (term
.atom
.quantityType
) {
407 case QuantifierFixedCount
: {
408 for (unsigned matchAmount
= 0; matchAmount
< term
.atom
.quantityCount
; ++matchAmount
) {
409 if (!checkCharacterClass(term
.atom
.characterClass
, term
.invert(), term
.inputPosition
+ matchAmount
))
415 case QuantifierGreedy
: {
416 unsigned matchAmount
= 0;
417 while ((matchAmount
< term
.atom
.quantityCount
) && input
.checkInput(1)) {
418 if (!checkCharacterClass(term
.atom
.characterClass
, term
.invert(), term
.inputPosition
- 1)) {
419 input
.uncheckInput(1);
424 backTrack
->matchAmount
= matchAmount
;
429 case QuantifierNonGreedy
:
430 backTrack
->matchAmount
= 0;
434 ASSERT_NOT_REACHED();
438 bool backtrackCharacterClass(ByteTerm
& term
, DisjunctionContext
* context
)
440 ASSERT(term
.type
== ByteTerm::TypeCharacterClass
);
441 BackTrackInfoPatternCharacter
* backTrack
= reinterpret_cast<BackTrackInfoPatternCharacter
*>(context
->frame
+ term
.frameLocation
);
443 switch (term
.atom
.quantityType
) {
444 case QuantifierFixedCount
:
447 case QuantifierGreedy
:
448 if (backTrack
->matchAmount
) {
449 --backTrack
->matchAmount
;
450 input
.uncheckInput(1);
455 case QuantifierNonGreedy
:
456 if ((backTrack
->matchAmount
< term
.atom
.quantityCount
) && input
.checkInput(1)) {
457 ++backTrack
->matchAmount
;
458 if (checkCharacterClass(term
.atom
.characterClass
, term
.invert(), term
.inputPosition
- 1))
461 input
.uncheckInput(backTrack
->matchAmount
);
468 bool matchBackReference(ByteTerm
& term
, DisjunctionContext
* context
)
470 ASSERT(term
.type
== ByteTerm::TypeBackReference
);
471 BackTrackInfoBackReference
* backTrack
= reinterpret_cast<BackTrackInfoBackReference
*>(context
->frame
+ term
.frameLocation
);
473 int matchBegin
= output
[(term
.atom
.subpatternId
<< 1)];
474 int matchEnd
= output
[(term
.atom
.subpatternId
<< 1) + 1];
475 ASSERT((matchBegin
== -1) == (matchEnd
== -1));
476 ASSERT(matchBegin
<= matchEnd
);
478 if (matchBegin
== matchEnd
)
481 switch (term
.atom
.quantityType
) {
482 case QuantifierFixedCount
: {
483 backTrack
->begin
= input
.getPos();
484 for (unsigned matchAmount
= 0; matchAmount
< term
.atom
.quantityCount
; ++matchAmount
) {
485 if (!tryConsumeBackReference(matchBegin
, matchEnd
, term
.inputPosition
)) {
486 input
.setPos(backTrack
->begin
);
493 case QuantifierGreedy
: {
494 unsigned matchAmount
= 0;
495 while ((matchAmount
< term
.atom
.quantityCount
) && tryConsumeBackReference(matchBegin
, matchEnd
, term
.inputPosition
))
497 backTrack
->matchAmount
= matchAmount
;
501 case QuantifierNonGreedy
:
502 backTrack
->begin
= input
.getPos();
503 backTrack
->matchAmount
= 0;
507 ASSERT_NOT_REACHED();
511 bool backtrackBackReference(ByteTerm
& term
, DisjunctionContext
* context
)
513 ASSERT(term
.type
== ByteTerm::TypeBackReference
);
514 BackTrackInfoBackReference
* backTrack
= reinterpret_cast<BackTrackInfoBackReference
*>(context
->frame
+ term
.frameLocation
);
516 int matchBegin
= output
[(term
.atom
.subpatternId
<< 1)];
517 int matchEnd
= output
[(term
.atom
.subpatternId
<< 1) + 1];
518 ASSERT((matchBegin
== -1) == (matchEnd
== -1));
519 ASSERT(matchBegin
<= matchEnd
);
521 if (matchBegin
== matchEnd
)
524 switch (term
.atom
.quantityType
) {
525 case QuantifierFixedCount
:
526 // for quantityCount == 1, could rewind.
527 input
.setPos(backTrack
->begin
);
530 case QuantifierGreedy
:
531 if (backTrack
->matchAmount
) {
532 --backTrack
->matchAmount
;
533 input
.rewind(matchEnd
- matchBegin
);
538 case QuantifierNonGreedy
:
539 if ((backTrack
->matchAmount
< term
.atom
.quantityCount
) && tryConsumeBackReference(matchBegin
, matchEnd
, term
.inputPosition
)) {
540 ++backTrack
->matchAmount
;
543 input
.setPos(backTrack
->begin
);
550 void recordParenthesesMatch(ByteTerm
& term
, ParenthesesDisjunctionContext
* context
)
552 if (term
.capture()) {
553 unsigned subpatternId
= term
.atom
.subpatternId
;
554 output
[(subpatternId
<< 1)] = context
->getDisjunctionContext(term
)->matchBegin
+ term
.inputPosition
;
555 output
[(subpatternId
<< 1) + 1] = context
->getDisjunctionContext(term
)->matchEnd
+ term
.inputPosition
;
558 void resetMatches(ByteTerm
& term
, ParenthesesDisjunctionContext
* context
)
560 unsigned firstSubpatternId
= term
.atom
.subpatternId
;
561 unsigned count
= term
.atom
.parenthesesDisjunction
->m_numSubpatterns
;
562 context
->restoreOutput(output
, firstSubpatternId
, count
);
564 void resetAssertionMatches(ByteTerm
& term
)
566 unsigned firstSubpatternId
= term
.atom
.subpatternId
;
567 unsigned count
= term
.atom
.parenthesesDisjunction
->m_numSubpatterns
;
568 for (unsigned i
= 0; i
< (count
<< 1); ++i
)
569 output
[(firstSubpatternId
<< 1) + i
] = -1;
571 bool parenthesesDoBacktrack(ByteTerm
& term
, BackTrackInfoParentheses
* backTrack
)
573 while (backTrack
->matchAmount
) {
574 ParenthesesDisjunctionContext
* context
= backTrack
->lastContext
;
576 if (matchDisjunction(term
.atom
.parenthesesDisjunction
, context
->getDisjunctionContext(term
), true))
579 resetMatches(term
, context
);
580 popParenthesesDisjunctionContext(backTrack
);
581 freeParenthesesDisjunctionContext(context
);
587 bool matchParenthesesOnceBegin(ByteTerm
& term
, DisjunctionContext
* context
)
589 ASSERT(term
.type
== ByteTerm::TypeParenthesesSubpatternOnceBegin
);
590 ASSERT(term
.atom
.quantityCount
== 1);
592 BackTrackInfoParenthesesOnce
* backTrack
= reinterpret_cast<BackTrackInfoParenthesesOnce
*>(context
->frame
+ term
.frameLocation
);
594 switch (term
.atom
.quantityType
) {
595 case QuantifierGreedy
: {
596 // set this speculatively; if we get to the parens end this will be true.
597 backTrack
->inParentheses
= 1;
600 case QuantifierNonGreedy
: {
601 backTrack
->inParentheses
= 0;
602 context
->term
+= term
.atom
.parenthesesWidth
;
605 case QuantifierFixedCount
:
609 if (term
.capture()) {
610 unsigned subpatternId
= term
.atom
.subpatternId
;
611 output
[(subpatternId
<< 1)] = input
.getPos() + term
.inputPosition
;
617 bool matchParenthesesOnceEnd(ByteTerm
& term
, DisjunctionContext
*)
619 ASSERT(term
.type
== ByteTerm::TypeParenthesesSubpatternOnceEnd
);
620 ASSERT(term
.atom
.quantityCount
== 1);
622 if (term
.capture()) {
623 unsigned subpatternId
= term
.atom
.subpatternId
;
624 output
[(subpatternId
<< 1) + 1] = input
.getPos() + term
.inputPosition
;
629 bool backtrackParenthesesOnceBegin(ByteTerm
& term
, DisjunctionContext
* context
)
631 ASSERT(term
.type
== ByteTerm::TypeParenthesesSubpatternOnceBegin
);
632 ASSERT(term
.atom
.quantityCount
== 1);
634 BackTrackInfoParenthesesOnce
* backTrack
= reinterpret_cast<BackTrackInfoParenthesesOnce
*>(context
->frame
+ term
.frameLocation
);
636 if (term
.capture()) {
637 unsigned subpatternId
= term
.atom
.subpatternId
;
638 output
[(subpatternId
<< 1)] = -1;
639 output
[(subpatternId
<< 1) + 1] = -1;
642 switch (term
.atom
.quantityType
) {
643 case QuantifierGreedy
:
644 // if we backtrack to this point, there is another chance - try matching nothing.
645 ASSERT(backTrack
->inParentheses
);
646 backTrack
->inParentheses
= 0;
647 context
->term
+= term
.atom
.parenthesesWidth
;
649 case QuantifierNonGreedy
:
650 ASSERT(backTrack
->inParentheses
);
651 case QuantifierFixedCount
:
658 bool backtrackParenthesesOnceEnd(ByteTerm
& term
, DisjunctionContext
* context
)
660 ASSERT(term
.type
== ByteTerm::TypeParenthesesSubpatternOnceEnd
);
661 ASSERT(term
.atom
.quantityCount
== 1);
663 BackTrackInfoParenthesesOnce
* backTrack
= reinterpret_cast<BackTrackInfoParenthesesOnce
*>(context
->frame
+ term
.frameLocation
);
665 switch (term
.atom
.quantityType
) {
666 case QuantifierGreedy
:
667 if (!backTrack
->inParentheses
) {
668 context
->term
-= term
.atom
.parenthesesWidth
;
671 case QuantifierNonGreedy
:
672 if (!backTrack
->inParentheses
) {
673 // now try to match the parens; set this speculatively.
674 backTrack
->inParentheses
= 1;
675 if (term
.capture()) {
676 unsigned subpatternId
= term
.atom
.subpatternId
;
677 output
[(subpatternId
<< 1) + 1] = input
.getPos() + term
.inputPosition
;
679 context
->term
-= term
.atom
.parenthesesWidth
;
682 case QuantifierFixedCount
:
689 bool matchParentheticalAssertionBegin(ByteTerm
& term
, DisjunctionContext
* context
)
691 ASSERT(term
.type
== ByteTerm::TypeParentheticalAssertionBegin
);
692 ASSERT(term
.atom
.quantityCount
== 1);
694 BackTrackInfoParentheticalAssertion
* backTrack
= reinterpret_cast<BackTrackInfoParentheticalAssertion
*>(context
->frame
+ term
.frameLocation
);
696 backTrack
->begin
= input
.getPos();
700 bool matchParentheticalAssertionEnd(ByteTerm
& term
, DisjunctionContext
* context
)
702 ASSERT(term
.type
== ByteTerm::TypeParentheticalAssertionEnd
);
703 ASSERT(term
.atom
.quantityCount
== 1);
705 BackTrackInfoParentheticalAssertion
* backTrack
= reinterpret_cast<BackTrackInfoParentheticalAssertion
*>(context
->frame
+ term
.frameLocation
);
707 input
.setPos(backTrack
->begin
);
709 // We've reached the end of the parens; if they are inverted, this is failure.
711 context
->term
-= term
.atom
.parenthesesWidth
;
718 bool backtrackParentheticalAssertionBegin(ByteTerm
& term
, DisjunctionContext
* context
)
720 ASSERT(term
.type
== ByteTerm::TypeParentheticalAssertionBegin
);
721 ASSERT(term
.atom
.quantityCount
== 1);
723 // We've failed to match parens; if they are inverted, this is win!
725 context
->term
+= term
.atom
.parenthesesWidth
;
732 bool backtrackParentheticalAssertionEnd(ByteTerm
& term
, DisjunctionContext
* context
)
734 ASSERT(term
.type
== ByteTerm::TypeParentheticalAssertionEnd
);
735 ASSERT(term
.atom
.quantityCount
== 1);
737 BackTrackInfoParentheticalAssertion
* backTrack
= reinterpret_cast<BackTrackInfoParentheticalAssertion
*>(context
->frame
+ term
.frameLocation
);
739 input
.setPos(backTrack
->begin
);
741 context
->term
-= term
.atom
.parenthesesWidth
;
745 bool matchParentheses(ByteTerm
& term
, DisjunctionContext
* context
)
747 ASSERT(term
.type
== ByteTerm::TypeParenthesesSubpattern
);
749 BackTrackInfoParentheses
* backTrack
= reinterpret_cast<BackTrackInfoParentheses
*>(context
->frame
+ term
.frameLocation
);
751 unsigned subpatternId
= term
.atom
.subpatternId
;
752 ByteDisjunction
* disjunctionBody
= term
.atom
.parenthesesDisjunction
;
754 backTrack
->prevBegin
= output
[(subpatternId
<< 1)];
755 backTrack
->prevEnd
= output
[(subpatternId
<< 1) + 1];
757 backTrack
->matchAmount
= 0;
758 backTrack
->lastContext
= 0;
760 switch (term
.atom
.quantityType
) {
761 case QuantifierFixedCount
: {
762 // While we haven't yet reached our fixed limit,
763 while (backTrack
->matchAmount
< term
.atom
.quantityCount
) {
764 // Try to do a match, and it it succeeds, add it to the list.
765 ParenthesesDisjunctionContext
* context
= allocParenthesesDisjunctionContext(disjunctionBody
, output
, term
);
766 if (matchDisjunction(disjunctionBody
, context
->getDisjunctionContext(term
)))
767 appendParenthesesDisjunctionContext(backTrack
, context
);
769 // The match failed; try to find an alternate point to carry on from.
770 resetMatches(term
, context
);
771 freeParenthesesDisjunctionContext(context
);
772 if (!parenthesesDoBacktrack(term
, backTrack
))
777 ASSERT(backTrack
->matchAmount
== term
.atom
.quantityCount
);
778 ParenthesesDisjunctionContext
* context
= backTrack
->lastContext
;
779 recordParenthesesMatch(term
, context
);
783 case QuantifierGreedy
: {
784 while (backTrack
->matchAmount
< term
.atom
.quantityCount
) {
785 ParenthesesDisjunctionContext
* context
= allocParenthesesDisjunctionContext(disjunctionBody
, output
, term
);
786 if (matchNonZeroDisjunction(disjunctionBody
, context
->getDisjunctionContext(term
)))
787 appendParenthesesDisjunctionContext(backTrack
, context
);
789 resetMatches(term
, context
);
790 freeParenthesesDisjunctionContext(context
);
795 if (backTrack
->matchAmount
) {
796 ParenthesesDisjunctionContext
* context
= backTrack
->lastContext
;
797 recordParenthesesMatch(term
, context
);
802 case QuantifierNonGreedy
:
806 ASSERT_NOT_REACHED();
810 // Rules for backtracking differ depending on whether this is greedy or non-greedy.
812 // Greedy matches never should try just adding more - you should already have done
813 // the 'more' cases. Always backtrack, at least a leetle bit. However cases where
814 // you backtrack an item off the list needs checking, since we'll never have matched
815 // the one less case. Tracking forwards, still add as much as possible.
817 // Non-greedy, we've already done the one less case, so don't match on popping.
818 // We haven't done the one more case, so always try to add that.
820 bool backtrackParentheses(ByteTerm
& term
, DisjunctionContext
* context
)
822 ASSERT(term
.type
== ByteTerm::TypeParenthesesSubpattern
);
824 BackTrackInfoParentheses
* backTrack
= reinterpret_cast<BackTrackInfoParentheses
*>(context
->frame
+ term
.frameLocation
);
826 if (term
.capture()) {
827 unsigned subpatternId
= term
.atom
.subpatternId
;
828 output
[(subpatternId
<< 1)] = backTrack
->prevBegin
;
829 output
[(subpatternId
<< 1) + 1] = backTrack
->prevEnd
;
832 ByteDisjunction
* disjunctionBody
= term
.atom
.parenthesesDisjunction
;
834 switch (term
.atom
.quantityType
) {
835 case QuantifierFixedCount
: {
836 ASSERT(backTrack
->matchAmount
== term
.atom
.quantityCount
);
838 ParenthesesDisjunctionContext
* context
= 0;
840 if (!parenthesesDoBacktrack(term
, backTrack
))
843 // While we haven't yet reached our fixed limit,
844 while (backTrack
->matchAmount
< term
.atom
.quantityCount
) {
845 // Try to do a match, and it it succeeds, add it to the list.
846 context
= allocParenthesesDisjunctionContext(disjunctionBody
, output
, term
);
847 if (matchDisjunction(disjunctionBody
, context
->getDisjunctionContext(term
)))
848 appendParenthesesDisjunctionContext(backTrack
, context
);
850 // The match failed; try to find an alternate point to carry on from.
851 resetMatches(term
, context
);
852 freeParenthesesDisjunctionContext(context
);
853 if (!parenthesesDoBacktrack(term
, backTrack
))
858 ASSERT(backTrack
->matchAmount
== term
.atom
.quantityCount
);
859 context
= backTrack
->lastContext
;
860 recordParenthesesMatch(term
, context
);
864 case QuantifierGreedy
: {
865 if (!backTrack
->matchAmount
)
868 ParenthesesDisjunctionContext
* context
= backTrack
->lastContext
;
869 if (matchNonZeroDisjunction(disjunctionBody
, context
->getDisjunctionContext(term
), true)) {
870 while (backTrack
->matchAmount
< term
.atom
.quantityCount
) {
871 ParenthesesDisjunctionContext
* context
= allocParenthesesDisjunctionContext(disjunctionBody
, output
, term
);
872 if (matchNonZeroDisjunction(disjunctionBody
, context
->getDisjunctionContext(term
)))
873 appendParenthesesDisjunctionContext(backTrack
, context
);
875 resetMatches(term
, context
);
876 freeParenthesesDisjunctionContext(context
);
881 resetMatches(term
, context
);
882 popParenthesesDisjunctionContext(backTrack
);
883 freeParenthesesDisjunctionContext(context
);
886 if (backTrack
->matchAmount
) {
887 ParenthesesDisjunctionContext
* context
= backTrack
->lastContext
;
888 recordParenthesesMatch(term
, context
);
893 case QuantifierNonGreedy
: {
894 // If we've not reached the limit, try to add one more match.
895 if (backTrack
->matchAmount
< term
.atom
.quantityCount
) {
896 ParenthesesDisjunctionContext
* context
= allocParenthesesDisjunctionContext(disjunctionBody
, output
, term
);
897 if (matchNonZeroDisjunction(disjunctionBody
, context
->getDisjunctionContext(term
))) {
898 appendParenthesesDisjunctionContext(backTrack
, context
);
899 recordParenthesesMatch(term
, context
);
902 resetMatches(term
, context
);
903 freeParenthesesDisjunctionContext(context
);
907 // Nope - okay backtrack looking for an alternative.
908 while (backTrack
->matchAmount
) {
909 ParenthesesDisjunctionContext
* context
= backTrack
->lastContext
;
910 if (matchNonZeroDisjunction(disjunctionBody
, context
->getDisjunctionContext(term
), true)) {
911 // successful backtrack! we're back in the game!
912 if (backTrack
->matchAmount
) {
913 context
= backTrack
->lastContext
;
914 recordParenthesesMatch(term
, context
);
919 // pop a match off the stack
920 resetMatches(term
, context
);
921 popParenthesesDisjunctionContext(backTrack
);
922 freeParenthesesDisjunctionContext(context
);
929 ASSERT_NOT_REACHED();
933 #define MATCH_NEXT() { ++context->term; goto matchAgain; }
934 #define BACKTRACK() { --context->term; goto backtrack; }
935 #define currentTerm() (disjunction->terms[context->term])
936 bool matchDisjunction(ByteDisjunction
* disjunction
, DisjunctionContext
* context
, bool btrack
= false)
941 context
->matchBegin
= input
.getPos();
945 ASSERT(context
->term
< static_cast<int>(disjunction
->terms
.size()));
947 switch (currentTerm().type
) {
948 case ByteTerm::TypeSubpatternBegin
:
950 case ByteTerm::TypeSubpatternEnd
:
951 context
->matchEnd
= input
.getPos();
954 case ByteTerm::TypeBodyAlternativeBegin
:
956 case ByteTerm::TypeBodyAlternativeDisjunction
:
957 case ByteTerm::TypeBodyAlternativeEnd
:
958 context
->matchEnd
= input
.getPos();
961 case ByteTerm::TypeAlternativeBegin
:
963 case ByteTerm::TypeAlternativeDisjunction
:
964 case ByteTerm::TypeAlternativeEnd
: {
965 int offset
= currentTerm().alternative
.end
;
966 BackTrackInfoAlternative
* backTrack
= reinterpret_cast<BackTrackInfoAlternative
*>(context
->frame
+ currentTerm().frameLocation
);
967 backTrack
->offset
= offset
;
968 context
->term
+= offset
;
972 case ByteTerm::TypeAssertionBOL
:
973 if (matchAssertionBOL(currentTerm()))
976 case ByteTerm::TypeAssertionEOL
:
977 if (matchAssertionEOL(currentTerm()))
980 case ByteTerm::TypeAssertionWordBoundary
:
981 if (matchAssertionWordBoundary(currentTerm()))
985 case ByteTerm::TypePatternCharacterOnce
:
986 case ByteTerm::TypePatternCharacterFixed
: {
987 for (unsigned matchAmount
= 0; matchAmount
< currentTerm().atom
.quantityCount
; ++matchAmount
) {
988 if (!checkCharacter(currentTerm().atom
.patternCharacter
, currentTerm().inputPosition
+ matchAmount
))
993 case ByteTerm::TypePatternCharacterGreedy
: {
994 BackTrackInfoPatternCharacter
* backTrack
= reinterpret_cast<BackTrackInfoPatternCharacter
*>(context
->frame
+ currentTerm().frameLocation
);
995 unsigned matchAmount
= 0;
996 while ((matchAmount
< currentTerm().atom
.quantityCount
) && input
.checkInput(1)) {
997 if (!checkCharacter(currentTerm().atom
.patternCharacter
, currentTerm().inputPosition
- 1)) {
998 input
.uncheckInput(1);
1003 backTrack
->matchAmount
= matchAmount
;
1007 case ByteTerm::TypePatternCharacterNonGreedy
: {
1008 BackTrackInfoPatternCharacter
* backTrack
= reinterpret_cast<BackTrackInfoPatternCharacter
*>(context
->frame
+ currentTerm().frameLocation
);
1009 backTrack
->matchAmount
= 0;
1013 case ByteTerm::TypePatternCasedCharacterOnce
:
1014 case ByteTerm::TypePatternCasedCharacterFixed
: {
1015 for (unsigned matchAmount
= 0; matchAmount
< currentTerm().atom
.quantityCount
; ++matchAmount
) {
1016 if (!checkCasedCharacter(currentTerm().atom
.casedCharacter
.lo
, currentTerm().atom
.casedCharacter
.hi
, currentTerm().inputPosition
+ matchAmount
))
1021 case ByteTerm::TypePatternCasedCharacterGreedy
: {
1022 BackTrackInfoPatternCharacter
* backTrack
= reinterpret_cast<BackTrackInfoPatternCharacter
*>(context
->frame
+ currentTerm().frameLocation
);
1023 unsigned matchAmount
= 0;
1024 while ((matchAmount
< currentTerm().atom
.quantityCount
) && input
.checkInput(1)) {
1025 if (!checkCasedCharacter(currentTerm().atom
.casedCharacter
.lo
, currentTerm().atom
.casedCharacter
.hi
, currentTerm().inputPosition
- 1)) {
1026 input
.uncheckInput(1);
1031 backTrack
->matchAmount
= matchAmount
;
1035 case ByteTerm::TypePatternCasedCharacterNonGreedy
: {
1036 BackTrackInfoPatternCharacter
* backTrack
= reinterpret_cast<BackTrackInfoPatternCharacter
*>(context
->frame
+ currentTerm().frameLocation
);
1037 backTrack
->matchAmount
= 0;
1041 case ByteTerm::TypeCharacterClass
:
1042 if (matchCharacterClass(currentTerm(), context
))
1045 case ByteTerm::TypeBackReference
:
1046 if (matchBackReference(currentTerm(), context
))
1049 case ByteTerm::TypeParenthesesSubpattern
:
1050 if (matchParentheses(currentTerm(), context
))
1053 case ByteTerm::TypeParenthesesSubpatternOnceBegin
:
1054 if (matchParenthesesOnceBegin(currentTerm(), context
))
1057 case ByteTerm::TypeParenthesesSubpatternOnceEnd
:
1058 if (matchParenthesesOnceEnd(currentTerm(), context
))
1061 case ByteTerm::TypeParentheticalAssertionBegin
:
1062 if (matchParentheticalAssertionBegin(currentTerm(), context
))
1065 case ByteTerm::TypeParentheticalAssertionEnd
:
1066 if (matchParentheticalAssertionEnd(currentTerm(), context
))
1070 case ByteTerm::TypeCheckInput
:
1071 if (input
.checkInput(currentTerm().checkInputCount
))
1076 // We should never fall-through to here.
1077 ASSERT_NOT_REACHED();
1080 ASSERT(context
->term
< static_cast<int>(disjunction
->terms
.size()));
1082 switch (currentTerm().type
) {
1083 case ByteTerm::TypeSubpatternBegin
:
1085 case ByteTerm::TypeSubpatternEnd
:
1086 ASSERT_NOT_REACHED();
1088 case ByteTerm::TypeBodyAlternativeBegin
:
1089 case ByteTerm::TypeBodyAlternativeDisjunction
: {
1090 int offset
= currentTerm().alternative
.next
;
1091 context
->term
+= offset
;
1099 context
->matchBegin
= input
.getPos();
1102 case ByteTerm::TypeBodyAlternativeEnd
:
1103 ASSERT_NOT_REACHED();
1105 case ByteTerm::TypeAlternativeBegin
:
1106 case ByteTerm::TypeAlternativeDisjunction
: {
1107 int offset
= currentTerm().alternative
.next
;
1108 context
->term
+= offset
;
1113 case ByteTerm::TypeAlternativeEnd
: {
1114 // We should never backtrack back into an alternative of the main body of the regex.
1115 BackTrackInfoAlternative
* backTrack
= reinterpret_cast<BackTrackInfoAlternative
*>(context
->frame
+ currentTerm().frameLocation
);
1116 unsigned offset
= backTrack
->offset
;
1117 context
->term
-= offset
;
1121 case ByteTerm::TypeAssertionBOL
:
1122 case ByteTerm::TypeAssertionEOL
:
1123 case ByteTerm::TypeAssertionWordBoundary
:
1126 case ByteTerm::TypePatternCharacterOnce
:
1127 case ByteTerm::TypePatternCharacterFixed
:
1128 case ByteTerm::TypePatternCharacterGreedy
:
1129 case ByteTerm::TypePatternCharacterNonGreedy
:
1130 if (backtrackPatternCharacter(currentTerm(), context
))
1133 case ByteTerm::TypePatternCasedCharacterOnce
:
1134 case ByteTerm::TypePatternCasedCharacterFixed
:
1135 case ByteTerm::TypePatternCasedCharacterGreedy
:
1136 case ByteTerm::TypePatternCasedCharacterNonGreedy
:
1137 if (backtrackPatternCasedCharacter(currentTerm(), context
))
1140 case ByteTerm::TypeCharacterClass
:
1141 if (backtrackCharacterClass(currentTerm(), context
))
1144 case ByteTerm::TypeBackReference
:
1145 if (backtrackBackReference(currentTerm(), context
))
1148 case ByteTerm::TypeParenthesesSubpattern
:
1149 if (backtrackParentheses(currentTerm(), context
))
1152 case ByteTerm::TypeParenthesesSubpatternOnceBegin
:
1153 if (backtrackParenthesesOnceBegin(currentTerm(), context
))
1156 case ByteTerm::TypeParenthesesSubpatternOnceEnd
:
1157 if (backtrackParenthesesOnceEnd(currentTerm(), context
))
1160 case ByteTerm::TypeParentheticalAssertionBegin
:
1161 if (backtrackParentheticalAssertionBegin(currentTerm(), context
))
1164 case ByteTerm::TypeParentheticalAssertionEnd
:
1165 if (backtrackParentheticalAssertionEnd(currentTerm(), context
))
1169 case ByteTerm::TypeCheckInput
:
1170 input
.uncheckInput(currentTerm().checkInputCount
);
1174 ASSERT_NOT_REACHED();
1178 bool matchNonZeroDisjunction(ByteDisjunction
* disjunction
, DisjunctionContext
* context
, bool btrack
= false)
1180 if (matchDisjunction(disjunction
, context
, btrack
)) {
1181 while (context
->matchBegin
== context
->matchEnd
) {
1182 if (!matchDisjunction(disjunction
, context
, true))
1193 for (unsigned i
= 0; i
< ((pattern
->m_body
->m_numSubpatterns
+ 1) << 1); ++i
)
1196 DisjunctionContext
* context
= allocDisjunctionContext(pattern
->m_body
.get());
1198 if (matchDisjunction(pattern
->m_body
.get(), context
)) {
1199 output
[0] = context
->matchBegin
;
1200 output
[1] = context
->matchEnd
;
1203 freeDisjunctionContext(context
);
1208 Interpreter(BytecodePattern
* pattern
, int* output
, const UChar
* inputChar
, unsigned start
, unsigned length
)
1211 , input(inputChar
, start
, length
)
1216 BytecodePattern
*pattern
;
1223 class ByteCompiler
{
1224 struct ParenthesesStackEntry
{
1226 unsigned savedAlternativeIndex
;
1227 ParenthesesStackEntry(unsigned beginTerm
, unsigned savedAlternativeIndex
/*, unsigned subpatternId, bool capture = false*/)
1228 : beginTerm(beginTerm
)
1229 , savedAlternativeIndex(savedAlternativeIndex
)
1235 ByteCompiler(RegexPattern
& pattern
)
1236 : m_pattern(pattern
)
1238 m_bodyDisjunction
= 0;
1239 m_currentAlternativeIndex
= 0;
1242 BytecodePattern
* compile()
1244 regexBegin(m_pattern
.m_numSubpatterns
, m_pattern
.m_body
->m_callFrameSize
);
1245 emitDisjunction(m_pattern
.m_body
);
1248 return new BytecodePattern(m_bodyDisjunction
, m_allParenthesesInfo
, m_pattern
);
1251 void checkInput(unsigned count
)
1253 m_bodyDisjunction
->terms
.append(ByteTerm::CheckInput(count
));
1256 void assertionBOL(int inputPosition
)
1258 m_bodyDisjunction
->terms
.append(ByteTerm::BOL(inputPosition
));
1261 void assertionEOL(int inputPosition
)
1263 m_bodyDisjunction
->terms
.append(ByteTerm::EOL(inputPosition
));
1266 void assertionWordBoundary(bool invert
, int inputPosition
)
1268 m_bodyDisjunction
->terms
.append(ByteTerm::WordBoundary(invert
, inputPosition
));
1271 void atomPatternCharacter(UChar ch
, int inputPosition
, unsigned frameLocation
, unsigned quantityCount
, QuantifierType quantityType
)
1273 if (m_pattern
.m_ignoreCase
) {
1274 UChar lo
= Unicode::toLower(ch
);
1275 UChar hi
= Unicode::toUpper(ch
);
1278 m_bodyDisjunction
->terms
.append(ByteTerm(lo
, hi
, inputPosition
, frameLocation
, quantityCount
, quantityType
));
1283 m_bodyDisjunction
->terms
.append(ByteTerm(ch
, inputPosition
, frameLocation
, quantityCount
, quantityType
));
1286 void atomCharacterClass(CharacterClass
* characterClass
, bool invert
, int inputPosition
, unsigned frameLocation
, unsigned quantityCount
, QuantifierType quantityType
)
1288 m_bodyDisjunction
->terms
.append(ByteTerm(characterClass
, invert
, inputPosition
));
1290 m_bodyDisjunction
->terms
[m_bodyDisjunction
->terms
.size() - 1].atom
.quantityCount
= quantityCount
;
1291 m_bodyDisjunction
->terms
[m_bodyDisjunction
->terms
.size() - 1].atom
.quantityType
= quantityType
;
1292 m_bodyDisjunction
->terms
[m_bodyDisjunction
->terms
.size() - 1].frameLocation
= frameLocation
;
1295 void atomBackReference(unsigned subpatternId
, int inputPosition
, unsigned frameLocation
, unsigned quantityCount
, QuantifierType quantityType
)
1297 ASSERT(subpatternId
);
1299 m_bodyDisjunction
->terms
.append(ByteTerm::BackReference(subpatternId
, inputPosition
));
1301 m_bodyDisjunction
->terms
[m_bodyDisjunction
->terms
.size() - 1].atom
.quantityCount
= quantityCount
;
1302 m_bodyDisjunction
->terms
[m_bodyDisjunction
->terms
.size() - 1].atom
.quantityType
= quantityType
;
1303 m_bodyDisjunction
->terms
[m_bodyDisjunction
->terms
.size() - 1].frameLocation
= frameLocation
;
1306 void atomParenthesesSubpatternBegin(unsigned subpatternId
, bool capture
, int inputPosition
, unsigned frameLocation
, unsigned alternativeFrameLocation
)
1308 int beginTerm
= m_bodyDisjunction
->terms
.size();
1310 m_bodyDisjunction
->terms
.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin
, subpatternId
, capture
, inputPosition
));
1311 m_bodyDisjunction
->terms
[m_bodyDisjunction
->terms
.size() - 1].frameLocation
= frameLocation
;
1312 m_bodyDisjunction
->terms
.append(ByteTerm::AlternativeBegin());
1313 m_bodyDisjunction
->terms
[m_bodyDisjunction
->terms
.size() - 1].frameLocation
= alternativeFrameLocation
;
1315 m_parenthesesStack
.append(ParenthesesStackEntry(beginTerm
, m_currentAlternativeIndex
));
1316 m_currentAlternativeIndex
= beginTerm
+ 1;
1319 void atomParentheticalAssertionBegin(unsigned subpatternId
, bool invert
, unsigned frameLocation
, unsigned alternativeFrameLocation
)
1321 int beginTerm
= m_bodyDisjunction
->terms
.size();
1323 m_bodyDisjunction
->terms
.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin
, subpatternId
, invert
, 0));
1324 m_bodyDisjunction
->terms
[m_bodyDisjunction
->terms
.size() - 1].frameLocation
= frameLocation
;
1325 m_bodyDisjunction
->terms
.append(ByteTerm::AlternativeBegin());
1326 m_bodyDisjunction
->terms
[m_bodyDisjunction
->terms
.size() - 1].frameLocation
= alternativeFrameLocation
;
1328 m_parenthesesStack
.append(ParenthesesStackEntry(beginTerm
, m_currentAlternativeIndex
));
1329 m_currentAlternativeIndex
= beginTerm
+ 1;
1332 unsigned popParenthesesStack()
1334 ASSERT(m_parenthesesStack
.size());
1335 int stackEnd
= m_parenthesesStack
.size() - 1;
1336 unsigned beginTerm
= m_parenthesesStack
[stackEnd
].beginTerm
;
1337 m_currentAlternativeIndex
= m_parenthesesStack
[stackEnd
].savedAlternativeIndex
;
1338 m_parenthesesStack
.shrink(stackEnd
);
1340 ASSERT(beginTerm
< m_bodyDisjunction
->terms
.size());
1341 ASSERT(m_currentAlternativeIndex
< m_bodyDisjunction
->terms
.size());
1347 void dumpDisjunction(ByteDisjunction
* disjunction
)
1349 printf("ByteDisjunction(%p):\n\t", disjunction
);
1350 for (unsigned i
= 0; i
< disjunction
->terms
.size(); ++i
)
1351 printf("{ %d } ", disjunction
->terms
[i
].type
);
1356 void closeAlternative(int beginTerm
)
1358 int origBeginTerm
= beginTerm
;
1359 ASSERT(m_bodyDisjunction
->terms
[beginTerm
].type
== ByteTerm::TypeAlternativeBegin
);
1360 int endIndex
= m_bodyDisjunction
->terms
.size();
1362 unsigned frameLocation
= m_bodyDisjunction
->terms
[beginTerm
].frameLocation
;
1364 if (!m_bodyDisjunction
->terms
[beginTerm
].alternative
.next
)
1365 m_bodyDisjunction
->terms
.remove(beginTerm
);
1367 while (m_bodyDisjunction
->terms
[beginTerm
].alternative
.next
) {
1368 beginTerm
+= m_bodyDisjunction
->terms
[beginTerm
].alternative
.next
;
1369 ASSERT(m_bodyDisjunction
->terms
[beginTerm
].type
== ByteTerm::TypeAlternativeDisjunction
);
1370 m_bodyDisjunction
->terms
[beginTerm
].alternative
.end
= endIndex
- beginTerm
;
1371 m_bodyDisjunction
->terms
[beginTerm
].frameLocation
= frameLocation
;
1374 m_bodyDisjunction
->terms
[beginTerm
].alternative
.next
= origBeginTerm
- beginTerm
;
1376 m_bodyDisjunction
->terms
.append(ByteTerm::AlternativeEnd());
1377 m_bodyDisjunction
->terms
[endIndex
].frameLocation
= frameLocation
;
1381 void closeBodyAlternative()
1384 int origBeginTerm
= 0;
1385 ASSERT(m_bodyDisjunction
->terms
[beginTerm
].type
== ByteTerm::TypeBodyAlternativeBegin
);
1386 int endIndex
= m_bodyDisjunction
->terms
.size();
1388 unsigned frameLocation
= m_bodyDisjunction
->terms
[beginTerm
].frameLocation
;
1390 while (m_bodyDisjunction
->terms
[beginTerm
].alternative
.next
) {
1391 beginTerm
+= m_bodyDisjunction
->terms
[beginTerm
].alternative
.next
;
1392 ASSERT(m_bodyDisjunction
->terms
[beginTerm
].type
== ByteTerm::TypeBodyAlternativeDisjunction
);
1393 m_bodyDisjunction
->terms
[beginTerm
].alternative
.end
= endIndex
- beginTerm
;
1394 m_bodyDisjunction
->terms
[beginTerm
].frameLocation
= frameLocation
;
1397 m_bodyDisjunction
->terms
[beginTerm
].alternative
.next
= origBeginTerm
- beginTerm
;
1399 m_bodyDisjunction
->terms
.append(ByteTerm::BodyAlternativeEnd());
1400 m_bodyDisjunction
->terms
[endIndex
].frameLocation
= frameLocation
;
1403 void atomParenthesesEnd(bool doInline
, unsigned lastSubpatternId
, int inputPosition
, unsigned frameLocation
, unsigned quantityCount
, QuantifierType quantityType
, unsigned callFrameSize
= 0)
1405 unsigned beginTerm
= popParenthesesStack();
1406 closeAlternative(beginTerm
+ 1);
1407 unsigned endTerm
= m_bodyDisjunction
->terms
.size();
1409 bool isAssertion
= m_bodyDisjunction
->terms
[beginTerm
].type
== ByteTerm::TypeParentheticalAssertionBegin
;
1410 bool invertOrCapture
= m_bodyDisjunction
->terms
[beginTerm
].invertOrCapture
;
1411 unsigned subpatternId
= m_bodyDisjunction
->terms
[beginTerm
].atom
.subpatternId
;
1413 m_bodyDisjunction
->terms
.append(ByteTerm(isAssertion
? ByteTerm::TypeParentheticalAssertionEnd
: ByteTerm::TypeParenthesesSubpatternOnceEnd
, subpatternId
, invertOrCapture
, inputPosition
));
1414 m_bodyDisjunction
->terms
[beginTerm
].atom
.parenthesesWidth
= endTerm
- beginTerm
;
1415 m_bodyDisjunction
->terms
[endTerm
].atom
.parenthesesWidth
= endTerm
- beginTerm
;
1416 m_bodyDisjunction
->terms
[endTerm
].frameLocation
= frameLocation
;
1419 m_bodyDisjunction
->terms
[beginTerm
].atom
.quantityCount
= quantityCount
;
1420 m_bodyDisjunction
->terms
[beginTerm
].atom
.quantityType
= quantityType
;
1421 m_bodyDisjunction
->terms
[endTerm
].atom
.quantityCount
= quantityCount
;
1422 m_bodyDisjunction
->terms
[endTerm
].atom
.quantityType
= quantityType
;
1424 ByteTerm
& parenthesesBegin
= m_bodyDisjunction
->terms
[beginTerm
];
1425 ASSERT(parenthesesBegin
.type
== ByteTerm::TypeParenthesesSubpatternOnceBegin
);
1427 bool invertOrCapture
= parenthesesBegin
.invertOrCapture
;
1428 unsigned subpatternId
= parenthesesBegin
.atom
.subpatternId
;
1430 unsigned numSubpatterns
= lastSubpatternId
- subpatternId
+ 1;
1431 ByteDisjunction
* parenthesesDisjunction
= new ByteDisjunction(numSubpatterns
, callFrameSize
);
1433 parenthesesDisjunction
->terms
.append(ByteTerm::SubpatternBegin());
1434 for (unsigned termInParentheses
= beginTerm
+ 1; termInParentheses
< endTerm
; ++termInParentheses
)
1435 parenthesesDisjunction
->terms
.append(m_bodyDisjunction
->terms
[termInParentheses
]);
1436 parenthesesDisjunction
->terms
.append(ByteTerm::SubpatternEnd());
1438 m_bodyDisjunction
->terms
.shrink(beginTerm
);
1440 m_allParenthesesInfo
.append(parenthesesDisjunction
);
1441 m_bodyDisjunction
->terms
.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern
, subpatternId
, parenthesesDisjunction
, invertOrCapture
, inputPosition
));
1443 m_bodyDisjunction
->terms
[beginTerm
].atom
.quantityCount
= quantityCount
;
1444 m_bodyDisjunction
->terms
[beginTerm
].atom
.quantityType
= quantityType
;
1445 m_bodyDisjunction
->terms
[beginTerm
].frameLocation
= frameLocation
;
1449 void regexBegin(unsigned numSubpatterns
, unsigned callFrameSize
)
1451 m_bodyDisjunction
= new ByteDisjunction(numSubpatterns
, callFrameSize
);
1452 m_bodyDisjunction
->terms
.append(ByteTerm::BodyAlternativeBegin());
1453 m_bodyDisjunction
->terms
[0].frameLocation
= 0;
1454 m_currentAlternativeIndex
= 0;
1459 closeBodyAlternative();
1462 void alternativeBodyDisjunction()
1464 int newAlternativeIndex
= m_bodyDisjunction
->terms
.size();
1465 m_bodyDisjunction
->terms
[m_currentAlternativeIndex
].alternative
.next
= newAlternativeIndex
- m_currentAlternativeIndex
;
1466 m_bodyDisjunction
->terms
.append(ByteTerm::BodyAlternativeDisjunction());
1468 m_currentAlternativeIndex
= newAlternativeIndex
;
1471 void alternativeDisjunction()
1473 int newAlternativeIndex
= m_bodyDisjunction
->terms
.size();
1474 m_bodyDisjunction
->terms
[m_currentAlternativeIndex
].alternative
.next
= newAlternativeIndex
- m_currentAlternativeIndex
;
1475 m_bodyDisjunction
->terms
.append(ByteTerm::AlternativeDisjunction());
1477 m_currentAlternativeIndex
= newAlternativeIndex
;
1480 void emitDisjunction(PatternDisjunction
* disjunction
, unsigned inputCountAlreadyChecked
= 0, unsigned parenthesesInputCountAlreadyChecked
= 0)
1482 for (unsigned alt
= 0; alt
< disjunction
->m_alternatives
.size(); ++alt
) {
1483 unsigned currentCountAlreadyChecked
= inputCountAlreadyChecked
;
1486 if (disjunction
== m_pattern
.m_body
)
1487 alternativeBodyDisjunction();
1489 alternativeDisjunction();
1492 PatternAlternative
* alternative
= disjunction
->m_alternatives
[alt
];
1493 unsigned minimumSize
= alternative
->m_minimumSize
;
1495 ASSERT(minimumSize
>= parenthesesInputCountAlreadyChecked
);
1496 unsigned countToCheck
= minimumSize
- parenthesesInputCountAlreadyChecked
;
1498 checkInput(countToCheck
);
1499 currentCountAlreadyChecked
+= countToCheck
;
1501 for (unsigned i
= 0; i
< alternative
->m_terms
.size(); ++i
) {
1502 PatternTerm
& term
= alternative
->m_terms
[i
];
1504 switch (term
.type
) {
1505 case PatternTerm::TypeAssertionBOL
:
1506 assertionBOL(term
.inputPosition
- currentCountAlreadyChecked
);
1509 case PatternTerm::TypeAssertionEOL
:
1510 assertionEOL(term
.inputPosition
- currentCountAlreadyChecked
);
1513 case PatternTerm::TypeAssertionWordBoundary
:
1514 assertionWordBoundary(term
.invertOrCapture
, term
.inputPosition
- currentCountAlreadyChecked
);
1517 case PatternTerm::TypePatternCharacter
:
1518 atomPatternCharacter(term
.patternCharacter
, term
.inputPosition
- currentCountAlreadyChecked
, term
.frameLocation
, term
.quantityCount
, term
.quantityType
);
1521 case PatternTerm::TypeCharacterClass
:
1522 atomCharacterClass(term
.characterClass
, term
.invertOrCapture
, term
.inputPosition
- currentCountAlreadyChecked
, term
.frameLocation
, term
.quantityCount
, term
.quantityType
);
1525 case PatternTerm::TypeBackReference
:
1526 atomBackReference(term
.subpatternId
, term
.inputPosition
- currentCountAlreadyChecked
, term
.frameLocation
, term
.quantityCount
, term
.quantityType
);
1529 case PatternTerm::TypeForwardReference
:
1532 case PatternTerm::TypeParenthesesSubpattern
: {
1533 unsigned disjunctionAlreadyCheckedCount
= 0;
1534 if ((term
.quantityCount
== 1) && !term
.parentheses
.isCopy
) {
1535 if (term
.quantityType
== QuantifierFixedCount
) {
1536 disjunctionAlreadyCheckedCount
= term
.parentheses
.disjunction
->m_minimumSize
;
1537 unsigned delegateEndInputOffset
= term
.inputPosition
- currentCountAlreadyChecked
;
1538 atomParenthesesSubpatternBegin(term
.parentheses
.subpatternId
, term
.invertOrCapture
, delegateEndInputOffset
- disjunctionAlreadyCheckedCount
, term
.frameLocation
, term
.frameLocation
);
1539 emitDisjunction(term
.parentheses
.disjunction
, currentCountAlreadyChecked
, term
.parentheses
.disjunction
->m_minimumSize
);
1540 atomParenthesesEnd(true, term
.parentheses
.lastSubpatternId
, delegateEndInputOffset
, term
.frameLocation
, term
.quantityCount
, term
.quantityType
, term
.parentheses
.disjunction
->m_callFrameSize
);
1542 unsigned delegateEndInputOffset
= term
.inputPosition
- currentCountAlreadyChecked
;
1543 atomParenthesesSubpatternBegin(term
.parentheses
.subpatternId
, term
.invertOrCapture
, delegateEndInputOffset
- disjunctionAlreadyCheckedCount
, term
.frameLocation
, term
.frameLocation
+ RegexStackSpaceForBackTrackInfoParenthesesOnce
);
1544 emitDisjunction(term
.parentheses
.disjunction
, currentCountAlreadyChecked
, 0);
1545 atomParenthesesEnd(true, term
.parentheses
.lastSubpatternId
, delegateEndInputOffset
, term
.frameLocation
, term
.quantityCount
, term
.quantityType
, term
.parentheses
.disjunction
->m_callFrameSize
);
1548 unsigned delegateEndInputOffset
= term
.inputPosition
- currentCountAlreadyChecked
;
1549 atomParenthesesSubpatternBegin(term
.parentheses
.subpatternId
, term
.invertOrCapture
, delegateEndInputOffset
- disjunctionAlreadyCheckedCount
, term
.frameLocation
, 0);
1550 emitDisjunction(term
.parentheses
.disjunction
, currentCountAlreadyChecked
, 0);
1551 atomParenthesesEnd(false, term
.parentheses
.lastSubpatternId
, delegateEndInputOffset
, term
.frameLocation
, term
.quantityCount
, term
.quantityType
, term
.parentheses
.disjunction
->m_callFrameSize
);
1556 case PatternTerm::TypeParentheticalAssertion
: {
1557 unsigned alternativeFrameLocation
= term
.frameLocation
+ RegexStackSpaceForBackTrackInfoParentheticalAssertion
;
1559 atomParentheticalAssertionBegin(term
.parentheses
.subpatternId
, term
.invertOrCapture
, term
.frameLocation
, alternativeFrameLocation
);
1560 emitDisjunction(term
.parentheses
.disjunction
, currentCountAlreadyChecked
, 0);
1561 atomParenthesesEnd(true, term
.parentheses
.lastSubpatternId
, 0, term
.frameLocation
, term
.quantityCount
, term
.quantityType
);
1570 RegexPattern
& m_pattern
;
1571 ByteDisjunction
* m_bodyDisjunction
;
1572 unsigned m_currentAlternativeIndex
;
1573 Vector
<ParenthesesStackEntry
> m_parenthesesStack
;
1574 Vector
<ByteDisjunction
*> m_allParenthesesInfo
;
1578 BytecodePattern
* byteCompileRegex(const UString
& patternString
, unsigned& numSubpatterns
, const char*& error
, bool ignoreCase
, bool multiline
)
1580 RegexPattern
pattern(ignoreCase
, multiline
);
1582 if ((error
= compileRegex(patternString
, pattern
)))
1585 numSubpatterns
= pattern
.m_numSubpatterns
;
1587 return ByteCompiler(pattern
).compile();
1590 int interpretRegex(BytecodePattern
* regex
, const UChar
* input
, unsigned start
, unsigned length
, int* output
)
1592 return Interpreter(regex
, output
, input
, start
, length
).interpret();
1596 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter
) == (RegexStackSpaceForBackTrackInfoPatternCharacter
* sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoPatternCharacter
);
1597 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass
) == (RegexStackSpaceForBackTrackInfoCharacterClass
* sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoCharacterClass
);
1598 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference
) == (RegexStackSpaceForBackTrackInfoBackReference
* sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoBackReference
);
1599 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative
) == (RegexStackSpaceForBackTrackInfoAlternative
* sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoAlternative
);
1600 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion
) == (RegexStackSpaceForBackTrackInfoParentheticalAssertion
* sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheticalAssertion
);
1601 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce
) == (RegexStackSpaceForBackTrackInfoParenthesesOnce
* sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParenthesesOnce
);
1602 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses
) == (RegexStackSpaceForBackTrackInfoParentheses
* sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheses
);