2 * Copyright (C) 2009 Apple Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 #include "RegexCompiler.h"
29 #include "RegexInterpreter.h"
30 #include "RegexPattern.h"
31 #include <wtf/Vector.h>
37 namespace JSC
{ namespace Yarr
{
39 class CharacterClassConstructor
{
41 CharacterClassConstructor(bool isCaseInsensitive
= false)
42 : m_isCaseInsensitive(isCaseInsensitive
)
50 m_matchesUnicode
.clear();
51 m_rangesUnicode
.clear();
54 void append(const CharacterClass
* other
)
56 for (size_t i
= 0; i
< other
->m_matches
.size(); ++i
)
57 addSorted(m_matches
, other
->m_matches
[i
]);
58 for (size_t i
= 0; i
< other
->m_ranges
.size(); ++i
)
59 addSortedRange(m_ranges
, other
->m_ranges
[i
].begin
, other
->m_ranges
[i
].end
);
60 for (size_t i
= 0; i
< other
->m_matchesUnicode
.size(); ++i
)
61 addSorted(m_matchesUnicode
, other
->m_matchesUnicode
[i
]);
62 for (size_t i
= 0; i
< other
->m_rangesUnicode
.size(); ++i
)
63 addSortedRange(m_rangesUnicode
, other
->m_rangesUnicode
[i
].begin
, other
->m_rangesUnicode
[i
].end
);
66 void putChar(UChar ch
)
69 if (m_isCaseInsensitive
&& isASCIIAlpha(ch
)) {
70 addSorted(m_matches
, toASCIIUpper(ch
));
71 addSorted(m_matches
, toASCIILower(ch
));
73 addSorted(m_matches
, ch
);
76 if (m_isCaseInsensitive
&& ((upper
= Unicode::toUpper(ch
)) != (lower
= Unicode::toLower(ch
)))) {
77 addSorted(m_matchesUnicode
, upper
);
78 addSorted(m_matchesUnicode
, lower
);
80 addSorted(m_matchesUnicode
, ch
);
84 // returns true if this character has another case, and 'ch' is the upper case form.
85 static inline bool isUnicodeUpper(UChar ch
)
87 return ch
!= Unicode::toLower(ch
);
90 // returns true if this character has another case, and 'ch' is the lower case form.
91 static inline bool isUnicodeLower(UChar ch
)
93 return ch
!= Unicode::toUpper(ch
);
96 void putRange(UChar lo
, UChar hi
)
100 char asciiHi
= std::min(hi
, (UChar
)0x7f);
101 addSortedRange(m_ranges
, lo
, asciiHi
);
103 if (m_isCaseInsensitive
) {
104 if ((asciiLo
<= 'Z') && (asciiHi
>= 'A'))
105 addSortedRange(m_ranges
, std::max(asciiLo
, 'A')+('a'-'A'), std::min(asciiHi
, 'Z')+('a'-'A'));
106 if ((asciiLo
<= 'z') && (asciiHi
>= 'a'))
107 addSortedRange(m_ranges
, std::max(asciiLo
, 'a')+('A'-'a'), std::min(asciiHi
, 'z')+('A'-'a'));
111 uint32_t unicodeCurr
= std::max(lo
, (UChar
)0x80);
112 addSortedRange(m_rangesUnicode
, unicodeCurr
, hi
);
114 if (m_isCaseInsensitive
) {
115 while (unicodeCurr
<= hi
) {
116 // If the upper bound of the range (hi) is 0xffff, the increments to
117 // unicodeCurr in this loop may take it to 0x10000. This is fine
118 // (if so we won't re-enter the loop, since the loop condition above
119 // will definitely fail) - but this does mean we cannot use a UChar
120 // to represent unicodeCurr, we must use a 32-bit value instead.
121 ASSERT(unicodeCurr
<= 0xffff);
123 if (isUnicodeUpper(unicodeCurr
)) {
124 UChar lowerCaseRangeBegin
= Unicode::toLower(unicodeCurr
);
125 UChar lowerCaseRangeEnd
= lowerCaseRangeBegin
;
126 while ((++unicodeCurr
<= hi
) && isUnicodeUpper(unicodeCurr
) && (Unicode::toLower(unicodeCurr
) == (lowerCaseRangeEnd
+ 1)))
128 addSortedRange(m_rangesUnicode
, lowerCaseRangeBegin
, lowerCaseRangeEnd
);
129 } else if (isUnicodeLower(unicodeCurr
)) {
130 UChar upperCaseRangeBegin
= Unicode::toUpper(unicodeCurr
);
131 UChar upperCaseRangeEnd
= upperCaseRangeBegin
;
132 while ((++unicodeCurr
<= hi
) && isUnicodeLower(unicodeCurr
) && (Unicode::toUpper(unicodeCurr
) == (upperCaseRangeEnd
+ 1)))
134 addSortedRange(m_rangesUnicode
, upperCaseRangeBegin
, upperCaseRangeEnd
);
142 CharacterClass
* charClass()
144 CharacterClass
* characterClass
= new CharacterClass();
146 characterClass
->m_matches
.append(m_matches
);
147 characterClass
->m_ranges
.append(m_ranges
);
148 characterClass
->m_matchesUnicode
.append(m_matchesUnicode
);
149 characterClass
->m_rangesUnicode
.append(m_rangesUnicode
);
153 return characterClass
;
157 void addSorted(Vector
<UChar
>& matches
, UChar ch
)
160 unsigned range
= matches
.size();
162 // binary chop, find position to insert char.
164 unsigned index
= range
>> 1;
166 int val
= matches
[pos
+index
] - ch
;
177 if (pos
== matches
.size())
180 matches
.insert(pos
, ch
);
183 void addSortedRange(Vector
<CharacterRange
>& ranges
, UChar lo
, UChar hi
)
185 unsigned end
= ranges
.size();
187 // Simple linear scan - I doubt there are that many ranges anyway...
188 // feel free to fix this with something faster (eg binary chop).
189 for (unsigned i
= 0; i
< end
; ++i
) {
190 // does the new range fall before the current position in the array
191 if (hi
< ranges
[i
].begin
) {
192 // optional optimization: concatenate appending ranges? - may not be worthwhile.
193 if (hi
== (ranges
[i
].begin
- 1)) {
194 ranges
[i
].begin
= lo
;
197 ranges
.insert(i
, CharacterRange(lo
, hi
));
200 // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
201 // If the new range start at or before the end of the last range, then the overlap (if it starts one after the
202 // end of the last range they concatenate, which is just as good.
203 if (lo
<= (ranges
[i
].end
+ 1)) {
204 // found an intersect! we'll replace this entry in the array.
205 ranges
[i
].begin
= std::min(ranges
[i
].begin
, lo
);
206 ranges
[i
].end
= std::max(ranges
[i
].end
, hi
);
208 // now check if the new range can subsume any subsequent ranges.
210 // each iteration of the loop we will either remove something from the list, or break the loop.
211 while (next
< ranges
.size()) {
212 if (ranges
[next
].begin
<= (ranges
[i
].end
+ 1)) {
213 // the next entry now overlaps / concatenates this one.
214 ranges
[i
].end
= std::max(ranges
[i
].end
, ranges
[next
].end
);
224 // CharacterRange comes after all existing ranges.
225 ranges
.append(CharacterRange(lo
, hi
));
228 bool m_isCaseInsensitive
;
230 Vector
<UChar
> m_matches
;
231 Vector
<CharacterRange
> m_ranges
;
232 Vector
<UChar
> m_matchesUnicode
;
233 Vector
<CharacterRange
> m_rangesUnicode
;
237 CharacterClass
* newlineCreate()
239 CharacterClass
* characterClass
= new CharacterClass();
241 characterClass
->m_matches
.append('\n');
242 characterClass
->m_matches
.append('\r');
243 characterClass
->m_matchesUnicode
.append(0x2028);
244 characterClass
->m_matchesUnicode
.append(0x2029);
246 return characterClass
;
249 CharacterClass
* digitsCreate()
251 CharacterClass
* characterClass
= new CharacterClass();
253 characterClass
->m_ranges
.append(CharacterRange('0', '9'));
255 return characterClass
;
258 CharacterClass
* spacesCreate()
260 CharacterClass
* characterClass
= new CharacterClass();
262 characterClass
->m_matches
.append(' ');
263 characterClass
->m_ranges
.append(CharacterRange('\t', '\r'));
264 characterClass
->m_matchesUnicode
.append(0x00a0);
265 characterClass
->m_matchesUnicode
.append(0x1680);
266 characterClass
->m_matchesUnicode
.append(0x180e);
267 characterClass
->m_matchesUnicode
.append(0x2028);
268 characterClass
->m_matchesUnicode
.append(0x2029);
269 characterClass
->m_matchesUnicode
.append(0x202f);
270 characterClass
->m_matchesUnicode
.append(0x205f);
271 characterClass
->m_matchesUnicode
.append(0x3000);
272 characterClass
->m_rangesUnicode
.append(CharacterRange(0x2000, 0x200a));
274 return characterClass
;
277 CharacterClass
* wordcharCreate()
279 CharacterClass
* characterClass
= new CharacterClass();
281 characterClass
->m_matches
.append('_');
282 characterClass
->m_ranges
.append(CharacterRange('0', '9'));
283 characterClass
->m_ranges
.append(CharacterRange('A', 'Z'));
284 characterClass
->m_ranges
.append(CharacterRange('a', 'z'));
286 return characterClass
;
289 CharacterClass
* nondigitsCreate()
291 CharacterClass
* characterClass
= new CharacterClass();
293 characterClass
->m_ranges
.append(CharacterRange(0, '0' - 1));
294 characterClass
->m_ranges
.append(CharacterRange('9' + 1, 0x7f));
295 characterClass
->m_rangesUnicode
.append(CharacterRange(0x80, 0xffff));
297 return characterClass
;
300 CharacterClass
* nonspacesCreate()
302 CharacterClass
* characterClass
= new CharacterClass();
304 characterClass
->m_ranges
.append(CharacterRange(0, '\t' - 1));
305 characterClass
->m_ranges
.append(CharacterRange('\r' + 1, ' ' - 1));
306 characterClass
->m_ranges
.append(CharacterRange(' ' + 1, 0x7f));
307 characterClass
->m_rangesUnicode
.append(CharacterRange(0x0080, 0x009f));
308 characterClass
->m_rangesUnicode
.append(CharacterRange(0x00a1, 0x167f));
309 characterClass
->m_rangesUnicode
.append(CharacterRange(0x1681, 0x180d));
310 characterClass
->m_rangesUnicode
.append(CharacterRange(0x180f, 0x1fff));
311 characterClass
->m_rangesUnicode
.append(CharacterRange(0x200b, 0x2027));
312 characterClass
->m_rangesUnicode
.append(CharacterRange(0x202a, 0x202e));
313 characterClass
->m_rangesUnicode
.append(CharacterRange(0x2030, 0x205e));
314 characterClass
->m_rangesUnicode
.append(CharacterRange(0x2060, 0x2fff));
315 characterClass
->m_rangesUnicode
.append(CharacterRange(0x3001, 0xffff));
317 return characterClass
;
320 CharacterClass
* nonwordcharCreate()
322 CharacterClass
* characterClass
= new CharacterClass();
324 characterClass
->m_matches
.append('`');
325 characterClass
->m_ranges
.append(CharacterRange(0, '0' - 1));
326 characterClass
->m_ranges
.append(CharacterRange('9' + 1, 'A' - 1));
327 characterClass
->m_ranges
.append(CharacterRange('Z' + 1, '_' - 1));
328 characterClass
->m_ranges
.append(CharacterRange('z' + 1, 0x7f));
329 characterClass
->m_rangesUnicode
.append(CharacterRange(0x80, 0xffff));
331 return characterClass
;
335 class RegexPatternConstructor
{
337 RegexPatternConstructor(RegexPattern
& pattern
)
339 , m_characterClassConstructor(pattern
.m_ignoreCase
)
343 ~RegexPatternConstructor()
350 m_characterClassConstructor
.reset();
355 m_alternative
->m_terms
.append(PatternTerm::BOL());
359 m_alternative
->m_terms
.append(PatternTerm::EOL());
361 void assertionWordBoundary(bool invert
)
363 m_alternative
->m_terms
.append(PatternTerm::WordBoundary(invert
));
366 void atomPatternCharacter(UChar ch
)
368 // We handle case-insensitive checking of unicode characters which do have both
369 // cases by handling them as if they were defined using a CharacterClass.
370 if (m_pattern
.m_ignoreCase
&& !isASCII(ch
) && (Unicode::toUpper(ch
) != Unicode::toLower(ch
))) {
371 atomCharacterClassBegin();
372 atomCharacterClassAtom(ch
);
373 atomCharacterClassEnd();
375 m_alternative
->m_terms
.append(PatternTerm(ch
));
378 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID
, bool invert
)
382 m_alternative
->m_terms
.append(PatternTerm(m_pattern
.digitsCharacterClass(), invert
));
385 m_alternative
->m_terms
.append(PatternTerm(m_pattern
.spacesCharacterClass(), invert
));
388 m_alternative
->m_terms
.append(PatternTerm(m_pattern
.wordcharCharacterClass(), invert
));
391 m_alternative
->m_terms
.append(PatternTerm(m_pattern
.newlineCharacterClass(), invert
));
396 void atomCharacterClassBegin(bool invert
= false)
398 m_invertCharacterClass
= invert
;
401 void atomCharacterClassAtom(UChar ch
)
403 m_characterClassConstructor
.putChar(ch
);
406 void atomCharacterClassRange(UChar begin
, UChar end
)
408 m_characterClassConstructor
.putRange(begin
, end
);
411 void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID
, bool invert
)
413 ASSERT(classID
!= NewlineClassID
);
417 m_characterClassConstructor
.append(invert
? m_pattern
.nondigitsCharacterClass() : m_pattern
.digitsCharacterClass());
421 m_characterClassConstructor
.append(invert
? m_pattern
.nonspacesCharacterClass() : m_pattern
.spacesCharacterClass());
425 m_characterClassConstructor
.append(invert
? m_pattern
.nonwordcharCharacterClass() : m_pattern
.wordcharCharacterClass());
429 ASSERT_NOT_REACHED();
433 void atomCharacterClassEnd()
435 CharacterClass
* newCharacterClass
= m_characterClassConstructor
.charClass();
436 m_pattern
.m_userCharacterClasses
.append(newCharacterClass
);
437 m_alternative
->m_terms
.append(PatternTerm(newCharacterClass
, m_invertCharacterClass
));
440 void atomParenthesesSubpatternBegin(bool capture
= true)
442 unsigned subpatternId
= m_pattern
.m_numSubpatterns
+ 1;
444 m_pattern
.m_numSubpatterns
++;
446 PatternDisjunction
* parenthesesDisjunction
= new PatternDisjunction(m_alternative
);
447 m_pattern
.m_disjunctions
.append(parenthesesDisjunction
);
448 m_alternative
->m_terms
.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern
, subpatternId
, parenthesesDisjunction
, capture
));
449 m_alternative
= parenthesesDisjunction
->addNewAlternative();
452 void atomParentheticalAssertionBegin(bool invert
= false)
454 PatternDisjunction
* parenthesesDisjunction
= new PatternDisjunction(m_alternative
);
455 m_pattern
.m_disjunctions
.append(parenthesesDisjunction
);
456 m_alternative
->m_terms
.append(PatternTerm(PatternTerm::TypeParentheticalAssertion
, m_pattern
.m_numSubpatterns
+ 1, parenthesesDisjunction
, invert
));
457 m_alternative
= parenthesesDisjunction
->addNewAlternative();
460 void atomParenthesesEnd()
462 ASSERT(m_alternative
->m_parent
);
463 ASSERT(m_alternative
->m_parent
->m_parent
);
464 m_alternative
= m_alternative
->m_parent
->m_parent
;
466 m_alternative
->lastTerm().parentheses
.lastSubpatternId
= m_pattern
.m_numSubpatterns
;
469 void atomBackReference(unsigned subpatternId
)
471 ASSERT(subpatternId
);
472 m_pattern
.m_maxBackReference
= std::max(m_pattern
.m_maxBackReference
, subpatternId
);
474 if (subpatternId
> m_pattern
.m_numSubpatterns
) {
475 m_alternative
->m_terms
.append(PatternTerm::ForwardReference());
479 PatternAlternative
* currentAlternative
= m_alternative
;
480 ASSERT(currentAlternative
);
482 // Note to self: if we waited until the AST was baked, we could also remove forwards refs
483 while ((currentAlternative
= currentAlternative
->m_parent
->m_parent
)) {
484 PatternTerm
& term
= currentAlternative
->lastTerm();
485 ASSERT((term
.type
== PatternTerm::TypeParenthesesSubpattern
) || (term
.type
== PatternTerm::TypeParentheticalAssertion
));
487 if ((term
.type
== PatternTerm::TypeParenthesesSubpattern
) && term
.invertOrCapture
&& (subpatternId
== term
.subpatternId
)) {
488 m_alternative
->m_terms
.append(PatternTerm::ForwardReference());
493 m_alternative
->m_terms
.append(PatternTerm(subpatternId
));
496 PatternDisjunction
* copyDisjunction(PatternDisjunction
* disjunction
)
498 PatternDisjunction
* newDisjunction
= new PatternDisjunction();
500 newDisjunction
->m_parent
= disjunction
->m_parent
;
501 for (unsigned alt
= 0; alt
< disjunction
->m_alternatives
.size(); ++alt
) {
502 PatternAlternative
* alternative
= disjunction
->m_alternatives
[alt
];
503 PatternAlternative
* newAlternative
= newDisjunction
->addNewAlternative();
504 for (unsigned i
= 0; i
< alternative
->m_terms
.size(); ++i
)
505 newAlternative
->m_terms
.append(copyTerm(alternative
->m_terms
[i
]));
508 m_pattern
.m_disjunctions
.append(newDisjunction
);
509 return newDisjunction
;
512 PatternTerm
copyTerm(PatternTerm
& term
)
514 if ((term
.type
!= PatternTerm::TypeParenthesesSubpattern
) && (term
.type
!= PatternTerm::TypeParentheticalAssertion
))
515 return PatternTerm(term
);
517 PatternTerm termCopy
= term
;
518 termCopy
.parentheses
.disjunction
= copyDisjunction(termCopy
.parentheses
.disjunction
);
522 void quantifyAtom(unsigned min
, unsigned max
, bool greedy
)
525 ASSERT(m_alternative
->m_terms
.size());
528 m_alternative
->removeLastTerm();
532 PatternTerm
& term
= m_alternative
->lastTerm();
533 ASSERT(term
.type
> PatternTerm::TypeAssertionWordBoundary
);
534 ASSERT((term
.quantityCount
== 1) && (term
.quantityType
== QuantifierFixedCount
));
536 // For any assertion with a zero minimum, not matching is valid and has no effect,
537 // remove it. Otherwise, we need to match as least once, but there is no point
538 // matching more than once, so remove the quantifier. It is not entirely clear
539 // from the spec whether or not this behavior is correct, but I believe this
540 // matches Firefox. :-/
541 if (term
.type
== PatternTerm::TypeParentheticalAssertion
) {
543 m_alternative
->removeLastTerm();
548 term
.quantify(max
, greedy
? QuantifierGreedy
: QuantifierNonGreedy
);
550 term
.quantify(min
, QuantifierFixedCount
);
552 term
.quantify(min
, QuantifierFixedCount
);
553 m_alternative
->m_terms
.append(copyTerm(term
));
554 // NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
555 m_alternative
->lastTerm().quantify((max
== UINT_MAX
) ? max
: max
- min
, greedy
? QuantifierGreedy
: QuantifierNonGreedy
);
556 if (m_alternative
->lastTerm().type
== PatternTerm::TypeParenthesesSubpattern
)
557 m_alternative
->lastTerm().parentheses
.isCopy
= true;
563 m_alternative
= m_alternative
->m_parent
->addNewAlternative();
568 m_pattern
.m_body
= new PatternDisjunction();
569 m_alternative
= m_pattern
.m_body
->addNewAlternative();
570 m_pattern
.m_disjunctions
.append(m_pattern
.m_body
);
579 unsigned setupAlternativeOffsets(PatternAlternative
* alternative
, unsigned currentCallFrameSize
, unsigned initialInputPosition
)
581 alternative
->m_hasFixedSize
= true;
582 unsigned currentInputPosition
= initialInputPosition
;
584 for (unsigned i
= 0; i
< alternative
->m_terms
.size(); ++i
) {
585 PatternTerm
& term
= alternative
->m_terms
[i
];
588 case PatternTerm::TypeAssertionBOL
:
589 case PatternTerm::TypeAssertionEOL
:
590 case PatternTerm::TypeAssertionWordBoundary
:
591 term
.inputPosition
= currentInputPosition
;
594 case PatternTerm::TypeBackReference
:
595 term
.inputPosition
= currentInputPosition
;
596 term
.frameLocation
= currentCallFrameSize
;
597 currentCallFrameSize
+= RegexStackSpaceForBackTrackInfoBackReference
;
598 alternative
->m_hasFixedSize
= false;
601 case PatternTerm::TypeForwardReference
:
604 case PatternTerm::TypePatternCharacter
:
605 term
.inputPosition
= currentInputPosition
;
606 if (term
.quantityType
!= QuantifierFixedCount
) {
607 term
.frameLocation
= currentCallFrameSize
;
608 currentCallFrameSize
+= RegexStackSpaceForBackTrackInfoPatternCharacter
;
609 alternative
->m_hasFixedSize
= false;
611 currentInputPosition
+= term
.quantityCount
;
614 case PatternTerm::TypeCharacterClass
:
615 term
.inputPosition
= currentInputPosition
;
616 if (term
.quantityType
!= QuantifierFixedCount
) {
617 term
.frameLocation
= currentCallFrameSize
;
618 currentCallFrameSize
+= RegexStackSpaceForBackTrackInfoCharacterClass
;
619 alternative
->m_hasFixedSize
= false;
621 currentInputPosition
+= term
.quantityCount
;
624 case PatternTerm::TypeParenthesesSubpattern
:
625 // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
626 term
.frameLocation
= currentCallFrameSize
;
627 if ((term
.quantityCount
== 1) && !term
.parentheses
.isCopy
) {
628 if (term
.quantityType
== QuantifierFixedCount
) {
629 currentCallFrameSize
= setupDisjunctionOffsets(term
.parentheses
.disjunction
, currentCallFrameSize
, currentInputPosition
);
630 currentInputPosition
+= term
.parentheses
.disjunction
->m_minimumSize
;
632 currentCallFrameSize
+= RegexStackSpaceForBackTrackInfoParenthesesOnce
;
633 currentCallFrameSize
= setupDisjunctionOffsets(term
.parentheses
.disjunction
, currentCallFrameSize
, currentInputPosition
);
635 term
.inputPosition
= currentInputPosition
;
637 term
.inputPosition
= currentInputPosition
;
638 setupDisjunctionOffsets(term
.parentheses
.disjunction
, 0, currentInputPosition
);
639 currentCallFrameSize
+= RegexStackSpaceForBackTrackInfoParentheses
;
641 // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
642 alternative
->m_hasFixedSize
= false;
645 case PatternTerm::TypeParentheticalAssertion
:
646 term
.inputPosition
= currentInputPosition
;
647 term
.frameLocation
= currentCallFrameSize
;
648 currentCallFrameSize
= setupDisjunctionOffsets(term
.parentheses
.disjunction
, currentCallFrameSize
+ RegexStackSpaceForBackTrackInfoParentheticalAssertion
, currentInputPosition
);
653 alternative
->m_minimumSize
= currentInputPosition
- initialInputPosition
;
654 return currentCallFrameSize
;
657 unsigned setupDisjunctionOffsets(PatternDisjunction
* disjunction
, unsigned initialCallFrameSize
, unsigned initialInputPosition
)
659 if ((disjunction
!= m_pattern
.m_body
) && (disjunction
->m_alternatives
.size() > 1))
660 initialCallFrameSize
+= RegexStackSpaceForBackTrackInfoAlternative
;
662 unsigned minimumInputSize
= UINT_MAX
;
663 unsigned maximumCallFrameSize
= 0;
664 bool hasFixedSize
= true;
666 for (unsigned alt
= 0; alt
< disjunction
->m_alternatives
.size(); ++alt
) {
667 PatternAlternative
* alternative
= disjunction
->m_alternatives
[alt
];
668 unsigned currentAlternativeCallFrameSize
= setupAlternativeOffsets(alternative
, initialCallFrameSize
, initialInputPosition
);
669 minimumInputSize
= min(minimumInputSize
, alternative
->m_minimumSize
);
670 maximumCallFrameSize
= max(maximumCallFrameSize
, currentAlternativeCallFrameSize
);
671 hasFixedSize
&= alternative
->m_hasFixedSize
;
674 ASSERT(minimumInputSize
!= UINT_MAX
);
675 ASSERT(maximumCallFrameSize
>= initialCallFrameSize
);
677 disjunction
->m_hasFixedSize
= hasFixedSize
;
678 disjunction
->m_minimumSize
= minimumInputSize
;
679 disjunction
->m_callFrameSize
= maximumCallFrameSize
;
680 return maximumCallFrameSize
;
685 setupDisjunctionOffsets(m_pattern
.m_body
, 0, 0);
689 RegexPattern
& m_pattern
;
690 PatternAlternative
* m_alternative
;
691 CharacterClassConstructor m_characterClassConstructor
;
692 bool m_invertCharacterClass
;
696 const char* compileRegex(const UString
& patternString
, RegexPattern
& pattern
)
698 RegexPatternConstructor
constructor(pattern
);
700 if (const char* error
= parse(constructor
, patternString
))
703 // If the pattern contains illegal backreferences reset & reparse.
704 // Quoting Netscape's "What's new in JavaScript 1.2",
705 // "Note: if the number of left parentheses is less than the number specified
706 // in \#, the \# is taken as an octal escape as described in the next row."
707 if (pattern
.containsIllegalBackReference()) {
708 unsigned numSubpatterns
= pattern
.m_numSubpatterns
;
714 parse(constructor
, patternString
, numSubpatterns
);
717 ASSERT(numSubpatterns
== pattern
.m_numSubpatterns
);
720 constructor
.setupOffsets();