2 * Copyright (C) 2009 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 #include "YarrPattern.h"
31 #include "YarrParser.h"
32 #include <wtf/Vector.h>
36 namespace JSC
{ namespace Yarr
{
38 #include "RegExpJitTables.h"
40 class CharacterClassConstructor
{
42 CharacterClassConstructor(bool isCaseInsensitive
= false)
43 : m_isCaseInsensitive(isCaseInsensitive
)
51 m_matchesUnicode
.clear();
52 m_rangesUnicode
.clear();
55 void append(const CharacterClass
* other
)
57 for (size_t i
= 0; i
< other
->m_matches
.size(); ++i
)
58 addSorted(m_matches
, other
->m_matches
[i
]);
59 for (size_t i
= 0; i
< other
->m_ranges
.size(); ++i
)
60 addSortedRange(m_ranges
, other
->m_ranges
[i
].begin
, other
->m_ranges
[i
].end
);
61 for (size_t i
= 0; i
< other
->m_matchesUnicode
.size(); ++i
)
62 addSorted(m_matchesUnicode
, other
->m_matchesUnicode
[i
]);
63 for (size_t i
= 0; i
< other
->m_rangesUnicode
.size(); ++i
)
64 addSortedRange(m_rangesUnicode
, other
->m_rangesUnicode
[i
].begin
, other
->m_rangesUnicode
[i
].end
);
67 void putChar(UChar ch
)
70 if (m_isCaseInsensitive
&& isASCIIAlpha(ch
)) {
71 addSorted(m_matches
, toASCIIUpper(ch
));
72 addSorted(m_matches
, toASCIILower(ch
));
74 addSorted(m_matches
, ch
);
77 if (m_isCaseInsensitive
&& ((upper
= Unicode::toUpper(ch
)) != (lower
= Unicode::toLower(ch
)))) {
78 addSorted(m_matchesUnicode
, upper
);
79 addSorted(m_matchesUnicode
, lower
);
81 addSorted(m_matchesUnicode
, ch
);
85 // returns true if this character has another case, and 'ch' is the upper case form.
86 static inline bool isUnicodeUpper(UChar ch
)
88 return ch
!= Unicode::toLower(ch
);
91 // returns true if this character has another case, and 'ch' is the lower case form.
92 static inline bool isUnicodeLower(UChar ch
)
94 return ch
!= Unicode::toUpper(ch
);
97 void putRange(UChar lo
, UChar hi
)
101 char asciiHi
= std::min(hi
, (UChar
)0x7f);
102 addSortedRange(m_ranges
, lo
, asciiHi
);
104 if (m_isCaseInsensitive
) {
105 if ((asciiLo
<= 'Z') && (asciiHi
>= 'A'))
106 addSortedRange(m_ranges
, std::max(asciiLo
, 'A')+('a'-'A'), std::min(asciiHi
, 'Z')+('a'-'A'));
107 if ((asciiLo
<= 'z') && (asciiHi
>= 'a'))
108 addSortedRange(m_ranges
, std::max(asciiLo
, 'a')+('A'-'a'), std::min(asciiHi
, 'z')+('A'-'a'));
112 uint32_t unicodeCurr
= std::max(lo
, (UChar
)0x80);
113 addSortedRange(m_rangesUnicode
, unicodeCurr
, hi
);
115 if (m_isCaseInsensitive
) {
116 while (unicodeCurr
<= hi
) {
117 // If the upper bound of the range (hi) is 0xffff, the increments to
118 // unicodeCurr in this loop may take it to 0x10000. This is fine
119 // (if so we won't re-enter the loop, since the loop condition above
120 // will definitely fail) - but this does mean we cannot use a UChar
121 // to represent unicodeCurr, we must use a 32-bit value instead.
122 ASSERT(unicodeCurr
<= 0xffff);
124 if (isUnicodeUpper(unicodeCurr
)) {
125 UChar lowerCaseRangeBegin
= Unicode::toLower(unicodeCurr
);
126 UChar lowerCaseRangeEnd
= lowerCaseRangeBegin
;
127 while ((++unicodeCurr
<= hi
) && isUnicodeUpper(unicodeCurr
) && (Unicode::toLower(unicodeCurr
) == (lowerCaseRangeEnd
+ 1)))
129 addSortedRange(m_rangesUnicode
, lowerCaseRangeBegin
, lowerCaseRangeEnd
);
130 } else if (isUnicodeLower(unicodeCurr
)) {
131 UChar upperCaseRangeBegin
= Unicode::toUpper(unicodeCurr
);
132 UChar upperCaseRangeEnd
= upperCaseRangeBegin
;
133 while ((++unicodeCurr
<= hi
) && isUnicodeLower(unicodeCurr
) && (Unicode::toUpper(unicodeCurr
) == (upperCaseRangeEnd
+ 1)))
135 addSortedRange(m_rangesUnicode
, upperCaseRangeBegin
, upperCaseRangeEnd
);
143 CharacterClass
* charClass()
145 CharacterClass
* characterClass
= new CharacterClass(0);
147 characterClass
->m_matches
.append(m_matches
);
148 characterClass
->m_ranges
.append(m_ranges
);
149 characterClass
->m_matchesUnicode
.append(m_matchesUnicode
);
150 characterClass
->m_rangesUnicode
.append(m_rangesUnicode
);
154 return characterClass
;
158 void addSorted(Vector
<UChar
>& matches
, UChar ch
)
161 unsigned range
= matches
.size();
163 // binary chop, find position to insert char.
165 unsigned index
= range
>> 1;
167 int val
= matches
[pos
+index
] - ch
;
178 if (pos
== matches
.size())
181 matches
.insert(pos
, ch
);
184 void addSortedRange(Vector
<CharacterRange
>& ranges
, UChar lo
, UChar hi
)
186 unsigned end
= ranges
.size();
188 // Simple linear scan - I doubt there are that many ranges anyway...
189 // feel free to fix this with something faster (eg binary chop).
190 for (unsigned i
= 0; i
< end
; ++i
) {
191 // does the new range fall before the current position in the array
192 if (hi
< ranges
[i
].begin
) {
193 // optional optimization: concatenate appending ranges? - may not be worthwhile.
194 if (hi
== (ranges
[i
].begin
- 1)) {
195 ranges
[i
].begin
= lo
;
198 ranges
.insert(i
, CharacterRange(lo
, hi
));
201 // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
202 // If the new range start at or before the end of the last range, then the overlap (if it starts one after the
203 // end of the last range they concatenate, which is just as good.
204 if (lo
<= (ranges
[i
].end
+ 1)) {
205 // found an intersect! we'll replace this entry in the array.
206 ranges
[i
].begin
= std::min(ranges
[i
].begin
, lo
);
207 ranges
[i
].end
= std::max(ranges
[i
].end
, hi
);
209 // now check if the new range can subsume any subsequent ranges.
211 // each iteration of the loop we will either remove something from the list, or break the loop.
212 while (next
< ranges
.size()) {
213 if (ranges
[next
].begin
<= (ranges
[i
].end
+ 1)) {
214 // the next entry now overlaps / concatenates this one.
215 ranges
[i
].end
= std::max(ranges
[i
].end
, ranges
[next
].end
);
225 // CharacterRange comes after all existing ranges.
226 ranges
.append(CharacterRange(lo
, hi
));
229 bool m_isCaseInsensitive
;
231 Vector
<UChar
> m_matches
;
232 Vector
<CharacterRange
> m_ranges
;
233 Vector
<UChar
> m_matchesUnicode
;
234 Vector
<CharacterRange
> m_rangesUnicode
;
237 class YarrPatternConstructor
{
239 YarrPatternConstructor(YarrPattern
& pattern
)
241 , m_characterClassConstructor(pattern
.m_ignoreCase
)
242 , m_invertParentheticalAssertion(false)
244 m_pattern
.m_body
= new PatternDisjunction();
245 m_alternative
= m_pattern
.m_body
->addNewAlternative();
246 m_pattern
.m_disjunctions
.append(m_pattern
.m_body
);
249 ~YarrPatternConstructor()
256 m_characterClassConstructor
.reset();
258 m_pattern
.m_body
= new PatternDisjunction();
259 m_alternative
= m_pattern
.m_body
->addNewAlternative();
260 m_pattern
.m_disjunctions
.append(m_pattern
.m_body
);
265 if (!m_alternative
->m_terms
.size() & !m_invertParentheticalAssertion
) {
266 m_alternative
->m_startsWithBOL
= true;
267 m_alternative
->m_containsBOL
= true;
268 m_pattern
.m_containsBOL
= true;
270 m_alternative
->m_terms
.append(PatternTerm::BOL());
274 m_alternative
->m_terms
.append(PatternTerm::EOL());
276 void assertionWordBoundary(bool invert
)
278 m_alternative
->m_terms
.append(PatternTerm::WordBoundary(invert
));
281 void atomPatternCharacter(UChar ch
)
283 // We handle case-insensitive checking of unicode characters which do have both
284 // cases by handling them as if they were defined using a CharacterClass.
285 if (m_pattern
.m_ignoreCase
&& !isASCII(ch
) && (Unicode::toUpper(ch
) != Unicode::toLower(ch
))) {
286 atomCharacterClassBegin();
287 atomCharacterClassAtom(ch
);
288 atomCharacterClassEnd();
290 m_alternative
->m_terms
.append(PatternTerm(ch
));
293 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID
, bool invert
)
297 m_alternative
->m_terms
.append(PatternTerm(m_pattern
.digitsCharacterClass(), invert
));
300 m_alternative
->m_terms
.append(PatternTerm(m_pattern
.spacesCharacterClass(), invert
));
303 m_alternative
->m_terms
.append(PatternTerm(m_pattern
.wordcharCharacterClass(), invert
));
306 m_alternative
->m_terms
.append(PatternTerm(m_pattern
.newlineCharacterClass(), invert
));
311 void atomCharacterClassBegin(bool invert
= false)
313 m_invertCharacterClass
= invert
;
316 void atomCharacterClassAtom(UChar ch
)
318 m_characterClassConstructor
.putChar(ch
);
321 void atomCharacterClassRange(UChar begin
, UChar end
)
323 m_characterClassConstructor
.putRange(begin
, end
);
326 void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID
, bool invert
)
328 ASSERT(classID
!= NewlineClassID
);
332 m_characterClassConstructor
.append(invert
? m_pattern
.nondigitsCharacterClass() : m_pattern
.digitsCharacterClass());
336 m_characterClassConstructor
.append(invert
? m_pattern
.nonspacesCharacterClass() : m_pattern
.spacesCharacterClass());
340 m_characterClassConstructor
.append(invert
? m_pattern
.nonwordcharCharacterClass() : m_pattern
.wordcharCharacterClass());
344 ASSERT_NOT_REACHED();
348 void atomCharacterClassEnd()
350 CharacterClass
* newCharacterClass
= m_characterClassConstructor
.charClass();
351 m_pattern
.m_userCharacterClasses
.append(newCharacterClass
);
352 m_alternative
->m_terms
.append(PatternTerm(newCharacterClass
, m_invertCharacterClass
));
355 void atomParenthesesSubpatternBegin(bool capture
= true)
357 unsigned subpatternId
= m_pattern
.m_numSubpatterns
+ 1;
359 m_pattern
.m_numSubpatterns
++;
361 PatternDisjunction
* parenthesesDisjunction
= new PatternDisjunction(m_alternative
);
362 m_pattern
.m_disjunctions
.append(parenthesesDisjunction
);
363 m_alternative
->m_terms
.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern
, subpatternId
, parenthesesDisjunction
, capture
, false));
364 m_alternative
= parenthesesDisjunction
->addNewAlternative();
367 void atomParentheticalAssertionBegin(bool invert
= false)
369 PatternDisjunction
* parenthesesDisjunction
= new PatternDisjunction(m_alternative
);
370 m_pattern
.m_disjunctions
.append(parenthesesDisjunction
);
371 m_alternative
->m_terms
.append(PatternTerm(PatternTerm::TypeParentheticalAssertion
, m_pattern
.m_numSubpatterns
+ 1, parenthesesDisjunction
, false, invert
));
372 m_alternative
= parenthesesDisjunction
->addNewAlternative();
373 m_invertParentheticalAssertion
= invert
;
376 void atomParenthesesEnd()
378 ASSERT(m_alternative
->m_parent
);
379 ASSERT(m_alternative
->m_parent
->m_parent
);
381 PatternDisjunction
* parenthesesDisjunction
= m_alternative
->m_parent
;
382 m_alternative
= m_alternative
->m_parent
->m_parent
;
384 PatternTerm
& lastTerm
= m_alternative
->lastTerm();
386 unsigned numParenAlternatives
= parenthesesDisjunction
->m_alternatives
.size();
387 unsigned numBOLAnchoredAlts
= 0;
389 for (unsigned i
= 0; i
< numParenAlternatives
; i
++) {
390 // Bubble up BOL flags
391 if (parenthesesDisjunction
->m_alternatives
[i
]->m_startsWithBOL
)
392 numBOLAnchoredAlts
++;
395 if (numBOLAnchoredAlts
) {
396 m_alternative
->m_containsBOL
= true;
397 // If all the alternatives in parens start with BOL, then so does this one
398 if (numBOLAnchoredAlts
== numParenAlternatives
)
399 m_alternative
->m_startsWithBOL
= true;
402 lastTerm
.parentheses
.lastSubpatternId
= m_pattern
.m_numSubpatterns
;
403 m_invertParentheticalAssertion
= false;
406 void atomBackReference(unsigned subpatternId
)
408 ASSERT(subpatternId
);
409 m_pattern
.m_containsBackreferences
= true;
410 m_pattern
.m_maxBackReference
= std::max(m_pattern
.m_maxBackReference
, subpatternId
);
412 if (subpatternId
> m_pattern
.m_numSubpatterns
) {
413 m_alternative
->m_terms
.append(PatternTerm::ForwardReference());
417 PatternAlternative
* currentAlternative
= m_alternative
;
418 ASSERT(currentAlternative
);
420 // Note to self: if we waited until the AST was baked, we could also remove forwards refs
421 while ((currentAlternative
= currentAlternative
->m_parent
->m_parent
)) {
422 PatternTerm
& term
= currentAlternative
->lastTerm();
423 ASSERT((term
.type
== PatternTerm::TypeParenthesesSubpattern
) || (term
.type
== PatternTerm::TypeParentheticalAssertion
));
425 if ((term
.type
== PatternTerm::TypeParenthesesSubpattern
) && term
.capture() && (subpatternId
== term
.parentheses
.subpatternId
)) {
426 m_alternative
->m_terms
.append(PatternTerm::ForwardReference());
431 m_alternative
->m_terms
.append(PatternTerm(subpatternId
));
434 // deep copy the argument disjunction. If filterStartsWithBOL is true,
435 // skip alternatives with m_startsWithBOL set true.
436 PatternDisjunction
* copyDisjunction(PatternDisjunction
* disjunction
, bool filterStartsWithBOL
= false)
438 PatternDisjunction
* newDisjunction
= 0;
439 for (unsigned alt
= 0; alt
< disjunction
->m_alternatives
.size(); ++alt
) {
440 PatternAlternative
* alternative
= disjunction
->m_alternatives
[alt
];
441 if (!filterStartsWithBOL
|| !alternative
->m_startsWithBOL
) {
442 if (!newDisjunction
) {
443 newDisjunction
= new PatternDisjunction();
444 newDisjunction
->m_parent
= disjunction
->m_parent
;
446 PatternAlternative
* newAlternative
= newDisjunction
->addNewAlternative();
447 for (unsigned i
= 0; i
< alternative
->m_terms
.size(); ++i
)
448 newAlternative
->m_terms
.append(copyTerm(alternative
->m_terms
[i
], filterStartsWithBOL
));
453 m_pattern
.m_disjunctions
.append(newDisjunction
);
454 return newDisjunction
;
457 PatternTerm
copyTerm(PatternTerm
& term
, bool filterStartsWithBOL
= false)
459 if ((term
.type
!= PatternTerm::TypeParenthesesSubpattern
) && (term
.type
!= PatternTerm::TypeParentheticalAssertion
))
460 return PatternTerm(term
);
462 PatternTerm termCopy
= term
;
463 termCopy
.parentheses
.disjunction
= copyDisjunction(termCopy
.parentheses
.disjunction
, filterStartsWithBOL
);
467 void quantifyAtom(unsigned min
, unsigned max
, bool greedy
)
470 ASSERT(m_alternative
->m_terms
.size());
473 m_alternative
->removeLastTerm();
477 PatternTerm
& term
= m_alternative
->lastTerm();
478 ASSERT(term
.type
> PatternTerm::TypeAssertionWordBoundary
);
479 ASSERT((term
.quantityCount
== 1) && (term
.quantityType
== QuantifierFixedCount
));
481 // For any assertion with a zero minimum, not matching is valid and has no effect,
482 // remove it. Otherwise, we need to match as least once, but there is no point
483 // matching more than once, so remove the quantifier. It is not entirely clear
484 // from the spec whether or not this behavior is correct, but I believe this
485 // matches Firefox. :-/
486 if (term
.type
== PatternTerm::TypeParentheticalAssertion
) {
488 m_alternative
->removeLastTerm();
493 term
.quantify(max
, greedy
? QuantifierGreedy
: QuantifierNonGreedy
);
495 term
.quantify(min
, QuantifierFixedCount
);
497 term
.quantify(min
, QuantifierFixedCount
);
498 m_alternative
->m_terms
.append(copyTerm(term
));
499 // NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
500 m_alternative
->lastTerm().quantify((max
== quantifyInfinite
) ? max
: max
- min
, greedy
? QuantifierGreedy
: QuantifierNonGreedy
);
501 if (m_alternative
->lastTerm().type
== PatternTerm::TypeParenthesesSubpattern
)
502 m_alternative
->lastTerm().parentheses
.isCopy
= true;
508 m_alternative
= m_alternative
->m_parent
->addNewAlternative();
511 unsigned setupAlternativeOffsets(PatternAlternative
* alternative
, unsigned currentCallFrameSize
, unsigned initialInputPosition
)
513 alternative
->m_hasFixedSize
= true;
514 unsigned currentInputPosition
= initialInputPosition
;
516 for (unsigned i
= 0; i
< alternative
->m_terms
.size(); ++i
) {
517 PatternTerm
& term
= alternative
->m_terms
[i
];
520 case PatternTerm::TypeAssertionBOL
:
521 case PatternTerm::TypeAssertionEOL
:
522 case PatternTerm::TypeAssertionWordBoundary
:
523 term
.inputPosition
= currentInputPosition
;
526 case PatternTerm::TypeBackReference
:
527 term
.inputPosition
= currentInputPosition
;
528 term
.frameLocation
= currentCallFrameSize
;
529 currentCallFrameSize
+= YarrStackSpaceForBackTrackInfoBackReference
;
530 alternative
->m_hasFixedSize
= false;
533 case PatternTerm::TypeForwardReference
:
536 case PatternTerm::TypePatternCharacter
:
537 term
.inputPosition
= currentInputPosition
;
538 if (term
.quantityType
!= QuantifierFixedCount
) {
539 term
.frameLocation
= currentCallFrameSize
;
540 currentCallFrameSize
+= YarrStackSpaceForBackTrackInfoPatternCharacter
;
541 alternative
->m_hasFixedSize
= false;
543 currentInputPosition
+= term
.quantityCount
;
546 case PatternTerm::TypeCharacterClass
:
547 term
.inputPosition
= currentInputPosition
;
548 if (term
.quantityType
!= QuantifierFixedCount
) {
549 term
.frameLocation
= currentCallFrameSize
;
550 currentCallFrameSize
+= YarrStackSpaceForBackTrackInfoCharacterClass
;
551 alternative
->m_hasFixedSize
= false;
553 currentInputPosition
+= term
.quantityCount
;
556 case PatternTerm::TypeParenthesesSubpattern
:
557 // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
558 term
.frameLocation
= currentCallFrameSize
;
559 if (term
.quantityCount
== 1 && !term
.parentheses
.isCopy
) {
560 if (term
.quantityType
!= QuantifierFixedCount
)
561 currentCallFrameSize
+= YarrStackSpaceForBackTrackInfoParenthesesOnce
;
562 currentCallFrameSize
= setupDisjunctionOffsets(term
.parentheses
.disjunction
, currentCallFrameSize
, currentInputPosition
);
563 // If quantity is fixed, then pre-check its minimum size.
564 if (term
.quantityType
== QuantifierFixedCount
)
565 currentInputPosition
+= term
.parentheses
.disjunction
->m_minimumSize
;
566 term
.inputPosition
= currentInputPosition
;
567 } else if (term
.parentheses
.isTerminal
) {
568 currentCallFrameSize
+= YarrStackSpaceForBackTrackInfoParenthesesTerminal
;
569 currentCallFrameSize
= setupDisjunctionOffsets(term
.parentheses
.disjunction
, currentCallFrameSize
, currentInputPosition
);
570 term
.inputPosition
= currentInputPosition
;
572 term
.inputPosition
= currentInputPosition
;
573 setupDisjunctionOffsets(term
.parentheses
.disjunction
, 0, currentInputPosition
);
574 currentCallFrameSize
+= YarrStackSpaceForBackTrackInfoParentheses
;
576 // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
577 alternative
->m_hasFixedSize
= false;
580 case PatternTerm::TypeParentheticalAssertion
:
581 term
.inputPosition
= currentInputPosition
;
582 term
.frameLocation
= currentCallFrameSize
;
583 currentCallFrameSize
= setupDisjunctionOffsets(term
.parentheses
.disjunction
, currentCallFrameSize
+ YarrStackSpaceForBackTrackInfoParentheticalAssertion
, currentInputPosition
);
586 case PatternTerm::TypeDotStarEnclosure
:
587 alternative
->m_hasFixedSize
= false;
588 term
.inputPosition
= initialInputPosition
;
593 alternative
->m_minimumSize
= currentInputPosition
- initialInputPosition
;
594 return currentCallFrameSize
;
597 unsigned setupDisjunctionOffsets(PatternDisjunction
* disjunction
, unsigned initialCallFrameSize
, unsigned initialInputPosition
)
599 if ((disjunction
!= m_pattern
.m_body
) && (disjunction
->m_alternatives
.size() > 1))
600 initialCallFrameSize
+= YarrStackSpaceForBackTrackInfoAlternative
;
602 unsigned minimumInputSize
= UINT_MAX
;
603 unsigned maximumCallFrameSize
= 0;
604 bool hasFixedSize
= true;
606 for (unsigned alt
= 0; alt
< disjunction
->m_alternatives
.size(); ++alt
) {
607 PatternAlternative
* alternative
= disjunction
->m_alternatives
[alt
];
608 unsigned currentAlternativeCallFrameSize
= setupAlternativeOffsets(alternative
, initialCallFrameSize
, initialInputPosition
);
609 minimumInputSize
= min(minimumInputSize
, alternative
->m_minimumSize
);
610 maximumCallFrameSize
= max(maximumCallFrameSize
, currentAlternativeCallFrameSize
);
611 hasFixedSize
&= alternative
->m_hasFixedSize
;
614 ASSERT(minimumInputSize
!= UINT_MAX
);
615 ASSERT(maximumCallFrameSize
>= initialCallFrameSize
);
617 disjunction
->m_hasFixedSize
= hasFixedSize
;
618 disjunction
->m_minimumSize
= minimumInputSize
;
619 disjunction
->m_callFrameSize
= maximumCallFrameSize
;
620 return maximumCallFrameSize
;
625 setupDisjunctionOffsets(m_pattern
.m_body
, 0, 0);
628 // This optimization identifies sets of parentheses that we will never need to backtrack.
629 // In these cases we do not need to store state from prior iterations.
630 // We can presently avoid backtracking for:
631 // * where the parens are at the end of the regular expression (last term in any of the
632 // alternatives of the main body disjunction).
633 // * where the parens are non-capturing, and quantified unbounded greedy (*).
634 // * where the parens do not contain any capturing subpatterns.
635 void checkForTerminalParentheses()
637 // This check is much too crude; should be just checking whether the candidate
638 // node contains nested capturing subpatterns, not the whole expression!
639 if (m_pattern
.m_numSubpatterns
)
642 Vector
<PatternAlternative
*>& alternatives
= m_pattern
.m_body
->m_alternatives
;
643 for (size_t i
= 0; i
< alternatives
.size(); ++i
) {
644 Vector
<PatternTerm
>& terms
= alternatives
[i
]->m_terms
;
646 PatternTerm
& term
= terms
.last();
647 if (term
.type
== PatternTerm::TypeParenthesesSubpattern
648 && term
.quantityType
== QuantifierGreedy
649 && term
.quantityCount
== quantifyInfinite
651 term
.parentheses
.isTerminal
= true;
658 // Look for expressions containing beginning of line (^) anchoring and unroll them.
659 // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops
660 // This code relies on the parsing code tagging alternatives with m_containsBOL and
661 // m_startsWithBOL and rolling those up to containing alternatives.
662 // At this point, this is only valid for non-multiline expressions.
663 PatternDisjunction
* disjunction
= m_pattern
.m_body
;
665 if (!m_pattern
.m_containsBOL
|| m_pattern
.m_multiline
)
668 PatternDisjunction
* loopDisjunction
= copyDisjunction(disjunction
, true);
670 // Set alternatives in disjunction to "onceThrough"
671 for (unsigned alt
= 0; alt
< disjunction
->m_alternatives
.size(); ++alt
)
672 disjunction
->m_alternatives
[alt
]->setOnceThrough();
674 if (loopDisjunction
) {
675 // Move alternatives from loopDisjunction to disjunction
676 for (unsigned alt
= 0; alt
< loopDisjunction
->m_alternatives
.size(); ++alt
)
677 disjunction
->m_alternatives
.append(loopDisjunction
->m_alternatives
[alt
]);
679 loopDisjunction
->m_alternatives
.clear();
683 bool containsCapturingTerms(PatternAlternative
* alternative
, size_t firstTermIndex
, size_t lastTermIndex
)
685 Vector
<PatternTerm
>& terms
= alternative
->m_terms
;
687 for (size_t termIndex
= firstTermIndex
; termIndex
<= lastTermIndex
; ++termIndex
) {
688 PatternTerm
& term
= terms
[termIndex
];
693 if (term
.type
== PatternTerm::TypeParenthesesSubpattern
) {
694 PatternDisjunction
* nestedDisjunction
= term
.parentheses
.disjunction
;
695 for (unsigned alt
= 0; alt
< nestedDisjunction
->m_alternatives
.size(); ++alt
) {
696 if (containsCapturingTerms(nestedDisjunction
->m_alternatives
[alt
], 0, nestedDisjunction
->m_alternatives
[alt
]->m_terms
.size() - 1))
705 // This optimization identifies alternatives in the form of
706 // [^].*[?]<expression>.*[$] for expressions that don't have any
707 // capturing terms. The alternative is changed to <expression>
708 // followed by processing of the dot stars to find and adjust the
709 // beginning and the end of the match.
710 void optimizeDotStarWrappedExpressions()
712 Vector
<PatternAlternative
*>& alternatives
= m_pattern
.m_body
->m_alternatives
;
713 if (alternatives
.size() != 1)
716 PatternAlternative
* alternative
= alternatives
[0];
717 Vector
<PatternTerm
>& terms
= alternative
->m_terms
;
718 if (terms
.size() >= 3) {
719 bool startsWithBOL
= false;
720 bool endsWithEOL
= false;
721 size_t termIndex
, firstExpressionTerm
, lastExpressionTerm
;
724 if (terms
[termIndex
].type
== PatternTerm::TypeAssertionBOL
) {
725 startsWithBOL
= true;
729 PatternTerm
& firstNonAnchorTerm
= terms
[termIndex
];
730 if ((firstNonAnchorTerm
.type
!= PatternTerm::TypeCharacterClass
) || (firstNonAnchorTerm
.characterClass
!= m_pattern
.newlineCharacterClass()) || !((firstNonAnchorTerm
.quantityType
== QuantifierGreedy
) || (firstNonAnchorTerm
.quantityType
== QuantifierNonGreedy
)))
733 firstExpressionTerm
= termIndex
+ 1;
735 termIndex
= terms
.size() - 1;
736 if (terms
[termIndex
].type
== PatternTerm::TypeAssertionEOL
) {
741 PatternTerm
& lastNonAnchorTerm
= terms
[termIndex
];
742 if ((lastNonAnchorTerm
.type
!= PatternTerm::TypeCharacterClass
) || (lastNonAnchorTerm
.characterClass
!= m_pattern
.newlineCharacterClass()) || (lastNonAnchorTerm
.quantityType
!= QuantifierGreedy
))
745 lastExpressionTerm
= termIndex
- 1;
747 if (firstExpressionTerm
> lastExpressionTerm
)
750 if (!containsCapturingTerms(alternative
, firstExpressionTerm
, lastExpressionTerm
)) {
751 for (termIndex
= terms
.size() - 1; termIndex
> lastExpressionTerm
; --termIndex
)
752 terms
.remove(termIndex
);
754 for (termIndex
= firstExpressionTerm
; termIndex
> 0; --termIndex
)
755 terms
.remove(termIndex
- 1);
757 terms
.append(PatternTerm(startsWithBOL
, endsWithEOL
));
759 m_pattern
.m_containsBOL
= false;
765 YarrPattern
& m_pattern
;
766 PatternAlternative
* m_alternative
;
767 CharacterClassConstructor m_characterClassConstructor
;
768 bool m_invertCharacterClass
;
769 bool m_invertParentheticalAssertion
;
772 const char* YarrPattern::compile(const UString
& patternString
)
774 YarrPatternConstructor
constructor(*this);
776 if (const char* error
= parse(constructor
, patternString
))
779 // If the pattern contains illegal backreferences reset & reparse.
780 // Quoting Netscape's "What's new in JavaScript 1.2",
781 // "Note: if the number of left parentheses is less than the number specified
782 // in \#, the \# is taken as an octal escape as described in the next row."
783 if (containsIllegalBackReference()) {
784 unsigned numSubpatterns
= m_numSubpatterns
;
790 parse(constructor
, patternString
, numSubpatterns
);
793 ASSERT(numSubpatterns
== m_numSubpatterns
);
796 constructor
.checkForTerminalParentheses();
797 constructor
.optimizeDotStarWrappedExpressions();
798 constructor
.optimizeBOL();
800 constructor
.setupOffsets();
805 YarrPattern::YarrPattern(const UString
& pattern
, bool ignoreCase
, bool multiline
, const char** error
)
806 : m_ignoreCase(ignoreCase
)
807 , m_multiline(multiline
)
808 , m_containsBackreferences(false)
809 , m_containsBOL(false)
810 , m_numSubpatterns(0)
811 , m_maxBackReference(0)
818 , nonwordcharCached(0)
820 *error
= compile(pattern
);