2 * Copyright (C) 2009 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 #include "YarrPattern.h"
31 #include "YarrCanonicalizeUCS2.h"
32 #include "YarrParser.h"
33 #include <wtf/Vector.h>
37 namespace JSC
{ namespace Yarr
{
39 #include "RegExpJitTables.h"
41 class CharacterClassConstructor
{
43 CharacterClassConstructor(bool isCaseInsensitive
= false)
44 : m_isCaseInsensitive(isCaseInsensitive
)
52 m_matchesUnicode
.clear();
53 m_rangesUnicode
.clear();
56 void append(const CharacterClass
* other
)
58 for (size_t i
= 0; i
< other
->m_matches
.size(); ++i
)
59 addSorted(m_matches
, other
->m_matches
[i
]);
60 for (size_t i
= 0; i
< other
->m_ranges
.size(); ++i
)
61 addSortedRange(m_ranges
, other
->m_ranges
[i
].begin
, other
->m_ranges
[i
].end
);
62 for (size_t i
= 0; i
< other
->m_matchesUnicode
.size(); ++i
)
63 addSorted(m_matchesUnicode
, other
->m_matchesUnicode
[i
]);
64 for (size_t i
= 0; i
< other
->m_rangesUnicode
.size(); ++i
)
65 addSortedRange(m_rangesUnicode
, other
->m_rangesUnicode
[i
].begin
, other
->m_rangesUnicode
[i
].end
);
68 void putChar(UChar ch
)
70 // Handle ascii cases.
72 if (m_isCaseInsensitive
&& isASCIIAlpha(ch
)) {
73 addSorted(m_matches
, toASCIIUpper(ch
));
74 addSorted(m_matches
, toASCIILower(ch
));
76 addSorted(m_matches
, ch
);
80 // Simple case, not a case-insensitive match.
81 if (!m_isCaseInsensitive
) {
82 addSorted(m_matchesUnicode
, ch
);
86 // Add multiple matches, if necessary.
87 UCS2CanonicalizationRange
* info
= rangeInfoFor(ch
);
88 if (info
->type
== CanonicalizeUnique
)
89 addSorted(m_matchesUnicode
, ch
);
91 putUnicodeIgnoreCase(ch
, info
);
94 void putUnicodeIgnoreCase(UChar ch
, UCS2CanonicalizationRange
* info
)
96 ASSERT(m_isCaseInsensitive
);
98 ASSERT(ch
>= info
->begin
&& ch
<= info
->end
);
99 ASSERT(info
->type
!= CanonicalizeUnique
);
100 if (info
->type
== CanonicalizeSet
) {
101 for (uint16_t* set
= characterSetInfo
[info
->value
]; (ch
= *set
); ++set
)
102 addSorted(m_matchesUnicode
, ch
);
104 addSorted(m_matchesUnicode
, ch
);
105 addSorted(m_matchesUnicode
, getCanonicalPair(info
, ch
));
109 void putRange(UChar lo
, UChar hi
)
113 char asciiHi
= std::min(hi
, (UChar
)0x7f);
114 addSortedRange(m_ranges
, lo
, asciiHi
);
116 if (m_isCaseInsensitive
) {
117 if ((asciiLo
<= 'Z') && (asciiHi
>= 'A'))
118 addSortedRange(m_ranges
, std::max(asciiLo
, 'A')+('a'-'A'), std::min(asciiHi
, 'Z')+('a'-'A'));
119 if ((asciiLo
<= 'z') && (asciiHi
>= 'a'))
120 addSortedRange(m_ranges
, std::max(asciiLo
, 'a')+('A'-'a'), std::min(asciiHi
, 'z')+('A'-'a'));
126 lo
= std::max(lo
, (UChar
)0x80);
127 addSortedRange(m_rangesUnicode
, lo
, hi
);
129 if (!m_isCaseInsensitive
)
132 UCS2CanonicalizationRange
* info
= rangeInfoFor(lo
);
134 // Handle the range [lo .. end]
135 UChar end
= std::min
<UChar
>(info
->end
, hi
);
137 switch (info
->type
) {
138 case CanonicalizeUnique
:
139 // Nothing to do - no canonical equivalents.
141 case CanonicalizeSet
: {
143 for (uint16_t* set
= characterSetInfo
[info
->value
]; (ch
= *set
); ++set
)
144 addSorted(m_matchesUnicode
, ch
);
147 case CanonicalizeRangeLo
:
148 addSortedRange(m_rangesUnicode
, lo
+ info
->value
, end
+ info
->value
);
150 case CanonicalizeRangeHi
:
151 addSortedRange(m_rangesUnicode
, lo
- info
->value
, end
- info
->value
);
153 case CanonicalizeAlternatingAligned
:
154 // Use addSortedRange since there is likely an abutting range to combine with.
156 addSortedRange(m_rangesUnicode
, lo
- 1, lo
- 1);
158 addSortedRange(m_rangesUnicode
, end
+ 1, end
+ 1);
160 case CanonicalizeAlternatingUnaligned
:
161 // Use addSortedRange since there is likely an abutting range to combine with.
163 addSortedRange(m_rangesUnicode
, lo
- 1, lo
- 1);
165 addSortedRange(m_rangesUnicode
, end
+ 1, end
+ 1);
178 CharacterClass
* charClass()
180 CharacterClass
* characterClass
= new CharacterClass(0);
182 characterClass
->m_matches
.swap(m_matches
);
183 characterClass
->m_ranges
.swap(m_ranges
);
184 characterClass
->m_matchesUnicode
.swap(m_matchesUnicode
);
185 characterClass
->m_rangesUnicode
.swap(m_rangesUnicode
);
187 return characterClass
;
191 void addSorted(Vector
<UChar
>& matches
, UChar ch
)
194 unsigned range
= matches
.size();
196 // binary chop, find position to insert char.
198 unsigned index
= range
>> 1;
200 int val
= matches
[pos
+index
] - ch
;
211 if (pos
== matches
.size())
214 matches
.insert(pos
, ch
);
217 void addSortedRange(Vector
<CharacterRange
>& ranges
, UChar lo
, UChar hi
)
219 unsigned end
= ranges
.size();
221 // Simple linear scan - I doubt there are that many ranges anyway...
222 // feel free to fix this with something faster (eg binary chop).
223 for (unsigned i
= 0; i
< end
; ++i
) {
224 // does the new range fall before the current position in the array
225 if (hi
< ranges
[i
].begin
) {
226 // optional optimization: concatenate appending ranges? - may not be worthwhile.
227 if (hi
== (ranges
[i
].begin
- 1)) {
228 ranges
[i
].begin
= lo
;
231 ranges
.insert(i
, CharacterRange(lo
, hi
));
234 // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
235 // If the new range start at or before the end of the last range, then the overlap (if it starts one after the
236 // end of the last range they concatenate, which is just as good.
237 if (lo
<= (ranges
[i
].end
+ 1)) {
238 // found an intersect! we'll replace this entry in the array.
239 ranges
[i
].begin
= std::min(ranges
[i
].begin
, lo
);
240 ranges
[i
].end
= std::max(ranges
[i
].end
, hi
);
242 // now check if the new range can subsume any subsequent ranges.
244 // each iteration of the loop we will either remove something from the list, or break the loop.
245 while (next
< ranges
.size()) {
246 if (ranges
[next
].begin
<= (ranges
[i
].end
+ 1)) {
247 // the next entry now overlaps / concatenates this one.
248 ranges
[i
].end
= std::max(ranges
[i
].end
, ranges
[next
].end
);
258 // CharacterRange comes after all existing ranges.
259 ranges
.append(CharacterRange(lo
, hi
));
262 bool m_isCaseInsensitive
;
264 Vector
<UChar
> m_matches
;
265 Vector
<CharacterRange
> m_ranges
;
266 Vector
<UChar
> m_matchesUnicode
;
267 Vector
<CharacterRange
> m_rangesUnicode
;
270 class YarrPatternConstructor
{
272 YarrPatternConstructor(YarrPattern
& pattern
)
274 , m_characterClassConstructor(pattern
.m_ignoreCase
)
275 , m_invertParentheticalAssertion(false)
277 m_pattern
.m_body
= new PatternDisjunction();
278 m_alternative
= m_pattern
.m_body
->addNewAlternative();
279 m_pattern
.m_disjunctions
.append(m_pattern
.m_body
);
282 ~YarrPatternConstructor()
289 m_characterClassConstructor
.reset();
291 m_pattern
.m_body
= new PatternDisjunction();
292 m_alternative
= m_pattern
.m_body
->addNewAlternative();
293 m_pattern
.m_disjunctions
.append(m_pattern
.m_body
);
298 if (!m_alternative
->m_terms
.size() & !m_invertParentheticalAssertion
) {
299 m_alternative
->m_startsWithBOL
= true;
300 m_alternative
->m_containsBOL
= true;
301 m_pattern
.m_containsBOL
= true;
303 m_alternative
->m_terms
.append(PatternTerm::BOL());
307 m_alternative
->m_terms
.append(PatternTerm::EOL());
309 void assertionWordBoundary(bool invert
)
311 m_alternative
->m_terms
.append(PatternTerm::WordBoundary(invert
));
314 void atomPatternCharacter(UChar ch
)
316 // We handle case-insensitive checking of unicode characters which do have both
317 // cases by handling them as if they were defined using a CharacterClass.
318 if (!m_pattern
.m_ignoreCase
|| isASCII(ch
)) {
319 m_alternative
->m_terms
.append(PatternTerm(ch
));
323 UCS2CanonicalizationRange
* info
= rangeInfoFor(ch
);
324 if (info
->type
== CanonicalizeUnique
) {
325 m_alternative
->m_terms
.append(PatternTerm(ch
));
329 m_characterClassConstructor
.putUnicodeIgnoreCase(ch
, info
);
330 CharacterClass
* newCharacterClass
= m_characterClassConstructor
.charClass();
331 m_pattern
.m_userCharacterClasses
.append(newCharacterClass
);
332 m_alternative
->m_terms
.append(PatternTerm(newCharacterClass
, false));
335 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID
, bool invert
)
339 m_alternative
->m_terms
.append(PatternTerm(m_pattern
.digitsCharacterClass(), invert
));
342 m_alternative
->m_terms
.append(PatternTerm(m_pattern
.spacesCharacterClass(), invert
));
345 m_alternative
->m_terms
.append(PatternTerm(m_pattern
.wordcharCharacterClass(), invert
));
348 m_alternative
->m_terms
.append(PatternTerm(m_pattern
.newlineCharacterClass(), invert
));
353 void atomCharacterClassBegin(bool invert
= false)
355 m_invertCharacterClass
= invert
;
358 void atomCharacterClassAtom(UChar ch
)
360 m_characterClassConstructor
.putChar(ch
);
363 void atomCharacterClassRange(UChar begin
, UChar end
)
365 m_characterClassConstructor
.putRange(begin
, end
);
368 void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID
, bool invert
)
370 ASSERT(classID
!= NewlineClassID
);
374 m_characterClassConstructor
.append(invert
? m_pattern
.nondigitsCharacterClass() : m_pattern
.digitsCharacterClass());
378 m_characterClassConstructor
.append(invert
? m_pattern
.nonspacesCharacterClass() : m_pattern
.spacesCharacterClass());
382 m_characterClassConstructor
.append(invert
? m_pattern
.nonwordcharCharacterClass() : m_pattern
.wordcharCharacterClass());
386 ASSERT_NOT_REACHED();
390 void atomCharacterClassEnd()
392 CharacterClass
* newCharacterClass
= m_characterClassConstructor
.charClass();
393 m_pattern
.m_userCharacterClasses
.append(newCharacterClass
);
394 m_alternative
->m_terms
.append(PatternTerm(newCharacterClass
, m_invertCharacterClass
));
397 void atomParenthesesSubpatternBegin(bool capture
= true)
399 unsigned subpatternId
= m_pattern
.m_numSubpatterns
+ 1;
401 m_pattern
.m_numSubpatterns
++;
403 PatternDisjunction
* parenthesesDisjunction
= new PatternDisjunction(m_alternative
);
404 m_pattern
.m_disjunctions
.append(parenthesesDisjunction
);
405 m_alternative
->m_terms
.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern
, subpatternId
, parenthesesDisjunction
, capture
, false));
406 m_alternative
= parenthesesDisjunction
->addNewAlternative();
409 void atomParentheticalAssertionBegin(bool invert
= false)
411 PatternDisjunction
* parenthesesDisjunction
= new PatternDisjunction(m_alternative
);
412 m_pattern
.m_disjunctions
.append(parenthesesDisjunction
);
413 m_alternative
->m_terms
.append(PatternTerm(PatternTerm::TypeParentheticalAssertion
, m_pattern
.m_numSubpatterns
+ 1, parenthesesDisjunction
, false, invert
));
414 m_alternative
= parenthesesDisjunction
->addNewAlternative();
415 m_invertParentheticalAssertion
= invert
;
418 void atomParenthesesEnd()
420 ASSERT(m_alternative
->m_parent
);
421 ASSERT(m_alternative
->m_parent
->m_parent
);
423 PatternDisjunction
* parenthesesDisjunction
= m_alternative
->m_parent
;
424 m_alternative
= m_alternative
->m_parent
->m_parent
;
426 PatternTerm
& lastTerm
= m_alternative
->lastTerm();
428 unsigned numParenAlternatives
= parenthesesDisjunction
->m_alternatives
.size();
429 unsigned numBOLAnchoredAlts
= 0;
431 for (unsigned i
= 0; i
< numParenAlternatives
; i
++) {
432 // Bubble up BOL flags
433 if (parenthesesDisjunction
->m_alternatives
[i
]->m_startsWithBOL
)
434 numBOLAnchoredAlts
++;
437 if (numBOLAnchoredAlts
) {
438 m_alternative
->m_containsBOL
= true;
439 // If all the alternatives in parens start with BOL, then so does this one
440 if (numBOLAnchoredAlts
== numParenAlternatives
)
441 m_alternative
->m_startsWithBOL
= true;
444 lastTerm
.parentheses
.lastSubpatternId
= m_pattern
.m_numSubpatterns
;
445 m_invertParentheticalAssertion
= false;
448 void atomBackReference(unsigned subpatternId
)
450 ASSERT(subpatternId
);
451 m_pattern
.m_containsBackreferences
= true;
452 m_pattern
.m_maxBackReference
= std::max(m_pattern
.m_maxBackReference
, subpatternId
);
454 if (subpatternId
> m_pattern
.m_numSubpatterns
) {
455 m_alternative
->m_terms
.append(PatternTerm::ForwardReference());
459 PatternAlternative
* currentAlternative
= m_alternative
;
460 ASSERT(currentAlternative
);
462 // Note to self: if we waited until the AST was baked, we could also remove forwards refs
463 while ((currentAlternative
= currentAlternative
->m_parent
->m_parent
)) {
464 PatternTerm
& term
= currentAlternative
->lastTerm();
465 ASSERT((term
.type
== PatternTerm::TypeParenthesesSubpattern
) || (term
.type
== PatternTerm::TypeParentheticalAssertion
));
467 if ((term
.type
== PatternTerm::TypeParenthesesSubpattern
) && term
.capture() && (subpatternId
== term
.parentheses
.subpatternId
)) {
468 m_alternative
->m_terms
.append(PatternTerm::ForwardReference());
473 m_alternative
->m_terms
.append(PatternTerm(subpatternId
));
476 // deep copy the argument disjunction. If filterStartsWithBOL is true,
477 // skip alternatives with m_startsWithBOL set true.
478 PatternDisjunction
* copyDisjunction(PatternDisjunction
* disjunction
, bool filterStartsWithBOL
= false)
480 PatternDisjunction
* newDisjunction
= 0;
481 for (unsigned alt
= 0; alt
< disjunction
->m_alternatives
.size(); ++alt
) {
482 PatternAlternative
* alternative
= disjunction
->m_alternatives
[alt
];
483 if (!filterStartsWithBOL
|| !alternative
->m_startsWithBOL
) {
484 if (!newDisjunction
) {
485 newDisjunction
= new PatternDisjunction();
486 newDisjunction
->m_parent
= disjunction
->m_parent
;
488 PatternAlternative
* newAlternative
= newDisjunction
->addNewAlternative();
489 for (unsigned i
= 0; i
< alternative
->m_terms
.size(); ++i
)
490 newAlternative
->m_terms
.append(copyTerm(alternative
->m_terms
[i
], filterStartsWithBOL
));
495 m_pattern
.m_disjunctions
.append(newDisjunction
);
496 return newDisjunction
;
499 PatternTerm
copyTerm(PatternTerm
& term
, bool filterStartsWithBOL
= false)
501 if ((term
.type
!= PatternTerm::TypeParenthesesSubpattern
) && (term
.type
!= PatternTerm::TypeParentheticalAssertion
))
502 return PatternTerm(term
);
504 PatternTerm termCopy
= term
;
505 termCopy
.parentheses
.disjunction
= copyDisjunction(termCopy
.parentheses
.disjunction
, filterStartsWithBOL
);
509 void quantifyAtom(unsigned min
, unsigned max
, bool greedy
)
512 ASSERT(m_alternative
->m_terms
.size());
515 m_alternative
->removeLastTerm();
519 PatternTerm
& term
= m_alternative
->lastTerm();
520 ASSERT(term
.type
> PatternTerm::TypeAssertionWordBoundary
);
521 ASSERT((term
.quantityCount
== 1) && (term
.quantityType
== QuantifierFixedCount
));
523 if (term
.type
== PatternTerm::TypeParentheticalAssertion
) {
524 // If an assertion is quantified with a minimum count of zero, it can simply be removed.
525 // This arises from the RepeatMatcher behaviour in the spec. Matching an assertion never
526 // results in any input being consumed, however the continuation passed to the assertion
527 // (called in steps, 8c and 9 of the RepeatMatcher definition, ES5.1 15.10.2.5) will
528 // reject all zero length matches (see step 2.1). A match from the continuation of the
529 // expression will still be accepted regardless (via steps 8a and 11) - the upshot of all
530 // this is that matches from the assertion are not required, and won't be accepted anyway,
531 // so no need to ever run it.
533 m_alternative
->removeLastTerm();
534 // We never need to run an assertion more than once. Subsequent interations will be run
535 // with the same start index (since assertions are non-capturing) and the same captures
536 // (per step 4 of RepeatMatcher in ES5.1 15.10.2.5), and as such will always produce the
537 // same result and captures. If the first match succeeds then the subsequent (min - 1)
538 // matches will too. Any additional optional matches will fail (on the same basis as the
539 // minimum zero quantified assertions, above), but this will still result in a match.
544 term
.quantify(max
, greedy
? QuantifierGreedy
: QuantifierNonGreedy
);
546 term
.quantify(min
, QuantifierFixedCount
);
548 term
.quantify(min
, QuantifierFixedCount
);
549 m_alternative
->m_terms
.append(copyTerm(term
));
550 // NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
551 m_alternative
->lastTerm().quantify((max
== quantifyInfinite
) ? max
: max
- min
, greedy
? QuantifierGreedy
: QuantifierNonGreedy
);
552 if (m_alternative
->lastTerm().type
== PatternTerm::TypeParenthesesSubpattern
)
553 m_alternative
->lastTerm().parentheses
.isCopy
= true;
559 m_alternative
= m_alternative
->m_parent
->addNewAlternative();
562 unsigned setupAlternativeOffsets(PatternAlternative
* alternative
, unsigned currentCallFrameSize
, unsigned initialInputPosition
)
564 alternative
->m_hasFixedSize
= true;
565 Checked
<unsigned> currentInputPosition
= initialInputPosition
;
567 for (unsigned i
= 0; i
< alternative
->m_terms
.size(); ++i
) {
568 PatternTerm
& term
= alternative
->m_terms
[i
];
571 case PatternTerm::TypeAssertionBOL
:
572 case PatternTerm::TypeAssertionEOL
:
573 case PatternTerm::TypeAssertionWordBoundary
:
574 term
.inputPosition
= currentInputPosition
.unsafeGet();
577 case PatternTerm::TypeBackReference
:
578 term
.inputPosition
= currentInputPosition
.unsafeGet();
579 term
.frameLocation
= currentCallFrameSize
;
580 currentCallFrameSize
+= YarrStackSpaceForBackTrackInfoBackReference
;
581 alternative
->m_hasFixedSize
= false;
584 case PatternTerm::TypeForwardReference
:
587 case PatternTerm::TypePatternCharacter
:
588 term
.inputPosition
= currentInputPosition
.unsafeGet();
589 if (term
.quantityType
!= QuantifierFixedCount
) {
590 term
.frameLocation
= currentCallFrameSize
;
591 currentCallFrameSize
+= YarrStackSpaceForBackTrackInfoPatternCharacter
;
592 alternative
->m_hasFixedSize
= false;
594 currentInputPosition
+= term
.quantityCount
;
597 case PatternTerm::TypeCharacterClass
:
598 term
.inputPosition
= currentInputPosition
.unsafeGet();
599 if (term
.quantityType
!= QuantifierFixedCount
) {
600 term
.frameLocation
= currentCallFrameSize
;
601 currentCallFrameSize
+= YarrStackSpaceForBackTrackInfoCharacterClass
;
602 alternative
->m_hasFixedSize
= false;
604 currentInputPosition
+= term
.quantityCount
;
607 case PatternTerm::TypeParenthesesSubpattern
:
608 // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
609 term
.frameLocation
= currentCallFrameSize
;
610 if (term
.quantityCount
== 1 && !term
.parentheses
.isCopy
) {
611 if (term
.quantityType
!= QuantifierFixedCount
)
612 currentCallFrameSize
+= YarrStackSpaceForBackTrackInfoParenthesesOnce
;
613 currentCallFrameSize
= setupDisjunctionOffsets(term
.parentheses
.disjunction
, currentCallFrameSize
, currentInputPosition
.unsafeGet());
614 // If quantity is fixed, then pre-check its minimum size.
615 if (term
.quantityType
== QuantifierFixedCount
)
616 currentInputPosition
+= term
.parentheses
.disjunction
->m_minimumSize
;
617 term
.inputPosition
= currentInputPosition
.unsafeGet();
618 } else if (term
.parentheses
.isTerminal
) {
619 currentCallFrameSize
+= YarrStackSpaceForBackTrackInfoParenthesesTerminal
;
620 currentCallFrameSize
= setupDisjunctionOffsets(term
.parentheses
.disjunction
, currentCallFrameSize
, currentInputPosition
.unsafeGet());
621 term
.inputPosition
= currentInputPosition
.unsafeGet();
623 term
.inputPosition
= currentInputPosition
.unsafeGet();
624 setupDisjunctionOffsets(term
.parentheses
.disjunction
, 0, currentInputPosition
.unsafeGet());
625 currentCallFrameSize
+= YarrStackSpaceForBackTrackInfoParentheses
;
627 // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
628 alternative
->m_hasFixedSize
= false;
631 case PatternTerm::TypeParentheticalAssertion
:
632 term
.inputPosition
= currentInputPosition
.unsafeGet();
633 term
.frameLocation
= currentCallFrameSize
;
634 currentCallFrameSize
= setupDisjunctionOffsets(term
.parentheses
.disjunction
, currentCallFrameSize
+ YarrStackSpaceForBackTrackInfoParentheticalAssertion
, currentInputPosition
.unsafeGet());
637 case PatternTerm::TypeDotStarEnclosure
:
638 alternative
->m_hasFixedSize
= false;
639 term
.inputPosition
= initialInputPosition
;
644 alternative
->m_minimumSize
= (currentInputPosition
- initialInputPosition
).unsafeGet();
645 return currentCallFrameSize
;
648 unsigned setupDisjunctionOffsets(PatternDisjunction
* disjunction
, unsigned initialCallFrameSize
, unsigned initialInputPosition
)
650 if ((disjunction
!= m_pattern
.m_body
) && (disjunction
->m_alternatives
.size() > 1))
651 initialCallFrameSize
+= YarrStackSpaceForBackTrackInfoAlternative
;
653 unsigned minimumInputSize
= UINT_MAX
;
654 unsigned maximumCallFrameSize
= 0;
655 bool hasFixedSize
= true;
657 for (unsigned alt
= 0; alt
< disjunction
->m_alternatives
.size(); ++alt
) {
658 PatternAlternative
* alternative
= disjunction
->m_alternatives
[alt
];
659 unsigned currentAlternativeCallFrameSize
= setupAlternativeOffsets(alternative
, initialCallFrameSize
, initialInputPosition
);
660 minimumInputSize
= min(minimumInputSize
, alternative
->m_minimumSize
);
661 maximumCallFrameSize
= max(maximumCallFrameSize
, currentAlternativeCallFrameSize
);
662 hasFixedSize
&= alternative
->m_hasFixedSize
;
665 ASSERT(minimumInputSize
!= UINT_MAX
);
666 ASSERT(maximumCallFrameSize
>= initialCallFrameSize
);
668 disjunction
->m_hasFixedSize
= hasFixedSize
;
669 disjunction
->m_minimumSize
= minimumInputSize
;
670 disjunction
->m_callFrameSize
= maximumCallFrameSize
;
671 return maximumCallFrameSize
;
676 setupDisjunctionOffsets(m_pattern
.m_body
, 0, 0);
679 // This optimization identifies sets of parentheses that we will never need to backtrack.
680 // In these cases we do not need to store state from prior iterations.
681 // We can presently avoid backtracking for:
682 // * where the parens are at the end of the regular expression (last term in any of the
683 // alternatives of the main body disjunction).
684 // * where the parens are non-capturing, and quantified unbounded greedy (*).
685 // * where the parens do not contain any capturing subpatterns.
686 void checkForTerminalParentheses()
688 // This check is much too crude; should be just checking whether the candidate
689 // node contains nested capturing subpatterns, not the whole expression!
690 if (m_pattern
.m_numSubpatterns
)
693 Vector
<PatternAlternative
*>& alternatives
= m_pattern
.m_body
->m_alternatives
;
694 for (size_t i
= 0; i
< alternatives
.size(); ++i
) {
695 Vector
<PatternTerm
>& terms
= alternatives
[i
]->m_terms
;
697 PatternTerm
& term
= terms
.last();
698 if (term
.type
== PatternTerm::TypeParenthesesSubpattern
699 && term
.quantityType
== QuantifierGreedy
700 && term
.quantityCount
== quantifyInfinite
702 term
.parentheses
.isTerminal
= true;
709 // Look for expressions containing beginning of line (^) anchoring and unroll them.
710 // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops
711 // This code relies on the parsing code tagging alternatives with m_containsBOL and
712 // m_startsWithBOL and rolling those up to containing alternatives.
713 // At this point, this is only valid for non-multiline expressions.
714 PatternDisjunction
* disjunction
= m_pattern
.m_body
;
716 if (!m_pattern
.m_containsBOL
|| m_pattern
.m_multiline
)
719 PatternDisjunction
* loopDisjunction
= copyDisjunction(disjunction
, true);
721 // Set alternatives in disjunction to "onceThrough"
722 for (unsigned alt
= 0; alt
< disjunction
->m_alternatives
.size(); ++alt
)
723 disjunction
->m_alternatives
[alt
]->setOnceThrough();
725 if (loopDisjunction
) {
726 // Move alternatives from loopDisjunction to disjunction
727 for (unsigned alt
= 0; alt
< loopDisjunction
->m_alternatives
.size(); ++alt
)
728 disjunction
->m_alternatives
.append(loopDisjunction
->m_alternatives
[alt
]);
730 loopDisjunction
->m_alternatives
.clear();
734 bool containsCapturingTerms(PatternAlternative
* alternative
, size_t firstTermIndex
, size_t lastTermIndex
)
736 Vector
<PatternTerm
>& terms
= alternative
->m_terms
;
738 for (size_t termIndex
= firstTermIndex
; termIndex
<= lastTermIndex
; ++termIndex
) {
739 PatternTerm
& term
= terms
[termIndex
];
744 if (term
.type
== PatternTerm::TypeParenthesesSubpattern
) {
745 PatternDisjunction
* nestedDisjunction
= term
.parentheses
.disjunction
;
746 for (unsigned alt
= 0; alt
< nestedDisjunction
->m_alternatives
.size(); ++alt
) {
747 if (containsCapturingTerms(nestedDisjunction
->m_alternatives
[alt
], 0, nestedDisjunction
->m_alternatives
[alt
]->m_terms
.size() - 1))
756 // This optimization identifies alternatives in the form of
757 // [^].*[?]<expression>.*[$] for expressions that don't have any
758 // capturing terms. The alternative is changed to <expression>
759 // followed by processing of the dot stars to find and adjust the
760 // beginning and the end of the match.
761 void optimizeDotStarWrappedExpressions()
763 Vector
<PatternAlternative
*>& alternatives
= m_pattern
.m_body
->m_alternatives
;
764 if (alternatives
.size() != 1)
767 PatternAlternative
* alternative
= alternatives
[0];
768 Vector
<PatternTerm
>& terms
= alternative
->m_terms
;
769 if (terms
.size() >= 3) {
770 bool startsWithBOL
= false;
771 bool endsWithEOL
= false;
772 size_t termIndex
, firstExpressionTerm
, lastExpressionTerm
;
775 if (terms
[termIndex
].type
== PatternTerm::TypeAssertionBOL
) {
776 startsWithBOL
= true;
780 PatternTerm
& firstNonAnchorTerm
= terms
[termIndex
];
781 if ((firstNonAnchorTerm
.type
!= PatternTerm::TypeCharacterClass
) || (firstNonAnchorTerm
.characterClass
!= m_pattern
.newlineCharacterClass()) || !((firstNonAnchorTerm
.quantityType
== QuantifierGreedy
) || (firstNonAnchorTerm
.quantityType
== QuantifierNonGreedy
)))
784 firstExpressionTerm
= termIndex
+ 1;
786 termIndex
= terms
.size() - 1;
787 if (terms
[termIndex
].type
== PatternTerm::TypeAssertionEOL
) {
792 PatternTerm
& lastNonAnchorTerm
= terms
[termIndex
];
793 if ((lastNonAnchorTerm
.type
!= PatternTerm::TypeCharacterClass
) || (lastNonAnchorTerm
.characterClass
!= m_pattern
.newlineCharacterClass()) || (lastNonAnchorTerm
.quantityType
!= QuantifierGreedy
))
796 lastExpressionTerm
= termIndex
- 1;
798 if (firstExpressionTerm
> lastExpressionTerm
)
801 if (!containsCapturingTerms(alternative
, firstExpressionTerm
, lastExpressionTerm
)) {
802 for (termIndex
= terms
.size() - 1; termIndex
> lastExpressionTerm
; --termIndex
)
803 terms
.remove(termIndex
);
805 for (termIndex
= firstExpressionTerm
; termIndex
> 0; --termIndex
)
806 terms
.remove(termIndex
- 1);
808 terms
.append(PatternTerm(startsWithBOL
, endsWithEOL
));
810 m_pattern
.m_containsBOL
= false;
816 YarrPattern
& m_pattern
;
817 PatternAlternative
* m_alternative
;
818 CharacterClassConstructor m_characterClassConstructor
;
819 bool m_invertCharacterClass
;
820 bool m_invertParentheticalAssertion
;
823 const char* YarrPattern::compile(const UString
& patternString
)
825 YarrPatternConstructor
constructor(*this);
827 if (const char* error
= parse(constructor
, patternString
))
830 // If the pattern contains illegal backreferences reset & reparse.
831 // Quoting Netscape's "What's new in JavaScript 1.2",
832 // "Note: if the number of left parentheses is less than the number specified
833 // in \#, the \# is taken as an octal escape as described in the next row."
834 if (containsIllegalBackReference()) {
835 unsigned numSubpatterns
= m_numSubpatterns
;
841 parse(constructor
, patternString
, numSubpatterns
);
844 ASSERT(numSubpatterns
== m_numSubpatterns
);
847 constructor
.checkForTerminalParentheses();
848 constructor
.optimizeDotStarWrappedExpressions();
849 constructor
.optimizeBOL();
851 constructor
.setupOffsets();
856 YarrPattern::YarrPattern(const UString
& pattern
, bool ignoreCase
, bool multiline
, const char** error
)
857 : m_ignoreCase(ignoreCase
)
858 , m_multiline(multiline
)
859 , m_containsBackreferences(false)
860 , m_containsBOL(false)
861 , m_numSubpatterns(0)
862 , m_maxBackReference(0)
869 , nonwordcharCached(0)
871 *error
= compile(pattern
);