2 * Copyright (C) 2009, 2013 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 #include "YarrPattern.h"
31 #include "YarrCanonicalizeUCS2.h"
32 #include "YarrParser.h"
33 #include <wtf/Vector.h>
37 namespace JSC
{ namespace Yarr
{
39 #include "RegExpJitTables.h"
41 class CharacterClassConstructor
{
43 CharacterClassConstructor(bool isCaseInsensitive
= false)
44 : m_isCaseInsensitive(isCaseInsensitive
)
52 m_matchesUnicode
.clear();
53 m_rangesUnicode
.clear();
56 void append(const CharacterClass
* other
)
58 for (size_t i
= 0; i
< other
->m_matches
.size(); ++i
)
59 addSorted(m_matches
, other
->m_matches
[i
]);
60 for (size_t i
= 0; i
< other
->m_ranges
.size(); ++i
)
61 addSortedRange(m_ranges
, other
->m_ranges
[i
].begin
, other
->m_ranges
[i
].end
);
62 for (size_t i
= 0; i
< other
->m_matchesUnicode
.size(); ++i
)
63 addSorted(m_matchesUnicode
, other
->m_matchesUnicode
[i
]);
64 for (size_t i
= 0; i
< other
->m_rangesUnicode
.size(); ++i
)
65 addSortedRange(m_rangesUnicode
, other
->m_rangesUnicode
[i
].begin
, other
->m_rangesUnicode
[i
].end
);
68 void putChar(UChar ch
)
70 // Handle ascii cases.
72 if (m_isCaseInsensitive
&& isASCIIAlpha(ch
)) {
73 addSorted(m_matches
, toASCIIUpper(ch
));
74 addSorted(m_matches
, toASCIILower(ch
));
76 addSorted(m_matches
, ch
);
80 // Simple case, not a case-insensitive match.
81 if (!m_isCaseInsensitive
) {
82 addSorted(m_matchesUnicode
, ch
);
86 // Add multiple matches, if necessary.
87 const UCS2CanonicalizationRange
* info
= rangeInfoFor(ch
);
88 if (info
->type
== CanonicalizeUnique
)
89 addSorted(m_matchesUnicode
, ch
);
91 putUnicodeIgnoreCase(ch
, info
);
94 void putUnicodeIgnoreCase(UChar ch
, const UCS2CanonicalizationRange
* info
)
96 ASSERT(m_isCaseInsensitive
);
98 ASSERT(ch
>= info
->begin
&& ch
<= info
->end
);
99 ASSERT(info
->type
!= CanonicalizeUnique
);
100 if (info
->type
== CanonicalizeSet
) {
101 for (const uint16_t* set
= characterSetInfo
[info
->value
]; (ch
= *set
); ++set
)
102 addSorted(m_matchesUnicode
, ch
);
104 addSorted(m_matchesUnicode
, ch
);
105 addSorted(m_matchesUnicode
, getCanonicalPair(info
, ch
));
109 void putRange(UChar lo
, UChar hi
)
113 char asciiHi
= std::min(hi
, (UChar
)0x7f);
114 addSortedRange(m_ranges
, lo
, asciiHi
);
116 if (m_isCaseInsensitive
) {
117 if ((asciiLo
<= 'Z') && (asciiHi
>= 'A'))
118 addSortedRange(m_ranges
, std::max(asciiLo
, 'A')+('a'-'A'), std::min(asciiHi
, 'Z')+('a'-'A'));
119 if ((asciiLo
<= 'z') && (asciiHi
>= 'a'))
120 addSortedRange(m_ranges
, std::max(asciiLo
, 'a')+('A'-'a'), std::min(asciiHi
, 'z')+('A'-'a'));
126 lo
= std::max(lo
, (UChar
)0x80);
127 addSortedRange(m_rangesUnicode
, lo
, hi
);
129 if (!m_isCaseInsensitive
)
132 const UCS2CanonicalizationRange
* info
= rangeInfoFor(lo
);
134 // Handle the range [lo .. end]
135 UChar end
= std::min
<UChar
>(info
->end
, hi
);
137 switch (info
->type
) {
138 case CanonicalizeUnique
:
139 // Nothing to do - no canonical equivalents.
141 case CanonicalizeSet
: {
143 for (const uint16_t* set
= characterSetInfo
[info
->value
]; (ch
= *set
); ++set
)
144 addSorted(m_matchesUnicode
, ch
);
147 case CanonicalizeRangeLo
:
148 addSortedRange(m_rangesUnicode
, lo
+ info
->value
, end
+ info
->value
);
150 case CanonicalizeRangeHi
:
151 addSortedRange(m_rangesUnicode
, lo
- info
->value
, end
- info
->value
);
153 case CanonicalizeAlternatingAligned
:
154 // Use addSortedRange since there is likely an abutting range to combine with.
156 addSortedRange(m_rangesUnicode
, lo
- 1, lo
- 1);
158 addSortedRange(m_rangesUnicode
, end
+ 1, end
+ 1);
160 case CanonicalizeAlternatingUnaligned
:
161 // Use addSortedRange since there is likely an abutting range to combine with.
163 addSortedRange(m_rangesUnicode
, lo
- 1, lo
- 1);
165 addSortedRange(m_rangesUnicode
, end
+ 1, end
+ 1);
178 PassOwnPtr
<CharacterClass
> charClass()
180 OwnPtr
<CharacterClass
> characterClass
= adoptPtr(new CharacterClass
);
182 characterClass
->m_matches
.swap(m_matches
);
183 characterClass
->m_ranges
.swap(m_ranges
);
184 characterClass
->m_matchesUnicode
.swap(m_matchesUnicode
);
185 characterClass
->m_rangesUnicode
.swap(m_rangesUnicode
);
187 return characterClass
.release();
191 void addSorted(Vector
<UChar
>& matches
, UChar ch
)
194 unsigned range
= matches
.size();
196 // binary chop, find position to insert char.
198 unsigned index
= range
>> 1;
200 int val
= matches
[pos
+index
] - ch
;
211 if (pos
== matches
.size())
214 matches
.insert(pos
, ch
);
217 void addSortedRange(Vector
<CharacterRange
>& ranges
, UChar lo
, UChar hi
)
219 unsigned end
= ranges
.size();
221 // Simple linear scan - I doubt there are that many ranges anyway...
222 // feel free to fix this with something faster (eg binary chop).
223 for (unsigned i
= 0; i
< end
; ++i
) {
224 // does the new range fall before the current position in the array
225 if (hi
< ranges
[i
].begin
) {
226 // optional optimization: concatenate appending ranges? - may not be worthwhile.
227 if (hi
== (ranges
[i
].begin
- 1)) {
228 ranges
[i
].begin
= lo
;
231 ranges
.insert(i
, CharacterRange(lo
, hi
));
234 // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
235 // If the new range start at or before the end of the last range, then the overlap (if it starts one after the
236 // end of the last range they concatenate, which is just as good.
237 if (lo
<= (ranges
[i
].end
+ 1)) {
238 // found an intersect! we'll replace this entry in the array.
239 ranges
[i
].begin
= std::min(ranges
[i
].begin
, lo
);
240 ranges
[i
].end
= std::max(ranges
[i
].end
, hi
);
242 // now check if the new range can subsume any subsequent ranges.
244 // each iteration of the loop we will either remove something from the list, or break the loop.
245 while (next
< ranges
.size()) {
246 if (ranges
[next
].begin
<= (ranges
[i
].end
+ 1)) {
247 // the next entry now overlaps / concatenates this one.
248 ranges
[i
].end
= std::max(ranges
[i
].end
, ranges
[next
].end
);
258 // CharacterRange comes after all existing ranges.
259 ranges
.append(CharacterRange(lo
, hi
));
262 bool m_isCaseInsensitive
;
264 Vector
<UChar
> m_matches
;
265 Vector
<CharacterRange
> m_ranges
;
266 Vector
<UChar
> m_matchesUnicode
;
267 Vector
<CharacterRange
> m_rangesUnicode
;
270 class YarrPatternConstructor
{
272 YarrPatternConstructor(YarrPattern
& pattern
)
274 , m_characterClassConstructor(pattern
.m_ignoreCase
)
275 , m_invertParentheticalAssertion(false)
277 OwnPtr
<PatternDisjunction
> body
= adoptPtr(new PatternDisjunction
);
278 m_pattern
.m_body
= body
.get();
279 m_alternative
= body
->addNewAlternative();
280 m_pattern
.m_disjunctions
.append(body
.release());
283 ~YarrPatternConstructor()
290 m_characterClassConstructor
.reset();
292 OwnPtr
<PatternDisjunction
> body
= adoptPtr(new PatternDisjunction
);
293 m_pattern
.m_body
= body
.get();
294 m_alternative
= body
->addNewAlternative();
295 m_pattern
.m_disjunctions
.append(body
.release());
300 if (!m_alternative
->m_terms
.size() & !m_invertParentheticalAssertion
) {
301 m_alternative
->m_startsWithBOL
= true;
302 m_alternative
->m_containsBOL
= true;
303 m_pattern
.m_containsBOL
= true;
305 m_alternative
->m_terms
.append(PatternTerm::BOL());
309 m_alternative
->m_terms
.append(PatternTerm::EOL());
311 void assertionWordBoundary(bool invert
)
313 m_alternative
->m_terms
.append(PatternTerm::WordBoundary(invert
));
316 void atomPatternCharacter(UChar ch
)
318 // We handle case-insensitive checking of unicode characters which do have both
319 // cases by handling them as if they were defined using a CharacterClass.
320 if (!m_pattern
.m_ignoreCase
|| isASCII(ch
)) {
321 m_alternative
->m_terms
.append(PatternTerm(ch
));
325 const UCS2CanonicalizationRange
* info
= rangeInfoFor(ch
);
326 if (info
->type
== CanonicalizeUnique
) {
327 m_alternative
->m_terms
.append(PatternTerm(ch
));
331 m_characterClassConstructor
.putUnicodeIgnoreCase(ch
, info
);
332 OwnPtr
<CharacterClass
> newCharacterClass
= m_characterClassConstructor
.charClass();
333 m_alternative
->m_terms
.append(PatternTerm(newCharacterClass
.get(), false));
334 m_pattern
.m_userCharacterClasses
.append(newCharacterClass
.release());
337 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID
, bool invert
)
341 m_alternative
->m_terms
.append(PatternTerm(m_pattern
.digitsCharacterClass(), invert
));
344 m_alternative
->m_terms
.append(PatternTerm(m_pattern
.spacesCharacterClass(), invert
));
347 m_alternative
->m_terms
.append(PatternTerm(m_pattern
.wordcharCharacterClass(), invert
));
350 m_alternative
->m_terms
.append(PatternTerm(m_pattern
.newlineCharacterClass(), invert
));
355 void atomCharacterClassBegin(bool invert
= false)
357 m_invertCharacterClass
= invert
;
360 void atomCharacterClassAtom(UChar ch
)
362 m_characterClassConstructor
.putChar(ch
);
365 void atomCharacterClassRange(UChar begin
, UChar end
)
367 m_characterClassConstructor
.putRange(begin
, end
);
370 void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID
, bool invert
)
372 ASSERT(classID
!= NewlineClassID
);
376 m_characterClassConstructor
.append(invert
? m_pattern
.nondigitsCharacterClass() : m_pattern
.digitsCharacterClass());
380 m_characterClassConstructor
.append(invert
? m_pattern
.nonspacesCharacterClass() : m_pattern
.spacesCharacterClass());
384 m_characterClassConstructor
.append(invert
? m_pattern
.nonwordcharCharacterClass() : m_pattern
.wordcharCharacterClass());
388 RELEASE_ASSERT_NOT_REACHED();
392 void atomCharacterClassEnd()
394 OwnPtr
<CharacterClass
> newCharacterClass
= m_characterClassConstructor
.charClass();
395 m_alternative
->m_terms
.append(PatternTerm(newCharacterClass
.get(), m_invertCharacterClass
));
396 m_pattern
.m_userCharacterClasses
.append(newCharacterClass
.release());
399 void atomParenthesesSubpatternBegin(bool capture
= true)
401 unsigned subpatternId
= m_pattern
.m_numSubpatterns
+ 1;
403 m_pattern
.m_numSubpatterns
++;
405 OwnPtr
<PatternDisjunction
> parenthesesDisjunction
= adoptPtr(new PatternDisjunction(m_alternative
));
406 m_alternative
->m_terms
.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern
, subpatternId
, parenthesesDisjunction
.get(), capture
, false));
407 m_alternative
= parenthesesDisjunction
->addNewAlternative();
408 m_pattern
.m_disjunctions
.append(parenthesesDisjunction
.release());
411 void atomParentheticalAssertionBegin(bool invert
= false)
413 OwnPtr
<PatternDisjunction
> parenthesesDisjunction
= adoptPtr(new PatternDisjunction(m_alternative
));
414 m_alternative
->m_terms
.append(PatternTerm(PatternTerm::TypeParentheticalAssertion
, m_pattern
.m_numSubpatterns
+ 1, parenthesesDisjunction
.get(), false, invert
));
415 m_alternative
= parenthesesDisjunction
->addNewAlternative();
416 m_invertParentheticalAssertion
= invert
;
417 m_pattern
.m_disjunctions
.append(parenthesesDisjunction
.release());
420 void atomParenthesesEnd()
422 ASSERT(m_alternative
->m_parent
);
423 ASSERT(m_alternative
->m_parent
->m_parent
);
425 PatternDisjunction
* parenthesesDisjunction
= m_alternative
->m_parent
;
426 m_alternative
= m_alternative
->m_parent
->m_parent
;
428 PatternTerm
& lastTerm
= m_alternative
->lastTerm();
430 unsigned numParenAlternatives
= parenthesesDisjunction
->m_alternatives
.size();
431 unsigned numBOLAnchoredAlts
= 0;
433 for (unsigned i
= 0; i
< numParenAlternatives
; i
++) {
434 // Bubble up BOL flags
435 if (parenthesesDisjunction
->m_alternatives
[i
]->m_startsWithBOL
)
436 numBOLAnchoredAlts
++;
439 if (numBOLAnchoredAlts
) {
440 m_alternative
->m_containsBOL
= true;
441 // If all the alternatives in parens start with BOL, then so does this one
442 if (numBOLAnchoredAlts
== numParenAlternatives
)
443 m_alternative
->m_startsWithBOL
= true;
446 lastTerm
.parentheses
.lastSubpatternId
= m_pattern
.m_numSubpatterns
;
447 m_invertParentheticalAssertion
= false;
450 void atomBackReference(unsigned subpatternId
)
452 ASSERT(subpatternId
);
453 m_pattern
.m_containsBackreferences
= true;
454 m_pattern
.m_maxBackReference
= std::max(m_pattern
.m_maxBackReference
, subpatternId
);
456 if (subpatternId
> m_pattern
.m_numSubpatterns
) {
457 m_alternative
->m_terms
.append(PatternTerm::ForwardReference());
461 PatternAlternative
* currentAlternative
= m_alternative
;
462 ASSERT(currentAlternative
);
464 // Note to self: if we waited until the AST was baked, we could also remove forwards refs
465 while ((currentAlternative
= currentAlternative
->m_parent
->m_parent
)) {
466 PatternTerm
& term
= currentAlternative
->lastTerm();
467 ASSERT((term
.type
== PatternTerm::TypeParenthesesSubpattern
) || (term
.type
== PatternTerm::TypeParentheticalAssertion
));
469 if ((term
.type
== PatternTerm::TypeParenthesesSubpattern
) && term
.capture() && (subpatternId
== term
.parentheses
.subpatternId
)) {
470 m_alternative
->m_terms
.append(PatternTerm::ForwardReference());
475 m_alternative
->m_terms
.append(PatternTerm(subpatternId
));
478 // deep copy the argument disjunction. If filterStartsWithBOL is true,
479 // skip alternatives with m_startsWithBOL set true.
480 PatternDisjunction
* copyDisjunction(PatternDisjunction
* disjunction
, bool filterStartsWithBOL
= false)
482 OwnPtr
<PatternDisjunction
> newDisjunction
;
483 for (unsigned alt
= 0; alt
< disjunction
->m_alternatives
.size(); ++alt
) {
484 PatternAlternative
* alternative
= disjunction
->m_alternatives
[alt
].get();
485 if (!filterStartsWithBOL
|| !alternative
->m_startsWithBOL
) {
486 if (!newDisjunction
) {
487 newDisjunction
= adoptPtr(new PatternDisjunction());
488 newDisjunction
->m_parent
= disjunction
->m_parent
;
490 PatternAlternative
* newAlternative
= newDisjunction
->addNewAlternative();
491 newAlternative
->m_terms
.reserveInitialCapacity(alternative
->m_terms
.size());
492 for (unsigned i
= 0; i
< alternative
->m_terms
.size(); ++i
)
493 newAlternative
->m_terms
.append(copyTerm(alternative
->m_terms
[i
], filterStartsWithBOL
));
500 PatternDisjunction
* copiedDisjunction
= newDisjunction
.get();
501 m_pattern
.m_disjunctions
.append(newDisjunction
.release());
502 return copiedDisjunction
;
505 PatternTerm
copyTerm(PatternTerm
& term
, bool filterStartsWithBOL
= false)
507 if ((term
.type
!= PatternTerm::TypeParenthesesSubpattern
) && (term
.type
!= PatternTerm::TypeParentheticalAssertion
))
508 return PatternTerm(term
);
510 PatternTerm termCopy
= term
;
511 termCopy
.parentheses
.disjunction
= copyDisjunction(termCopy
.parentheses
.disjunction
, filterStartsWithBOL
);
515 void quantifyAtom(unsigned min
, unsigned max
, bool greedy
)
518 ASSERT(m_alternative
->m_terms
.size());
521 m_alternative
->removeLastTerm();
525 PatternTerm
& term
= m_alternative
->lastTerm();
526 ASSERT(term
.type
> PatternTerm::TypeAssertionWordBoundary
);
527 ASSERT((term
.quantityCount
== 1) && (term
.quantityType
== QuantifierFixedCount
));
529 if (term
.type
== PatternTerm::TypeParentheticalAssertion
) {
530 // If an assertion is quantified with a minimum count of zero, it can simply be removed.
531 // This arises from the RepeatMatcher behaviour in the spec. Matching an assertion never
532 // results in any input being consumed, however the continuation passed to the assertion
533 // (called in steps, 8c and 9 of the RepeatMatcher definition, ES5.1 15.10.2.5) will
534 // reject all zero length matches (see step 2.1). A match from the continuation of the
535 // expression will still be accepted regardless (via steps 8a and 11) - the upshot of all
536 // this is that matches from the assertion are not required, and won't be accepted anyway,
537 // so no need to ever run it.
539 m_alternative
->removeLastTerm();
540 // We never need to run an assertion more than once. Subsequent interations will be run
541 // with the same start index (since assertions are non-capturing) and the same captures
542 // (per step 4 of RepeatMatcher in ES5.1 15.10.2.5), and as such will always produce the
543 // same result and captures. If the first match succeeds then the subsequent (min - 1)
544 // matches will too. Any additional optional matches will fail (on the same basis as the
545 // minimum zero quantified assertions, above), but this will still result in a match.
550 term
.quantify(max
, greedy
? QuantifierGreedy
: QuantifierNonGreedy
);
552 term
.quantify(min
, QuantifierFixedCount
);
554 term
.quantify(min
, QuantifierFixedCount
);
555 m_alternative
->m_terms
.append(copyTerm(term
));
556 // NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
557 m_alternative
->lastTerm().quantify((max
== quantifyInfinite
) ? max
: max
- min
, greedy
? QuantifierGreedy
: QuantifierNonGreedy
);
558 if (m_alternative
->lastTerm().type
== PatternTerm::TypeParenthesesSubpattern
)
559 m_alternative
->lastTerm().parentheses
.isCopy
= true;
565 m_alternative
= m_alternative
->m_parent
->addNewAlternative();
568 unsigned setupAlternativeOffsets(PatternAlternative
* alternative
, unsigned currentCallFrameSize
, unsigned initialInputPosition
)
570 alternative
->m_hasFixedSize
= true;
571 Checked
<unsigned> currentInputPosition
= initialInputPosition
;
573 for (unsigned i
= 0; i
< alternative
->m_terms
.size(); ++i
) {
574 PatternTerm
& term
= alternative
->m_terms
[i
];
577 case PatternTerm::TypeAssertionBOL
:
578 case PatternTerm::TypeAssertionEOL
:
579 case PatternTerm::TypeAssertionWordBoundary
:
580 term
.inputPosition
= currentInputPosition
.unsafeGet();
583 case PatternTerm::TypeBackReference
:
584 term
.inputPosition
= currentInputPosition
.unsafeGet();
585 term
.frameLocation
= currentCallFrameSize
;
586 currentCallFrameSize
+= YarrStackSpaceForBackTrackInfoBackReference
;
587 alternative
->m_hasFixedSize
= false;
590 case PatternTerm::TypeForwardReference
:
593 case PatternTerm::TypePatternCharacter
:
594 term
.inputPosition
= currentInputPosition
.unsafeGet();
595 if (term
.quantityType
!= QuantifierFixedCount
) {
596 term
.frameLocation
= currentCallFrameSize
;
597 currentCallFrameSize
+= YarrStackSpaceForBackTrackInfoPatternCharacter
;
598 alternative
->m_hasFixedSize
= false;
600 currentInputPosition
+= term
.quantityCount
;
603 case PatternTerm::TypeCharacterClass
:
604 term
.inputPosition
= currentInputPosition
.unsafeGet();
605 if (term
.quantityType
!= QuantifierFixedCount
) {
606 term
.frameLocation
= currentCallFrameSize
;
607 currentCallFrameSize
+= YarrStackSpaceForBackTrackInfoCharacterClass
;
608 alternative
->m_hasFixedSize
= false;
610 currentInputPosition
+= term
.quantityCount
;
613 case PatternTerm::TypeParenthesesSubpattern
:
614 // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
615 term
.frameLocation
= currentCallFrameSize
;
616 if (term
.quantityCount
== 1 && !term
.parentheses
.isCopy
) {
617 if (term
.quantityType
!= QuantifierFixedCount
)
618 currentCallFrameSize
+= YarrStackSpaceForBackTrackInfoParenthesesOnce
;
619 currentCallFrameSize
= setupDisjunctionOffsets(term
.parentheses
.disjunction
, currentCallFrameSize
, currentInputPosition
.unsafeGet());
620 // If quantity is fixed, then pre-check its minimum size.
621 if (term
.quantityType
== QuantifierFixedCount
)
622 currentInputPosition
+= term
.parentheses
.disjunction
->m_minimumSize
;
623 term
.inputPosition
= currentInputPosition
.unsafeGet();
624 } else if (term
.parentheses
.isTerminal
) {
625 currentCallFrameSize
+= YarrStackSpaceForBackTrackInfoParenthesesTerminal
;
626 currentCallFrameSize
= setupDisjunctionOffsets(term
.parentheses
.disjunction
, currentCallFrameSize
, currentInputPosition
.unsafeGet());
627 term
.inputPosition
= currentInputPosition
.unsafeGet();
629 term
.inputPosition
= currentInputPosition
.unsafeGet();
630 setupDisjunctionOffsets(term
.parentheses
.disjunction
, 0, currentInputPosition
.unsafeGet());
631 currentCallFrameSize
+= YarrStackSpaceForBackTrackInfoParentheses
;
633 // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
634 alternative
->m_hasFixedSize
= false;
637 case PatternTerm::TypeParentheticalAssertion
:
638 term
.inputPosition
= currentInputPosition
.unsafeGet();
639 term
.frameLocation
= currentCallFrameSize
;
640 currentCallFrameSize
= setupDisjunctionOffsets(term
.parentheses
.disjunction
, currentCallFrameSize
+ YarrStackSpaceForBackTrackInfoParentheticalAssertion
, currentInputPosition
.unsafeGet());
643 case PatternTerm::TypeDotStarEnclosure
:
644 alternative
->m_hasFixedSize
= false;
645 term
.inputPosition
= initialInputPosition
;
650 alternative
->m_minimumSize
= (currentInputPosition
- initialInputPosition
).unsafeGet();
651 return currentCallFrameSize
;
654 unsigned setupDisjunctionOffsets(PatternDisjunction
* disjunction
, unsigned initialCallFrameSize
, unsigned initialInputPosition
)
656 if ((disjunction
!= m_pattern
.m_body
) && (disjunction
->m_alternatives
.size() > 1))
657 initialCallFrameSize
+= YarrStackSpaceForBackTrackInfoAlternative
;
659 unsigned minimumInputSize
= UINT_MAX
;
660 unsigned maximumCallFrameSize
= 0;
661 bool hasFixedSize
= true;
663 for (unsigned alt
= 0; alt
< disjunction
->m_alternatives
.size(); ++alt
) {
664 PatternAlternative
* alternative
= disjunction
->m_alternatives
[alt
].get();
665 unsigned currentAlternativeCallFrameSize
= setupAlternativeOffsets(alternative
, initialCallFrameSize
, initialInputPosition
);
666 minimumInputSize
= std::min(minimumInputSize
, alternative
->m_minimumSize
);
667 maximumCallFrameSize
= std::max(maximumCallFrameSize
, currentAlternativeCallFrameSize
);
668 hasFixedSize
&= alternative
->m_hasFixedSize
;
669 if (alternative
->m_minimumSize
> INT_MAX
)
670 m_pattern
.m_containsUnsignedLengthPattern
= true;
673 ASSERT(minimumInputSize
!= UINT_MAX
);
674 ASSERT(maximumCallFrameSize
>= initialCallFrameSize
);
676 disjunction
->m_hasFixedSize
= hasFixedSize
;
677 disjunction
->m_minimumSize
= minimumInputSize
;
678 disjunction
->m_callFrameSize
= maximumCallFrameSize
;
679 return maximumCallFrameSize
;
684 setupDisjunctionOffsets(m_pattern
.m_body
, 0, 0);
687 // This optimization identifies sets of parentheses that we will never need to backtrack.
688 // In these cases we do not need to store state from prior iterations.
689 // We can presently avoid backtracking for:
690 // * where the parens are at the end of the regular expression (last term in any of the
691 // alternatives of the main body disjunction).
692 // * where the parens are non-capturing, and quantified unbounded greedy (*).
693 // * where the parens do not contain any capturing subpatterns.
694 void checkForTerminalParentheses()
696 // This check is much too crude; should be just checking whether the candidate
697 // node contains nested capturing subpatterns, not the whole expression!
698 if (m_pattern
.m_numSubpatterns
)
701 Vector
<OwnPtr
<PatternAlternative
>>& alternatives
= m_pattern
.m_body
->m_alternatives
;
702 for (size_t i
= 0; i
< alternatives
.size(); ++i
) {
703 Vector
<PatternTerm
>& terms
= alternatives
[i
]->m_terms
;
705 PatternTerm
& term
= terms
.last();
706 if (term
.type
== PatternTerm::TypeParenthesesSubpattern
707 && term
.quantityType
== QuantifierGreedy
708 && term
.quantityCount
== quantifyInfinite
710 term
.parentheses
.isTerminal
= true;
717 // Look for expressions containing beginning of line (^) anchoring and unroll them.
718 // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops
719 // This code relies on the parsing code tagging alternatives with m_containsBOL and
720 // m_startsWithBOL and rolling those up to containing alternatives.
721 // At this point, this is only valid for non-multiline expressions.
722 PatternDisjunction
* disjunction
= m_pattern
.m_body
;
724 if (!m_pattern
.m_containsBOL
|| m_pattern
.m_multiline
)
727 PatternDisjunction
* loopDisjunction
= copyDisjunction(disjunction
, true);
729 // Set alternatives in disjunction to "onceThrough"
730 for (unsigned alt
= 0; alt
< disjunction
->m_alternatives
.size(); ++alt
)
731 disjunction
->m_alternatives
[alt
]->setOnceThrough();
733 if (loopDisjunction
) {
734 // Move alternatives from loopDisjunction to disjunction
735 for (unsigned alt
= 0; alt
< loopDisjunction
->m_alternatives
.size(); ++alt
)
736 disjunction
->m_alternatives
.append(loopDisjunction
->m_alternatives
[alt
].release());
738 loopDisjunction
->m_alternatives
.clear();
742 bool containsCapturingTerms(PatternAlternative
* alternative
, size_t firstTermIndex
, size_t lastTermIndex
)
744 Vector
<PatternTerm
>& terms
= alternative
->m_terms
;
746 for (size_t termIndex
= firstTermIndex
; termIndex
<= lastTermIndex
; ++termIndex
) {
747 PatternTerm
& term
= terms
[termIndex
];
752 if (term
.type
== PatternTerm::TypeParenthesesSubpattern
) {
753 PatternDisjunction
* nestedDisjunction
= term
.parentheses
.disjunction
;
754 for (unsigned alt
= 0; alt
< nestedDisjunction
->m_alternatives
.size(); ++alt
) {
755 if (containsCapturingTerms(nestedDisjunction
->m_alternatives
[alt
].get(), 0, nestedDisjunction
->m_alternatives
[alt
]->m_terms
.size() - 1))
764 // This optimization identifies alternatives in the form of
765 // [^].*[?]<expression>.*[$] for expressions that don't have any
766 // capturing terms. The alternative is changed to <expression>
767 // followed by processing of the dot stars to find and adjust the
768 // beginning and the end of the match.
769 void optimizeDotStarWrappedExpressions()
771 Vector
<OwnPtr
<PatternAlternative
>>& alternatives
= m_pattern
.m_body
->m_alternatives
;
772 if (alternatives
.size() != 1)
775 PatternAlternative
* alternative
= alternatives
[0].get();
776 Vector
<PatternTerm
>& terms
= alternative
->m_terms
;
777 if (terms
.size() >= 3) {
778 bool startsWithBOL
= false;
779 bool endsWithEOL
= false;
780 size_t termIndex
, firstExpressionTerm
, lastExpressionTerm
;
783 if (terms
[termIndex
].type
== PatternTerm::TypeAssertionBOL
) {
784 startsWithBOL
= true;
788 PatternTerm
& firstNonAnchorTerm
= terms
[termIndex
];
789 if ((firstNonAnchorTerm
.type
!= PatternTerm::TypeCharacterClass
) || (firstNonAnchorTerm
.characterClass
!= m_pattern
.newlineCharacterClass()) || !((firstNonAnchorTerm
.quantityType
== QuantifierGreedy
) || (firstNonAnchorTerm
.quantityType
== QuantifierNonGreedy
)))
792 firstExpressionTerm
= termIndex
+ 1;
794 termIndex
= terms
.size() - 1;
795 if (terms
[termIndex
].type
== PatternTerm::TypeAssertionEOL
) {
800 PatternTerm
& lastNonAnchorTerm
= terms
[termIndex
];
801 if ((lastNonAnchorTerm
.type
!= PatternTerm::TypeCharacterClass
) || (lastNonAnchorTerm
.characterClass
!= m_pattern
.newlineCharacterClass()) || (lastNonAnchorTerm
.quantityType
!= QuantifierGreedy
))
804 lastExpressionTerm
= termIndex
- 1;
806 if (firstExpressionTerm
> lastExpressionTerm
)
809 if (!containsCapturingTerms(alternative
, firstExpressionTerm
, lastExpressionTerm
)) {
810 for (termIndex
= terms
.size() - 1; termIndex
> lastExpressionTerm
; --termIndex
)
811 terms
.remove(termIndex
);
813 for (termIndex
= firstExpressionTerm
; termIndex
> 0; --termIndex
)
814 terms
.remove(termIndex
- 1);
816 terms
.append(PatternTerm(startsWithBOL
, endsWithEOL
));
818 m_pattern
.m_containsBOL
= false;
824 YarrPattern
& m_pattern
;
825 PatternAlternative
* m_alternative
;
826 CharacterClassConstructor m_characterClassConstructor
;
827 bool m_invertCharacterClass
;
828 bool m_invertParentheticalAssertion
;
831 const char* YarrPattern::compile(const String
& patternString
)
833 YarrPatternConstructor
constructor(*this);
835 if (const char* error
= parse(constructor
, patternString
))
838 // If the pattern contains illegal backreferences reset & reparse.
839 // Quoting Netscape's "What's new in JavaScript 1.2",
840 // "Note: if the number of left parentheses is less than the number specified
841 // in \#, the \# is taken as an octal escape as described in the next row."
842 if (containsIllegalBackReference()) {
843 unsigned numSubpatterns
= m_numSubpatterns
;
849 parse(constructor
, patternString
, numSubpatterns
);
852 ASSERT(numSubpatterns
== m_numSubpatterns
);
855 constructor
.checkForTerminalParentheses();
856 constructor
.optimizeDotStarWrappedExpressions();
857 constructor
.optimizeBOL();
859 constructor
.setupOffsets();
864 YarrPattern::YarrPattern(const String
& pattern
, bool ignoreCase
, bool multiline
, const char** error
)
865 : m_ignoreCase(ignoreCase
)
866 , m_multiline(multiline
)
867 , m_containsBackreferences(false)
868 , m_containsBOL(false)
869 , m_containsUnsignedLengthPattern(false)
870 , m_numSubpatterns(0)
871 , m_maxBackReference(0)
878 , nonwordcharCached(0)
880 *error
= compile(pattern
);