2 * Copyright (C) 2009 Apple Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 #include "RegexCompiler.h"
29 #include "RegexInterpreter.h"
30 #include "RegexPattern.h"
31 #include <wtf/Vector.h>
37 namespace JSC
{ namespace Yarr
{
39 #include "RegExpJitTables.h"
41 class CharacterClassConstructor
{
43 CharacterClassConstructor(bool isCaseInsensitive
= false)
44 : m_isCaseInsensitive(isCaseInsensitive
)
52 m_matchesUnicode
.clear();
53 m_rangesUnicode
.clear();
56 void append(const CharacterClass
* other
)
58 for (size_t i
= 0; i
< other
->m_matches
.size(); ++i
)
59 addSorted(m_matches
, other
->m_matches
[i
]);
60 for (size_t i
= 0; i
< other
->m_ranges
.size(); ++i
)
61 addSortedRange(m_ranges
, other
->m_ranges
[i
].begin
, other
->m_ranges
[i
].end
);
62 for (size_t i
= 0; i
< other
->m_matchesUnicode
.size(); ++i
)
63 addSorted(m_matchesUnicode
, other
->m_matchesUnicode
[i
]);
64 for (size_t i
= 0; i
< other
->m_rangesUnicode
.size(); ++i
)
65 addSortedRange(m_rangesUnicode
, other
->m_rangesUnicode
[i
].begin
, other
->m_rangesUnicode
[i
].end
);
68 void putChar(UChar ch
)
71 if (m_isCaseInsensitive
&& isASCIIAlpha(ch
)) {
72 addSorted(m_matches
, toASCIIUpper(ch
));
73 addSorted(m_matches
, toASCIILower(ch
));
75 addSorted(m_matches
, ch
);
78 if (m_isCaseInsensitive
&& ((upper
= Unicode::toUpper(ch
)) != (lower
= Unicode::toLower(ch
)))) {
79 addSorted(m_matchesUnicode
, upper
);
80 addSorted(m_matchesUnicode
, lower
);
82 addSorted(m_matchesUnicode
, ch
);
86 // returns true if this character has another case, and 'ch' is the upper case form.
87 static inline bool isUnicodeUpper(UChar ch
)
89 return ch
!= Unicode::toLower(ch
);
92 // returns true if this character has another case, and 'ch' is the lower case form.
93 static inline bool isUnicodeLower(UChar ch
)
95 return ch
!= Unicode::toUpper(ch
);
98 void putRange(UChar lo
, UChar hi
)
102 char asciiHi
= std::min(hi
, (UChar
)0x7f);
103 addSortedRange(m_ranges
, lo
, asciiHi
);
105 if (m_isCaseInsensitive
) {
106 if ((asciiLo
<= 'Z') && (asciiHi
>= 'A'))
107 addSortedRange(m_ranges
, std::max(asciiLo
, 'A')+('a'-'A'), std::min(asciiHi
, 'Z')+('a'-'A'));
108 if ((asciiLo
<= 'z') && (asciiHi
>= 'a'))
109 addSortedRange(m_ranges
, std::max(asciiLo
, 'a')+('A'-'a'), std::min(asciiHi
, 'z')+('A'-'a'));
113 uint32_t unicodeCurr
= std::max(lo
, (UChar
)0x80);
114 addSortedRange(m_rangesUnicode
, unicodeCurr
, hi
);
116 if (m_isCaseInsensitive
) {
117 while (unicodeCurr
<= hi
) {
118 // If the upper bound of the range (hi) is 0xffff, the increments to
119 // unicodeCurr in this loop may take it to 0x10000. This is fine
120 // (if so we won't re-enter the loop, since the loop condition above
121 // will definitely fail) - but this does mean we cannot use a UChar
122 // to represent unicodeCurr, we must use a 32-bit value instead.
123 ASSERT(unicodeCurr
<= 0xffff);
125 if (isUnicodeUpper(unicodeCurr
)) {
126 UChar lowerCaseRangeBegin
= Unicode::toLower(unicodeCurr
);
127 UChar lowerCaseRangeEnd
= lowerCaseRangeBegin
;
128 while ((++unicodeCurr
<= hi
) && isUnicodeUpper(unicodeCurr
) && (Unicode::toLower(unicodeCurr
) == (lowerCaseRangeEnd
+ 1)))
130 addSortedRange(m_rangesUnicode
, lowerCaseRangeBegin
, lowerCaseRangeEnd
);
131 } else if (isUnicodeLower(unicodeCurr
)) {
132 UChar upperCaseRangeBegin
= Unicode::toUpper(unicodeCurr
);
133 UChar upperCaseRangeEnd
= upperCaseRangeBegin
;
134 while ((++unicodeCurr
<= hi
) && isUnicodeLower(unicodeCurr
) && (Unicode::toUpper(unicodeCurr
) == (upperCaseRangeEnd
+ 1)))
136 addSortedRange(m_rangesUnicode
, upperCaseRangeBegin
, upperCaseRangeEnd
);
144 CharacterClass
* charClass()
146 CharacterClass
* characterClass
= new CharacterClass(0);
148 characterClass
->m_matches
.append(m_matches
);
149 characterClass
->m_ranges
.append(m_ranges
);
150 characterClass
->m_matchesUnicode
.append(m_matchesUnicode
);
151 characterClass
->m_rangesUnicode
.append(m_rangesUnicode
);
155 return characterClass
;
159 void addSorted(Vector
<UChar
>& matches
, UChar ch
)
162 unsigned range
= matches
.size();
164 // binary chop, find position to insert char.
166 unsigned index
= range
>> 1;
168 int val
= matches
[pos
+index
] - ch
;
179 if (pos
== matches
.size())
182 matches
.insert(pos
, ch
);
185 void addSortedRange(Vector
<CharacterRange
>& ranges
, UChar lo
, UChar hi
)
187 unsigned end
= ranges
.size();
189 // Simple linear scan - I doubt there are that many ranges anyway...
190 // feel free to fix this with something faster (eg binary chop).
191 for (unsigned i
= 0; i
< end
; ++i
) {
192 // does the new range fall before the current position in the array
193 if (hi
< ranges
[i
].begin
) {
194 // optional optimization: concatenate appending ranges? - may not be worthwhile.
195 if (hi
== (ranges
[i
].begin
- 1)) {
196 ranges
[i
].begin
= lo
;
199 ranges
.insert(i
, CharacterRange(lo
, hi
));
202 // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
203 // If the new range start at or before the end of the last range, then the overlap (if it starts one after the
204 // end of the last range they concatenate, which is just as good.
205 if (lo
<= (ranges
[i
].end
+ 1)) {
206 // found an intersect! we'll replace this entry in the array.
207 ranges
[i
].begin
= std::min(ranges
[i
].begin
, lo
);
208 ranges
[i
].end
= std::max(ranges
[i
].end
, hi
);
210 // now check if the new range can subsume any subsequent ranges.
212 // each iteration of the loop we will either remove something from the list, or break the loop.
213 while (next
< ranges
.size()) {
214 if (ranges
[next
].begin
<= (ranges
[i
].end
+ 1)) {
215 // the next entry now overlaps / concatenates this one.
216 ranges
[i
].end
= std::max(ranges
[i
].end
, ranges
[next
].end
);
226 // CharacterRange comes after all existing ranges.
227 ranges
.append(CharacterRange(lo
, hi
));
230 bool m_isCaseInsensitive
;
232 Vector
<UChar
> m_matches
;
233 Vector
<CharacterRange
> m_ranges
;
234 Vector
<UChar
> m_matchesUnicode
;
235 Vector
<CharacterRange
> m_rangesUnicode
;
238 class RegexPatternConstructor
{
240 RegexPatternConstructor(RegexPattern
& pattern
)
242 , m_characterClassConstructor(pattern
.m_ignoreCase
)
246 ~RegexPatternConstructor()
253 m_characterClassConstructor
.reset();
258 m_alternative
->m_terms
.append(PatternTerm::BOL());
262 m_alternative
->m_terms
.append(PatternTerm::EOL());
264 void assertionWordBoundary(bool invert
)
266 m_alternative
->m_terms
.append(PatternTerm::WordBoundary(invert
));
269 void atomPatternCharacter(UChar ch
)
271 // We handle case-insensitive checking of unicode characters which do have both
272 // cases by handling them as if they were defined using a CharacterClass.
273 if (m_pattern
.m_ignoreCase
&& !isASCII(ch
) && (Unicode::toUpper(ch
) != Unicode::toLower(ch
))) {
274 atomCharacterClassBegin();
275 atomCharacterClassAtom(ch
);
276 atomCharacterClassEnd();
278 m_alternative
->m_terms
.append(PatternTerm(ch
));
281 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID
, bool invert
)
285 m_alternative
->m_terms
.append(PatternTerm(m_pattern
.digitsCharacterClass(), invert
));
288 m_alternative
->m_terms
.append(PatternTerm(m_pattern
.spacesCharacterClass(), invert
));
291 m_alternative
->m_terms
.append(PatternTerm(m_pattern
.wordcharCharacterClass(), invert
));
294 m_alternative
->m_terms
.append(PatternTerm(m_pattern
.newlineCharacterClass(), invert
));
299 void atomCharacterClassBegin(bool invert
= false)
301 m_invertCharacterClass
= invert
;
304 void atomCharacterClassAtom(UChar ch
)
306 m_characterClassConstructor
.putChar(ch
);
309 void atomCharacterClassRange(UChar begin
, UChar end
)
311 m_characterClassConstructor
.putRange(begin
, end
);
314 void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID
, bool invert
)
316 ASSERT(classID
!= NewlineClassID
);
320 m_characterClassConstructor
.append(invert
? m_pattern
.nondigitsCharacterClass() : m_pattern
.digitsCharacterClass());
324 m_characterClassConstructor
.append(invert
? m_pattern
.nonspacesCharacterClass() : m_pattern
.spacesCharacterClass());
328 m_characterClassConstructor
.append(invert
? m_pattern
.nonwordcharCharacterClass() : m_pattern
.wordcharCharacterClass());
332 ASSERT_NOT_REACHED();
336 void atomCharacterClassEnd()
338 CharacterClass
* newCharacterClass
= m_characterClassConstructor
.charClass();
339 m_pattern
.m_userCharacterClasses
.append(newCharacterClass
);
340 m_alternative
->m_terms
.append(PatternTerm(newCharacterClass
, m_invertCharacterClass
));
343 void atomParenthesesSubpatternBegin(bool capture
= true)
345 unsigned subpatternId
= m_pattern
.m_numSubpatterns
+ 1;
347 m_pattern
.m_numSubpatterns
++;
349 PatternDisjunction
* parenthesesDisjunction
= new PatternDisjunction(m_alternative
);
350 m_pattern
.m_disjunctions
.append(parenthesesDisjunction
);
351 m_alternative
->m_terms
.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern
, subpatternId
, parenthesesDisjunction
, capture
));
352 m_alternative
= parenthesesDisjunction
->addNewAlternative();
355 void atomParentheticalAssertionBegin(bool invert
= false)
357 PatternDisjunction
* parenthesesDisjunction
= new PatternDisjunction(m_alternative
);
358 m_pattern
.m_disjunctions
.append(parenthesesDisjunction
);
359 m_alternative
->m_terms
.append(PatternTerm(PatternTerm::TypeParentheticalAssertion
, m_pattern
.m_numSubpatterns
+ 1, parenthesesDisjunction
, invert
));
360 m_alternative
= parenthesesDisjunction
->addNewAlternative();
363 void atomParenthesesEnd()
365 ASSERT(m_alternative
->m_parent
);
366 ASSERT(m_alternative
->m_parent
->m_parent
);
367 m_alternative
= m_alternative
->m_parent
->m_parent
;
369 m_alternative
->lastTerm().parentheses
.lastSubpatternId
= m_pattern
.m_numSubpatterns
;
372 void atomBackReference(unsigned subpatternId
)
374 ASSERT(subpatternId
);
375 m_pattern
.m_shouldFallBack
= true;
376 m_pattern
.m_maxBackReference
= std::max(m_pattern
.m_maxBackReference
, subpatternId
);
378 if (subpatternId
> m_pattern
.m_numSubpatterns
) {
379 m_alternative
->m_terms
.append(PatternTerm::ForwardReference());
383 PatternAlternative
* currentAlternative
= m_alternative
;
384 ASSERT(currentAlternative
);
386 // Note to self: if we waited until the AST was baked, we could also remove forwards refs
387 while ((currentAlternative
= currentAlternative
->m_parent
->m_parent
)) {
388 PatternTerm
& term
= currentAlternative
->lastTerm();
389 ASSERT((term
.type
== PatternTerm::TypeParenthesesSubpattern
) || (term
.type
== PatternTerm::TypeParentheticalAssertion
));
391 if ((term
.type
== PatternTerm::TypeParenthesesSubpattern
) && term
.invertOrCapture
&& (subpatternId
== term
.subpatternId
)) {
392 m_alternative
->m_terms
.append(PatternTerm::ForwardReference());
397 m_alternative
->m_terms
.append(PatternTerm(subpatternId
));
400 PatternDisjunction
* copyDisjunction(PatternDisjunction
* disjunction
)
402 PatternDisjunction
* newDisjunction
= new PatternDisjunction();
404 newDisjunction
->m_parent
= disjunction
->m_parent
;
405 for (unsigned alt
= 0; alt
< disjunction
->m_alternatives
.size(); ++alt
) {
406 PatternAlternative
* alternative
= disjunction
->m_alternatives
[alt
];
407 PatternAlternative
* newAlternative
= newDisjunction
->addNewAlternative();
408 for (unsigned i
= 0; i
< alternative
->m_terms
.size(); ++i
)
409 newAlternative
->m_terms
.append(copyTerm(alternative
->m_terms
[i
]));
412 m_pattern
.m_disjunctions
.append(newDisjunction
);
413 return newDisjunction
;
416 PatternTerm
copyTerm(PatternTerm
& term
)
418 if ((term
.type
!= PatternTerm::TypeParenthesesSubpattern
) && (term
.type
!= PatternTerm::TypeParentheticalAssertion
))
419 return PatternTerm(term
);
421 PatternTerm termCopy
= term
;
422 termCopy
.parentheses
.disjunction
= copyDisjunction(termCopy
.parentheses
.disjunction
);
426 void quantifyAtom(unsigned min
, unsigned max
, bool greedy
)
429 ASSERT(m_alternative
->m_terms
.size());
432 m_alternative
->removeLastTerm();
436 PatternTerm
& term
= m_alternative
->lastTerm();
437 ASSERT(term
.type
> PatternTerm::TypeAssertionWordBoundary
);
438 ASSERT((term
.quantityCount
== 1) && (term
.quantityType
== QuantifierFixedCount
));
440 // For any assertion with a zero minimum, not matching is valid and has no effect,
441 // remove it. Otherwise, we need to match as least once, but there is no point
442 // matching more than once, so remove the quantifier. It is not entirely clear
443 // from the spec whether or not this behavior is correct, but I believe this
444 // matches Firefox. :-/
445 if (term
.type
== PatternTerm::TypeParentheticalAssertion
) {
447 m_alternative
->removeLastTerm();
451 if (max
> 1 && term
.type
== PatternTerm::TypeParenthesesSubpattern
)
452 m_pattern
.m_shouldFallBack
= true;
455 term
.quantify(max
, greedy
? QuantifierGreedy
: QuantifierNonGreedy
);
457 term
.quantify(min
, QuantifierFixedCount
);
459 term
.quantify(min
, QuantifierFixedCount
);
460 m_alternative
->m_terms
.append(copyTerm(term
));
461 // NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
462 m_alternative
->lastTerm().quantify((max
== UINT_MAX
) ? max
: max
- min
, greedy
? QuantifierGreedy
: QuantifierNonGreedy
);
463 if (m_alternative
->lastTerm().type
== PatternTerm::TypeParenthesesSubpattern
)
464 m_alternative
->lastTerm().parentheses
.isCopy
= true;
470 m_alternative
= m_alternative
->m_parent
->addNewAlternative();
475 m_pattern
.m_body
= new PatternDisjunction();
476 m_alternative
= m_pattern
.m_body
->addNewAlternative();
477 m_pattern
.m_disjunctions
.append(m_pattern
.m_body
);
486 unsigned setupAlternativeOffsets(PatternAlternative
* alternative
, unsigned currentCallFrameSize
, unsigned initialInputPosition
)
488 alternative
->m_hasFixedSize
= true;
489 unsigned currentInputPosition
= initialInputPosition
;
491 for (unsigned i
= 0; i
< alternative
->m_terms
.size(); ++i
) {
492 PatternTerm
& term
= alternative
->m_terms
[i
];
495 case PatternTerm::TypeAssertionBOL
:
496 case PatternTerm::TypeAssertionEOL
:
497 case PatternTerm::TypeAssertionWordBoundary
:
498 term
.inputPosition
= currentInputPosition
;
501 case PatternTerm::TypeBackReference
:
502 term
.inputPosition
= currentInputPosition
;
503 term
.frameLocation
= currentCallFrameSize
;
504 currentCallFrameSize
+= RegexStackSpaceForBackTrackInfoBackReference
;
505 alternative
->m_hasFixedSize
= false;
508 case PatternTerm::TypeForwardReference
:
511 case PatternTerm::TypePatternCharacter
:
512 term
.inputPosition
= currentInputPosition
;
513 if (term
.quantityType
!= QuantifierFixedCount
) {
514 term
.frameLocation
= currentCallFrameSize
;
515 currentCallFrameSize
+= RegexStackSpaceForBackTrackInfoPatternCharacter
;
516 alternative
->m_hasFixedSize
= false;
518 currentInputPosition
+= term
.quantityCount
;
521 case PatternTerm::TypeCharacterClass
:
522 term
.inputPosition
= currentInputPosition
;
523 if (term
.quantityType
!= QuantifierFixedCount
) {
524 term
.frameLocation
= currentCallFrameSize
;
525 currentCallFrameSize
+= RegexStackSpaceForBackTrackInfoCharacterClass
;
526 alternative
->m_hasFixedSize
= false;
528 currentInputPosition
+= term
.quantityCount
;
531 case PatternTerm::TypeParenthesesSubpattern
:
532 // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
533 term
.frameLocation
= currentCallFrameSize
;
534 if ((term
.quantityCount
== 1) && !term
.parentheses
.isCopy
) {
535 if (term
.quantityType
== QuantifierFixedCount
) {
536 currentCallFrameSize
= setupDisjunctionOffsets(term
.parentheses
.disjunction
, currentCallFrameSize
, currentInputPosition
);
537 currentInputPosition
+= term
.parentheses
.disjunction
->m_minimumSize
;
539 currentCallFrameSize
+= RegexStackSpaceForBackTrackInfoParenthesesOnce
;
540 currentCallFrameSize
= setupDisjunctionOffsets(term
.parentheses
.disjunction
, currentCallFrameSize
, currentInputPosition
);
542 term
.inputPosition
= currentInputPosition
;
544 term
.inputPosition
= currentInputPosition
;
545 setupDisjunctionOffsets(term
.parentheses
.disjunction
, 0, currentInputPosition
);
546 currentCallFrameSize
+= RegexStackSpaceForBackTrackInfoParentheses
;
548 // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
549 alternative
->m_hasFixedSize
= false;
552 case PatternTerm::TypeParentheticalAssertion
:
553 term
.inputPosition
= currentInputPosition
;
554 term
.frameLocation
= currentCallFrameSize
;
555 currentCallFrameSize
= setupDisjunctionOffsets(term
.parentheses
.disjunction
, currentCallFrameSize
+ RegexStackSpaceForBackTrackInfoParentheticalAssertion
, currentInputPosition
);
560 alternative
->m_minimumSize
= currentInputPosition
- initialInputPosition
;
561 return currentCallFrameSize
;
564 unsigned setupDisjunctionOffsets(PatternDisjunction
* disjunction
, unsigned initialCallFrameSize
, unsigned initialInputPosition
)
566 if ((disjunction
!= m_pattern
.m_body
) && (disjunction
->m_alternatives
.size() > 1))
567 initialCallFrameSize
+= RegexStackSpaceForBackTrackInfoAlternative
;
569 unsigned minimumInputSize
= UINT_MAX
;
570 unsigned maximumCallFrameSize
= 0;
571 bool hasFixedSize
= true;
573 for (unsigned alt
= 0; alt
< disjunction
->m_alternatives
.size(); ++alt
) {
574 PatternAlternative
* alternative
= disjunction
->m_alternatives
[alt
];
575 unsigned currentAlternativeCallFrameSize
= setupAlternativeOffsets(alternative
, initialCallFrameSize
, initialInputPosition
);
576 minimumInputSize
= min(minimumInputSize
, alternative
->m_minimumSize
);
577 maximumCallFrameSize
= max(maximumCallFrameSize
, currentAlternativeCallFrameSize
);
578 hasFixedSize
&= alternative
->m_hasFixedSize
;
581 ASSERT(minimumInputSize
!= UINT_MAX
);
582 ASSERT(maximumCallFrameSize
>= initialCallFrameSize
);
584 disjunction
->m_hasFixedSize
= hasFixedSize
;
585 disjunction
->m_minimumSize
= minimumInputSize
;
586 disjunction
->m_callFrameSize
= maximumCallFrameSize
;
587 return maximumCallFrameSize
;
592 setupDisjunctionOffsets(m_pattern
.m_body
, 0, 0);
596 RegexPattern
& m_pattern
;
597 PatternAlternative
* m_alternative
;
598 CharacterClassConstructor m_characterClassConstructor
;
599 bool m_invertCharacterClass
;
603 const char* compileRegex(const UString
& patternString
, RegexPattern
& pattern
)
605 RegexPatternConstructor
constructor(pattern
);
607 if (const char* error
= parse(constructor
, patternString
))
610 // If the pattern contains illegal backreferences reset & reparse.
611 // Quoting Netscape's "What's new in JavaScript 1.2",
612 // "Note: if the number of left parentheses is less than the number specified
613 // in \#, the \# is taken as an octal escape as described in the next row."
614 if (pattern
.containsIllegalBackReference()) {
615 unsigned numSubpatterns
= pattern
.m_numSubpatterns
;
621 parse(constructor
, patternString
, numSubpatterns
);
624 ASSERT(numSubpatterns
== pattern
.m_numSubpatterns
);
627 constructor
.setupOffsets();