]> git.saurik.com Git - apple/javascriptcore.git/blame - yarr/YarrPattern.cpp
JavaScriptCore-1097.13.tar.gz
[apple/javascriptcore.git] / yarr / YarrPattern.cpp
CommitLineData
ba379fdc
A
1/*
2 * Copyright (C) 2009 Apple Inc. All rights reserved.
14957cd0 3 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
ba379fdc
A
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "config.h"
14957cd0 28#include "YarrPattern.h"
ba379fdc 29
14957cd0 30#include "Yarr.h"
6fe7ccc8 31#include "YarrCanonicalizeUCS2.h"
14957cd0 32#include "YarrParser.h"
ba379fdc
A
33#include <wtf/Vector.h>
34
ba379fdc
A
35using namespace WTF;
36
37namespace JSC { namespace Yarr {
38
4e4e5a6f
A
39#include "RegExpJitTables.h"
40
ba379fdc
A
41class CharacterClassConstructor {
42public:
43 CharacterClassConstructor(bool isCaseInsensitive = false)
44 : m_isCaseInsensitive(isCaseInsensitive)
45 {
46 }
47
48 void reset()
49 {
50 m_matches.clear();
51 m_ranges.clear();
52 m_matchesUnicode.clear();
53 m_rangesUnicode.clear();
54 }
55
56 void append(const CharacterClass* other)
57 {
58 for (size_t i = 0; i < other->m_matches.size(); ++i)
59 addSorted(m_matches, other->m_matches[i]);
60 for (size_t i = 0; i < other->m_ranges.size(); ++i)
61 addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end);
62 for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i)
63 addSorted(m_matchesUnicode, other->m_matchesUnicode[i]);
64 for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i)
65 addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end);
66 }
67
68 void putChar(UChar ch)
69 {
6fe7ccc8 70 // Handle ascii cases.
ba379fdc
A
71 if (ch <= 0x7f) {
72 if (m_isCaseInsensitive && isASCIIAlpha(ch)) {
73 addSorted(m_matches, toASCIIUpper(ch));
74 addSorted(m_matches, toASCIILower(ch));
75 } else
76 addSorted(m_matches, ch);
6fe7ccc8 77 return;
ba379fdc 78 }
ba379fdc 79
6fe7ccc8
A
80 // Simple case, not a case-insensitive match.
81 if (!m_isCaseInsensitive) {
82 addSorted(m_matchesUnicode, ch);
83 return;
84 }
85
86 // Add multiple matches, if necessary.
87 UCS2CanonicalizationRange* info = rangeInfoFor(ch);
88 if (info->type == CanonicalizeUnique)
89 addSorted(m_matchesUnicode, ch);
90 else
91 putUnicodeIgnoreCase(ch, info);
ba379fdc
A
92 }
93
6fe7ccc8 94 void putUnicodeIgnoreCase(UChar ch, UCS2CanonicalizationRange* info)
ba379fdc 95 {
6fe7ccc8
A
96 ASSERT(m_isCaseInsensitive);
97 ASSERT(ch > 0x7f);
98 ASSERT(ch >= info->begin && ch <= info->end);
99 ASSERT(info->type != CanonicalizeUnique);
100 if (info->type == CanonicalizeSet) {
101 for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set)
102 addSorted(m_matchesUnicode, ch);
103 } else {
104 addSorted(m_matchesUnicode, ch);
105 addSorted(m_matchesUnicode, getCanonicalPair(info, ch));
106 }
ba379fdc
A
107 }
108
109 void putRange(UChar lo, UChar hi)
110 {
111 if (lo <= 0x7f) {
112 char asciiLo = lo;
113 char asciiHi = std::min(hi, (UChar)0x7f);
114 addSortedRange(m_ranges, lo, asciiHi);
115
116 if (m_isCaseInsensitive) {
117 if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
118 addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
119 if ((asciiLo <= 'z') && (asciiHi >= 'a'))
120 addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
121 }
122 }
6fe7ccc8
A
123 if (hi <= 0x7f)
124 return;
125
126 lo = std::max(lo, (UChar)0x80);
127 addSortedRange(m_rangesUnicode, lo, hi);
128
129 if (!m_isCaseInsensitive)
130 return;
131
132 UCS2CanonicalizationRange* info = rangeInfoFor(lo);
133 while (true) {
134 // Handle the range [lo .. end]
135 UChar end = std::min<UChar>(info->end, hi);
136
137 switch (info->type) {
138 case CanonicalizeUnique:
139 // Nothing to do - no canonical equivalents.
140 break;
141 case CanonicalizeSet: {
142 UChar ch;
143 for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set)
144 addSorted(m_matchesUnicode, ch);
145 break;
ba379fdc 146 }
6fe7ccc8
A
147 case CanonicalizeRangeLo:
148 addSortedRange(m_rangesUnicode, lo + info->value, end + info->value);
149 break;
150 case CanonicalizeRangeHi:
151 addSortedRange(m_rangesUnicode, lo - info->value, end - info->value);
152 break;
153 case CanonicalizeAlternatingAligned:
154 // Use addSortedRange since there is likely an abutting range to combine with.
155 if (lo & 1)
156 addSortedRange(m_rangesUnicode, lo - 1, lo - 1);
157 if (!(end & 1))
158 addSortedRange(m_rangesUnicode, end + 1, end + 1);
159 break;
160 case CanonicalizeAlternatingUnaligned:
161 // Use addSortedRange since there is likely an abutting range to combine with.
162 if (!(lo & 1))
163 addSortedRange(m_rangesUnicode, lo - 1, lo - 1);
164 if (end & 1)
165 addSortedRange(m_rangesUnicode, end + 1, end + 1);
166 break;
167 }
168
169 if (hi == end)
170 return;
171
172 ++info;
173 lo = info->begin;
174 };
175
ba379fdc
A
176 }
177
178 CharacterClass* charClass()
179 {
4e4e5a6f 180 CharacterClass* characterClass = new CharacterClass(0);
ba379fdc 181
6fe7ccc8
A
182 characterClass->m_matches.swap(m_matches);
183 characterClass->m_ranges.swap(m_ranges);
184 characterClass->m_matchesUnicode.swap(m_matchesUnicode);
185 characterClass->m_rangesUnicode.swap(m_rangesUnicode);
ba379fdc
A
186
187 return characterClass;
188 }
189
190private:
191 void addSorted(Vector<UChar>& matches, UChar ch)
192 {
193 unsigned pos = 0;
194 unsigned range = matches.size();
195
196 // binary chop, find position to insert char.
197 while (range) {
198 unsigned index = range >> 1;
199
200 int val = matches[pos+index] - ch;
201 if (!val)
202 return;
203 else if (val > 0)
204 range = index;
205 else {
206 pos += (index+1);
207 range -= (index+1);
208 }
209 }
210
211 if (pos == matches.size())
212 matches.append(ch);
213 else
214 matches.insert(pos, ch);
215 }
216
217 void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi)
218 {
219 unsigned end = ranges.size();
220
221 // Simple linear scan - I doubt there are that many ranges anyway...
222 // feel free to fix this with something faster (eg binary chop).
223 for (unsigned i = 0; i < end; ++i) {
224 // does the new range fall before the current position in the array
225 if (hi < ranges[i].begin) {
226 // optional optimization: concatenate appending ranges? - may not be worthwhile.
227 if (hi == (ranges[i].begin - 1)) {
228 ranges[i].begin = lo;
229 return;
230 }
231 ranges.insert(i, CharacterRange(lo, hi));
232 return;
233 }
234 // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
235 // If the new range start at or before the end of the last range, then the overlap (if it starts one after the
236 // end of the last range they concatenate, which is just as good.
237 if (lo <= (ranges[i].end + 1)) {
238 // found an intersect! we'll replace this entry in the array.
239 ranges[i].begin = std::min(ranges[i].begin, lo);
240 ranges[i].end = std::max(ranges[i].end, hi);
241
242 // now check if the new range can subsume any subsequent ranges.
243 unsigned next = i+1;
244 // each iteration of the loop we will either remove something from the list, or break the loop.
245 while (next < ranges.size()) {
246 if (ranges[next].begin <= (ranges[i].end + 1)) {
247 // the next entry now overlaps / concatenates this one.
248 ranges[i].end = std::max(ranges[i].end, ranges[next].end);
249 ranges.remove(next);
250 } else
251 break;
252 }
253
254 return;
255 }
256 }
257
258 // CharacterRange comes after all existing ranges.
259 ranges.append(CharacterRange(lo, hi));
260 }
261
262 bool m_isCaseInsensitive;
263
264 Vector<UChar> m_matches;
265 Vector<CharacterRange> m_ranges;
266 Vector<UChar> m_matchesUnicode;
267 Vector<CharacterRange> m_rangesUnicode;
268};
269
14957cd0 270class YarrPatternConstructor {
ba379fdc 271public:
14957cd0 272 YarrPatternConstructor(YarrPattern& pattern)
ba379fdc
A
273 : m_pattern(pattern)
274 , m_characterClassConstructor(pattern.m_ignoreCase)
14957cd0 275 , m_invertParentheticalAssertion(false)
ba379fdc 276 {
14957cd0
A
277 m_pattern.m_body = new PatternDisjunction();
278 m_alternative = m_pattern.m_body->addNewAlternative();
279 m_pattern.m_disjunctions.append(m_pattern.m_body);
ba379fdc
A
280 }
281
14957cd0 282 ~YarrPatternConstructor()
ba379fdc
A
283 {
284 }
285
286 void reset()
287 {
288 m_pattern.reset();
289 m_characterClassConstructor.reset();
14957cd0
A
290
291 m_pattern.m_body = new PatternDisjunction();
292 m_alternative = m_pattern.m_body->addNewAlternative();
293 m_pattern.m_disjunctions.append(m_pattern.m_body);
ba379fdc
A
294 }
295
296 void assertionBOL()
297 {
14957cd0
A
298 if (!m_alternative->m_terms.size() & !m_invertParentheticalAssertion) {
299 m_alternative->m_startsWithBOL = true;
300 m_alternative->m_containsBOL = true;
301 m_pattern.m_containsBOL = true;
302 }
ba379fdc
A
303 m_alternative->m_terms.append(PatternTerm::BOL());
304 }
305 void assertionEOL()
306 {
307 m_alternative->m_terms.append(PatternTerm::EOL());
308 }
309 void assertionWordBoundary(bool invert)
310 {
311 m_alternative->m_terms.append(PatternTerm::WordBoundary(invert));
312 }
313
314 void atomPatternCharacter(UChar ch)
315 {
316 // We handle case-insensitive checking of unicode characters which do have both
317 // cases by handling them as if they were defined using a CharacterClass.
6fe7ccc8 318 if (!m_pattern.m_ignoreCase || isASCII(ch)) {
ba379fdc 319 m_alternative->m_terms.append(PatternTerm(ch));
6fe7ccc8
A
320 return;
321 }
322
323 UCS2CanonicalizationRange* info = rangeInfoFor(ch);
324 if (info->type == CanonicalizeUnique) {
325 m_alternative->m_terms.append(PatternTerm(ch));
326 return;
327 }
328
329 m_characterClassConstructor.putUnicodeIgnoreCase(ch, info);
330 CharacterClass* newCharacterClass = m_characterClassConstructor.charClass();
331 m_pattern.m_userCharacterClasses.append(newCharacterClass);
332 m_alternative->m_terms.append(PatternTerm(newCharacterClass, false));
ba379fdc
A
333 }
334
335 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
336 {
337 switch (classID) {
338 case DigitClassID:
339 m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert));
340 break;
341 case SpaceClassID:
342 m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert));
343 break;
344 case WordClassID:
345 m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
346 break;
347 case NewlineClassID:
348 m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert));
349 break;
350 }
351 }
352
353 void atomCharacterClassBegin(bool invert = false)
354 {
355 m_invertCharacterClass = invert;
356 }
357
358 void atomCharacterClassAtom(UChar ch)
359 {
360 m_characterClassConstructor.putChar(ch);
361 }
362
363 void atomCharacterClassRange(UChar begin, UChar end)
364 {
365 m_characterClassConstructor.putRange(begin, end);
366 }
367
368 void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
369 {
370 ASSERT(classID != NewlineClassID);
371
372 switch (classID) {
373 case DigitClassID:
374 m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass());
375 break;
376
377 case SpaceClassID:
378 m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass());
379 break;
380
381 case WordClassID:
382 m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
383 break;
384
385 default:
386 ASSERT_NOT_REACHED();
387 }
388 }
389
390 void atomCharacterClassEnd()
391 {
392 CharacterClass* newCharacterClass = m_characterClassConstructor.charClass();
393 m_pattern.m_userCharacterClasses.append(newCharacterClass);
394 m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass));
395 }
396
397 void atomParenthesesSubpatternBegin(bool capture = true)
398 {
399 unsigned subpatternId = m_pattern.m_numSubpatterns + 1;
400 if (capture)
401 m_pattern.m_numSubpatterns++;
402
403 PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative);
404 m_pattern.m_disjunctions.append(parenthesesDisjunction);
14957cd0 405 m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, false));
ba379fdc
A
406 m_alternative = parenthesesDisjunction->addNewAlternative();
407 }
408
409 void atomParentheticalAssertionBegin(bool invert = false)
410 {
411 PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative);
412 m_pattern.m_disjunctions.append(parenthesesDisjunction);
14957cd0 413 m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, false, invert));
ba379fdc 414 m_alternative = parenthesesDisjunction->addNewAlternative();
14957cd0 415 m_invertParentheticalAssertion = invert;
ba379fdc
A
416 }
417
418 void atomParenthesesEnd()
419 {
420 ASSERT(m_alternative->m_parent);
421 ASSERT(m_alternative->m_parent->m_parent);
14957cd0
A
422
423 PatternDisjunction* parenthesesDisjunction = m_alternative->m_parent;
ba379fdc 424 m_alternative = m_alternative->m_parent->m_parent;
14957cd0
A
425
426 PatternTerm& lastTerm = m_alternative->lastTerm();
427
428 unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size();
429 unsigned numBOLAnchoredAlts = 0;
430
431 for (unsigned i = 0; i < numParenAlternatives; i++) {
432 // Bubble up BOL flags
433 if (parenthesesDisjunction->m_alternatives[i]->m_startsWithBOL)
434 numBOLAnchoredAlts++;
435 }
436
437 if (numBOLAnchoredAlts) {
438 m_alternative->m_containsBOL = true;
439 // If all the alternatives in parens start with BOL, then so does this one
440 if (numBOLAnchoredAlts == numParenAlternatives)
441 m_alternative->m_startsWithBOL = true;
442 }
443
444 lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
445 m_invertParentheticalAssertion = false;
ba379fdc
A
446 }
447
448 void atomBackReference(unsigned subpatternId)
449 {
450 ASSERT(subpatternId);
14957cd0 451 m_pattern.m_containsBackreferences = true;
ba379fdc
A
452 m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId);
453
454 if (subpatternId > m_pattern.m_numSubpatterns) {
455 m_alternative->m_terms.append(PatternTerm::ForwardReference());
456 return;
457 }
458
459 PatternAlternative* currentAlternative = m_alternative;
460 ASSERT(currentAlternative);
461
462 // Note to self: if we waited until the AST was baked, we could also remove forwards refs
463 while ((currentAlternative = currentAlternative->m_parent->m_parent)) {
464 PatternTerm& term = currentAlternative->lastTerm();
465 ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion));
466
14957cd0 467 if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.capture() && (subpatternId == term.parentheses.subpatternId)) {
ba379fdc
A
468 m_alternative->m_terms.append(PatternTerm::ForwardReference());
469 return;
470 }
471 }
472
473 m_alternative->m_terms.append(PatternTerm(subpatternId));
474 }
475
14957cd0
A
476 // deep copy the argument disjunction. If filterStartsWithBOL is true,
477 // skip alternatives with m_startsWithBOL set true.
478 PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false)
ba379fdc 479 {
14957cd0 480 PatternDisjunction* newDisjunction = 0;
ba379fdc
A
481 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
482 PatternAlternative* alternative = disjunction->m_alternatives[alt];
14957cd0
A
483 if (!filterStartsWithBOL || !alternative->m_startsWithBOL) {
484 if (!newDisjunction) {
485 newDisjunction = new PatternDisjunction();
486 newDisjunction->m_parent = disjunction->m_parent;
487 }
488 PatternAlternative* newAlternative = newDisjunction->addNewAlternative();
489 for (unsigned i = 0; i < alternative->m_terms.size(); ++i)
490 newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL));
491 }
ba379fdc 492 }
14957cd0
A
493
494 if (newDisjunction)
495 m_pattern.m_disjunctions.append(newDisjunction);
ba379fdc
A
496 return newDisjunction;
497 }
14957cd0
A
498
499 PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false)
ba379fdc
A
500 {
501 if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion))
502 return PatternTerm(term);
14957cd0 503
ba379fdc 504 PatternTerm termCopy = term;
14957cd0 505 termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL);
ba379fdc
A
506 return termCopy;
507 }
14957cd0 508
ba379fdc
A
509 void quantifyAtom(unsigned min, unsigned max, bool greedy)
510 {
511 ASSERT(min <= max);
512 ASSERT(m_alternative->m_terms.size());
513
514 if (!max) {
515 m_alternative->removeLastTerm();
516 return;
517 }
518
519 PatternTerm& term = m_alternative->lastTerm();
520 ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary);
521 ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount));
522
ba379fdc 523 if (term.type == PatternTerm::TypeParentheticalAssertion) {
6fe7ccc8
A
524 // If an assertion is quantified with a minimum count of zero, it can simply be removed.
525 // This arises from the RepeatMatcher behaviour in the spec. Matching an assertion never
526 // results in any input being consumed, however the continuation passed to the assertion
527 // (called in steps, 8c and 9 of the RepeatMatcher definition, ES5.1 15.10.2.5) will
528 // reject all zero length matches (see step 2.1). A match from the continuation of the
529 // expression will still be accepted regardless (via steps 8a and 11) - the upshot of all
530 // this is that matches from the assertion are not required, and won't be accepted anyway,
531 // so no need to ever run it.
ba379fdc
A
532 if (!min)
533 m_alternative->removeLastTerm();
6fe7ccc8
A
534 // We never need to run an assertion more than once. Subsequent interations will be run
535 // with the same start index (since assertions are non-capturing) and the same captures
536 // (per step 4 of RepeatMatcher in ES5.1 15.10.2.5), and as such will always produce the
537 // same result and captures. If the first match succeeds then the subsequent (min - 1)
538 // matches will too. Any additional optional matches will fail (on the same basis as the
539 // minimum zero quantified assertions, above), but this will still result in a match.
ba379fdc
A
540 return;
541 }
542
543 if (min == 0)
544 term.quantify(max, greedy ? QuantifierGreedy : QuantifierNonGreedy);
545 else if (min == max)
546 term.quantify(min, QuantifierFixedCount);
547 else {
548 term.quantify(min, QuantifierFixedCount);
549 m_alternative->m_terms.append(copyTerm(term));
550 // NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
14957cd0 551 m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy);
ba379fdc
A
552 if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern)
553 m_alternative->lastTerm().parentheses.isCopy = true;
554 }
555 }
556
557 void disjunction()
558 {
559 m_alternative = m_alternative->m_parent->addNewAlternative();
560 }
561
ba379fdc
A
562 unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition)
563 {
564 alternative->m_hasFixedSize = true;
1df5f87f 565 Checked<unsigned> currentInputPosition = initialInputPosition;
ba379fdc
A
566
567 for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
568 PatternTerm& term = alternative->m_terms[i];
569
570 switch (term.type) {
571 case PatternTerm::TypeAssertionBOL:
572 case PatternTerm::TypeAssertionEOL:
573 case PatternTerm::TypeAssertionWordBoundary:
1df5f87f 574 term.inputPosition = currentInputPosition.unsafeGet();
ba379fdc
A
575 break;
576
577 case PatternTerm::TypeBackReference:
1df5f87f 578 term.inputPosition = currentInputPosition.unsafeGet();
ba379fdc 579 term.frameLocation = currentCallFrameSize;
14957cd0 580 currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference;
ba379fdc
A
581 alternative->m_hasFixedSize = false;
582 break;
583
584 case PatternTerm::TypeForwardReference:
585 break;
586
587 case PatternTerm::TypePatternCharacter:
1df5f87f 588 term.inputPosition = currentInputPosition.unsafeGet();
ba379fdc
A
589 if (term.quantityType != QuantifierFixedCount) {
590 term.frameLocation = currentCallFrameSize;
14957cd0 591 currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter;
ba379fdc
A
592 alternative->m_hasFixedSize = false;
593 } else
594 currentInputPosition += term.quantityCount;
595 break;
596
597 case PatternTerm::TypeCharacterClass:
1df5f87f 598 term.inputPosition = currentInputPosition.unsafeGet();
ba379fdc
A
599 if (term.quantityType != QuantifierFixedCount) {
600 term.frameLocation = currentCallFrameSize;
14957cd0 601 currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
ba379fdc
A
602 alternative->m_hasFixedSize = false;
603 } else
604 currentInputPosition += term.quantityCount;
605 break;
606
607 case PatternTerm::TypeParenthesesSubpattern:
608 // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
609 term.frameLocation = currentCallFrameSize;
14957cd0
A
610 if (term.quantityCount == 1 && !term.parentheses.isCopy) {
611 if (term.quantityType != QuantifierFixedCount)
612 currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce;
1df5f87f 613 currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet());
14957cd0
A
614 // If quantity is fixed, then pre-check its minimum size.
615 if (term.quantityType == QuantifierFixedCount)
ba379fdc 616 currentInputPosition += term.parentheses.disjunction->m_minimumSize;
1df5f87f 617 term.inputPosition = currentInputPosition.unsafeGet();
14957cd0
A
618 } else if (term.parentheses.isTerminal) {
619 currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
1df5f87f
A
620 currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet());
621 term.inputPosition = currentInputPosition.unsafeGet();
ba379fdc 622 } else {
1df5f87f
A
623 term.inputPosition = currentInputPosition.unsafeGet();
624 setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition.unsafeGet());
14957cd0 625 currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses;
ba379fdc
A
626 }
627 // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
628 alternative->m_hasFixedSize = false;
629 break;
630
631 case PatternTerm::TypeParentheticalAssertion:
1df5f87f 632 term.inputPosition = currentInputPosition.unsafeGet();
ba379fdc 633 term.frameLocation = currentCallFrameSize;
1df5f87f 634 currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition.unsafeGet());
14957cd0
A
635 break;
636
637 case PatternTerm::TypeDotStarEnclosure:
638 alternative->m_hasFixedSize = false;
639 term.inputPosition = initialInputPosition;
ba379fdc
A
640 break;
641 }
642 }
643
1df5f87f 644 alternative->m_minimumSize = (currentInputPosition - initialInputPosition).unsafeGet();
ba379fdc
A
645 return currentCallFrameSize;
646 }
647
648 unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition)
649 {
650 if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1))
14957cd0 651 initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative;
ba379fdc
A
652
653 unsigned minimumInputSize = UINT_MAX;
654 unsigned maximumCallFrameSize = 0;
655 bool hasFixedSize = true;
656
657 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
658 PatternAlternative* alternative = disjunction->m_alternatives[alt];
659 unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition);
660 minimumInputSize = min(minimumInputSize, alternative->m_minimumSize);
661 maximumCallFrameSize = max(maximumCallFrameSize, currentAlternativeCallFrameSize);
662 hasFixedSize &= alternative->m_hasFixedSize;
663 }
664
665 ASSERT(minimumInputSize != UINT_MAX);
666 ASSERT(maximumCallFrameSize >= initialCallFrameSize);
667
668 disjunction->m_hasFixedSize = hasFixedSize;
669 disjunction->m_minimumSize = minimumInputSize;
670 disjunction->m_callFrameSize = maximumCallFrameSize;
671 return maximumCallFrameSize;
672 }
673
674 void setupOffsets()
675 {
676 setupDisjunctionOffsets(m_pattern.m_body, 0, 0);
677 }
678
14957cd0
A
679 // This optimization identifies sets of parentheses that we will never need to backtrack.
680 // In these cases we do not need to store state from prior iterations.
681 // We can presently avoid backtracking for:
682 // * where the parens are at the end of the regular expression (last term in any of the
683 // alternatives of the main body disjunction).
684 // * where the parens are non-capturing, and quantified unbounded greedy (*).
685 // * where the parens do not contain any capturing subpatterns.
686 void checkForTerminalParentheses()
687 {
688 // This check is much too crude; should be just checking whether the candidate
689 // node contains nested capturing subpatterns, not the whole expression!
690 if (m_pattern.m_numSubpatterns)
691 return;
692
693 Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives;
694 for (size_t i = 0; i < alternatives.size(); ++i) {
695 Vector<PatternTerm>& terms = alternatives[i]->m_terms;
696 if (terms.size()) {
697 PatternTerm& term = terms.last();
698 if (term.type == PatternTerm::TypeParenthesesSubpattern
699 && term.quantityType == QuantifierGreedy
700 && term.quantityCount == quantifyInfinite
701 && !term.capture())
702 term.parentheses.isTerminal = true;
703 }
704 }
705 }
706
707 void optimizeBOL()
708 {
709 // Look for expressions containing beginning of line (^) anchoring and unroll them.
710 // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops
711 // This code relies on the parsing code tagging alternatives with m_containsBOL and
712 // m_startsWithBOL and rolling those up to containing alternatives.
713 // At this point, this is only valid for non-multiline expressions.
714 PatternDisjunction* disjunction = m_pattern.m_body;
715
716 if (!m_pattern.m_containsBOL || m_pattern.m_multiline)
717 return;
718
719 PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true);
720
721 // Set alternatives in disjunction to "onceThrough"
722 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt)
723 disjunction->m_alternatives[alt]->setOnceThrough();
724
725 if (loopDisjunction) {
726 // Move alternatives from loopDisjunction to disjunction
727 for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt)
728 disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt]);
729
730 loopDisjunction->m_alternatives.clear();
731 }
732 }
733
734 bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t lastTermIndex)
735 {
736 Vector<PatternTerm>& terms = alternative->m_terms;
737
738 for (size_t termIndex = firstTermIndex; termIndex <= lastTermIndex; ++termIndex) {
739 PatternTerm& term = terms[termIndex];
740
741 if (term.m_capture)
742 return true;
743
744 if (term.type == PatternTerm::TypeParenthesesSubpattern) {
745 PatternDisjunction* nestedDisjunction = term.parentheses.disjunction;
746 for (unsigned alt = 0; alt < nestedDisjunction->m_alternatives.size(); ++alt) {
747 if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt], 0, nestedDisjunction->m_alternatives[alt]->m_terms.size() - 1))
748 return true;
749 }
750 }
751 }
752
753 return false;
754 }
755
756 // This optimization identifies alternatives in the form of
757 // [^].*[?]<expression>.*[$] for expressions that don't have any
758 // capturing terms. The alternative is changed to <expression>
759 // followed by processing of the dot stars to find and adjust the
760 // beginning and the end of the match.
761 void optimizeDotStarWrappedExpressions()
762 {
763 Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives;
764 if (alternatives.size() != 1)
765 return;
766
767 PatternAlternative* alternative = alternatives[0];
768 Vector<PatternTerm>& terms = alternative->m_terms;
769 if (terms.size() >= 3) {
770 bool startsWithBOL = false;
771 bool endsWithEOL = false;
772 size_t termIndex, firstExpressionTerm, lastExpressionTerm;
773
774 termIndex = 0;
775 if (terms[termIndex].type == PatternTerm::TypeAssertionBOL) {
776 startsWithBOL = true;
777 ++termIndex;
778 }
779
780 PatternTerm& firstNonAnchorTerm = terms[termIndex];
781 if ((firstNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (firstNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || !((firstNonAnchorTerm.quantityType == QuantifierGreedy) || (firstNonAnchorTerm.quantityType == QuantifierNonGreedy)))
782 return;
783
784 firstExpressionTerm = termIndex + 1;
785
786 termIndex = terms.size() - 1;
787 if (terms[termIndex].type == PatternTerm::TypeAssertionEOL) {
788 endsWithEOL = true;
789 --termIndex;
790 }
791
792 PatternTerm& lastNonAnchorTerm = terms[termIndex];
793 if ((lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (lastNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || (lastNonAnchorTerm.quantityType != QuantifierGreedy))
794 return;
795
796 lastExpressionTerm = termIndex - 1;
797
798 if (firstExpressionTerm > lastExpressionTerm)
799 return;
800
801 if (!containsCapturingTerms(alternative, firstExpressionTerm, lastExpressionTerm)) {
802 for (termIndex = terms.size() - 1; termIndex > lastExpressionTerm; --termIndex)
803 terms.remove(termIndex);
804
805 for (termIndex = firstExpressionTerm; termIndex > 0; --termIndex)
806 terms.remove(termIndex - 1);
807
808 terms.append(PatternTerm(startsWithBOL, endsWithEOL));
809
810 m_pattern.m_containsBOL = false;
811 }
812 }
813 }
814
ba379fdc 815private:
14957cd0 816 YarrPattern& m_pattern;
ba379fdc
A
817 PatternAlternative* m_alternative;
818 CharacterClassConstructor m_characterClassConstructor;
819 bool m_invertCharacterClass;
14957cd0 820 bool m_invertParentheticalAssertion;
ba379fdc
A
821};
822
14957cd0 823const char* YarrPattern::compile(const UString& patternString)
ba379fdc 824{
14957cd0 825 YarrPatternConstructor constructor(*this);
ba379fdc
A
826
827 if (const char* error = parse(constructor, patternString))
828 return error;
829
830 // If the pattern contains illegal backreferences reset & reparse.
831 // Quoting Netscape's "What's new in JavaScript 1.2",
832 // "Note: if the number of left parentheses is less than the number specified
833 // in \#, the \# is taken as an octal escape as described in the next row."
14957cd0
A
834 if (containsIllegalBackReference()) {
835 unsigned numSubpatterns = m_numSubpatterns;
ba379fdc
A
836
837 constructor.reset();
f9bf01c6 838#if !ASSERT_DISABLED
ba379fdc
A
839 const char* error =
840#endif
841 parse(constructor, patternString, numSubpatterns);
842
843 ASSERT(!error);
14957cd0 844 ASSERT(numSubpatterns == m_numSubpatterns);
ba379fdc
A
845 }
846
14957cd0
A
847 constructor.checkForTerminalParentheses();
848 constructor.optimizeDotStarWrappedExpressions();
849 constructor.optimizeBOL();
850
ba379fdc
A
851 constructor.setupOffsets();
852
14957cd0
A
853 return 0;
854}
855
856YarrPattern::YarrPattern(const UString& pattern, bool ignoreCase, bool multiline, const char** error)
857 : m_ignoreCase(ignoreCase)
858 , m_multiline(multiline)
859 , m_containsBackreferences(false)
860 , m_containsBOL(false)
861 , m_numSubpatterns(0)
862 , m_maxBackReference(0)
863 , newlineCached(0)
864 , digitsCached(0)
865 , spacesCached(0)
866 , wordcharCached(0)
867 , nondigitsCached(0)
868 , nonspacesCached(0)
869 , nonwordcharCached(0)
870{
871 *error = compile(pattern);
872}
ba379fdc
A
873
874} }