]>
Commit | Line | Data |
---|---|---|
ba379fdc A |
1 | /* |
2 | * Copyright (C) 2009 Apple Inc. All rights reserved. | |
3 | * | |
4 | * Redistribution and use in source and binary forms, with or without | |
5 | * modification, are permitted provided that the following conditions | |
6 | * are met: | |
7 | * 1. Redistributions of source code must retain the above copyright | |
8 | * notice, this list of conditions and the following disclaimer. | |
9 | * 2. Redistributions in binary form must reproduce the above copyright | |
10 | * notice, this list of conditions and the following disclaimer in the | |
11 | * documentation and/or other materials provided with the distribution. | |
12 | * | |
13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY | |
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR | |
17 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
18 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
20 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
21 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
23 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
24 | */ | |
25 | ||
26 | #include "config.h" | |
27 | #include "RegexCompiler.h" | |
28 | ||
29 | #include "RegexInterpreter.h" | |
30 | #include "RegexPattern.h" | |
31 | #include <wtf/Vector.h> | |
32 | ||
33 | #if ENABLE(YARR) | |
34 | ||
35 | using namespace WTF; | |
36 | ||
37 | namespace JSC { namespace Yarr { | |
38 | ||
4e4e5a6f A |
39 | #include "RegExpJitTables.h" |
40 | ||
ba379fdc A |
41 | class CharacterClassConstructor { |
42 | public: | |
43 | CharacterClassConstructor(bool isCaseInsensitive = false) | |
44 | : m_isCaseInsensitive(isCaseInsensitive) | |
45 | { | |
46 | } | |
47 | ||
48 | void reset() | |
49 | { | |
50 | m_matches.clear(); | |
51 | m_ranges.clear(); | |
52 | m_matchesUnicode.clear(); | |
53 | m_rangesUnicode.clear(); | |
54 | } | |
55 | ||
56 | void append(const CharacterClass* other) | |
57 | { | |
58 | for (size_t i = 0; i < other->m_matches.size(); ++i) | |
59 | addSorted(m_matches, other->m_matches[i]); | |
60 | for (size_t i = 0; i < other->m_ranges.size(); ++i) | |
61 | addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end); | |
62 | for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i) | |
63 | addSorted(m_matchesUnicode, other->m_matchesUnicode[i]); | |
64 | for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i) | |
65 | addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end); | |
66 | } | |
67 | ||
68 | void putChar(UChar ch) | |
69 | { | |
70 | if (ch <= 0x7f) { | |
71 | if (m_isCaseInsensitive && isASCIIAlpha(ch)) { | |
72 | addSorted(m_matches, toASCIIUpper(ch)); | |
73 | addSorted(m_matches, toASCIILower(ch)); | |
74 | } else | |
75 | addSorted(m_matches, ch); | |
76 | } else { | |
77 | UChar upper, lower; | |
78 | if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) { | |
79 | addSorted(m_matchesUnicode, upper); | |
80 | addSorted(m_matchesUnicode, lower); | |
81 | } else | |
82 | addSorted(m_matchesUnicode, ch); | |
83 | } | |
84 | } | |
85 | ||
86 | // returns true if this character has another case, and 'ch' is the upper case form. | |
87 | static inline bool isUnicodeUpper(UChar ch) | |
88 | { | |
89 | return ch != Unicode::toLower(ch); | |
90 | } | |
91 | ||
92 | // returns true if this character has another case, and 'ch' is the lower case form. | |
93 | static inline bool isUnicodeLower(UChar ch) | |
94 | { | |
95 | return ch != Unicode::toUpper(ch); | |
96 | } | |
97 | ||
98 | void putRange(UChar lo, UChar hi) | |
99 | { | |
100 | if (lo <= 0x7f) { | |
101 | char asciiLo = lo; | |
102 | char asciiHi = std::min(hi, (UChar)0x7f); | |
103 | addSortedRange(m_ranges, lo, asciiHi); | |
104 | ||
105 | if (m_isCaseInsensitive) { | |
106 | if ((asciiLo <= 'Z') && (asciiHi >= 'A')) | |
107 | addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A')); | |
108 | if ((asciiLo <= 'z') && (asciiHi >= 'a')) | |
109 | addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a')); | |
110 | } | |
111 | } | |
112 | if (hi >= 0x80) { | |
113 | uint32_t unicodeCurr = std::max(lo, (UChar)0x80); | |
114 | addSortedRange(m_rangesUnicode, unicodeCurr, hi); | |
115 | ||
116 | if (m_isCaseInsensitive) { | |
117 | while (unicodeCurr <= hi) { | |
118 | // If the upper bound of the range (hi) is 0xffff, the increments to | |
119 | // unicodeCurr in this loop may take it to 0x10000. This is fine | |
120 | // (if so we won't re-enter the loop, since the loop condition above | |
121 | // will definitely fail) - but this does mean we cannot use a UChar | |
122 | // to represent unicodeCurr, we must use a 32-bit value instead. | |
123 | ASSERT(unicodeCurr <= 0xffff); | |
124 | ||
125 | if (isUnicodeUpper(unicodeCurr)) { | |
126 | UChar lowerCaseRangeBegin = Unicode::toLower(unicodeCurr); | |
127 | UChar lowerCaseRangeEnd = lowerCaseRangeBegin; | |
128 | while ((++unicodeCurr <= hi) && isUnicodeUpper(unicodeCurr) && (Unicode::toLower(unicodeCurr) == (lowerCaseRangeEnd + 1))) | |
129 | lowerCaseRangeEnd++; | |
130 | addSortedRange(m_rangesUnicode, lowerCaseRangeBegin, lowerCaseRangeEnd); | |
131 | } else if (isUnicodeLower(unicodeCurr)) { | |
132 | UChar upperCaseRangeBegin = Unicode::toUpper(unicodeCurr); | |
133 | UChar upperCaseRangeEnd = upperCaseRangeBegin; | |
134 | while ((++unicodeCurr <= hi) && isUnicodeLower(unicodeCurr) && (Unicode::toUpper(unicodeCurr) == (upperCaseRangeEnd + 1))) | |
135 | upperCaseRangeEnd++; | |
136 | addSortedRange(m_rangesUnicode, upperCaseRangeBegin, upperCaseRangeEnd); | |
137 | } else | |
138 | ++unicodeCurr; | |
139 | } | |
140 | } | |
141 | } | |
142 | } | |
143 | ||
144 | CharacterClass* charClass() | |
145 | { | |
4e4e5a6f | 146 | CharacterClass* characterClass = new CharacterClass(0); |
ba379fdc A |
147 | |
148 | characterClass->m_matches.append(m_matches); | |
149 | characterClass->m_ranges.append(m_ranges); | |
150 | characterClass->m_matchesUnicode.append(m_matchesUnicode); | |
151 | characterClass->m_rangesUnicode.append(m_rangesUnicode); | |
152 | ||
153 | reset(); | |
154 | ||
155 | return characterClass; | |
156 | } | |
157 | ||
158 | private: | |
159 | void addSorted(Vector<UChar>& matches, UChar ch) | |
160 | { | |
161 | unsigned pos = 0; | |
162 | unsigned range = matches.size(); | |
163 | ||
164 | // binary chop, find position to insert char. | |
165 | while (range) { | |
166 | unsigned index = range >> 1; | |
167 | ||
168 | int val = matches[pos+index] - ch; | |
169 | if (!val) | |
170 | return; | |
171 | else if (val > 0) | |
172 | range = index; | |
173 | else { | |
174 | pos += (index+1); | |
175 | range -= (index+1); | |
176 | } | |
177 | } | |
178 | ||
179 | if (pos == matches.size()) | |
180 | matches.append(ch); | |
181 | else | |
182 | matches.insert(pos, ch); | |
183 | } | |
184 | ||
185 | void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi) | |
186 | { | |
187 | unsigned end = ranges.size(); | |
188 | ||
189 | // Simple linear scan - I doubt there are that many ranges anyway... | |
190 | // feel free to fix this with something faster (eg binary chop). | |
191 | for (unsigned i = 0; i < end; ++i) { | |
192 | // does the new range fall before the current position in the array | |
193 | if (hi < ranges[i].begin) { | |
194 | // optional optimization: concatenate appending ranges? - may not be worthwhile. | |
195 | if (hi == (ranges[i].begin - 1)) { | |
196 | ranges[i].begin = lo; | |
197 | return; | |
198 | } | |
199 | ranges.insert(i, CharacterRange(lo, hi)); | |
200 | return; | |
201 | } | |
202 | // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining | |
203 | // If the new range start at or before the end of the last range, then the overlap (if it starts one after the | |
204 | // end of the last range they concatenate, which is just as good. | |
205 | if (lo <= (ranges[i].end + 1)) { | |
206 | // found an intersect! we'll replace this entry in the array. | |
207 | ranges[i].begin = std::min(ranges[i].begin, lo); | |
208 | ranges[i].end = std::max(ranges[i].end, hi); | |
209 | ||
210 | // now check if the new range can subsume any subsequent ranges. | |
211 | unsigned next = i+1; | |
212 | // each iteration of the loop we will either remove something from the list, or break the loop. | |
213 | while (next < ranges.size()) { | |
214 | if (ranges[next].begin <= (ranges[i].end + 1)) { | |
215 | // the next entry now overlaps / concatenates this one. | |
216 | ranges[i].end = std::max(ranges[i].end, ranges[next].end); | |
217 | ranges.remove(next); | |
218 | } else | |
219 | break; | |
220 | } | |
221 | ||
222 | return; | |
223 | } | |
224 | } | |
225 | ||
226 | // CharacterRange comes after all existing ranges. | |
227 | ranges.append(CharacterRange(lo, hi)); | |
228 | } | |
229 | ||
230 | bool m_isCaseInsensitive; | |
231 | ||
232 | Vector<UChar> m_matches; | |
233 | Vector<CharacterRange> m_ranges; | |
234 | Vector<UChar> m_matchesUnicode; | |
235 | Vector<CharacterRange> m_rangesUnicode; | |
236 | }; | |
237 | ||
ba379fdc A |
238 | class RegexPatternConstructor { |
239 | public: | |
240 | RegexPatternConstructor(RegexPattern& pattern) | |
241 | : m_pattern(pattern) | |
242 | , m_characterClassConstructor(pattern.m_ignoreCase) | |
243 | { | |
244 | } | |
245 | ||
246 | ~RegexPatternConstructor() | |
247 | { | |
248 | } | |
249 | ||
250 | void reset() | |
251 | { | |
252 | m_pattern.reset(); | |
253 | m_characterClassConstructor.reset(); | |
254 | } | |
255 | ||
256 | void assertionBOL() | |
257 | { | |
258 | m_alternative->m_terms.append(PatternTerm::BOL()); | |
259 | } | |
260 | void assertionEOL() | |
261 | { | |
262 | m_alternative->m_terms.append(PatternTerm::EOL()); | |
263 | } | |
264 | void assertionWordBoundary(bool invert) | |
265 | { | |
266 | m_alternative->m_terms.append(PatternTerm::WordBoundary(invert)); | |
267 | } | |
268 | ||
269 | void atomPatternCharacter(UChar ch) | |
270 | { | |
271 | // We handle case-insensitive checking of unicode characters which do have both | |
272 | // cases by handling them as if they were defined using a CharacterClass. | |
273 | if (m_pattern.m_ignoreCase && !isASCII(ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) { | |
274 | atomCharacterClassBegin(); | |
275 | atomCharacterClassAtom(ch); | |
276 | atomCharacterClassEnd(); | |
277 | } else | |
278 | m_alternative->m_terms.append(PatternTerm(ch)); | |
279 | } | |
280 | ||
281 | void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) | |
282 | { | |
283 | switch (classID) { | |
284 | case DigitClassID: | |
285 | m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert)); | |
286 | break; | |
287 | case SpaceClassID: | |
288 | m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert)); | |
289 | break; | |
290 | case WordClassID: | |
291 | m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert)); | |
292 | break; | |
293 | case NewlineClassID: | |
294 | m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert)); | |
295 | break; | |
296 | } | |
297 | } | |
298 | ||
299 | void atomCharacterClassBegin(bool invert = false) | |
300 | { | |
301 | m_invertCharacterClass = invert; | |
302 | } | |
303 | ||
304 | void atomCharacterClassAtom(UChar ch) | |
305 | { | |
306 | m_characterClassConstructor.putChar(ch); | |
307 | } | |
308 | ||
309 | void atomCharacterClassRange(UChar begin, UChar end) | |
310 | { | |
311 | m_characterClassConstructor.putRange(begin, end); | |
312 | } | |
313 | ||
314 | void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) | |
315 | { | |
316 | ASSERT(classID != NewlineClassID); | |
317 | ||
318 | switch (classID) { | |
319 | case DigitClassID: | |
320 | m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass()); | |
321 | break; | |
322 | ||
323 | case SpaceClassID: | |
324 | m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass()); | |
325 | break; | |
326 | ||
327 | case WordClassID: | |
328 | m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass()); | |
329 | break; | |
330 | ||
331 | default: | |
332 | ASSERT_NOT_REACHED(); | |
333 | } | |
334 | } | |
335 | ||
336 | void atomCharacterClassEnd() | |
337 | { | |
338 | CharacterClass* newCharacterClass = m_characterClassConstructor.charClass(); | |
339 | m_pattern.m_userCharacterClasses.append(newCharacterClass); | |
340 | m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass)); | |
341 | } | |
342 | ||
343 | void atomParenthesesSubpatternBegin(bool capture = true) | |
344 | { | |
345 | unsigned subpatternId = m_pattern.m_numSubpatterns + 1; | |
346 | if (capture) | |
347 | m_pattern.m_numSubpatterns++; | |
348 | ||
349 | PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative); | |
350 | m_pattern.m_disjunctions.append(parenthesesDisjunction); | |
351 | m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture)); | |
352 | m_alternative = parenthesesDisjunction->addNewAlternative(); | |
353 | } | |
354 | ||
355 | void atomParentheticalAssertionBegin(bool invert = false) | |
356 | { | |
357 | PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative); | |
358 | m_pattern.m_disjunctions.append(parenthesesDisjunction); | |
359 | m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, invert)); | |
360 | m_alternative = parenthesesDisjunction->addNewAlternative(); | |
361 | } | |
362 | ||
363 | void atomParenthesesEnd() | |
364 | { | |
365 | ASSERT(m_alternative->m_parent); | |
366 | ASSERT(m_alternative->m_parent->m_parent); | |
367 | m_alternative = m_alternative->m_parent->m_parent; | |
368 | ||
369 | m_alternative->lastTerm().parentheses.lastSubpatternId = m_pattern.m_numSubpatterns; | |
370 | } | |
371 | ||
372 | void atomBackReference(unsigned subpatternId) | |
373 | { | |
374 | ASSERT(subpatternId); | |
4e4e5a6f | 375 | m_pattern.m_shouldFallBack = true; |
ba379fdc A |
376 | m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId); |
377 | ||
378 | if (subpatternId > m_pattern.m_numSubpatterns) { | |
379 | m_alternative->m_terms.append(PatternTerm::ForwardReference()); | |
380 | return; | |
381 | } | |
382 | ||
383 | PatternAlternative* currentAlternative = m_alternative; | |
384 | ASSERT(currentAlternative); | |
385 | ||
386 | // Note to self: if we waited until the AST was baked, we could also remove forwards refs | |
387 | while ((currentAlternative = currentAlternative->m_parent->m_parent)) { | |
388 | PatternTerm& term = currentAlternative->lastTerm(); | |
389 | ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion)); | |
390 | ||
391 | if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.invertOrCapture && (subpatternId == term.subpatternId)) { | |
392 | m_alternative->m_terms.append(PatternTerm::ForwardReference()); | |
393 | return; | |
394 | } | |
395 | } | |
396 | ||
397 | m_alternative->m_terms.append(PatternTerm(subpatternId)); | |
398 | } | |
399 | ||
400 | PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction) | |
401 | { | |
402 | PatternDisjunction* newDisjunction = new PatternDisjunction(); | |
403 | ||
404 | newDisjunction->m_parent = disjunction->m_parent; | |
405 | for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { | |
406 | PatternAlternative* alternative = disjunction->m_alternatives[alt]; | |
407 | PatternAlternative* newAlternative = newDisjunction->addNewAlternative(); | |
408 | for (unsigned i = 0; i < alternative->m_terms.size(); ++i) | |
409 | newAlternative->m_terms.append(copyTerm(alternative->m_terms[i])); | |
410 | } | |
411 | ||
412 | m_pattern.m_disjunctions.append(newDisjunction); | |
413 | return newDisjunction; | |
414 | } | |
415 | ||
416 | PatternTerm copyTerm(PatternTerm& term) | |
417 | { | |
418 | if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion)) | |
419 | return PatternTerm(term); | |
420 | ||
421 | PatternTerm termCopy = term; | |
422 | termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction); | |
423 | return termCopy; | |
424 | } | |
425 | ||
426 | void quantifyAtom(unsigned min, unsigned max, bool greedy) | |
427 | { | |
428 | ASSERT(min <= max); | |
429 | ASSERT(m_alternative->m_terms.size()); | |
430 | ||
431 | if (!max) { | |
432 | m_alternative->removeLastTerm(); | |
433 | return; | |
434 | } | |
435 | ||
436 | PatternTerm& term = m_alternative->lastTerm(); | |
437 | ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary); | |
438 | ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)); | |
439 | ||
440 | // For any assertion with a zero minimum, not matching is valid and has no effect, | |
441 | // remove it. Otherwise, we need to match as least once, but there is no point | |
442 | // matching more than once, so remove the quantifier. It is not entirely clear | |
443 | // from the spec whether or not this behavior is correct, but I believe this | |
444 | // matches Firefox. :-/ | |
445 | if (term.type == PatternTerm::TypeParentheticalAssertion) { | |
446 | if (!min) | |
447 | m_alternative->removeLastTerm(); | |
448 | return; | |
449 | } | |
450 | ||
4e4e5a6f A |
451 | if (max > 1 && term.type == PatternTerm::TypeParenthesesSubpattern) |
452 | m_pattern.m_shouldFallBack = true; | |
453 | ||
ba379fdc A |
454 | if (min == 0) |
455 | term.quantify(max, greedy ? QuantifierGreedy : QuantifierNonGreedy); | |
456 | else if (min == max) | |
457 | term.quantify(min, QuantifierFixedCount); | |
458 | else { | |
459 | term.quantify(min, QuantifierFixedCount); | |
460 | m_alternative->m_terms.append(copyTerm(term)); | |
461 | // NOTE: this term is interesting from an analysis perspective, in that it can be ignored..... | |
462 | m_alternative->lastTerm().quantify((max == UINT_MAX) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy); | |
463 | if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern) | |
464 | m_alternative->lastTerm().parentheses.isCopy = true; | |
465 | } | |
466 | } | |
467 | ||
468 | void disjunction() | |
469 | { | |
470 | m_alternative = m_alternative->m_parent->addNewAlternative(); | |
471 | } | |
472 | ||
473 | void regexBegin() | |
474 | { | |
475 | m_pattern.m_body = new PatternDisjunction(); | |
476 | m_alternative = m_pattern.m_body->addNewAlternative(); | |
477 | m_pattern.m_disjunctions.append(m_pattern.m_body); | |
478 | } | |
479 | void regexEnd() | |
480 | { | |
481 | } | |
482 | void regexError() | |
483 | { | |
484 | } | |
485 | ||
486 | unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition) | |
487 | { | |
488 | alternative->m_hasFixedSize = true; | |
489 | unsigned currentInputPosition = initialInputPosition; | |
490 | ||
491 | for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { | |
492 | PatternTerm& term = alternative->m_terms[i]; | |
493 | ||
494 | switch (term.type) { | |
495 | case PatternTerm::TypeAssertionBOL: | |
496 | case PatternTerm::TypeAssertionEOL: | |
497 | case PatternTerm::TypeAssertionWordBoundary: | |
498 | term.inputPosition = currentInputPosition; | |
499 | break; | |
500 | ||
501 | case PatternTerm::TypeBackReference: | |
502 | term.inputPosition = currentInputPosition; | |
503 | term.frameLocation = currentCallFrameSize; | |
504 | currentCallFrameSize += RegexStackSpaceForBackTrackInfoBackReference; | |
505 | alternative->m_hasFixedSize = false; | |
506 | break; | |
507 | ||
508 | case PatternTerm::TypeForwardReference: | |
509 | break; | |
510 | ||
511 | case PatternTerm::TypePatternCharacter: | |
512 | term.inputPosition = currentInputPosition; | |
513 | if (term.quantityType != QuantifierFixedCount) { | |
514 | term.frameLocation = currentCallFrameSize; | |
515 | currentCallFrameSize += RegexStackSpaceForBackTrackInfoPatternCharacter; | |
516 | alternative->m_hasFixedSize = false; | |
517 | } else | |
518 | currentInputPosition += term.quantityCount; | |
519 | break; | |
520 | ||
521 | case PatternTerm::TypeCharacterClass: | |
522 | term.inputPosition = currentInputPosition; | |
523 | if (term.quantityType != QuantifierFixedCount) { | |
524 | term.frameLocation = currentCallFrameSize; | |
525 | currentCallFrameSize += RegexStackSpaceForBackTrackInfoCharacterClass; | |
526 | alternative->m_hasFixedSize = false; | |
527 | } else | |
528 | currentInputPosition += term.quantityCount; | |
529 | break; | |
530 | ||
531 | case PatternTerm::TypeParenthesesSubpattern: | |
532 | // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own. | |
533 | term.frameLocation = currentCallFrameSize; | |
534 | if ((term.quantityCount == 1) && !term.parentheses.isCopy) { | |
535 | if (term.quantityType == QuantifierFixedCount) { | |
536 | currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition); | |
537 | currentInputPosition += term.parentheses.disjunction->m_minimumSize; | |
538 | } else { | |
539 | currentCallFrameSize += RegexStackSpaceForBackTrackInfoParenthesesOnce; | |
540 | currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition); | |
541 | } | |
542 | term.inputPosition = currentInputPosition; | |
543 | } else { | |
544 | term.inputPosition = currentInputPosition; | |
545 | setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition); | |
546 | currentCallFrameSize += RegexStackSpaceForBackTrackInfoParentheses; | |
547 | } | |
548 | // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length. | |
549 | alternative->m_hasFixedSize = false; | |
550 | break; | |
551 | ||
552 | case PatternTerm::TypeParentheticalAssertion: | |
553 | term.inputPosition = currentInputPosition; | |
554 | term.frameLocation = currentCallFrameSize; | |
555 | currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + RegexStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition); | |
556 | break; | |
557 | } | |
558 | } | |
559 | ||
560 | alternative->m_minimumSize = currentInputPosition - initialInputPosition; | |
561 | return currentCallFrameSize; | |
562 | } | |
563 | ||
564 | unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition) | |
565 | { | |
566 | if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1)) | |
567 | initialCallFrameSize += RegexStackSpaceForBackTrackInfoAlternative; | |
568 | ||
569 | unsigned minimumInputSize = UINT_MAX; | |
570 | unsigned maximumCallFrameSize = 0; | |
571 | bool hasFixedSize = true; | |
572 | ||
573 | for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { | |
574 | PatternAlternative* alternative = disjunction->m_alternatives[alt]; | |
575 | unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition); | |
576 | minimumInputSize = min(minimumInputSize, alternative->m_minimumSize); | |
577 | maximumCallFrameSize = max(maximumCallFrameSize, currentAlternativeCallFrameSize); | |
578 | hasFixedSize &= alternative->m_hasFixedSize; | |
579 | } | |
580 | ||
581 | ASSERT(minimumInputSize != UINT_MAX); | |
582 | ASSERT(maximumCallFrameSize >= initialCallFrameSize); | |
583 | ||
584 | disjunction->m_hasFixedSize = hasFixedSize; | |
585 | disjunction->m_minimumSize = minimumInputSize; | |
586 | disjunction->m_callFrameSize = maximumCallFrameSize; | |
587 | return maximumCallFrameSize; | |
588 | } | |
589 | ||
590 | void setupOffsets() | |
591 | { | |
592 | setupDisjunctionOffsets(m_pattern.m_body, 0, 0); | |
593 | } | |
594 | ||
595 | private: | |
596 | RegexPattern& m_pattern; | |
597 | PatternAlternative* m_alternative; | |
598 | CharacterClassConstructor m_characterClassConstructor; | |
599 | bool m_invertCharacterClass; | |
600 | }; | |
601 | ||
602 | ||
603 | const char* compileRegex(const UString& patternString, RegexPattern& pattern) | |
604 | { | |
605 | RegexPatternConstructor constructor(pattern); | |
606 | ||
607 | if (const char* error = parse(constructor, patternString)) | |
608 | return error; | |
609 | ||
610 | // If the pattern contains illegal backreferences reset & reparse. | |
611 | // Quoting Netscape's "What's new in JavaScript 1.2", | |
612 | // "Note: if the number of left parentheses is less than the number specified | |
613 | // in \#, the \# is taken as an octal escape as described in the next row." | |
614 | if (pattern.containsIllegalBackReference()) { | |
615 | unsigned numSubpatterns = pattern.m_numSubpatterns; | |
616 | ||
617 | constructor.reset(); | |
f9bf01c6 | 618 | #if !ASSERT_DISABLED |
ba379fdc A |
619 | const char* error = |
620 | #endif | |
621 | parse(constructor, patternString, numSubpatterns); | |
622 | ||
623 | ASSERT(!error); | |
624 | ASSERT(numSubpatterns == pattern.m_numSubpatterns); | |
625 | } | |
626 | ||
627 | constructor.setupOffsets(); | |
628 | ||
629 | return false; | |
630 | }; | |
631 | ||
632 | ||
633 | } } | |
634 | ||
635 | #endif |