]>
Commit | Line | Data |
---|---|---|
14957cd0 A |
1 | /* |
2 | * Copyright (C) 2009 Apple Inc. All rights reserved. | |
3 | * | |
4 | * Redistribution and use in source and binary forms, with or without | |
5 | * modification, are permitted provided that the following conditions | |
6 | * are met: | |
7 | * 1. Redistributions of source code must retain the above copyright | |
8 | * notice, this list of conditions and the following disclaimer. | |
9 | * 2. Redistributions in binary form must reproduce the above copyright | |
10 | * notice, this list of conditions and the following disclaimer in the | |
11 | * documentation and/or other materials provided with the distribution. | |
12 | * | |
13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY | |
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR | |
17 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
18 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
20 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
21 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
23 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
24 | */ | |
25 | ||
26 | #include "config.h" | |
27 | #include "YarrJIT.h" | |
28 | ||
29 | #include "ASCIICType.h" | |
30 | #include "LinkBuffer.h" | |
31 | #include "Yarr.h" | |
32 | ||
33 | #if ENABLE(YARR_JIT) | |
34 | ||
35 | using namespace WTF; | |
36 | ||
37 | namespace JSC { namespace Yarr { | |
38 | ||
39 | class YarrGenerator : private MacroAssembler { | |
40 | friend void jitCompile(JSGlobalData*, YarrCodeBlock& jitObject, const UString& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline); | |
41 | ||
42 | #if CPU(ARM) | |
43 | static const RegisterID input = ARMRegisters::r0; | |
44 | static const RegisterID index = ARMRegisters::r1; | |
45 | static const RegisterID length = ARMRegisters::r2; | |
46 | static const RegisterID output = ARMRegisters::r4; | |
47 | ||
48 | static const RegisterID regT0 = ARMRegisters::r5; | |
49 | static const RegisterID regT1 = ARMRegisters::r6; | |
50 | ||
51 | static const RegisterID returnRegister = ARMRegisters::r0; | |
52 | #elif CPU(MIPS) | |
53 | static const RegisterID input = MIPSRegisters::a0; | |
54 | static const RegisterID index = MIPSRegisters::a1; | |
55 | static const RegisterID length = MIPSRegisters::a2; | |
56 | static const RegisterID output = MIPSRegisters::a3; | |
57 | ||
58 | static const RegisterID regT0 = MIPSRegisters::t4; | |
59 | static const RegisterID regT1 = MIPSRegisters::t5; | |
60 | ||
61 | static const RegisterID returnRegister = MIPSRegisters::v0; | |
62 | #elif CPU(SH4) | |
63 | static const RegisterID input = SH4Registers::r4; | |
64 | static const RegisterID index = SH4Registers::r5; | |
65 | static const RegisterID length = SH4Registers::r6; | |
66 | static const RegisterID output = SH4Registers::r7; | |
67 | ||
68 | static const RegisterID regT0 = SH4Registers::r0; | |
69 | static const RegisterID regT1 = SH4Registers::r1; | |
70 | ||
71 | static const RegisterID returnRegister = SH4Registers::r0; | |
72 | #elif CPU(X86) | |
73 | static const RegisterID input = X86Registers::eax; | |
74 | static const RegisterID index = X86Registers::edx; | |
75 | static const RegisterID length = X86Registers::ecx; | |
76 | static const RegisterID output = X86Registers::edi; | |
77 | ||
78 | static const RegisterID regT0 = X86Registers::ebx; | |
79 | static const RegisterID regT1 = X86Registers::esi; | |
80 | ||
81 | static const RegisterID returnRegister = X86Registers::eax; | |
82 | #elif CPU(X86_64) | |
83 | static const RegisterID input = X86Registers::edi; | |
84 | static const RegisterID index = X86Registers::esi; | |
85 | static const RegisterID length = X86Registers::edx; | |
86 | static const RegisterID output = X86Registers::ecx; | |
87 | ||
88 | static const RegisterID regT0 = X86Registers::eax; | |
89 | static const RegisterID regT1 = X86Registers::ebx; | |
90 | ||
91 | static const RegisterID returnRegister = X86Registers::eax; | |
92 | #endif | |
93 | ||
94 | void optimizeAlternative(PatternAlternative* alternative) | |
95 | { | |
96 | if (!alternative->m_terms.size()) | |
97 | return; | |
98 | ||
99 | for (unsigned i = 0; i < alternative->m_terms.size() - 1; ++i) { | |
100 | PatternTerm& term = alternative->m_terms[i]; | |
101 | PatternTerm& nextTerm = alternative->m_terms[i + 1]; | |
102 | ||
103 | if ((term.type == PatternTerm::TypeCharacterClass) | |
104 | && (term.quantityType == QuantifierFixedCount) | |
105 | && (nextTerm.type == PatternTerm::TypePatternCharacter) | |
106 | && (nextTerm.quantityType == QuantifierFixedCount)) { | |
107 | PatternTerm termCopy = term; | |
108 | alternative->m_terms[i] = nextTerm; | |
109 | alternative->m_terms[i + 1] = termCopy; | |
110 | } | |
111 | } | |
112 | } | |
113 | ||
114 | void matchCharacterClassRange(RegisterID character, JumpList& failures, JumpList& matchDest, const CharacterRange* ranges, unsigned count, unsigned* matchIndex, const UChar* matches, unsigned matchCount) | |
115 | { | |
116 | do { | |
117 | // pick which range we're going to generate | |
118 | int which = count >> 1; | |
119 | char lo = ranges[which].begin; | |
120 | char hi = ranges[which].end; | |
121 | ||
122 | // check if there are any ranges or matches below lo. If not, just jl to failure - | |
123 | // if there is anything else to check, check that first, if it falls through jmp to failure. | |
124 | if ((*matchIndex < matchCount) && (matches[*matchIndex] < lo)) { | |
125 | Jump loOrAbove = branch32(GreaterThanOrEqual, character, Imm32((unsigned short)lo)); | |
126 | ||
127 | // generate code for all ranges before this one | |
128 | if (which) | |
129 | matchCharacterClassRange(character, failures, matchDest, ranges, which, matchIndex, matches, matchCount); | |
130 | ||
131 | while ((*matchIndex < matchCount) && (matches[*matchIndex] < lo)) { | |
132 | matchDest.append(branch32(Equal, character, Imm32((unsigned short)matches[*matchIndex]))); | |
133 | ++*matchIndex; | |
134 | } | |
135 | failures.append(jump()); | |
136 | ||
137 | loOrAbove.link(this); | |
138 | } else if (which) { | |
139 | Jump loOrAbove = branch32(GreaterThanOrEqual, character, Imm32((unsigned short)lo)); | |
140 | ||
141 | matchCharacterClassRange(character, failures, matchDest, ranges, which, matchIndex, matches, matchCount); | |
142 | failures.append(jump()); | |
143 | ||
144 | loOrAbove.link(this); | |
145 | } else | |
146 | failures.append(branch32(LessThan, character, Imm32((unsigned short)lo))); | |
147 | ||
148 | while ((*matchIndex < matchCount) && (matches[*matchIndex] <= hi)) | |
149 | ++*matchIndex; | |
150 | ||
151 | matchDest.append(branch32(LessThanOrEqual, character, Imm32((unsigned short)hi))); | |
152 | // fall through to here, the value is above hi. | |
153 | ||
154 | // shuffle along & loop around if there are any more matches to handle. | |
155 | unsigned next = which + 1; | |
156 | ranges += next; | |
157 | count -= next; | |
158 | } while (count); | |
159 | } | |
160 | ||
161 | void matchCharacterClass(RegisterID character, JumpList& matchDest, const CharacterClass* charClass) | |
162 | { | |
163 | if (charClass->m_table) { | |
164 | ExtendedAddress tableEntry(character, reinterpret_cast<intptr_t>(charClass->m_table->m_table)); | |
165 | matchDest.append(branchTest8(charClass->m_table->m_inverted ? Zero : NonZero, tableEntry)); | |
166 | return; | |
167 | } | |
168 | Jump unicodeFail; | |
169 | if (charClass->m_matchesUnicode.size() || charClass->m_rangesUnicode.size()) { | |
170 | Jump isAscii = branch32(LessThanOrEqual, character, TrustedImm32(0x7f)); | |
171 | ||
172 | if (charClass->m_matchesUnicode.size()) { | |
173 | for (unsigned i = 0; i < charClass->m_matchesUnicode.size(); ++i) { | |
174 | UChar ch = charClass->m_matchesUnicode[i]; | |
175 | matchDest.append(branch32(Equal, character, Imm32(ch))); | |
176 | } | |
177 | } | |
178 | ||
179 | if (charClass->m_rangesUnicode.size()) { | |
180 | for (unsigned i = 0; i < charClass->m_rangesUnicode.size(); ++i) { | |
181 | UChar lo = charClass->m_rangesUnicode[i].begin; | |
182 | UChar hi = charClass->m_rangesUnicode[i].end; | |
183 | ||
184 | Jump below = branch32(LessThan, character, Imm32(lo)); | |
185 | matchDest.append(branch32(LessThanOrEqual, character, Imm32(hi))); | |
186 | below.link(this); | |
187 | } | |
188 | } | |
189 | ||
190 | unicodeFail = jump(); | |
191 | isAscii.link(this); | |
192 | } | |
193 | ||
194 | if (charClass->m_ranges.size()) { | |
195 | unsigned matchIndex = 0; | |
196 | JumpList failures; | |
197 | matchCharacterClassRange(character, failures, matchDest, charClass->m_ranges.begin(), charClass->m_ranges.size(), &matchIndex, charClass->m_matches.begin(), charClass->m_matches.size()); | |
198 | while (matchIndex < charClass->m_matches.size()) | |
199 | matchDest.append(branch32(Equal, character, Imm32((unsigned short)charClass->m_matches[matchIndex++]))); | |
200 | ||
201 | failures.link(this); | |
202 | } else if (charClass->m_matches.size()) { | |
203 | // optimization: gather 'a','A' etc back together, can mask & test once. | |
204 | Vector<char> matchesAZaz; | |
205 | ||
206 | for (unsigned i = 0; i < charClass->m_matches.size(); ++i) { | |
207 | char ch = charClass->m_matches[i]; | |
208 | if (m_pattern.m_ignoreCase) { | |
209 | if (isASCIILower(ch)) { | |
210 | matchesAZaz.append(ch); | |
211 | continue; | |
212 | } | |
213 | if (isASCIIUpper(ch)) | |
214 | continue; | |
215 | } | |
216 | matchDest.append(branch32(Equal, character, Imm32((unsigned short)ch))); | |
217 | } | |
218 | ||
219 | if (unsigned countAZaz = matchesAZaz.size()) { | |
220 | or32(TrustedImm32(32), character); | |
221 | for (unsigned i = 0; i < countAZaz; ++i) | |
222 | matchDest.append(branch32(Equal, character, TrustedImm32(matchesAZaz[i]))); | |
223 | } | |
224 | } | |
225 | ||
226 | if (charClass->m_matchesUnicode.size() || charClass->m_rangesUnicode.size()) | |
227 | unicodeFail.link(this); | |
228 | } | |
229 | ||
230 | // Jumps if input not available; will have (incorrectly) incremented already! | |
231 | Jump jumpIfNoAvailableInput(unsigned countToCheck = 0) | |
232 | { | |
233 | if (countToCheck) | |
234 | add32(Imm32(countToCheck), index); | |
235 | return branch32(Above, index, length); | |
236 | } | |
237 | ||
238 | Jump jumpIfAvailableInput(unsigned countToCheck) | |
239 | { | |
240 | add32(Imm32(countToCheck), index); | |
241 | return branch32(BelowOrEqual, index, length); | |
242 | } | |
243 | ||
244 | Jump checkInput() | |
245 | { | |
246 | return branch32(BelowOrEqual, index, length); | |
247 | } | |
248 | ||
249 | Jump atEndOfInput() | |
250 | { | |
251 | return branch32(Equal, index, length); | |
252 | } | |
253 | ||
254 | Jump notAtEndOfInput() | |
255 | { | |
256 | return branch32(NotEqual, index, length); | |
257 | } | |
258 | ||
259 | Jump jumpIfCharEquals(UChar ch, int inputPosition) | |
260 | { | |
261 | return branch16(Equal, BaseIndex(input, index, TimesTwo, inputPosition * sizeof(UChar)), Imm32(ch)); | |
262 | } | |
263 | ||
264 | Jump jumpIfCharNotEquals(UChar ch, int inputPosition) | |
265 | { | |
266 | return branch16(NotEqual, BaseIndex(input, index, TimesTwo, inputPosition * sizeof(UChar)), Imm32(ch)); | |
267 | } | |
268 | ||
269 | void readCharacter(int inputPosition, RegisterID reg) | |
270 | { | |
271 | load16(BaseIndex(input, index, TimesTwo, inputPosition * sizeof(UChar)), reg); | |
272 | } | |
273 | ||
274 | void storeToFrame(RegisterID reg, unsigned frameLocation) | |
275 | { | |
276 | poke(reg, frameLocation); | |
277 | } | |
278 | ||
279 | void storeToFrame(TrustedImm32 imm, unsigned frameLocation) | |
280 | { | |
281 | poke(imm, frameLocation); | |
282 | } | |
283 | ||
284 | DataLabelPtr storeToFrameWithPatch(unsigned frameLocation) | |
285 | { | |
286 | return storePtrWithPatch(TrustedImmPtr(0), Address(stackPointerRegister, frameLocation * sizeof(void*))); | |
287 | } | |
288 | ||
289 | void loadFromFrame(unsigned frameLocation, RegisterID reg) | |
290 | { | |
291 | peek(reg, frameLocation); | |
292 | } | |
293 | ||
294 | void loadFromFrameAndJump(unsigned frameLocation) | |
295 | { | |
296 | jump(Address(stackPointerRegister, frameLocation * sizeof(void*))); | |
297 | } | |
298 | ||
299 | enum YarrOpCode { | |
300 | // These nodes wrap body alternatives - those in the main disjunction, | |
301 | // rather than subpatterns or assertions. These are chained together in | |
302 | // a doubly linked list, with a 'begin' node for the first alternative, | |
303 | // a 'next' node for each subsequent alternative, and an 'end' node at | |
304 | // the end. In the case of repeating alternatives, the 'end' node also | |
305 | // has a reference back to 'begin'. | |
306 | OpBodyAlternativeBegin, | |
307 | OpBodyAlternativeNext, | |
308 | OpBodyAlternativeEnd, | |
309 | // Similar to the body alternatives, but used for subpatterns with two | |
310 | // or more alternatives. | |
311 | OpNestedAlternativeBegin, | |
312 | OpNestedAlternativeNext, | |
313 | OpNestedAlternativeEnd, | |
314 | // Used for alternatives in subpatterns where there is only a single | |
315 | // alternative (backtrackingis easier in these cases), or for alternatives | |
316 | // which never need to be backtracked (those in parenthetical assertions, | |
317 | // terminal subpatterns). | |
318 | OpSimpleNestedAlternativeBegin, | |
319 | OpSimpleNestedAlternativeNext, | |
320 | OpSimpleNestedAlternativeEnd, | |
321 | // Used to wrap 'Once' subpattern matches (quantityCount == 1). | |
322 | OpParenthesesSubpatternOnceBegin, | |
323 | OpParenthesesSubpatternOnceEnd, | |
324 | // Used to wrap 'Terminal' subpattern matches (at the end of the regexp). | |
325 | OpParenthesesSubpatternTerminalBegin, | |
326 | OpParenthesesSubpatternTerminalEnd, | |
327 | // Used to wrap parenthetical assertions. | |
328 | OpParentheticalAssertionBegin, | |
329 | OpParentheticalAssertionEnd, | |
330 | // Wraps all simple terms (pattern characters, character classes). | |
331 | OpTerm, | |
332 | // Where an expression contains only 'once through' body alternatives | |
333 | // and no repeating ones, this op is used to return match failure. | |
334 | OpMatchFailed | |
335 | }; | |
336 | ||
337 | // This structure is used to hold the compiled opcode information, | |
338 | // including reference back to the original PatternTerm/PatternAlternatives, | |
339 | // and JIT compilation data structures. | |
340 | struct YarrOp { | |
341 | explicit YarrOp(PatternTerm* term) | |
342 | : m_op(OpTerm) | |
343 | , m_term(term) | |
344 | , m_isDeadCode(false) | |
345 | { | |
346 | } | |
347 | ||
348 | explicit YarrOp(YarrOpCode op) | |
349 | : m_op(op) | |
350 | , m_isDeadCode(false) | |
351 | { | |
352 | } | |
353 | ||
354 | // The operation, as a YarrOpCode, and also a reference to the PatternTerm. | |
355 | YarrOpCode m_op; | |
356 | PatternTerm* m_term; | |
357 | ||
358 | // For alternatives, this holds the PatternAlternative and doubly linked | |
359 | // references to this alternative's siblings. In the case of the | |
360 | // OpBodyAlternativeEnd node at the end of a section of repeating nodes, | |
361 | // m_nextOp will reference the OpBodyAlternativeBegin node of the first | |
362 | // repeating alternative. | |
363 | PatternAlternative* m_alternative; | |
364 | size_t m_previousOp; | |
365 | size_t m_nextOp; | |
366 | ||
367 | // Used to record a set of Jumps out of the generated code, typically | |
368 | // used for jumps out to backtracking code, and a single reentry back | |
369 | // into the code for a node (likely where a backtrack will trigger | |
370 | // rematching). | |
371 | Label m_reentry; | |
372 | JumpList m_jumps; | |
373 | ||
374 | // This flag is used to null out the second pattern character, when | |
375 | // two are fused to match a pair together. | |
376 | bool m_isDeadCode; | |
377 | ||
378 | // Currently used in the case of some of the more complex management of | |
379 | // 'm_checked', to cache the offset used in this alternative, to avoid | |
380 | // recalculating it. | |
381 | int m_checkAdjust; | |
382 | ||
383 | // Used by OpNestedAlternativeNext/End to hold the pointer to the | |
384 | // value that will be pushed into the pattern's frame to return to, | |
385 | // upon backtracking back into the disjunction. | |
386 | DataLabelPtr m_returnAddress; | |
387 | }; | |
388 | ||
389 | // BacktrackingState | |
390 | // This class encapsulates information about the state of code generation | |
391 | // whilst generating the code for backtracking, when a term fails to match. | |
392 | // Upon entry to code generation of the backtracking code for a given node, | |
393 | // the Backtracking state will hold references to all control flow sources | |
394 | // that are outputs in need of further backtracking from the prior node | |
395 | // generated (which is the subsequent operation in the regular expression, | |
396 | // and in the m_ops Vector, since we generated backtracking backwards). | |
397 | // These references to control flow take the form of: | |
398 | // - A jump list of jumps, to be linked to code that will backtrack them | |
399 | // further. | |
400 | // - A set of DataLabelPtr values, to be populated with values to be | |
401 | // treated effectively as return addresses backtracking into complex | |
402 | // subpatterns. | |
403 | // - A flag indicating that the current sequence of generated code up to | |
404 | // this point requires backtracking. | |
405 | class BacktrackingState { | |
406 | public: | |
407 | BacktrackingState() | |
408 | : m_pendingFallthrough(false) | |
409 | { | |
410 | } | |
411 | ||
412 | // Add a jump or jumps, a return address, or set the flag indicating | |
413 | // that the current 'fallthrough' control flow requires backtracking. | |
414 | void append(const Jump& jump) | |
415 | { | |
416 | m_laterFailures.append(jump); | |
417 | } | |
418 | void append(JumpList& jumpList) | |
419 | { | |
420 | m_laterFailures.append(jumpList); | |
421 | } | |
422 | void append(const DataLabelPtr& returnAddress) | |
423 | { | |
424 | m_pendingReturns.append(returnAddress); | |
425 | } | |
426 | void fallthrough() | |
427 | { | |
428 | ASSERT(!m_pendingFallthrough); | |
429 | m_pendingFallthrough = true; | |
430 | } | |
431 | ||
432 | // These methods clear the backtracking state, either linking to the | |
433 | // current location, a provided label, or copying the backtracking out | |
434 | // to a JumpList. All actions may require code generation to take place, | |
435 | // and as such are passed a pointer to the assembler. | |
436 | void link(MacroAssembler* assembler) | |
437 | { | |
438 | if (m_pendingReturns.size()) { | |
439 | Label here(assembler); | |
440 | for (unsigned i = 0; i < m_pendingReturns.size(); ++i) | |
441 | m_backtrackRecords.append(ReturnAddressRecord(m_pendingReturns[i], here)); | |
442 | m_pendingReturns.clear(); | |
443 | } | |
444 | m_laterFailures.link(assembler); | |
445 | m_laterFailures.clear(); | |
446 | m_pendingFallthrough = false; | |
447 | } | |
448 | void linkTo(Label label, MacroAssembler* assembler) | |
449 | { | |
450 | if (m_pendingReturns.size()) { | |
451 | for (unsigned i = 0; i < m_pendingReturns.size(); ++i) | |
452 | m_backtrackRecords.append(ReturnAddressRecord(m_pendingReturns[i], label)); | |
453 | m_pendingReturns.clear(); | |
454 | } | |
455 | if (m_pendingFallthrough) | |
456 | assembler->jump(label); | |
457 | m_laterFailures.linkTo(label, assembler); | |
458 | m_laterFailures.clear(); | |
459 | m_pendingFallthrough = false; | |
460 | } | |
461 | void takeBacktracksToJumpList(JumpList& jumpList, MacroAssembler* assembler) | |
462 | { | |
463 | if (m_pendingReturns.size()) { | |
464 | Label here(assembler); | |
465 | for (unsigned i = 0; i < m_pendingReturns.size(); ++i) | |
466 | m_backtrackRecords.append(ReturnAddressRecord(m_pendingReturns[i], here)); | |
467 | m_pendingReturns.clear(); | |
468 | m_pendingFallthrough = true; | |
469 | } | |
470 | if (m_pendingFallthrough) | |
471 | jumpList.append(assembler->jump()); | |
472 | jumpList.append(m_laterFailures); | |
473 | m_laterFailures.clear(); | |
474 | m_pendingFallthrough = false; | |
475 | } | |
476 | ||
477 | bool isEmpty() | |
478 | { | |
479 | return m_laterFailures.empty() && m_pendingReturns.isEmpty() && !m_pendingFallthrough; | |
480 | } | |
481 | ||
482 | // Called at the end of code generation to link all return addresses. | |
483 | void linkDataLabels(LinkBuffer& linkBuffer) | |
484 | { | |
485 | ASSERT(isEmpty()); | |
486 | for (unsigned i = 0; i < m_backtrackRecords.size(); ++i) | |
487 | linkBuffer.patch(m_backtrackRecords[i].m_dataLabel, linkBuffer.locationOf(m_backtrackRecords[i].m_backtrackLocation)); | |
488 | } | |
489 | ||
490 | private: | |
491 | struct ReturnAddressRecord { | |
492 | ReturnAddressRecord(DataLabelPtr dataLabel, Label backtrackLocation) | |
493 | : m_dataLabel(dataLabel) | |
494 | , m_backtrackLocation(backtrackLocation) | |
495 | { | |
496 | } | |
497 | ||
498 | DataLabelPtr m_dataLabel; | |
499 | Label m_backtrackLocation; | |
500 | }; | |
501 | ||
502 | JumpList m_laterFailures; | |
503 | bool m_pendingFallthrough; | |
504 | Vector<DataLabelPtr, 4> m_pendingReturns; | |
505 | Vector<ReturnAddressRecord, 4> m_backtrackRecords; | |
506 | }; | |
507 | ||
508 | // Generation methods: | |
509 | // =================== | |
510 | ||
511 | // This method provides a default implementation of backtracking common | |
512 | // to many terms; terms commonly jump out of the forwards matching path | |
513 | // on any failed conditions, and add these jumps to the m_jumps list. If | |
514 | // no special handling is required we can often just backtrack to m_jumps. | |
515 | void backtrackTermDefault(size_t opIndex) | |
516 | { | |
517 | YarrOp& op = m_ops[opIndex]; | |
518 | m_backtrackingState.append(op.m_jumps); | |
519 | } | |
520 | ||
521 | void generateAssertionBOL(size_t opIndex) | |
522 | { | |
523 | YarrOp& op = m_ops[opIndex]; | |
524 | PatternTerm* term = op.m_term; | |
525 | ||
526 | if (m_pattern.m_multiline) { | |
527 | const RegisterID character = regT0; | |
528 | ||
529 | JumpList matchDest; | |
530 | if (!term->inputPosition) | |
531 | matchDest.append(branch32(Equal, index, Imm32(m_checked))); | |
532 | ||
533 | readCharacter((term->inputPosition - m_checked) - 1, character); | |
534 | matchCharacterClass(character, matchDest, m_pattern.newlineCharacterClass()); | |
535 | op.m_jumps.append(jump()); | |
536 | ||
537 | matchDest.link(this); | |
538 | } else { | |
539 | // Erk, really should poison out these alternatives early. :-/ | |
540 | if (term->inputPosition) | |
541 | op.m_jumps.append(jump()); | |
542 | else | |
543 | op.m_jumps.append(branch32(NotEqual, index, Imm32(m_checked))); | |
544 | } | |
545 | } | |
546 | void backtrackAssertionBOL(size_t opIndex) | |
547 | { | |
548 | backtrackTermDefault(opIndex); | |
549 | } | |
550 | ||
551 | void generateAssertionEOL(size_t opIndex) | |
552 | { | |
553 | YarrOp& op = m_ops[opIndex]; | |
554 | PatternTerm* term = op.m_term; | |
555 | ||
556 | if (m_pattern.m_multiline) { | |
557 | const RegisterID character = regT0; | |
558 | ||
559 | JumpList matchDest; | |
560 | if (term->inputPosition == m_checked) | |
561 | matchDest.append(atEndOfInput()); | |
562 | ||
1df5f87f | 563 | readCharacter(term->inputPosition - m_checked, character); |
14957cd0 A |
564 | matchCharacterClass(character, matchDest, m_pattern.newlineCharacterClass()); |
565 | op.m_jumps.append(jump()); | |
566 | ||
567 | matchDest.link(this); | |
568 | } else { | |
569 | if (term->inputPosition == m_checked) | |
570 | op.m_jumps.append(notAtEndOfInput()); | |
571 | // Erk, really should poison out these alternatives early. :-/ | |
572 | else | |
573 | op.m_jumps.append(jump()); | |
574 | } | |
575 | } | |
576 | void backtrackAssertionEOL(size_t opIndex) | |
577 | { | |
578 | backtrackTermDefault(opIndex); | |
579 | } | |
580 | ||
581 | // Also falls though on nextIsNotWordChar. | |
582 | void matchAssertionWordchar(size_t opIndex, JumpList& nextIsWordChar, JumpList& nextIsNotWordChar) | |
583 | { | |
584 | YarrOp& op = m_ops[opIndex]; | |
585 | PatternTerm* term = op.m_term; | |
586 | ||
587 | const RegisterID character = regT0; | |
588 | ||
589 | if (term->inputPosition == m_checked) | |
590 | nextIsNotWordChar.append(atEndOfInput()); | |
591 | ||
592 | readCharacter((term->inputPosition - m_checked), character); | |
593 | matchCharacterClass(character, nextIsWordChar, m_pattern.wordcharCharacterClass()); | |
594 | } | |
595 | ||
596 | void generateAssertionWordBoundary(size_t opIndex) | |
597 | { | |
598 | YarrOp& op = m_ops[opIndex]; | |
599 | PatternTerm* term = op.m_term; | |
600 | ||
601 | const RegisterID character = regT0; | |
602 | ||
603 | Jump atBegin; | |
604 | JumpList matchDest; | |
605 | if (!term->inputPosition) | |
606 | atBegin = branch32(Equal, index, Imm32(m_checked)); | |
607 | readCharacter((term->inputPosition - m_checked) - 1, character); | |
608 | matchCharacterClass(character, matchDest, m_pattern.wordcharCharacterClass()); | |
609 | if (!term->inputPosition) | |
610 | atBegin.link(this); | |
611 | ||
612 | // We fall through to here if the last character was not a wordchar. | |
613 | JumpList nonWordCharThenWordChar; | |
614 | JumpList nonWordCharThenNonWordChar; | |
615 | if (term->invert()) { | |
616 | matchAssertionWordchar(opIndex, nonWordCharThenNonWordChar, nonWordCharThenWordChar); | |
617 | nonWordCharThenWordChar.append(jump()); | |
618 | } else { | |
619 | matchAssertionWordchar(opIndex, nonWordCharThenWordChar, nonWordCharThenNonWordChar); | |
620 | nonWordCharThenNonWordChar.append(jump()); | |
621 | } | |
622 | op.m_jumps.append(nonWordCharThenNonWordChar); | |
623 | ||
624 | // We jump here if the last character was a wordchar. | |
625 | matchDest.link(this); | |
626 | JumpList wordCharThenWordChar; | |
627 | JumpList wordCharThenNonWordChar; | |
628 | if (term->invert()) { | |
629 | matchAssertionWordchar(opIndex, wordCharThenNonWordChar, wordCharThenWordChar); | |
630 | wordCharThenWordChar.append(jump()); | |
631 | } else { | |
632 | matchAssertionWordchar(opIndex, wordCharThenWordChar, wordCharThenNonWordChar); | |
633 | // This can fall-though! | |
634 | } | |
635 | ||
636 | op.m_jumps.append(wordCharThenWordChar); | |
637 | ||
638 | nonWordCharThenWordChar.link(this); | |
639 | wordCharThenNonWordChar.link(this); | |
640 | } | |
641 | void backtrackAssertionWordBoundary(size_t opIndex) | |
642 | { | |
643 | backtrackTermDefault(opIndex); | |
644 | } | |
645 | ||
646 | void generatePatternCharacterOnce(size_t opIndex) | |
647 | { | |
648 | YarrOp& op = m_ops[opIndex]; | |
649 | ||
650 | // m_ops always ends with a OpBodyAlternativeEnd or OpMatchFailed | |
651 | // node, so there must always be at least one more node. | |
652 | ASSERT(opIndex + 1 < m_ops.size()); | |
653 | YarrOp& nextOp = m_ops[opIndex + 1]; | |
654 | ||
655 | if (op.m_isDeadCode) | |
656 | return; | |
657 | ||
658 | PatternTerm* term = op.m_term; | |
659 | UChar ch = term->patternCharacter; | |
660 | ||
661 | const RegisterID character = regT0; | |
662 | ||
663 | if (nextOp.m_op == OpTerm) { | |
664 | PatternTerm* nextTerm = nextOp.m_term; | |
665 | if (nextTerm->type == PatternTerm::TypePatternCharacter | |
666 | && nextTerm->quantityType == QuantifierFixedCount | |
667 | && nextTerm->quantityCount == 1 | |
668 | && nextTerm->inputPosition == (term->inputPosition + 1)) { | |
669 | ||
670 | UChar ch2 = nextTerm->patternCharacter; | |
671 | ||
672 | int mask = 0; | |
673 | int chPair = ch | (ch2 << 16); | |
674 | ||
675 | if (m_pattern.m_ignoreCase) { | |
676 | if (isASCIIAlpha(ch)) | |
677 | mask |= 32; | |
678 | if (isASCIIAlpha(ch2)) | |
679 | mask |= 32 << 16; | |
680 | } | |
681 | ||
682 | BaseIndex address(input, index, TimesTwo, (term->inputPosition - m_checked) * sizeof(UChar)); | |
683 | if (mask) { | |
684 | load32WithUnalignedHalfWords(address, character); | |
685 | or32(Imm32(mask), character); | |
686 | op.m_jumps.append(branch32(NotEqual, character, Imm32(chPair | mask))); | |
687 | } else | |
688 | op.m_jumps.append(branch32WithUnalignedHalfWords(NotEqual, address, Imm32(chPair))); | |
689 | ||
690 | nextOp.m_isDeadCode = true; | |
691 | return; | |
692 | } | |
693 | } | |
694 | ||
695 | if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { | |
696 | readCharacter(term->inputPosition - m_checked, character); | |
697 | or32(TrustedImm32(32), character); | |
698 | op.m_jumps.append(branch32(NotEqual, character, Imm32(Unicode::toLower(ch)))); | |
699 | } else { | |
700 | ASSERT(!m_pattern.m_ignoreCase || (Unicode::toLower(ch) == Unicode::toUpper(ch))); | |
701 | op.m_jumps.append(jumpIfCharNotEquals(ch, term->inputPosition - m_checked)); | |
702 | } | |
703 | } | |
704 | void backtrackPatternCharacterOnce(size_t opIndex) | |
705 | { | |
706 | backtrackTermDefault(opIndex); | |
707 | } | |
708 | ||
709 | void generatePatternCharacterFixed(size_t opIndex) | |
710 | { | |
711 | YarrOp& op = m_ops[opIndex]; | |
712 | PatternTerm* term = op.m_term; | |
713 | UChar ch = term->patternCharacter; | |
714 | ||
715 | const RegisterID character = regT0; | |
716 | const RegisterID countRegister = regT1; | |
717 | ||
718 | move(index, countRegister); | |
1df5f87f | 719 | sub32(Imm32(term->quantityCount.unsafeGet()), countRegister); |
14957cd0 A |
720 | |
721 | Label loop(this); | |
1df5f87f | 722 | BaseIndex address(input, countRegister, TimesTwo, ((term->inputPosition - m_checked + Checked<int>(term->quantityCount)) * static_cast<int>(sizeof(UChar))).unsafeGet()); |
14957cd0 A |
723 | |
724 | if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { | |
725 | load16(address, character); | |
726 | or32(TrustedImm32(32), character); | |
727 | op.m_jumps.append(branch32(NotEqual, character, Imm32(Unicode::toLower(ch)))); | |
728 | } else { | |
729 | ASSERT(!m_pattern.m_ignoreCase || (Unicode::toLower(ch) == Unicode::toUpper(ch))); | |
730 | op.m_jumps.append(branch16(NotEqual, address, Imm32(ch))); | |
731 | } | |
732 | add32(TrustedImm32(1), countRegister); | |
733 | branch32(NotEqual, countRegister, index).linkTo(loop, this); | |
734 | } | |
735 | void backtrackPatternCharacterFixed(size_t opIndex) | |
736 | { | |
737 | backtrackTermDefault(opIndex); | |
738 | } | |
739 | ||
740 | void generatePatternCharacterGreedy(size_t opIndex) | |
741 | { | |
742 | YarrOp& op = m_ops[opIndex]; | |
743 | PatternTerm* term = op.m_term; | |
744 | UChar ch = term->patternCharacter; | |
745 | ||
746 | const RegisterID character = regT0; | |
747 | const RegisterID countRegister = regT1; | |
748 | ||
749 | move(TrustedImm32(0), countRegister); | |
750 | ||
751 | JumpList failures; | |
752 | Label loop(this); | |
753 | failures.append(atEndOfInput()); | |
754 | if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { | |
755 | readCharacter(term->inputPosition - m_checked, character); | |
756 | or32(TrustedImm32(32), character); | |
757 | failures.append(branch32(NotEqual, character, Imm32(Unicode::toLower(ch)))); | |
758 | } else { | |
759 | ASSERT(!m_pattern.m_ignoreCase || (Unicode::toLower(ch) == Unicode::toUpper(ch))); | |
760 | failures.append(jumpIfCharNotEquals(ch, term->inputPosition - m_checked)); | |
761 | } | |
762 | ||
763 | add32(TrustedImm32(1), countRegister); | |
764 | add32(TrustedImm32(1), index); | |
765 | if (term->quantityCount == quantifyInfinite) | |
766 | jump(loop); | |
767 | else | |
1df5f87f | 768 | branch32(NotEqual, countRegister, Imm32(term->quantityCount.unsafeGet())).linkTo(loop, this); |
14957cd0 A |
769 | |
770 | failures.link(this); | |
771 | op.m_reentry = label(); | |
772 | ||
773 | storeToFrame(countRegister, term->frameLocation); | |
774 | ||
775 | } | |
776 | void backtrackPatternCharacterGreedy(size_t opIndex) | |
777 | { | |
778 | YarrOp& op = m_ops[opIndex]; | |
779 | PatternTerm* term = op.m_term; | |
780 | ||
781 | const RegisterID countRegister = regT1; | |
782 | ||
783 | m_backtrackingState.link(this); | |
784 | ||
785 | loadFromFrame(term->frameLocation, countRegister); | |
786 | m_backtrackingState.append(branchTest32(Zero, countRegister)); | |
787 | sub32(TrustedImm32(1), countRegister); | |
788 | sub32(TrustedImm32(1), index); | |
789 | jump(op.m_reentry); | |
790 | } | |
791 | ||
792 | void generatePatternCharacterNonGreedy(size_t opIndex) | |
793 | { | |
794 | YarrOp& op = m_ops[opIndex]; | |
795 | PatternTerm* term = op.m_term; | |
796 | ||
797 | const RegisterID countRegister = regT1; | |
798 | ||
799 | move(TrustedImm32(0), countRegister); | |
800 | op.m_reentry = label(); | |
801 | storeToFrame(countRegister, term->frameLocation); | |
802 | } | |
803 | void backtrackPatternCharacterNonGreedy(size_t opIndex) | |
804 | { | |
805 | YarrOp& op = m_ops[opIndex]; | |
806 | PatternTerm* term = op.m_term; | |
807 | UChar ch = term->patternCharacter; | |
808 | ||
809 | const RegisterID character = regT0; | |
810 | const RegisterID countRegister = regT1; | |
811 | ||
812 | JumpList nonGreedyFailures; | |
813 | ||
814 | m_backtrackingState.link(this); | |
815 | ||
816 | loadFromFrame(term->frameLocation, countRegister); | |
817 | ||
818 | nonGreedyFailures.append(atEndOfInput()); | |
819 | if (term->quantityCount != quantifyInfinite) | |
1df5f87f | 820 | nonGreedyFailures.append(branch32(Equal, countRegister, Imm32(term->quantityCount.unsafeGet()))); |
14957cd0 A |
821 | if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { |
822 | readCharacter(term->inputPosition - m_checked, character); | |
823 | or32(TrustedImm32(32), character); | |
824 | nonGreedyFailures.append(branch32(NotEqual, character, Imm32(Unicode::toLower(ch)))); | |
825 | } else { | |
826 | ASSERT(!m_pattern.m_ignoreCase || (Unicode::toLower(ch) == Unicode::toUpper(ch))); | |
827 | nonGreedyFailures.append(jumpIfCharNotEquals(ch, term->inputPosition - m_checked)); | |
828 | } | |
829 | ||
830 | add32(TrustedImm32(1), countRegister); | |
831 | add32(TrustedImm32(1), index); | |
832 | ||
833 | jump(op.m_reentry); | |
834 | ||
835 | nonGreedyFailures.link(this); | |
836 | sub32(countRegister, index); | |
837 | m_backtrackingState.fallthrough(); | |
838 | } | |
839 | ||
840 | void generateCharacterClassOnce(size_t opIndex) | |
841 | { | |
842 | YarrOp& op = m_ops[opIndex]; | |
843 | PatternTerm* term = op.m_term; | |
844 | ||
845 | const RegisterID character = regT0; | |
846 | ||
847 | JumpList matchDest; | |
1df5f87f | 848 | readCharacter(term->inputPosition - m_checked, character); |
14957cd0 A |
849 | matchCharacterClass(character, matchDest, term->characterClass); |
850 | ||
851 | if (term->invert()) | |
852 | op.m_jumps.append(matchDest); | |
853 | else { | |
854 | op.m_jumps.append(jump()); | |
855 | matchDest.link(this); | |
856 | } | |
857 | } | |
858 | void backtrackCharacterClassOnce(size_t opIndex) | |
859 | { | |
860 | backtrackTermDefault(opIndex); | |
861 | } | |
862 | ||
863 | void generateCharacterClassFixed(size_t opIndex) | |
864 | { | |
865 | YarrOp& op = m_ops[opIndex]; | |
866 | PatternTerm* term = op.m_term; | |
867 | ||
868 | const RegisterID character = regT0; | |
869 | const RegisterID countRegister = regT1; | |
870 | ||
871 | move(index, countRegister); | |
1df5f87f | 872 | sub32(Imm32(term->quantityCount.unsafeGet()), countRegister); |
14957cd0 A |
873 | |
874 | Label loop(this); | |
875 | JumpList matchDest; | |
1df5f87f | 876 | load16(BaseIndex(input, countRegister, TimesTwo, ((term->inputPosition - m_checked + Checked<int>(term->quantityCount)) * static_cast<int>(sizeof(UChar))).unsafeGet()), character); |
14957cd0 A |
877 | matchCharacterClass(character, matchDest, term->characterClass); |
878 | ||
879 | if (term->invert()) | |
880 | op.m_jumps.append(matchDest); | |
881 | else { | |
882 | op.m_jumps.append(jump()); | |
883 | matchDest.link(this); | |
884 | } | |
885 | ||
886 | add32(TrustedImm32(1), countRegister); | |
887 | branch32(NotEqual, countRegister, index).linkTo(loop, this); | |
888 | } | |
889 | void backtrackCharacterClassFixed(size_t opIndex) | |
890 | { | |
891 | backtrackTermDefault(opIndex); | |
892 | } | |
893 | ||
894 | void generateCharacterClassGreedy(size_t opIndex) | |
895 | { | |
896 | YarrOp& op = m_ops[opIndex]; | |
897 | PatternTerm* term = op.m_term; | |
898 | ||
899 | const RegisterID character = regT0; | |
900 | const RegisterID countRegister = regT1; | |
901 | ||
902 | move(TrustedImm32(0), countRegister); | |
903 | ||
904 | JumpList failures; | |
905 | Label loop(this); | |
906 | failures.append(atEndOfInput()); | |
907 | ||
908 | if (term->invert()) { | |
909 | readCharacter(term->inputPosition - m_checked, character); | |
910 | matchCharacterClass(character, failures, term->characterClass); | |
911 | } else { | |
912 | JumpList matchDest; | |
913 | readCharacter(term->inputPosition - m_checked, character); | |
914 | matchCharacterClass(character, matchDest, term->characterClass); | |
915 | failures.append(jump()); | |
916 | matchDest.link(this); | |
917 | } | |
918 | ||
919 | add32(TrustedImm32(1), countRegister); | |
920 | add32(TrustedImm32(1), index); | |
921 | if (term->quantityCount != quantifyInfinite) { | |
1df5f87f | 922 | branch32(NotEqual, countRegister, Imm32(term->quantityCount.unsafeGet())).linkTo(loop, this); |
14957cd0 A |
923 | failures.append(jump()); |
924 | } else | |
925 | jump(loop); | |
926 | ||
927 | failures.link(this); | |
928 | op.m_reentry = label(); | |
929 | ||
930 | storeToFrame(countRegister, term->frameLocation); | |
931 | } | |
932 | void backtrackCharacterClassGreedy(size_t opIndex) | |
933 | { | |
934 | YarrOp& op = m_ops[opIndex]; | |
935 | PatternTerm* term = op.m_term; | |
936 | ||
937 | const RegisterID countRegister = regT1; | |
938 | ||
939 | m_backtrackingState.link(this); | |
940 | ||
941 | loadFromFrame(term->frameLocation, countRegister); | |
942 | m_backtrackingState.append(branchTest32(Zero, countRegister)); | |
943 | sub32(TrustedImm32(1), countRegister); | |
944 | sub32(TrustedImm32(1), index); | |
945 | jump(op.m_reentry); | |
946 | } | |
947 | ||
948 | void generateCharacterClassNonGreedy(size_t opIndex) | |
949 | { | |
950 | YarrOp& op = m_ops[opIndex]; | |
951 | PatternTerm* term = op.m_term; | |
952 | ||
953 | const RegisterID countRegister = regT1; | |
954 | ||
955 | move(TrustedImm32(0), countRegister); | |
956 | op.m_reentry = label(); | |
957 | storeToFrame(countRegister, term->frameLocation); | |
958 | } | |
959 | void backtrackCharacterClassNonGreedy(size_t opIndex) | |
960 | { | |
961 | YarrOp& op = m_ops[opIndex]; | |
962 | PatternTerm* term = op.m_term; | |
963 | ||
964 | const RegisterID character = regT0; | |
965 | const RegisterID countRegister = regT1; | |
966 | ||
967 | JumpList nonGreedyFailures; | |
968 | ||
969 | m_backtrackingState.link(this); | |
970 | ||
971 | Label backtrackBegin(this); | |
972 | loadFromFrame(term->frameLocation, countRegister); | |
973 | ||
974 | nonGreedyFailures.append(atEndOfInput()); | |
1df5f87f | 975 | nonGreedyFailures.append(branch32(Equal, countRegister, Imm32(term->quantityCount.unsafeGet()))); |
14957cd0 A |
976 | |
977 | JumpList matchDest; | |
978 | readCharacter(term->inputPosition - m_checked, character); | |
979 | matchCharacterClass(character, matchDest, term->characterClass); | |
980 | ||
981 | if (term->invert()) | |
982 | nonGreedyFailures.append(matchDest); | |
983 | else { | |
984 | nonGreedyFailures.append(jump()); | |
985 | matchDest.link(this); | |
986 | } | |
987 | ||
988 | add32(TrustedImm32(1), countRegister); | |
989 | add32(TrustedImm32(1), index); | |
990 | ||
991 | jump(op.m_reentry); | |
992 | ||
993 | nonGreedyFailures.link(this); | |
994 | sub32(countRegister, index); | |
995 | m_backtrackingState.fallthrough(); | |
996 | } | |
997 | ||
998 | void generateDotStarEnclosure(size_t opIndex) | |
999 | { | |
1000 | YarrOp& op = m_ops[opIndex]; | |
1001 | PatternTerm* term = op.m_term; | |
1002 | ||
1003 | const RegisterID character = regT0; | |
1004 | const RegisterID matchPos = regT1; | |
1005 | ||
1006 | JumpList foundBeginningNewLine; | |
1007 | JumpList saveStartIndex; | |
1008 | JumpList foundEndingNewLine; | |
1009 | ||
1010 | if (m_pattern.m_body->m_hasFixedSize) { | |
1011 | move(index, matchPos); | |
1012 | sub32(Imm32(m_checked), matchPos); | |
1013 | } else | |
1014 | load32(Address(output), matchPos); | |
1015 | ||
1016 | saveStartIndex.append(branchTest32(Zero, matchPos)); | |
1017 | Label findBOLLoop(this); | |
1018 | sub32(TrustedImm32(1), matchPos); | |
1019 | load16(BaseIndex(input, matchPos, TimesTwo, 0), character); | |
1020 | matchCharacterClass(character, foundBeginningNewLine, m_pattern.newlineCharacterClass()); | |
1021 | branchTest32(NonZero, matchPos).linkTo(findBOLLoop, this); | |
1022 | saveStartIndex.append(jump()); | |
1023 | ||
1024 | foundBeginningNewLine.link(this); | |
1025 | add32(TrustedImm32(1), matchPos); // Advance past newline | |
1026 | saveStartIndex.link(this); | |
1027 | ||
1028 | if (!m_pattern.m_multiline && term->anchors.bolAnchor) | |
1029 | op.m_jumps.append(branchTest32(NonZero, matchPos)); | |
1030 | ||
1031 | store32(matchPos, Address(output)); | |
1032 | ||
1033 | move(index, matchPos); | |
1034 | ||
1035 | Label findEOLLoop(this); | |
1036 | foundEndingNewLine.append(branch32(Equal, matchPos, length)); | |
1037 | load16(BaseIndex(input, matchPos, TimesTwo, 0), character); | |
1038 | matchCharacterClass(character, foundEndingNewLine, m_pattern.newlineCharacterClass()); | |
1039 | add32(TrustedImm32(1), matchPos); | |
1040 | jump(findEOLLoop); | |
1041 | ||
1042 | foundEndingNewLine.link(this); | |
1043 | ||
1044 | if (!m_pattern.m_multiline && term->anchors.eolAnchor) | |
1045 | op.m_jumps.append(branch32(NotEqual, matchPos, length)); | |
1046 | ||
1047 | move(matchPos, index); | |
1048 | } | |
1049 | ||
1050 | void backtrackDotStarEnclosure(size_t opIndex) | |
1051 | { | |
1052 | backtrackTermDefault(opIndex); | |
1053 | } | |
1054 | ||
1055 | // Code generation/backtracking for simple terms | |
1056 | // (pattern characters, character classes, and assertions). | |
1057 | // These methods farm out work to the set of functions above. | |
1058 | void generateTerm(size_t opIndex) | |
1059 | { | |
1060 | YarrOp& op = m_ops[opIndex]; | |
1061 | PatternTerm* term = op.m_term; | |
1062 | ||
1063 | switch (term->type) { | |
1064 | case PatternTerm::TypePatternCharacter: | |
1065 | switch (term->quantityType) { | |
1066 | case QuantifierFixedCount: | |
1067 | if (term->quantityCount == 1) | |
1068 | generatePatternCharacterOnce(opIndex); | |
1069 | else | |
1070 | generatePatternCharacterFixed(opIndex); | |
1071 | break; | |
1072 | case QuantifierGreedy: | |
1073 | generatePatternCharacterGreedy(opIndex); | |
1074 | break; | |
1075 | case QuantifierNonGreedy: | |
1076 | generatePatternCharacterNonGreedy(opIndex); | |
1077 | break; | |
1078 | } | |
1079 | break; | |
1080 | ||
1081 | case PatternTerm::TypeCharacterClass: | |
1082 | switch (term->quantityType) { | |
1083 | case QuantifierFixedCount: | |
1084 | if (term->quantityCount == 1) | |
1085 | generateCharacterClassOnce(opIndex); | |
1086 | else | |
1087 | generateCharacterClassFixed(opIndex); | |
1088 | break; | |
1089 | case QuantifierGreedy: | |
1090 | generateCharacterClassGreedy(opIndex); | |
1091 | break; | |
1092 | case QuantifierNonGreedy: | |
1093 | generateCharacterClassNonGreedy(opIndex); | |
1094 | break; | |
1095 | } | |
1096 | break; | |
1097 | ||
1098 | case PatternTerm::TypeAssertionBOL: | |
1099 | generateAssertionBOL(opIndex); | |
1100 | break; | |
1101 | ||
1102 | case PatternTerm::TypeAssertionEOL: | |
1103 | generateAssertionEOL(opIndex); | |
1104 | break; | |
1105 | ||
1106 | case PatternTerm::TypeAssertionWordBoundary: | |
1107 | generateAssertionWordBoundary(opIndex); | |
1108 | break; | |
1109 | ||
1110 | case PatternTerm::TypeForwardReference: | |
1111 | break; | |
1112 | ||
1113 | case PatternTerm::TypeParenthesesSubpattern: | |
1114 | case PatternTerm::TypeParentheticalAssertion: | |
1115 | ASSERT_NOT_REACHED(); | |
1116 | case PatternTerm::TypeBackReference: | |
1117 | m_shouldFallBack = true; | |
1118 | break; | |
1119 | case PatternTerm::TypeDotStarEnclosure: | |
1120 | generateDotStarEnclosure(opIndex); | |
1121 | break; | |
1122 | } | |
1123 | } | |
1124 | void backtrackTerm(size_t opIndex) | |
1125 | { | |
1126 | YarrOp& op = m_ops[opIndex]; | |
1127 | PatternTerm* term = op.m_term; | |
1128 | ||
1129 | switch (term->type) { | |
1130 | case PatternTerm::TypePatternCharacter: | |
1131 | switch (term->quantityType) { | |
1132 | case QuantifierFixedCount: | |
1133 | if (term->quantityCount == 1) | |
1134 | backtrackPatternCharacterOnce(opIndex); | |
1135 | else | |
1136 | backtrackPatternCharacterFixed(opIndex); | |
1137 | break; | |
1138 | case QuantifierGreedy: | |
1139 | backtrackPatternCharacterGreedy(opIndex); | |
1140 | break; | |
1141 | case QuantifierNonGreedy: | |
1142 | backtrackPatternCharacterNonGreedy(opIndex); | |
1143 | break; | |
1144 | } | |
1145 | break; | |
1146 | ||
1147 | case PatternTerm::TypeCharacterClass: | |
1148 | switch (term->quantityType) { | |
1149 | case QuantifierFixedCount: | |
1150 | if (term->quantityCount == 1) | |
1151 | backtrackCharacterClassOnce(opIndex); | |
1152 | else | |
1153 | backtrackCharacterClassFixed(opIndex); | |
1154 | break; | |
1155 | case QuantifierGreedy: | |
1156 | backtrackCharacterClassGreedy(opIndex); | |
1157 | break; | |
1158 | case QuantifierNonGreedy: | |
1159 | backtrackCharacterClassNonGreedy(opIndex); | |
1160 | break; | |
1161 | } | |
1162 | break; | |
1163 | ||
1164 | case PatternTerm::TypeAssertionBOL: | |
1165 | backtrackAssertionBOL(opIndex); | |
1166 | break; | |
1167 | ||
1168 | case PatternTerm::TypeAssertionEOL: | |
1169 | backtrackAssertionEOL(opIndex); | |
1170 | break; | |
1171 | ||
1172 | case PatternTerm::TypeAssertionWordBoundary: | |
1173 | backtrackAssertionWordBoundary(opIndex); | |
1174 | break; | |
1175 | ||
1176 | case PatternTerm::TypeForwardReference: | |
1177 | break; | |
1178 | ||
1179 | case PatternTerm::TypeParenthesesSubpattern: | |
1180 | case PatternTerm::TypeParentheticalAssertion: | |
1181 | ASSERT_NOT_REACHED(); | |
1182 | ||
1183 | case PatternTerm::TypeDotStarEnclosure: | |
1184 | backtrackDotStarEnclosure(opIndex); | |
1185 | break; | |
1186 | ||
1187 | case PatternTerm::TypeBackReference: | |
1188 | m_shouldFallBack = true; | |
1189 | break; | |
1190 | } | |
1191 | } | |
1192 | ||
1193 | void generate() | |
1194 | { | |
1195 | // Forwards generate the matching code. | |
1196 | ASSERT(m_ops.size()); | |
1197 | size_t opIndex = 0; | |
1198 | ||
1199 | do { | |
1200 | YarrOp& op = m_ops[opIndex]; | |
1201 | switch (op.m_op) { | |
1202 | ||
1203 | case OpTerm: | |
1204 | generateTerm(opIndex); | |
1205 | break; | |
1206 | ||
1207 | // OpBodyAlternativeBegin/Next/End | |
1208 | // | |
1209 | // These nodes wrap the set of alternatives in the body of the regular expression. | |
1210 | // There may be either one or two chains of OpBodyAlternative nodes, one representing | |
1211 | // the 'once through' sequence of alternatives (if any exist), and one representing | |
1212 | // the repeating alternatives (again, if any exist). | |
1213 | // | |
1214 | // Upon normal entry to the Begin alternative, we will check that input is available. | |
1215 | // Reentry to the Begin alternative will take place after the check has taken place, | |
1216 | // and will assume that the input position has already been progressed as appropriate. | |
1217 | // | |
1218 | // Entry to subsequent Next/End alternatives occurs when the prior alternative has | |
1219 | // successfully completed a match - return a success state from JIT code. | |
1220 | // | |
1221 | // Next alternatives allow for reentry optimized to suit backtracking from its | |
1222 | // preceding alternative. It expects the input position to still be set to a position | |
1223 | // appropriate to its predecessor, and it will only perform an input check if the | |
1224 | // predecessor had a minimum size less than its own. | |
1225 | // | |
1226 | // In the case 'once through' expressions, the End node will also have a reentry | |
1227 | // point to jump to when the last alternative fails. Again, this expects the input | |
1228 | // position to still reflect that expected by the prior alternative. | |
1229 | case OpBodyAlternativeBegin: { | |
1230 | PatternAlternative* alternative = op.m_alternative; | |
1231 | ||
1232 | // Upon entry at the head of the set of alternatives, check if input is available | |
1233 | // to run the first alternative. (This progresses the input position). | |
1234 | op.m_jumps.append(jumpIfNoAvailableInput(alternative->m_minimumSize)); | |
1235 | // We will reenter after the check, and assume the input position to have been | |
1236 | // set as appropriate to this alternative. | |
1237 | op.m_reentry = label(); | |
1238 | ||
1239 | m_checked += alternative->m_minimumSize; | |
1240 | break; | |
1241 | } | |
1242 | case OpBodyAlternativeNext: | |
1243 | case OpBodyAlternativeEnd: { | |
1244 | PatternAlternative* priorAlternative = m_ops[op.m_previousOp].m_alternative; | |
1245 | PatternAlternative* alternative = op.m_alternative; | |
1246 | ||
1247 | // If we get here, the prior alternative matched - return success. | |
1248 | ||
1249 | // Adjust the stack pointer to remove the pattern's frame. | |
1250 | if (m_pattern.m_body->m_callFrameSize) | |
1251 | addPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister); | |
1252 | ||
1253 | // Load appropriate values into the return register and the first output | |
1254 | // slot, and return. In the case of pattern with a fixed size, we will | |
1255 | // not have yet set the value in the first | |
1256 | ASSERT(index != returnRegister); | |
1257 | if (m_pattern.m_body->m_hasFixedSize) { | |
1258 | move(index, returnRegister); | |
1259 | if (priorAlternative->m_minimumSize) | |
1260 | sub32(Imm32(priorAlternative->m_minimumSize), returnRegister); | |
1261 | store32(returnRegister, output); | |
1262 | } else | |
1263 | load32(Address(output), returnRegister); | |
1264 | store32(index, Address(output, 4)); | |
1265 | generateReturn(); | |
1266 | ||
1267 | // This is the divide between the tail of the prior alternative, above, and | |
1268 | // the head of the subsequent alternative, below. | |
1269 | ||
1270 | if (op.m_op == OpBodyAlternativeNext) { | |
1271 | // This is the reentry point for the Next alternative. We expect any code | |
1272 | // that jumps here to do so with the input position matching that of the | |
1273 | // PRIOR alteranative, and we will only check input availability if we | |
1274 | // need to progress it forwards. | |
1275 | op.m_reentry = label(); | |
1276 | if (alternative->m_minimumSize > priorAlternative->m_minimumSize) { | |
1277 | add32(Imm32(alternative->m_minimumSize - priorAlternative->m_minimumSize), index); | |
1278 | op.m_jumps.append(jumpIfNoAvailableInput()); | |
1279 | } else if (priorAlternative->m_minimumSize > alternative->m_minimumSize) | |
1280 | sub32(Imm32(priorAlternative->m_minimumSize - alternative->m_minimumSize), index); | |
1281 | } else if (op.m_nextOp == notFound) { | |
1282 | // This is the reentry point for the End of 'once through' alternatives, | |
1283 | // jumped to when the las alternative fails to match. | |
1284 | op.m_reentry = label(); | |
1285 | sub32(Imm32(priorAlternative->m_minimumSize), index); | |
1286 | } | |
1287 | ||
1288 | if (op.m_op == OpBodyAlternativeNext) | |
1289 | m_checked += alternative->m_minimumSize; | |
1290 | m_checked -= priorAlternative->m_minimumSize; | |
1291 | break; | |
1292 | } | |
1293 | ||
1294 | // OpSimpleNestedAlternativeBegin/Next/End | |
1295 | // OpNestedAlternativeBegin/Next/End | |
1296 | // | |
1297 | // These nodes are used to handle sets of alternatives that are nested within | |
1298 | // subpatterns and parenthetical assertions. The 'simple' forms are used where | |
1299 | // we do not need to be able to backtrack back into any alternative other than | |
1300 | // the last, the normal forms allow backtracking into any alternative. | |
1301 | // | |
1302 | // Each Begin/Next node is responsible for planting an input check to ensure | |
1303 | // sufficient input is available on entry. Next nodes additionally need to | |
1304 | // jump to the end - Next nodes use the End node's m_jumps list to hold this | |
1305 | // set of jumps. | |
1306 | // | |
1307 | // In the non-simple forms, successful alternative matches must store a | |
1308 | // 'return address' using a DataLabelPtr, used to store the address to jump | |
1309 | // to when backtracking, to get to the code for the appropriate alternative. | |
1310 | case OpSimpleNestedAlternativeBegin: | |
1311 | case OpNestedAlternativeBegin: { | |
1312 | PatternTerm* term = op.m_term; | |
1313 | PatternAlternative* alternative = op.m_alternative; | |
1314 | PatternDisjunction* disjunction = term->parentheses.disjunction; | |
1315 | ||
1316 | // Calculate how much input we need to check for, and if non-zero check. | |
1317 | op.m_checkAdjust = alternative->m_minimumSize; | |
1318 | if ((term->quantityType == QuantifierFixedCount) && (term->type != PatternTerm::TypeParentheticalAssertion)) | |
1319 | op.m_checkAdjust -= disjunction->m_minimumSize; | |
1320 | if (op.m_checkAdjust) | |
1321 | op.m_jumps.append(jumpIfNoAvailableInput(op.m_checkAdjust)); | |
1322 | ||
1323 | m_checked += op.m_checkAdjust; | |
1324 | break; | |
1325 | } | |
1326 | case OpSimpleNestedAlternativeNext: | |
1327 | case OpNestedAlternativeNext: { | |
1328 | PatternTerm* term = op.m_term; | |
1329 | PatternAlternative* alternative = op.m_alternative; | |
1330 | PatternDisjunction* disjunction = term->parentheses.disjunction; | |
1331 | ||
1332 | // In the non-simple case, store a 'return address' so we can backtrack correctly. | |
1333 | if (op.m_op == OpNestedAlternativeNext) { | |
1334 | unsigned parenthesesFrameLocation = term->frameLocation; | |
1335 | unsigned alternativeFrameLocation = parenthesesFrameLocation; | |
1336 | if (term->quantityType != QuantifierFixedCount) | |
1337 | alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce; | |
1338 | op.m_returnAddress = storeToFrameWithPatch(alternativeFrameLocation); | |
1339 | } | |
1340 | ||
1341 | // If we reach here then the last alternative has matched - jump to the | |
1342 | // End node, to skip over any further alternatives. | |
1343 | // | |
1344 | // FIXME: this is logically O(N^2) (though N can be expected to be very | |
1345 | // small). We could avoid this either by adding an extra jump to the JIT | |
1346 | // data structures, or by making backtracking code that jumps to Next | |
1347 | // alternatives are responsible for checking that input is available (if | |
1348 | // we didn't need to plant the input checks, then m_jumps would be free). | |
1349 | YarrOp* endOp = &m_ops[op.m_nextOp]; | |
1350 | while (endOp->m_nextOp != notFound) { | |
1351 | ASSERT(endOp->m_op == OpSimpleNestedAlternativeNext || endOp->m_op == OpNestedAlternativeNext); | |
1352 | endOp = &m_ops[endOp->m_nextOp]; | |
1353 | } | |
1354 | ASSERT(endOp->m_op == OpSimpleNestedAlternativeEnd || endOp->m_op == OpNestedAlternativeEnd); | |
1355 | endOp->m_jumps.append(jump()); | |
1356 | ||
1357 | // This is the entry point for the next alternative. | |
1358 | op.m_reentry = label(); | |
1359 | ||
1360 | // Calculate how much input we need to check for, and if non-zero check. | |
1361 | op.m_checkAdjust = alternative->m_minimumSize; | |
1362 | if ((term->quantityType == QuantifierFixedCount) && (term->type != PatternTerm::TypeParentheticalAssertion)) | |
1363 | op.m_checkAdjust -= disjunction->m_minimumSize; | |
1364 | if (op.m_checkAdjust) | |
1365 | op.m_jumps.append(jumpIfNoAvailableInput(op.m_checkAdjust)); | |
1366 | ||
1367 | YarrOp& lastOp = m_ops[op.m_previousOp]; | |
1368 | m_checked -= lastOp.m_checkAdjust; | |
1369 | m_checked += op.m_checkAdjust; | |
1370 | break; | |
1371 | } | |
1372 | case OpSimpleNestedAlternativeEnd: | |
1373 | case OpNestedAlternativeEnd: { | |
1374 | PatternTerm* term = op.m_term; | |
1375 | ||
1376 | // In the non-simple case, store a 'return address' so we can backtrack correctly. | |
1377 | if (op.m_op == OpNestedAlternativeEnd) { | |
1378 | unsigned parenthesesFrameLocation = term->frameLocation; | |
1379 | unsigned alternativeFrameLocation = parenthesesFrameLocation; | |
1380 | if (term->quantityType != QuantifierFixedCount) | |
1381 | alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce; | |
1382 | op.m_returnAddress = storeToFrameWithPatch(alternativeFrameLocation); | |
1383 | } | |
1384 | ||
1385 | // If this set of alternatives contains more than one alternative, | |
1386 | // then the Next nodes will have planted jumps to the End, and added | |
1387 | // them to this node's m_jumps list. | |
1388 | op.m_jumps.link(this); | |
1389 | op.m_jumps.clear(); | |
1390 | ||
1391 | YarrOp& lastOp = m_ops[op.m_previousOp]; | |
1392 | m_checked -= lastOp.m_checkAdjust; | |
1393 | break; | |
1394 | } | |
1395 | ||
1396 | // OpParenthesesSubpatternOnceBegin/End | |
1397 | // | |
1398 | // These nodes support (optionally) capturing subpatterns, that have a | |
1399 | // quantity count of 1 (this covers fixed once, and ?/?? quantifiers). | |
1400 | case OpParenthesesSubpatternOnceBegin: { | |
1401 | PatternTerm* term = op.m_term; | |
1402 | unsigned parenthesesFrameLocation = term->frameLocation; | |
1403 | const RegisterID indexTemporary = regT0; | |
1404 | ASSERT(term->quantityCount == 1); | |
1405 | ||
1406 | // Upon entry to a Greedy quantified set of parenthese store the index. | |
1407 | // We'll use this for two purposes: | |
1408 | // - To indicate which iteration we are on of mathing the remainder of | |
1409 | // the expression after the parentheses - the first, including the | |
1410 | // match within the parentheses, or the second having skipped over them. | |
1411 | // - To check for empty matches, which must be rejected. | |
1412 | // | |
1413 | // At the head of a NonGreedy set of parentheses we'll immediately set the | |
1414 | // value on the stack to -1 (indicating a match skipping the subpattern), | |
1415 | // and plant a jump to the end. We'll also plant a label to backtrack to | |
1416 | // to reenter the subpattern later, with a store to set up index on the | |
1417 | // second iteration. | |
1418 | // | |
1419 | // FIXME: for capturing parens, could use the index in the capture array? | |
1420 | if (term->quantityType == QuantifierGreedy) | |
1421 | storeToFrame(index, parenthesesFrameLocation); | |
1422 | else if (term->quantityType == QuantifierNonGreedy) { | |
1423 | storeToFrame(TrustedImm32(-1), parenthesesFrameLocation); | |
1424 | op.m_jumps.append(jump()); | |
1425 | op.m_reentry = label(); | |
1426 | storeToFrame(index, parenthesesFrameLocation); | |
1427 | } | |
1428 | ||
1429 | // If the parenthese are capturing, store the starting index value to the | |
1430 | // captures array, offsetting as necessary. | |
1431 | // | |
1432 | // FIXME: could avoid offsetting this value in JIT code, apply | |
1433 | // offsets only afterwards, at the point the results array is | |
1434 | // being accessed. | |
1435 | if (term->capture()) { | |
1436 | int offsetId = term->parentheses.subpatternId << 1; | |
1437 | int inputOffset = term->inputPosition - m_checked; | |
1438 | if (term->quantityType == QuantifierFixedCount) | |
1439 | inputOffset -= term->parentheses.disjunction->m_minimumSize; | |
1440 | if (inputOffset) { | |
1441 | move(index, indexTemporary); | |
1442 | add32(Imm32(inputOffset), indexTemporary); | |
1443 | store32(indexTemporary, Address(output, offsetId * sizeof(int))); | |
1444 | } else | |
1445 | store32(index, Address(output, offsetId * sizeof(int))); | |
1446 | } | |
1447 | break; | |
1448 | } | |
1449 | case OpParenthesesSubpatternOnceEnd: { | |
1450 | PatternTerm* term = op.m_term; | |
1451 | unsigned parenthesesFrameLocation = term->frameLocation; | |
1452 | const RegisterID indexTemporary = regT0; | |
1453 | ASSERT(term->quantityCount == 1); | |
1454 | ||
1455 | // For Greedy/NonGreedy quantified parentheses, we must reject zero length | |
1456 | // matches. If the minimum size is know to be non-zero we need not check. | |
1457 | if (term->quantityType != QuantifierFixedCount && !term->parentheses.disjunction->m_minimumSize) | |
1458 | op.m_jumps.append(branch32(Equal, index, Address(stackPointerRegister, parenthesesFrameLocation * sizeof(void*)))); | |
1459 | ||
1460 | // If the parenthese are capturing, store the ending index value to the | |
1461 | // captures array, offsetting as necessary. | |
1462 | // | |
1463 | // FIXME: could avoid offsetting this value in JIT code, apply | |
1464 | // offsets only afterwards, at the point the results array is | |
1465 | // being accessed. | |
1466 | if (term->capture()) { | |
1467 | int offsetId = (term->parentheses.subpatternId << 1) + 1; | |
1468 | int inputOffset = term->inputPosition - m_checked; | |
1469 | if (inputOffset) { | |
1470 | move(index, indexTemporary); | |
1471 | add32(Imm32(inputOffset), indexTemporary); | |
1472 | store32(indexTemporary, Address(output, offsetId * sizeof(int))); | |
1473 | } else | |
1474 | store32(index, Address(output, offsetId * sizeof(int))); | |
1475 | } | |
1476 | ||
1477 | // If the parentheses are quantified Greedy then add a label to jump back | |
1478 | // to if get a failed match from after the parentheses. For NonGreedy | |
1479 | // parentheses, link the jump from before the subpattern to here. | |
1480 | if (term->quantityType == QuantifierGreedy) | |
1481 | op.m_reentry = label(); | |
1482 | else if (term->quantityType == QuantifierNonGreedy) { | |
1483 | YarrOp& beginOp = m_ops[op.m_previousOp]; | |
1484 | beginOp.m_jumps.link(this); | |
1485 | } | |
1486 | break; | |
1487 | } | |
1488 | ||
1489 | // OpParenthesesSubpatternTerminalBegin/End | |
1490 | case OpParenthesesSubpatternTerminalBegin: { | |
1491 | PatternTerm* term = op.m_term; | |
1492 | ASSERT(term->quantityType == QuantifierGreedy); | |
1493 | ASSERT(term->quantityCount == quantifyInfinite); | |
1494 | ASSERT(!term->capture()); | |
1495 | ||
1496 | // Upon entry set a label to loop back to. | |
1497 | op.m_reentry = label(); | |
1498 | ||
1499 | // Store the start index of the current match; we need to reject zero | |
1500 | // length matches. | |
1501 | storeToFrame(index, term->frameLocation); | |
1502 | break; | |
1503 | } | |
1504 | case OpParenthesesSubpatternTerminalEnd: { | |
1505 | PatternTerm* term = op.m_term; | |
1506 | ||
1507 | // Check for zero length matches - if the match is non-zero, then we | |
1508 | // can accept it & loop back up to the head of the subpattern. | |
1509 | YarrOp& beginOp = m_ops[op.m_previousOp]; | |
1510 | branch32(NotEqual, index, Address(stackPointerRegister, term->frameLocation * sizeof(void*)), beginOp.m_reentry); | |
1511 | ||
1512 | // Reject the match - backtrack back into the subpattern. | |
1513 | op.m_jumps.append(jump()); | |
1514 | ||
1515 | // This is the entry point to jump to when we stop matching - we will | |
1516 | // do so once the subpattern cannot match any more. | |
1517 | op.m_reentry = label(); | |
1518 | break; | |
1519 | } | |
1520 | ||
1521 | // OpParentheticalAssertionBegin/End | |
1522 | case OpParentheticalAssertionBegin: { | |
1523 | PatternTerm* term = op.m_term; | |
1524 | ||
1525 | // Store the current index - assertions should not update index, so | |
1526 | // we will need to restore it upon a successful match. | |
1527 | unsigned parenthesesFrameLocation = term->frameLocation; | |
1528 | storeToFrame(index, parenthesesFrameLocation); | |
1529 | ||
1530 | // Check | |
1531 | op.m_checkAdjust = m_checked - term->inputPosition; | |
1532 | if (op.m_checkAdjust) | |
1533 | sub32(Imm32(op.m_checkAdjust), index); | |
1534 | ||
1535 | m_checked -= op.m_checkAdjust; | |
1536 | break; | |
1537 | } | |
1538 | case OpParentheticalAssertionEnd: { | |
1539 | PatternTerm* term = op.m_term; | |
1540 | ||
1541 | // Restore the input index value. | |
1542 | unsigned parenthesesFrameLocation = term->frameLocation; | |
1543 | loadFromFrame(parenthesesFrameLocation, index); | |
1544 | ||
1545 | // If inverted, a successful match of the assertion must be treated | |
1546 | // as a failure, so jump to backtracking. | |
1547 | if (term->invert()) { | |
1548 | op.m_jumps.append(jump()); | |
1549 | op.m_reentry = label(); | |
1550 | } | |
1551 | ||
1552 | YarrOp& lastOp = m_ops[op.m_previousOp]; | |
1553 | m_checked += lastOp.m_checkAdjust; | |
1554 | break; | |
1555 | } | |
1556 | ||
1557 | case OpMatchFailed: | |
1558 | if (m_pattern.m_body->m_callFrameSize) | |
1559 | addPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister); | |
1560 | move(TrustedImm32(-1), returnRegister); | |
1561 | generateReturn(); | |
1562 | break; | |
1563 | } | |
1564 | ||
1565 | ++opIndex; | |
1566 | } while (opIndex < m_ops.size()); | |
1567 | } | |
1568 | ||
1569 | void backtrack() | |
1570 | { | |
1571 | // Backwards generate the backtracking code. | |
1572 | size_t opIndex = m_ops.size(); | |
1573 | ASSERT(opIndex); | |
1574 | ||
1575 | do { | |
1576 | --opIndex; | |
1577 | YarrOp& op = m_ops[opIndex]; | |
1578 | switch (op.m_op) { | |
1579 | ||
1580 | case OpTerm: | |
1581 | backtrackTerm(opIndex); | |
1582 | break; | |
1583 | ||
1584 | // OpBodyAlternativeBegin/Next/End | |
1585 | // | |
1586 | // For each Begin/Next node representing an alternative, we need to decide what to do | |
1587 | // in two circumstances: | |
1588 | // - If we backtrack back into this node, from within the alternative. | |
1589 | // - If the input check at the head of the alternative fails (if this exists). | |
1590 | // | |
1591 | // We treat these two cases differently since in the former case we have slightly | |
1592 | // more information - since we are backtracking out of a prior alternative we know | |
1593 | // that at least enough input was available to run it. For example, given the regular | |
1594 | // expression /a|b/, if we backtrack out of the first alternative (a failed pattern | |
1595 | // character match of 'a'), then we need not perform an additional input availability | |
1596 | // check before running the second alternative. | |
1597 | // | |
1598 | // Backtracking required differs for the last alternative, which in the case of the | |
1599 | // repeating set of alternatives must loop. The code generated for the last alternative | |
1600 | // will also be used to handle all input check failures from any prior alternatives - | |
1601 | // these require similar functionality, in seeking the next available alternative for | |
1602 | // which there is sufficient input. | |
1603 | // | |
1604 | // Since backtracking of all other alternatives simply requires us to link backtracks | |
1605 | // to the reentry point for the subsequent alternative, we will only be generating any | |
1606 | // code when backtracking the last alternative. | |
1607 | case OpBodyAlternativeBegin: | |
1608 | case OpBodyAlternativeNext: { | |
1609 | PatternAlternative* alternative = op.m_alternative; | |
1610 | ||
1611 | if (op.m_op == OpBodyAlternativeNext) { | |
1612 | PatternAlternative* priorAlternative = m_ops[op.m_previousOp].m_alternative; | |
1613 | m_checked += priorAlternative->m_minimumSize; | |
1614 | } | |
1615 | m_checked -= alternative->m_minimumSize; | |
1616 | ||
1617 | // Is this the last alternative? If not, then if we backtrack to this point we just | |
1618 | // need to jump to try to match the next alternative. | |
1619 | if (m_ops[op.m_nextOp].m_op != OpBodyAlternativeEnd) { | |
1620 | m_backtrackingState.linkTo(m_ops[op.m_nextOp].m_reentry, this); | |
1621 | break; | |
1622 | } | |
1623 | YarrOp& endOp = m_ops[op.m_nextOp]; | |
1624 | ||
1625 | YarrOp* beginOp = &op; | |
1626 | while (beginOp->m_op != OpBodyAlternativeBegin) { | |
1627 | ASSERT(beginOp->m_op == OpBodyAlternativeNext); | |
1628 | beginOp = &m_ops[beginOp->m_previousOp]; | |
1629 | } | |
1630 | ||
1631 | bool onceThrough = endOp.m_nextOp == notFound; | |
1632 | ||
1633 | // First, generate code to handle cases where we backtrack out of an attempted match | |
1634 | // of the last alternative. If this is a 'once through' set of alternatives then we | |
1635 | // have nothing to do - link this straight through to the End. | |
1636 | if (onceThrough) | |
1637 | m_backtrackingState.linkTo(endOp.m_reentry, this); | |
1638 | else { | |
1639 | // If we don't need to move the input poistion, and the pattern has a fixed size | |
1640 | // (in which case we omit the store of the start index until the pattern has matched) | |
1641 | // then we can just link the backtrack out of the last alternative straight to the | |
1642 | // head of the first alternative. | |
1643 | if (m_pattern.m_body->m_hasFixedSize | |
1644 | && (alternative->m_minimumSize > beginOp->m_alternative->m_minimumSize) | |
1645 | && (alternative->m_minimumSize - beginOp->m_alternative->m_minimumSize == 1)) | |
1646 | m_backtrackingState.linkTo(beginOp->m_reentry, this); | |
1647 | else { | |
1648 | // We need to generate a trampoline of code to execute before looping back | |
1649 | // around to the first alternative. | |
1650 | m_backtrackingState.link(this); | |
1651 | ||
1652 | // If the pattern size is not fixed, then store the start index, for use if we match. | |
1653 | if (!m_pattern.m_body->m_hasFixedSize) { | |
1654 | if (alternative->m_minimumSize == 1) | |
1655 | store32(index, Address(output)); | |
1656 | else { | |
1657 | move(index, regT0); | |
1658 | if (alternative->m_minimumSize) | |
1659 | sub32(Imm32(alternative->m_minimumSize - 1), regT0); | |
1660 | else | |
1661 | add32(Imm32(1), regT0); | |
1662 | store32(regT0, Address(output)); | |
1663 | } | |
1664 | } | |
1665 | ||
1666 | // Generate code to loop. Check whether the last alternative is longer than the | |
1667 | // first (e.g. /a|xy/ or /a|xyz/). | |
1668 | if (alternative->m_minimumSize > beginOp->m_alternative->m_minimumSize) { | |
1669 | // We want to loop, and increment input position. If the delta is 1, it is | |
1670 | // already correctly incremented, if more than one then decrement as appropriate. | |
1671 | unsigned delta = alternative->m_minimumSize - beginOp->m_alternative->m_minimumSize; | |
1672 | ASSERT(delta); | |
1673 | if (delta != 1) | |
1674 | sub32(Imm32(delta - 1), index); | |
1675 | jump(beginOp->m_reentry); | |
1676 | } else { | |
1677 | // If the first alternative has minimum size 0xFFFFFFFFu, then there cannot | |
1678 | // be sufficent input available to handle this, so just fall through. | |
1679 | unsigned delta = beginOp->m_alternative->m_minimumSize - alternative->m_minimumSize; | |
1680 | if (delta != 0xFFFFFFFFu) { | |
1681 | // We need to check input because we are incrementing the input. | |
1682 | add32(Imm32(delta + 1), index); | |
1683 | checkInput().linkTo(beginOp->m_reentry, this); | |
1684 | } | |
1685 | } | |
1686 | } | |
1687 | } | |
1688 | ||
1689 | // We can reach this point in the code in two ways: | |
1690 | // - Fallthrough from the code above (a repeating alternative backtracked out of its | |
1691 | // last alternative, and did not have sufficent input to run the first). | |
1692 | // - We will loop back up to the following label when a releating alternative loops, | |
1693 | // following a failed input check. | |
1694 | // | |
1695 | // Either way, we have just failed the input check for the first alternative. | |
1696 | Label firstInputCheckFailed(this); | |
1697 | ||
1698 | // Generate code to handle input check failures from alternatives except the last. | |
1699 | // prevOp is the alternative we're handling a bail out from (initially Begin), and | |
1700 | // nextOp is the alternative we will be attempting to reenter into. | |
1701 | // | |
1702 | // We will link input check failures from the forwards matching path back to the code | |
1703 | // that can handle them. | |
1704 | YarrOp* prevOp = beginOp; | |
1705 | YarrOp* nextOp = &m_ops[beginOp->m_nextOp]; | |
1706 | while (nextOp->m_op != OpBodyAlternativeEnd) { | |
1707 | prevOp->m_jumps.link(this); | |
1708 | ||
1709 | // We only get here if an input check fails, it is only worth checking again | |
1710 | // if the next alternative has a minimum size less than the last. | |
1711 | if (prevOp->m_alternative->m_minimumSize > nextOp->m_alternative->m_minimumSize) { | |
1712 | // FIXME: if we added an extra label to YarrOp, we could avoid needing to | |
1713 | // subtract delta back out, and reduce this code. Should performance test | |
1714 | // the benefit of this. | |
1715 | unsigned delta = prevOp->m_alternative->m_minimumSize - nextOp->m_alternative->m_minimumSize; | |
1716 | sub32(Imm32(delta), index); | |
1717 | Jump fail = jumpIfNoAvailableInput(); | |
1718 | add32(Imm32(delta), index); | |
1719 | jump(nextOp->m_reentry); | |
1720 | fail.link(this); | |
1721 | } else if (prevOp->m_alternative->m_minimumSize < nextOp->m_alternative->m_minimumSize) | |
1722 | add32(Imm32(nextOp->m_alternative->m_minimumSize - prevOp->m_alternative->m_minimumSize), index); | |
1723 | prevOp = nextOp; | |
1724 | nextOp = &m_ops[nextOp->m_nextOp]; | |
1725 | } | |
1726 | ||
1727 | // We fall through to here if there is insufficient input to run the last alternative. | |
1728 | ||
1729 | // If there is insufficient input to run the last alternative, then for 'once through' | |
1730 | // alternatives we are done - just jump back up into the forwards matching path at the End. | |
1731 | if (onceThrough) { | |
1732 | op.m_jumps.linkTo(endOp.m_reentry, this); | |
1733 | jump(endOp.m_reentry); | |
1734 | break; | |
1735 | } | |
1736 | ||
1737 | // For repeating alternatives, link any input check failure from the last alternative to | |
1738 | // this point. | |
1739 | op.m_jumps.link(this); | |
1740 | ||
1741 | bool needsToUpdateMatchStart = !m_pattern.m_body->m_hasFixedSize; | |
1742 | ||
1743 | // Check for cases where input position is already incremented by 1 for the last | |
1744 | // alternative (this is particularly useful where the minimum size of the body | |
1745 | // disjunction is 0, e.g. /a*|b/). | |
1746 | if (needsToUpdateMatchStart && alternative->m_minimumSize == 1) { | |
1747 | // index is already incremented by 1, so just store it now! | |
1748 | store32(index, Address(output)); | |
1749 | needsToUpdateMatchStart = false; | |
1750 | } | |
1751 | ||
1752 | // Check whether there is sufficient input to loop. Increment the input position by | |
1753 | // one, and check. Also add in the minimum disjunction size before checking - there | |
1754 | // is no point in looping if we're just going to fail all the input checks around | |
1755 | // the next iteration. | |
1756 | ASSERT(alternative->m_minimumSize >= m_pattern.m_body->m_minimumSize); | |
1757 | if (alternative->m_minimumSize == m_pattern.m_body->m_minimumSize) { | |
1758 | // If the last alternative had the same minimum size as the disjunction, | |
1759 | // just simply increment input pos by 1, no adjustment based on minimum size. | |
1760 | add32(Imm32(1), index); | |
1761 | } else { | |
1762 | // If the minumum for the last alternative was one greater than than that | |
1763 | // for the disjunction, we're already progressed by 1, nothing to do! | |
1764 | unsigned delta = (alternative->m_minimumSize - m_pattern.m_body->m_minimumSize) - 1; | |
1765 | if (delta) | |
1766 | sub32(Imm32(delta), index); | |
1767 | } | |
1768 | Jump matchFailed = jumpIfNoAvailableInput(); | |
1769 | ||
1770 | if (needsToUpdateMatchStart) { | |
1771 | if (!m_pattern.m_body->m_minimumSize) | |
1772 | store32(index, Address(output)); | |
1773 | else { | |
1774 | move(index, regT0); | |
1775 | sub32(Imm32(m_pattern.m_body->m_minimumSize), regT0); | |
1776 | store32(regT0, Address(output)); | |
1777 | } | |
1778 | } | |
1779 | ||
1780 | // Calculate how much more input the first alternative requires than the minimum | |
1781 | // for the body as a whole. If no more is needed then we dont need an additional | |
1782 | // input check here - jump straight back up to the start of the first alternative. | |
1783 | if (beginOp->m_alternative->m_minimumSize == m_pattern.m_body->m_minimumSize) | |
1784 | jump(beginOp->m_reentry); | |
1785 | else { | |
1786 | if (beginOp->m_alternative->m_minimumSize > m_pattern.m_body->m_minimumSize) | |
1787 | add32(Imm32(beginOp->m_alternative->m_minimumSize - m_pattern.m_body->m_minimumSize), index); | |
1788 | else | |
1789 | sub32(Imm32(m_pattern.m_body->m_minimumSize - beginOp->m_alternative->m_minimumSize), index); | |
1790 | checkInput().linkTo(beginOp->m_reentry, this); | |
1791 | jump(firstInputCheckFailed); | |
1792 | } | |
1793 | ||
1794 | // We jump to here if we iterate to the point that there is insufficient input to | |
1795 | // run any matches, and need to return a failure state from JIT code. | |
1796 | matchFailed.link(this); | |
1797 | ||
1798 | if (m_pattern.m_body->m_callFrameSize) | |
1799 | addPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister); | |
1800 | move(TrustedImm32(-1), returnRegister); | |
1801 | generateReturn(); | |
1802 | break; | |
1803 | } | |
1804 | case OpBodyAlternativeEnd: { | |
1805 | // We should never backtrack back into a body disjunction. | |
1806 | ASSERT(m_backtrackingState.isEmpty()); | |
1807 | ||
1808 | PatternAlternative* priorAlternative = m_ops[op.m_previousOp].m_alternative; | |
1809 | m_checked += priorAlternative->m_minimumSize; | |
1810 | break; | |
1811 | } | |
1812 | ||
1813 | // OpSimpleNestedAlternativeBegin/Next/End | |
1814 | // OpNestedAlternativeBegin/Next/End | |
1815 | // | |
1816 | // Generate code for when we backtrack back out of an alternative into | |
1817 | // a Begin or Next node, or when the entry input count check fails. If | |
1818 | // there are more alternatives we need to jump to the next alternative, | |
1819 | // if not we backtrack back out of the current set of parentheses. | |
1820 | // | |
1821 | // In the case of non-simple nested assertions we need to also link the | |
1822 | // 'return address' appropriately to backtrack back out into the correct | |
1823 | // alternative. | |
1824 | case OpSimpleNestedAlternativeBegin: | |
1825 | case OpSimpleNestedAlternativeNext: | |
1826 | case OpNestedAlternativeBegin: | |
1827 | case OpNestedAlternativeNext: { | |
1828 | YarrOp& nextOp = m_ops[op.m_nextOp]; | |
1829 | bool isBegin = op.m_previousOp == notFound; | |
1830 | bool isLastAlternative = nextOp.m_nextOp == notFound; | |
1831 | ASSERT(isBegin == (op.m_op == OpSimpleNestedAlternativeBegin || op.m_op == OpNestedAlternativeBegin)); | |
1832 | ASSERT(isLastAlternative == (nextOp.m_op == OpSimpleNestedAlternativeEnd || nextOp.m_op == OpNestedAlternativeEnd)); | |
1833 | ||
1834 | // Treat an input check failure the same as a failed match. | |
1835 | m_backtrackingState.append(op.m_jumps); | |
1836 | ||
1837 | // Set the backtracks to jump to the appropriate place. We may need | |
1838 | // to link the backtracks in one of three different way depending on | |
1839 | // the type of alternative we are dealing with: | |
1840 | // - A single alternative, with no simplings. | |
1841 | // - The last alternative of a set of two or more. | |
1842 | // - An alternative other than the last of a set of two or more. | |
1843 | // | |
1844 | // In the case of a single alternative on its own, we don't need to | |
1845 | // jump anywhere - if the alternative fails to match we can just | |
1846 | // continue to backtrack out of the parentheses without jumping. | |
1847 | // | |
1848 | // In the case of the last alternative in a set of more than one, we | |
1849 | // need to jump to return back out to the beginning. We'll do so by | |
1850 | // adding a jump to the End node's m_jumps list, and linking this | |
1851 | // when we come to generate the Begin node. For alternatives other | |
1852 | // than the last, we need to jump to the next alternative. | |
1853 | // | |
1854 | // If the alternative had adjusted the input position we must link | |
1855 | // backtracking to here, correct, and then jump on. If not we can | |
1856 | // link the backtracks directly to their destination. | |
1857 | if (op.m_checkAdjust) { | |
1858 | // Handle the cases where we need to link the backtracks here. | |
1859 | m_backtrackingState.link(this); | |
1860 | sub32(Imm32(op.m_checkAdjust), index); | |
1861 | if (!isLastAlternative) { | |
1862 | // An alternative that is not the last should jump to its successor. | |
1863 | jump(nextOp.m_reentry); | |
1864 | } else if (!isBegin) { | |
1865 | // The last of more than one alternatives must jump back to the begnning. | |
1866 | nextOp.m_jumps.append(jump()); | |
1867 | } else { | |
1868 | // A single alternative on its own can fall through. | |
1869 | m_backtrackingState.fallthrough(); | |
1870 | } | |
1871 | } else { | |
1872 | // Handle the cases where we can link the backtracks directly to their destinations. | |
1873 | if (!isLastAlternative) { | |
1874 | // An alternative that is not the last should jump to its successor. | |
1875 | m_backtrackingState.linkTo(nextOp.m_reentry, this); | |
1876 | } else if (!isBegin) { | |
1877 | // The last of more than one alternatives must jump back to the begnning. | |
1878 | m_backtrackingState.takeBacktracksToJumpList(nextOp.m_jumps, this); | |
1879 | } | |
1880 | // In the case of a single alternative on its own do nothing - it can fall through. | |
1881 | } | |
1882 | ||
1883 | // At this point we've handled the backtracking back into this node. | |
1884 | // Now link any backtracks that need to jump to here. | |
1885 | ||
1886 | // For non-simple alternatives, link the alternative's 'return address' | |
1887 | // so that we backtrack back out into the previous alternative. | |
1888 | if (op.m_op == OpNestedAlternativeNext) | |
1889 | m_backtrackingState.append(op.m_returnAddress); | |
1890 | ||
1891 | // If there is more than one alternative, then the last alternative will | |
1892 | // have planted a jump to be linked to the end. This jump was added to the | |
1893 | // End node's m_jumps list. If we are back at the beginning, link it here. | |
1894 | if (isBegin) { | |
1895 | YarrOp* endOp = &m_ops[op.m_nextOp]; | |
1896 | while (endOp->m_nextOp != notFound) { | |
1897 | ASSERT(endOp->m_op == OpSimpleNestedAlternativeNext || endOp->m_op == OpNestedAlternativeNext); | |
1898 | endOp = &m_ops[endOp->m_nextOp]; | |
1899 | } | |
1900 | ASSERT(endOp->m_op == OpSimpleNestedAlternativeEnd || endOp->m_op == OpNestedAlternativeEnd); | |
1901 | m_backtrackingState.append(endOp->m_jumps); | |
1902 | } | |
1903 | ||
1904 | if (!isBegin) { | |
1905 | YarrOp& lastOp = m_ops[op.m_previousOp]; | |
1906 | m_checked += lastOp.m_checkAdjust; | |
1907 | } | |
1908 | m_checked -= op.m_checkAdjust; | |
1909 | break; | |
1910 | } | |
1911 | case OpSimpleNestedAlternativeEnd: | |
1912 | case OpNestedAlternativeEnd: { | |
1913 | PatternTerm* term = op.m_term; | |
1914 | ||
1915 | // If we backtrack into the end of a simple subpattern do nothing; | |
1916 | // just continue through into the last alternative. If we backtrack | |
1917 | // into the end of a non-simple set of alterntives we need to jump | |
1918 | // to the backtracking return address set up during generation. | |
1919 | if (op.m_op == OpNestedAlternativeEnd) { | |
1920 | m_backtrackingState.link(this); | |
1921 | ||
1922 | // Plant a jump to the return address. | |
1923 | unsigned parenthesesFrameLocation = term->frameLocation; | |
1924 | unsigned alternativeFrameLocation = parenthesesFrameLocation; | |
1925 | if (term->quantityType != QuantifierFixedCount) | |
1926 | alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce; | |
1927 | loadFromFrameAndJump(alternativeFrameLocation); | |
1928 | ||
1929 | // Link the DataLabelPtr associated with the end of the last | |
1930 | // alternative to this point. | |
1931 | m_backtrackingState.append(op.m_returnAddress); | |
1932 | } | |
1933 | ||
1934 | YarrOp& lastOp = m_ops[op.m_previousOp]; | |
1935 | m_checked += lastOp.m_checkAdjust; | |
1936 | break; | |
1937 | } | |
1938 | ||
1939 | // OpParenthesesSubpatternOnceBegin/End | |
1940 | // | |
1941 | // When we are backtracking back out of a capturing subpattern we need | |
1942 | // to clear the start index in the matches output array, to record that | |
1943 | // this subpattern has not been captured. | |
1944 | // | |
1945 | // When backtracking back out of a Greedy quantified subpattern we need | |
1946 | // to catch this, and try running the remainder of the alternative after | |
1947 | // the subpattern again, skipping the parentheses. | |
1948 | // | |
1949 | // Upon backtracking back into a quantified set of parentheses we need to | |
1950 | // check whether we were currently skipping the subpattern. If not, we | |
1951 | // can backtrack into them, if we were we need to either backtrack back | |
1952 | // out of the start of the parentheses, or jump back to the forwards | |
1953 | // matching start, depending of whether the match is Greedy or NonGreedy. | |
1954 | case OpParenthesesSubpatternOnceBegin: { | |
1955 | PatternTerm* term = op.m_term; | |
1956 | ASSERT(term->quantityCount == 1); | |
1957 | ||
1958 | // We only need to backtrack to thispoint if capturing or greedy. | |
1959 | if (term->capture() || term->quantityType == QuantifierGreedy) { | |
1960 | m_backtrackingState.link(this); | |
1961 | ||
1962 | // If capturing, clear the capture (we only need to reset start). | |
1963 | if (term->capture()) | |
1964 | store32(TrustedImm32(-1), Address(output, (term->parentheses.subpatternId << 1) * sizeof(int))); | |
1965 | ||
1966 | // If Greedy, jump to the end. | |
1967 | if (term->quantityType == QuantifierGreedy) { | |
1968 | // Clear the flag in the stackframe indicating we ran through the subpattern. | |
1969 | unsigned parenthesesFrameLocation = term->frameLocation; | |
1970 | storeToFrame(TrustedImm32(-1), parenthesesFrameLocation); | |
1971 | // Jump to after the parentheses, skipping the subpattern. | |
1972 | jump(m_ops[op.m_nextOp].m_reentry); | |
1973 | // A backtrack from after the parentheses, when skipping the subpattern, | |
1974 | // will jump back to here. | |
1975 | op.m_jumps.link(this); | |
1976 | } | |
1977 | ||
1978 | m_backtrackingState.fallthrough(); | |
1979 | } | |
1980 | break; | |
1981 | } | |
1982 | case OpParenthesesSubpatternOnceEnd: { | |
1983 | PatternTerm* term = op.m_term; | |
1984 | ||
1985 | if (term->quantityType != QuantifierFixedCount) { | |
1986 | m_backtrackingState.link(this); | |
1987 | ||
1988 | // Check whether we should backtrack back into the parentheses, or if we | |
1989 | // are currently in a state where we had skipped over the subpattern | |
1990 | // (in which case the flag value on the stack will be -1). | |
1991 | unsigned parenthesesFrameLocation = term->frameLocation; | |
1992 | Jump hadSkipped = branch32(Equal, Address(stackPointerRegister, parenthesesFrameLocation * sizeof(void*)), TrustedImm32(-1)); | |
1993 | ||
1994 | if (term->quantityType == QuantifierGreedy) { | |
1995 | // For Greedy parentheses, we skip after having already tried going | |
1996 | // through the subpattern, so if we get here we're done. | |
1997 | YarrOp& beginOp = m_ops[op.m_previousOp]; | |
1998 | beginOp.m_jumps.append(hadSkipped); | |
1999 | } else { | |
2000 | // For NonGreedy parentheses, we try skipping the subpattern first, | |
2001 | // so if we get here we need to try running through the subpattern | |
2002 | // next. Jump back to the start of the parentheses in the forwards | |
2003 | // matching path. | |
2004 | ASSERT(term->quantityType == QuantifierNonGreedy); | |
2005 | YarrOp& beginOp = m_ops[op.m_previousOp]; | |
2006 | hadSkipped.linkTo(beginOp.m_reentry, this); | |
2007 | } | |
2008 | ||
2009 | m_backtrackingState.fallthrough(); | |
2010 | } | |
2011 | ||
2012 | m_backtrackingState.append(op.m_jumps); | |
2013 | break; | |
2014 | } | |
2015 | ||
2016 | // OpParenthesesSubpatternTerminalBegin/End | |
2017 | // | |
2018 | // Terminal subpatterns will always match - there is nothing after them to | |
2019 | // force a backtrack, and they have a minimum count of 0, and as such will | |
2020 | // always produce an acceptable result. | |
2021 | case OpParenthesesSubpatternTerminalBegin: { | |
2022 | // We will backtrack to this point once the subpattern cannot match any | |
2023 | // more. Since no match is accepted as a successful match (we are Greedy | |
2024 | // quantified with a minimum of zero) jump back to the forwards matching | |
2025 | // path at the end. | |
2026 | YarrOp& endOp = m_ops[op.m_nextOp]; | |
2027 | m_backtrackingState.linkTo(endOp.m_reentry, this); | |
2028 | break; | |
2029 | } | |
2030 | case OpParenthesesSubpatternTerminalEnd: | |
2031 | // We should never be backtracking to here (hence the 'terminal' in the name). | |
2032 | ASSERT(m_backtrackingState.isEmpty()); | |
2033 | m_backtrackingState.append(op.m_jumps); | |
2034 | break; | |
2035 | ||
2036 | // OpParentheticalAssertionBegin/End | |
2037 | case OpParentheticalAssertionBegin: { | |
2038 | PatternTerm* term = op.m_term; | |
2039 | YarrOp& endOp = m_ops[op.m_nextOp]; | |
2040 | ||
2041 | // We need to handle the backtracks upon backtracking back out | |
2042 | // of a parenthetical assertion if either we need to correct | |
2043 | // the input index, or the assertion was inverted. | |
2044 | if (op.m_checkAdjust || term->invert()) { | |
2045 | m_backtrackingState.link(this); | |
2046 | ||
2047 | if (op.m_checkAdjust) | |
2048 | add32(Imm32(op.m_checkAdjust), index); | |
2049 | ||
2050 | // In an inverted assertion failure to match the subpattern | |
2051 | // is treated as a successful match - jump to the end of the | |
2052 | // subpattern. We already have adjusted the input position | |
2053 | // back to that before the assertion, which is correct. | |
2054 | if (term->invert()) | |
2055 | jump(endOp.m_reentry); | |
2056 | ||
2057 | m_backtrackingState.fallthrough(); | |
2058 | } | |
2059 | ||
2060 | // The End node's jump list will contain any backtracks into | |
2061 | // the end of the assertion. Also, if inverted, we will have | |
2062 | // added the failure caused by a successful match to this. | |
2063 | m_backtrackingState.append(endOp.m_jumps); | |
2064 | ||
2065 | m_checked += op.m_checkAdjust; | |
2066 | break; | |
2067 | } | |
2068 | case OpParentheticalAssertionEnd: { | |
2069 | // FIXME: We should really be clearing any nested subpattern | |
2070 | // matches on bailing out from after the pattern. Firefox has | |
2071 | // this bug too (presumably because they use YARR!) | |
2072 | ||
2073 | // Never backtrack into an assertion; later failures bail to before the begin. | |
2074 | m_backtrackingState.takeBacktracksToJumpList(op.m_jumps, this); | |
2075 | ||
2076 | YarrOp& lastOp = m_ops[op.m_previousOp]; | |
2077 | m_checked -= lastOp.m_checkAdjust; | |
2078 | break; | |
2079 | } | |
2080 | ||
2081 | case OpMatchFailed: | |
2082 | break; | |
2083 | } | |
2084 | ||
2085 | } while (opIndex); | |
2086 | } | |
2087 | ||
2088 | // Compilation methods: | |
2089 | // ==================== | |
2090 | ||
2091 | // opCompileParenthesesSubpattern | |
2092 | // Emits ops for a subpattern (set of parentheses). These consist | |
2093 | // of a set of alternatives wrapped in an outer set of nodes for | |
2094 | // the parentheses. | |
2095 | // Supported types of parentheses are 'Once' (quantityCount == 1) | |
2096 | // and 'Terminal' (non-capturing parentheses quantified as greedy | |
2097 | // and infinite). | |
2098 | // Alternatives will use the 'Simple' set of ops if either the | |
2099 | // subpattern is terminal (in which case we will never need to | |
2100 | // backtrack), or if the subpattern only contains one alternative. | |
2101 | void opCompileParenthesesSubpattern(PatternTerm* term) | |
2102 | { | |
2103 | YarrOpCode parenthesesBeginOpCode; | |
2104 | YarrOpCode parenthesesEndOpCode; | |
2105 | YarrOpCode alternativeBeginOpCode = OpSimpleNestedAlternativeBegin; | |
2106 | YarrOpCode alternativeNextOpCode = OpSimpleNestedAlternativeNext; | |
2107 | YarrOpCode alternativeEndOpCode = OpSimpleNestedAlternativeEnd; | |
2108 | ||
2109 | // We can currently only compile quantity 1 subpatterns that are | |
2110 | // not copies. We generate a copy in the case of a range quantifier, | |
2111 | // e.g. /(?:x){3,9}/, or /(?:x)+/ (These are effectively expanded to | |
2112 | // /(?:x){3,3}(?:x){0,6}/ and /(?:x)(?:x)*/ repectively). The problem | |
2113 | // comes where the subpattern is capturing, in which case we would | |
2114 | // need to restore the capture from the first subpattern upon a | |
2115 | // failure in the second. | |
2116 | if (term->quantityCount == 1 && !term->parentheses.isCopy) { | |
2117 | // Select the 'Once' nodes. | |
2118 | parenthesesBeginOpCode = OpParenthesesSubpatternOnceBegin; | |
2119 | parenthesesEndOpCode = OpParenthesesSubpatternOnceEnd; | |
2120 | ||
2121 | // If there is more than one alternative we cannot use the 'simple' nodes. | |
2122 | if (term->parentheses.disjunction->m_alternatives.size() != 1) { | |
2123 | alternativeBeginOpCode = OpNestedAlternativeBegin; | |
2124 | alternativeNextOpCode = OpNestedAlternativeNext; | |
2125 | alternativeEndOpCode = OpNestedAlternativeEnd; | |
2126 | } | |
2127 | } else if (term->parentheses.isTerminal) { | |
2128 | // Select the 'Terminal' nodes. | |
2129 | parenthesesBeginOpCode = OpParenthesesSubpatternTerminalBegin; | |
2130 | parenthesesEndOpCode = OpParenthesesSubpatternTerminalEnd; | |
2131 | } else { | |
2132 | // This subpattern is not supported by the JIT. | |
2133 | m_shouldFallBack = true; | |
2134 | return; | |
2135 | } | |
2136 | ||
2137 | size_t parenBegin = m_ops.size(); | |
2138 | m_ops.append(parenthesesBeginOpCode); | |
2139 | ||
2140 | m_ops.append(alternativeBeginOpCode); | |
2141 | m_ops.last().m_previousOp = notFound; | |
2142 | m_ops.last().m_term = term; | |
2143 | Vector<PatternAlternative*>& alternatives = term->parentheses.disjunction->m_alternatives; | |
2144 | for (unsigned i = 0; i < alternatives.size(); ++i) { | |
2145 | size_t lastOpIndex = m_ops.size() - 1; | |
2146 | ||
2147 | PatternAlternative* nestedAlternative = alternatives[i]; | |
2148 | opCompileAlternative(nestedAlternative); | |
2149 | ||
2150 | size_t thisOpIndex = m_ops.size(); | |
2151 | m_ops.append(YarrOp(alternativeNextOpCode)); | |
2152 | ||
2153 | YarrOp& lastOp = m_ops[lastOpIndex]; | |
2154 | YarrOp& thisOp = m_ops[thisOpIndex]; | |
2155 | ||
2156 | lastOp.m_alternative = nestedAlternative; | |
2157 | lastOp.m_nextOp = thisOpIndex; | |
2158 | thisOp.m_previousOp = lastOpIndex; | |
2159 | thisOp.m_term = term; | |
2160 | } | |
2161 | YarrOp& lastOp = m_ops.last(); | |
2162 | ASSERT(lastOp.m_op == alternativeNextOpCode); | |
2163 | lastOp.m_op = alternativeEndOpCode; | |
2164 | lastOp.m_alternative = 0; | |
2165 | lastOp.m_nextOp = notFound; | |
2166 | ||
2167 | size_t parenEnd = m_ops.size(); | |
2168 | m_ops.append(parenthesesEndOpCode); | |
2169 | ||
2170 | m_ops[parenBegin].m_term = term; | |
2171 | m_ops[parenBegin].m_previousOp = notFound; | |
2172 | m_ops[parenBegin].m_nextOp = parenEnd; | |
2173 | m_ops[parenEnd].m_term = term; | |
2174 | m_ops[parenEnd].m_previousOp = parenBegin; | |
2175 | m_ops[parenEnd].m_nextOp = notFound; | |
2176 | } | |
2177 | ||
2178 | // opCompileParentheticalAssertion | |
2179 | // Emits ops for a parenthetical assertion. These consist of an | |
2180 | // OpSimpleNestedAlternativeBegin/Next/End set of nodes wrapping | |
2181 | // the alternatives, with these wrapped by an outer pair of | |
2182 | // OpParentheticalAssertionBegin/End nodes. | |
2183 | // We can always use the OpSimpleNestedAlternative nodes in the | |
2184 | // case of parenthetical assertions since these only ever match | |
2185 | // once, and will never backtrack back into the assertion. | |
2186 | void opCompileParentheticalAssertion(PatternTerm* term) | |
2187 | { | |
2188 | size_t parenBegin = m_ops.size(); | |
2189 | m_ops.append(OpParentheticalAssertionBegin); | |
2190 | ||
2191 | m_ops.append(OpSimpleNestedAlternativeBegin); | |
2192 | m_ops.last().m_previousOp = notFound; | |
2193 | m_ops.last().m_term = term; | |
2194 | Vector<PatternAlternative*>& alternatives = term->parentheses.disjunction->m_alternatives; | |
2195 | for (unsigned i = 0; i < alternatives.size(); ++i) { | |
2196 | size_t lastOpIndex = m_ops.size() - 1; | |
2197 | ||
2198 | PatternAlternative* nestedAlternative = alternatives[i]; | |
2199 | opCompileAlternative(nestedAlternative); | |
2200 | ||
2201 | size_t thisOpIndex = m_ops.size(); | |
2202 | m_ops.append(YarrOp(OpSimpleNestedAlternativeNext)); | |
2203 | ||
2204 | YarrOp& lastOp = m_ops[lastOpIndex]; | |
2205 | YarrOp& thisOp = m_ops[thisOpIndex]; | |
2206 | ||
2207 | lastOp.m_alternative = nestedAlternative; | |
2208 | lastOp.m_nextOp = thisOpIndex; | |
2209 | thisOp.m_previousOp = lastOpIndex; | |
2210 | thisOp.m_term = term; | |
2211 | } | |
2212 | YarrOp& lastOp = m_ops.last(); | |
2213 | ASSERT(lastOp.m_op == OpSimpleNestedAlternativeNext); | |
2214 | lastOp.m_op = OpSimpleNestedAlternativeEnd; | |
2215 | lastOp.m_alternative = 0; | |
2216 | lastOp.m_nextOp = notFound; | |
2217 | ||
2218 | size_t parenEnd = m_ops.size(); | |
2219 | m_ops.append(OpParentheticalAssertionEnd); | |
2220 | ||
2221 | m_ops[parenBegin].m_term = term; | |
2222 | m_ops[parenBegin].m_previousOp = notFound; | |
2223 | m_ops[parenBegin].m_nextOp = parenEnd; | |
2224 | m_ops[parenEnd].m_term = term; | |
2225 | m_ops[parenEnd].m_previousOp = parenBegin; | |
2226 | m_ops[parenEnd].m_nextOp = notFound; | |
2227 | } | |
2228 | ||
2229 | // opCompileAlternative | |
2230 | // Called to emit nodes for all terms in an alternative. | |
2231 | void opCompileAlternative(PatternAlternative* alternative) | |
2232 | { | |
2233 | optimizeAlternative(alternative); | |
2234 | ||
2235 | for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { | |
2236 | PatternTerm* term = &alternative->m_terms[i]; | |
2237 | ||
2238 | switch (term->type) { | |
2239 | case PatternTerm::TypeParenthesesSubpattern: | |
2240 | opCompileParenthesesSubpattern(term); | |
2241 | break; | |
2242 | ||
2243 | case PatternTerm::TypeParentheticalAssertion: | |
2244 | opCompileParentheticalAssertion(term); | |
2245 | break; | |
2246 | ||
2247 | default: | |
2248 | m_ops.append(term); | |
2249 | } | |
2250 | } | |
2251 | } | |
2252 | ||
2253 | // opCompileBody | |
2254 | // This method compiles the body disjunction of the regular expression. | |
2255 | // The body consists of two sets of alternatives - zero or more 'once | |
2256 | // through' (BOL anchored) alternatives, followed by zero or more | |
2257 | // repeated alternatives. | |
2258 | // For each of these two sets of alteratives, if not empty they will be | |
2259 | // wrapped in a set of OpBodyAlternativeBegin/Next/End nodes (with the | |
2260 | // 'begin' node referencing the first alternative, and 'next' nodes | |
2261 | // referencing any further alternatives. The begin/next/end nodes are | |
2262 | // linked together in a doubly linked list. In the case of repeating | |
2263 | // alternatives, the end node is also linked back to the beginning. | |
2264 | // If no repeating alternatives exist, then a OpMatchFailed node exists | |
2265 | // to return the failing result. | |
2266 | void opCompileBody(PatternDisjunction* disjunction) | |
2267 | { | |
2268 | Vector<PatternAlternative*>& alternatives = disjunction->m_alternatives; | |
2269 | size_t currentAlternativeIndex = 0; | |
2270 | ||
2271 | // Emit the 'once through' alternatives. | |
2272 | if (alternatives.size() && alternatives[0]->onceThrough()) { | |
2273 | m_ops.append(YarrOp(OpBodyAlternativeBegin)); | |
2274 | m_ops.last().m_previousOp = notFound; | |
2275 | ||
2276 | do { | |
2277 | size_t lastOpIndex = m_ops.size() - 1; | |
2278 | PatternAlternative* alternative = alternatives[currentAlternativeIndex]; | |
2279 | opCompileAlternative(alternative); | |
2280 | ||
2281 | size_t thisOpIndex = m_ops.size(); | |
2282 | m_ops.append(YarrOp(OpBodyAlternativeNext)); | |
2283 | ||
2284 | YarrOp& lastOp = m_ops[lastOpIndex]; | |
2285 | YarrOp& thisOp = m_ops[thisOpIndex]; | |
2286 | ||
2287 | lastOp.m_alternative = alternative; | |
2288 | lastOp.m_nextOp = thisOpIndex; | |
2289 | thisOp.m_previousOp = lastOpIndex; | |
2290 | ||
2291 | ++currentAlternativeIndex; | |
2292 | } while (currentAlternativeIndex < alternatives.size() && alternatives[currentAlternativeIndex]->onceThrough()); | |
2293 | ||
2294 | YarrOp& lastOp = m_ops.last(); | |
2295 | ||
2296 | ASSERT(lastOp.m_op == OpBodyAlternativeNext); | |
2297 | lastOp.m_op = OpBodyAlternativeEnd; | |
2298 | lastOp.m_alternative = 0; | |
2299 | lastOp.m_nextOp = notFound; | |
2300 | } | |
2301 | ||
2302 | if (currentAlternativeIndex == alternatives.size()) { | |
2303 | m_ops.append(YarrOp(OpMatchFailed)); | |
2304 | return; | |
2305 | } | |
2306 | ||
2307 | // Emit the repeated alternatives. | |
2308 | size_t repeatLoop = m_ops.size(); | |
2309 | m_ops.append(YarrOp(OpBodyAlternativeBegin)); | |
2310 | m_ops.last().m_previousOp = notFound; | |
2311 | do { | |
2312 | size_t lastOpIndex = m_ops.size() - 1; | |
2313 | PatternAlternative* alternative = alternatives[currentAlternativeIndex]; | |
2314 | ASSERT(!alternative->onceThrough()); | |
2315 | opCompileAlternative(alternative); | |
2316 | ||
2317 | size_t thisOpIndex = m_ops.size(); | |
2318 | m_ops.append(YarrOp(OpBodyAlternativeNext)); | |
2319 | ||
2320 | YarrOp& lastOp = m_ops[lastOpIndex]; | |
2321 | YarrOp& thisOp = m_ops[thisOpIndex]; | |
2322 | ||
2323 | lastOp.m_alternative = alternative; | |
2324 | lastOp.m_nextOp = thisOpIndex; | |
2325 | thisOp.m_previousOp = lastOpIndex; | |
2326 | ||
2327 | ++currentAlternativeIndex; | |
2328 | } while (currentAlternativeIndex < alternatives.size()); | |
2329 | YarrOp& lastOp = m_ops.last(); | |
2330 | ASSERT(lastOp.m_op == OpBodyAlternativeNext); | |
2331 | lastOp.m_op = OpBodyAlternativeEnd; | |
2332 | lastOp.m_alternative = 0; | |
2333 | lastOp.m_nextOp = repeatLoop; | |
2334 | } | |
2335 | ||
2336 | void generateEnter() | |
2337 | { | |
2338 | #if CPU(X86_64) | |
2339 | push(X86Registers::ebp); | |
2340 | move(stackPointerRegister, X86Registers::ebp); | |
2341 | push(X86Registers::ebx); | |
2342 | #elif CPU(X86) | |
2343 | push(X86Registers::ebp); | |
2344 | move(stackPointerRegister, X86Registers::ebp); | |
2345 | // TODO: do we need spill registers to fill the output pointer if there are no sub captures? | |
2346 | push(X86Registers::ebx); | |
2347 | push(X86Registers::edi); | |
2348 | push(X86Registers::esi); | |
2349 | // load output into edi (2 = saved ebp + return address). | |
2350 | #if COMPILER(MSVC) | |
2351 | loadPtr(Address(X86Registers::ebp, 2 * sizeof(void*)), input); | |
2352 | loadPtr(Address(X86Registers::ebp, 3 * sizeof(void*)), index); | |
2353 | loadPtr(Address(X86Registers::ebp, 4 * sizeof(void*)), length); | |
2354 | loadPtr(Address(X86Registers::ebp, 5 * sizeof(void*)), output); | |
2355 | #else | |
2356 | loadPtr(Address(X86Registers::ebp, 2 * sizeof(void*)), output); | |
2357 | #endif | |
2358 | #elif CPU(ARM) | |
2359 | push(ARMRegisters::r4); | |
2360 | push(ARMRegisters::r5); | |
2361 | push(ARMRegisters::r6); | |
2362 | #if CPU(ARM_TRADITIONAL) | |
2363 | push(ARMRegisters::r8); // scratch register | |
2364 | #endif | |
2365 | move(ARMRegisters::r3, output); | |
2366 | #elif CPU(SH4) | |
2367 | push(SH4Registers::r11); | |
2368 | push(SH4Registers::r13); | |
2369 | #elif CPU(MIPS) | |
2370 | // Do nothing. | |
2371 | #endif | |
2372 | } | |
2373 | ||
2374 | void generateReturn() | |
2375 | { | |
2376 | #if CPU(X86_64) | |
2377 | pop(X86Registers::ebx); | |
2378 | pop(X86Registers::ebp); | |
2379 | #elif CPU(X86) | |
2380 | pop(X86Registers::esi); | |
2381 | pop(X86Registers::edi); | |
2382 | pop(X86Registers::ebx); | |
2383 | pop(X86Registers::ebp); | |
2384 | #elif CPU(ARM) | |
2385 | #if CPU(ARM_TRADITIONAL) | |
2386 | pop(ARMRegisters::r8); // scratch register | |
2387 | #endif | |
2388 | pop(ARMRegisters::r6); | |
2389 | pop(ARMRegisters::r5); | |
2390 | pop(ARMRegisters::r4); | |
2391 | #elif CPU(SH4) | |
2392 | pop(SH4Registers::r13); | |
2393 | pop(SH4Registers::r11); | |
2394 | #elif CPU(MIPS) | |
2395 | // Do nothing | |
2396 | #endif | |
2397 | ret(); | |
2398 | } | |
2399 | ||
2400 | public: | |
2401 | YarrGenerator(YarrPattern& pattern) | |
2402 | : m_pattern(pattern) | |
2403 | , m_shouldFallBack(false) | |
2404 | , m_checked(0) | |
2405 | { | |
2406 | } | |
2407 | ||
2408 | void compile(JSGlobalData* globalData, YarrCodeBlock& jitObject) | |
2409 | { | |
2410 | generateEnter(); | |
2411 | ||
2412 | if (!m_pattern.m_body->m_hasFixedSize) | |
2413 | store32(index, Address(output)); | |
2414 | ||
2415 | if (m_pattern.m_body->m_callFrameSize) | |
2416 | subPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister); | |
2417 | ||
2418 | // Compile the pattern to the internal 'YarrOp' representation. | |
2419 | opCompileBody(m_pattern.m_body); | |
2420 | ||
2421 | // If we encountered anything we can't handle in the JIT code | |
2422 | // (e.g. backreferences) then return early. | |
2423 | if (m_shouldFallBack) { | |
2424 | jitObject.setFallBack(true); | |
2425 | return; | |
2426 | } | |
2427 | ||
2428 | generate(); | |
2429 | backtrack(); | |
2430 | ||
2431 | // Link & finalize the code. | |
2432 | LinkBuffer linkBuffer(*globalData, this, globalData->regexAllocator); | |
2433 | m_backtrackingState.linkDataLabels(linkBuffer); | |
2434 | jitObject.set(linkBuffer.finalizeCode()); | |
2435 | jitObject.setFallBack(m_shouldFallBack); | |
2436 | } | |
2437 | ||
2438 | private: | |
2439 | YarrPattern& m_pattern; | |
2440 | ||
2441 | // Used to detect regular expression constructs that are not currently | |
2442 | // supported in the JIT; fall back to the interpreter when this is detected. | |
2443 | bool m_shouldFallBack; | |
2444 | ||
2445 | // The regular expression expressed as a linear sequence of operations. | |
2446 | Vector<YarrOp, 128> m_ops; | |
2447 | ||
2448 | // This records the current input offset being applied due to the current | |
2449 | // set of alternatives we are nested within. E.g. when matching the | |
2450 | // character 'b' within the regular expression /abc/, we will know that | |
2451 | // the minimum size for the alternative is 3, checked upon entry to the | |
2452 | // alternative, and that 'b' is at offset 1 from the start, and as such | |
2453 | // when matching 'b' we need to apply an offset of -2 to the load. | |
2454 | // | |
2455 | // FIXME: This should go away. Rather than tracking this value throughout | |
2456 | // code generation, we should gather this information up front & store it | |
2457 | // on the YarrOp structure. | |
2458 | int m_checked; | |
2459 | ||
2460 | // This class records state whilst generating the backtracking path of code. | |
2461 | BacktrackingState m_backtrackingState; | |
2462 | }; | |
2463 | ||
2464 | void jitCompile(YarrPattern& pattern, JSGlobalData* globalData, YarrCodeBlock& jitObject) | |
2465 | { | |
2466 | YarrGenerator(pattern).compile(globalData, jitObject); | |
2467 | } | |
2468 | ||
2469 | int execute(YarrCodeBlock& jitObject, const UChar* input, unsigned start, unsigned length, int* output) | |
2470 | { | |
2471 | return jitObject.execute(input, start, length, output); | |
2472 | } | |
2473 | ||
2474 | }} | |
2475 | ||
2476 | #endif |