]>
Commit | Line | Data |
---|---|---|
b37bf2e1 A |
1 | /* This is JavaScriptCore's variant of the PCRE library. While this library |
2 | started out as a copy of PCRE, many of the features of PCRE have been | |
3 | removed. This library now supports only the regular expression features | |
4 | required by the JavaScript language specification, and has only the functions | |
5 | needed by JavaScriptCore and the rest of WebKit. | |
6 | ||
7 | Originally written by Philip Hazel | |
8 | Copyright (c) 1997-2006 University of Cambridge | |
9dae56ea | 9 | Copyright (C) 2002, 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. |
b37bf2e1 A |
10 | Copyright (C) 2007 Eric Seidel <eric@webkit.org> |
11 | ||
12 | ----------------------------------------------------------------------------- | |
13 | Redistribution and use in source and binary forms, with or without | |
14 | modification, are permitted provided that the following conditions are met: | |
15 | ||
16 | * Redistributions of source code must retain the above copyright notice, | |
17 | this list of conditions and the following disclaimer. | |
18 | ||
19 | * Redistributions in binary form must reproduce the above copyright | |
20 | notice, this list of conditions and the following disclaimer in the | |
21 | documentation and/or other materials provided with the distribution. | |
22 | ||
23 | * Neither the name of the University of Cambridge nor the names of its | |
24 | contributors may be used to endorse or promote products derived from | |
25 | this software without specific prior written permission. | |
26 | ||
27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
37 | POSSIBILITY OF SUCH DAMAGE. | |
38 | ----------------------------------------------------------------------------- | |
39 | */ | |
40 | ||
41 | /* This module contains jsRegExpExecute(), the externally visible function | |
42 | that does pattern matching using an NFA algorithm, following the rules from | |
43 | the JavaScript specification. There are also some supporting functions. */ | |
44 | ||
45 | #include "config.h" | |
b37bf2e1 A |
46 | #include "pcre_internal.h" |
47 | ||
9dae56ea | 48 | #include <limits.h> |
b37bf2e1 A |
49 | #include <wtf/ASCIICType.h> |
50 | #include <wtf/Vector.h> | |
51 | ||
9dae56ea A |
52 | #if REGEXP_HISTOGRAM |
53 | #include <parser/DateMath.h> | |
54 | #include <runtime/UString.h> | |
55 | #endif | |
b37bf2e1 A |
56 | |
57 | using namespace WTF; | |
58 | ||
59 | #ifdef __GNUC__ | |
60 | #define USE_COMPUTED_GOTO_FOR_MATCH_RECURSION | |
61 | //#define USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP | |
62 | #endif | |
63 | ||
64 | /* Avoid warnings on Windows. */ | |
65 | #undef min | |
66 | #undef max | |
67 | ||
68 | #ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION | |
69 | typedef int ReturnLocation; | |
70 | #else | |
71 | typedef void* ReturnLocation; | |
72 | #endif | |
73 | ||
9dae56ea A |
74 | #if !REGEXP_HISTOGRAM |
75 | ||
76 | class HistogramTimeLogger { | |
77 | public: | |
78 | HistogramTimeLogger(const JSRegExp*) { } | |
79 | }; | |
80 | ||
81 | #else | |
82 | ||
83 | using namespace JSC; | |
84 | ||
85 | class Histogram { | |
86 | public: | |
87 | ~Histogram(); | |
88 | void add(const JSRegExp*, double); | |
89 | ||
90 | private: | |
91 | typedef HashMap<RefPtr<UString::Rep>, double> Map; | |
92 | Map times; | |
93 | }; | |
94 | ||
95 | class HistogramTimeLogger { | |
96 | public: | |
97 | HistogramTimeLogger(const JSRegExp*); | |
98 | ~HistogramTimeLogger(); | |
99 | ||
100 | private: | |
101 | const JSRegExp* m_re; | |
102 | double m_startTime; | |
103 | }; | |
104 | ||
105 | #endif | |
106 | ||
b37bf2e1 A |
107 | /* Structure for building a chain of data for holding the values of |
108 | the subject pointer at the start of each bracket, used to detect when | |
109 | an empty string has been matched by a bracket to break infinite loops. */ | |
110 | struct BracketChainNode { | |
111 | BracketChainNode* previousBracket; | |
112 | const UChar* bracketStart; | |
113 | }; | |
114 | ||
115 | struct MatchFrame { | |
116 | ReturnLocation returnLocation; | |
117 | struct MatchFrame* previousFrame; | |
118 | ||
119 | /* Function arguments that may change */ | |
120 | struct { | |
121 | const UChar* subjectPtr; | |
122 | const unsigned char* instructionPtr; | |
123 | int offsetTop; | |
124 | BracketChainNode* bracketChain; | |
125 | } args; | |
126 | ||
127 | ||
128 | /* PCRE uses "fake" recursion built off of gotos, thus | |
129 | stack-based local variables are not safe to use. Instead we have to | |
130 | store local variables on the current MatchFrame. */ | |
131 | struct { | |
132 | const unsigned char* data; | |
133 | const unsigned char* startOfRepeatingBracket; | |
134 | const UChar* subjectPtrAtStartOfInstruction; // Several instrutions stash away a subjectPtr here for later compare | |
135 | const unsigned char* instructionPtrAtStartOfOnce; | |
136 | ||
137 | int repeatOthercase; | |
138 | ||
139 | int ctype; | |
140 | int fc; | |
141 | int fi; | |
142 | int length; | |
143 | int max; | |
144 | int number; | |
145 | int offset; | |
146 | int saveOffset1; | |
147 | int saveOffset2; | |
148 | int saveOffset3; | |
149 | ||
150 | BracketChainNode bracketChainNode; | |
151 | } locals; | |
152 | }; | |
153 | ||
154 | /* Structure for passing "static" information around between the functions | |
155 | doing traditional NFA matching, so that they are thread-safe. */ | |
156 | ||
157 | struct MatchData { | |
158 | int* offsetVector; /* Offset vector */ | |
159 | int offsetEnd; /* One past the end */ | |
160 | int offsetMax; /* The maximum usable for return data */ | |
161 | bool offsetOverflow; /* Set if too many extractions */ | |
162 | const UChar* startSubject; /* Start of the subject string */ | |
163 | const UChar* endSubject; /* End of the subject string */ | |
164 | const UChar* endMatchPtr; /* Subject position at end match */ | |
165 | int endOffsetTop; /* Highwater mark at end of match */ | |
166 | bool multiline; | |
167 | bool ignoreCase; | |
168 | }; | |
169 | ||
170 | /* The maximum remaining length of subject we are prepared to search for a | |
9dae56ea | 171 | reqByte match. */ |
b37bf2e1 A |
172 | |
173 | #define REQ_BYTE_MAX 1000 | |
174 | ||
175 | /* The below limit restricts the number of "recursive" match calls in order to | |
176 | avoid spending exponential time on complex regular expressions. */ | |
177 | ||
178 | static const unsigned matchLimit = 100000; | |
179 | ||
180 | #ifdef DEBUG | |
181 | /************************************************* | |
182 | * Debugging function to print chars * | |
183 | *************************************************/ | |
184 | ||
185 | /* Print a sequence of chars in printable format, stopping at the end of the | |
186 | subject if the requested. | |
187 | ||
188 | Arguments: | |
189 | p points to characters | |
190 | length number to print | |
191 | isSubject true if printing from within md.startSubject | |
192 | md pointer to matching data block, if isSubject is true | |
193 | */ | |
194 | ||
195 | static void pchars(const UChar* p, int length, bool isSubject, const MatchData& md) | |
196 | { | |
197 | if (isSubject && length > md.endSubject - p) | |
198 | length = md.endSubject - p; | |
199 | while (length-- > 0) { | |
200 | int c; | |
201 | if (isprint(c = *(p++))) | |
202 | printf("%c", c); | |
203 | else if (c < 256) | |
204 | printf("\\x%02x", c); | |
205 | else | |
206 | printf("\\x{%x}", c); | |
207 | } | |
208 | } | |
209 | #endif | |
210 | ||
211 | /************************************************* | |
212 | * Match a back-reference * | |
213 | *************************************************/ | |
214 | ||
215 | /* If a back reference hasn't been set, the length that is passed is greater | |
216 | than the number of characters left in the string, so the match fails. | |
217 | ||
218 | Arguments: | |
219 | offset index into the offset vector | |
220 | subjectPtr points into the subject | |
221 | length length to be matched | |
222 | md points to match data block | |
223 | ||
224 | Returns: true if matched | |
225 | */ | |
226 | ||
227 | static bool matchRef(int offset, const UChar* subjectPtr, int length, const MatchData& md) | |
228 | { | |
229 | const UChar* p = md.startSubject + md.offsetVector[offset]; | |
230 | ||
231 | #ifdef DEBUG | |
232 | if (subjectPtr >= md.endSubject) | |
233 | printf("matching subject <null>"); | |
234 | else { | |
235 | printf("matching subject "); | |
236 | pchars(subjectPtr, length, true, md); | |
237 | } | |
238 | printf(" against backref "); | |
239 | pchars(p, length, false, md); | |
240 | printf("\n"); | |
241 | #endif | |
242 | ||
243 | /* Always fail if not enough characters left */ | |
244 | ||
245 | if (length > md.endSubject - subjectPtr) | |
246 | return false; | |
247 | ||
248 | /* Separate the caselesss case for speed */ | |
249 | ||
250 | if (md.ignoreCase) { | |
251 | while (length-- > 0) { | |
252 | UChar c = *p++; | |
9dae56ea | 253 | int othercase = jsc_pcre_ucp_othercase(c); |
b37bf2e1 A |
254 | UChar d = *subjectPtr++; |
255 | if (c != d && othercase != d) | |
256 | return false; | |
257 | } | |
258 | } | |
259 | else { | |
260 | while (length-- > 0) | |
261 | if (*p++ != *subjectPtr++) | |
262 | return false; | |
263 | } | |
264 | ||
265 | return true; | |
266 | } | |
267 | ||
268 | #ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION | |
269 | ||
270 | /* Use numbered labels and switch statement at the bottom of the match function. */ | |
271 | ||
272 | #define RMATCH_WHERE(num) num | |
273 | #define RRETURN_LABEL RRETURN_SWITCH | |
274 | ||
275 | #else | |
276 | ||
277 | /* Use GCC's computed goto extension. */ | |
278 | ||
279 | /* For one test case this is more than 40% faster than the switch statement. | |
280 | We could avoid the use of the num argument entirely by using local labels, | |
281 | but using it for the GCC case as well as the non-GCC case allows us to share | |
282 | a bit more code and notice if we use conflicting numbers.*/ | |
283 | ||
284 | #define RMATCH_WHERE(num) &&RRETURN_##num | |
285 | #define RRETURN_LABEL *stack.currentFrame->returnLocation | |
286 | ||
287 | #endif | |
288 | ||
289 | #define RECURSIVE_MATCH_COMMON(num) \ | |
290 | goto RECURSE;\ | |
291 | RRETURN_##num: \ | |
292 | stack.popCurrentFrame(); | |
293 | ||
294 | #define RECURSIVE_MATCH(num, ra, rb) \ | |
295 | do { \ | |
296 | stack.pushNewFrame((ra), (rb), RMATCH_WHERE(num)); \ | |
297 | RECURSIVE_MATCH_COMMON(num) \ | |
298 | } while (0) | |
299 | ||
9dae56ea | 300 | #define RECURSIVE_MATCH_NEW_GROUP(num, ra, rb) \ |
b37bf2e1 A |
301 | do { \ |
302 | stack.pushNewFrame((ra), (rb), RMATCH_WHERE(num)); \ | |
303 | startNewGroup(stack.currentFrame); \ | |
304 | RECURSIVE_MATCH_COMMON(num) \ | |
305 | } while (0) | |
306 | ||
307 | #define RRETURN goto RRETURN_LABEL | |
308 | ||
309 | #define RRETURN_NO_MATCH do { isMatch = false; RRETURN; } while (0) | |
310 | ||
311 | /************************************************* | |
312 | * Match from current position * | |
313 | *************************************************/ | |
314 | ||
315 | /* On entry instructionPtr points to the first opcode, and subjectPtr to the first character | |
316 | in the subject string, while substringStart holds the value of subjectPtr at the start of the | |
317 | last bracketed group - used for breaking infinite loops matching zero-length | |
318 | strings. This function is called recursively in many circumstances. Whenever it | |
319 | returns a negative (error) response, the outer match() call must also return the | |
320 | same response. | |
321 | ||
322 | Arguments: | |
323 | subjectPtr pointer in subject | |
324 | instructionPtr position in code | |
325 | offsetTop current top pointer | |
326 | md pointer to "static" info for the match | |
327 | ||
328 | Returns: 1 if matched ) these values are >= 0 | |
329 | 0 if failed to match ) | |
330 | a negative error value if aborted by an error condition | |
331 | (e.g. stopped by repeated call or recursion limit) | |
332 | */ | |
333 | ||
9dae56ea | 334 | static const unsigned numFramesOnStack = 16; |
b37bf2e1 A |
335 | |
336 | struct MatchStack { | |
337 | MatchStack() | |
9dae56ea | 338 | : framesEnd(frames + numFramesOnStack) |
b37bf2e1 A |
339 | , currentFrame(frames) |
340 | , size(1) // match() creates accesses the first frame w/o calling pushNewFrame | |
341 | { | |
9dae56ea | 342 | ASSERT((sizeof(frames) / sizeof(frames[0])) == numFramesOnStack); |
b37bf2e1 A |
343 | } |
344 | ||
9dae56ea | 345 | MatchFrame frames[numFramesOnStack]; |
b37bf2e1 A |
346 | MatchFrame* framesEnd; |
347 | MatchFrame* currentFrame; | |
348 | unsigned size; | |
349 | ||
350 | inline bool canUseStackBufferForNextFrame() | |
351 | { | |
9dae56ea | 352 | return size < numFramesOnStack; |
b37bf2e1 A |
353 | } |
354 | ||
355 | inline MatchFrame* allocateNextFrame() | |
356 | { | |
357 | if (canUseStackBufferForNextFrame()) | |
358 | return currentFrame + 1; | |
359 | return new MatchFrame; | |
360 | } | |
361 | ||
362 | inline void pushNewFrame(const unsigned char* instructionPtr, BracketChainNode* bracketChain, ReturnLocation returnLocation) | |
363 | { | |
364 | MatchFrame* newframe = allocateNextFrame(); | |
365 | newframe->previousFrame = currentFrame; | |
366 | ||
367 | newframe->args.subjectPtr = currentFrame->args.subjectPtr; | |
368 | newframe->args.offsetTop = currentFrame->args.offsetTop; | |
369 | newframe->args.instructionPtr = instructionPtr; | |
370 | newframe->args.bracketChain = bracketChain; | |
371 | newframe->returnLocation = returnLocation; | |
372 | size++; | |
373 | ||
374 | currentFrame = newframe; | |
375 | } | |
376 | ||
377 | inline void popCurrentFrame() | |
378 | { | |
379 | MatchFrame* oldFrame = currentFrame; | |
380 | currentFrame = currentFrame->previousFrame; | |
9dae56ea | 381 | if (size > numFramesOnStack) |
b37bf2e1 A |
382 | delete oldFrame; |
383 | size--; | |
384 | } | |
385 | ||
386 | void popAllFrames() | |
387 | { | |
388 | while (size) | |
389 | popCurrentFrame(); | |
390 | } | |
391 | }; | |
392 | ||
393 | static int matchError(int errorCode, MatchStack& stack) | |
394 | { | |
395 | stack.popAllFrames(); | |
396 | return errorCode; | |
397 | } | |
398 | ||
399 | /* Get the next UTF-8 character, not advancing the pointer, incrementing length | |
400 | if there are extra bytes. This is called when we know we are in UTF-8 mode. */ | |
401 | ||
402 | static inline void getUTF8CharAndIncrementLength(int& c, const unsigned char* subjectPtr, int& len) | |
403 | { | |
404 | c = *subjectPtr; | |
405 | if ((c & 0xc0) == 0xc0) { | |
9dae56ea | 406 | int gcaa = jsc_pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ |
b37bf2e1 | 407 | int gcss = 6 * gcaa; |
9dae56ea | 408 | c = (c & jsc_pcre_utf8_table3[gcaa]) << gcss; |
b37bf2e1 A |
409 | for (int gcii = 1; gcii <= gcaa; gcii++) { |
410 | gcss -= 6; | |
411 | c |= (subjectPtr[gcii] & 0x3f) << gcss; | |
412 | } | |
413 | len += gcaa; | |
414 | } | |
415 | } | |
416 | ||
417 | static inline void startNewGroup(MatchFrame* currentFrame) | |
418 | { | |
419 | /* At the start of a bracketed group, add the current subject pointer to the | |
420 | stack of such pointers, to be re-instated at the end of the group when we hit | |
421 | the closing ket. When match() is called in other circumstances, we don't add to | |
422 | this stack. */ | |
423 | ||
424 | currentFrame->locals.bracketChainNode.previousBracket = currentFrame->args.bracketChain; | |
425 | currentFrame->locals.bracketChainNode.bracketStart = currentFrame->args.subjectPtr; | |
426 | currentFrame->args.bracketChain = ¤tFrame->locals.bracketChainNode; | |
427 | } | |
428 | ||
429 | // FIXME: "minimize" means "not greedy", we should invert the callers to ask for "greedy" to be less confusing | |
430 | static inline void repeatInformationFromInstructionOffset(short instructionOffset, bool& minimize, int& minimumRepeats, int& maximumRepeats) | |
431 | { | |
432 | // Instruction offsets are based off of OP_CRSTAR, OP_STAR, OP_TYPESTAR, OP_NOTSTAR | |
433 | static const char minimumRepeatsFromInstructionOffset[] = { 0, 0, 1, 1, 0, 0 }; | |
434 | static const int maximumRepeatsFromInstructionOffset[] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, 1, 1 }; | |
435 | ||
436 | ASSERT(instructionOffset >= 0); | |
437 | ASSERT(instructionOffset <= (OP_CRMINQUERY - OP_CRSTAR)); | |
438 | ||
439 | minimize = (instructionOffset & 1); // this assumes ordering: Instruction, MinimizeInstruction, Instruction2, MinimizeInstruction2 | |
440 | minimumRepeats = minimumRepeatsFromInstructionOffset[instructionOffset]; | |
441 | maximumRepeats = maximumRepeatsFromInstructionOffset[instructionOffset]; | |
442 | } | |
443 | ||
444 | static int match(const UChar* subjectPtr, const unsigned char* instructionPtr, int offsetTop, MatchData& md) | |
445 | { | |
446 | bool isMatch = false; | |
447 | int min; | |
448 | bool minimize = false; /* Initialization not really needed, but some compilers think so. */ | |
9dae56ea | 449 | unsigned remainingMatchCount = matchLimit; |
b37bf2e1 A |
450 | |
451 | MatchStack stack; | |
452 | ||
453 | /* The opcode jump table. */ | |
454 | #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP | |
455 | #define EMIT_JUMP_TABLE_ENTRY(opcode) &&LABEL_OP_##opcode, | |
456 | static void* opcodeJumpTable[256] = { FOR_EACH_OPCODE(EMIT_JUMP_TABLE_ENTRY) }; | |
457 | #undef EMIT_JUMP_TABLE_ENTRY | |
458 | #endif | |
459 | ||
460 | /* One-time setup of the opcode jump table. */ | |
461 | #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP | |
462 | for (int i = 255; !opcodeJumpTable[i]; i--) | |
463 | opcodeJumpTable[i] = &&CAPTURING_BRACKET; | |
464 | #endif | |
465 | ||
466 | #ifdef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION | |
467 | // Shark shows this as a hot line | |
468 | // Using a static const here makes this line disappear, but makes later access hotter (not sure why) | |
469 | stack.currentFrame->returnLocation = &&RETURN; | |
470 | #else | |
471 | stack.currentFrame->returnLocation = 0; | |
472 | #endif | |
473 | stack.currentFrame->args.subjectPtr = subjectPtr; | |
474 | stack.currentFrame->args.instructionPtr = instructionPtr; | |
475 | stack.currentFrame->args.offsetTop = offsetTop; | |
476 | stack.currentFrame->args.bracketChain = 0; | |
477 | startNewGroup(stack.currentFrame); | |
478 | ||
479 | /* This is where control jumps back to to effect "recursion" */ | |
480 | ||
481 | RECURSE: | |
9dae56ea | 482 | if (!--remainingMatchCount) |
b37bf2e1 A |
483 | return matchError(JSRegExpErrorHitLimit, stack); |
484 | ||
485 | /* Now start processing the operations. */ | |
486 | ||
487 | #ifndef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP | |
488 | while (true) | |
489 | #endif | |
490 | { | |
491 | ||
492 | #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP | |
493 | #define BEGIN_OPCODE(opcode) LABEL_OP_##opcode | |
494 | #define NEXT_OPCODE goto *opcodeJumpTable[*stack.currentFrame->args.instructionPtr] | |
495 | #else | |
496 | #define BEGIN_OPCODE(opcode) case OP_##opcode | |
497 | #define NEXT_OPCODE continue | |
498 | #endif | |
499 | ||
500 | #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP | |
501 | NEXT_OPCODE; | |
502 | #else | |
503 | switch (*stack.currentFrame->args.instructionPtr) | |
504 | #endif | |
505 | { | |
506 | /* Non-capturing bracket: optimized */ | |
507 | ||
508 | BEGIN_OPCODE(BRA): | |
509 | NON_CAPTURING_BRACKET: | |
510 | DPRINTF(("start bracket 0\n")); | |
511 | do { | |
9dae56ea | 512 | RECURSIVE_MATCH_NEW_GROUP(2, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain); |
b37bf2e1 A |
513 | if (isMatch) |
514 | RRETURN; | |
515 | stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1); | |
516 | } while (*stack.currentFrame->args.instructionPtr == OP_ALT); | |
517 | DPRINTF(("bracket 0 failed\n")); | |
518 | RRETURN; | |
519 | ||
520 | /* Skip over large extraction number data if encountered. */ | |
521 | ||
522 | BEGIN_OPCODE(BRANUMBER): | |
523 | stack.currentFrame->args.instructionPtr += 3; | |
524 | NEXT_OPCODE; | |
525 | ||
526 | /* End of the pattern. */ | |
527 | ||
528 | BEGIN_OPCODE(END): | |
529 | md.endMatchPtr = stack.currentFrame->args.subjectPtr; /* Record where we ended */ | |
530 | md.endOffsetTop = stack.currentFrame->args.offsetTop; /* and how many extracts were taken */ | |
531 | isMatch = true; | |
532 | RRETURN; | |
533 | ||
534 | /* Assertion brackets. Check the alternative branches in turn - the | |
535 | matching won't pass the KET for an assertion. If any one branch matches, | |
536 | the assertion is true. Lookbehind assertions have an OP_REVERSE item at the | |
537 | start of each branch to move the current point backwards, so the code at | |
538 | this level is identical to the lookahead case. */ | |
539 | ||
540 | BEGIN_OPCODE(ASSERT): | |
541 | do { | |
9dae56ea | 542 | RECURSIVE_MATCH_NEW_GROUP(6, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, NULL); |
b37bf2e1 A |
543 | if (isMatch) |
544 | break; | |
545 | stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1); | |
546 | } while (*stack.currentFrame->args.instructionPtr == OP_ALT); | |
547 | if (*stack.currentFrame->args.instructionPtr == OP_KET) | |
548 | RRETURN_NO_MATCH; | |
549 | ||
550 | /* Continue from after the assertion, updating the offsets high water | |
551 | mark, since extracts may have been taken during the assertion. */ | |
552 | ||
553 | advanceToEndOfBracket(stack.currentFrame->args.instructionPtr); | |
554 | stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE; | |
555 | stack.currentFrame->args.offsetTop = md.endOffsetTop; | |
556 | NEXT_OPCODE; | |
557 | ||
558 | /* Negative assertion: all branches must fail to match */ | |
559 | ||
560 | BEGIN_OPCODE(ASSERT_NOT): | |
561 | do { | |
9dae56ea | 562 | RECURSIVE_MATCH_NEW_GROUP(7, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, NULL); |
b37bf2e1 A |
563 | if (isMatch) |
564 | RRETURN_NO_MATCH; | |
565 | stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1); | |
566 | } while (*stack.currentFrame->args.instructionPtr == OP_ALT); | |
567 | ||
568 | stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE; | |
569 | NEXT_OPCODE; | |
570 | ||
571 | /* An alternation is the end of a branch; scan along to find the end of the | |
572 | bracketed group and go to there. */ | |
573 | ||
574 | BEGIN_OPCODE(ALT): | |
575 | advanceToEndOfBracket(stack.currentFrame->args.instructionPtr); | |
576 | NEXT_OPCODE; | |
577 | ||
578 | /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating | |
579 | that it may occur zero times. It may repeat infinitely, or not at all - | |
580 | i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper | |
581 | repeat limits are compiled as a number of copies, with the optional ones | |
582 | preceded by BRAZERO or BRAMINZERO. */ | |
583 | ||
584 | BEGIN_OPCODE(BRAZERO): { | |
585 | stack.currentFrame->locals.startOfRepeatingBracket = stack.currentFrame->args.instructionPtr + 1; | |
9dae56ea | 586 | RECURSIVE_MATCH_NEW_GROUP(14, stack.currentFrame->locals.startOfRepeatingBracket, stack.currentFrame->args.bracketChain); |
b37bf2e1 A |
587 | if (isMatch) |
588 | RRETURN; | |
589 | advanceToEndOfBracket(stack.currentFrame->locals.startOfRepeatingBracket); | |
590 | stack.currentFrame->args.instructionPtr = stack.currentFrame->locals.startOfRepeatingBracket + 1 + LINK_SIZE; | |
591 | NEXT_OPCODE; | |
592 | } | |
593 | ||
594 | BEGIN_OPCODE(BRAMINZERO): { | |
595 | stack.currentFrame->locals.startOfRepeatingBracket = stack.currentFrame->args.instructionPtr + 1; | |
596 | advanceToEndOfBracket(stack.currentFrame->locals.startOfRepeatingBracket); | |
9dae56ea | 597 | RECURSIVE_MATCH_NEW_GROUP(15, stack.currentFrame->locals.startOfRepeatingBracket + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain); |
b37bf2e1 A |
598 | if (isMatch) |
599 | RRETURN; | |
600 | stack.currentFrame->args.instructionPtr++; | |
601 | NEXT_OPCODE; | |
602 | } | |
603 | ||
604 | /* End of a group, repeated or non-repeating. If we are at the end of | |
605 | an assertion "group", stop matching and return 1, but record the | |
606 | current high water mark for use by positive assertions. Do this also | |
607 | for the "once" (not-backup up) groups. */ | |
608 | ||
609 | BEGIN_OPCODE(KET): | |
610 | BEGIN_OPCODE(KETRMIN): | |
611 | BEGIN_OPCODE(KETRMAX): | |
612 | stack.currentFrame->locals.instructionPtrAtStartOfOnce = stack.currentFrame->args.instructionPtr - getLinkValue(stack.currentFrame->args.instructionPtr + 1); | |
613 | stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.bracketChain->bracketStart; | |
614 | ||
615 | /* Back up the stack of bracket start pointers. */ | |
616 | ||
617 | stack.currentFrame->args.bracketChain = stack.currentFrame->args.bracketChain->previousBracket; | |
618 | ||
619 | if (*stack.currentFrame->locals.instructionPtrAtStartOfOnce == OP_ASSERT || *stack.currentFrame->locals.instructionPtrAtStartOfOnce == OP_ASSERT_NOT) { | |
620 | md.endOffsetTop = stack.currentFrame->args.offsetTop; | |
621 | isMatch = true; | |
622 | RRETURN; | |
623 | } | |
624 | ||
625 | /* In all other cases except a conditional group we have to check the | |
626 | group number back at the start and if necessary complete handling an | |
627 | extraction by setting the offsets and bumping the high water mark. */ | |
628 | ||
629 | stack.currentFrame->locals.number = *stack.currentFrame->locals.instructionPtrAtStartOfOnce - OP_BRA; | |
630 | ||
631 | /* For extended extraction brackets (large number), we have to fish out | |
632 | the number from a dummy opcode at the start. */ | |
633 | ||
634 | if (stack.currentFrame->locals.number > EXTRACT_BASIC_MAX) | |
635 | stack.currentFrame->locals.number = get2ByteValue(stack.currentFrame->locals.instructionPtrAtStartOfOnce + 2 + LINK_SIZE); | |
636 | stack.currentFrame->locals.offset = stack.currentFrame->locals.number << 1; | |
637 | ||
638 | #ifdef DEBUG | |
639 | printf("end bracket %d", stack.currentFrame->locals.number); | |
640 | printf("\n"); | |
641 | #endif | |
642 | ||
643 | /* Test for a numbered group. This includes groups called as a result | |
644 | of recursion. Note that whole-pattern recursion is coded as a recurse | |
645 | into group 0, so it won't be picked up here. Instead, we catch it when | |
646 | the OP_END is reached. */ | |
647 | ||
648 | if (stack.currentFrame->locals.number > 0) { | |
649 | if (stack.currentFrame->locals.offset >= md.offsetMax) | |
650 | md.offsetOverflow = true; | |
651 | else { | |
652 | md.offsetVector[stack.currentFrame->locals.offset] = | |
653 | md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number]; | |
654 | md.offsetVector[stack.currentFrame->locals.offset+1] = stack.currentFrame->args.subjectPtr - md.startSubject; | |
655 | if (stack.currentFrame->args.offsetTop <= stack.currentFrame->locals.offset) | |
656 | stack.currentFrame->args.offsetTop = stack.currentFrame->locals.offset + 2; | |
657 | } | |
658 | } | |
659 | ||
660 | /* For a non-repeating ket, just continue at this level. This also | |
661 | happens for a repeating ket if no characters were matched in the group. | |
662 | This is the forcible breaking of infinite loops as implemented in Perl | |
663 | 5.005. If there is an options reset, it will get obeyed in the normal | |
664 | course of events. */ | |
665 | ||
666 | if (*stack.currentFrame->args.instructionPtr == OP_KET || stack.currentFrame->args.subjectPtr == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) { | |
667 | stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE; | |
668 | NEXT_OPCODE; | |
669 | } | |
670 | ||
671 | /* The repeating kets try the rest of the pattern or restart from the | |
672 | preceding bracket, in the appropriate order. */ | |
673 | ||
674 | if (*stack.currentFrame->args.instructionPtr == OP_KETRMIN) { | |
675 | RECURSIVE_MATCH(16, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain); | |
676 | if (isMatch) | |
677 | RRETURN; | |
9dae56ea | 678 | RECURSIVE_MATCH_NEW_GROUP(17, stack.currentFrame->locals.instructionPtrAtStartOfOnce, stack.currentFrame->args.bracketChain); |
b37bf2e1 A |
679 | if (isMatch) |
680 | RRETURN; | |
681 | } else { /* OP_KETRMAX */ | |
9dae56ea | 682 | RECURSIVE_MATCH_NEW_GROUP(18, stack.currentFrame->locals.instructionPtrAtStartOfOnce, stack.currentFrame->args.bracketChain); |
b37bf2e1 A |
683 | if (isMatch) |
684 | RRETURN; | |
685 | RECURSIVE_MATCH(19, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain); | |
686 | if (isMatch) | |
687 | RRETURN; | |
688 | } | |
689 | RRETURN; | |
690 | ||
9dae56ea A |
691 | /* Start of subject. */ |
692 | ||
b37bf2e1 | 693 | BEGIN_OPCODE(CIRC): |
9dae56ea | 694 | if (stack.currentFrame->args.subjectPtr != md.startSubject) |
b37bf2e1 A |
695 | RRETURN_NO_MATCH; |
696 | stack.currentFrame->args.instructionPtr++; | |
697 | NEXT_OPCODE; | |
9dae56ea A |
698 | |
699 | /* After internal newline if multiline. */ | |
700 | ||
701 | BEGIN_OPCODE(BOL): | |
702 | if (stack.currentFrame->args.subjectPtr != md.startSubject && !isNewline(stack.currentFrame->args.subjectPtr[-1])) | |
703 | RRETURN_NO_MATCH; | |
704 | stack.currentFrame->args.instructionPtr++; | |
705 | NEXT_OPCODE; | |
706 | ||
707 | /* End of subject. */ | |
708 | ||
b37bf2e1 | 709 | BEGIN_OPCODE(DOLL): |
9dae56ea A |
710 | if (stack.currentFrame->args.subjectPtr < md.endSubject) |
711 | RRETURN_NO_MATCH; | |
712 | stack.currentFrame->args.instructionPtr++; | |
713 | NEXT_OPCODE; | |
714 | ||
715 | /* Before internal newline if multiline. */ | |
716 | ||
717 | BEGIN_OPCODE(EOL): | |
718 | if (stack.currentFrame->args.subjectPtr < md.endSubject && !isNewline(*stack.currentFrame->args.subjectPtr)) | |
b37bf2e1 A |
719 | RRETURN_NO_MATCH; |
720 | stack.currentFrame->args.instructionPtr++; | |
721 | NEXT_OPCODE; | |
722 | ||
723 | /* Word boundary assertions */ | |
724 | ||
725 | BEGIN_OPCODE(NOT_WORD_BOUNDARY): | |
726 | BEGIN_OPCODE(WORD_BOUNDARY): { | |
727 | bool currentCharIsWordChar = false; | |
728 | bool previousCharIsWordChar = false; | |
729 | ||
730 | if (stack.currentFrame->args.subjectPtr > md.startSubject) | |
731 | previousCharIsWordChar = isWordChar(stack.currentFrame->args.subjectPtr[-1]); | |
732 | if (stack.currentFrame->args.subjectPtr < md.endSubject) | |
733 | currentCharIsWordChar = isWordChar(*stack.currentFrame->args.subjectPtr); | |
734 | ||
735 | /* Now see if the situation is what we want */ | |
736 | bool wordBoundaryDesired = (*stack.currentFrame->args.instructionPtr++ == OP_WORD_BOUNDARY); | |
737 | if (wordBoundaryDesired ? currentCharIsWordChar == previousCharIsWordChar : currentCharIsWordChar != previousCharIsWordChar) | |
738 | RRETURN_NO_MATCH; | |
739 | NEXT_OPCODE; | |
740 | } | |
741 | ||
742 | /* Match a single character type; inline for speed */ | |
743 | ||
744 | BEGIN_OPCODE(NOT_NEWLINE): | |
745 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
746 | RRETURN_NO_MATCH; | |
747 | if (isNewline(*stack.currentFrame->args.subjectPtr++)) | |
748 | RRETURN_NO_MATCH; | |
749 | stack.currentFrame->args.instructionPtr++; | |
750 | NEXT_OPCODE; | |
751 | ||
752 | BEGIN_OPCODE(NOT_DIGIT): | |
753 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
754 | RRETURN_NO_MATCH; | |
755 | if (isASCIIDigit(*stack.currentFrame->args.subjectPtr++)) | |
756 | RRETURN_NO_MATCH; | |
757 | stack.currentFrame->args.instructionPtr++; | |
758 | NEXT_OPCODE; | |
759 | ||
760 | BEGIN_OPCODE(DIGIT): | |
761 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
762 | RRETURN_NO_MATCH; | |
763 | if (!isASCIIDigit(*stack.currentFrame->args.subjectPtr++)) | |
764 | RRETURN_NO_MATCH; | |
765 | stack.currentFrame->args.instructionPtr++; | |
766 | NEXT_OPCODE; | |
767 | ||
768 | BEGIN_OPCODE(NOT_WHITESPACE): | |
769 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
770 | RRETURN_NO_MATCH; | |
771 | if (isSpaceChar(*stack.currentFrame->args.subjectPtr++)) | |
772 | RRETURN_NO_MATCH; | |
773 | stack.currentFrame->args.instructionPtr++; | |
774 | NEXT_OPCODE; | |
775 | ||
776 | BEGIN_OPCODE(WHITESPACE): | |
777 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
778 | RRETURN_NO_MATCH; | |
779 | if (!isSpaceChar(*stack.currentFrame->args.subjectPtr++)) | |
780 | RRETURN_NO_MATCH; | |
781 | stack.currentFrame->args.instructionPtr++; | |
782 | NEXT_OPCODE; | |
783 | ||
784 | BEGIN_OPCODE(NOT_WORDCHAR): | |
785 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
786 | RRETURN_NO_MATCH; | |
787 | if (isWordChar(*stack.currentFrame->args.subjectPtr++)) | |
788 | RRETURN_NO_MATCH; | |
789 | stack.currentFrame->args.instructionPtr++; | |
790 | NEXT_OPCODE; | |
791 | ||
792 | BEGIN_OPCODE(WORDCHAR): | |
793 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
794 | RRETURN_NO_MATCH; | |
795 | if (!isWordChar(*stack.currentFrame->args.subjectPtr++)) | |
796 | RRETURN_NO_MATCH; | |
797 | stack.currentFrame->args.instructionPtr++; | |
798 | NEXT_OPCODE; | |
799 | ||
800 | /* Match a back reference, possibly repeatedly. Look past the end of the | |
801 | item to see if there is repeat information following. The code is similar | |
802 | to that for character classes, but repeated for efficiency. Then obey | |
803 | similar code to character type repeats - written out again for speed. | |
804 | However, if the referenced string is the empty string, always treat | |
805 | it as matched, any number of times (otherwise there could be infinite | |
806 | loops). */ | |
807 | ||
808 | BEGIN_OPCODE(REF): | |
809 | stack.currentFrame->locals.offset = get2ByteValue(stack.currentFrame->args.instructionPtr + 1) << 1; /* Doubled ref number */ | |
810 | stack.currentFrame->args.instructionPtr += 3; /* Advance past item */ | |
811 | ||
812 | /* If the reference is unset, set the length to be longer than the amount | |
813 | of subject left; this ensures that every attempt at a match fails. We | |
814 | can't just fail here, because of the possibility of quantifiers with zero | |
815 | minima. */ | |
816 | ||
817 | if (stack.currentFrame->locals.offset >= stack.currentFrame->args.offsetTop || md.offsetVector[stack.currentFrame->locals.offset] < 0) | |
818 | stack.currentFrame->locals.length = 0; | |
819 | else | |
820 | stack.currentFrame->locals.length = md.offsetVector[stack.currentFrame->locals.offset+1] - md.offsetVector[stack.currentFrame->locals.offset]; | |
821 | ||
822 | /* Set up for repetition, or handle the non-repeated case */ | |
823 | ||
824 | switch (*stack.currentFrame->args.instructionPtr) { | |
825 | case OP_CRSTAR: | |
826 | case OP_CRMINSTAR: | |
827 | case OP_CRPLUS: | |
828 | case OP_CRMINPLUS: | |
829 | case OP_CRQUERY: | |
830 | case OP_CRMINQUERY: | |
831 | repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max); | |
832 | break; | |
833 | ||
834 | case OP_CRRANGE: | |
835 | case OP_CRMINRANGE: | |
836 | minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE); | |
837 | min = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); | |
838 | stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 3); | |
839 | if (stack.currentFrame->locals.max == 0) | |
840 | stack.currentFrame->locals.max = INT_MAX; | |
841 | stack.currentFrame->args.instructionPtr += 5; | |
842 | break; | |
843 | ||
844 | default: /* No repeat follows */ | |
845 | if (!matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md)) | |
846 | RRETURN_NO_MATCH; | |
847 | stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length; | |
848 | NEXT_OPCODE; | |
849 | } | |
850 | ||
851 | /* If the length of the reference is zero, just continue with the | |
852 | main loop. */ | |
853 | ||
854 | if (stack.currentFrame->locals.length == 0) | |
855 | NEXT_OPCODE; | |
856 | ||
857 | /* First, ensure the minimum number of matches are present. */ | |
858 | ||
859 | for (int i = 1; i <= min; i++) { | |
860 | if (!matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md)) | |
861 | RRETURN_NO_MATCH; | |
862 | stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length; | |
863 | } | |
864 | ||
865 | /* If min = max, continue at the same level without recursion. | |
866 | They are not both allowed to be zero. */ | |
867 | ||
868 | if (min == stack.currentFrame->locals.max) | |
869 | NEXT_OPCODE; | |
870 | ||
871 | /* If minimizing, keep trying and advancing the pointer */ | |
872 | ||
873 | if (minimize) { | |
874 | for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { | |
875 | RECURSIVE_MATCH(20, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
876 | if (isMatch) | |
877 | RRETURN; | |
878 | if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || !matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md)) | |
879 | RRETURN; | |
880 | stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length; | |
881 | } | |
882 | /* Control never reaches here */ | |
883 | } | |
884 | ||
885 | /* If maximizing, find the longest string and work backwards */ | |
886 | ||
887 | else { | |
888 | stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; | |
889 | for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
890 | if (!matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md)) | |
891 | break; | |
892 | stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length; | |
893 | } | |
894 | while (stack.currentFrame->args.subjectPtr >= stack.currentFrame->locals.subjectPtrAtStartOfInstruction) { | |
895 | RECURSIVE_MATCH(21, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
896 | if (isMatch) | |
897 | RRETURN; | |
898 | stack.currentFrame->args.subjectPtr -= stack.currentFrame->locals.length; | |
899 | } | |
900 | RRETURN_NO_MATCH; | |
901 | } | |
902 | /* Control never reaches here */ | |
903 | ||
904 | /* Match a bit-mapped character class, possibly repeatedly. This op code is | |
905 | used when all the characters in the class have values in the range 0-255, | |
906 | and either the matching is caseful, or the characters are in the range | |
907 | 0-127 when UTF-8 processing is enabled. The only difference between | |
908 | OP_CLASS and OP_NCLASS occurs when a data character outside the range is | |
909 | encountered. | |
910 | ||
911 | First, look past the end of the item to see if there is repeat information | |
912 | following. Then obey similar code to character type repeats - written out | |
913 | again for speed. */ | |
914 | ||
915 | BEGIN_OPCODE(NCLASS): | |
916 | BEGIN_OPCODE(CLASS): | |
917 | stack.currentFrame->locals.data = stack.currentFrame->args.instructionPtr + 1; /* Save for matching */ | |
918 | stack.currentFrame->args.instructionPtr += 33; /* Advance past the item */ | |
919 | ||
920 | switch (*stack.currentFrame->args.instructionPtr) { | |
921 | case OP_CRSTAR: | |
922 | case OP_CRMINSTAR: | |
923 | case OP_CRPLUS: | |
924 | case OP_CRMINPLUS: | |
925 | case OP_CRQUERY: | |
926 | case OP_CRMINQUERY: | |
927 | repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max); | |
928 | break; | |
929 | ||
930 | case OP_CRRANGE: | |
931 | case OP_CRMINRANGE: | |
932 | minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE); | |
933 | min = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); | |
934 | stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 3); | |
935 | if (stack.currentFrame->locals.max == 0) | |
936 | stack.currentFrame->locals.max = INT_MAX; | |
937 | stack.currentFrame->args.instructionPtr += 5; | |
938 | break; | |
939 | ||
940 | default: /* No repeat follows */ | |
941 | min = stack.currentFrame->locals.max = 1; | |
942 | break; | |
943 | } | |
944 | ||
945 | /* First, ensure the minimum number of matches are present. */ | |
946 | ||
947 | for (int i = 1; i <= min; i++) { | |
948 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
949 | RRETURN_NO_MATCH; | |
950 | int c = *stack.currentFrame->args.subjectPtr++; | |
951 | if (c > 255) { | |
952 | if (stack.currentFrame->locals.data[-1] == OP_CLASS) | |
953 | RRETURN_NO_MATCH; | |
954 | } else { | |
955 | if (!(stack.currentFrame->locals.data[c / 8] & (1 << (c & 7)))) | |
956 | RRETURN_NO_MATCH; | |
957 | } | |
958 | } | |
959 | ||
960 | /* If max == min we can continue with the main loop without the | |
961 | need to recurse. */ | |
962 | ||
963 | if (min == stack.currentFrame->locals.max) | |
964 | NEXT_OPCODE; | |
965 | ||
966 | /* If minimizing, keep testing the rest of the expression and advancing | |
967 | the pointer while it matches the class. */ | |
968 | if (minimize) { | |
969 | for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { | |
970 | RECURSIVE_MATCH(22, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
971 | if (isMatch) | |
972 | RRETURN; | |
973 | if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject) | |
974 | RRETURN; | |
975 | int c = *stack.currentFrame->args.subjectPtr++; | |
976 | if (c > 255) { | |
977 | if (stack.currentFrame->locals.data[-1] == OP_CLASS) | |
978 | RRETURN; | |
979 | } else { | |
980 | if ((stack.currentFrame->locals.data[c/8] & (1 << (c&7))) == 0) | |
981 | RRETURN; | |
982 | } | |
983 | } | |
984 | /* Control never reaches here */ | |
985 | } | |
986 | /* If maximizing, find the longest possible run, then work backwards. */ | |
987 | else { | |
988 | stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; | |
989 | ||
990 | for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
991 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
992 | break; | |
993 | int c = *stack.currentFrame->args.subjectPtr; | |
994 | if (c > 255) { | |
995 | if (stack.currentFrame->locals.data[-1] == OP_CLASS) | |
996 | break; | |
997 | } else { | |
998 | if (!(stack.currentFrame->locals.data[c / 8] & (1 << (c & 7)))) | |
999 | break; | |
1000 | } | |
1001 | ++stack.currentFrame->args.subjectPtr; | |
1002 | } | |
1003 | for (;;) { | |
1004 | RECURSIVE_MATCH(24, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
1005 | if (isMatch) | |
1006 | RRETURN; | |
1007 | if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) | |
1008 | break; /* Stop if tried at original pos */ | |
1009 | } | |
1010 | ||
1011 | RRETURN; | |
1012 | } | |
1013 | /* Control never reaches here */ | |
1014 | ||
1015 | /* Match an extended character class. */ | |
1016 | ||
1017 | BEGIN_OPCODE(XCLASS): | |
1018 | stack.currentFrame->locals.data = stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE; /* Save for matching */ | |
1019 | stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1); /* Advance past the item */ | |
1020 | ||
1021 | switch (*stack.currentFrame->args.instructionPtr) { | |
1022 | case OP_CRSTAR: | |
1023 | case OP_CRMINSTAR: | |
1024 | case OP_CRPLUS: | |
1025 | case OP_CRMINPLUS: | |
1026 | case OP_CRQUERY: | |
1027 | case OP_CRMINQUERY: | |
1028 | repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max); | |
1029 | break; | |
1030 | ||
1031 | case OP_CRRANGE: | |
1032 | case OP_CRMINRANGE: | |
1033 | minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE); | |
1034 | min = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); | |
1035 | stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 3); | |
1036 | if (stack.currentFrame->locals.max == 0) | |
1037 | stack.currentFrame->locals.max = INT_MAX; | |
1038 | stack.currentFrame->args.instructionPtr += 5; | |
1039 | break; | |
1040 | ||
1041 | default: /* No repeat follows */ | |
1042 | min = stack.currentFrame->locals.max = 1; | |
1043 | } | |
1044 | ||
1045 | /* First, ensure the minimum number of matches are present. */ | |
1046 | ||
1047 | for (int i = 1; i <= min; i++) { | |
1048 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1049 | RRETURN_NO_MATCH; | |
1050 | int c = *stack.currentFrame->args.subjectPtr++; | |
9dae56ea | 1051 | if (!jsc_pcre_xclass(c, stack.currentFrame->locals.data)) |
b37bf2e1 A |
1052 | RRETURN_NO_MATCH; |
1053 | } | |
1054 | ||
1055 | /* If max == min we can continue with the main loop without the | |
1056 | need to recurse. */ | |
1057 | ||
1058 | if (min == stack.currentFrame->locals.max) | |
1059 | NEXT_OPCODE; | |
1060 | ||
1061 | /* If minimizing, keep testing the rest of the expression and advancing | |
1062 | the pointer while it matches the class. */ | |
1063 | ||
1064 | if (minimize) { | |
1065 | for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { | |
1066 | RECURSIVE_MATCH(26, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
1067 | if (isMatch) | |
1068 | RRETURN; | |
1069 | if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1070 | RRETURN; | |
1071 | int c = *stack.currentFrame->args.subjectPtr++; | |
9dae56ea | 1072 | if (!jsc_pcre_xclass(c, stack.currentFrame->locals.data)) |
b37bf2e1 A |
1073 | RRETURN; |
1074 | } | |
1075 | /* Control never reaches here */ | |
1076 | } | |
1077 | ||
1078 | /* If maximizing, find the longest possible run, then work backwards. */ | |
1079 | ||
1080 | else { | |
1081 | stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; | |
1082 | for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
1083 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1084 | break; | |
1085 | int c = *stack.currentFrame->args.subjectPtr; | |
9dae56ea | 1086 | if (!jsc_pcre_xclass(c, stack.currentFrame->locals.data)) |
b37bf2e1 A |
1087 | break; |
1088 | ++stack.currentFrame->args.subjectPtr; | |
1089 | } | |
1090 | for(;;) { | |
1091 | RECURSIVE_MATCH(27, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
1092 | if (isMatch) | |
1093 | RRETURN; | |
1094 | if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) | |
1095 | break; /* Stop if tried at original pos */ | |
1096 | } | |
1097 | RRETURN; | |
1098 | } | |
1099 | ||
1100 | /* Control never reaches here */ | |
1101 | ||
1102 | /* Match a single character, casefully */ | |
1103 | ||
1104 | BEGIN_OPCODE(CHAR): | |
1105 | stack.currentFrame->locals.length = 1; | |
1106 | stack.currentFrame->args.instructionPtr++; | |
1107 | getUTF8CharAndIncrementLength(stack.currentFrame->locals.fc, stack.currentFrame->args.instructionPtr, stack.currentFrame->locals.length); | |
1108 | stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.length; | |
1109 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1110 | RRETURN_NO_MATCH; | |
1111 | if (stack.currentFrame->locals.fc != *stack.currentFrame->args.subjectPtr++) | |
1112 | RRETURN_NO_MATCH; | |
1113 | NEXT_OPCODE; | |
1114 | ||
1115 | /* Match a single character, caselessly */ | |
1116 | ||
1117 | BEGIN_OPCODE(CHAR_IGNORING_CASE): { | |
1118 | stack.currentFrame->locals.length = 1; | |
1119 | stack.currentFrame->args.instructionPtr++; | |
1120 | getUTF8CharAndIncrementLength(stack.currentFrame->locals.fc, stack.currentFrame->args.instructionPtr, stack.currentFrame->locals.length); | |
1121 | stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.length; | |
1122 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1123 | RRETURN_NO_MATCH; | |
1124 | int dc = *stack.currentFrame->args.subjectPtr++; | |
9dae56ea | 1125 | if (stack.currentFrame->locals.fc != dc && jsc_pcre_ucp_othercase(stack.currentFrame->locals.fc) != dc) |
b37bf2e1 A |
1126 | RRETURN_NO_MATCH; |
1127 | NEXT_OPCODE; | |
1128 | } | |
1129 | ||
1130 | /* Match a single ASCII character. */ | |
1131 | ||
1132 | BEGIN_OPCODE(ASCII_CHAR): | |
1133 | if (md.endSubject == stack.currentFrame->args.subjectPtr) | |
1134 | RRETURN_NO_MATCH; | |
1135 | if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->args.instructionPtr[1]) | |
1136 | RRETURN_NO_MATCH; | |
1137 | ++stack.currentFrame->args.subjectPtr; | |
1138 | stack.currentFrame->args.instructionPtr += 2; | |
1139 | NEXT_OPCODE; | |
1140 | ||
1141 | /* Match one of two cases of an ASCII letter. */ | |
1142 | ||
1143 | BEGIN_OPCODE(ASCII_LETTER_IGNORING_CASE): | |
1144 | if (md.endSubject == stack.currentFrame->args.subjectPtr) | |
1145 | RRETURN_NO_MATCH; | |
1146 | if ((*stack.currentFrame->args.subjectPtr | 0x20) != stack.currentFrame->args.instructionPtr[1]) | |
1147 | RRETURN_NO_MATCH; | |
1148 | ++stack.currentFrame->args.subjectPtr; | |
1149 | stack.currentFrame->args.instructionPtr += 2; | |
1150 | NEXT_OPCODE; | |
1151 | ||
1152 | /* Match a single character repeatedly; different opcodes share code. */ | |
1153 | ||
1154 | BEGIN_OPCODE(EXACT): | |
1155 | min = stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); | |
1156 | minimize = false; | |
1157 | stack.currentFrame->args.instructionPtr += 3; | |
1158 | goto REPEATCHAR; | |
1159 | ||
1160 | BEGIN_OPCODE(UPTO): | |
1161 | BEGIN_OPCODE(MINUPTO): | |
1162 | min = 0; | |
1163 | stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); | |
1164 | minimize = *stack.currentFrame->args.instructionPtr == OP_MINUPTO; | |
1165 | stack.currentFrame->args.instructionPtr += 3; | |
1166 | goto REPEATCHAR; | |
1167 | ||
1168 | BEGIN_OPCODE(STAR): | |
1169 | BEGIN_OPCODE(MINSTAR): | |
1170 | BEGIN_OPCODE(PLUS): | |
1171 | BEGIN_OPCODE(MINPLUS): | |
1172 | BEGIN_OPCODE(QUERY): | |
1173 | BEGIN_OPCODE(MINQUERY): | |
1174 | repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_STAR, minimize, min, stack.currentFrame->locals.max); | |
1175 | ||
1176 | /* Common code for all repeated single-character matches. We can give | |
1177 | up quickly if there are fewer than the minimum number of characters left in | |
1178 | the subject. */ | |
1179 | ||
1180 | REPEATCHAR: | |
1181 | ||
1182 | stack.currentFrame->locals.length = 1; | |
1183 | getUTF8CharAndIncrementLength(stack.currentFrame->locals.fc, stack.currentFrame->args.instructionPtr, stack.currentFrame->locals.length); | |
1184 | if (min * (stack.currentFrame->locals.fc > 0xFFFF ? 2 : 1) > md.endSubject - stack.currentFrame->args.subjectPtr) | |
1185 | RRETURN_NO_MATCH; | |
1186 | stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.length; | |
1187 | ||
1188 | if (stack.currentFrame->locals.fc <= 0xFFFF) { | |
9dae56ea | 1189 | int othercase = md.ignoreCase ? jsc_pcre_ucp_othercase(stack.currentFrame->locals.fc) : -1; |
b37bf2e1 A |
1190 | |
1191 | for (int i = 1; i <= min; i++) { | |
1192 | if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc && *stack.currentFrame->args.subjectPtr != othercase) | |
1193 | RRETURN_NO_MATCH; | |
1194 | ++stack.currentFrame->args.subjectPtr; | |
1195 | } | |
1196 | ||
1197 | if (min == stack.currentFrame->locals.max) | |
1198 | NEXT_OPCODE; | |
1199 | ||
1200 | if (minimize) { | |
1201 | stack.currentFrame->locals.repeatOthercase = othercase; | |
1202 | for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { | |
1203 | RECURSIVE_MATCH(28, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
1204 | if (isMatch) | |
1205 | RRETURN; | |
1206 | if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1207 | RRETURN; | |
1208 | if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc && *stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.repeatOthercase) | |
1209 | RRETURN; | |
1210 | ++stack.currentFrame->args.subjectPtr; | |
1211 | } | |
1212 | /* Control never reaches here */ | |
1213 | } else { | |
1214 | stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; | |
1215 | for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
1216 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1217 | break; | |
1218 | if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc && *stack.currentFrame->args.subjectPtr != othercase) | |
1219 | break; | |
1220 | ++stack.currentFrame->args.subjectPtr; | |
1221 | } | |
1222 | while (stack.currentFrame->args.subjectPtr >= stack.currentFrame->locals.subjectPtrAtStartOfInstruction) { | |
1223 | RECURSIVE_MATCH(29, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
1224 | if (isMatch) | |
1225 | RRETURN; | |
1226 | --stack.currentFrame->args.subjectPtr; | |
1227 | } | |
1228 | RRETURN_NO_MATCH; | |
1229 | } | |
1230 | /* Control never reaches here */ | |
1231 | } else { | |
1232 | /* No case on surrogate pairs, so no need to bother with "othercase". */ | |
1233 | ||
1234 | for (int i = 1; i <= min; i++) { | |
1235 | if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc) | |
1236 | RRETURN_NO_MATCH; | |
1237 | stack.currentFrame->args.subjectPtr += 2; | |
1238 | } | |
1239 | ||
1240 | if (min == stack.currentFrame->locals.max) | |
1241 | NEXT_OPCODE; | |
1242 | ||
1243 | if (minimize) { | |
1244 | for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { | |
1245 | RECURSIVE_MATCH(30, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
1246 | if (isMatch) | |
1247 | RRETURN; | |
1248 | if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1249 | RRETURN; | |
1250 | if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc) | |
1251 | RRETURN; | |
1252 | stack.currentFrame->args.subjectPtr += 2; | |
1253 | } | |
1254 | /* Control never reaches here */ | |
1255 | } else { | |
1256 | stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; | |
1257 | for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
1258 | if (stack.currentFrame->args.subjectPtr > md.endSubject - 2) | |
1259 | break; | |
1260 | if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc) | |
1261 | break; | |
1262 | stack.currentFrame->args.subjectPtr += 2; | |
1263 | } | |
1264 | while (stack.currentFrame->args.subjectPtr >= stack.currentFrame->locals.subjectPtrAtStartOfInstruction) { | |
1265 | RECURSIVE_MATCH(31, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
1266 | if (isMatch) | |
1267 | RRETURN; | |
1268 | stack.currentFrame->args.subjectPtr -= 2; | |
1269 | } | |
1270 | RRETURN_NO_MATCH; | |
1271 | } | |
1272 | /* Control never reaches here */ | |
1273 | } | |
1274 | /* Control never reaches here */ | |
1275 | ||
1276 | /* Match a negated single one-byte character. */ | |
1277 | ||
1278 | BEGIN_OPCODE(NOT): { | |
1279 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1280 | RRETURN_NO_MATCH; | |
9dae56ea | 1281 | int b = stack.currentFrame->args.instructionPtr[1]; |
b37bf2e1 | 1282 | int c = *stack.currentFrame->args.subjectPtr++; |
9dae56ea | 1283 | stack.currentFrame->args.instructionPtr += 2; |
b37bf2e1 A |
1284 | if (md.ignoreCase) { |
1285 | if (c < 128) | |
1286 | c = toLowerCase(c); | |
9dae56ea | 1287 | if (toLowerCase(b) == c) |
b37bf2e1 A |
1288 | RRETURN_NO_MATCH; |
1289 | } else { | |
9dae56ea | 1290 | if (b == c) |
b37bf2e1 A |
1291 | RRETURN_NO_MATCH; |
1292 | } | |
1293 | NEXT_OPCODE; | |
1294 | } | |
1295 | ||
1296 | /* Match a negated single one-byte character repeatedly. This is almost a | |
1297 | repeat of the code for a repeated single character, but I haven't found a | |
1298 | nice way of commoning these up that doesn't require a test of the | |
1299 | positive/negative option for each character match. Maybe that wouldn't add | |
1300 | very much to the time taken, but character matching *is* what this is all | |
1301 | about... */ | |
1302 | ||
1303 | BEGIN_OPCODE(NOTEXACT): | |
1304 | min = stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); | |
1305 | minimize = false; | |
1306 | stack.currentFrame->args.instructionPtr += 3; | |
1307 | goto REPEATNOTCHAR; | |
1308 | ||
1309 | BEGIN_OPCODE(NOTUPTO): | |
1310 | BEGIN_OPCODE(NOTMINUPTO): | |
1311 | min = 0; | |
1312 | stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); | |
1313 | minimize = *stack.currentFrame->args.instructionPtr == OP_NOTMINUPTO; | |
1314 | stack.currentFrame->args.instructionPtr += 3; | |
1315 | goto REPEATNOTCHAR; | |
1316 | ||
1317 | BEGIN_OPCODE(NOTSTAR): | |
1318 | BEGIN_OPCODE(NOTMINSTAR): | |
1319 | BEGIN_OPCODE(NOTPLUS): | |
1320 | BEGIN_OPCODE(NOTMINPLUS): | |
1321 | BEGIN_OPCODE(NOTQUERY): | |
1322 | BEGIN_OPCODE(NOTMINQUERY): | |
1323 | repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_NOTSTAR, minimize, min, stack.currentFrame->locals.max); | |
1324 | ||
1325 | /* Common code for all repeated single-byte matches. We can give up quickly | |
1326 | if there are fewer than the minimum number of bytes left in the | |
1327 | subject. */ | |
1328 | ||
1329 | REPEATNOTCHAR: | |
1330 | if (min > md.endSubject - stack.currentFrame->args.subjectPtr) | |
1331 | RRETURN_NO_MATCH; | |
1332 | stack.currentFrame->locals.fc = *stack.currentFrame->args.instructionPtr++; | |
1333 | ||
1334 | /* The code is duplicated for the caseless and caseful cases, for speed, | |
1335 | since matching characters is likely to be quite common. First, ensure the | |
1336 | minimum number of matches are present. If min = max, continue at the same | |
1337 | level without recursing. Otherwise, if minimizing, keep trying the rest of | |
1338 | the expression and advancing one matching character if failing, up to the | |
1339 | maximum. Alternatively, if maximizing, find the maximum number of | |
1340 | characters and work backwards. */ | |
1341 | ||
1342 | DPRINTF(("negative matching %c{%d,%d}\n", stack.currentFrame->locals.fc, min, stack.currentFrame->locals.max)); | |
1343 | ||
1344 | if (md.ignoreCase) { | |
1345 | if (stack.currentFrame->locals.fc < 128) | |
1346 | stack.currentFrame->locals.fc = toLowerCase(stack.currentFrame->locals.fc); | |
1347 | ||
1348 | for (int i = 1; i <= min; i++) { | |
1349 | int d = *stack.currentFrame->args.subjectPtr++; | |
1350 | if (d < 128) | |
1351 | d = toLowerCase(d); | |
1352 | if (stack.currentFrame->locals.fc == d) | |
1353 | RRETURN_NO_MATCH; | |
1354 | } | |
1355 | ||
1356 | if (min == stack.currentFrame->locals.max) | |
1357 | NEXT_OPCODE; | |
1358 | ||
1359 | if (minimize) { | |
1360 | for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { | |
1361 | RECURSIVE_MATCH(38, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
1362 | if (isMatch) | |
1363 | RRETURN; | |
1364 | int d = *stack.currentFrame->args.subjectPtr++; | |
1365 | if (d < 128) | |
1366 | d = toLowerCase(d); | |
1367 | if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject || stack.currentFrame->locals.fc == d) | |
1368 | RRETURN; | |
1369 | } | |
1370 | /* Control never reaches here */ | |
1371 | } | |
1372 | ||
1373 | /* Maximize case */ | |
1374 | ||
1375 | else { | |
1376 | stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; | |
1377 | ||
1378 | for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
1379 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1380 | break; | |
1381 | int d = *stack.currentFrame->args.subjectPtr; | |
1382 | if (d < 128) | |
1383 | d = toLowerCase(d); | |
1384 | if (stack.currentFrame->locals.fc == d) | |
1385 | break; | |
1386 | ++stack.currentFrame->args.subjectPtr; | |
1387 | } | |
1388 | for (;;) { | |
1389 | RECURSIVE_MATCH(40, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
1390 | if (isMatch) | |
1391 | RRETURN; | |
1392 | if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) | |
1393 | break; /* Stop if tried at original pos */ | |
1394 | } | |
1395 | ||
1396 | RRETURN; | |
1397 | } | |
1398 | /* Control never reaches here */ | |
1399 | } | |
1400 | ||
1401 | /* Caseful comparisons */ | |
1402 | ||
1403 | else { | |
1404 | for (int i = 1; i <= min; i++) { | |
1405 | int d = *stack.currentFrame->args.subjectPtr++; | |
1406 | if (stack.currentFrame->locals.fc == d) | |
1407 | RRETURN_NO_MATCH; | |
1408 | } | |
1409 | ||
1410 | if (min == stack.currentFrame->locals.max) | |
1411 | NEXT_OPCODE; | |
1412 | ||
1413 | if (minimize) { | |
1414 | for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { | |
1415 | RECURSIVE_MATCH(42, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
1416 | if (isMatch) | |
1417 | RRETURN; | |
1418 | int d = *stack.currentFrame->args.subjectPtr++; | |
1419 | if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject || stack.currentFrame->locals.fc == d) | |
1420 | RRETURN; | |
1421 | } | |
1422 | /* Control never reaches here */ | |
1423 | } | |
1424 | ||
1425 | /* Maximize case */ | |
1426 | ||
1427 | else { | |
1428 | stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; | |
1429 | ||
1430 | for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
1431 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1432 | break; | |
1433 | int d = *stack.currentFrame->args.subjectPtr; | |
1434 | if (stack.currentFrame->locals.fc == d) | |
1435 | break; | |
1436 | ++stack.currentFrame->args.subjectPtr; | |
1437 | } | |
1438 | for (;;) { | |
1439 | RECURSIVE_MATCH(44, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
1440 | if (isMatch) | |
1441 | RRETURN; | |
1442 | if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) | |
1443 | break; /* Stop if tried at original pos */ | |
1444 | } | |
1445 | ||
1446 | RRETURN; | |
1447 | } | |
1448 | } | |
1449 | /* Control never reaches here */ | |
1450 | ||
1451 | /* Match a single character type repeatedly; several different opcodes | |
1452 | share code. This is very similar to the code for single characters, but we | |
1453 | repeat it in the interests of efficiency. */ | |
1454 | ||
1455 | BEGIN_OPCODE(TYPEEXACT): | |
1456 | min = stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); | |
1457 | minimize = true; | |
1458 | stack.currentFrame->args.instructionPtr += 3; | |
1459 | goto REPEATTYPE; | |
1460 | ||
1461 | BEGIN_OPCODE(TYPEUPTO): | |
1462 | BEGIN_OPCODE(TYPEMINUPTO): | |
1463 | min = 0; | |
1464 | stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); | |
1465 | minimize = *stack.currentFrame->args.instructionPtr == OP_TYPEMINUPTO; | |
1466 | stack.currentFrame->args.instructionPtr += 3; | |
1467 | goto REPEATTYPE; | |
1468 | ||
1469 | BEGIN_OPCODE(TYPESTAR): | |
1470 | BEGIN_OPCODE(TYPEMINSTAR): | |
1471 | BEGIN_OPCODE(TYPEPLUS): | |
1472 | BEGIN_OPCODE(TYPEMINPLUS): | |
1473 | BEGIN_OPCODE(TYPEQUERY): | |
1474 | BEGIN_OPCODE(TYPEMINQUERY): | |
1475 | repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_TYPESTAR, minimize, min, stack.currentFrame->locals.max); | |
1476 | ||
1477 | /* Common code for all repeated single character type matches. Note that | |
1478 | in UTF-8 mode, '.' matches a character of any length, but for the other | |
1479 | character types, the valid characters are all one-byte long. */ | |
1480 | ||
1481 | REPEATTYPE: | |
1482 | stack.currentFrame->locals.ctype = *stack.currentFrame->args.instructionPtr++; /* Code for the character type */ | |
1483 | ||
1484 | /* First, ensure the minimum number of matches are present. Use inline | |
1485 | code for maximizing the speed, and do the type test once at the start | |
1486 | (i.e. keep it out of the loop). Also we can test that there are at least | |
1487 | the minimum number of characters before we start. */ | |
1488 | ||
1489 | if (min > md.endSubject - stack.currentFrame->args.subjectPtr) | |
1490 | RRETURN_NO_MATCH; | |
1491 | if (min > 0) { | |
1492 | switch (stack.currentFrame->locals.ctype) { | |
1493 | case OP_NOT_NEWLINE: | |
1494 | for (int i = 1; i <= min; i++) { | |
1495 | if (isNewline(*stack.currentFrame->args.subjectPtr)) | |
1496 | RRETURN_NO_MATCH; | |
1497 | ++stack.currentFrame->args.subjectPtr; | |
1498 | } | |
1499 | break; | |
1500 | ||
1501 | case OP_NOT_DIGIT: | |
1502 | for (int i = 1; i <= min; i++) { | |
1503 | if (isASCIIDigit(*stack.currentFrame->args.subjectPtr)) | |
1504 | RRETURN_NO_MATCH; | |
1505 | ++stack.currentFrame->args.subjectPtr; | |
1506 | } | |
1507 | break; | |
1508 | ||
1509 | case OP_DIGIT: | |
1510 | for (int i = 1; i <= min; i++) { | |
1511 | if (!isASCIIDigit(*stack.currentFrame->args.subjectPtr)) | |
1512 | RRETURN_NO_MATCH; | |
1513 | ++stack.currentFrame->args.subjectPtr; | |
1514 | } | |
1515 | break; | |
1516 | ||
1517 | case OP_NOT_WHITESPACE: | |
1518 | for (int i = 1; i <= min; i++) { | |
1519 | if (isSpaceChar(*stack.currentFrame->args.subjectPtr)) | |
1520 | RRETURN_NO_MATCH; | |
1521 | ++stack.currentFrame->args.subjectPtr; | |
1522 | } | |
1523 | break; | |
1524 | ||
1525 | case OP_WHITESPACE: | |
1526 | for (int i = 1; i <= min; i++) { | |
1527 | if (!isSpaceChar(*stack.currentFrame->args.subjectPtr)) | |
1528 | RRETURN_NO_MATCH; | |
1529 | ++stack.currentFrame->args.subjectPtr; | |
1530 | } | |
1531 | break; | |
1532 | ||
1533 | case OP_NOT_WORDCHAR: | |
1534 | for (int i = 1; i <= min; i++) { | |
1535 | if (isWordChar(*stack.currentFrame->args.subjectPtr)) | |
1536 | RRETURN_NO_MATCH; | |
1537 | ++stack.currentFrame->args.subjectPtr; | |
1538 | } | |
1539 | break; | |
1540 | ||
1541 | case OP_WORDCHAR: | |
1542 | for (int i = 1; i <= min; i++) { | |
1543 | if (!isWordChar(*stack.currentFrame->args.subjectPtr)) | |
1544 | RRETURN_NO_MATCH; | |
1545 | ++stack.currentFrame->args.subjectPtr; | |
1546 | } | |
1547 | break; | |
1548 | ||
1549 | default: | |
1550 | ASSERT_NOT_REACHED(); | |
1551 | return matchError(JSRegExpErrorInternal, stack); | |
1552 | } /* End switch(stack.currentFrame->locals.ctype) */ | |
1553 | } | |
1554 | ||
1555 | /* If min = max, continue at the same level without recursing */ | |
1556 | ||
1557 | if (min == stack.currentFrame->locals.max) | |
1558 | NEXT_OPCODE; | |
1559 | ||
1560 | /* If minimizing, we have to test the rest of the pattern before each | |
1561 | subsequent match. */ | |
1562 | ||
1563 | if (minimize) { | |
1564 | for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { | |
1565 | RECURSIVE_MATCH(48, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
1566 | if (isMatch) | |
1567 | RRETURN; | |
1568 | if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1569 | RRETURN; | |
1570 | ||
1571 | int c = *stack.currentFrame->args.subjectPtr++; | |
1572 | switch (stack.currentFrame->locals.ctype) { | |
1573 | case OP_NOT_NEWLINE: | |
1574 | if (isNewline(c)) | |
1575 | RRETURN; | |
1576 | break; | |
1577 | ||
1578 | case OP_NOT_DIGIT: | |
1579 | if (isASCIIDigit(c)) | |
1580 | RRETURN; | |
1581 | break; | |
1582 | ||
1583 | case OP_DIGIT: | |
1584 | if (!isASCIIDigit(c)) | |
1585 | RRETURN; | |
1586 | break; | |
1587 | ||
1588 | case OP_NOT_WHITESPACE: | |
1589 | if (isSpaceChar(c)) | |
1590 | RRETURN; | |
1591 | break; | |
1592 | ||
1593 | case OP_WHITESPACE: | |
1594 | if (!isSpaceChar(c)) | |
1595 | RRETURN; | |
1596 | break; | |
1597 | ||
1598 | case OP_NOT_WORDCHAR: | |
1599 | if (isWordChar(c)) | |
1600 | RRETURN; | |
1601 | break; | |
1602 | ||
1603 | case OP_WORDCHAR: | |
1604 | if (!isWordChar(c)) | |
1605 | RRETURN; | |
1606 | break; | |
1607 | ||
1608 | default: | |
1609 | ASSERT_NOT_REACHED(); | |
1610 | return matchError(JSRegExpErrorInternal, stack); | |
1611 | } | |
1612 | } | |
1613 | /* Control never reaches here */ | |
1614 | } | |
1615 | ||
1616 | /* If maximizing it is worth using inline code for speed, doing the type | |
1617 | test once at the start (i.e. keep it out of the loop). */ | |
1618 | ||
1619 | else { | |
1620 | stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; /* Remember where we started */ | |
1621 | ||
1622 | switch (stack.currentFrame->locals.ctype) { | |
1623 | case OP_NOT_NEWLINE: | |
1624 | for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
1625 | if (stack.currentFrame->args.subjectPtr >= md.endSubject || isNewline(*stack.currentFrame->args.subjectPtr)) | |
1626 | break; | |
1627 | stack.currentFrame->args.subjectPtr++; | |
1628 | } | |
1629 | break; | |
1630 | ||
1631 | case OP_NOT_DIGIT: | |
1632 | for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
1633 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1634 | break; | |
1635 | int c = *stack.currentFrame->args.subjectPtr; | |
1636 | if (isASCIIDigit(c)) | |
1637 | break; | |
1638 | ++stack.currentFrame->args.subjectPtr; | |
1639 | } | |
1640 | break; | |
1641 | ||
1642 | case OP_DIGIT: | |
1643 | for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
1644 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1645 | break; | |
1646 | int c = *stack.currentFrame->args.subjectPtr; | |
1647 | if (!isASCIIDigit(c)) | |
1648 | break; | |
1649 | ++stack.currentFrame->args.subjectPtr; | |
1650 | } | |
1651 | break; | |
1652 | ||
1653 | case OP_NOT_WHITESPACE: | |
1654 | for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
1655 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1656 | break; | |
1657 | int c = *stack.currentFrame->args.subjectPtr; | |
1658 | if (isSpaceChar(c)) | |
1659 | break; | |
1660 | ++stack.currentFrame->args.subjectPtr; | |
1661 | } | |
1662 | break; | |
1663 | ||
1664 | case OP_WHITESPACE: | |
1665 | for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
1666 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1667 | break; | |
1668 | int c = *stack.currentFrame->args.subjectPtr; | |
1669 | if (!isSpaceChar(c)) | |
1670 | break; | |
1671 | ++stack.currentFrame->args.subjectPtr; | |
1672 | } | |
1673 | break; | |
1674 | ||
1675 | case OP_NOT_WORDCHAR: | |
1676 | for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
1677 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1678 | break; | |
1679 | int c = *stack.currentFrame->args.subjectPtr; | |
1680 | if (isWordChar(c)) | |
1681 | break; | |
1682 | ++stack.currentFrame->args.subjectPtr; | |
1683 | } | |
1684 | break; | |
1685 | ||
1686 | case OP_WORDCHAR: | |
1687 | for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
1688 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1689 | break; | |
1690 | int c = *stack.currentFrame->args.subjectPtr; | |
1691 | if (!isWordChar(c)) | |
1692 | break; | |
1693 | ++stack.currentFrame->args.subjectPtr; | |
1694 | } | |
1695 | break; | |
1696 | ||
1697 | default: | |
1698 | ASSERT_NOT_REACHED(); | |
1699 | return matchError(JSRegExpErrorInternal, stack); | |
1700 | } | |
1701 | ||
1702 | /* stack.currentFrame->args.subjectPtr is now past the end of the maximum run */ | |
1703 | ||
1704 | for (;;) { | |
1705 | RECURSIVE_MATCH(52, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
1706 | if (isMatch) | |
1707 | RRETURN; | |
1708 | if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) | |
1709 | break; /* Stop if tried at original pos */ | |
1710 | } | |
1711 | ||
1712 | /* Get here if we can't make it match with any permitted repetitions */ | |
1713 | ||
1714 | RRETURN; | |
1715 | } | |
1716 | /* Control never reaches here */ | |
1717 | ||
1718 | BEGIN_OPCODE(CRMINPLUS): | |
1719 | BEGIN_OPCODE(CRMINQUERY): | |
1720 | BEGIN_OPCODE(CRMINRANGE): | |
1721 | BEGIN_OPCODE(CRMINSTAR): | |
1722 | BEGIN_OPCODE(CRPLUS): | |
1723 | BEGIN_OPCODE(CRQUERY): | |
1724 | BEGIN_OPCODE(CRRANGE): | |
1725 | BEGIN_OPCODE(CRSTAR): | |
1726 | ASSERT_NOT_REACHED(); | |
1727 | return matchError(JSRegExpErrorInternal, stack); | |
1728 | ||
1729 | #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP | |
1730 | CAPTURING_BRACKET: | |
1731 | #else | |
1732 | default: | |
1733 | #endif | |
1734 | /* Opening capturing bracket. If there is space in the offset vector, save | |
1735 | the current subject position in the working slot at the top of the vector. We | |
1736 | mustn't change the current values of the data slot, because they may be set | |
1737 | from a previous iteration of this group, and be referred to by a reference | |
1738 | inside the group. | |
1739 | ||
1740 | If the bracket fails to match, we need to restore this value and also the | |
1741 | values of the final offsets, in case they were set by a previous iteration of | |
1742 | the same bracket. | |
1743 | ||
1744 | If there isn't enough space in the offset vector, treat this as if it were a | |
1745 | non-capturing bracket. Don't worry about setting the flag for the error case | |
1746 | here; that is handled in the code for KET. */ | |
1747 | ||
1748 | ASSERT(*stack.currentFrame->args.instructionPtr > OP_BRA); | |
1749 | ||
1750 | stack.currentFrame->locals.number = *stack.currentFrame->args.instructionPtr - OP_BRA; | |
1751 | ||
1752 | /* For extended extraction brackets (large number), we have to fish out the | |
1753 | number from a dummy opcode at the start. */ | |
1754 | ||
1755 | if (stack.currentFrame->locals.number > EXTRACT_BASIC_MAX) | |
1756 | stack.currentFrame->locals.number = get2ByteValue(stack.currentFrame->args.instructionPtr + 2 + LINK_SIZE); | |
1757 | stack.currentFrame->locals.offset = stack.currentFrame->locals.number << 1; | |
1758 | ||
1759 | #ifdef DEBUG | |
1760 | printf("start bracket %d subject=", stack.currentFrame->locals.number); | |
1761 | pchars(stack.currentFrame->args.subjectPtr, 16, true, md); | |
1762 | printf("\n"); | |
1763 | #endif | |
1764 | ||
1765 | if (stack.currentFrame->locals.offset < md.offsetMax) { | |
1766 | stack.currentFrame->locals.saveOffset1 = md.offsetVector[stack.currentFrame->locals.offset]; | |
1767 | stack.currentFrame->locals.saveOffset2 = md.offsetVector[stack.currentFrame->locals.offset + 1]; | |
1768 | stack.currentFrame->locals.saveOffset3 = md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number]; | |
1769 | ||
1770 | DPRINTF(("saving %d %d %d\n", stack.currentFrame->locals.saveOffset1, stack.currentFrame->locals.saveOffset2, stack.currentFrame->locals.saveOffset3)); | |
1771 | md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number] = stack.currentFrame->args.subjectPtr - md.startSubject; | |
1772 | ||
1773 | do { | |
9dae56ea | 1774 | RECURSIVE_MATCH_NEW_GROUP(1, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain); |
b37bf2e1 A |
1775 | if (isMatch) |
1776 | RRETURN; | |
1777 | stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1); | |
1778 | } while (*stack.currentFrame->args.instructionPtr == OP_ALT); | |
1779 | ||
1780 | DPRINTF(("bracket %d failed\n", stack.currentFrame->locals.number)); | |
1781 | ||
1782 | md.offsetVector[stack.currentFrame->locals.offset] = stack.currentFrame->locals.saveOffset1; | |
1783 | md.offsetVector[stack.currentFrame->locals.offset + 1] = stack.currentFrame->locals.saveOffset2; | |
1784 | md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number] = stack.currentFrame->locals.saveOffset3; | |
1785 | ||
1786 | RRETURN; | |
1787 | } | |
1788 | ||
1789 | /* Insufficient room for saving captured contents */ | |
1790 | ||
1791 | goto NON_CAPTURING_BRACKET; | |
1792 | } | |
1793 | ||
1794 | /* Do not stick any code in here without much thought; it is assumed | |
1795 | that "continue" in the code above comes out to here to repeat the main | |
1796 | loop. */ | |
1797 | ||
1798 | } /* End of main loop */ | |
1799 | ||
1800 | ASSERT_NOT_REACHED(); | |
1801 | ||
1802 | #ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION | |
1803 | ||
1804 | RRETURN_SWITCH: | |
1805 | switch (stack.currentFrame->returnLocation) { | |
1806 | case 0: goto RETURN; | |
1807 | case 1: goto RRETURN_1; | |
1808 | case 2: goto RRETURN_2; | |
1809 | case 6: goto RRETURN_6; | |
1810 | case 7: goto RRETURN_7; | |
1811 | case 14: goto RRETURN_14; | |
1812 | case 15: goto RRETURN_15; | |
1813 | case 16: goto RRETURN_16; | |
1814 | case 17: goto RRETURN_17; | |
1815 | case 18: goto RRETURN_18; | |
1816 | case 19: goto RRETURN_19; | |
1817 | case 20: goto RRETURN_20; | |
1818 | case 21: goto RRETURN_21; | |
1819 | case 22: goto RRETURN_22; | |
1820 | case 24: goto RRETURN_24; | |
1821 | case 26: goto RRETURN_26; | |
1822 | case 27: goto RRETURN_27; | |
1823 | case 28: goto RRETURN_28; | |
1824 | case 29: goto RRETURN_29; | |
1825 | case 30: goto RRETURN_30; | |
1826 | case 31: goto RRETURN_31; | |
1827 | case 38: goto RRETURN_38; | |
1828 | case 40: goto RRETURN_40; | |
1829 | case 42: goto RRETURN_42; | |
1830 | case 44: goto RRETURN_44; | |
1831 | case 48: goto RRETURN_48; | |
1832 | case 52: goto RRETURN_52; | |
1833 | } | |
1834 | ||
1835 | ASSERT_NOT_REACHED(); | |
1836 | return matchError(JSRegExpErrorInternal, stack); | |
1837 | ||
1838 | #endif | |
1839 | ||
1840 | RETURN: | |
1841 | return isMatch; | |
1842 | } | |
1843 | ||
1844 | ||
1845 | /************************************************* | |
1846 | * Execute a Regular Expression * | |
1847 | *************************************************/ | |
1848 | ||
1849 | /* This function applies a compiled re to a subject string and picks out | |
1850 | portions of the string if it matches. Two elements in the vector are set for | |
1851 | each substring: the offsets to the start and end of the substring. | |
1852 | ||
1853 | Arguments: | |
1854 | re points to the compiled expression | |
1855 | extra_data points to extra data or is NULL | |
1856 | subject points to the subject string | |
1857 | length length of subject string (may contain binary zeros) | |
1858 | start_offset where to start in the subject string | |
1859 | options option bits | |
1860 | offsets points to a vector of ints to be filled in with offsets | |
9dae56ea | 1861 | offsetCount the number of elements in the vector |
b37bf2e1 A |
1862 | |
1863 | Returns: > 0 => success; value is the number of elements filled in | |
1864 | = 0 => success, but offsets is not big enough | |
1865 | -1 => failed to match | |
1866 | < -1 => some kind of unexpected problem | |
1867 | */ | |
1868 | ||
9dae56ea | 1869 | static void tryFirstByteOptimization(const UChar*& subjectPtr, const UChar* endSubject, int firstByte, bool firstByteIsCaseless, bool useMultiLineFirstCharOptimization, const UChar* originalSubjectStart) |
b37bf2e1 | 1870 | { |
9dae56ea | 1871 | // If firstByte is set, try scanning to the first instance of that byte |
b37bf2e1 | 1872 | // no need to try and match against any earlier part of the subject string. |
9dae56ea A |
1873 | if (firstByte >= 0) { |
1874 | UChar firstChar = firstByte; | |
1875 | if (firstByteIsCaseless) | |
b37bf2e1 A |
1876 | while (subjectPtr < endSubject) { |
1877 | int c = *subjectPtr; | |
1878 | if (c > 127) | |
1879 | break; | |
9dae56ea | 1880 | if (toLowerCase(c) == firstChar) |
b37bf2e1 A |
1881 | break; |
1882 | subjectPtr++; | |
1883 | } | |
1884 | else { | |
9dae56ea | 1885 | while (subjectPtr < endSubject && *subjectPtr != firstChar) |
b37bf2e1 A |
1886 | subjectPtr++; |
1887 | } | |
1888 | } else if (useMultiLineFirstCharOptimization) { | |
1889 | /* Or to just after \n for a multiline match if possible */ | |
1890 | // I'm not sure why this != originalSubjectStart check is necessary -- ecs 11/18/07 | |
1891 | if (subjectPtr > originalSubjectStart) { | |
1892 | while (subjectPtr < endSubject && !isNewline(subjectPtr[-1])) | |
1893 | subjectPtr++; | |
1894 | } | |
1895 | } | |
1896 | } | |
1897 | ||
9dae56ea | 1898 | static bool tryRequiredByteOptimization(const UChar*& subjectPtr, const UChar* endSubject, int reqByte, int reqByte2, bool reqByteIsCaseless, bool hasFirstByte, const UChar*& reqBytePtr) |
b37bf2e1 | 1899 | { |
9dae56ea A |
1900 | /* If reqByte is set, we know that that character must appear in the subject |
1901 | for the match to succeed. If the first character is set, reqByte must be | |
b37bf2e1 A |
1902 | later in the subject; otherwise the test starts at the match point. This |
1903 | optimization can save a huge amount of backtracking in patterns with nested | |
1904 | unlimited repeats that aren't going to match. Writing separate code for | |
1905 | cased/caseless versions makes it go faster, as does using an autoincrement | |
1906 | and backing off on a match. | |
1907 | ||
1908 | HOWEVER: when the subject string is very, very long, searching to its end can | |
1909 | take a long time, and give bad performance on quite ordinary patterns. This | |
1910 | showed up when somebody was matching /^C/ on a 32-megabyte string... so we | |
1911 | don't do this when the string is sufficiently long. | |
1912 | */ | |
1913 | ||
9dae56ea | 1914 | if (reqByte >= 0 && endSubject - subjectPtr < REQ_BYTE_MAX) { |
b37bf2e1 A |
1915 | const UChar* p = subjectPtr + (hasFirstByte ? 1 : 0); |
1916 | ||
1917 | /* We don't need to repeat the search if we haven't yet reached the | |
1918 | place we found it at last time. */ | |
1919 | ||
1920 | if (p > reqBytePtr) { | |
9dae56ea | 1921 | if (reqByteIsCaseless) { |
b37bf2e1 A |
1922 | while (p < endSubject) { |
1923 | int pp = *p++; | |
9dae56ea | 1924 | if (pp == reqByte || pp == reqByte2) { |
b37bf2e1 A |
1925 | p--; |
1926 | break; | |
1927 | } | |
1928 | } | |
1929 | } else { | |
1930 | while (p < endSubject) { | |
9dae56ea | 1931 | if (*p++ == reqByte) { |
b37bf2e1 A |
1932 | p--; |
1933 | break; | |
1934 | } | |
1935 | } | |
1936 | } | |
1937 | ||
1938 | /* If we can't find the required character, break the matching loop */ | |
1939 | ||
1940 | if (p >= endSubject) | |
1941 | return true; | |
1942 | ||
1943 | /* If we have found the required character, save the point where we | |
1944 | found it, so that we don't search again next time round the loop if | |
1945 | the start hasn't passed this character yet. */ | |
1946 | ||
1947 | reqBytePtr = p; | |
1948 | } | |
1949 | } | |
1950 | return false; | |
1951 | } | |
1952 | ||
1953 | int jsRegExpExecute(const JSRegExp* re, | |
1954 | const UChar* subject, int length, int start_offset, int* offsets, | |
9dae56ea | 1955 | int offsetCount) |
b37bf2e1 A |
1956 | { |
1957 | ASSERT(re); | |
9dae56ea A |
1958 | ASSERT(subject || !length); |
1959 | ASSERT(offsetCount >= 0); | |
1960 | ASSERT(offsets || offsetCount == 0); | |
1961 | ||
1962 | HistogramTimeLogger logger(re); | |
1963 | ||
b37bf2e1 A |
1964 | MatchData matchBlock; |
1965 | matchBlock.startSubject = subject; | |
1966 | matchBlock.endSubject = matchBlock.startSubject + length; | |
1967 | const UChar* endSubject = matchBlock.endSubject; | |
1968 | ||
1969 | matchBlock.multiline = (re->options & MatchAcrossMultipleLinesOption); | |
1970 | matchBlock.ignoreCase = (re->options & IgnoreCaseOption); | |
1971 | ||
1972 | /* If the expression has got more back references than the offsets supplied can | |
1973 | hold, we get a temporary chunk of working store to use during the matching. | |
1974 | Otherwise, we can use the vector supplied, rounding down its size to a multiple | |
1975 | of 3. */ | |
1976 | ||
9dae56ea | 1977 | int ocount = offsetCount - (offsetCount % 3); |
b37bf2e1 A |
1978 | |
1979 | // FIXME: This is lame that we have to second-guess our caller here. | |
1980 | // The API should change to either fail-hard when we don't have enough offset space | |
1981 | // or that we shouldn't ask our callers to pre-allocate in the first place. | |
9dae56ea A |
1982 | bool usingTemporaryOffsets = false; |
1983 | if (re->topBackref > 0 && re->topBackref >= ocount/3) { | |
1984 | ocount = re->topBackref * 3 + 3; | |
b37bf2e1 A |
1985 | matchBlock.offsetVector = new int[ocount]; |
1986 | if (!matchBlock.offsetVector) | |
1987 | return JSRegExpErrorNoMemory; | |
9dae56ea | 1988 | usingTemporaryOffsets = true; |
b37bf2e1 A |
1989 | } else |
1990 | matchBlock.offsetVector = offsets; | |
1991 | ||
1992 | matchBlock.offsetEnd = ocount; | |
1993 | matchBlock.offsetMax = (2*ocount)/3; | |
1994 | matchBlock.offsetOverflow = false; | |
1995 | ||
1996 | /* Compute the minimum number of offsets that we need to reset each time. Doing | |
1997 | this makes a huge difference to execution time when there aren't many brackets | |
1998 | in the pattern. */ | |
1999 | ||
9dae56ea A |
2000 | int resetCount = 2 + re->topBracket * 2; |
2001 | if (resetCount > offsetCount) | |
2002 | resetCount = ocount; | |
b37bf2e1 A |
2003 | |
2004 | /* Reset the working variable associated with each extraction. These should | |
2005 | never be used unless previously set, but they get saved and restored, and so we | |
2006 | initialize them to avoid reading uninitialized locations. */ | |
2007 | ||
2008 | if (matchBlock.offsetVector) { | |
2009 | int* iptr = matchBlock.offsetVector + ocount; | |
9dae56ea | 2010 | int* iend = iptr - resetCount/2 + 1; |
b37bf2e1 A |
2011 | while (--iptr >= iend) |
2012 | *iptr = -1; | |
2013 | } | |
2014 | ||
9dae56ea | 2015 | /* Set up the first character to match, if available. The firstByte value is |
b37bf2e1 A |
2016 | never set for an anchored regular expression, but the anchoring may be forced |
2017 | at run time, so we have to test for anchoring. The first char may be unset for | |
2018 | an unanchored pattern, of course. If there's no first char and the pattern was | |
2019 | studied, there may be a bitmap of possible first characters. */ | |
2020 | ||
9dae56ea A |
2021 | bool firstByteIsCaseless = false; |
2022 | int firstByte = -1; | |
b37bf2e1 | 2023 | if (re->options & UseFirstByteOptimizationOption) { |
9dae56ea A |
2024 | firstByte = re->firstByte & 255; |
2025 | if ((firstByteIsCaseless = (re->firstByte & REQ_IGNORE_CASE))) | |
2026 | firstByte = toLowerCase(firstByte); | |
b37bf2e1 A |
2027 | } |
2028 | ||
2029 | /* For anchored or unanchored matches, there may be a "last known required | |
2030 | character" set. */ | |
2031 | ||
9dae56ea A |
2032 | bool reqByteIsCaseless = false; |
2033 | int reqByte = -1; | |
2034 | int reqByte2 = -1; | |
b37bf2e1 | 2035 | if (re->options & UseRequiredByteOptimizationOption) { |
9dae56ea A |
2036 | reqByte = re->reqByte & 255; // FIXME: This optimization could be made to work for UTF16 chars as well... |
2037 | reqByteIsCaseless = (re->reqByte & REQ_IGNORE_CASE); | |
2038 | reqByte2 = flipCase(reqByte); | |
b37bf2e1 A |
2039 | } |
2040 | ||
2041 | /* Loop for handling unanchored repeated matching attempts; for anchored regexs | |
2042 | the loop runs just once. */ | |
2043 | ||
2044 | const UChar* startMatch = subject + start_offset; | |
2045 | const UChar* reqBytePtr = startMatch - 1; | |
2046 | bool useMultiLineFirstCharOptimization = re->options & UseMultiLineFirstByteOptimizationOption; | |
2047 | ||
2048 | do { | |
2049 | /* Reset the maximum number of extractions we might see. */ | |
2050 | if (matchBlock.offsetVector) { | |
2051 | int* iptr = matchBlock.offsetVector; | |
9dae56ea | 2052 | int* iend = iptr + resetCount; |
b37bf2e1 A |
2053 | while (iptr < iend) |
2054 | *iptr++ = -1; | |
2055 | } | |
2056 | ||
9dae56ea A |
2057 | tryFirstByteOptimization(startMatch, endSubject, firstByte, firstByteIsCaseless, useMultiLineFirstCharOptimization, matchBlock.startSubject + start_offset); |
2058 | if (tryRequiredByteOptimization(startMatch, endSubject, reqByte, reqByte2, reqByteIsCaseless, firstByte >= 0, reqBytePtr)) | |
b37bf2e1 A |
2059 | break; |
2060 | ||
2061 | /* When a match occurs, substrings will be set for all internal extractions; | |
2062 | we just need to set up the whole thing as substring 0 before returning. If | |
2063 | there were too many extractions, set the return code to zero. In the case | |
2064 | where we had to get some local store to hold offsets for backreferences, copy | |
2065 | those back references that we can. In this case there need not be overflow | |
2066 | if certain parts of the pattern were not used. */ | |
2067 | ||
2068 | /* The code starts after the JSRegExp block and the capture name table. */ | |
2069 | const unsigned char* start_code = (const unsigned char*)(re + 1); | |
2070 | ||
2071 | int returnCode = match(startMatch, start_code, 2, matchBlock); | |
2072 | ||
2073 | /* When the result is no match, advance the pointer to the next character | |
2074 | and continue. */ | |
2075 | if (returnCode == 0) { | |
2076 | startMatch++; | |
2077 | continue; | |
2078 | } | |
2079 | ||
2080 | if (returnCode != 1) { | |
2081 | ASSERT(returnCode == JSRegExpErrorHitLimit || returnCode == JSRegExpErrorNoMemory); | |
9dae56ea | 2082 | DPRINTF((">>>> error: returning %d\n", returnCode)); |
b37bf2e1 A |
2083 | return returnCode; |
2084 | } | |
2085 | ||
2086 | /* We have a match! Copy the offset information from temporary store if | |
2087 | necessary */ | |
2088 | ||
9dae56ea A |
2089 | if (usingTemporaryOffsets) { |
2090 | if (offsetCount >= 4) { | |
2091 | memcpy(offsets + 2, matchBlock.offsetVector + 2, (offsetCount - 2) * sizeof(int)); | |
b37bf2e1 A |
2092 | DPRINTF(("Copied offsets from temporary memory\n")); |
2093 | } | |
9dae56ea | 2094 | if (matchBlock.endOffsetTop > offsetCount) |
b37bf2e1 A |
2095 | matchBlock.offsetOverflow = true; |
2096 | ||
2097 | DPRINTF(("Freeing temporary memory\n")); | |
2098 | delete [] matchBlock.offsetVector; | |
2099 | } | |
2100 | ||
2101 | returnCode = matchBlock.offsetOverflow ? 0 : matchBlock.endOffsetTop / 2; | |
2102 | ||
9dae56ea | 2103 | if (offsetCount < 2) |
b37bf2e1 A |
2104 | returnCode = 0; |
2105 | else { | |
2106 | offsets[0] = startMatch - matchBlock.startSubject; | |
2107 | offsets[1] = matchBlock.endMatchPtr - matchBlock.startSubject; | |
2108 | } | |
2109 | ||
9dae56ea | 2110 | DPRINTF((">>>> returning %d\n", returnCode)); |
b37bf2e1 | 2111 | return returnCode; |
9dae56ea | 2112 | } while (!(re->options & IsAnchoredOption) && startMatch <= endSubject); |
b37bf2e1 | 2113 | |
9dae56ea | 2114 | if (usingTemporaryOffsets) { |
b37bf2e1 A |
2115 | DPRINTF(("Freeing temporary memory\n")); |
2116 | delete [] matchBlock.offsetVector; | |
2117 | } | |
2118 | ||
2119 | DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n")); | |
2120 | return JSRegExpErrorNoMatch; | |
2121 | } | |
9dae56ea A |
2122 | |
2123 | #if REGEXP_HISTOGRAM | |
2124 | ||
2125 | class CompareHistogramEntries { | |
2126 | public: | |
2127 | bool operator()(const pair<UString, double>& a, const pair<UString, double>& b) | |
2128 | { | |
2129 | if (a.second == b.second) | |
2130 | return a.first < b.first; | |
2131 | return a.second < b.second; | |
2132 | } | |
2133 | }; | |
2134 | ||
2135 | Histogram::~Histogram() | |
2136 | { | |
2137 | Vector<pair<UString, double> > values; | |
2138 | Map::iterator end = times.end(); | |
2139 | for (Map::iterator it = times.begin(); it != end; ++it) | |
2140 | values.append(*it); | |
2141 | sort(values.begin(), values.end(), CompareHistogramEntries()); | |
2142 | size_t size = values.size(); | |
2143 | printf("Regular Expressions, sorted by time spent evaluating them:\n"); | |
2144 | for (size_t i = 0; i < size; ++i) | |
2145 | printf(" %f - %s\n", values[size - i - 1].second, values[size - i - 1].first.UTF8String().c_str()); | |
2146 | } | |
2147 | ||
2148 | void Histogram::add(const JSRegExp* re, double elapsedTime) | |
2149 | { | |
2150 | UString string(reinterpret_cast<const UChar*>(reinterpret_cast<const char*>(re) + re->stringOffset), re->stringLength); | |
2151 | if (re->options & IgnoreCaseOption && re->options & MatchAcrossMultipleLinesOption) | |
2152 | string += " (multi-line, ignore case)"; | |
2153 | else { | |
2154 | if (re->options & IgnoreCaseOption) | |
2155 | string += " (ignore case)"; | |
2156 | if (re->options & MatchAcrossMultipleLinesOption) | |
2157 | string += " (multi-line)"; | |
2158 | } | |
2159 | pair<Map::iterator, bool> result = times.add(string.rep(), elapsedTime); | |
2160 | if (!result.second) | |
2161 | result.first->second += elapsedTime; | |
2162 | } | |
2163 | ||
2164 | HistogramTimeLogger::HistogramTimeLogger(const JSRegExp* re) | |
2165 | : m_re(re) | |
2166 | , m_startTime(getCurrentUTCTimeWithMicroseconds()) | |
2167 | { | |
2168 | } | |
2169 | ||
2170 | HistogramTimeLogger::~HistogramTimeLogger() | |
2171 | { | |
2172 | static Histogram histogram; | |
2173 | histogram.add(m_re, getCurrentUTCTimeWithMicroseconds() - m_startTime); | |
2174 | } | |
2175 | ||
2176 | #endif |