]>
Commit | Line | Data |
---|---|---|
b37bf2e1 A |
1 | /* This is JavaScriptCore's variant of the PCRE library. While this library |
2 | started out as a copy of PCRE, many of the features of PCRE have been | |
3 | removed. This library now supports only the regular expression features | |
4 | required by the JavaScript language specification, and has only the functions | |
5 | needed by JavaScriptCore and the rest of WebKit. | |
6 | ||
7 | Originally written by Philip Hazel | |
8 | Copyright (c) 1997-2006 University of Cambridge | |
9dae56ea | 9 | Copyright (C) 2002, 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. |
b37bf2e1 A |
10 | Copyright (C) 2007 Eric Seidel <eric@webkit.org> |
11 | ||
12 | ----------------------------------------------------------------------------- | |
13 | Redistribution and use in source and binary forms, with or without | |
14 | modification, are permitted provided that the following conditions are met: | |
15 | ||
16 | * Redistributions of source code must retain the above copyright notice, | |
17 | this list of conditions and the following disclaimer. | |
18 | ||
19 | * Redistributions in binary form must reproduce the above copyright | |
20 | notice, this list of conditions and the following disclaimer in the | |
21 | documentation and/or other materials provided with the distribution. | |
22 | ||
23 | * Neither the name of the University of Cambridge nor the names of its | |
24 | contributors may be used to endorse or promote products derived from | |
25 | this software without specific prior written permission. | |
26 | ||
27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
37 | POSSIBILITY OF SUCH DAMAGE. | |
38 | ----------------------------------------------------------------------------- | |
39 | */ | |
40 | ||
41 | /* This module contains jsRegExpExecute(), the externally visible function | |
42 | that does pattern matching using an NFA algorithm, following the rules from | |
43 | the JavaScript specification. There are also some supporting functions. */ | |
44 | ||
45 | #include "config.h" | |
b37bf2e1 A |
46 | #include "pcre_internal.h" |
47 | ||
9dae56ea | 48 | #include <limits.h> |
b37bf2e1 A |
49 | #include <wtf/ASCIICType.h> |
50 | #include <wtf/Vector.h> | |
51 | ||
9dae56ea | 52 | #if REGEXP_HISTOGRAM |
ba379fdc | 53 | #include <wtf/DateMath.h> |
9dae56ea A |
54 | #include <runtime/UString.h> |
55 | #endif | |
b37bf2e1 A |
56 | |
57 | using namespace WTF; | |
58 | ||
ba379fdc | 59 | #if COMPILER(GCC) |
b37bf2e1 A |
60 | #define USE_COMPUTED_GOTO_FOR_MATCH_RECURSION |
61 | //#define USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP | |
62 | #endif | |
63 | ||
64 | /* Avoid warnings on Windows. */ | |
65 | #undef min | |
66 | #undef max | |
67 | ||
68 | #ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION | |
69 | typedef int ReturnLocation; | |
70 | #else | |
71 | typedef void* ReturnLocation; | |
72 | #endif | |
73 | ||
9dae56ea A |
74 | #if !REGEXP_HISTOGRAM |
75 | ||
76 | class HistogramTimeLogger { | |
77 | public: | |
78 | HistogramTimeLogger(const JSRegExp*) { } | |
79 | }; | |
80 | ||
81 | #else | |
82 | ||
83 | using namespace JSC; | |
84 | ||
85 | class Histogram { | |
86 | public: | |
87 | ~Histogram(); | |
88 | void add(const JSRegExp*, double); | |
89 | ||
90 | private: | |
91 | typedef HashMap<RefPtr<UString::Rep>, double> Map; | |
92 | Map times; | |
93 | }; | |
94 | ||
95 | class HistogramTimeLogger { | |
96 | public: | |
97 | HistogramTimeLogger(const JSRegExp*); | |
98 | ~HistogramTimeLogger(); | |
99 | ||
100 | private: | |
101 | const JSRegExp* m_re; | |
102 | double m_startTime; | |
103 | }; | |
104 | ||
105 | #endif | |
106 | ||
b37bf2e1 A |
107 | /* Structure for building a chain of data for holding the values of |
108 | the subject pointer at the start of each bracket, used to detect when | |
109 | an empty string has been matched by a bracket to break infinite loops. */ | |
110 | struct BracketChainNode { | |
111 | BracketChainNode* previousBracket; | |
112 | const UChar* bracketStart; | |
113 | }; | |
114 | ||
f9bf01c6 | 115 | struct MatchFrame : FastAllocBase { |
b37bf2e1 A |
116 | ReturnLocation returnLocation; |
117 | struct MatchFrame* previousFrame; | |
118 | ||
119 | /* Function arguments that may change */ | |
120 | struct { | |
121 | const UChar* subjectPtr; | |
122 | const unsigned char* instructionPtr; | |
123 | int offsetTop; | |
124 | BracketChainNode* bracketChain; | |
125 | } args; | |
126 | ||
127 | ||
128 | /* PCRE uses "fake" recursion built off of gotos, thus | |
129 | stack-based local variables are not safe to use. Instead we have to | |
130 | store local variables on the current MatchFrame. */ | |
131 | struct { | |
132 | const unsigned char* data; | |
133 | const unsigned char* startOfRepeatingBracket; | |
134 | const UChar* subjectPtrAtStartOfInstruction; // Several instrutions stash away a subjectPtr here for later compare | |
135 | const unsigned char* instructionPtrAtStartOfOnce; | |
136 | ||
137 | int repeatOthercase; | |
138 | ||
139 | int ctype; | |
140 | int fc; | |
141 | int fi; | |
142 | int length; | |
143 | int max; | |
144 | int number; | |
145 | int offset; | |
146 | int saveOffset1; | |
147 | int saveOffset2; | |
148 | int saveOffset3; | |
149 | ||
150 | BracketChainNode bracketChainNode; | |
151 | } locals; | |
152 | }; | |
153 | ||
154 | /* Structure for passing "static" information around between the functions | |
155 | doing traditional NFA matching, so that they are thread-safe. */ | |
156 | ||
157 | struct MatchData { | |
158 | int* offsetVector; /* Offset vector */ | |
159 | int offsetEnd; /* One past the end */ | |
160 | int offsetMax; /* The maximum usable for return data */ | |
161 | bool offsetOverflow; /* Set if too many extractions */ | |
162 | const UChar* startSubject; /* Start of the subject string */ | |
163 | const UChar* endSubject; /* End of the subject string */ | |
164 | const UChar* endMatchPtr; /* Subject position at end match */ | |
165 | int endOffsetTop; /* Highwater mark at end of match */ | |
166 | bool multiline; | |
167 | bool ignoreCase; | |
168 | }; | |
169 | ||
170 | /* The maximum remaining length of subject we are prepared to search for a | |
9dae56ea | 171 | reqByte match. */ |
b37bf2e1 A |
172 | |
173 | #define REQ_BYTE_MAX 1000 | |
174 | ||
175 | /* The below limit restricts the number of "recursive" match calls in order to | |
176 | avoid spending exponential time on complex regular expressions. */ | |
177 | ||
ba379fdc | 178 | static const unsigned matchLimit = 1000000; |
b37bf2e1 A |
179 | |
180 | #ifdef DEBUG | |
181 | /************************************************* | |
182 | * Debugging function to print chars * | |
183 | *************************************************/ | |
184 | ||
185 | /* Print a sequence of chars in printable format, stopping at the end of the | |
186 | subject if the requested. | |
187 | ||
188 | Arguments: | |
189 | p points to characters | |
190 | length number to print | |
191 | isSubject true if printing from within md.startSubject | |
192 | md pointer to matching data block, if isSubject is true | |
193 | */ | |
194 | ||
195 | static void pchars(const UChar* p, int length, bool isSubject, const MatchData& md) | |
196 | { | |
197 | if (isSubject && length > md.endSubject - p) | |
198 | length = md.endSubject - p; | |
199 | while (length-- > 0) { | |
200 | int c; | |
201 | if (isprint(c = *(p++))) | |
202 | printf("%c", c); | |
203 | else if (c < 256) | |
204 | printf("\\x%02x", c); | |
205 | else | |
206 | printf("\\x{%x}", c); | |
207 | } | |
208 | } | |
209 | #endif | |
210 | ||
211 | /************************************************* | |
212 | * Match a back-reference * | |
213 | *************************************************/ | |
214 | ||
215 | /* If a back reference hasn't been set, the length that is passed is greater | |
216 | than the number of characters left in the string, so the match fails. | |
217 | ||
218 | Arguments: | |
219 | offset index into the offset vector | |
220 | subjectPtr points into the subject | |
221 | length length to be matched | |
222 | md points to match data block | |
223 | ||
224 | Returns: true if matched | |
225 | */ | |
226 | ||
227 | static bool matchRef(int offset, const UChar* subjectPtr, int length, const MatchData& md) | |
228 | { | |
229 | const UChar* p = md.startSubject + md.offsetVector[offset]; | |
230 | ||
231 | #ifdef DEBUG | |
232 | if (subjectPtr >= md.endSubject) | |
233 | printf("matching subject <null>"); | |
234 | else { | |
235 | printf("matching subject "); | |
236 | pchars(subjectPtr, length, true, md); | |
237 | } | |
238 | printf(" against backref "); | |
239 | pchars(p, length, false, md); | |
240 | printf("\n"); | |
241 | #endif | |
242 | ||
243 | /* Always fail if not enough characters left */ | |
244 | ||
245 | if (length > md.endSubject - subjectPtr) | |
246 | return false; | |
247 | ||
248 | /* Separate the caselesss case for speed */ | |
249 | ||
250 | if (md.ignoreCase) { | |
251 | while (length-- > 0) { | |
252 | UChar c = *p++; | |
9dae56ea | 253 | int othercase = jsc_pcre_ucp_othercase(c); |
b37bf2e1 A |
254 | UChar d = *subjectPtr++; |
255 | if (c != d && othercase != d) | |
256 | return false; | |
257 | } | |
258 | } | |
259 | else { | |
260 | while (length-- > 0) | |
261 | if (*p++ != *subjectPtr++) | |
262 | return false; | |
263 | } | |
264 | ||
265 | return true; | |
266 | } | |
267 | ||
268 | #ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION | |
269 | ||
270 | /* Use numbered labels and switch statement at the bottom of the match function. */ | |
271 | ||
272 | #define RMATCH_WHERE(num) num | |
273 | #define RRETURN_LABEL RRETURN_SWITCH | |
274 | ||
275 | #else | |
276 | ||
277 | /* Use GCC's computed goto extension. */ | |
278 | ||
279 | /* For one test case this is more than 40% faster than the switch statement. | |
280 | We could avoid the use of the num argument entirely by using local labels, | |
281 | but using it for the GCC case as well as the non-GCC case allows us to share | |
282 | a bit more code and notice if we use conflicting numbers.*/ | |
283 | ||
284 | #define RMATCH_WHERE(num) &&RRETURN_##num | |
285 | #define RRETURN_LABEL *stack.currentFrame->returnLocation | |
286 | ||
287 | #endif | |
288 | ||
289 | #define RECURSIVE_MATCH_COMMON(num) \ | |
290 | goto RECURSE;\ | |
291 | RRETURN_##num: \ | |
292 | stack.popCurrentFrame(); | |
293 | ||
294 | #define RECURSIVE_MATCH(num, ra, rb) \ | |
295 | do { \ | |
296 | stack.pushNewFrame((ra), (rb), RMATCH_WHERE(num)); \ | |
297 | RECURSIVE_MATCH_COMMON(num) \ | |
298 | } while (0) | |
299 | ||
9dae56ea | 300 | #define RECURSIVE_MATCH_NEW_GROUP(num, ra, rb) \ |
b37bf2e1 A |
301 | do { \ |
302 | stack.pushNewFrame((ra), (rb), RMATCH_WHERE(num)); \ | |
303 | startNewGroup(stack.currentFrame); \ | |
304 | RECURSIVE_MATCH_COMMON(num) \ | |
305 | } while (0) | |
306 | ||
307 | #define RRETURN goto RRETURN_LABEL | |
308 | ||
309 | #define RRETURN_NO_MATCH do { isMatch = false; RRETURN; } while (0) | |
310 | ||
311 | /************************************************* | |
312 | * Match from current position * | |
313 | *************************************************/ | |
314 | ||
315 | /* On entry instructionPtr points to the first opcode, and subjectPtr to the first character | |
316 | in the subject string, while substringStart holds the value of subjectPtr at the start of the | |
317 | last bracketed group - used for breaking infinite loops matching zero-length | |
318 | strings. This function is called recursively in many circumstances. Whenever it | |
319 | returns a negative (error) response, the outer match() call must also return the | |
320 | same response. | |
321 | ||
322 | Arguments: | |
323 | subjectPtr pointer in subject | |
324 | instructionPtr position in code | |
325 | offsetTop current top pointer | |
326 | md pointer to "static" info for the match | |
327 | ||
328 | Returns: 1 if matched ) these values are >= 0 | |
329 | 0 if failed to match ) | |
330 | a negative error value if aborted by an error condition | |
331 | (e.g. stopped by repeated call or recursion limit) | |
332 | */ | |
333 | ||
9dae56ea | 334 | static const unsigned numFramesOnStack = 16; |
b37bf2e1 A |
335 | |
336 | struct MatchStack { | |
337 | MatchStack() | |
9dae56ea | 338 | : framesEnd(frames + numFramesOnStack) |
b37bf2e1 A |
339 | , currentFrame(frames) |
340 | , size(1) // match() creates accesses the first frame w/o calling pushNewFrame | |
341 | { | |
9dae56ea | 342 | ASSERT((sizeof(frames) / sizeof(frames[0])) == numFramesOnStack); |
b37bf2e1 A |
343 | } |
344 | ||
9dae56ea | 345 | MatchFrame frames[numFramesOnStack]; |
b37bf2e1 A |
346 | MatchFrame* framesEnd; |
347 | MatchFrame* currentFrame; | |
348 | unsigned size; | |
349 | ||
350 | inline bool canUseStackBufferForNextFrame() | |
351 | { | |
9dae56ea | 352 | return size < numFramesOnStack; |
b37bf2e1 A |
353 | } |
354 | ||
355 | inline MatchFrame* allocateNextFrame() | |
356 | { | |
357 | if (canUseStackBufferForNextFrame()) | |
358 | return currentFrame + 1; | |
359 | return new MatchFrame; | |
360 | } | |
361 | ||
362 | inline void pushNewFrame(const unsigned char* instructionPtr, BracketChainNode* bracketChain, ReturnLocation returnLocation) | |
363 | { | |
364 | MatchFrame* newframe = allocateNextFrame(); | |
365 | newframe->previousFrame = currentFrame; | |
366 | ||
367 | newframe->args.subjectPtr = currentFrame->args.subjectPtr; | |
368 | newframe->args.offsetTop = currentFrame->args.offsetTop; | |
369 | newframe->args.instructionPtr = instructionPtr; | |
370 | newframe->args.bracketChain = bracketChain; | |
371 | newframe->returnLocation = returnLocation; | |
372 | size++; | |
373 | ||
374 | currentFrame = newframe; | |
375 | } | |
376 | ||
377 | inline void popCurrentFrame() | |
378 | { | |
379 | MatchFrame* oldFrame = currentFrame; | |
380 | currentFrame = currentFrame->previousFrame; | |
9dae56ea | 381 | if (size > numFramesOnStack) |
b37bf2e1 A |
382 | delete oldFrame; |
383 | size--; | |
384 | } | |
385 | ||
386 | void popAllFrames() | |
387 | { | |
388 | while (size) | |
389 | popCurrentFrame(); | |
390 | } | |
391 | }; | |
392 | ||
393 | static int matchError(int errorCode, MatchStack& stack) | |
394 | { | |
395 | stack.popAllFrames(); | |
396 | return errorCode; | |
397 | } | |
398 | ||
399 | /* Get the next UTF-8 character, not advancing the pointer, incrementing length | |
400 | if there are extra bytes. This is called when we know we are in UTF-8 mode. */ | |
401 | ||
402 | static inline void getUTF8CharAndIncrementLength(int& c, const unsigned char* subjectPtr, int& len) | |
403 | { | |
404 | c = *subjectPtr; | |
405 | if ((c & 0xc0) == 0xc0) { | |
9dae56ea | 406 | int gcaa = jsc_pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ |
b37bf2e1 | 407 | int gcss = 6 * gcaa; |
9dae56ea | 408 | c = (c & jsc_pcre_utf8_table3[gcaa]) << gcss; |
b37bf2e1 A |
409 | for (int gcii = 1; gcii <= gcaa; gcii++) { |
410 | gcss -= 6; | |
411 | c |= (subjectPtr[gcii] & 0x3f) << gcss; | |
412 | } | |
413 | len += gcaa; | |
414 | } | |
415 | } | |
416 | ||
417 | static inline void startNewGroup(MatchFrame* currentFrame) | |
418 | { | |
419 | /* At the start of a bracketed group, add the current subject pointer to the | |
420 | stack of such pointers, to be re-instated at the end of the group when we hit | |
421 | the closing ket. When match() is called in other circumstances, we don't add to | |
422 | this stack. */ | |
423 | ||
424 | currentFrame->locals.bracketChainNode.previousBracket = currentFrame->args.bracketChain; | |
425 | currentFrame->locals.bracketChainNode.bracketStart = currentFrame->args.subjectPtr; | |
426 | currentFrame->args.bracketChain = ¤tFrame->locals.bracketChainNode; | |
427 | } | |
428 | ||
429 | // FIXME: "minimize" means "not greedy", we should invert the callers to ask for "greedy" to be less confusing | |
430 | static inline void repeatInformationFromInstructionOffset(short instructionOffset, bool& minimize, int& minimumRepeats, int& maximumRepeats) | |
431 | { | |
432 | // Instruction offsets are based off of OP_CRSTAR, OP_STAR, OP_TYPESTAR, OP_NOTSTAR | |
433 | static const char minimumRepeatsFromInstructionOffset[] = { 0, 0, 1, 1, 0, 0 }; | |
434 | static const int maximumRepeatsFromInstructionOffset[] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, 1, 1 }; | |
435 | ||
436 | ASSERT(instructionOffset >= 0); | |
437 | ASSERT(instructionOffset <= (OP_CRMINQUERY - OP_CRSTAR)); | |
438 | ||
439 | minimize = (instructionOffset & 1); // this assumes ordering: Instruction, MinimizeInstruction, Instruction2, MinimizeInstruction2 | |
440 | minimumRepeats = minimumRepeatsFromInstructionOffset[instructionOffset]; | |
441 | maximumRepeats = maximumRepeatsFromInstructionOffset[instructionOffset]; | |
442 | } | |
443 | ||
444 | static int match(const UChar* subjectPtr, const unsigned char* instructionPtr, int offsetTop, MatchData& md) | |
445 | { | |
446 | bool isMatch = false; | |
447 | int min; | |
448 | bool minimize = false; /* Initialization not really needed, but some compilers think so. */ | |
9dae56ea | 449 | unsigned remainingMatchCount = matchLimit; |
ba379fdc | 450 | int othercase; /* Declare here to avoid errors during jumps */ |
b37bf2e1 A |
451 | |
452 | MatchStack stack; | |
453 | ||
454 | /* The opcode jump table. */ | |
455 | #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP | |
456 | #define EMIT_JUMP_TABLE_ENTRY(opcode) &&LABEL_OP_##opcode, | |
457 | static void* opcodeJumpTable[256] = { FOR_EACH_OPCODE(EMIT_JUMP_TABLE_ENTRY) }; | |
458 | #undef EMIT_JUMP_TABLE_ENTRY | |
459 | #endif | |
460 | ||
461 | /* One-time setup of the opcode jump table. */ | |
462 | #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP | |
463 | for (int i = 255; !opcodeJumpTable[i]; i--) | |
464 | opcodeJumpTable[i] = &&CAPTURING_BRACKET; | |
465 | #endif | |
466 | ||
467 | #ifdef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION | |
468 | // Shark shows this as a hot line | |
469 | // Using a static const here makes this line disappear, but makes later access hotter (not sure why) | |
470 | stack.currentFrame->returnLocation = &&RETURN; | |
471 | #else | |
472 | stack.currentFrame->returnLocation = 0; | |
473 | #endif | |
474 | stack.currentFrame->args.subjectPtr = subjectPtr; | |
475 | stack.currentFrame->args.instructionPtr = instructionPtr; | |
476 | stack.currentFrame->args.offsetTop = offsetTop; | |
477 | stack.currentFrame->args.bracketChain = 0; | |
478 | startNewGroup(stack.currentFrame); | |
479 | ||
480 | /* This is where control jumps back to to effect "recursion" */ | |
481 | ||
482 | RECURSE: | |
9dae56ea | 483 | if (!--remainingMatchCount) |
b37bf2e1 A |
484 | return matchError(JSRegExpErrorHitLimit, stack); |
485 | ||
486 | /* Now start processing the operations. */ | |
487 | ||
488 | #ifndef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP | |
489 | while (true) | |
490 | #endif | |
491 | { | |
492 | ||
493 | #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP | |
494 | #define BEGIN_OPCODE(opcode) LABEL_OP_##opcode | |
495 | #define NEXT_OPCODE goto *opcodeJumpTable[*stack.currentFrame->args.instructionPtr] | |
496 | #else | |
497 | #define BEGIN_OPCODE(opcode) case OP_##opcode | |
498 | #define NEXT_OPCODE continue | |
499 | #endif | |
500 | ||
501 | #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP | |
502 | NEXT_OPCODE; | |
503 | #else | |
504 | switch (*stack.currentFrame->args.instructionPtr) | |
505 | #endif | |
506 | { | |
507 | /* Non-capturing bracket: optimized */ | |
508 | ||
509 | BEGIN_OPCODE(BRA): | |
510 | NON_CAPTURING_BRACKET: | |
511 | DPRINTF(("start bracket 0\n")); | |
512 | do { | |
9dae56ea | 513 | RECURSIVE_MATCH_NEW_GROUP(2, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain); |
b37bf2e1 A |
514 | if (isMatch) |
515 | RRETURN; | |
516 | stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1); | |
517 | } while (*stack.currentFrame->args.instructionPtr == OP_ALT); | |
518 | DPRINTF(("bracket 0 failed\n")); | |
519 | RRETURN; | |
520 | ||
521 | /* Skip over large extraction number data if encountered. */ | |
522 | ||
523 | BEGIN_OPCODE(BRANUMBER): | |
524 | stack.currentFrame->args.instructionPtr += 3; | |
525 | NEXT_OPCODE; | |
526 | ||
527 | /* End of the pattern. */ | |
528 | ||
529 | BEGIN_OPCODE(END): | |
530 | md.endMatchPtr = stack.currentFrame->args.subjectPtr; /* Record where we ended */ | |
531 | md.endOffsetTop = stack.currentFrame->args.offsetTop; /* and how many extracts were taken */ | |
532 | isMatch = true; | |
533 | RRETURN; | |
534 | ||
535 | /* Assertion brackets. Check the alternative branches in turn - the | |
536 | matching won't pass the KET for an assertion. If any one branch matches, | |
537 | the assertion is true. Lookbehind assertions have an OP_REVERSE item at the | |
538 | start of each branch to move the current point backwards, so the code at | |
539 | this level is identical to the lookahead case. */ | |
540 | ||
541 | BEGIN_OPCODE(ASSERT): | |
542 | do { | |
9dae56ea | 543 | RECURSIVE_MATCH_NEW_GROUP(6, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, NULL); |
b37bf2e1 A |
544 | if (isMatch) |
545 | break; | |
546 | stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1); | |
547 | } while (*stack.currentFrame->args.instructionPtr == OP_ALT); | |
548 | if (*stack.currentFrame->args.instructionPtr == OP_KET) | |
549 | RRETURN_NO_MATCH; | |
550 | ||
551 | /* Continue from after the assertion, updating the offsets high water | |
552 | mark, since extracts may have been taken during the assertion. */ | |
553 | ||
554 | advanceToEndOfBracket(stack.currentFrame->args.instructionPtr); | |
555 | stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE; | |
556 | stack.currentFrame->args.offsetTop = md.endOffsetTop; | |
557 | NEXT_OPCODE; | |
558 | ||
559 | /* Negative assertion: all branches must fail to match */ | |
560 | ||
561 | BEGIN_OPCODE(ASSERT_NOT): | |
562 | do { | |
9dae56ea | 563 | RECURSIVE_MATCH_NEW_GROUP(7, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, NULL); |
b37bf2e1 A |
564 | if (isMatch) |
565 | RRETURN_NO_MATCH; | |
566 | stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1); | |
567 | } while (*stack.currentFrame->args.instructionPtr == OP_ALT); | |
568 | ||
569 | stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE; | |
570 | NEXT_OPCODE; | |
571 | ||
572 | /* An alternation is the end of a branch; scan along to find the end of the | |
573 | bracketed group and go to there. */ | |
574 | ||
575 | BEGIN_OPCODE(ALT): | |
576 | advanceToEndOfBracket(stack.currentFrame->args.instructionPtr); | |
577 | NEXT_OPCODE; | |
578 | ||
579 | /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating | |
580 | that it may occur zero times. It may repeat infinitely, or not at all - | |
581 | i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper | |
582 | repeat limits are compiled as a number of copies, with the optional ones | |
583 | preceded by BRAZERO or BRAMINZERO. */ | |
584 | ||
585 | BEGIN_OPCODE(BRAZERO): { | |
586 | stack.currentFrame->locals.startOfRepeatingBracket = stack.currentFrame->args.instructionPtr + 1; | |
9dae56ea | 587 | RECURSIVE_MATCH_NEW_GROUP(14, stack.currentFrame->locals.startOfRepeatingBracket, stack.currentFrame->args.bracketChain); |
b37bf2e1 A |
588 | if (isMatch) |
589 | RRETURN; | |
590 | advanceToEndOfBracket(stack.currentFrame->locals.startOfRepeatingBracket); | |
591 | stack.currentFrame->args.instructionPtr = stack.currentFrame->locals.startOfRepeatingBracket + 1 + LINK_SIZE; | |
592 | NEXT_OPCODE; | |
593 | } | |
594 | ||
595 | BEGIN_OPCODE(BRAMINZERO): { | |
596 | stack.currentFrame->locals.startOfRepeatingBracket = stack.currentFrame->args.instructionPtr + 1; | |
597 | advanceToEndOfBracket(stack.currentFrame->locals.startOfRepeatingBracket); | |
9dae56ea | 598 | RECURSIVE_MATCH_NEW_GROUP(15, stack.currentFrame->locals.startOfRepeatingBracket + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain); |
b37bf2e1 A |
599 | if (isMatch) |
600 | RRETURN; | |
601 | stack.currentFrame->args.instructionPtr++; | |
602 | NEXT_OPCODE; | |
603 | } | |
604 | ||
605 | /* End of a group, repeated or non-repeating. If we are at the end of | |
606 | an assertion "group", stop matching and return 1, but record the | |
607 | current high water mark for use by positive assertions. Do this also | |
608 | for the "once" (not-backup up) groups. */ | |
609 | ||
610 | BEGIN_OPCODE(KET): | |
611 | BEGIN_OPCODE(KETRMIN): | |
612 | BEGIN_OPCODE(KETRMAX): | |
613 | stack.currentFrame->locals.instructionPtrAtStartOfOnce = stack.currentFrame->args.instructionPtr - getLinkValue(stack.currentFrame->args.instructionPtr + 1); | |
614 | stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.bracketChain->bracketStart; | |
615 | ||
616 | /* Back up the stack of bracket start pointers. */ | |
617 | ||
618 | stack.currentFrame->args.bracketChain = stack.currentFrame->args.bracketChain->previousBracket; | |
619 | ||
620 | if (*stack.currentFrame->locals.instructionPtrAtStartOfOnce == OP_ASSERT || *stack.currentFrame->locals.instructionPtrAtStartOfOnce == OP_ASSERT_NOT) { | |
621 | md.endOffsetTop = stack.currentFrame->args.offsetTop; | |
622 | isMatch = true; | |
623 | RRETURN; | |
624 | } | |
625 | ||
626 | /* In all other cases except a conditional group we have to check the | |
627 | group number back at the start and if necessary complete handling an | |
628 | extraction by setting the offsets and bumping the high water mark. */ | |
629 | ||
630 | stack.currentFrame->locals.number = *stack.currentFrame->locals.instructionPtrAtStartOfOnce - OP_BRA; | |
631 | ||
632 | /* For extended extraction brackets (large number), we have to fish out | |
633 | the number from a dummy opcode at the start. */ | |
634 | ||
635 | if (stack.currentFrame->locals.number > EXTRACT_BASIC_MAX) | |
636 | stack.currentFrame->locals.number = get2ByteValue(stack.currentFrame->locals.instructionPtrAtStartOfOnce + 2 + LINK_SIZE); | |
637 | stack.currentFrame->locals.offset = stack.currentFrame->locals.number << 1; | |
638 | ||
639 | #ifdef DEBUG | |
640 | printf("end bracket %d", stack.currentFrame->locals.number); | |
641 | printf("\n"); | |
642 | #endif | |
643 | ||
644 | /* Test for a numbered group. This includes groups called as a result | |
645 | of recursion. Note that whole-pattern recursion is coded as a recurse | |
646 | into group 0, so it won't be picked up here. Instead, we catch it when | |
647 | the OP_END is reached. */ | |
648 | ||
649 | if (stack.currentFrame->locals.number > 0) { | |
650 | if (stack.currentFrame->locals.offset >= md.offsetMax) | |
651 | md.offsetOverflow = true; | |
652 | else { | |
653 | md.offsetVector[stack.currentFrame->locals.offset] = | |
654 | md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number]; | |
655 | md.offsetVector[stack.currentFrame->locals.offset+1] = stack.currentFrame->args.subjectPtr - md.startSubject; | |
656 | if (stack.currentFrame->args.offsetTop <= stack.currentFrame->locals.offset) | |
657 | stack.currentFrame->args.offsetTop = stack.currentFrame->locals.offset + 2; | |
658 | } | |
659 | } | |
660 | ||
661 | /* For a non-repeating ket, just continue at this level. This also | |
662 | happens for a repeating ket if no characters were matched in the group. | |
663 | This is the forcible breaking of infinite loops as implemented in Perl | |
664 | 5.005. If there is an options reset, it will get obeyed in the normal | |
665 | course of events. */ | |
666 | ||
667 | if (*stack.currentFrame->args.instructionPtr == OP_KET || stack.currentFrame->args.subjectPtr == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) { | |
668 | stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE; | |
669 | NEXT_OPCODE; | |
670 | } | |
671 | ||
672 | /* The repeating kets try the rest of the pattern or restart from the | |
673 | preceding bracket, in the appropriate order. */ | |
674 | ||
675 | if (*stack.currentFrame->args.instructionPtr == OP_KETRMIN) { | |
676 | RECURSIVE_MATCH(16, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain); | |
677 | if (isMatch) | |
678 | RRETURN; | |
9dae56ea | 679 | RECURSIVE_MATCH_NEW_GROUP(17, stack.currentFrame->locals.instructionPtrAtStartOfOnce, stack.currentFrame->args.bracketChain); |
b37bf2e1 A |
680 | if (isMatch) |
681 | RRETURN; | |
682 | } else { /* OP_KETRMAX */ | |
9dae56ea | 683 | RECURSIVE_MATCH_NEW_GROUP(18, stack.currentFrame->locals.instructionPtrAtStartOfOnce, stack.currentFrame->args.bracketChain); |
b37bf2e1 A |
684 | if (isMatch) |
685 | RRETURN; | |
686 | RECURSIVE_MATCH(19, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain); | |
687 | if (isMatch) | |
688 | RRETURN; | |
689 | } | |
690 | RRETURN; | |
691 | ||
9dae56ea A |
692 | /* Start of subject. */ |
693 | ||
b37bf2e1 | 694 | BEGIN_OPCODE(CIRC): |
9dae56ea | 695 | if (stack.currentFrame->args.subjectPtr != md.startSubject) |
b37bf2e1 A |
696 | RRETURN_NO_MATCH; |
697 | stack.currentFrame->args.instructionPtr++; | |
698 | NEXT_OPCODE; | |
9dae56ea A |
699 | |
700 | /* After internal newline if multiline. */ | |
701 | ||
702 | BEGIN_OPCODE(BOL): | |
703 | if (stack.currentFrame->args.subjectPtr != md.startSubject && !isNewline(stack.currentFrame->args.subjectPtr[-1])) | |
704 | RRETURN_NO_MATCH; | |
705 | stack.currentFrame->args.instructionPtr++; | |
706 | NEXT_OPCODE; | |
707 | ||
708 | /* End of subject. */ | |
709 | ||
b37bf2e1 | 710 | BEGIN_OPCODE(DOLL): |
9dae56ea A |
711 | if (stack.currentFrame->args.subjectPtr < md.endSubject) |
712 | RRETURN_NO_MATCH; | |
713 | stack.currentFrame->args.instructionPtr++; | |
714 | NEXT_OPCODE; | |
715 | ||
716 | /* Before internal newline if multiline. */ | |
717 | ||
718 | BEGIN_OPCODE(EOL): | |
719 | if (stack.currentFrame->args.subjectPtr < md.endSubject && !isNewline(*stack.currentFrame->args.subjectPtr)) | |
b37bf2e1 A |
720 | RRETURN_NO_MATCH; |
721 | stack.currentFrame->args.instructionPtr++; | |
722 | NEXT_OPCODE; | |
723 | ||
724 | /* Word boundary assertions */ | |
725 | ||
726 | BEGIN_OPCODE(NOT_WORD_BOUNDARY): | |
727 | BEGIN_OPCODE(WORD_BOUNDARY): { | |
728 | bool currentCharIsWordChar = false; | |
729 | bool previousCharIsWordChar = false; | |
730 | ||
731 | if (stack.currentFrame->args.subjectPtr > md.startSubject) | |
732 | previousCharIsWordChar = isWordChar(stack.currentFrame->args.subjectPtr[-1]); | |
733 | if (stack.currentFrame->args.subjectPtr < md.endSubject) | |
734 | currentCharIsWordChar = isWordChar(*stack.currentFrame->args.subjectPtr); | |
735 | ||
736 | /* Now see if the situation is what we want */ | |
737 | bool wordBoundaryDesired = (*stack.currentFrame->args.instructionPtr++ == OP_WORD_BOUNDARY); | |
738 | if (wordBoundaryDesired ? currentCharIsWordChar == previousCharIsWordChar : currentCharIsWordChar != previousCharIsWordChar) | |
739 | RRETURN_NO_MATCH; | |
740 | NEXT_OPCODE; | |
741 | } | |
742 | ||
743 | /* Match a single character type; inline for speed */ | |
744 | ||
745 | BEGIN_OPCODE(NOT_NEWLINE): | |
746 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
747 | RRETURN_NO_MATCH; | |
748 | if (isNewline(*stack.currentFrame->args.subjectPtr++)) | |
749 | RRETURN_NO_MATCH; | |
750 | stack.currentFrame->args.instructionPtr++; | |
751 | NEXT_OPCODE; | |
752 | ||
753 | BEGIN_OPCODE(NOT_DIGIT): | |
754 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
755 | RRETURN_NO_MATCH; | |
756 | if (isASCIIDigit(*stack.currentFrame->args.subjectPtr++)) | |
757 | RRETURN_NO_MATCH; | |
758 | stack.currentFrame->args.instructionPtr++; | |
759 | NEXT_OPCODE; | |
760 | ||
761 | BEGIN_OPCODE(DIGIT): | |
762 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
763 | RRETURN_NO_MATCH; | |
764 | if (!isASCIIDigit(*stack.currentFrame->args.subjectPtr++)) | |
765 | RRETURN_NO_MATCH; | |
766 | stack.currentFrame->args.instructionPtr++; | |
767 | NEXT_OPCODE; | |
768 | ||
769 | BEGIN_OPCODE(NOT_WHITESPACE): | |
770 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
771 | RRETURN_NO_MATCH; | |
772 | if (isSpaceChar(*stack.currentFrame->args.subjectPtr++)) | |
773 | RRETURN_NO_MATCH; | |
774 | stack.currentFrame->args.instructionPtr++; | |
775 | NEXT_OPCODE; | |
776 | ||
777 | BEGIN_OPCODE(WHITESPACE): | |
778 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
779 | RRETURN_NO_MATCH; | |
780 | if (!isSpaceChar(*stack.currentFrame->args.subjectPtr++)) | |
781 | RRETURN_NO_MATCH; | |
782 | stack.currentFrame->args.instructionPtr++; | |
783 | NEXT_OPCODE; | |
784 | ||
785 | BEGIN_OPCODE(NOT_WORDCHAR): | |
786 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
787 | RRETURN_NO_MATCH; | |
788 | if (isWordChar(*stack.currentFrame->args.subjectPtr++)) | |
789 | RRETURN_NO_MATCH; | |
790 | stack.currentFrame->args.instructionPtr++; | |
791 | NEXT_OPCODE; | |
792 | ||
793 | BEGIN_OPCODE(WORDCHAR): | |
794 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
795 | RRETURN_NO_MATCH; | |
796 | if (!isWordChar(*stack.currentFrame->args.subjectPtr++)) | |
797 | RRETURN_NO_MATCH; | |
798 | stack.currentFrame->args.instructionPtr++; | |
799 | NEXT_OPCODE; | |
800 | ||
801 | /* Match a back reference, possibly repeatedly. Look past the end of the | |
802 | item to see if there is repeat information following. The code is similar | |
803 | to that for character classes, but repeated for efficiency. Then obey | |
804 | similar code to character type repeats - written out again for speed. | |
805 | However, if the referenced string is the empty string, always treat | |
806 | it as matched, any number of times (otherwise there could be infinite | |
807 | loops). */ | |
808 | ||
809 | BEGIN_OPCODE(REF): | |
810 | stack.currentFrame->locals.offset = get2ByteValue(stack.currentFrame->args.instructionPtr + 1) << 1; /* Doubled ref number */ | |
811 | stack.currentFrame->args.instructionPtr += 3; /* Advance past item */ | |
812 | ||
813 | /* If the reference is unset, set the length to be longer than the amount | |
814 | of subject left; this ensures that every attempt at a match fails. We | |
815 | can't just fail here, because of the possibility of quantifiers with zero | |
816 | minima. */ | |
817 | ||
818 | if (stack.currentFrame->locals.offset >= stack.currentFrame->args.offsetTop || md.offsetVector[stack.currentFrame->locals.offset] < 0) | |
819 | stack.currentFrame->locals.length = 0; | |
820 | else | |
821 | stack.currentFrame->locals.length = md.offsetVector[stack.currentFrame->locals.offset+1] - md.offsetVector[stack.currentFrame->locals.offset]; | |
822 | ||
823 | /* Set up for repetition, or handle the non-repeated case */ | |
824 | ||
825 | switch (*stack.currentFrame->args.instructionPtr) { | |
826 | case OP_CRSTAR: | |
827 | case OP_CRMINSTAR: | |
828 | case OP_CRPLUS: | |
829 | case OP_CRMINPLUS: | |
830 | case OP_CRQUERY: | |
831 | case OP_CRMINQUERY: | |
832 | repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max); | |
833 | break; | |
834 | ||
835 | case OP_CRRANGE: | |
836 | case OP_CRMINRANGE: | |
837 | minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE); | |
838 | min = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); | |
839 | stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 3); | |
840 | if (stack.currentFrame->locals.max == 0) | |
841 | stack.currentFrame->locals.max = INT_MAX; | |
842 | stack.currentFrame->args.instructionPtr += 5; | |
843 | break; | |
844 | ||
845 | default: /* No repeat follows */ | |
846 | if (!matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md)) | |
847 | RRETURN_NO_MATCH; | |
848 | stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length; | |
849 | NEXT_OPCODE; | |
850 | } | |
851 | ||
852 | /* If the length of the reference is zero, just continue with the | |
853 | main loop. */ | |
854 | ||
855 | if (stack.currentFrame->locals.length == 0) | |
856 | NEXT_OPCODE; | |
857 | ||
858 | /* First, ensure the minimum number of matches are present. */ | |
859 | ||
860 | for (int i = 1; i <= min; i++) { | |
861 | if (!matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md)) | |
862 | RRETURN_NO_MATCH; | |
863 | stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length; | |
864 | } | |
865 | ||
866 | /* If min = max, continue at the same level without recursion. | |
867 | They are not both allowed to be zero. */ | |
868 | ||
869 | if (min == stack.currentFrame->locals.max) | |
870 | NEXT_OPCODE; | |
871 | ||
872 | /* If minimizing, keep trying and advancing the pointer */ | |
873 | ||
874 | if (minimize) { | |
875 | for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { | |
876 | RECURSIVE_MATCH(20, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
877 | if (isMatch) | |
878 | RRETURN; | |
879 | if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || !matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md)) | |
880 | RRETURN; | |
881 | stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length; | |
882 | } | |
883 | /* Control never reaches here */ | |
884 | } | |
885 | ||
886 | /* If maximizing, find the longest string and work backwards */ | |
887 | ||
888 | else { | |
889 | stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; | |
890 | for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
891 | if (!matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md)) | |
892 | break; | |
893 | stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length; | |
894 | } | |
895 | while (stack.currentFrame->args.subjectPtr >= stack.currentFrame->locals.subjectPtrAtStartOfInstruction) { | |
896 | RECURSIVE_MATCH(21, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
897 | if (isMatch) | |
898 | RRETURN; | |
899 | stack.currentFrame->args.subjectPtr -= stack.currentFrame->locals.length; | |
900 | } | |
901 | RRETURN_NO_MATCH; | |
902 | } | |
903 | /* Control never reaches here */ | |
904 | ||
905 | /* Match a bit-mapped character class, possibly repeatedly. This op code is | |
906 | used when all the characters in the class have values in the range 0-255, | |
907 | and either the matching is caseful, or the characters are in the range | |
908 | 0-127 when UTF-8 processing is enabled. The only difference between | |
909 | OP_CLASS and OP_NCLASS occurs when a data character outside the range is | |
910 | encountered. | |
911 | ||
912 | First, look past the end of the item to see if there is repeat information | |
913 | following. Then obey similar code to character type repeats - written out | |
914 | again for speed. */ | |
915 | ||
916 | BEGIN_OPCODE(NCLASS): | |
917 | BEGIN_OPCODE(CLASS): | |
918 | stack.currentFrame->locals.data = stack.currentFrame->args.instructionPtr + 1; /* Save for matching */ | |
919 | stack.currentFrame->args.instructionPtr += 33; /* Advance past the item */ | |
920 | ||
921 | switch (*stack.currentFrame->args.instructionPtr) { | |
922 | case OP_CRSTAR: | |
923 | case OP_CRMINSTAR: | |
924 | case OP_CRPLUS: | |
925 | case OP_CRMINPLUS: | |
926 | case OP_CRQUERY: | |
927 | case OP_CRMINQUERY: | |
928 | repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max); | |
929 | break; | |
930 | ||
931 | case OP_CRRANGE: | |
932 | case OP_CRMINRANGE: | |
933 | minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE); | |
934 | min = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); | |
935 | stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 3); | |
936 | if (stack.currentFrame->locals.max == 0) | |
937 | stack.currentFrame->locals.max = INT_MAX; | |
938 | stack.currentFrame->args.instructionPtr += 5; | |
939 | break; | |
940 | ||
941 | default: /* No repeat follows */ | |
942 | min = stack.currentFrame->locals.max = 1; | |
943 | break; | |
944 | } | |
945 | ||
946 | /* First, ensure the minimum number of matches are present. */ | |
947 | ||
948 | for (int i = 1; i <= min; i++) { | |
949 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
950 | RRETURN_NO_MATCH; | |
951 | int c = *stack.currentFrame->args.subjectPtr++; | |
952 | if (c > 255) { | |
953 | if (stack.currentFrame->locals.data[-1] == OP_CLASS) | |
954 | RRETURN_NO_MATCH; | |
955 | } else { | |
956 | if (!(stack.currentFrame->locals.data[c / 8] & (1 << (c & 7)))) | |
957 | RRETURN_NO_MATCH; | |
958 | } | |
959 | } | |
960 | ||
961 | /* If max == min we can continue with the main loop without the | |
962 | need to recurse. */ | |
963 | ||
964 | if (min == stack.currentFrame->locals.max) | |
965 | NEXT_OPCODE; | |
966 | ||
967 | /* If minimizing, keep testing the rest of the expression and advancing | |
968 | the pointer while it matches the class. */ | |
969 | if (minimize) { | |
970 | for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { | |
971 | RECURSIVE_MATCH(22, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
972 | if (isMatch) | |
973 | RRETURN; | |
974 | if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject) | |
975 | RRETURN; | |
976 | int c = *stack.currentFrame->args.subjectPtr++; | |
977 | if (c > 255) { | |
978 | if (stack.currentFrame->locals.data[-1] == OP_CLASS) | |
979 | RRETURN; | |
980 | } else { | |
981 | if ((stack.currentFrame->locals.data[c/8] & (1 << (c&7))) == 0) | |
982 | RRETURN; | |
983 | } | |
984 | } | |
985 | /* Control never reaches here */ | |
986 | } | |
987 | /* If maximizing, find the longest possible run, then work backwards. */ | |
988 | else { | |
989 | stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; | |
990 | ||
991 | for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
992 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
993 | break; | |
994 | int c = *stack.currentFrame->args.subjectPtr; | |
995 | if (c > 255) { | |
996 | if (stack.currentFrame->locals.data[-1] == OP_CLASS) | |
997 | break; | |
998 | } else { | |
999 | if (!(stack.currentFrame->locals.data[c / 8] & (1 << (c & 7)))) | |
1000 | break; | |
1001 | } | |
1002 | ++stack.currentFrame->args.subjectPtr; | |
1003 | } | |
1004 | for (;;) { | |
1005 | RECURSIVE_MATCH(24, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
1006 | if (isMatch) | |
1007 | RRETURN; | |
1008 | if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) | |
1009 | break; /* Stop if tried at original pos */ | |
1010 | } | |
1011 | ||
1012 | RRETURN; | |
1013 | } | |
1014 | /* Control never reaches here */ | |
1015 | ||
1016 | /* Match an extended character class. */ | |
1017 | ||
1018 | BEGIN_OPCODE(XCLASS): | |
1019 | stack.currentFrame->locals.data = stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE; /* Save for matching */ | |
1020 | stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1); /* Advance past the item */ | |
1021 | ||
1022 | switch (*stack.currentFrame->args.instructionPtr) { | |
1023 | case OP_CRSTAR: | |
1024 | case OP_CRMINSTAR: | |
1025 | case OP_CRPLUS: | |
1026 | case OP_CRMINPLUS: | |
1027 | case OP_CRQUERY: | |
1028 | case OP_CRMINQUERY: | |
1029 | repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max); | |
1030 | break; | |
1031 | ||
1032 | case OP_CRRANGE: | |
1033 | case OP_CRMINRANGE: | |
1034 | minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE); | |
1035 | min = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); | |
1036 | stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 3); | |
1037 | if (stack.currentFrame->locals.max == 0) | |
1038 | stack.currentFrame->locals.max = INT_MAX; | |
1039 | stack.currentFrame->args.instructionPtr += 5; | |
1040 | break; | |
1041 | ||
1042 | default: /* No repeat follows */ | |
1043 | min = stack.currentFrame->locals.max = 1; | |
1044 | } | |
1045 | ||
1046 | /* First, ensure the minimum number of matches are present. */ | |
1047 | ||
1048 | for (int i = 1; i <= min; i++) { | |
1049 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1050 | RRETURN_NO_MATCH; | |
1051 | int c = *stack.currentFrame->args.subjectPtr++; | |
9dae56ea | 1052 | if (!jsc_pcre_xclass(c, stack.currentFrame->locals.data)) |
b37bf2e1 A |
1053 | RRETURN_NO_MATCH; |
1054 | } | |
1055 | ||
1056 | /* If max == min we can continue with the main loop without the | |
1057 | need to recurse. */ | |
1058 | ||
1059 | if (min == stack.currentFrame->locals.max) | |
1060 | NEXT_OPCODE; | |
1061 | ||
1062 | /* If minimizing, keep testing the rest of the expression and advancing | |
1063 | the pointer while it matches the class. */ | |
1064 | ||
1065 | if (minimize) { | |
1066 | for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { | |
1067 | RECURSIVE_MATCH(26, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
1068 | if (isMatch) | |
1069 | RRETURN; | |
1070 | if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1071 | RRETURN; | |
1072 | int c = *stack.currentFrame->args.subjectPtr++; | |
9dae56ea | 1073 | if (!jsc_pcre_xclass(c, stack.currentFrame->locals.data)) |
b37bf2e1 A |
1074 | RRETURN; |
1075 | } | |
1076 | /* Control never reaches here */ | |
1077 | } | |
1078 | ||
1079 | /* If maximizing, find the longest possible run, then work backwards. */ | |
1080 | ||
1081 | else { | |
1082 | stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; | |
1083 | for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
1084 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1085 | break; | |
1086 | int c = *stack.currentFrame->args.subjectPtr; | |
9dae56ea | 1087 | if (!jsc_pcre_xclass(c, stack.currentFrame->locals.data)) |
b37bf2e1 A |
1088 | break; |
1089 | ++stack.currentFrame->args.subjectPtr; | |
1090 | } | |
1091 | for(;;) { | |
1092 | RECURSIVE_MATCH(27, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
1093 | if (isMatch) | |
1094 | RRETURN; | |
1095 | if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) | |
1096 | break; /* Stop if tried at original pos */ | |
1097 | } | |
1098 | RRETURN; | |
1099 | } | |
1100 | ||
1101 | /* Control never reaches here */ | |
1102 | ||
1103 | /* Match a single character, casefully */ | |
1104 | ||
1105 | BEGIN_OPCODE(CHAR): | |
1106 | stack.currentFrame->locals.length = 1; | |
1107 | stack.currentFrame->args.instructionPtr++; | |
1108 | getUTF8CharAndIncrementLength(stack.currentFrame->locals.fc, stack.currentFrame->args.instructionPtr, stack.currentFrame->locals.length); | |
1109 | stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.length; | |
1110 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1111 | RRETURN_NO_MATCH; | |
1112 | if (stack.currentFrame->locals.fc != *stack.currentFrame->args.subjectPtr++) | |
1113 | RRETURN_NO_MATCH; | |
1114 | NEXT_OPCODE; | |
1115 | ||
1116 | /* Match a single character, caselessly */ | |
1117 | ||
1118 | BEGIN_OPCODE(CHAR_IGNORING_CASE): { | |
1119 | stack.currentFrame->locals.length = 1; | |
1120 | stack.currentFrame->args.instructionPtr++; | |
1121 | getUTF8CharAndIncrementLength(stack.currentFrame->locals.fc, stack.currentFrame->args.instructionPtr, stack.currentFrame->locals.length); | |
1122 | stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.length; | |
1123 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1124 | RRETURN_NO_MATCH; | |
1125 | int dc = *stack.currentFrame->args.subjectPtr++; | |
9dae56ea | 1126 | if (stack.currentFrame->locals.fc != dc && jsc_pcre_ucp_othercase(stack.currentFrame->locals.fc) != dc) |
b37bf2e1 A |
1127 | RRETURN_NO_MATCH; |
1128 | NEXT_OPCODE; | |
1129 | } | |
1130 | ||
1131 | /* Match a single ASCII character. */ | |
1132 | ||
1133 | BEGIN_OPCODE(ASCII_CHAR): | |
1134 | if (md.endSubject == stack.currentFrame->args.subjectPtr) | |
1135 | RRETURN_NO_MATCH; | |
1136 | if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->args.instructionPtr[1]) | |
1137 | RRETURN_NO_MATCH; | |
1138 | ++stack.currentFrame->args.subjectPtr; | |
1139 | stack.currentFrame->args.instructionPtr += 2; | |
1140 | NEXT_OPCODE; | |
1141 | ||
1142 | /* Match one of two cases of an ASCII letter. */ | |
1143 | ||
1144 | BEGIN_OPCODE(ASCII_LETTER_IGNORING_CASE): | |
1145 | if (md.endSubject == stack.currentFrame->args.subjectPtr) | |
1146 | RRETURN_NO_MATCH; | |
1147 | if ((*stack.currentFrame->args.subjectPtr | 0x20) != stack.currentFrame->args.instructionPtr[1]) | |
1148 | RRETURN_NO_MATCH; | |
1149 | ++stack.currentFrame->args.subjectPtr; | |
1150 | stack.currentFrame->args.instructionPtr += 2; | |
1151 | NEXT_OPCODE; | |
1152 | ||
1153 | /* Match a single character repeatedly; different opcodes share code. */ | |
1154 | ||
1155 | BEGIN_OPCODE(EXACT): | |
1156 | min = stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); | |
1157 | minimize = false; | |
1158 | stack.currentFrame->args.instructionPtr += 3; | |
1159 | goto REPEATCHAR; | |
1160 | ||
1161 | BEGIN_OPCODE(UPTO): | |
1162 | BEGIN_OPCODE(MINUPTO): | |
1163 | min = 0; | |
1164 | stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); | |
1165 | minimize = *stack.currentFrame->args.instructionPtr == OP_MINUPTO; | |
1166 | stack.currentFrame->args.instructionPtr += 3; | |
1167 | goto REPEATCHAR; | |
1168 | ||
1169 | BEGIN_OPCODE(STAR): | |
1170 | BEGIN_OPCODE(MINSTAR): | |
1171 | BEGIN_OPCODE(PLUS): | |
1172 | BEGIN_OPCODE(MINPLUS): | |
1173 | BEGIN_OPCODE(QUERY): | |
1174 | BEGIN_OPCODE(MINQUERY): | |
1175 | repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_STAR, minimize, min, stack.currentFrame->locals.max); | |
1176 | ||
1177 | /* Common code for all repeated single-character matches. We can give | |
1178 | up quickly if there are fewer than the minimum number of characters left in | |
1179 | the subject. */ | |
1180 | ||
1181 | REPEATCHAR: | |
1182 | ||
1183 | stack.currentFrame->locals.length = 1; | |
1184 | getUTF8CharAndIncrementLength(stack.currentFrame->locals.fc, stack.currentFrame->args.instructionPtr, stack.currentFrame->locals.length); | |
1185 | if (min * (stack.currentFrame->locals.fc > 0xFFFF ? 2 : 1) > md.endSubject - stack.currentFrame->args.subjectPtr) | |
1186 | RRETURN_NO_MATCH; | |
1187 | stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.length; | |
1188 | ||
1189 | if (stack.currentFrame->locals.fc <= 0xFFFF) { | |
ba379fdc | 1190 | othercase = md.ignoreCase ? jsc_pcre_ucp_othercase(stack.currentFrame->locals.fc) : -1; |
b37bf2e1 A |
1191 | |
1192 | for (int i = 1; i <= min; i++) { | |
1193 | if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc && *stack.currentFrame->args.subjectPtr != othercase) | |
1194 | RRETURN_NO_MATCH; | |
1195 | ++stack.currentFrame->args.subjectPtr; | |
1196 | } | |
1197 | ||
1198 | if (min == stack.currentFrame->locals.max) | |
1199 | NEXT_OPCODE; | |
1200 | ||
1201 | if (minimize) { | |
1202 | stack.currentFrame->locals.repeatOthercase = othercase; | |
1203 | for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { | |
1204 | RECURSIVE_MATCH(28, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
1205 | if (isMatch) | |
1206 | RRETURN; | |
1207 | if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1208 | RRETURN; | |
1209 | if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc && *stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.repeatOthercase) | |
1210 | RRETURN; | |
1211 | ++stack.currentFrame->args.subjectPtr; | |
1212 | } | |
1213 | /* Control never reaches here */ | |
1214 | } else { | |
1215 | stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; | |
1216 | for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
1217 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1218 | break; | |
1219 | if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc && *stack.currentFrame->args.subjectPtr != othercase) | |
1220 | break; | |
1221 | ++stack.currentFrame->args.subjectPtr; | |
1222 | } | |
1223 | while (stack.currentFrame->args.subjectPtr >= stack.currentFrame->locals.subjectPtrAtStartOfInstruction) { | |
1224 | RECURSIVE_MATCH(29, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
1225 | if (isMatch) | |
1226 | RRETURN; | |
1227 | --stack.currentFrame->args.subjectPtr; | |
1228 | } | |
1229 | RRETURN_NO_MATCH; | |
1230 | } | |
1231 | /* Control never reaches here */ | |
1232 | } else { | |
1233 | /* No case on surrogate pairs, so no need to bother with "othercase". */ | |
1234 | ||
1235 | for (int i = 1; i <= min; i++) { | |
1236 | if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc) | |
1237 | RRETURN_NO_MATCH; | |
1238 | stack.currentFrame->args.subjectPtr += 2; | |
1239 | } | |
1240 | ||
1241 | if (min == stack.currentFrame->locals.max) | |
1242 | NEXT_OPCODE; | |
1243 | ||
1244 | if (minimize) { | |
1245 | for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { | |
1246 | RECURSIVE_MATCH(30, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
1247 | if (isMatch) | |
1248 | RRETURN; | |
1249 | if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1250 | RRETURN; | |
1251 | if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc) | |
1252 | RRETURN; | |
1253 | stack.currentFrame->args.subjectPtr += 2; | |
1254 | } | |
1255 | /* Control never reaches here */ | |
1256 | } else { | |
1257 | stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; | |
1258 | for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
1259 | if (stack.currentFrame->args.subjectPtr > md.endSubject - 2) | |
1260 | break; | |
1261 | if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc) | |
1262 | break; | |
1263 | stack.currentFrame->args.subjectPtr += 2; | |
1264 | } | |
1265 | while (stack.currentFrame->args.subjectPtr >= stack.currentFrame->locals.subjectPtrAtStartOfInstruction) { | |
1266 | RECURSIVE_MATCH(31, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
1267 | if (isMatch) | |
1268 | RRETURN; | |
1269 | stack.currentFrame->args.subjectPtr -= 2; | |
1270 | } | |
1271 | RRETURN_NO_MATCH; | |
1272 | } | |
1273 | /* Control never reaches here */ | |
1274 | } | |
1275 | /* Control never reaches here */ | |
1276 | ||
1277 | /* Match a negated single one-byte character. */ | |
1278 | ||
1279 | BEGIN_OPCODE(NOT): { | |
1280 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1281 | RRETURN_NO_MATCH; | |
9dae56ea | 1282 | int b = stack.currentFrame->args.instructionPtr[1]; |
b37bf2e1 | 1283 | int c = *stack.currentFrame->args.subjectPtr++; |
9dae56ea | 1284 | stack.currentFrame->args.instructionPtr += 2; |
b37bf2e1 A |
1285 | if (md.ignoreCase) { |
1286 | if (c < 128) | |
1287 | c = toLowerCase(c); | |
9dae56ea | 1288 | if (toLowerCase(b) == c) |
b37bf2e1 A |
1289 | RRETURN_NO_MATCH; |
1290 | } else { | |
9dae56ea | 1291 | if (b == c) |
b37bf2e1 A |
1292 | RRETURN_NO_MATCH; |
1293 | } | |
1294 | NEXT_OPCODE; | |
1295 | } | |
1296 | ||
1297 | /* Match a negated single one-byte character repeatedly. This is almost a | |
1298 | repeat of the code for a repeated single character, but I haven't found a | |
1299 | nice way of commoning these up that doesn't require a test of the | |
1300 | positive/negative option for each character match. Maybe that wouldn't add | |
1301 | very much to the time taken, but character matching *is* what this is all | |
1302 | about... */ | |
1303 | ||
1304 | BEGIN_OPCODE(NOTEXACT): | |
1305 | min = stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); | |
1306 | minimize = false; | |
1307 | stack.currentFrame->args.instructionPtr += 3; | |
1308 | goto REPEATNOTCHAR; | |
1309 | ||
1310 | BEGIN_OPCODE(NOTUPTO): | |
1311 | BEGIN_OPCODE(NOTMINUPTO): | |
1312 | min = 0; | |
1313 | stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); | |
1314 | minimize = *stack.currentFrame->args.instructionPtr == OP_NOTMINUPTO; | |
1315 | stack.currentFrame->args.instructionPtr += 3; | |
1316 | goto REPEATNOTCHAR; | |
1317 | ||
1318 | BEGIN_OPCODE(NOTSTAR): | |
1319 | BEGIN_OPCODE(NOTMINSTAR): | |
1320 | BEGIN_OPCODE(NOTPLUS): | |
1321 | BEGIN_OPCODE(NOTMINPLUS): | |
1322 | BEGIN_OPCODE(NOTQUERY): | |
1323 | BEGIN_OPCODE(NOTMINQUERY): | |
1324 | repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_NOTSTAR, minimize, min, stack.currentFrame->locals.max); | |
1325 | ||
1326 | /* Common code for all repeated single-byte matches. We can give up quickly | |
1327 | if there are fewer than the minimum number of bytes left in the | |
1328 | subject. */ | |
1329 | ||
1330 | REPEATNOTCHAR: | |
1331 | if (min > md.endSubject - stack.currentFrame->args.subjectPtr) | |
1332 | RRETURN_NO_MATCH; | |
1333 | stack.currentFrame->locals.fc = *stack.currentFrame->args.instructionPtr++; | |
1334 | ||
1335 | /* The code is duplicated for the caseless and caseful cases, for speed, | |
1336 | since matching characters is likely to be quite common. First, ensure the | |
1337 | minimum number of matches are present. If min = max, continue at the same | |
1338 | level without recursing. Otherwise, if minimizing, keep trying the rest of | |
1339 | the expression and advancing one matching character if failing, up to the | |
1340 | maximum. Alternatively, if maximizing, find the maximum number of | |
1341 | characters and work backwards. */ | |
1342 | ||
1343 | DPRINTF(("negative matching %c{%d,%d}\n", stack.currentFrame->locals.fc, min, stack.currentFrame->locals.max)); | |
1344 | ||
1345 | if (md.ignoreCase) { | |
1346 | if (stack.currentFrame->locals.fc < 128) | |
1347 | stack.currentFrame->locals.fc = toLowerCase(stack.currentFrame->locals.fc); | |
1348 | ||
1349 | for (int i = 1; i <= min; i++) { | |
1350 | int d = *stack.currentFrame->args.subjectPtr++; | |
1351 | if (d < 128) | |
1352 | d = toLowerCase(d); | |
1353 | if (stack.currentFrame->locals.fc == d) | |
1354 | RRETURN_NO_MATCH; | |
1355 | } | |
1356 | ||
1357 | if (min == stack.currentFrame->locals.max) | |
1358 | NEXT_OPCODE; | |
1359 | ||
1360 | if (minimize) { | |
1361 | for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { | |
1362 | RECURSIVE_MATCH(38, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
1363 | if (isMatch) | |
1364 | RRETURN; | |
1365 | int d = *stack.currentFrame->args.subjectPtr++; | |
1366 | if (d < 128) | |
1367 | d = toLowerCase(d); | |
1368 | if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject || stack.currentFrame->locals.fc == d) | |
1369 | RRETURN; | |
1370 | } | |
1371 | /* Control never reaches here */ | |
1372 | } | |
1373 | ||
1374 | /* Maximize case */ | |
1375 | ||
1376 | else { | |
1377 | stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; | |
1378 | ||
1379 | for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
1380 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1381 | break; | |
1382 | int d = *stack.currentFrame->args.subjectPtr; | |
1383 | if (d < 128) | |
1384 | d = toLowerCase(d); | |
1385 | if (stack.currentFrame->locals.fc == d) | |
1386 | break; | |
1387 | ++stack.currentFrame->args.subjectPtr; | |
1388 | } | |
1389 | for (;;) { | |
1390 | RECURSIVE_MATCH(40, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
1391 | if (isMatch) | |
1392 | RRETURN; | |
1393 | if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) | |
1394 | break; /* Stop if tried at original pos */ | |
1395 | } | |
1396 | ||
1397 | RRETURN; | |
1398 | } | |
1399 | /* Control never reaches here */ | |
1400 | } | |
1401 | ||
1402 | /* Caseful comparisons */ | |
1403 | ||
1404 | else { | |
1405 | for (int i = 1; i <= min; i++) { | |
1406 | int d = *stack.currentFrame->args.subjectPtr++; | |
1407 | if (stack.currentFrame->locals.fc == d) | |
1408 | RRETURN_NO_MATCH; | |
1409 | } | |
1410 | ||
1411 | if (min == stack.currentFrame->locals.max) | |
1412 | NEXT_OPCODE; | |
1413 | ||
1414 | if (minimize) { | |
1415 | for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { | |
1416 | RECURSIVE_MATCH(42, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
1417 | if (isMatch) | |
1418 | RRETURN; | |
1419 | int d = *stack.currentFrame->args.subjectPtr++; | |
1420 | if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject || stack.currentFrame->locals.fc == d) | |
1421 | RRETURN; | |
1422 | } | |
1423 | /* Control never reaches here */ | |
1424 | } | |
1425 | ||
1426 | /* Maximize case */ | |
1427 | ||
1428 | else { | |
1429 | stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; | |
1430 | ||
1431 | for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
1432 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1433 | break; | |
1434 | int d = *stack.currentFrame->args.subjectPtr; | |
1435 | if (stack.currentFrame->locals.fc == d) | |
1436 | break; | |
1437 | ++stack.currentFrame->args.subjectPtr; | |
1438 | } | |
1439 | for (;;) { | |
1440 | RECURSIVE_MATCH(44, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
1441 | if (isMatch) | |
1442 | RRETURN; | |
1443 | if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) | |
1444 | break; /* Stop if tried at original pos */ | |
1445 | } | |
1446 | ||
1447 | RRETURN; | |
1448 | } | |
1449 | } | |
1450 | /* Control never reaches here */ | |
1451 | ||
1452 | /* Match a single character type repeatedly; several different opcodes | |
1453 | share code. This is very similar to the code for single characters, but we | |
1454 | repeat it in the interests of efficiency. */ | |
1455 | ||
1456 | BEGIN_OPCODE(TYPEEXACT): | |
1457 | min = stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); | |
1458 | minimize = true; | |
1459 | stack.currentFrame->args.instructionPtr += 3; | |
1460 | goto REPEATTYPE; | |
1461 | ||
1462 | BEGIN_OPCODE(TYPEUPTO): | |
1463 | BEGIN_OPCODE(TYPEMINUPTO): | |
1464 | min = 0; | |
1465 | stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); | |
1466 | minimize = *stack.currentFrame->args.instructionPtr == OP_TYPEMINUPTO; | |
1467 | stack.currentFrame->args.instructionPtr += 3; | |
1468 | goto REPEATTYPE; | |
1469 | ||
1470 | BEGIN_OPCODE(TYPESTAR): | |
1471 | BEGIN_OPCODE(TYPEMINSTAR): | |
1472 | BEGIN_OPCODE(TYPEPLUS): | |
1473 | BEGIN_OPCODE(TYPEMINPLUS): | |
1474 | BEGIN_OPCODE(TYPEQUERY): | |
1475 | BEGIN_OPCODE(TYPEMINQUERY): | |
1476 | repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_TYPESTAR, minimize, min, stack.currentFrame->locals.max); | |
1477 | ||
1478 | /* Common code for all repeated single character type matches. Note that | |
1479 | in UTF-8 mode, '.' matches a character of any length, but for the other | |
1480 | character types, the valid characters are all one-byte long. */ | |
1481 | ||
1482 | REPEATTYPE: | |
1483 | stack.currentFrame->locals.ctype = *stack.currentFrame->args.instructionPtr++; /* Code for the character type */ | |
1484 | ||
1485 | /* First, ensure the minimum number of matches are present. Use inline | |
1486 | code for maximizing the speed, and do the type test once at the start | |
1487 | (i.e. keep it out of the loop). Also we can test that there are at least | |
1488 | the minimum number of characters before we start. */ | |
1489 | ||
1490 | if (min > md.endSubject - stack.currentFrame->args.subjectPtr) | |
1491 | RRETURN_NO_MATCH; | |
1492 | if (min > 0) { | |
1493 | switch (stack.currentFrame->locals.ctype) { | |
1494 | case OP_NOT_NEWLINE: | |
1495 | for (int i = 1; i <= min; i++) { | |
1496 | if (isNewline(*stack.currentFrame->args.subjectPtr)) | |
1497 | RRETURN_NO_MATCH; | |
1498 | ++stack.currentFrame->args.subjectPtr; | |
1499 | } | |
1500 | break; | |
1501 | ||
1502 | case OP_NOT_DIGIT: | |
1503 | for (int i = 1; i <= min; i++) { | |
1504 | if (isASCIIDigit(*stack.currentFrame->args.subjectPtr)) | |
1505 | RRETURN_NO_MATCH; | |
1506 | ++stack.currentFrame->args.subjectPtr; | |
1507 | } | |
1508 | break; | |
1509 | ||
1510 | case OP_DIGIT: | |
1511 | for (int i = 1; i <= min; i++) { | |
1512 | if (!isASCIIDigit(*stack.currentFrame->args.subjectPtr)) | |
1513 | RRETURN_NO_MATCH; | |
1514 | ++stack.currentFrame->args.subjectPtr; | |
1515 | } | |
1516 | break; | |
1517 | ||
1518 | case OP_NOT_WHITESPACE: | |
1519 | for (int i = 1; i <= min; i++) { | |
1520 | if (isSpaceChar(*stack.currentFrame->args.subjectPtr)) | |
1521 | RRETURN_NO_MATCH; | |
1522 | ++stack.currentFrame->args.subjectPtr; | |
1523 | } | |
1524 | break; | |
1525 | ||
1526 | case OP_WHITESPACE: | |
1527 | for (int i = 1; i <= min; i++) { | |
1528 | if (!isSpaceChar(*stack.currentFrame->args.subjectPtr)) | |
1529 | RRETURN_NO_MATCH; | |
1530 | ++stack.currentFrame->args.subjectPtr; | |
1531 | } | |
1532 | break; | |
1533 | ||
1534 | case OP_NOT_WORDCHAR: | |
1535 | for (int i = 1; i <= min; i++) { | |
1536 | if (isWordChar(*stack.currentFrame->args.subjectPtr)) | |
1537 | RRETURN_NO_MATCH; | |
1538 | ++stack.currentFrame->args.subjectPtr; | |
1539 | } | |
1540 | break; | |
1541 | ||
1542 | case OP_WORDCHAR: | |
1543 | for (int i = 1; i <= min; i++) { | |
1544 | if (!isWordChar(*stack.currentFrame->args.subjectPtr)) | |
1545 | RRETURN_NO_MATCH; | |
1546 | ++stack.currentFrame->args.subjectPtr; | |
1547 | } | |
1548 | break; | |
1549 | ||
1550 | default: | |
1551 | ASSERT_NOT_REACHED(); | |
1552 | return matchError(JSRegExpErrorInternal, stack); | |
1553 | } /* End switch(stack.currentFrame->locals.ctype) */ | |
1554 | } | |
1555 | ||
1556 | /* If min = max, continue at the same level without recursing */ | |
1557 | ||
1558 | if (min == stack.currentFrame->locals.max) | |
1559 | NEXT_OPCODE; | |
1560 | ||
1561 | /* If minimizing, we have to test the rest of the pattern before each | |
1562 | subsequent match. */ | |
1563 | ||
1564 | if (minimize) { | |
1565 | for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { | |
1566 | RECURSIVE_MATCH(48, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
1567 | if (isMatch) | |
1568 | RRETURN; | |
1569 | if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1570 | RRETURN; | |
1571 | ||
1572 | int c = *stack.currentFrame->args.subjectPtr++; | |
1573 | switch (stack.currentFrame->locals.ctype) { | |
1574 | case OP_NOT_NEWLINE: | |
1575 | if (isNewline(c)) | |
1576 | RRETURN; | |
1577 | break; | |
1578 | ||
1579 | case OP_NOT_DIGIT: | |
1580 | if (isASCIIDigit(c)) | |
1581 | RRETURN; | |
1582 | break; | |
1583 | ||
1584 | case OP_DIGIT: | |
1585 | if (!isASCIIDigit(c)) | |
1586 | RRETURN; | |
1587 | break; | |
1588 | ||
1589 | case OP_NOT_WHITESPACE: | |
1590 | if (isSpaceChar(c)) | |
1591 | RRETURN; | |
1592 | break; | |
1593 | ||
1594 | case OP_WHITESPACE: | |
1595 | if (!isSpaceChar(c)) | |
1596 | RRETURN; | |
1597 | break; | |
1598 | ||
1599 | case OP_NOT_WORDCHAR: | |
1600 | if (isWordChar(c)) | |
1601 | RRETURN; | |
1602 | break; | |
1603 | ||
1604 | case OP_WORDCHAR: | |
1605 | if (!isWordChar(c)) | |
1606 | RRETURN; | |
1607 | break; | |
1608 | ||
1609 | default: | |
1610 | ASSERT_NOT_REACHED(); | |
1611 | return matchError(JSRegExpErrorInternal, stack); | |
1612 | } | |
1613 | } | |
1614 | /* Control never reaches here */ | |
1615 | } | |
1616 | ||
1617 | /* If maximizing it is worth using inline code for speed, doing the type | |
1618 | test once at the start (i.e. keep it out of the loop). */ | |
1619 | ||
1620 | else { | |
1621 | stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; /* Remember where we started */ | |
1622 | ||
1623 | switch (stack.currentFrame->locals.ctype) { | |
1624 | case OP_NOT_NEWLINE: | |
1625 | for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
1626 | if (stack.currentFrame->args.subjectPtr >= md.endSubject || isNewline(*stack.currentFrame->args.subjectPtr)) | |
1627 | break; | |
1628 | stack.currentFrame->args.subjectPtr++; | |
1629 | } | |
1630 | break; | |
1631 | ||
1632 | case OP_NOT_DIGIT: | |
1633 | for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
1634 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1635 | break; | |
1636 | int c = *stack.currentFrame->args.subjectPtr; | |
1637 | if (isASCIIDigit(c)) | |
1638 | break; | |
1639 | ++stack.currentFrame->args.subjectPtr; | |
1640 | } | |
1641 | break; | |
1642 | ||
1643 | case OP_DIGIT: | |
1644 | for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
1645 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1646 | break; | |
1647 | int c = *stack.currentFrame->args.subjectPtr; | |
1648 | if (!isASCIIDigit(c)) | |
1649 | break; | |
1650 | ++stack.currentFrame->args.subjectPtr; | |
1651 | } | |
1652 | break; | |
1653 | ||
1654 | case OP_NOT_WHITESPACE: | |
1655 | for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
1656 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1657 | break; | |
1658 | int c = *stack.currentFrame->args.subjectPtr; | |
1659 | if (isSpaceChar(c)) | |
1660 | break; | |
1661 | ++stack.currentFrame->args.subjectPtr; | |
1662 | } | |
1663 | break; | |
1664 | ||
1665 | case OP_WHITESPACE: | |
1666 | for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
1667 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1668 | break; | |
1669 | int c = *stack.currentFrame->args.subjectPtr; | |
1670 | if (!isSpaceChar(c)) | |
1671 | break; | |
1672 | ++stack.currentFrame->args.subjectPtr; | |
1673 | } | |
1674 | break; | |
1675 | ||
1676 | case OP_NOT_WORDCHAR: | |
1677 | for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
1678 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1679 | break; | |
1680 | int c = *stack.currentFrame->args.subjectPtr; | |
1681 | if (isWordChar(c)) | |
1682 | break; | |
1683 | ++stack.currentFrame->args.subjectPtr; | |
1684 | } | |
1685 | break; | |
1686 | ||
1687 | case OP_WORDCHAR: | |
1688 | for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
1689 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1690 | break; | |
1691 | int c = *stack.currentFrame->args.subjectPtr; | |
1692 | if (!isWordChar(c)) | |
1693 | break; | |
1694 | ++stack.currentFrame->args.subjectPtr; | |
1695 | } | |
1696 | break; | |
1697 | ||
1698 | default: | |
1699 | ASSERT_NOT_REACHED(); | |
1700 | return matchError(JSRegExpErrorInternal, stack); | |
1701 | } | |
1702 | ||
1703 | /* stack.currentFrame->args.subjectPtr is now past the end of the maximum run */ | |
1704 | ||
1705 | for (;;) { | |
1706 | RECURSIVE_MATCH(52, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); | |
1707 | if (isMatch) | |
1708 | RRETURN; | |
1709 | if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) | |
1710 | break; /* Stop if tried at original pos */ | |
1711 | } | |
1712 | ||
1713 | /* Get here if we can't make it match with any permitted repetitions */ | |
1714 | ||
1715 | RRETURN; | |
1716 | } | |
1717 | /* Control never reaches here */ | |
1718 | ||
1719 | BEGIN_OPCODE(CRMINPLUS): | |
1720 | BEGIN_OPCODE(CRMINQUERY): | |
1721 | BEGIN_OPCODE(CRMINRANGE): | |
1722 | BEGIN_OPCODE(CRMINSTAR): | |
1723 | BEGIN_OPCODE(CRPLUS): | |
1724 | BEGIN_OPCODE(CRQUERY): | |
1725 | BEGIN_OPCODE(CRRANGE): | |
1726 | BEGIN_OPCODE(CRSTAR): | |
1727 | ASSERT_NOT_REACHED(); | |
1728 | return matchError(JSRegExpErrorInternal, stack); | |
1729 | ||
1730 | #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP | |
1731 | CAPTURING_BRACKET: | |
1732 | #else | |
1733 | default: | |
1734 | #endif | |
1735 | /* Opening capturing bracket. If there is space in the offset vector, save | |
1736 | the current subject position in the working slot at the top of the vector. We | |
1737 | mustn't change the current values of the data slot, because they may be set | |
1738 | from a previous iteration of this group, and be referred to by a reference | |
1739 | inside the group. | |
1740 | ||
1741 | If the bracket fails to match, we need to restore this value and also the | |
1742 | values of the final offsets, in case they were set by a previous iteration of | |
1743 | the same bracket. | |
1744 | ||
1745 | If there isn't enough space in the offset vector, treat this as if it were a | |
1746 | non-capturing bracket. Don't worry about setting the flag for the error case | |
1747 | here; that is handled in the code for KET. */ | |
1748 | ||
1749 | ASSERT(*stack.currentFrame->args.instructionPtr > OP_BRA); | |
1750 | ||
1751 | stack.currentFrame->locals.number = *stack.currentFrame->args.instructionPtr - OP_BRA; | |
1752 | ||
1753 | /* For extended extraction brackets (large number), we have to fish out the | |
1754 | number from a dummy opcode at the start. */ | |
1755 | ||
1756 | if (stack.currentFrame->locals.number > EXTRACT_BASIC_MAX) | |
1757 | stack.currentFrame->locals.number = get2ByteValue(stack.currentFrame->args.instructionPtr + 2 + LINK_SIZE); | |
1758 | stack.currentFrame->locals.offset = stack.currentFrame->locals.number << 1; | |
1759 | ||
1760 | #ifdef DEBUG | |
1761 | printf("start bracket %d subject=", stack.currentFrame->locals.number); | |
1762 | pchars(stack.currentFrame->args.subjectPtr, 16, true, md); | |
1763 | printf("\n"); | |
1764 | #endif | |
1765 | ||
1766 | if (stack.currentFrame->locals.offset < md.offsetMax) { | |
1767 | stack.currentFrame->locals.saveOffset1 = md.offsetVector[stack.currentFrame->locals.offset]; | |
1768 | stack.currentFrame->locals.saveOffset2 = md.offsetVector[stack.currentFrame->locals.offset + 1]; | |
1769 | stack.currentFrame->locals.saveOffset3 = md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number]; | |
1770 | ||
1771 | DPRINTF(("saving %d %d %d\n", stack.currentFrame->locals.saveOffset1, stack.currentFrame->locals.saveOffset2, stack.currentFrame->locals.saveOffset3)); | |
1772 | md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number] = stack.currentFrame->args.subjectPtr - md.startSubject; | |
1773 | ||
1774 | do { | |
9dae56ea | 1775 | RECURSIVE_MATCH_NEW_GROUP(1, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain); |
b37bf2e1 A |
1776 | if (isMatch) |
1777 | RRETURN; | |
1778 | stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1); | |
1779 | } while (*stack.currentFrame->args.instructionPtr == OP_ALT); | |
1780 | ||
1781 | DPRINTF(("bracket %d failed\n", stack.currentFrame->locals.number)); | |
1782 | ||
1783 | md.offsetVector[stack.currentFrame->locals.offset] = stack.currentFrame->locals.saveOffset1; | |
1784 | md.offsetVector[stack.currentFrame->locals.offset + 1] = stack.currentFrame->locals.saveOffset2; | |
1785 | md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number] = stack.currentFrame->locals.saveOffset3; | |
1786 | ||
1787 | RRETURN; | |
1788 | } | |
1789 | ||
1790 | /* Insufficient room for saving captured contents */ | |
1791 | ||
1792 | goto NON_CAPTURING_BRACKET; | |
1793 | } | |
1794 | ||
1795 | /* Do not stick any code in here without much thought; it is assumed | |
1796 | that "continue" in the code above comes out to here to repeat the main | |
1797 | loop. */ | |
1798 | ||
1799 | } /* End of main loop */ | |
1800 | ||
1801 | ASSERT_NOT_REACHED(); | |
1802 | ||
1803 | #ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION | |
1804 | ||
1805 | RRETURN_SWITCH: | |
1806 | switch (stack.currentFrame->returnLocation) { | |
1807 | case 0: goto RETURN; | |
1808 | case 1: goto RRETURN_1; | |
1809 | case 2: goto RRETURN_2; | |
1810 | case 6: goto RRETURN_6; | |
1811 | case 7: goto RRETURN_7; | |
1812 | case 14: goto RRETURN_14; | |
1813 | case 15: goto RRETURN_15; | |
1814 | case 16: goto RRETURN_16; | |
1815 | case 17: goto RRETURN_17; | |
1816 | case 18: goto RRETURN_18; | |
1817 | case 19: goto RRETURN_19; | |
1818 | case 20: goto RRETURN_20; | |
1819 | case 21: goto RRETURN_21; | |
1820 | case 22: goto RRETURN_22; | |
1821 | case 24: goto RRETURN_24; | |
1822 | case 26: goto RRETURN_26; | |
1823 | case 27: goto RRETURN_27; | |
1824 | case 28: goto RRETURN_28; | |
1825 | case 29: goto RRETURN_29; | |
1826 | case 30: goto RRETURN_30; | |
1827 | case 31: goto RRETURN_31; | |
1828 | case 38: goto RRETURN_38; | |
1829 | case 40: goto RRETURN_40; | |
1830 | case 42: goto RRETURN_42; | |
1831 | case 44: goto RRETURN_44; | |
1832 | case 48: goto RRETURN_48; | |
1833 | case 52: goto RRETURN_52; | |
1834 | } | |
1835 | ||
1836 | ASSERT_NOT_REACHED(); | |
1837 | return matchError(JSRegExpErrorInternal, stack); | |
1838 | ||
1839 | #endif | |
1840 | ||
1841 | RETURN: | |
1842 | return isMatch; | |
1843 | } | |
1844 | ||
1845 | ||
1846 | /************************************************* | |
1847 | * Execute a Regular Expression * | |
1848 | *************************************************/ | |
1849 | ||
1850 | /* This function applies a compiled re to a subject string and picks out | |
1851 | portions of the string if it matches. Two elements in the vector are set for | |
1852 | each substring: the offsets to the start and end of the substring. | |
1853 | ||
1854 | Arguments: | |
1855 | re points to the compiled expression | |
1856 | extra_data points to extra data or is NULL | |
1857 | subject points to the subject string | |
1858 | length length of subject string (may contain binary zeros) | |
1859 | start_offset where to start in the subject string | |
1860 | options option bits | |
1861 | offsets points to a vector of ints to be filled in with offsets | |
9dae56ea | 1862 | offsetCount the number of elements in the vector |
b37bf2e1 A |
1863 | |
1864 | Returns: > 0 => success; value is the number of elements filled in | |
1865 | = 0 => success, but offsets is not big enough | |
1866 | -1 => failed to match | |
1867 | < -1 => some kind of unexpected problem | |
1868 | */ | |
1869 | ||
9dae56ea | 1870 | static void tryFirstByteOptimization(const UChar*& subjectPtr, const UChar* endSubject, int firstByte, bool firstByteIsCaseless, bool useMultiLineFirstCharOptimization, const UChar* originalSubjectStart) |
b37bf2e1 | 1871 | { |
9dae56ea | 1872 | // If firstByte is set, try scanning to the first instance of that byte |
b37bf2e1 | 1873 | // no need to try and match against any earlier part of the subject string. |
9dae56ea A |
1874 | if (firstByte >= 0) { |
1875 | UChar firstChar = firstByte; | |
1876 | if (firstByteIsCaseless) | |
b37bf2e1 A |
1877 | while (subjectPtr < endSubject) { |
1878 | int c = *subjectPtr; | |
1879 | if (c > 127) | |
1880 | break; | |
9dae56ea | 1881 | if (toLowerCase(c) == firstChar) |
b37bf2e1 A |
1882 | break; |
1883 | subjectPtr++; | |
1884 | } | |
1885 | else { | |
9dae56ea | 1886 | while (subjectPtr < endSubject && *subjectPtr != firstChar) |
b37bf2e1 A |
1887 | subjectPtr++; |
1888 | } | |
1889 | } else if (useMultiLineFirstCharOptimization) { | |
1890 | /* Or to just after \n for a multiline match if possible */ | |
1891 | // I'm not sure why this != originalSubjectStart check is necessary -- ecs 11/18/07 | |
1892 | if (subjectPtr > originalSubjectStart) { | |
1893 | while (subjectPtr < endSubject && !isNewline(subjectPtr[-1])) | |
1894 | subjectPtr++; | |
1895 | } | |
1896 | } | |
1897 | } | |
1898 | ||
9dae56ea | 1899 | static bool tryRequiredByteOptimization(const UChar*& subjectPtr, const UChar* endSubject, int reqByte, int reqByte2, bool reqByteIsCaseless, bool hasFirstByte, const UChar*& reqBytePtr) |
b37bf2e1 | 1900 | { |
9dae56ea A |
1901 | /* If reqByte is set, we know that that character must appear in the subject |
1902 | for the match to succeed. If the first character is set, reqByte must be | |
b37bf2e1 A |
1903 | later in the subject; otherwise the test starts at the match point. This |
1904 | optimization can save a huge amount of backtracking in patterns with nested | |
1905 | unlimited repeats that aren't going to match. Writing separate code for | |
1906 | cased/caseless versions makes it go faster, as does using an autoincrement | |
1907 | and backing off on a match. | |
1908 | ||
1909 | HOWEVER: when the subject string is very, very long, searching to its end can | |
1910 | take a long time, and give bad performance on quite ordinary patterns. This | |
1911 | showed up when somebody was matching /^C/ on a 32-megabyte string... so we | |
1912 | don't do this when the string is sufficiently long. | |
1913 | */ | |
1914 | ||
9dae56ea | 1915 | if (reqByte >= 0 && endSubject - subjectPtr < REQ_BYTE_MAX) { |
b37bf2e1 A |
1916 | const UChar* p = subjectPtr + (hasFirstByte ? 1 : 0); |
1917 | ||
1918 | /* We don't need to repeat the search if we haven't yet reached the | |
1919 | place we found it at last time. */ | |
1920 | ||
1921 | if (p > reqBytePtr) { | |
9dae56ea | 1922 | if (reqByteIsCaseless) { |
b37bf2e1 A |
1923 | while (p < endSubject) { |
1924 | int pp = *p++; | |
9dae56ea | 1925 | if (pp == reqByte || pp == reqByte2) { |
b37bf2e1 A |
1926 | p--; |
1927 | break; | |
1928 | } | |
1929 | } | |
1930 | } else { | |
1931 | while (p < endSubject) { | |
9dae56ea | 1932 | if (*p++ == reqByte) { |
b37bf2e1 A |
1933 | p--; |
1934 | break; | |
1935 | } | |
1936 | } | |
1937 | } | |
1938 | ||
1939 | /* If we can't find the required character, break the matching loop */ | |
1940 | ||
1941 | if (p >= endSubject) | |
1942 | return true; | |
1943 | ||
1944 | /* If we have found the required character, save the point where we | |
1945 | found it, so that we don't search again next time round the loop if | |
1946 | the start hasn't passed this character yet. */ | |
1947 | ||
1948 | reqBytePtr = p; | |
1949 | } | |
1950 | } | |
1951 | return false; | |
1952 | } | |
1953 | ||
1954 | int jsRegExpExecute(const JSRegExp* re, | |
1955 | const UChar* subject, int length, int start_offset, int* offsets, | |
9dae56ea | 1956 | int offsetCount) |
b37bf2e1 A |
1957 | { |
1958 | ASSERT(re); | |
9dae56ea A |
1959 | ASSERT(subject || !length); |
1960 | ASSERT(offsetCount >= 0); | |
1961 | ASSERT(offsets || offsetCount == 0); | |
1962 | ||
1963 | HistogramTimeLogger logger(re); | |
1964 | ||
b37bf2e1 A |
1965 | MatchData matchBlock; |
1966 | matchBlock.startSubject = subject; | |
1967 | matchBlock.endSubject = matchBlock.startSubject + length; | |
1968 | const UChar* endSubject = matchBlock.endSubject; | |
1969 | ||
1970 | matchBlock.multiline = (re->options & MatchAcrossMultipleLinesOption); | |
1971 | matchBlock.ignoreCase = (re->options & IgnoreCaseOption); | |
1972 | ||
1973 | /* If the expression has got more back references than the offsets supplied can | |
1974 | hold, we get a temporary chunk of working store to use during the matching. | |
1975 | Otherwise, we can use the vector supplied, rounding down its size to a multiple | |
1976 | of 3. */ | |
1977 | ||
9dae56ea | 1978 | int ocount = offsetCount - (offsetCount % 3); |
b37bf2e1 A |
1979 | |
1980 | // FIXME: This is lame that we have to second-guess our caller here. | |
1981 | // The API should change to either fail-hard when we don't have enough offset space | |
1982 | // or that we shouldn't ask our callers to pre-allocate in the first place. | |
9dae56ea A |
1983 | bool usingTemporaryOffsets = false; |
1984 | if (re->topBackref > 0 && re->topBackref >= ocount/3) { | |
1985 | ocount = re->topBackref * 3 + 3; | |
b37bf2e1 A |
1986 | matchBlock.offsetVector = new int[ocount]; |
1987 | if (!matchBlock.offsetVector) | |
1988 | return JSRegExpErrorNoMemory; | |
9dae56ea | 1989 | usingTemporaryOffsets = true; |
b37bf2e1 A |
1990 | } else |
1991 | matchBlock.offsetVector = offsets; | |
1992 | ||
1993 | matchBlock.offsetEnd = ocount; | |
1994 | matchBlock.offsetMax = (2*ocount)/3; | |
1995 | matchBlock.offsetOverflow = false; | |
1996 | ||
1997 | /* Compute the minimum number of offsets that we need to reset each time. Doing | |
1998 | this makes a huge difference to execution time when there aren't many brackets | |
1999 | in the pattern. */ | |
2000 | ||
9dae56ea A |
2001 | int resetCount = 2 + re->topBracket * 2; |
2002 | if (resetCount > offsetCount) | |
2003 | resetCount = ocount; | |
b37bf2e1 A |
2004 | |
2005 | /* Reset the working variable associated with each extraction. These should | |
2006 | never be used unless previously set, but they get saved and restored, and so we | |
2007 | initialize them to avoid reading uninitialized locations. */ | |
2008 | ||
2009 | if (matchBlock.offsetVector) { | |
2010 | int* iptr = matchBlock.offsetVector + ocount; | |
9dae56ea | 2011 | int* iend = iptr - resetCount/2 + 1; |
b37bf2e1 A |
2012 | while (--iptr >= iend) |
2013 | *iptr = -1; | |
2014 | } | |
2015 | ||
9dae56ea | 2016 | /* Set up the first character to match, if available. The firstByte value is |
b37bf2e1 A |
2017 | never set for an anchored regular expression, but the anchoring may be forced |
2018 | at run time, so we have to test for anchoring. The first char may be unset for | |
2019 | an unanchored pattern, of course. If there's no first char and the pattern was | |
2020 | studied, there may be a bitmap of possible first characters. */ | |
2021 | ||
9dae56ea A |
2022 | bool firstByteIsCaseless = false; |
2023 | int firstByte = -1; | |
b37bf2e1 | 2024 | if (re->options & UseFirstByteOptimizationOption) { |
9dae56ea A |
2025 | firstByte = re->firstByte & 255; |
2026 | if ((firstByteIsCaseless = (re->firstByte & REQ_IGNORE_CASE))) | |
2027 | firstByte = toLowerCase(firstByte); | |
b37bf2e1 A |
2028 | } |
2029 | ||
2030 | /* For anchored or unanchored matches, there may be a "last known required | |
2031 | character" set. */ | |
2032 | ||
9dae56ea A |
2033 | bool reqByteIsCaseless = false; |
2034 | int reqByte = -1; | |
2035 | int reqByte2 = -1; | |
b37bf2e1 | 2036 | if (re->options & UseRequiredByteOptimizationOption) { |
9dae56ea A |
2037 | reqByte = re->reqByte & 255; // FIXME: This optimization could be made to work for UTF16 chars as well... |
2038 | reqByteIsCaseless = (re->reqByte & REQ_IGNORE_CASE); | |
2039 | reqByte2 = flipCase(reqByte); | |
b37bf2e1 A |
2040 | } |
2041 | ||
2042 | /* Loop for handling unanchored repeated matching attempts; for anchored regexs | |
2043 | the loop runs just once. */ | |
2044 | ||
2045 | const UChar* startMatch = subject + start_offset; | |
2046 | const UChar* reqBytePtr = startMatch - 1; | |
2047 | bool useMultiLineFirstCharOptimization = re->options & UseMultiLineFirstByteOptimizationOption; | |
2048 | ||
2049 | do { | |
2050 | /* Reset the maximum number of extractions we might see. */ | |
2051 | if (matchBlock.offsetVector) { | |
2052 | int* iptr = matchBlock.offsetVector; | |
9dae56ea | 2053 | int* iend = iptr + resetCount; |
b37bf2e1 A |
2054 | while (iptr < iend) |
2055 | *iptr++ = -1; | |
2056 | } | |
2057 | ||
9dae56ea A |
2058 | tryFirstByteOptimization(startMatch, endSubject, firstByte, firstByteIsCaseless, useMultiLineFirstCharOptimization, matchBlock.startSubject + start_offset); |
2059 | if (tryRequiredByteOptimization(startMatch, endSubject, reqByte, reqByte2, reqByteIsCaseless, firstByte >= 0, reqBytePtr)) | |
b37bf2e1 A |
2060 | break; |
2061 | ||
2062 | /* When a match occurs, substrings will be set for all internal extractions; | |
2063 | we just need to set up the whole thing as substring 0 before returning. If | |
2064 | there were too many extractions, set the return code to zero. In the case | |
2065 | where we had to get some local store to hold offsets for backreferences, copy | |
2066 | those back references that we can. In this case there need not be overflow | |
2067 | if certain parts of the pattern were not used. */ | |
2068 | ||
2069 | /* The code starts after the JSRegExp block and the capture name table. */ | |
2070 | const unsigned char* start_code = (const unsigned char*)(re + 1); | |
2071 | ||
2072 | int returnCode = match(startMatch, start_code, 2, matchBlock); | |
2073 | ||
2074 | /* When the result is no match, advance the pointer to the next character | |
2075 | and continue. */ | |
2076 | if (returnCode == 0) { | |
2077 | startMatch++; | |
2078 | continue; | |
2079 | } | |
2080 | ||
2081 | if (returnCode != 1) { | |
2082 | ASSERT(returnCode == JSRegExpErrorHitLimit || returnCode == JSRegExpErrorNoMemory); | |
9dae56ea | 2083 | DPRINTF((">>>> error: returning %d\n", returnCode)); |
b37bf2e1 A |
2084 | return returnCode; |
2085 | } | |
2086 | ||
2087 | /* We have a match! Copy the offset information from temporary store if | |
2088 | necessary */ | |
2089 | ||
9dae56ea A |
2090 | if (usingTemporaryOffsets) { |
2091 | if (offsetCount >= 4) { | |
2092 | memcpy(offsets + 2, matchBlock.offsetVector + 2, (offsetCount - 2) * sizeof(int)); | |
b37bf2e1 A |
2093 | DPRINTF(("Copied offsets from temporary memory\n")); |
2094 | } | |
9dae56ea | 2095 | if (matchBlock.endOffsetTop > offsetCount) |
b37bf2e1 A |
2096 | matchBlock.offsetOverflow = true; |
2097 | ||
2098 | DPRINTF(("Freeing temporary memory\n")); | |
2099 | delete [] matchBlock.offsetVector; | |
2100 | } | |
2101 | ||
2102 | returnCode = matchBlock.offsetOverflow ? 0 : matchBlock.endOffsetTop / 2; | |
2103 | ||
9dae56ea | 2104 | if (offsetCount < 2) |
b37bf2e1 A |
2105 | returnCode = 0; |
2106 | else { | |
2107 | offsets[0] = startMatch - matchBlock.startSubject; | |
2108 | offsets[1] = matchBlock.endMatchPtr - matchBlock.startSubject; | |
2109 | } | |
2110 | ||
9dae56ea | 2111 | DPRINTF((">>>> returning %d\n", returnCode)); |
b37bf2e1 | 2112 | return returnCode; |
9dae56ea | 2113 | } while (!(re->options & IsAnchoredOption) && startMatch <= endSubject); |
b37bf2e1 | 2114 | |
9dae56ea | 2115 | if (usingTemporaryOffsets) { |
b37bf2e1 A |
2116 | DPRINTF(("Freeing temporary memory\n")); |
2117 | delete [] matchBlock.offsetVector; | |
2118 | } | |
2119 | ||
2120 | DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n")); | |
2121 | return JSRegExpErrorNoMatch; | |
2122 | } | |
9dae56ea A |
2123 | |
2124 | #if REGEXP_HISTOGRAM | |
2125 | ||
2126 | class CompareHistogramEntries { | |
2127 | public: | |
2128 | bool operator()(const pair<UString, double>& a, const pair<UString, double>& b) | |
2129 | { | |
2130 | if (a.second == b.second) | |
2131 | return a.first < b.first; | |
2132 | return a.second < b.second; | |
2133 | } | |
2134 | }; | |
2135 | ||
2136 | Histogram::~Histogram() | |
2137 | { | |
2138 | Vector<pair<UString, double> > values; | |
2139 | Map::iterator end = times.end(); | |
2140 | for (Map::iterator it = times.begin(); it != end; ++it) | |
2141 | values.append(*it); | |
2142 | sort(values.begin(), values.end(), CompareHistogramEntries()); | |
2143 | size_t size = values.size(); | |
2144 | printf("Regular Expressions, sorted by time spent evaluating them:\n"); | |
2145 | for (size_t i = 0; i < size; ++i) | |
2146 | printf(" %f - %s\n", values[size - i - 1].second, values[size - i - 1].first.UTF8String().c_str()); | |
2147 | } | |
2148 | ||
2149 | void Histogram::add(const JSRegExp* re, double elapsedTime) | |
2150 | { | |
2151 | UString string(reinterpret_cast<const UChar*>(reinterpret_cast<const char*>(re) + re->stringOffset), re->stringLength); | |
2152 | if (re->options & IgnoreCaseOption && re->options & MatchAcrossMultipleLinesOption) | |
2153 | string += " (multi-line, ignore case)"; | |
2154 | else { | |
2155 | if (re->options & IgnoreCaseOption) | |
2156 | string += " (ignore case)"; | |
2157 | if (re->options & MatchAcrossMultipleLinesOption) | |
2158 | string += " (multi-line)"; | |
2159 | } | |
2160 | pair<Map::iterator, bool> result = times.add(string.rep(), elapsedTime); | |
2161 | if (!result.second) | |
2162 | result.first->second += elapsedTime; | |
2163 | } | |
2164 | ||
2165 | HistogramTimeLogger::HistogramTimeLogger(const JSRegExp* re) | |
2166 | : m_re(re) | |
f9bf01c6 | 2167 | , m_startTime(currentTimeMS()) |
9dae56ea A |
2168 | { |
2169 | } | |
2170 | ||
2171 | HistogramTimeLogger::~HistogramTimeLogger() | |
2172 | { | |
2173 | static Histogram histogram; | |
f9bf01c6 | 2174 | histogram.add(m_re, currentTimeMS() - m_startTime); |
9dae56ea A |
2175 | } |
2176 | ||
2177 | #endif |