]> git.saurik.com Git - apple/javascriptcore.git/blame - pcre/pcre_exec.cpp
JavaScriptCore-466.1.6.tar.gz
[apple/javascriptcore.git] / pcre / pcre_exec.cpp
CommitLineData
b37bf2e1
A
1/* This is JavaScriptCore's variant of the PCRE library. While this library
2started out as a copy of PCRE, many of the features of PCRE have been
3removed. This library now supports only the regular expression features
4required by the JavaScript language specification, and has only the functions
5needed by JavaScriptCore and the rest of WebKit.
6
7 Originally written by Philip Hazel
8 Copyright (c) 1997-2006 University of Cambridge
9 Copyright (C) 2002, 2004, 2006, 2007 Apple Inc. All rights reserved.
10 Copyright (C) 2007 Eric Seidel <eric@webkit.org>
11
12-----------------------------------------------------------------------------
13Redistribution and use in source and binary forms, with or without
14modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37POSSIBILITY OF SUCH DAMAGE.
38-----------------------------------------------------------------------------
39*/
40
41/* This module contains jsRegExpExecute(), the externally visible function
42that does pattern matching using an NFA algorithm, following the rules from
43the JavaScript specification. There are also some supporting functions. */
44
45#include "config.h"
46
47#include "pcre_internal.h"
48
49#include <wtf/ASCIICType.h>
50#include <wtf/Vector.h>
51
52#include <limits.h>
53
54using namespace WTF;
55
56#ifdef __GNUC__
57#define USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
58//#define USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
59#endif
60
61/* Avoid warnings on Windows. */
62#undef min
63#undef max
64
65#ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
66typedef int ReturnLocation;
67#else
68typedef void* ReturnLocation;
69#endif
70
71/* Structure for building a chain of data for holding the values of
72the subject pointer at the start of each bracket, used to detect when
73an empty string has been matched by a bracket to break infinite loops. */
74struct BracketChainNode {
75 BracketChainNode* previousBracket;
76 const UChar* bracketStart;
77};
78
79struct MatchFrame {
80 ReturnLocation returnLocation;
81 struct MatchFrame* previousFrame;
82
83 /* Function arguments that may change */
84 struct {
85 const UChar* subjectPtr;
86 const unsigned char* instructionPtr;
87 int offsetTop;
88 BracketChainNode* bracketChain;
89 } args;
90
91
92 /* PCRE uses "fake" recursion built off of gotos, thus
93 stack-based local variables are not safe to use. Instead we have to
94 store local variables on the current MatchFrame. */
95 struct {
96 const unsigned char* data;
97 const unsigned char* startOfRepeatingBracket;
98 const UChar* subjectPtrAtStartOfInstruction; // Several instrutions stash away a subjectPtr here for later compare
99 const unsigned char* instructionPtrAtStartOfOnce;
100
101 int repeatOthercase;
102
103 int ctype;
104 int fc;
105 int fi;
106 int length;
107 int max;
108 int number;
109 int offset;
110 int saveOffset1;
111 int saveOffset2;
112 int saveOffset3;
113
114 BracketChainNode bracketChainNode;
115 } locals;
116};
117
118/* Structure for passing "static" information around between the functions
119doing traditional NFA matching, so that they are thread-safe. */
120
121struct MatchData {
122 int* offsetVector; /* Offset vector */
123 int offsetEnd; /* One past the end */
124 int offsetMax; /* The maximum usable for return data */
125 bool offsetOverflow; /* Set if too many extractions */
126 const UChar* startSubject; /* Start of the subject string */
127 const UChar* endSubject; /* End of the subject string */
128 const UChar* endMatchPtr; /* Subject position at end match */
129 int endOffsetTop; /* Highwater mark at end of match */
130 bool multiline;
131 bool ignoreCase;
132};
133
134/* The maximum remaining length of subject we are prepared to search for a
135req_byte match. */
136
137#define REQ_BYTE_MAX 1000
138
139/* The below limit restricts the number of "recursive" match calls in order to
140avoid spending exponential time on complex regular expressions. */
141
142static const unsigned matchLimit = 100000;
143
144#ifdef DEBUG
145/*************************************************
146* Debugging function to print chars *
147*************************************************/
148
149/* Print a sequence of chars in printable format, stopping at the end of the
150subject if the requested.
151
152Arguments:
153 p points to characters
154 length number to print
155 isSubject true if printing from within md.startSubject
156 md pointer to matching data block, if isSubject is true
157*/
158
159static void pchars(const UChar* p, int length, bool isSubject, const MatchData& md)
160{
161 if (isSubject && length > md.endSubject - p)
162 length = md.endSubject - p;
163 while (length-- > 0) {
164 int c;
165 if (isprint(c = *(p++)))
166 printf("%c", c);
167 else if (c < 256)
168 printf("\\x%02x", c);
169 else
170 printf("\\x{%x}", c);
171 }
172}
173#endif
174
175/*************************************************
176* Match a back-reference *
177*************************************************/
178
179/* If a back reference hasn't been set, the length that is passed is greater
180than the number of characters left in the string, so the match fails.
181
182Arguments:
183 offset index into the offset vector
184 subjectPtr points into the subject
185 length length to be matched
186 md points to match data block
187
188Returns: true if matched
189*/
190
191static bool matchRef(int offset, const UChar* subjectPtr, int length, const MatchData& md)
192{
193 const UChar* p = md.startSubject + md.offsetVector[offset];
194
195#ifdef DEBUG
196 if (subjectPtr >= md.endSubject)
197 printf("matching subject <null>");
198 else {
199 printf("matching subject ");
200 pchars(subjectPtr, length, true, md);
201 }
202 printf(" against backref ");
203 pchars(p, length, false, md);
204 printf("\n");
205#endif
206
207 /* Always fail if not enough characters left */
208
209 if (length > md.endSubject - subjectPtr)
210 return false;
211
212 /* Separate the caselesss case for speed */
213
214 if (md.ignoreCase) {
215 while (length-- > 0) {
216 UChar c = *p++;
217 int othercase = kjs_pcre_ucp_othercase(c);
218 UChar d = *subjectPtr++;
219 if (c != d && othercase != d)
220 return false;
221 }
222 }
223 else {
224 while (length-- > 0)
225 if (*p++ != *subjectPtr++)
226 return false;
227 }
228
229 return true;
230}
231
232#ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
233
234/* Use numbered labels and switch statement at the bottom of the match function. */
235
236#define RMATCH_WHERE(num) num
237#define RRETURN_LABEL RRETURN_SWITCH
238
239#else
240
241/* Use GCC's computed goto extension. */
242
243/* For one test case this is more than 40% faster than the switch statement.
244We could avoid the use of the num argument entirely by using local labels,
245but using it for the GCC case as well as the non-GCC case allows us to share
246a bit more code and notice if we use conflicting numbers.*/
247
248#define RMATCH_WHERE(num) &&RRETURN_##num
249#define RRETURN_LABEL *stack.currentFrame->returnLocation
250
251#endif
252
253#define RECURSIVE_MATCH_COMMON(num) \
254 goto RECURSE;\
255 RRETURN_##num: \
256 stack.popCurrentFrame();
257
258#define RECURSIVE_MATCH(num, ra, rb) \
259 do { \
260 stack.pushNewFrame((ra), (rb), RMATCH_WHERE(num)); \
261 RECURSIVE_MATCH_COMMON(num) \
262 } while (0)
263
264#define RECURSIVE_MATCH_STARTNG_NEW_GROUP(num, ra, rb) \
265 do { \
266 stack.pushNewFrame((ra), (rb), RMATCH_WHERE(num)); \
267 startNewGroup(stack.currentFrame); \
268 RECURSIVE_MATCH_COMMON(num) \
269 } while (0)
270
271#define RRETURN goto RRETURN_LABEL
272
273#define RRETURN_NO_MATCH do { isMatch = false; RRETURN; } while (0)
274
275/*************************************************
276* Match from current position *
277*************************************************/
278
279/* On entry instructionPtr points to the first opcode, and subjectPtr to the first character
280in the subject string, while substringStart holds the value of subjectPtr at the start of the
281last bracketed group - used for breaking infinite loops matching zero-length
282strings. This function is called recursively in many circumstances. Whenever it
283returns a negative (error) response, the outer match() call must also return the
284same response.
285
286Arguments:
287 subjectPtr pointer in subject
288 instructionPtr position in code
289 offsetTop current top pointer
290 md pointer to "static" info for the match
291
292Returns: 1 if matched ) these values are >= 0
293 0 if failed to match )
294 a negative error value if aborted by an error condition
295 (e.g. stopped by repeated call or recursion limit)
296*/
297
298static const unsigned FRAMES_ON_STACK = 16;
299
300struct MatchStack {
301 MatchStack()
302 : framesEnd(frames + FRAMES_ON_STACK)
303 , currentFrame(frames)
304 , size(1) // match() creates accesses the first frame w/o calling pushNewFrame
305 {
306 ASSERT((sizeof(frames) / sizeof(frames[0])) == FRAMES_ON_STACK);
307 }
308
309 MatchFrame frames[FRAMES_ON_STACK];
310 MatchFrame* framesEnd;
311 MatchFrame* currentFrame;
312 unsigned size;
313
314 inline bool canUseStackBufferForNextFrame()
315 {
316 return size < FRAMES_ON_STACK;
317 }
318
319 inline MatchFrame* allocateNextFrame()
320 {
321 if (canUseStackBufferForNextFrame())
322 return currentFrame + 1;
323 return new MatchFrame;
324 }
325
326 inline void pushNewFrame(const unsigned char* instructionPtr, BracketChainNode* bracketChain, ReturnLocation returnLocation)
327 {
328 MatchFrame* newframe = allocateNextFrame();
329 newframe->previousFrame = currentFrame;
330
331 newframe->args.subjectPtr = currentFrame->args.subjectPtr;
332 newframe->args.offsetTop = currentFrame->args.offsetTop;
333 newframe->args.instructionPtr = instructionPtr;
334 newframe->args.bracketChain = bracketChain;
335 newframe->returnLocation = returnLocation;
336 size++;
337
338 currentFrame = newframe;
339 }
340
341 inline void popCurrentFrame()
342 {
343 MatchFrame* oldFrame = currentFrame;
344 currentFrame = currentFrame->previousFrame;
345 if (size > FRAMES_ON_STACK)
346 delete oldFrame;
347 size--;
348 }
349
350 void popAllFrames()
351 {
352 while (size)
353 popCurrentFrame();
354 }
355};
356
357static int matchError(int errorCode, MatchStack& stack)
358{
359 stack.popAllFrames();
360 return errorCode;
361}
362
363/* Get the next UTF-8 character, not advancing the pointer, incrementing length
364 if there are extra bytes. This is called when we know we are in UTF-8 mode. */
365
366static inline void getUTF8CharAndIncrementLength(int& c, const unsigned char* subjectPtr, int& len)
367{
368 c = *subjectPtr;
369 if ((c & 0xc0) == 0xc0) {
370 int gcaa = kjs_pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */
371 int gcss = 6 * gcaa;
372 c = (c & kjs_pcre_utf8_table3[gcaa]) << gcss;
373 for (int gcii = 1; gcii <= gcaa; gcii++) {
374 gcss -= 6;
375 c |= (subjectPtr[gcii] & 0x3f) << gcss;
376 }
377 len += gcaa;
378 }
379}
380
381static inline void startNewGroup(MatchFrame* currentFrame)
382{
383 /* At the start of a bracketed group, add the current subject pointer to the
384 stack of such pointers, to be re-instated at the end of the group when we hit
385 the closing ket. When match() is called in other circumstances, we don't add to
386 this stack. */
387
388 currentFrame->locals.bracketChainNode.previousBracket = currentFrame->args.bracketChain;
389 currentFrame->locals.bracketChainNode.bracketStart = currentFrame->args.subjectPtr;
390 currentFrame->args.bracketChain = &currentFrame->locals.bracketChainNode;
391}
392
393// FIXME: "minimize" means "not greedy", we should invert the callers to ask for "greedy" to be less confusing
394static inline void repeatInformationFromInstructionOffset(short instructionOffset, bool& minimize, int& minimumRepeats, int& maximumRepeats)
395{
396 // Instruction offsets are based off of OP_CRSTAR, OP_STAR, OP_TYPESTAR, OP_NOTSTAR
397 static const char minimumRepeatsFromInstructionOffset[] = { 0, 0, 1, 1, 0, 0 };
398 static const int maximumRepeatsFromInstructionOffset[] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, 1, 1 };
399
400 ASSERT(instructionOffset >= 0);
401 ASSERT(instructionOffset <= (OP_CRMINQUERY - OP_CRSTAR));
402
403 minimize = (instructionOffset & 1); // this assumes ordering: Instruction, MinimizeInstruction, Instruction2, MinimizeInstruction2
404 minimumRepeats = minimumRepeatsFromInstructionOffset[instructionOffset];
405 maximumRepeats = maximumRepeatsFromInstructionOffset[instructionOffset];
406}
407
408static int match(const UChar* subjectPtr, const unsigned char* instructionPtr, int offsetTop, MatchData& md)
409{
410 bool isMatch = false;
411 int min;
412 bool minimize = false; /* Initialization not really needed, but some compilers think so. */
413 unsigned matchCount = 0;
414
415 MatchStack stack;
416
417 /* The opcode jump table. */
418#ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
419#define EMIT_JUMP_TABLE_ENTRY(opcode) &&LABEL_OP_##opcode,
420 static void* opcodeJumpTable[256] = { FOR_EACH_OPCODE(EMIT_JUMP_TABLE_ENTRY) };
421#undef EMIT_JUMP_TABLE_ENTRY
422#endif
423
424 /* One-time setup of the opcode jump table. */
425#ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
426 for (int i = 255; !opcodeJumpTable[i]; i--)
427 opcodeJumpTable[i] = &&CAPTURING_BRACKET;
428#endif
429
430#ifdef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
431 // Shark shows this as a hot line
432 // Using a static const here makes this line disappear, but makes later access hotter (not sure why)
433 stack.currentFrame->returnLocation = &&RETURN;
434#else
435 stack.currentFrame->returnLocation = 0;
436#endif
437 stack.currentFrame->args.subjectPtr = subjectPtr;
438 stack.currentFrame->args.instructionPtr = instructionPtr;
439 stack.currentFrame->args.offsetTop = offsetTop;
440 stack.currentFrame->args.bracketChain = 0;
441 startNewGroup(stack.currentFrame);
442
443 /* This is where control jumps back to to effect "recursion" */
444
445RECURSE:
446 if (++matchCount > matchLimit)
447 return matchError(JSRegExpErrorHitLimit, stack);
448
449 /* Now start processing the operations. */
450
451#ifndef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
452 while (true)
453#endif
454 {
455
456#ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
457#define BEGIN_OPCODE(opcode) LABEL_OP_##opcode
458#define NEXT_OPCODE goto *opcodeJumpTable[*stack.currentFrame->args.instructionPtr]
459#else
460#define BEGIN_OPCODE(opcode) case OP_##opcode
461#define NEXT_OPCODE continue
462#endif
463
464#ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
465 NEXT_OPCODE;
466#else
467 switch (*stack.currentFrame->args.instructionPtr)
468#endif
469 {
470 /* Non-capturing bracket: optimized */
471
472 BEGIN_OPCODE(BRA):
473 NON_CAPTURING_BRACKET:
474 DPRINTF(("start bracket 0\n"));
475 do {
476 RECURSIVE_MATCH_STARTNG_NEW_GROUP(2, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain);
477 if (isMatch)
478 RRETURN;
479 stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1);
480 } while (*stack.currentFrame->args.instructionPtr == OP_ALT);
481 DPRINTF(("bracket 0 failed\n"));
482 RRETURN;
483
484 /* Skip over large extraction number data if encountered. */
485
486 BEGIN_OPCODE(BRANUMBER):
487 stack.currentFrame->args.instructionPtr += 3;
488 NEXT_OPCODE;
489
490 /* End of the pattern. */
491
492 BEGIN_OPCODE(END):
493 md.endMatchPtr = stack.currentFrame->args.subjectPtr; /* Record where we ended */
494 md.endOffsetTop = stack.currentFrame->args.offsetTop; /* and how many extracts were taken */
495 isMatch = true;
496 RRETURN;
497
498 /* Assertion brackets. Check the alternative branches in turn - the
499 matching won't pass the KET for an assertion. If any one branch matches,
500 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
501 start of each branch to move the current point backwards, so the code at
502 this level is identical to the lookahead case. */
503
504 BEGIN_OPCODE(ASSERT):
505 do {
506 RECURSIVE_MATCH_STARTNG_NEW_GROUP(6, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, NULL);
507 if (isMatch)
508 break;
509 stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1);
510 } while (*stack.currentFrame->args.instructionPtr == OP_ALT);
511 if (*stack.currentFrame->args.instructionPtr == OP_KET)
512 RRETURN_NO_MATCH;
513
514 /* Continue from after the assertion, updating the offsets high water
515 mark, since extracts may have been taken during the assertion. */
516
517 advanceToEndOfBracket(stack.currentFrame->args.instructionPtr);
518 stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE;
519 stack.currentFrame->args.offsetTop = md.endOffsetTop;
520 NEXT_OPCODE;
521
522 /* Negative assertion: all branches must fail to match */
523
524 BEGIN_OPCODE(ASSERT_NOT):
525 do {
526 RECURSIVE_MATCH_STARTNG_NEW_GROUP(7, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, NULL);
527 if (isMatch)
528 RRETURN_NO_MATCH;
529 stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1);
530 } while (*stack.currentFrame->args.instructionPtr == OP_ALT);
531
532 stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE;
533 NEXT_OPCODE;
534
535 /* An alternation is the end of a branch; scan along to find the end of the
536 bracketed group and go to there. */
537
538 BEGIN_OPCODE(ALT):
539 advanceToEndOfBracket(stack.currentFrame->args.instructionPtr);
540 NEXT_OPCODE;
541
542 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
543 that it may occur zero times. It may repeat infinitely, or not at all -
544 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
545 repeat limits are compiled as a number of copies, with the optional ones
546 preceded by BRAZERO or BRAMINZERO. */
547
548 BEGIN_OPCODE(BRAZERO): {
549 stack.currentFrame->locals.startOfRepeatingBracket = stack.currentFrame->args.instructionPtr + 1;
550 RECURSIVE_MATCH_STARTNG_NEW_GROUP(14, stack.currentFrame->locals.startOfRepeatingBracket, stack.currentFrame->args.bracketChain);
551 if (isMatch)
552 RRETURN;
553 advanceToEndOfBracket(stack.currentFrame->locals.startOfRepeatingBracket);
554 stack.currentFrame->args.instructionPtr = stack.currentFrame->locals.startOfRepeatingBracket + 1 + LINK_SIZE;
555 NEXT_OPCODE;
556 }
557
558 BEGIN_OPCODE(BRAMINZERO): {
559 stack.currentFrame->locals.startOfRepeatingBracket = stack.currentFrame->args.instructionPtr + 1;
560 advanceToEndOfBracket(stack.currentFrame->locals.startOfRepeatingBracket);
561 RECURSIVE_MATCH_STARTNG_NEW_GROUP(15, stack.currentFrame->locals.startOfRepeatingBracket + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain);
562 if (isMatch)
563 RRETURN;
564 stack.currentFrame->args.instructionPtr++;
565 NEXT_OPCODE;
566 }
567
568 /* End of a group, repeated or non-repeating. If we are at the end of
569 an assertion "group", stop matching and return 1, but record the
570 current high water mark for use by positive assertions. Do this also
571 for the "once" (not-backup up) groups. */
572
573 BEGIN_OPCODE(KET):
574 BEGIN_OPCODE(KETRMIN):
575 BEGIN_OPCODE(KETRMAX):
576 stack.currentFrame->locals.instructionPtrAtStartOfOnce = stack.currentFrame->args.instructionPtr - getLinkValue(stack.currentFrame->args.instructionPtr + 1);
577 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.bracketChain->bracketStart;
578
579 /* Back up the stack of bracket start pointers. */
580
581 stack.currentFrame->args.bracketChain = stack.currentFrame->args.bracketChain->previousBracket;
582
583 if (*stack.currentFrame->locals.instructionPtrAtStartOfOnce == OP_ASSERT || *stack.currentFrame->locals.instructionPtrAtStartOfOnce == OP_ASSERT_NOT) {
584 md.endOffsetTop = stack.currentFrame->args.offsetTop;
585 isMatch = true;
586 RRETURN;
587 }
588
589 /* In all other cases except a conditional group we have to check the
590 group number back at the start and if necessary complete handling an
591 extraction by setting the offsets and bumping the high water mark. */
592
593 stack.currentFrame->locals.number = *stack.currentFrame->locals.instructionPtrAtStartOfOnce - OP_BRA;
594
595 /* For extended extraction brackets (large number), we have to fish out
596 the number from a dummy opcode at the start. */
597
598 if (stack.currentFrame->locals.number > EXTRACT_BASIC_MAX)
599 stack.currentFrame->locals.number = get2ByteValue(stack.currentFrame->locals.instructionPtrAtStartOfOnce + 2 + LINK_SIZE);
600 stack.currentFrame->locals.offset = stack.currentFrame->locals.number << 1;
601
602#ifdef DEBUG
603 printf("end bracket %d", stack.currentFrame->locals.number);
604 printf("\n");
605#endif
606
607 /* Test for a numbered group. This includes groups called as a result
608 of recursion. Note that whole-pattern recursion is coded as a recurse
609 into group 0, so it won't be picked up here. Instead, we catch it when
610 the OP_END is reached. */
611
612 if (stack.currentFrame->locals.number > 0) {
613 if (stack.currentFrame->locals.offset >= md.offsetMax)
614 md.offsetOverflow = true;
615 else {
616 md.offsetVector[stack.currentFrame->locals.offset] =
617 md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number];
618 md.offsetVector[stack.currentFrame->locals.offset+1] = stack.currentFrame->args.subjectPtr - md.startSubject;
619 if (stack.currentFrame->args.offsetTop <= stack.currentFrame->locals.offset)
620 stack.currentFrame->args.offsetTop = stack.currentFrame->locals.offset + 2;
621 }
622 }
623
624 /* For a non-repeating ket, just continue at this level. This also
625 happens for a repeating ket if no characters were matched in the group.
626 This is the forcible breaking of infinite loops as implemented in Perl
627 5.005. If there is an options reset, it will get obeyed in the normal
628 course of events. */
629
630 if (*stack.currentFrame->args.instructionPtr == OP_KET || stack.currentFrame->args.subjectPtr == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) {
631 stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE;
632 NEXT_OPCODE;
633 }
634
635 /* The repeating kets try the rest of the pattern or restart from the
636 preceding bracket, in the appropriate order. */
637
638 if (*stack.currentFrame->args.instructionPtr == OP_KETRMIN) {
639 RECURSIVE_MATCH(16, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain);
640 if (isMatch)
641 RRETURN;
642 RECURSIVE_MATCH_STARTNG_NEW_GROUP(17, stack.currentFrame->locals.instructionPtrAtStartOfOnce, stack.currentFrame->args.bracketChain);
643 if (isMatch)
644 RRETURN;
645 } else { /* OP_KETRMAX */
646 RECURSIVE_MATCH_STARTNG_NEW_GROUP(18, stack.currentFrame->locals.instructionPtrAtStartOfOnce, stack.currentFrame->args.bracketChain);
647 if (isMatch)
648 RRETURN;
649 RECURSIVE_MATCH(19, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain);
650 if (isMatch)
651 RRETURN;
652 }
653 RRETURN;
654
655 /* Start of subject, or after internal newline if multiline. */
656
657 BEGIN_OPCODE(CIRC):
658 if (stack.currentFrame->args.subjectPtr != md.startSubject && (!md.multiline || !isNewline(stack.currentFrame->args.subjectPtr[-1])))
659 RRETURN_NO_MATCH;
660 stack.currentFrame->args.instructionPtr++;
661 NEXT_OPCODE;
662
663 /* End of subject, or before internal newline if multiline. */
664
665 BEGIN_OPCODE(DOLL):
666 if (stack.currentFrame->args.subjectPtr < md.endSubject && (!md.multiline || !isNewline(*stack.currentFrame->args.subjectPtr)))
667 RRETURN_NO_MATCH;
668 stack.currentFrame->args.instructionPtr++;
669 NEXT_OPCODE;
670
671 /* Word boundary assertions */
672
673 BEGIN_OPCODE(NOT_WORD_BOUNDARY):
674 BEGIN_OPCODE(WORD_BOUNDARY): {
675 bool currentCharIsWordChar = false;
676 bool previousCharIsWordChar = false;
677
678 if (stack.currentFrame->args.subjectPtr > md.startSubject)
679 previousCharIsWordChar = isWordChar(stack.currentFrame->args.subjectPtr[-1]);
680 if (stack.currentFrame->args.subjectPtr < md.endSubject)
681 currentCharIsWordChar = isWordChar(*stack.currentFrame->args.subjectPtr);
682
683 /* Now see if the situation is what we want */
684 bool wordBoundaryDesired = (*stack.currentFrame->args.instructionPtr++ == OP_WORD_BOUNDARY);
685 if (wordBoundaryDesired ? currentCharIsWordChar == previousCharIsWordChar : currentCharIsWordChar != previousCharIsWordChar)
686 RRETURN_NO_MATCH;
687 NEXT_OPCODE;
688 }
689
690 /* Match a single character type; inline for speed */
691
692 BEGIN_OPCODE(NOT_NEWLINE):
693 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
694 RRETURN_NO_MATCH;
695 if (isNewline(*stack.currentFrame->args.subjectPtr++))
696 RRETURN_NO_MATCH;
697 stack.currentFrame->args.instructionPtr++;
698 NEXT_OPCODE;
699
700 BEGIN_OPCODE(NOT_DIGIT):
701 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
702 RRETURN_NO_MATCH;
703 if (isASCIIDigit(*stack.currentFrame->args.subjectPtr++))
704 RRETURN_NO_MATCH;
705 stack.currentFrame->args.instructionPtr++;
706 NEXT_OPCODE;
707
708 BEGIN_OPCODE(DIGIT):
709 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
710 RRETURN_NO_MATCH;
711 if (!isASCIIDigit(*stack.currentFrame->args.subjectPtr++))
712 RRETURN_NO_MATCH;
713 stack.currentFrame->args.instructionPtr++;
714 NEXT_OPCODE;
715
716 BEGIN_OPCODE(NOT_WHITESPACE):
717 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
718 RRETURN_NO_MATCH;
719 if (isSpaceChar(*stack.currentFrame->args.subjectPtr++))
720 RRETURN_NO_MATCH;
721 stack.currentFrame->args.instructionPtr++;
722 NEXT_OPCODE;
723
724 BEGIN_OPCODE(WHITESPACE):
725 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
726 RRETURN_NO_MATCH;
727 if (!isSpaceChar(*stack.currentFrame->args.subjectPtr++))
728 RRETURN_NO_MATCH;
729 stack.currentFrame->args.instructionPtr++;
730 NEXT_OPCODE;
731
732 BEGIN_OPCODE(NOT_WORDCHAR):
733 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
734 RRETURN_NO_MATCH;
735 if (isWordChar(*stack.currentFrame->args.subjectPtr++))
736 RRETURN_NO_MATCH;
737 stack.currentFrame->args.instructionPtr++;
738 NEXT_OPCODE;
739
740 BEGIN_OPCODE(WORDCHAR):
741 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
742 RRETURN_NO_MATCH;
743 if (!isWordChar(*stack.currentFrame->args.subjectPtr++))
744 RRETURN_NO_MATCH;
745 stack.currentFrame->args.instructionPtr++;
746 NEXT_OPCODE;
747
748 /* Match a back reference, possibly repeatedly. Look past the end of the
749 item to see if there is repeat information following. The code is similar
750 to that for character classes, but repeated for efficiency. Then obey
751 similar code to character type repeats - written out again for speed.
752 However, if the referenced string is the empty string, always treat
753 it as matched, any number of times (otherwise there could be infinite
754 loops). */
755
756 BEGIN_OPCODE(REF):
757 stack.currentFrame->locals.offset = get2ByteValue(stack.currentFrame->args.instructionPtr + 1) << 1; /* Doubled ref number */
758 stack.currentFrame->args.instructionPtr += 3; /* Advance past item */
759
760 /* If the reference is unset, set the length to be longer than the amount
761 of subject left; this ensures that every attempt at a match fails. We
762 can't just fail here, because of the possibility of quantifiers with zero
763 minima. */
764
765 if (stack.currentFrame->locals.offset >= stack.currentFrame->args.offsetTop || md.offsetVector[stack.currentFrame->locals.offset] < 0)
766 stack.currentFrame->locals.length = 0;
767 else
768 stack.currentFrame->locals.length = md.offsetVector[stack.currentFrame->locals.offset+1] - md.offsetVector[stack.currentFrame->locals.offset];
769
770 /* Set up for repetition, or handle the non-repeated case */
771
772 switch (*stack.currentFrame->args.instructionPtr) {
773 case OP_CRSTAR:
774 case OP_CRMINSTAR:
775 case OP_CRPLUS:
776 case OP_CRMINPLUS:
777 case OP_CRQUERY:
778 case OP_CRMINQUERY:
779 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max);
780 break;
781
782 case OP_CRRANGE:
783 case OP_CRMINRANGE:
784 minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE);
785 min = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
786 stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 3);
787 if (stack.currentFrame->locals.max == 0)
788 stack.currentFrame->locals.max = INT_MAX;
789 stack.currentFrame->args.instructionPtr += 5;
790 break;
791
792 default: /* No repeat follows */
793 if (!matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md))
794 RRETURN_NO_MATCH;
795 stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length;
796 NEXT_OPCODE;
797 }
798
799 /* If the length of the reference is zero, just continue with the
800 main loop. */
801
802 if (stack.currentFrame->locals.length == 0)
803 NEXT_OPCODE;
804
805 /* First, ensure the minimum number of matches are present. */
806
807 for (int i = 1; i <= min; i++) {
808 if (!matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md))
809 RRETURN_NO_MATCH;
810 stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length;
811 }
812
813 /* If min = max, continue at the same level without recursion.
814 They are not both allowed to be zero. */
815
816 if (min == stack.currentFrame->locals.max)
817 NEXT_OPCODE;
818
819 /* If minimizing, keep trying and advancing the pointer */
820
821 if (minimize) {
822 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
823 RECURSIVE_MATCH(20, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
824 if (isMatch)
825 RRETURN;
826 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || !matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md))
827 RRETURN;
828 stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length;
829 }
830 /* Control never reaches here */
831 }
832
833 /* If maximizing, find the longest string and work backwards */
834
835 else {
836 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
837 for (int i = min; i < stack.currentFrame->locals.max; i++) {
838 if (!matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md))
839 break;
840 stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length;
841 }
842 while (stack.currentFrame->args.subjectPtr >= stack.currentFrame->locals.subjectPtrAtStartOfInstruction) {
843 RECURSIVE_MATCH(21, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
844 if (isMatch)
845 RRETURN;
846 stack.currentFrame->args.subjectPtr -= stack.currentFrame->locals.length;
847 }
848 RRETURN_NO_MATCH;
849 }
850 /* Control never reaches here */
851
852 /* Match a bit-mapped character class, possibly repeatedly. This op code is
853 used when all the characters in the class have values in the range 0-255,
854 and either the matching is caseful, or the characters are in the range
855 0-127 when UTF-8 processing is enabled. The only difference between
856 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
857 encountered.
858
859 First, look past the end of the item to see if there is repeat information
860 following. Then obey similar code to character type repeats - written out
861 again for speed. */
862
863 BEGIN_OPCODE(NCLASS):
864 BEGIN_OPCODE(CLASS):
865 stack.currentFrame->locals.data = stack.currentFrame->args.instructionPtr + 1; /* Save for matching */
866 stack.currentFrame->args.instructionPtr += 33; /* Advance past the item */
867
868 switch (*stack.currentFrame->args.instructionPtr) {
869 case OP_CRSTAR:
870 case OP_CRMINSTAR:
871 case OP_CRPLUS:
872 case OP_CRMINPLUS:
873 case OP_CRQUERY:
874 case OP_CRMINQUERY:
875 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max);
876 break;
877
878 case OP_CRRANGE:
879 case OP_CRMINRANGE:
880 minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE);
881 min = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
882 stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 3);
883 if (stack.currentFrame->locals.max == 0)
884 stack.currentFrame->locals.max = INT_MAX;
885 stack.currentFrame->args.instructionPtr += 5;
886 break;
887
888 default: /* No repeat follows */
889 min = stack.currentFrame->locals.max = 1;
890 break;
891 }
892
893 /* First, ensure the minimum number of matches are present. */
894
895 for (int i = 1; i <= min; i++) {
896 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
897 RRETURN_NO_MATCH;
898 int c = *stack.currentFrame->args.subjectPtr++;
899 if (c > 255) {
900 if (stack.currentFrame->locals.data[-1] == OP_CLASS)
901 RRETURN_NO_MATCH;
902 } else {
903 if (!(stack.currentFrame->locals.data[c / 8] & (1 << (c & 7))))
904 RRETURN_NO_MATCH;
905 }
906 }
907
908 /* If max == min we can continue with the main loop without the
909 need to recurse. */
910
911 if (min == stack.currentFrame->locals.max)
912 NEXT_OPCODE;
913
914 /* If minimizing, keep testing the rest of the expression and advancing
915 the pointer while it matches the class. */
916 if (minimize) {
917 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
918 RECURSIVE_MATCH(22, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
919 if (isMatch)
920 RRETURN;
921 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject)
922 RRETURN;
923 int c = *stack.currentFrame->args.subjectPtr++;
924 if (c > 255) {
925 if (stack.currentFrame->locals.data[-1] == OP_CLASS)
926 RRETURN;
927 } else {
928 if ((stack.currentFrame->locals.data[c/8] & (1 << (c&7))) == 0)
929 RRETURN;
930 }
931 }
932 /* Control never reaches here */
933 }
934 /* If maximizing, find the longest possible run, then work backwards. */
935 else {
936 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
937
938 for (int i = min; i < stack.currentFrame->locals.max; i++) {
939 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
940 break;
941 int c = *stack.currentFrame->args.subjectPtr;
942 if (c > 255) {
943 if (stack.currentFrame->locals.data[-1] == OP_CLASS)
944 break;
945 } else {
946 if (!(stack.currentFrame->locals.data[c / 8] & (1 << (c & 7))))
947 break;
948 }
949 ++stack.currentFrame->args.subjectPtr;
950 }
951 for (;;) {
952 RECURSIVE_MATCH(24, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
953 if (isMatch)
954 RRETURN;
955 if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction)
956 break; /* Stop if tried at original pos */
957 }
958
959 RRETURN;
960 }
961 /* Control never reaches here */
962
963 /* Match an extended character class. */
964
965 BEGIN_OPCODE(XCLASS):
966 stack.currentFrame->locals.data = stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE; /* Save for matching */
967 stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1); /* Advance past the item */
968
969 switch (*stack.currentFrame->args.instructionPtr) {
970 case OP_CRSTAR:
971 case OP_CRMINSTAR:
972 case OP_CRPLUS:
973 case OP_CRMINPLUS:
974 case OP_CRQUERY:
975 case OP_CRMINQUERY:
976 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max);
977 break;
978
979 case OP_CRRANGE:
980 case OP_CRMINRANGE:
981 minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE);
982 min = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
983 stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 3);
984 if (stack.currentFrame->locals.max == 0)
985 stack.currentFrame->locals.max = INT_MAX;
986 stack.currentFrame->args.instructionPtr += 5;
987 break;
988
989 default: /* No repeat follows */
990 min = stack.currentFrame->locals.max = 1;
991 }
992
993 /* First, ensure the minimum number of matches are present. */
994
995 for (int i = 1; i <= min; i++) {
996 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
997 RRETURN_NO_MATCH;
998 int c = *stack.currentFrame->args.subjectPtr++;
999 if (!kjs_pcre_xclass(c, stack.currentFrame->locals.data))
1000 RRETURN_NO_MATCH;
1001 }
1002
1003 /* If max == min we can continue with the main loop without the
1004 need to recurse. */
1005
1006 if (min == stack.currentFrame->locals.max)
1007 NEXT_OPCODE;
1008
1009 /* If minimizing, keep testing the rest of the expression and advancing
1010 the pointer while it matches the class. */
1011
1012 if (minimize) {
1013 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
1014 RECURSIVE_MATCH(26, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1015 if (isMatch)
1016 RRETURN;
1017 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject)
1018 RRETURN;
1019 int c = *stack.currentFrame->args.subjectPtr++;
1020 if (!kjs_pcre_xclass(c, stack.currentFrame->locals.data))
1021 RRETURN;
1022 }
1023 /* Control never reaches here */
1024 }
1025
1026 /* If maximizing, find the longest possible run, then work backwards. */
1027
1028 else {
1029 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
1030 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1031 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1032 break;
1033 int c = *stack.currentFrame->args.subjectPtr;
1034 if (!kjs_pcre_xclass(c, stack.currentFrame->locals.data))
1035 break;
1036 ++stack.currentFrame->args.subjectPtr;
1037 }
1038 for(;;) {
1039 RECURSIVE_MATCH(27, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1040 if (isMatch)
1041 RRETURN;
1042 if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction)
1043 break; /* Stop if tried at original pos */
1044 }
1045 RRETURN;
1046 }
1047
1048 /* Control never reaches here */
1049
1050 /* Match a single character, casefully */
1051
1052 BEGIN_OPCODE(CHAR):
1053 stack.currentFrame->locals.length = 1;
1054 stack.currentFrame->args.instructionPtr++;
1055 getUTF8CharAndIncrementLength(stack.currentFrame->locals.fc, stack.currentFrame->args.instructionPtr, stack.currentFrame->locals.length);
1056 stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.length;
1057 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1058 RRETURN_NO_MATCH;
1059 if (stack.currentFrame->locals.fc != *stack.currentFrame->args.subjectPtr++)
1060 RRETURN_NO_MATCH;
1061 NEXT_OPCODE;
1062
1063 /* Match a single character, caselessly */
1064
1065 BEGIN_OPCODE(CHAR_IGNORING_CASE): {
1066 stack.currentFrame->locals.length = 1;
1067 stack.currentFrame->args.instructionPtr++;
1068 getUTF8CharAndIncrementLength(stack.currentFrame->locals.fc, stack.currentFrame->args.instructionPtr, stack.currentFrame->locals.length);
1069 stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.length;
1070 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1071 RRETURN_NO_MATCH;
1072 int dc = *stack.currentFrame->args.subjectPtr++;
1073 if (stack.currentFrame->locals.fc != dc && kjs_pcre_ucp_othercase(stack.currentFrame->locals.fc) != dc)
1074 RRETURN_NO_MATCH;
1075 NEXT_OPCODE;
1076 }
1077
1078 /* Match a single ASCII character. */
1079
1080 BEGIN_OPCODE(ASCII_CHAR):
1081 if (md.endSubject == stack.currentFrame->args.subjectPtr)
1082 RRETURN_NO_MATCH;
1083 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->args.instructionPtr[1])
1084 RRETURN_NO_MATCH;
1085 ++stack.currentFrame->args.subjectPtr;
1086 stack.currentFrame->args.instructionPtr += 2;
1087 NEXT_OPCODE;
1088
1089 /* Match one of two cases of an ASCII letter. */
1090
1091 BEGIN_OPCODE(ASCII_LETTER_IGNORING_CASE):
1092 if (md.endSubject == stack.currentFrame->args.subjectPtr)
1093 RRETURN_NO_MATCH;
1094 if ((*stack.currentFrame->args.subjectPtr | 0x20) != stack.currentFrame->args.instructionPtr[1])
1095 RRETURN_NO_MATCH;
1096 ++stack.currentFrame->args.subjectPtr;
1097 stack.currentFrame->args.instructionPtr += 2;
1098 NEXT_OPCODE;
1099
1100 /* Match a single character repeatedly; different opcodes share code. */
1101
1102 BEGIN_OPCODE(EXACT):
1103 min = stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
1104 minimize = false;
1105 stack.currentFrame->args.instructionPtr += 3;
1106 goto REPEATCHAR;
1107
1108 BEGIN_OPCODE(UPTO):
1109 BEGIN_OPCODE(MINUPTO):
1110 min = 0;
1111 stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
1112 minimize = *stack.currentFrame->args.instructionPtr == OP_MINUPTO;
1113 stack.currentFrame->args.instructionPtr += 3;
1114 goto REPEATCHAR;
1115
1116 BEGIN_OPCODE(STAR):
1117 BEGIN_OPCODE(MINSTAR):
1118 BEGIN_OPCODE(PLUS):
1119 BEGIN_OPCODE(MINPLUS):
1120 BEGIN_OPCODE(QUERY):
1121 BEGIN_OPCODE(MINQUERY):
1122 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_STAR, minimize, min, stack.currentFrame->locals.max);
1123
1124 /* Common code for all repeated single-character matches. We can give
1125 up quickly if there are fewer than the minimum number of characters left in
1126 the subject. */
1127
1128 REPEATCHAR:
1129
1130 stack.currentFrame->locals.length = 1;
1131 getUTF8CharAndIncrementLength(stack.currentFrame->locals.fc, stack.currentFrame->args.instructionPtr, stack.currentFrame->locals.length);
1132 if (min * (stack.currentFrame->locals.fc > 0xFFFF ? 2 : 1) > md.endSubject - stack.currentFrame->args.subjectPtr)
1133 RRETURN_NO_MATCH;
1134 stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.length;
1135
1136 if (stack.currentFrame->locals.fc <= 0xFFFF) {
1137 int othercase = md.ignoreCase ? kjs_pcre_ucp_othercase(stack.currentFrame->locals.fc) : -1;
1138
1139 for (int i = 1; i <= min; i++) {
1140 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc && *stack.currentFrame->args.subjectPtr != othercase)
1141 RRETURN_NO_MATCH;
1142 ++stack.currentFrame->args.subjectPtr;
1143 }
1144
1145 if (min == stack.currentFrame->locals.max)
1146 NEXT_OPCODE;
1147
1148 if (minimize) {
1149 stack.currentFrame->locals.repeatOthercase = othercase;
1150 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
1151 RECURSIVE_MATCH(28, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1152 if (isMatch)
1153 RRETURN;
1154 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject)
1155 RRETURN;
1156 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc && *stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.repeatOthercase)
1157 RRETURN;
1158 ++stack.currentFrame->args.subjectPtr;
1159 }
1160 /* Control never reaches here */
1161 } else {
1162 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
1163 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1164 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1165 break;
1166 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc && *stack.currentFrame->args.subjectPtr != othercase)
1167 break;
1168 ++stack.currentFrame->args.subjectPtr;
1169 }
1170 while (stack.currentFrame->args.subjectPtr >= stack.currentFrame->locals.subjectPtrAtStartOfInstruction) {
1171 RECURSIVE_MATCH(29, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1172 if (isMatch)
1173 RRETURN;
1174 --stack.currentFrame->args.subjectPtr;
1175 }
1176 RRETURN_NO_MATCH;
1177 }
1178 /* Control never reaches here */
1179 } else {
1180 /* No case on surrogate pairs, so no need to bother with "othercase". */
1181
1182 for (int i = 1; i <= min; i++) {
1183 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc)
1184 RRETURN_NO_MATCH;
1185 stack.currentFrame->args.subjectPtr += 2;
1186 }
1187
1188 if (min == stack.currentFrame->locals.max)
1189 NEXT_OPCODE;
1190
1191 if (minimize) {
1192 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
1193 RECURSIVE_MATCH(30, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1194 if (isMatch)
1195 RRETURN;
1196 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject)
1197 RRETURN;
1198 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc)
1199 RRETURN;
1200 stack.currentFrame->args.subjectPtr += 2;
1201 }
1202 /* Control never reaches here */
1203 } else {
1204 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
1205 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1206 if (stack.currentFrame->args.subjectPtr > md.endSubject - 2)
1207 break;
1208 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc)
1209 break;
1210 stack.currentFrame->args.subjectPtr += 2;
1211 }
1212 while (stack.currentFrame->args.subjectPtr >= stack.currentFrame->locals.subjectPtrAtStartOfInstruction) {
1213 RECURSIVE_MATCH(31, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1214 if (isMatch)
1215 RRETURN;
1216 stack.currentFrame->args.subjectPtr -= 2;
1217 }
1218 RRETURN_NO_MATCH;
1219 }
1220 /* Control never reaches here */
1221 }
1222 /* Control never reaches here */
1223
1224 /* Match a negated single one-byte character. */
1225
1226 BEGIN_OPCODE(NOT): {
1227 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1228 RRETURN_NO_MATCH;
1229 stack.currentFrame->args.instructionPtr++;
1230 int c = *stack.currentFrame->args.subjectPtr++;
1231 if (md.ignoreCase) {
1232 if (c < 128)
1233 c = toLowerCase(c);
1234 if (toLowerCase(*stack.currentFrame->args.instructionPtr++) == c)
1235 RRETURN_NO_MATCH;
1236 } else {
1237 if (*stack.currentFrame->args.instructionPtr++ == c)
1238 RRETURN_NO_MATCH;
1239 }
1240 NEXT_OPCODE;
1241 }
1242
1243 /* Match a negated single one-byte character repeatedly. This is almost a
1244 repeat of the code for a repeated single character, but I haven't found a
1245 nice way of commoning these up that doesn't require a test of the
1246 positive/negative option for each character match. Maybe that wouldn't add
1247 very much to the time taken, but character matching *is* what this is all
1248 about... */
1249
1250 BEGIN_OPCODE(NOTEXACT):
1251 min = stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
1252 minimize = false;
1253 stack.currentFrame->args.instructionPtr += 3;
1254 goto REPEATNOTCHAR;
1255
1256 BEGIN_OPCODE(NOTUPTO):
1257 BEGIN_OPCODE(NOTMINUPTO):
1258 min = 0;
1259 stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
1260 minimize = *stack.currentFrame->args.instructionPtr == OP_NOTMINUPTO;
1261 stack.currentFrame->args.instructionPtr += 3;
1262 goto REPEATNOTCHAR;
1263
1264 BEGIN_OPCODE(NOTSTAR):
1265 BEGIN_OPCODE(NOTMINSTAR):
1266 BEGIN_OPCODE(NOTPLUS):
1267 BEGIN_OPCODE(NOTMINPLUS):
1268 BEGIN_OPCODE(NOTQUERY):
1269 BEGIN_OPCODE(NOTMINQUERY):
1270 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_NOTSTAR, minimize, min, stack.currentFrame->locals.max);
1271
1272 /* Common code for all repeated single-byte matches. We can give up quickly
1273 if there are fewer than the minimum number of bytes left in the
1274 subject. */
1275
1276 REPEATNOTCHAR:
1277 if (min > md.endSubject - stack.currentFrame->args.subjectPtr)
1278 RRETURN_NO_MATCH;
1279 stack.currentFrame->locals.fc = *stack.currentFrame->args.instructionPtr++;
1280
1281 /* The code is duplicated for the caseless and caseful cases, for speed,
1282 since matching characters is likely to be quite common. First, ensure the
1283 minimum number of matches are present. If min = max, continue at the same
1284 level without recursing. Otherwise, if minimizing, keep trying the rest of
1285 the expression and advancing one matching character if failing, up to the
1286 maximum. Alternatively, if maximizing, find the maximum number of
1287 characters and work backwards. */
1288
1289 DPRINTF(("negative matching %c{%d,%d}\n", stack.currentFrame->locals.fc, min, stack.currentFrame->locals.max));
1290
1291 if (md.ignoreCase) {
1292 if (stack.currentFrame->locals.fc < 128)
1293 stack.currentFrame->locals.fc = toLowerCase(stack.currentFrame->locals.fc);
1294
1295 for (int i = 1; i <= min; i++) {
1296 int d = *stack.currentFrame->args.subjectPtr++;
1297 if (d < 128)
1298 d = toLowerCase(d);
1299 if (stack.currentFrame->locals.fc == d)
1300 RRETURN_NO_MATCH;
1301 }
1302
1303 if (min == stack.currentFrame->locals.max)
1304 NEXT_OPCODE;
1305
1306 if (minimize) {
1307 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
1308 RECURSIVE_MATCH(38, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1309 if (isMatch)
1310 RRETURN;
1311 int d = *stack.currentFrame->args.subjectPtr++;
1312 if (d < 128)
1313 d = toLowerCase(d);
1314 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject || stack.currentFrame->locals.fc == d)
1315 RRETURN;
1316 }
1317 /* Control never reaches here */
1318 }
1319
1320 /* Maximize case */
1321
1322 else {
1323 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
1324
1325 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1326 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1327 break;
1328 int d = *stack.currentFrame->args.subjectPtr;
1329 if (d < 128)
1330 d = toLowerCase(d);
1331 if (stack.currentFrame->locals.fc == d)
1332 break;
1333 ++stack.currentFrame->args.subjectPtr;
1334 }
1335 for (;;) {
1336 RECURSIVE_MATCH(40, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1337 if (isMatch)
1338 RRETURN;
1339 if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction)
1340 break; /* Stop if tried at original pos */
1341 }
1342
1343 RRETURN;
1344 }
1345 /* Control never reaches here */
1346 }
1347
1348 /* Caseful comparisons */
1349
1350 else {
1351 for (int i = 1; i <= min; i++) {
1352 int d = *stack.currentFrame->args.subjectPtr++;
1353 if (stack.currentFrame->locals.fc == d)
1354 RRETURN_NO_MATCH;
1355 }
1356
1357 if (min == stack.currentFrame->locals.max)
1358 NEXT_OPCODE;
1359
1360 if (minimize) {
1361 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
1362 RECURSIVE_MATCH(42, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1363 if (isMatch)
1364 RRETURN;
1365 int d = *stack.currentFrame->args.subjectPtr++;
1366 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject || stack.currentFrame->locals.fc == d)
1367 RRETURN;
1368 }
1369 /* Control never reaches here */
1370 }
1371
1372 /* Maximize case */
1373
1374 else {
1375 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
1376
1377 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1378 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1379 break;
1380 int d = *stack.currentFrame->args.subjectPtr;
1381 if (stack.currentFrame->locals.fc == d)
1382 break;
1383 ++stack.currentFrame->args.subjectPtr;
1384 }
1385 for (;;) {
1386 RECURSIVE_MATCH(44, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1387 if (isMatch)
1388 RRETURN;
1389 if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction)
1390 break; /* Stop if tried at original pos */
1391 }
1392
1393 RRETURN;
1394 }
1395 }
1396 /* Control never reaches here */
1397
1398 /* Match a single character type repeatedly; several different opcodes
1399 share code. This is very similar to the code for single characters, but we
1400 repeat it in the interests of efficiency. */
1401
1402 BEGIN_OPCODE(TYPEEXACT):
1403 min = stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
1404 minimize = true;
1405 stack.currentFrame->args.instructionPtr += 3;
1406 goto REPEATTYPE;
1407
1408 BEGIN_OPCODE(TYPEUPTO):
1409 BEGIN_OPCODE(TYPEMINUPTO):
1410 min = 0;
1411 stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
1412 minimize = *stack.currentFrame->args.instructionPtr == OP_TYPEMINUPTO;
1413 stack.currentFrame->args.instructionPtr += 3;
1414 goto REPEATTYPE;
1415
1416 BEGIN_OPCODE(TYPESTAR):
1417 BEGIN_OPCODE(TYPEMINSTAR):
1418 BEGIN_OPCODE(TYPEPLUS):
1419 BEGIN_OPCODE(TYPEMINPLUS):
1420 BEGIN_OPCODE(TYPEQUERY):
1421 BEGIN_OPCODE(TYPEMINQUERY):
1422 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_TYPESTAR, minimize, min, stack.currentFrame->locals.max);
1423
1424 /* Common code for all repeated single character type matches. Note that
1425 in UTF-8 mode, '.' matches a character of any length, but for the other
1426 character types, the valid characters are all one-byte long. */
1427
1428 REPEATTYPE:
1429 stack.currentFrame->locals.ctype = *stack.currentFrame->args.instructionPtr++; /* Code for the character type */
1430
1431 /* First, ensure the minimum number of matches are present. Use inline
1432 code for maximizing the speed, and do the type test once at the start
1433 (i.e. keep it out of the loop). Also we can test that there are at least
1434 the minimum number of characters before we start. */
1435
1436 if (min > md.endSubject - stack.currentFrame->args.subjectPtr)
1437 RRETURN_NO_MATCH;
1438 if (min > 0) {
1439 switch (stack.currentFrame->locals.ctype) {
1440 case OP_NOT_NEWLINE:
1441 for (int i = 1; i <= min; i++) {
1442 if (isNewline(*stack.currentFrame->args.subjectPtr))
1443 RRETURN_NO_MATCH;
1444 ++stack.currentFrame->args.subjectPtr;
1445 }
1446 break;
1447
1448 case OP_NOT_DIGIT:
1449 for (int i = 1; i <= min; i++) {
1450 if (isASCIIDigit(*stack.currentFrame->args.subjectPtr))
1451 RRETURN_NO_MATCH;
1452 ++stack.currentFrame->args.subjectPtr;
1453 }
1454 break;
1455
1456 case OP_DIGIT:
1457 for (int i = 1; i <= min; i++) {
1458 if (!isASCIIDigit(*stack.currentFrame->args.subjectPtr))
1459 RRETURN_NO_MATCH;
1460 ++stack.currentFrame->args.subjectPtr;
1461 }
1462 break;
1463
1464 case OP_NOT_WHITESPACE:
1465 for (int i = 1; i <= min; i++) {
1466 if (isSpaceChar(*stack.currentFrame->args.subjectPtr))
1467 RRETURN_NO_MATCH;
1468 ++stack.currentFrame->args.subjectPtr;
1469 }
1470 break;
1471
1472 case OP_WHITESPACE:
1473 for (int i = 1; i <= min; i++) {
1474 if (!isSpaceChar(*stack.currentFrame->args.subjectPtr))
1475 RRETURN_NO_MATCH;
1476 ++stack.currentFrame->args.subjectPtr;
1477 }
1478 break;
1479
1480 case OP_NOT_WORDCHAR:
1481 for (int i = 1; i <= min; i++) {
1482 if (isWordChar(*stack.currentFrame->args.subjectPtr))
1483 RRETURN_NO_MATCH;
1484 ++stack.currentFrame->args.subjectPtr;
1485 }
1486 break;
1487
1488 case OP_WORDCHAR:
1489 for (int i = 1; i <= min; i++) {
1490 if (!isWordChar(*stack.currentFrame->args.subjectPtr))
1491 RRETURN_NO_MATCH;
1492 ++stack.currentFrame->args.subjectPtr;
1493 }
1494 break;
1495
1496 default:
1497 ASSERT_NOT_REACHED();
1498 return matchError(JSRegExpErrorInternal, stack);
1499 } /* End switch(stack.currentFrame->locals.ctype) */
1500 }
1501
1502 /* If min = max, continue at the same level without recursing */
1503
1504 if (min == stack.currentFrame->locals.max)
1505 NEXT_OPCODE;
1506
1507 /* If minimizing, we have to test the rest of the pattern before each
1508 subsequent match. */
1509
1510 if (minimize) {
1511 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
1512 RECURSIVE_MATCH(48, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1513 if (isMatch)
1514 RRETURN;
1515 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject)
1516 RRETURN;
1517
1518 int c = *stack.currentFrame->args.subjectPtr++;
1519 switch (stack.currentFrame->locals.ctype) {
1520 case OP_NOT_NEWLINE:
1521 if (isNewline(c))
1522 RRETURN;
1523 break;
1524
1525 case OP_NOT_DIGIT:
1526 if (isASCIIDigit(c))
1527 RRETURN;
1528 break;
1529
1530 case OP_DIGIT:
1531 if (!isASCIIDigit(c))
1532 RRETURN;
1533 break;
1534
1535 case OP_NOT_WHITESPACE:
1536 if (isSpaceChar(c))
1537 RRETURN;
1538 break;
1539
1540 case OP_WHITESPACE:
1541 if (!isSpaceChar(c))
1542 RRETURN;
1543 break;
1544
1545 case OP_NOT_WORDCHAR:
1546 if (isWordChar(c))
1547 RRETURN;
1548 break;
1549
1550 case OP_WORDCHAR:
1551 if (!isWordChar(c))
1552 RRETURN;
1553 break;
1554
1555 default:
1556 ASSERT_NOT_REACHED();
1557 return matchError(JSRegExpErrorInternal, stack);
1558 }
1559 }
1560 /* Control never reaches here */
1561 }
1562
1563 /* If maximizing it is worth using inline code for speed, doing the type
1564 test once at the start (i.e. keep it out of the loop). */
1565
1566 else {
1567 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; /* Remember where we started */
1568
1569 switch (stack.currentFrame->locals.ctype) {
1570 case OP_NOT_NEWLINE:
1571 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1572 if (stack.currentFrame->args.subjectPtr >= md.endSubject || isNewline(*stack.currentFrame->args.subjectPtr))
1573 break;
1574 stack.currentFrame->args.subjectPtr++;
1575 }
1576 break;
1577
1578 case OP_NOT_DIGIT:
1579 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1580 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1581 break;
1582 int c = *stack.currentFrame->args.subjectPtr;
1583 if (isASCIIDigit(c))
1584 break;
1585 ++stack.currentFrame->args.subjectPtr;
1586 }
1587 break;
1588
1589 case OP_DIGIT:
1590 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1591 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1592 break;
1593 int c = *stack.currentFrame->args.subjectPtr;
1594 if (!isASCIIDigit(c))
1595 break;
1596 ++stack.currentFrame->args.subjectPtr;
1597 }
1598 break;
1599
1600 case OP_NOT_WHITESPACE:
1601 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1602 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1603 break;
1604 int c = *stack.currentFrame->args.subjectPtr;
1605 if (isSpaceChar(c))
1606 break;
1607 ++stack.currentFrame->args.subjectPtr;
1608 }
1609 break;
1610
1611 case OP_WHITESPACE:
1612 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1613 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1614 break;
1615 int c = *stack.currentFrame->args.subjectPtr;
1616 if (!isSpaceChar(c))
1617 break;
1618 ++stack.currentFrame->args.subjectPtr;
1619 }
1620 break;
1621
1622 case OP_NOT_WORDCHAR:
1623 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1624 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1625 break;
1626 int c = *stack.currentFrame->args.subjectPtr;
1627 if (isWordChar(c))
1628 break;
1629 ++stack.currentFrame->args.subjectPtr;
1630 }
1631 break;
1632
1633 case OP_WORDCHAR:
1634 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1635 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1636 break;
1637 int c = *stack.currentFrame->args.subjectPtr;
1638 if (!isWordChar(c))
1639 break;
1640 ++stack.currentFrame->args.subjectPtr;
1641 }
1642 break;
1643
1644 default:
1645 ASSERT_NOT_REACHED();
1646 return matchError(JSRegExpErrorInternal, stack);
1647 }
1648
1649 /* stack.currentFrame->args.subjectPtr is now past the end of the maximum run */
1650
1651 for (;;) {
1652 RECURSIVE_MATCH(52, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1653 if (isMatch)
1654 RRETURN;
1655 if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction)
1656 break; /* Stop if tried at original pos */
1657 }
1658
1659 /* Get here if we can't make it match with any permitted repetitions */
1660
1661 RRETURN;
1662 }
1663 /* Control never reaches here */
1664
1665 BEGIN_OPCODE(CRMINPLUS):
1666 BEGIN_OPCODE(CRMINQUERY):
1667 BEGIN_OPCODE(CRMINRANGE):
1668 BEGIN_OPCODE(CRMINSTAR):
1669 BEGIN_OPCODE(CRPLUS):
1670 BEGIN_OPCODE(CRQUERY):
1671 BEGIN_OPCODE(CRRANGE):
1672 BEGIN_OPCODE(CRSTAR):
1673 ASSERT_NOT_REACHED();
1674 return matchError(JSRegExpErrorInternal, stack);
1675
1676#ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
1677 CAPTURING_BRACKET:
1678#else
1679 default:
1680#endif
1681 /* Opening capturing bracket. If there is space in the offset vector, save
1682 the current subject position in the working slot at the top of the vector. We
1683 mustn't change the current values of the data slot, because they may be set
1684 from a previous iteration of this group, and be referred to by a reference
1685 inside the group.
1686
1687 If the bracket fails to match, we need to restore this value and also the
1688 values of the final offsets, in case they were set by a previous iteration of
1689 the same bracket.
1690
1691 If there isn't enough space in the offset vector, treat this as if it were a
1692 non-capturing bracket. Don't worry about setting the flag for the error case
1693 here; that is handled in the code for KET. */
1694
1695 ASSERT(*stack.currentFrame->args.instructionPtr > OP_BRA);
1696
1697 stack.currentFrame->locals.number = *stack.currentFrame->args.instructionPtr - OP_BRA;
1698
1699 /* For extended extraction brackets (large number), we have to fish out the
1700 number from a dummy opcode at the start. */
1701
1702 if (stack.currentFrame->locals.number > EXTRACT_BASIC_MAX)
1703 stack.currentFrame->locals.number = get2ByteValue(stack.currentFrame->args.instructionPtr + 2 + LINK_SIZE);
1704 stack.currentFrame->locals.offset = stack.currentFrame->locals.number << 1;
1705
1706#ifdef DEBUG
1707 printf("start bracket %d subject=", stack.currentFrame->locals.number);
1708 pchars(stack.currentFrame->args.subjectPtr, 16, true, md);
1709 printf("\n");
1710#endif
1711
1712 if (stack.currentFrame->locals.offset < md.offsetMax) {
1713 stack.currentFrame->locals.saveOffset1 = md.offsetVector[stack.currentFrame->locals.offset];
1714 stack.currentFrame->locals.saveOffset2 = md.offsetVector[stack.currentFrame->locals.offset + 1];
1715 stack.currentFrame->locals.saveOffset3 = md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number];
1716
1717 DPRINTF(("saving %d %d %d\n", stack.currentFrame->locals.saveOffset1, stack.currentFrame->locals.saveOffset2, stack.currentFrame->locals.saveOffset3));
1718 md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number] = stack.currentFrame->args.subjectPtr - md.startSubject;
1719
1720 do {
1721 RECURSIVE_MATCH_STARTNG_NEW_GROUP(1, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain);
1722 if (isMatch)
1723 RRETURN;
1724 stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1);
1725 } while (*stack.currentFrame->args.instructionPtr == OP_ALT);
1726
1727 DPRINTF(("bracket %d failed\n", stack.currentFrame->locals.number));
1728
1729 md.offsetVector[stack.currentFrame->locals.offset] = stack.currentFrame->locals.saveOffset1;
1730 md.offsetVector[stack.currentFrame->locals.offset + 1] = stack.currentFrame->locals.saveOffset2;
1731 md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number] = stack.currentFrame->locals.saveOffset3;
1732
1733 RRETURN;
1734 }
1735
1736 /* Insufficient room for saving captured contents */
1737
1738 goto NON_CAPTURING_BRACKET;
1739 }
1740
1741 /* Do not stick any code in here without much thought; it is assumed
1742 that "continue" in the code above comes out to here to repeat the main
1743 loop. */
1744
1745 } /* End of main loop */
1746
1747 ASSERT_NOT_REACHED();
1748
1749#ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
1750
1751RRETURN_SWITCH:
1752 switch (stack.currentFrame->returnLocation) {
1753 case 0: goto RETURN;
1754 case 1: goto RRETURN_1;
1755 case 2: goto RRETURN_2;
1756 case 6: goto RRETURN_6;
1757 case 7: goto RRETURN_7;
1758 case 14: goto RRETURN_14;
1759 case 15: goto RRETURN_15;
1760 case 16: goto RRETURN_16;
1761 case 17: goto RRETURN_17;
1762 case 18: goto RRETURN_18;
1763 case 19: goto RRETURN_19;
1764 case 20: goto RRETURN_20;
1765 case 21: goto RRETURN_21;
1766 case 22: goto RRETURN_22;
1767 case 24: goto RRETURN_24;
1768 case 26: goto RRETURN_26;
1769 case 27: goto RRETURN_27;
1770 case 28: goto RRETURN_28;
1771 case 29: goto RRETURN_29;
1772 case 30: goto RRETURN_30;
1773 case 31: goto RRETURN_31;
1774 case 38: goto RRETURN_38;
1775 case 40: goto RRETURN_40;
1776 case 42: goto RRETURN_42;
1777 case 44: goto RRETURN_44;
1778 case 48: goto RRETURN_48;
1779 case 52: goto RRETURN_52;
1780 }
1781
1782 ASSERT_NOT_REACHED();
1783 return matchError(JSRegExpErrorInternal, stack);
1784
1785#endif
1786
1787RETURN:
1788 return isMatch;
1789}
1790
1791
1792/*************************************************
1793* Execute a Regular Expression *
1794*************************************************/
1795
1796/* This function applies a compiled re to a subject string and picks out
1797portions of the string if it matches. Two elements in the vector are set for
1798each substring: the offsets to the start and end of the substring.
1799
1800Arguments:
1801 re points to the compiled expression
1802 extra_data points to extra data or is NULL
1803 subject points to the subject string
1804 length length of subject string (may contain binary zeros)
1805 start_offset where to start in the subject string
1806 options option bits
1807 offsets points to a vector of ints to be filled in with offsets
1808 offsetcount the number of elements in the vector
1809
1810Returns: > 0 => success; value is the number of elements filled in
1811 = 0 => success, but offsets is not big enough
1812 -1 => failed to match
1813 < -1 => some kind of unexpected problem
1814*/
1815
1816static void tryFirstByteOptimization(const UChar*& subjectPtr, const UChar* endSubject, int first_byte, bool first_byte_caseless, bool useMultiLineFirstCharOptimization, const UChar* originalSubjectStart)
1817{
1818 // If first_byte is set, try scanning to the first instance of that byte
1819 // no need to try and match against any earlier part of the subject string.
1820 if (first_byte >= 0) {
1821 UChar first_char = first_byte;
1822 if (first_byte_caseless)
1823 while (subjectPtr < endSubject) {
1824 int c = *subjectPtr;
1825 if (c > 127)
1826 break;
1827 if (toLowerCase(c) == first_char)
1828 break;
1829 subjectPtr++;
1830 }
1831 else {
1832 while (subjectPtr < endSubject && *subjectPtr != first_char)
1833 subjectPtr++;
1834 }
1835 } else if (useMultiLineFirstCharOptimization) {
1836 /* Or to just after \n for a multiline match if possible */
1837 // I'm not sure why this != originalSubjectStart check is necessary -- ecs 11/18/07
1838 if (subjectPtr > originalSubjectStart) {
1839 while (subjectPtr < endSubject && !isNewline(subjectPtr[-1]))
1840 subjectPtr++;
1841 }
1842 }
1843}
1844
1845static bool tryRequiredByteOptimization(const UChar*& subjectPtr, const UChar* endSubject, int req_byte, int req_byte2, bool req_byte_caseless, bool hasFirstByte, const UChar*& reqBytePtr)
1846{
1847 /* If req_byte is set, we know that that character must appear in the subject
1848 for the match to succeed. If the first character is set, req_byte must be
1849 later in the subject; otherwise the test starts at the match point. This
1850 optimization can save a huge amount of backtracking in patterns with nested
1851 unlimited repeats that aren't going to match. Writing separate code for
1852 cased/caseless versions makes it go faster, as does using an autoincrement
1853 and backing off on a match.
1854
1855 HOWEVER: when the subject string is very, very long, searching to its end can
1856 take a long time, and give bad performance on quite ordinary patterns. This
1857 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
1858 don't do this when the string is sufficiently long.
1859 */
1860
1861 if (req_byte >= 0 && endSubject - subjectPtr < REQ_BYTE_MAX) {
1862 const UChar* p = subjectPtr + (hasFirstByte ? 1 : 0);
1863
1864 /* We don't need to repeat the search if we haven't yet reached the
1865 place we found it at last time. */
1866
1867 if (p > reqBytePtr) {
1868 if (req_byte_caseless) {
1869 while (p < endSubject) {
1870 int pp = *p++;
1871 if (pp == req_byte || pp == req_byte2) {
1872 p--;
1873 break;
1874 }
1875 }
1876 } else {
1877 while (p < endSubject) {
1878 if (*p++ == req_byte) {
1879 p--;
1880 break;
1881 }
1882 }
1883 }
1884
1885 /* If we can't find the required character, break the matching loop */
1886
1887 if (p >= endSubject)
1888 return true;
1889
1890 /* If we have found the required character, save the point where we
1891 found it, so that we don't search again next time round the loop if
1892 the start hasn't passed this character yet. */
1893
1894 reqBytePtr = p;
1895 }
1896 }
1897 return false;
1898}
1899
1900int jsRegExpExecute(const JSRegExp* re,
1901 const UChar* subject, int length, int start_offset, int* offsets,
1902 int offsetcount)
1903{
1904 ASSERT(re);
1905 ASSERT(subject);
1906 ASSERT(offsetcount >= 0);
1907 ASSERT(offsets || offsetcount == 0);
1908
1909 MatchData matchBlock;
1910 matchBlock.startSubject = subject;
1911 matchBlock.endSubject = matchBlock.startSubject + length;
1912 const UChar* endSubject = matchBlock.endSubject;
1913
1914 matchBlock.multiline = (re->options & MatchAcrossMultipleLinesOption);
1915 matchBlock.ignoreCase = (re->options & IgnoreCaseOption);
1916
1917 /* If the expression has got more back references than the offsets supplied can
1918 hold, we get a temporary chunk of working store to use during the matching.
1919 Otherwise, we can use the vector supplied, rounding down its size to a multiple
1920 of 3. */
1921
1922 int ocount = offsetcount - (offsetcount % 3);
1923
1924 // FIXME: This is lame that we have to second-guess our caller here.
1925 // The API should change to either fail-hard when we don't have enough offset space
1926 // or that we shouldn't ask our callers to pre-allocate in the first place.
1927 bool using_temporary_offsets = false;
1928 if (re->top_backref > 0 && re->top_backref >= ocount/3) {
1929 ocount = re->top_backref * 3 + 3;
1930 matchBlock.offsetVector = new int[ocount];
1931 if (!matchBlock.offsetVector)
1932 return JSRegExpErrorNoMemory;
1933 using_temporary_offsets = true;
1934 } else
1935 matchBlock.offsetVector = offsets;
1936
1937 matchBlock.offsetEnd = ocount;
1938 matchBlock.offsetMax = (2*ocount)/3;
1939 matchBlock.offsetOverflow = false;
1940
1941 /* Compute the minimum number of offsets that we need to reset each time. Doing
1942 this makes a huge difference to execution time when there aren't many brackets
1943 in the pattern. */
1944
1945 int resetcount = 2 + re->top_bracket * 2;
1946 if (resetcount > offsetcount)
1947 resetcount = ocount;
1948
1949 /* Reset the working variable associated with each extraction. These should
1950 never be used unless previously set, but they get saved and restored, and so we
1951 initialize them to avoid reading uninitialized locations. */
1952
1953 if (matchBlock.offsetVector) {
1954 int* iptr = matchBlock.offsetVector + ocount;
1955 int* iend = iptr - resetcount/2 + 1;
1956 while (--iptr >= iend)
1957 *iptr = -1;
1958 }
1959
1960 /* Set up the first character to match, if available. The first_byte value is
1961 never set for an anchored regular expression, but the anchoring may be forced
1962 at run time, so we have to test for anchoring. The first char may be unset for
1963 an unanchored pattern, of course. If there's no first char and the pattern was
1964 studied, there may be a bitmap of possible first characters. */
1965
1966 bool first_byte_caseless = false;
1967 int first_byte = -1;
1968 if (re->options & UseFirstByteOptimizationOption) {
1969 first_byte = re->first_byte & 255;
1970 if ((first_byte_caseless = (re->first_byte & REQ_IGNORE_CASE)))
1971 first_byte = toLowerCase(first_byte);
1972 }
1973
1974 /* For anchored or unanchored matches, there may be a "last known required
1975 character" set. */
1976
1977 bool req_byte_caseless = false;
1978 int req_byte = -1;
1979 int req_byte2 = -1;
1980 if (re->options & UseRequiredByteOptimizationOption) {
1981 req_byte = re->req_byte & 255; // FIXME: This optimization could be made to work for UTF16 chars as well...
1982 req_byte_caseless = (re->req_byte & REQ_IGNORE_CASE);
1983 req_byte2 = flipCase(req_byte);
1984 }
1985
1986 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
1987 the loop runs just once. */
1988
1989 const UChar* startMatch = subject + start_offset;
1990 const UChar* reqBytePtr = startMatch - 1;
1991 bool useMultiLineFirstCharOptimization = re->options & UseMultiLineFirstByteOptimizationOption;
1992
1993 do {
1994 /* Reset the maximum number of extractions we might see. */
1995 if (matchBlock.offsetVector) {
1996 int* iptr = matchBlock.offsetVector;
1997 int* iend = iptr + resetcount;
1998 while (iptr < iend)
1999 *iptr++ = -1;
2000 }
2001
2002 tryFirstByteOptimization(startMatch, endSubject, first_byte, first_byte_caseless, useMultiLineFirstCharOptimization, matchBlock.startSubject + start_offset);
2003 if (tryRequiredByteOptimization(startMatch, endSubject, req_byte, req_byte2, req_byte_caseless, first_byte >= 0, reqBytePtr))
2004 break;
2005
2006 /* When a match occurs, substrings will be set for all internal extractions;
2007 we just need to set up the whole thing as substring 0 before returning. If
2008 there were too many extractions, set the return code to zero. In the case
2009 where we had to get some local store to hold offsets for backreferences, copy
2010 those back references that we can. In this case there need not be overflow
2011 if certain parts of the pattern were not used. */
2012
2013 /* The code starts after the JSRegExp block and the capture name table. */
2014 const unsigned char* start_code = (const unsigned char*)(re + 1);
2015
2016 int returnCode = match(startMatch, start_code, 2, matchBlock);
2017
2018 /* When the result is no match, advance the pointer to the next character
2019 and continue. */
2020 if (returnCode == 0) {
2021 startMatch++;
2022 continue;
2023 }
2024
2025 if (returnCode != 1) {
2026 ASSERT(returnCode == JSRegExpErrorHitLimit || returnCode == JSRegExpErrorNoMemory);
2027 DPRINTF((">>>> error: returning %d\n", rc));
2028 return returnCode;
2029 }
2030
2031 /* We have a match! Copy the offset information from temporary store if
2032 necessary */
2033
2034 if (using_temporary_offsets) {
2035 if (offsetcount >= 4) {
2036 memcpy(offsets + 2, matchBlock.offsetVector + 2, (offsetcount - 2) * sizeof(int));
2037 DPRINTF(("Copied offsets from temporary memory\n"));
2038 }
2039 if (matchBlock.endOffsetTop > offsetcount)
2040 matchBlock.offsetOverflow = true;
2041
2042 DPRINTF(("Freeing temporary memory\n"));
2043 delete [] matchBlock.offsetVector;
2044 }
2045
2046 returnCode = matchBlock.offsetOverflow ? 0 : matchBlock.endOffsetTop / 2;
2047
2048 if (offsetcount < 2)
2049 returnCode = 0;
2050 else {
2051 offsets[0] = startMatch - matchBlock.startSubject;
2052 offsets[1] = matchBlock.endMatchPtr - matchBlock.startSubject;
2053 }
2054
2055 DPRINTF((">>>> returning %d\n", rc));
2056 return returnCode;
2057 } while (startMatch <= endSubject);
2058
2059 if (using_temporary_offsets) {
2060 DPRINTF(("Freeing temporary memory\n"));
2061 delete [] matchBlock.offsetVector;
2062 }
2063
2064 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
2065 return JSRegExpErrorNoMatch;
2066}