]> git.saurik.com Git - apple/javascriptcore.git/blame - pcre/pcre_exec.cpp
JavaScriptCore-576.tar.gz
[apple/javascriptcore.git] / pcre / pcre_exec.cpp
CommitLineData
b37bf2e1
A
1/* This is JavaScriptCore's variant of the PCRE library. While this library
2started out as a copy of PCRE, many of the features of PCRE have been
3removed. This library now supports only the regular expression features
4required by the JavaScript language specification, and has only the functions
5needed by JavaScriptCore and the rest of WebKit.
6
7 Originally written by Philip Hazel
8 Copyright (c) 1997-2006 University of Cambridge
9dae56ea 9 Copyright (C) 2002, 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
b37bf2e1
A
10 Copyright (C) 2007 Eric Seidel <eric@webkit.org>
11
12-----------------------------------------------------------------------------
13Redistribution and use in source and binary forms, with or without
14modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37POSSIBILITY OF SUCH DAMAGE.
38-----------------------------------------------------------------------------
39*/
40
41/* This module contains jsRegExpExecute(), the externally visible function
42that does pattern matching using an NFA algorithm, following the rules from
43the JavaScript specification. There are also some supporting functions. */
44
45#include "config.h"
b37bf2e1
A
46#include "pcre_internal.h"
47
9dae56ea 48#include <limits.h>
b37bf2e1
A
49#include <wtf/ASCIICType.h>
50#include <wtf/Vector.h>
51
9dae56ea 52#if REGEXP_HISTOGRAM
ba379fdc 53#include <wtf/DateMath.h>
9dae56ea
A
54#include <runtime/UString.h>
55#endif
b37bf2e1
A
56
57using namespace WTF;
58
ba379fdc 59#if COMPILER(GCC)
b37bf2e1
A
60#define USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
61//#define USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
62#endif
63
64/* Avoid warnings on Windows. */
65#undef min
66#undef max
67
68#ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
69typedef int ReturnLocation;
70#else
71typedef void* ReturnLocation;
72#endif
73
9dae56ea
A
74#if !REGEXP_HISTOGRAM
75
76class HistogramTimeLogger {
77public:
78 HistogramTimeLogger(const JSRegExp*) { }
79};
80
81#else
82
83using namespace JSC;
84
85class Histogram {
86public:
87 ~Histogram();
88 void add(const JSRegExp*, double);
89
90private:
91 typedef HashMap<RefPtr<UString::Rep>, double> Map;
92 Map times;
93};
94
95class HistogramTimeLogger {
96public:
97 HistogramTimeLogger(const JSRegExp*);
98 ~HistogramTimeLogger();
99
100private:
101 const JSRegExp* m_re;
102 double m_startTime;
103};
104
105#endif
106
b37bf2e1
A
107/* Structure for building a chain of data for holding the values of
108the subject pointer at the start of each bracket, used to detect when
109an empty string has been matched by a bracket to break infinite loops. */
110struct BracketChainNode {
111 BracketChainNode* previousBracket;
112 const UChar* bracketStart;
113};
114
f9bf01c6 115struct MatchFrame : FastAllocBase {
b37bf2e1
A
116 ReturnLocation returnLocation;
117 struct MatchFrame* previousFrame;
118
119 /* Function arguments that may change */
120 struct {
121 const UChar* subjectPtr;
122 const unsigned char* instructionPtr;
123 int offsetTop;
124 BracketChainNode* bracketChain;
125 } args;
126
127
128 /* PCRE uses "fake" recursion built off of gotos, thus
129 stack-based local variables are not safe to use. Instead we have to
130 store local variables on the current MatchFrame. */
131 struct {
132 const unsigned char* data;
133 const unsigned char* startOfRepeatingBracket;
134 const UChar* subjectPtrAtStartOfInstruction; // Several instrutions stash away a subjectPtr here for later compare
135 const unsigned char* instructionPtrAtStartOfOnce;
136
137 int repeatOthercase;
138
139 int ctype;
140 int fc;
141 int fi;
142 int length;
143 int max;
144 int number;
145 int offset;
146 int saveOffset1;
147 int saveOffset2;
148 int saveOffset3;
149
150 BracketChainNode bracketChainNode;
151 } locals;
152};
153
154/* Structure for passing "static" information around between the functions
155doing traditional NFA matching, so that they are thread-safe. */
156
157struct MatchData {
158 int* offsetVector; /* Offset vector */
159 int offsetEnd; /* One past the end */
160 int offsetMax; /* The maximum usable for return data */
161 bool offsetOverflow; /* Set if too many extractions */
162 const UChar* startSubject; /* Start of the subject string */
163 const UChar* endSubject; /* End of the subject string */
164 const UChar* endMatchPtr; /* Subject position at end match */
165 int endOffsetTop; /* Highwater mark at end of match */
166 bool multiline;
167 bool ignoreCase;
168};
169
170/* The maximum remaining length of subject we are prepared to search for a
9dae56ea 171reqByte match. */
b37bf2e1
A
172
173#define REQ_BYTE_MAX 1000
174
175/* The below limit restricts the number of "recursive" match calls in order to
176avoid spending exponential time on complex regular expressions. */
177
ba379fdc 178static const unsigned matchLimit = 1000000;
b37bf2e1
A
179
180#ifdef DEBUG
181/*************************************************
182* Debugging function to print chars *
183*************************************************/
184
185/* Print a sequence of chars in printable format, stopping at the end of the
186subject if the requested.
187
188Arguments:
189 p points to characters
190 length number to print
191 isSubject true if printing from within md.startSubject
192 md pointer to matching data block, if isSubject is true
193*/
194
195static void pchars(const UChar* p, int length, bool isSubject, const MatchData& md)
196{
197 if (isSubject && length > md.endSubject - p)
198 length = md.endSubject - p;
199 while (length-- > 0) {
200 int c;
201 if (isprint(c = *(p++)))
202 printf("%c", c);
203 else if (c < 256)
204 printf("\\x%02x", c);
205 else
206 printf("\\x{%x}", c);
207 }
208}
209#endif
210
211/*************************************************
212* Match a back-reference *
213*************************************************/
214
215/* If a back reference hasn't been set, the length that is passed is greater
216than the number of characters left in the string, so the match fails.
217
218Arguments:
219 offset index into the offset vector
220 subjectPtr points into the subject
221 length length to be matched
222 md points to match data block
223
224Returns: true if matched
225*/
226
227static bool matchRef(int offset, const UChar* subjectPtr, int length, const MatchData& md)
228{
229 const UChar* p = md.startSubject + md.offsetVector[offset];
230
231#ifdef DEBUG
232 if (subjectPtr >= md.endSubject)
233 printf("matching subject <null>");
234 else {
235 printf("matching subject ");
236 pchars(subjectPtr, length, true, md);
237 }
238 printf(" against backref ");
239 pchars(p, length, false, md);
240 printf("\n");
241#endif
242
243 /* Always fail if not enough characters left */
244
245 if (length > md.endSubject - subjectPtr)
246 return false;
247
248 /* Separate the caselesss case for speed */
249
250 if (md.ignoreCase) {
251 while (length-- > 0) {
252 UChar c = *p++;
9dae56ea 253 int othercase = jsc_pcre_ucp_othercase(c);
b37bf2e1
A
254 UChar d = *subjectPtr++;
255 if (c != d && othercase != d)
256 return false;
257 }
258 }
259 else {
260 while (length-- > 0)
261 if (*p++ != *subjectPtr++)
262 return false;
263 }
264
265 return true;
266}
267
268#ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
269
270/* Use numbered labels and switch statement at the bottom of the match function. */
271
272#define RMATCH_WHERE(num) num
273#define RRETURN_LABEL RRETURN_SWITCH
274
275#else
276
277/* Use GCC's computed goto extension. */
278
279/* For one test case this is more than 40% faster than the switch statement.
280We could avoid the use of the num argument entirely by using local labels,
281but using it for the GCC case as well as the non-GCC case allows us to share
282a bit more code and notice if we use conflicting numbers.*/
283
284#define RMATCH_WHERE(num) &&RRETURN_##num
285#define RRETURN_LABEL *stack.currentFrame->returnLocation
286
287#endif
288
289#define RECURSIVE_MATCH_COMMON(num) \
290 goto RECURSE;\
291 RRETURN_##num: \
292 stack.popCurrentFrame();
293
294#define RECURSIVE_MATCH(num, ra, rb) \
295 do { \
296 stack.pushNewFrame((ra), (rb), RMATCH_WHERE(num)); \
297 RECURSIVE_MATCH_COMMON(num) \
298 } while (0)
299
9dae56ea 300#define RECURSIVE_MATCH_NEW_GROUP(num, ra, rb) \
b37bf2e1
A
301 do { \
302 stack.pushNewFrame((ra), (rb), RMATCH_WHERE(num)); \
303 startNewGroup(stack.currentFrame); \
304 RECURSIVE_MATCH_COMMON(num) \
305 } while (0)
306
307#define RRETURN goto RRETURN_LABEL
308
309#define RRETURN_NO_MATCH do { isMatch = false; RRETURN; } while (0)
310
311/*************************************************
312* Match from current position *
313*************************************************/
314
315/* On entry instructionPtr points to the first opcode, and subjectPtr to the first character
316in the subject string, while substringStart holds the value of subjectPtr at the start of the
317last bracketed group - used for breaking infinite loops matching zero-length
318strings. This function is called recursively in many circumstances. Whenever it
319returns a negative (error) response, the outer match() call must also return the
320same response.
321
322Arguments:
323 subjectPtr pointer in subject
324 instructionPtr position in code
325 offsetTop current top pointer
326 md pointer to "static" info for the match
327
328Returns: 1 if matched ) these values are >= 0
329 0 if failed to match )
330 a negative error value if aborted by an error condition
331 (e.g. stopped by repeated call or recursion limit)
332*/
333
9dae56ea 334static const unsigned numFramesOnStack = 16;
b37bf2e1
A
335
336struct MatchStack {
337 MatchStack()
9dae56ea 338 : framesEnd(frames + numFramesOnStack)
b37bf2e1
A
339 , currentFrame(frames)
340 , size(1) // match() creates accesses the first frame w/o calling pushNewFrame
341 {
9dae56ea 342 ASSERT((sizeof(frames) / sizeof(frames[0])) == numFramesOnStack);
b37bf2e1
A
343 }
344
9dae56ea 345 MatchFrame frames[numFramesOnStack];
b37bf2e1
A
346 MatchFrame* framesEnd;
347 MatchFrame* currentFrame;
348 unsigned size;
349
350 inline bool canUseStackBufferForNextFrame()
351 {
9dae56ea 352 return size < numFramesOnStack;
b37bf2e1
A
353 }
354
355 inline MatchFrame* allocateNextFrame()
356 {
357 if (canUseStackBufferForNextFrame())
358 return currentFrame + 1;
359 return new MatchFrame;
360 }
361
362 inline void pushNewFrame(const unsigned char* instructionPtr, BracketChainNode* bracketChain, ReturnLocation returnLocation)
363 {
364 MatchFrame* newframe = allocateNextFrame();
365 newframe->previousFrame = currentFrame;
366
367 newframe->args.subjectPtr = currentFrame->args.subjectPtr;
368 newframe->args.offsetTop = currentFrame->args.offsetTop;
369 newframe->args.instructionPtr = instructionPtr;
370 newframe->args.bracketChain = bracketChain;
371 newframe->returnLocation = returnLocation;
372 size++;
373
374 currentFrame = newframe;
375 }
376
377 inline void popCurrentFrame()
378 {
379 MatchFrame* oldFrame = currentFrame;
380 currentFrame = currentFrame->previousFrame;
9dae56ea 381 if (size > numFramesOnStack)
b37bf2e1
A
382 delete oldFrame;
383 size--;
384 }
385
386 void popAllFrames()
387 {
388 while (size)
389 popCurrentFrame();
390 }
391};
392
393static int matchError(int errorCode, MatchStack& stack)
394{
395 stack.popAllFrames();
396 return errorCode;
397}
398
399/* Get the next UTF-8 character, not advancing the pointer, incrementing length
400 if there are extra bytes. This is called when we know we are in UTF-8 mode. */
401
402static inline void getUTF8CharAndIncrementLength(int& c, const unsigned char* subjectPtr, int& len)
403{
404 c = *subjectPtr;
405 if ((c & 0xc0) == 0xc0) {
9dae56ea 406 int gcaa = jsc_pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */
b37bf2e1 407 int gcss = 6 * gcaa;
9dae56ea 408 c = (c & jsc_pcre_utf8_table3[gcaa]) << gcss;
b37bf2e1
A
409 for (int gcii = 1; gcii <= gcaa; gcii++) {
410 gcss -= 6;
411 c |= (subjectPtr[gcii] & 0x3f) << gcss;
412 }
413 len += gcaa;
414 }
415}
416
417static inline void startNewGroup(MatchFrame* currentFrame)
418{
419 /* At the start of a bracketed group, add the current subject pointer to the
420 stack of such pointers, to be re-instated at the end of the group when we hit
421 the closing ket. When match() is called in other circumstances, we don't add to
422 this stack. */
423
424 currentFrame->locals.bracketChainNode.previousBracket = currentFrame->args.bracketChain;
425 currentFrame->locals.bracketChainNode.bracketStart = currentFrame->args.subjectPtr;
426 currentFrame->args.bracketChain = &currentFrame->locals.bracketChainNode;
427}
428
429// FIXME: "minimize" means "not greedy", we should invert the callers to ask for "greedy" to be less confusing
430static inline void repeatInformationFromInstructionOffset(short instructionOffset, bool& minimize, int& minimumRepeats, int& maximumRepeats)
431{
432 // Instruction offsets are based off of OP_CRSTAR, OP_STAR, OP_TYPESTAR, OP_NOTSTAR
433 static const char minimumRepeatsFromInstructionOffset[] = { 0, 0, 1, 1, 0, 0 };
434 static const int maximumRepeatsFromInstructionOffset[] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, 1, 1 };
435
436 ASSERT(instructionOffset >= 0);
437 ASSERT(instructionOffset <= (OP_CRMINQUERY - OP_CRSTAR));
438
439 minimize = (instructionOffset & 1); // this assumes ordering: Instruction, MinimizeInstruction, Instruction2, MinimizeInstruction2
440 minimumRepeats = minimumRepeatsFromInstructionOffset[instructionOffset];
441 maximumRepeats = maximumRepeatsFromInstructionOffset[instructionOffset];
442}
443
444static int match(const UChar* subjectPtr, const unsigned char* instructionPtr, int offsetTop, MatchData& md)
445{
446 bool isMatch = false;
447 int min;
448 bool minimize = false; /* Initialization not really needed, but some compilers think so. */
9dae56ea 449 unsigned remainingMatchCount = matchLimit;
ba379fdc 450 int othercase; /* Declare here to avoid errors during jumps */
b37bf2e1
A
451
452 MatchStack stack;
453
454 /* The opcode jump table. */
455#ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
456#define EMIT_JUMP_TABLE_ENTRY(opcode) &&LABEL_OP_##opcode,
457 static void* opcodeJumpTable[256] = { FOR_EACH_OPCODE(EMIT_JUMP_TABLE_ENTRY) };
458#undef EMIT_JUMP_TABLE_ENTRY
459#endif
460
461 /* One-time setup of the opcode jump table. */
462#ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
463 for (int i = 255; !opcodeJumpTable[i]; i--)
464 opcodeJumpTable[i] = &&CAPTURING_BRACKET;
465#endif
466
467#ifdef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
468 // Shark shows this as a hot line
469 // Using a static const here makes this line disappear, but makes later access hotter (not sure why)
470 stack.currentFrame->returnLocation = &&RETURN;
471#else
472 stack.currentFrame->returnLocation = 0;
473#endif
474 stack.currentFrame->args.subjectPtr = subjectPtr;
475 stack.currentFrame->args.instructionPtr = instructionPtr;
476 stack.currentFrame->args.offsetTop = offsetTop;
477 stack.currentFrame->args.bracketChain = 0;
478 startNewGroup(stack.currentFrame);
479
480 /* This is where control jumps back to to effect "recursion" */
481
482RECURSE:
9dae56ea 483 if (!--remainingMatchCount)
b37bf2e1
A
484 return matchError(JSRegExpErrorHitLimit, stack);
485
486 /* Now start processing the operations. */
487
488#ifndef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
489 while (true)
490#endif
491 {
492
493#ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
494#define BEGIN_OPCODE(opcode) LABEL_OP_##opcode
495#define NEXT_OPCODE goto *opcodeJumpTable[*stack.currentFrame->args.instructionPtr]
496#else
497#define BEGIN_OPCODE(opcode) case OP_##opcode
498#define NEXT_OPCODE continue
499#endif
500
501#ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
502 NEXT_OPCODE;
503#else
504 switch (*stack.currentFrame->args.instructionPtr)
505#endif
506 {
507 /* Non-capturing bracket: optimized */
508
509 BEGIN_OPCODE(BRA):
510 NON_CAPTURING_BRACKET:
511 DPRINTF(("start bracket 0\n"));
512 do {
9dae56ea 513 RECURSIVE_MATCH_NEW_GROUP(2, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain);
b37bf2e1
A
514 if (isMatch)
515 RRETURN;
516 stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1);
517 } while (*stack.currentFrame->args.instructionPtr == OP_ALT);
518 DPRINTF(("bracket 0 failed\n"));
519 RRETURN;
520
521 /* Skip over large extraction number data if encountered. */
522
523 BEGIN_OPCODE(BRANUMBER):
524 stack.currentFrame->args.instructionPtr += 3;
525 NEXT_OPCODE;
526
527 /* End of the pattern. */
528
529 BEGIN_OPCODE(END):
530 md.endMatchPtr = stack.currentFrame->args.subjectPtr; /* Record where we ended */
531 md.endOffsetTop = stack.currentFrame->args.offsetTop; /* and how many extracts were taken */
532 isMatch = true;
533 RRETURN;
534
535 /* Assertion brackets. Check the alternative branches in turn - the
536 matching won't pass the KET for an assertion. If any one branch matches,
537 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
538 start of each branch to move the current point backwards, so the code at
539 this level is identical to the lookahead case. */
540
541 BEGIN_OPCODE(ASSERT):
542 do {
9dae56ea 543 RECURSIVE_MATCH_NEW_GROUP(6, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, NULL);
b37bf2e1
A
544 if (isMatch)
545 break;
546 stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1);
547 } while (*stack.currentFrame->args.instructionPtr == OP_ALT);
548 if (*stack.currentFrame->args.instructionPtr == OP_KET)
549 RRETURN_NO_MATCH;
550
551 /* Continue from after the assertion, updating the offsets high water
552 mark, since extracts may have been taken during the assertion. */
553
554 advanceToEndOfBracket(stack.currentFrame->args.instructionPtr);
555 stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE;
556 stack.currentFrame->args.offsetTop = md.endOffsetTop;
557 NEXT_OPCODE;
558
559 /* Negative assertion: all branches must fail to match */
560
561 BEGIN_OPCODE(ASSERT_NOT):
562 do {
9dae56ea 563 RECURSIVE_MATCH_NEW_GROUP(7, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, NULL);
b37bf2e1
A
564 if (isMatch)
565 RRETURN_NO_MATCH;
566 stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1);
567 } while (*stack.currentFrame->args.instructionPtr == OP_ALT);
568
569 stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE;
570 NEXT_OPCODE;
571
572 /* An alternation is the end of a branch; scan along to find the end of the
573 bracketed group and go to there. */
574
575 BEGIN_OPCODE(ALT):
576 advanceToEndOfBracket(stack.currentFrame->args.instructionPtr);
577 NEXT_OPCODE;
578
579 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
580 that it may occur zero times. It may repeat infinitely, or not at all -
581 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
582 repeat limits are compiled as a number of copies, with the optional ones
583 preceded by BRAZERO or BRAMINZERO. */
584
585 BEGIN_OPCODE(BRAZERO): {
586 stack.currentFrame->locals.startOfRepeatingBracket = stack.currentFrame->args.instructionPtr + 1;
9dae56ea 587 RECURSIVE_MATCH_NEW_GROUP(14, stack.currentFrame->locals.startOfRepeatingBracket, stack.currentFrame->args.bracketChain);
b37bf2e1
A
588 if (isMatch)
589 RRETURN;
590 advanceToEndOfBracket(stack.currentFrame->locals.startOfRepeatingBracket);
591 stack.currentFrame->args.instructionPtr = stack.currentFrame->locals.startOfRepeatingBracket + 1 + LINK_SIZE;
592 NEXT_OPCODE;
593 }
594
595 BEGIN_OPCODE(BRAMINZERO): {
596 stack.currentFrame->locals.startOfRepeatingBracket = stack.currentFrame->args.instructionPtr + 1;
597 advanceToEndOfBracket(stack.currentFrame->locals.startOfRepeatingBracket);
9dae56ea 598 RECURSIVE_MATCH_NEW_GROUP(15, stack.currentFrame->locals.startOfRepeatingBracket + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain);
b37bf2e1
A
599 if (isMatch)
600 RRETURN;
601 stack.currentFrame->args.instructionPtr++;
602 NEXT_OPCODE;
603 }
604
605 /* End of a group, repeated or non-repeating. If we are at the end of
606 an assertion "group", stop matching and return 1, but record the
607 current high water mark for use by positive assertions. Do this also
608 for the "once" (not-backup up) groups. */
609
610 BEGIN_OPCODE(KET):
611 BEGIN_OPCODE(KETRMIN):
612 BEGIN_OPCODE(KETRMAX):
613 stack.currentFrame->locals.instructionPtrAtStartOfOnce = stack.currentFrame->args.instructionPtr - getLinkValue(stack.currentFrame->args.instructionPtr + 1);
614 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.bracketChain->bracketStart;
615
616 /* Back up the stack of bracket start pointers. */
617
618 stack.currentFrame->args.bracketChain = stack.currentFrame->args.bracketChain->previousBracket;
619
620 if (*stack.currentFrame->locals.instructionPtrAtStartOfOnce == OP_ASSERT || *stack.currentFrame->locals.instructionPtrAtStartOfOnce == OP_ASSERT_NOT) {
621 md.endOffsetTop = stack.currentFrame->args.offsetTop;
622 isMatch = true;
623 RRETURN;
624 }
625
626 /* In all other cases except a conditional group we have to check the
627 group number back at the start and if necessary complete handling an
628 extraction by setting the offsets and bumping the high water mark. */
629
630 stack.currentFrame->locals.number = *stack.currentFrame->locals.instructionPtrAtStartOfOnce - OP_BRA;
631
632 /* For extended extraction brackets (large number), we have to fish out
633 the number from a dummy opcode at the start. */
634
635 if (stack.currentFrame->locals.number > EXTRACT_BASIC_MAX)
636 stack.currentFrame->locals.number = get2ByteValue(stack.currentFrame->locals.instructionPtrAtStartOfOnce + 2 + LINK_SIZE);
637 stack.currentFrame->locals.offset = stack.currentFrame->locals.number << 1;
638
639#ifdef DEBUG
640 printf("end bracket %d", stack.currentFrame->locals.number);
641 printf("\n");
642#endif
643
644 /* Test for a numbered group. This includes groups called as a result
645 of recursion. Note that whole-pattern recursion is coded as a recurse
646 into group 0, so it won't be picked up here. Instead, we catch it when
647 the OP_END is reached. */
648
649 if (stack.currentFrame->locals.number > 0) {
650 if (stack.currentFrame->locals.offset >= md.offsetMax)
651 md.offsetOverflow = true;
652 else {
653 md.offsetVector[stack.currentFrame->locals.offset] =
654 md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number];
655 md.offsetVector[stack.currentFrame->locals.offset+1] = stack.currentFrame->args.subjectPtr - md.startSubject;
656 if (stack.currentFrame->args.offsetTop <= stack.currentFrame->locals.offset)
657 stack.currentFrame->args.offsetTop = stack.currentFrame->locals.offset + 2;
658 }
659 }
660
661 /* For a non-repeating ket, just continue at this level. This also
662 happens for a repeating ket if no characters were matched in the group.
663 This is the forcible breaking of infinite loops as implemented in Perl
664 5.005. If there is an options reset, it will get obeyed in the normal
665 course of events. */
666
667 if (*stack.currentFrame->args.instructionPtr == OP_KET || stack.currentFrame->args.subjectPtr == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) {
668 stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE;
669 NEXT_OPCODE;
670 }
671
672 /* The repeating kets try the rest of the pattern or restart from the
673 preceding bracket, in the appropriate order. */
674
675 if (*stack.currentFrame->args.instructionPtr == OP_KETRMIN) {
676 RECURSIVE_MATCH(16, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain);
677 if (isMatch)
678 RRETURN;
9dae56ea 679 RECURSIVE_MATCH_NEW_GROUP(17, stack.currentFrame->locals.instructionPtrAtStartOfOnce, stack.currentFrame->args.bracketChain);
b37bf2e1
A
680 if (isMatch)
681 RRETURN;
682 } else { /* OP_KETRMAX */
9dae56ea 683 RECURSIVE_MATCH_NEW_GROUP(18, stack.currentFrame->locals.instructionPtrAtStartOfOnce, stack.currentFrame->args.bracketChain);
b37bf2e1
A
684 if (isMatch)
685 RRETURN;
686 RECURSIVE_MATCH(19, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain);
687 if (isMatch)
688 RRETURN;
689 }
690 RRETURN;
691
9dae56ea
A
692 /* Start of subject. */
693
b37bf2e1 694 BEGIN_OPCODE(CIRC):
9dae56ea 695 if (stack.currentFrame->args.subjectPtr != md.startSubject)
b37bf2e1
A
696 RRETURN_NO_MATCH;
697 stack.currentFrame->args.instructionPtr++;
698 NEXT_OPCODE;
9dae56ea
A
699
700 /* After internal newline if multiline. */
701
702 BEGIN_OPCODE(BOL):
703 if (stack.currentFrame->args.subjectPtr != md.startSubject && !isNewline(stack.currentFrame->args.subjectPtr[-1]))
704 RRETURN_NO_MATCH;
705 stack.currentFrame->args.instructionPtr++;
706 NEXT_OPCODE;
707
708 /* End of subject. */
709
b37bf2e1 710 BEGIN_OPCODE(DOLL):
9dae56ea
A
711 if (stack.currentFrame->args.subjectPtr < md.endSubject)
712 RRETURN_NO_MATCH;
713 stack.currentFrame->args.instructionPtr++;
714 NEXT_OPCODE;
715
716 /* Before internal newline if multiline. */
717
718 BEGIN_OPCODE(EOL):
719 if (stack.currentFrame->args.subjectPtr < md.endSubject && !isNewline(*stack.currentFrame->args.subjectPtr))
b37bf2e1
A
720 RRETURN_NO_MATCH;
721 stack.currentFrame->args.instructionPtr++;
722 NEXT_OPCODE;
723
724 /* Word boundary assertions */
725
726 BEGIN_OPCODE(NOT_WORD_BOUNDARY):
727 BEGIN_OPCODE(WORD_BOUNDARY): {
728 bool currentCharIsWordChar = false;
729 bool previousCharIsWordChar = false;
730
731 if (stack.currentFrame->args.subjectPtr > md.startSubject)
732 previousCharIsWordChar = isWordChar(stack.currentFrame->args.subjectPtr[-1]);
733 if (stack.currentFrame->args.subjectPtr < md.endSubject)
734 currentCharIsWordChar = isWordChar(*stack.currentFrame->args.subjectPtr);
735
736 /* Now see if the situation is what we want */
737 bool wordBoundaryDesired = (*stack.currentFrame->args.instructionPtr++ == OP_WORD_BOUNDARY);
738 if (wordBoundaryDesired ? currentCharIsWordChar == previousCharIsWordChar : currentCharIsWordChar != previousCharIsWordChar)
739 RRETURN_NO_MATCH;
740 NEXT_OPCODE;
741 }
742
743 /* Match a single character type; inline for speed */
744
745 BEGIN_OPCODE(NOT_NEWLINE):
746 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
747 RRETURN_NO_MATCH;
748 if (isNewline(*stack.currentFrame->args.subjectPtr++))
749 RRETURN_NO_MATCH;
750 stack.currentFrame->args.instructionPtr++;
751 NEXT_OPCODE;
752
753 BEGIN_OPCODE(NOT_DIGIT):
754 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
755 RRETURN_NO_MATCH;
756 if (isASCIIDigit(*stack.currentFrame->args.subjectPtr++))
757 RRETURN_NO_MATCH;
758 stack.currentFrame->args.instructionPtr++;
759 NEXT_OPCODE;
760
761 BEGIN_OPCODE(DIGIT):
762 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
763 RRETURN_NO_MATCH;
764 if (!isASCIIDigit(*stack.currentFrame->args.subjectPtr++))
765 RRETURN_NO_MATCH;
766 stack.currentFrame->args.instructionPtr++;
767 NEXT_OPCODE;
768
769 BEGIN_OPCODE(NOT_WHITESPACE):
770 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
771 RRETURN_NO_MATCH;
772 if (isSpaceChar(*stack.currentFrame->args.subjectPtr++))
773 RRETURN_NO_MATCH;
774 stack.currentFrame->args.instructionPtr++;
775 NEXT_OPCODE;
776
777 BEGIN_OPCODE(WHITESPACE):
778 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
779 RRETURN_NO_MATCH;
780 if (!isSpaceChar(*stack.currentFrame->args.subjectPtr++))
781 RRETURN_NO_MATCH;
782 stack.currentFrame->args.instructionPtr++;
783 NEXT_OPCODE;
784
785 BEGIN_OPCODE(NOT_WORDCHAR):
786 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
787 RRETURN_NO_MATCH;
788 if (isWordChar(*stack.currentFrame->args.subjectPtr++))
789 RRETURN_NO_MATCH;
790 stack.currentFrame->args.instructionPtr++;
791 NEXT_OPCODE;
792
793 BEGIN_OPCODE(WORDCHAR):
794 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
795 RRETURN_NO_MATCH;
796 if (!isWordChar(*stack.currentFrame->args.subjectPtr++))
797 RRETURN_NO_MATCH;
798 stack.currentFrame->args.instructionPtr++;
799 NEXT_OPCODE;
800
801 /* Match a back reference, possibly repeatedly. Look past the end of the
802 item to see if there is repeat information following. The code is similar
803 to that for character classes, but repeated for efficiency. Then obey
804 similar code to character type repeats - written out again for speed.
805 However, if the referenced string is the empty string, always treat
806 it as matched, any number of times (otherwise there could be infinite
807 loops). */
808
809 BEGIN_OPCODE(REF):
810 stack.currentFrame->locals.offset = get2ByteValue(stack.currentFrame->args.instructionPtr + 1) << 1; /* Doubled ref number */
811 stack.currentFrame->args.instructionPtr += 3; /* Advance past item */
812
813 /* If the reference is unset, set the length to be longer than the amount
814 of subject left; this ensures that every attempt at a match fails. We
815 can't just fail here, because of the possibility of quantifiers with zero
816 minima. */
817
818 if (stack.currentFrame->locals.offset >= stack.currentFrame->args.offsetTop || md.offsetVector[stack.currentFrame->locals.offset] < 0)
819 stack.currentFrame->locals.length = 0;
820 else
821 stack.currentFrame->locals.length = md.offsetVector[stack.currentFrame->locals.offset+1] - md.offsetVector[stack.currentFrame->locals.offset];
822
823 /* Set up for repetition, or handle the non-repeated case */
824
825 switch (*stack.currentFrame->args.instructionPtr) {
826 case OP_CRSTAR:
827 case OP_CRMINSTAR:
828 case OP_CRPLUS:
829 case OP_CRMINPLUS:
830 case OP_CRQUERY:
831 case OP_CRMINQUERY:
832 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max);
833 break;
834
835 case OP_CRRANGE:
836 case OP_CRMINRANGE:
837 minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE);
838 min = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
839 stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 3);
840 if (stack.currentFrame->locals.max == 0)
841 stack.currentFrame->locals.max = INT_MAX;
842 stack.currentFrame->args.instructionPtr += 5;
843 break;
844
845 default: /* No repeat follows */
846 if (!matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md))
847 RRETURN_NO_MATCH;
848 stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length;
849 NEXT_OPCODE;
850 }
851
852 /* If the length of the reference is zero, just continue with the
853 main loop. */
854
855 if (stack.currentFrame->locals.length == 0)
856 NEXT_OPCODE;
857
858 /* First, ensure the minimum number of matches are present. */
859
860 for (int i = 1; i <= min; i++) {
861 if (!matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md))
862 RRETURN_NO_MATCH;
863 stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length;
864 }
865
866 /* If min = max, continue at the same level without recursion.
867 They are not both allowed to be zero. */
868
869 if (min == stack.currentFrame->locals.max)
870 NEXT_OPCODE;
871
872 /* If minimizing, keep trying and advancing the pointer */
873
874 if (minimize) {
875 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
876 RECURSIVE_MATCH(20, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
877 if (isMatch)
878 RRETURN;
879 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || !matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md))
880 RRETURN;
881 stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length;
882 }
883 /* Control never reaches here */
884 }
885
886 /* If maximizing, find the longest string and work backwards */
887
888 else {
889 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
890 for (int i = min; i < stack.currentFrame->locals.max; i++) {
891 if (!matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md))
892 break;
893 stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length;
894 }
895 while (stack.currentFrame->args.subjectPtr >= stack.currentFrame->locals.subjectPtrAtStartOfInstruction) {
896 RECURSIVE_MATCH(21, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
897 if (isMatch)
898 RRETURN;
899 stack.currentFrame->args.subjectPtr -= stack.currentFrame->locals.length;
900 }
901 RRETURN_NO_MATCH;
902 }
903 /* Control never reaches here */
904
905 /* Match a bit-mapped character class, possibly repeatedly. This op code is
906 used when all the characters in the class have values in the range 0-255,
907 and either the matching is caseful, or the characters are in the range
908 0-127 when UTF-8 processing is enabled. The only difference between
909 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
910 encountered.
911
912 First, look past the end of the item to see if there is repeat information
913 following. Then obey similar code to character type repeats - written out
914 again for speed. */
915
916 BEGIN_OPCODE(NCLASS):
917 BEGIN_OPCODE(CLASS):
918 stack.currentFrame->locals.data = stack.currentFrame->args.instructionPtr + 1; /* Save for matching */
919 stack.currentFrame->args.instructionPtr += 33; /* Advance past the item */
920
921 switch (*stack.currentFrame->args.instructionPtr) {
922 case OP_CRSTAR:
923 case OP_CRMINSTAR:
924 case OP_CRPLUS:
925 case OP_CRMINPLUS:
926 case OP_CRQUERY:
927 case OP_CRMINQUERY:
928 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max);
929 break;
930
931 case OP_CRRANGE:
932 case OP_CRMINRANGE:
933 minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE);
934 min = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
935 stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 3);
936 if (stack.currentFrame->locals.max == 0)
937 stack.currentFrame->locals.max = INT_MAX;
938 stack.currentFrame->args.instructionPtr += 5;
939 break;
940
941 default: /* No repeat follows */
942 min = stack.currentFrame->locals.max = 1;
943 break;
944 }
945
946 /* First, ensure the minimum number of matches are present. */
947
948 for (int i = 1; i <= min; i++) {
949 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
950 RRETURN_NO_MATCH;
951 int c = *stack.currentFrame->args.subjectPtr++;
952 if (c > 255) {
953 if (stack.currentFrame->locals.data[-1] == OP_CLASS)
954 RRETURN_NO_MATCH;
955 } else {
956 if (!(stack.currentFrame->locals.data[c / 8] & (1 << (c & 7))))
957 RRETURN_NO_MATCH;
958 }
959 }
960
961 /* If max == min we can continue with the main loop without the
962 need to recurse. */
963
964 if (min == stack.currentFrame->locals.max)
965 NEXT_OPCODE;
966
967 /* If minimizing, keep testing the rest of the expression and advancing
968 the pointer while it matches the class. */
969 if (minimize) {
970 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
971 RECURSIVE_MATCH(22, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
972 if (isMatch)
973 RRETURN;
974 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject)
975 RRETURN;
976 int c = *stack.currentFrame->args.subjectPtr++;
977 if (c > 255) {
978 if (stack.currentFrame->locals.data[-1] == OP_CLASS)
979 RRETURN;
980 } else {
981 if ((stack.currentFrame->locals.data[c/8] & (1 << (c&7))) == 0)
982 RRETURN;
983 }
984 }
985 /* Control never reaches here */
986 }
987 /* If maximizing, find the longest possible run, then work backwards. */
988 else {
989 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
990
991 for (int i = min; i < stack.currentFrame->locals.max; i++) {
992 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
993 break;
994 int c = *stack.currentFrame->args.subjectPtr;
995 if (c > 255) {
996 if (stack.currentFrame->locals.data[-1] == OP_CLASS)
997 break;
998 } else {
999 if (!(stack.currentFrame->locals.data[c / 8] & (1 << (c & 7))))
1000 break;
1001 }
1002 ++stack.currentFrame->args.subjectPtr;
1003 }
1004 for (;;) {
1005 RECURSIVE_MATCH(24, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1006 if (isMatch)
1007 RRETURN;
1008 if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction)
1009 break; /* Stop if tried at original pos */
1010 }
1011
1012 RRETURN;
1013 }
1014 /* Control never reaches here */
1015
1016 /* Match an extended character class. */
1017
1018 BEGIN_OPCODE(XCLASS):
1019 stack.currentFrame->locals.data = stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE; /* Save for matching */
1020 stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1); /* Advance past the item */
1021
1022 switch (*stack.currentFrame->args.instructionPtr) {
1023 case OP_CRSTAR:
1024 case OP_CRMINSTAR:
1025 case OP_CRPLUS:
1026 case OP_CRMINPLUS:
1027 case OP_CRQUERY:
1028 case OP_CRMINQUERY:
1029 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max);
1030 break;
1031
1032 case OP_CRRANGE:
1033 case OP_CRMINRANGE:
1034 minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE);
1035 min = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
1036 stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 3);
1037 if (stack.currentFrame->locals.max == 0)
1038 stack.currentFrame->locals.max = INT_MAX;
1039 stack.currentFrame->args.instructionPtr += 5;
1040 break;
1041
1042 default: /* No repeat follows */
1043 min = stack.currentFrame->locals.max = 1;
1044 }
1045
1046 /* First, ensure the minimum number of matches are present. */
1047
1048 for (int i = 1; i <= min; i++) {
1049 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1050 RRETURN_NO_MATCH;
1051 int c = *stack.currentFrame->args.subjectPtr++;
9dae56ea 1052 if (!jsc_pcre_xclass(c, stack.currentFrame->locals.data))
b37bf2e1
A
1053 RRETURN_NO_MATCH;
1054 }
1055
1056 /* If max == min we can continue with the main loop without the
1057 need to recurse. */
1058
1059 if (min == stack.currentFrame->locals.max)
1060 NEXT_OPCODE;
1061
1062 /* If minimizing, keep testing the rest of the expression and advancing
1063 the pointer while it matches the class. */
1064
1065 if (minimize) {
1066 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
1067 RECURSIVE_MATCH(26, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1068 if (isMatch)
1069 RRETURN;
1070 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject)
1071 RRETURN;
1072 int c = *stack.currentFrame->args.subjectPtr++;
9dae56ea 1073 if (!jsc_pcre_xclass(c, stack.currentFrame->locals.data))
b37bf2e1
A
1074 RRETURN;
1075 }
1076 /* Control never reaches here */
1077 }
1078
1079 /* If maximizing, find the longest possible run, then work backwards. */
1080
1081 else {
1082 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
1083 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1084 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1085 break;
1086 int c = *stack.currentFrame->args.subjectPtr;
9dae56ea 1087 if (!jsc_pcre_xclass(c, stack.currentFrame->locals.data))
b37bf2e1
A
1088 break;
1089 ++stack.currentFrame->args.subjectPtr;
1090 }
1091 for(;;) {
1092 RECURSIVE_MATCH(27, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1093 if (isMatch)
1094 RRETURN;
1095 if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction)
1096 break; /* Stop if tried at original pos */
1097 }
1098 RRETURN;
1099 }
1100
1101 /* Control never reaches here */
1102
1103 /* Match a single character, casefully */
1104
1105 BEGIN_OPCODE(CHAR):
1106 stack.currentFrame->locals.length = 1;
1107 stack.currentFrame->args.instructionPtr++;
1108 getUTF8CharAndIncrementLength(stack.currentFrame->locals.fc, stack.currentFrame->args.instructionPtr, stack.currentFrame->locals.length);
1109 stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.length;
1110 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1111 RRETURN_NO_MATCH;
1112 if (stack.currentFrame->locals.fc != *stack.currentFrame->args.subjectPtr++)
1113 RRETURN_NO_MATCH;
1114 NEXT_OPCODE;
1115
1116 /* Match a single character, caselessly */
1117
1118 BEGIN_OPCODE(CHAR_IGNORING_CASE): {
1119 stack.currentFrame->locals.length = 1;
1120 stack.currentFrame->args.instructionPtr++;
1121 getUTF8CharAndIncrementLength(stack.currentFrame->locals.fc, stack.currentFrame->args.instructionPtr, stack.currentFrame->locals.length);
1122 stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.length;
1123 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1124 RRETURN_NO_MATCH;
1125 int dc = *stack.currentFrame->args.subjectPtr++;
9dae56ea 1126 if (stack.currentFrame->locals.fc != dc && jsc_pcre_ucp_othercase(stack.currentFrame->locals.fc) != dc)
b37bf2e1
A
1127 RRETURN_NO_MATCH;
1128 NEXT_OPCODE;
1129 }
1130
1131 /* Match a single ASCII character. */
1132
1133 BEGIN_OPCODE(ASCII_CHAR):
1134 if (md.endSubject == stack.currentFrame->args.subjectPtr)
1135 RRETURN_NO_MATCH;
1136 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->args.instructionPtr[1])
1137 RRETURN_NO_MATCH;
1138 ++stack.currentFrame->args.subjectPtr;
1139 stack.currentFrame->args.instructionPtr += 2;
1140 NEXT_OPCODE;
1141
1142 /* Match one of two cases of an ASCII letter. */
1143
1144 BEGIN_OPCODE(ASCII_LETTER_IGNORING_CASE):
1145 if (md.endSubject == stack.currentFrame->args.subjectPtr)
1146 RRETURN_NO_MATCH;
1147 if ((*stack.currentFrame->args.subjectPtr | 0x20) != stack.currentFrame->args.instructionPtr[1])
1148 RRETURN_NO_MATCH;
1149 ++stack.currentFrame->args.subjectPtr;
1150 stack.currentFrame->args.instructionPtr += 2;
1151 NEXT_OPCODE;
1152
1153 /* Match a single character repeatedly; different opcodes share code. */
1154
1155 BEGIN_OPCODE(EXACT):
1156 min = stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
1157 minimize = false;
1158 stack.currentFrame->args.instructionPtr += 3;
1159 goto REPEATCHAR;
1160
1161 BEGIN_OPCODE(UPTO):
1162 BEGIN_OPCODE(MINUPTO):
1163 min = 0;
1164 stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
1165 minimize = *stack.currentFrame->args.instructionPtr == OP_MINUPTO;
1166 stack.currentFrame->args.instructionPtr += 3;
1167 goto REPEATCHAR;
1168
1169 BEGIN_OPCODE(STAR):
1170 BEGIN_OPCODE(MINSTAR):
1171 BEGIN_OPCODE(PLUS):
1172 BEGIN_OPCODE(MINPLUS):
1173 BEGIN_OPCODE(QUERY):
1174 BEGIN_OPCODE(MINQUERY):
1175 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_STAR, minimize, min, stack.currentFrame->locals.max);
1176
1177 /* Common code for all repeated single-character matches. We can give
1178 up quickly if there are fewer than the minimum number of characters left in
1179 the subject. */
1180
1181 REPEATCHAR:
1182
1183 stack.currentFrame->locals.length = 1;
1184 getUTF8CharAndIncrementLength(stack.currentFrame->locals.fc, stack.currentFrame->args.instructionPtr, stack.currentFrame->locals.length);
1185 if (min * (stack.currentFrame->locals.fc > 0xFFFF ? 2 : 1) > md.endSubject - stack.currentFrame->args.subjectPtr)
1186 RRETURN_NO_MATCH;
1187 stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.length;
1188
1189 if (stack.currentFrame->locals.fc <= 0xFFFF) {
ba379fdc 1190 othercase = md.ignoreCase ? jsc_pcre_ucp_othercase(stack.currentFrame->locals.fc) : -1;
b37bf2e1
A
1191
1192 for (int i = 1; i <= min; i++) {
1193 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc && *stack.currentFrame->args.subjectPtr != othercase)
1194 RRETURN_NO_MATCH;
1195 ++stack.currentFrame->args.subjectPtr;
1196 }
1197
1198 if (min == stack.currentFrame->locals.max)
1199 NEXT_OPCODE;
1200
1201 if (minimize) {
1202 stack.currentFrame->locals.repeatOthercase = othercase;
1203 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
1204 RECURSIVE_MATCH(28, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1205 if (isMatch)
1206 RRETURN;
1207 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject)
1208 RRETURN;
1209 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc && *stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.repeatOthercase)
1210 RRETURN;
1211 ++stack.currentFrame->args.subjectPtr;
1212 }
1213 /* Control never reaches here */
1214 } else {
1215 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
1216 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1217 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1218 break;
1219 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc && *stack.currentFrame->args.subjectPtr != othercase)
1220 break;
1221 ++stack.currentFrame->args.subjectPtr;
1222 }
1223 while (stack.currentFrame->args.subjectPtr >= stack.currentFrame->locals.subjectPtrAtStartOfInstruction) {
1224 RECURSIVE_MATCH(29, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1225 if (isMatch)
1226 RRETURN;
1227 --stack.currentFrame->args.subjectPtr;
1228 }
1229 RRETURN_NO_MATCH;
1230 }
1231 /* Control never reaches here */
1232 } else {
1233 /* No case on surrogate pairs, so no need to bother with "othercase". */
1234
1235 for (int i = 1; i <= min; i++) {
1236 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc)
1237 RRETURN_NO_MATCH;
1238 stack.currentFrame->args.subjectPtr += 2;
1239 }
1240
1241 if (min == stack.currentFrame->locals.max)
1242 NEXT_OPCODE;
1243
1244 if (minimize) {
1245 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
1246 RECURSIVE_MATCH(30, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1247 if (isMatch)
1248 RRETURN;
1249 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject)
1250 RRETURN;
1251 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc)
1252 RRETURN;
1253 stack.currentFrame->args.subjectPtr += 2;
1254 }
1255 /* Control never reaches here */
1256 } else {
1257 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
1258 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1259 if (stack.currentFrame->args.subjectPtr > md.endSubject - 2)
1260 break;
1261 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc)
1262 break;
1263 stack.currentFrame->args.subjectPtr += 2;
1264 }
1265 while (stack.currentFrame->args.subjectPtr >= stack.currentFrame->locals.subjectPtrAtStartOfInstruction) {
1266 RECURSIVE_MATCH(31, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1267 if (isMatch)
1268 RRETURN;
1269 stack.currentFrame->args.subjectPtr -= 2;
1270 }
1271 RRETURN_NO_MATCH;
1272 }
1273 /* Control never reaches here */
1274 }
1275 /* Control never reaches here */
1276
1277 /* Match a negated single one-byte character. */
1278
1279 BEGIN_OPCODE(NOT): {
1280 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1281 RRETURN_NO_MATCH;
9dae56ea 1282 int b = stack.currentFrame->args.instructionPtr[1];
b37bf2e1 1283 int c = *stack.currentFrame->args.subjectPtr++;
9dae56ea 1284 stack.currentFrame->args.instructionPtr += 2;
b37bf2e1
A
1285 if (md.ignoreCase) {
1286 if (c < 128)
1287 c = toLowerCase(c);
9dae56ea 1288 if (toLowerCase(b) == c)
b37bf2e1
A
1289 RRETURN_NO_MATCH;
1290 } else {
9dae56ea 1291 if (b == c)
b37bf2e1
A
1292 RRETURN_NO_MATCH;
1293 }
1294 NEXT_OPCODE;
1295 }
1296
1297 /* Match a negated single one-byte character repeatedly. This is almost a
1298 repeat of the code for a repeated single character, but I haven't found a
1299 nice way of commoning these up that doesn't require a test of the
1300 positive/negative option for each character match. Maybe that wouldn't add
1301 very much to the time taken, but character matching *is* what this is all
1302 about... */
1303
1304 BEGIN_OPCODE(NOTEXACT):
1305 min = stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
1306 minimize = false;
1307 stack.currentFrame->args.instructionPtr += 3;
1308 goto REPEATNOTCHAR;
1309
1310 BEGIN_OPCODE(NOTUPTO):
1311 BEGIN_OPCODE(NOTMINUPTO):
1312 min = 0;
1313 stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
1314 minimize = *stack.currentFrame->args.instructionPtr == OP_NOTMINUPTO;
1315 stack.currentFrame->args.instructionPtr += 3;
1316 goto REPEATNOTCHAR;
1317
1318 BEGIN_OPCODE(NOTSTAR):
1319 BEGIN_OPCODE(NOTMINSTAR):
1320 BEGIN_OPCODE(NOTPLUS):
1321 BEGIN_OPCODE(NOTMINPLUS):
1322 BEGIN_OPCODE(NOTQUERY):
1323 BEGIN_OPCODE(NOTMINQUERY):
1324 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_NOTSTAR, minimize, min, stack.currentFrame->locals.max);
1325
1326 /* Common code for all repeated single-byte matches. We can give up quickly
1327 if there are fewer than the minimum number of bytes left in the
1328 subject. */
1329
1330 REPEATNOTCHAR:
1331 if (min > md.endSubject - stack.currentFrame->args.subjectPtr)
1332 RRETURN_NO_MATCH;
1333 stack.currentFrame->locals.fc = *stack.currentFrame->args.instructionPtr++;
1334
1335 /* The code is duplicated for the caseless and caseful cases, for speed,
1336 since matching characters is likely to be quite common. First, ensure the
1337 minimum number of matches are present. If min = max, continue at the same
1338 level without recursing. Otherwise, if minimizing, keep trying the rest of
1339 the expression and advancing one matching character if failing, up to the
1340 maximum. Alternatively, if maximizing, find the maximum number of
1341 characters and work backwards. */
1342
1343 DPRINTF(("negative matching %c{%d,%d}\n", stack.currentFrame->locals.fc, min, stack.currentFrame->locals.max));
1344
1345 if (md.ignoreCase) {
1346 if (stack.currentFrame->locals.fc < 128)
1347 stack.currentFrame->locals.fc = toLowerCase(stack.currentFrame->locals.fc);
1348
1349 for (int i = 1; i <= min; i++) {
1350 int d = *stack.currentFrame->args.subjectPtr++;
1351 if (d < 128)
1352 d = toLowerCase(d);
1353 if (stack.currentFrame->locals.fc == d)
1354 RRETURN_NO_MATCH;
1355 }
1356
1357 if (min == stack.currentFrame->locals.max)
1358 NEXT_OPCODE;
1359
1360 if (minimize) {
1361 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
1362 RECURSIVE_MATCH(38, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1363 if (isMatch)
1364 RRETURN;
1365 int d = *stack.currentFrame->args.subjectPtr++;
1366 if (d < 128)
1367 d = toLowerCase(d);
1368 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject || stack.currentFrame->locals.fc == d)
1369 RRETURN;
1370 }
1371 /* Control never reaches here */
1372 }
1373
1374 /* Maximize case */
1375
1376 else {
1377 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
1378
1379 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1380 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1381 break;
1382 int d = *stack.currentFrame->args.subjectPtr;
1383 if (d < 128)
1384 d = toLowerCase(d);
1385 if (stack.currentFrame->locals.fc == d)
1386 break;
1387 ++stack.currentFrame->args.subjectPtr;
1388 }
1389 for (;;) {
1390 RECURSIVE_MATCH(40, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1391 if (isMatch)
1392 RRETURN;
1393 if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction)
1394 break; /* Stop if tried at original pos */
1395 }
1396
1397 RRETURN;
1398 }
1399 /* Control never reaches here */
1400 }
1401
1402 /* Caseful comparisons */
1403
1404 else {
1405 for (int i = 1; i <= min; i++) {
1406 int d = *stack.currentFrame->args.subjectPtr++;
1407 if (stack.currentFrame->locals.fc == d)
1408 RRETURN_NO_MATCH;
1409 }
1410
1411 if (min == stack.currentFrame->locals.max)
1412 NEXT_OPCODE;
1413
1414 if (minimize) {
1415 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
1416 RECURSIVE_MATCH(42, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1417 if (isMatch)
1418 RRETURN;
1419 int d = *stack.currentFrame->args.subjectPtr++;
1420 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject || stack.currentFrame->locals.fc == d)
1421 RRETURN;
1422 }
1423 /* Control never reaches here */
1424 }
1425
1426 /* Maximize case */
1427
1428 else {
1429 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
1430
1431 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1432 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1433 break;
1434 int d = *stack.currentFrame->args.subjectPtr;
1435 if (stack.currentFrame->locals.fc == d)
1436 break;
1437 ++stack.currentFrame->args.subjectPtr;
1438 }
1439 for (;;) {
1440 RECURSIVE_MATCH(44, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1441 if (isMatch)
1442 RRETURN;
1443 if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction)
1444 break; /* Stop if tried at original pos */
1445 }
1446
1447 RRETURN;
1448 }
1449 }
1450 /* Control never reaches here */
1451
1452 /* Match a single character type repeatedly; several different opcodes
1453 share code. This is very similar to the code for single characters, but we
1454 repeat it in the interests of efficiency. */
1455
1456 BEGIN_OPCODE(TYPEEXACT):
1457 min = stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
1458 minimize = true;
1459 stack.currentFrame->args.instructionPtr += 3;
1460 goto REPEATTYPE;
1461
1462 BEGIN_OPCODE(TYPEUPTO):
1463 BEGIN_OPCODE(TYPEMINUPTO):
1464 min = 0;
1465 stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
1466 minimize = *stack.currentFrame->args.instructionPtr == OP_TYPEMINUPTO;
1467 stack.currentFrame->args.instructionPtr += 3;
1468 goto REPEATTYPE;
1469
1470 BEGIN_OPCODE(TYPESTAR):
1471 BEGIN_OPCODE(TYPEMINSTAR):
1472 BEGIN_OPCODE(TYPEPLUS):
1473 BEGIN_OPCODE(TYPEMINPLUS):
1474 BEGIN_OPCODE(TYPEQUERY):
1475 BEGIN_OPCODE(TYPEMINQUERY):
1476 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_TYPESTAR, minimize, min, stack.currentFrame->locals.max);
1477
1478 /* Common code for all repeated single character type matches. Note that
1479 in UTF-8 mode, '.' matches a character of any length, but for the other
1480 character types, the valid characters are all one-byte long. */
1481
1482 REPEATTYPE:
1483 stack.currentFrame->locals.ctype = *stack.currentFrame->args.instructionPtr++; /* Code for the character type */
1484
1485 /* First, ensure the minimum number of matches are present. Use inline
1486 code for maximizing the speed, and do the type test once at the start
1487 (i.e. keep it out of the loop). Also we can test that there are at least
1488 the minimum number of characters before we start. */
1489
1490 if (min > md.endSubject - stack.currentFrame->args.subjectPtr)
1491 RRETURN_NO_MATCH;
1492 if (min > 0) {
1493 switch (stack.currentFrame->locals.ctype) {
1494 case OP_NOT_NEWLINE:
1495 for (int i = 1; i <= min; i++) {
1496 if (isNewline(*stack.currentFrame->args.subjectPtr))
1497 RRETURN_NO_MATCH;
1498 ++stack.currentFrame->args.subjectPtr;
1499 }
1500 break;
1501
1502 case OP_NOT_DIGIT:
1503 for (int i = 1; i <= min; i++) {
1504 if (isASCIIDigit(*stack.currentFrame->args.subjectPtr))
1505 RRETURN_NO_MATCH;
1506 ++stack.currentFrame->args.subjectPtr;
1507 }
1508 break;
1509
1510 case OP_DIGIT:
1511 for (int i = 1; i <= min; i++) {
1512 if (!isASCIIDigit(*stack.currentFrame->args.subjectPtr))
1513 RRETURN_NO_MATCH;
1514 ++stack.currentFrame->args.subjectPtr;
1515 }
1516 break;
1517
1518 case OP_NOT_WHITESPACE:
1519 for (int i = 1; i <= min; i++) {
1520 if (isSpaceChar(*stack.currentFrame->args.subjectPtr))
1521 RRETURN_NO_MATCH;
1522 ++stack.currentFrame->args.subjectPtr;
1523 }
1524 break;
1525
1526 case OP_WHITESPACE:
1527 for (int i = 1; i <= min; i++) {
1528 if (!isSpaceChar(*stack.currentFrame->args.subjectPtr))
1529 RRETURN_NO_MATCH;
1530 ++stack.currentFrame->args.subjectPtr;
1531 }
1532 break;
1533
1534 case OP_NOT_WORDCHAR:
1535 for (int i = 1; i <= min; i++) {
1536 if (isWordChar(*stack.currentFrame->args.subjectPtr))
1537 RRETURN_NO_MATCH;
1538 ++stack.currentFrame->args.subjectPtr;
1539 }
1540 break;
1541
1542 case OP_WORDCHAR:
1543 for (int i = 1; i <= min; i++) {
1544 if (!isWordChar(*stack.currentFrame->args.subjectPtr))
1545 RRETURN_NO_MATCH;
1546 ++stack.currentFrame->args.subjectPtr;
1547 }
1548 break;
1549
1550 default:
1551 ASSERT_NOT_REACHED();
1552 return matchError(JSRegExpErrorInternal, stack);
1553 } /* End switch(stack.currentFrame->locals.ctype) */
1554 }
1555
1556 /* If min = max, continue at the same level without recursing */
1557
1558 if (min == stack.currentFrame->locals.max)
1559 NEXT_OPCODE;
1560
1561 /* If minimizing, we have to test the rest of the pattern before each
1562 subsequent match. */
1563
1564 if (minimize) {
1565 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
1566 RECURSIVE_MATCH(48, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1567 if (isMatch)
1568 RRETURN;
1569 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject)
1570 RRETURN;
1571
1572 int c = *stack.currentFrame->args.subjectPtr++;
1573 switch (stack.currentFrame->locals.ctype) {
1574 case OP_NOT_NEWLINE:
1575 if (isNewline(c))
1576 RRETURN;
1577 break;
1578
1579 case OP_NOT_DIGIT:
1580 if (isASCIIDigit(c))
1581 RRETURN;
1582 break;
1583
1584 case OP_DIGIT:
1585 if (!isASCIIDigit(c))
1586 RRETURN;
1587 break;
1588
1589 case OP_NOT_WHITESPACE:
1590 if (isSpaceChar(c))
1591 RRETURN;
1592 break;
1593
1594 case OP_WHITESPACE:
1595 if (!isSpaceChar(c))
1596 RRETURN;
1597 break;
1598
1599 case OP_NOT_WORDCHAR:
1600 if (isWordChar(c))
1601 RRETURN;
1602 break;
1603
1604 case OP_WORDCHAR:
1605 if (!isWordChar(c))
1606 RRETURN;
1607 break;
1608
1609 default:
1610 ASSERT_NOT_REACHED();
1611 return matchError(JSRegExpErrorInternal, stack);
1612 }
1613 }
1614 /* Control never reaches here */
1615 }
1616
1617 /* If maximizing it is worth using inline code for speed, doing the type
1618 test once at the start (i.e. keep it out of the loop). */
1619
1620 else {
1621 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; /* Remember where we started */
1622
1623 switch (stack.currentFrame->locals.ctype) {
1624 case OP_NOT_NEWLINE:
1625 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1626 if (stack.currentFrame->args.subjectPtr >= md.endSubject || isNewline(*stack.currentFrame->args.subjectPtr))
1627 break;
1628 stack.currentFrame->args.subjectPtr++;
1629 }
1630 break;
1631
1632 case OP_NOT_DIGIT:
1633 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1634 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1635 break;
1636 int c = *stack.currentFrame->args.subjectPtr;
1637 if (isASCIIDigit(c))
1638 break;
1639 ++stack.currentFrame->args.subjectPtr;
1640 }
1641 break;
1642
1643 case OP_DIGIT:
1644 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1645 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1646 break;
1647 int c = *stack.currentFrame->args.subjectPtr;
1648 if (!isASCIIDigit(c))
1649 break;
1650 ++stack.currentFrame->args.subjectPtr;
1651 }
1652 break;
1653
1654 case OP_NOT_WHITESPACE:
1655 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1656 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1657 break;
1658 int c = *stack.currentFrame->args.subjectPtr;
1659 if (isSpaceChar(c))
1660 break;
1661 ++stack.currentFrame->args.subjectPtr;
1662 }
1663 break;
1664
1665 case OP_WHITESPACE:
1666 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1667 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1668 break;
1669 int c = *stack.currentFrame->args.subjectPtr;
1670 if (!isSpaceChar(c))
1671 break;
1672 ++stack.currentFrame->args.subjectPtr;
1673 }
1674 break;
1675
1676 case OP_NOT_WORDCHAR:
1677 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1678 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1679 break;
1680 int c = *stack.currentFrame->args.subjectPtr;
1681 if (isWordChar(c))
1682 break;
1683 ++stack.currentFrame->args.subjectPtr;
1684 }
1685 break;
1686
1687 case OP_WORDCHAR:
1688 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1689 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1690 break;
1691 int c = *stack.currentFrame->args.subjectPtr;
1692 if (!isWordChar(c))
1693 break;
1694 ++stack.currentFrame->args.subjectPtr;
1695 }
1696 break;
1697
1698 default:
1699 ASSERT_NOT_REACHED();
1700 return matchError(JSRegExpErrorInternal, stack);
1701 }
1702
1703 /* stack.currentFrame->args.subjectPtr is now past the end of the maximum run */
1704
1705 for (;;) {
1706 RECURSIVE_MATCH(52, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1707 if (isMatch)
1708 RRETURN;
1709 if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction)
1710 break; /* Stop if tried at original pos */
1711 }
1712
1713 /* Get here if we can't make it match with any permitted repetitions */
1714
1715 RRETURN;
1716 }
1717 /* Control never reaches here */
1718
1719 BEGIN_OPCODE(CRMINPLUS):
1720 BEGIN_OPCODE(CRMINQUERY):
1721 BEGIN_OPCODE(CRMINRANGE):
1722 BEGIN_OPCODE(CRMINSTAR):
1723 BEGIN_OPCODE(CRPLUS):
1724 BEGIN_OPCODE(CRQUERY):
1725 BEGIN_OPCODE(CRRANGE):
1726 BEGIN_OPCODE(CRSTAR):
1727 ASSERT_NOT_REACHED();
1728 return matchError(JSRegExpErrorInternal, stack);
1729
1730#ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
1731 CAPTURING_BRACKET:
1732#else
1733 default:
1734#endif
1735 /* Opening capturing bracket. If there is space in the offset vector, save
1736 the current subject position in the working slot at the top of the vector. We
1737 mustn't change the current values of the data slot, because they may be set
1738 from a previous iteration of this group, and be referred to by a reference
1739 inside the group.
1740
1741 If the bracket fails to match, we need to restore this value and also the
1742 values of the final offsets, in case they were set by a previous iteration of
1743 the same bracket.
1744
1745 If there isn't enough space in the offset vector, treat this as if it were a
1746 non-capturing bracket. Don't worry about setting the flag for the error case
1747 here; that is handled in the code for KET. */
1748
1749 ASSERT(*stack.currentFrame->args.instructionPtr > OP_BRA);
1750
1751 stack.currentFrame->locals.number = *stack.currentFrame->args.instructionPtr - OP_BRA;
1752
1753 /* For extended extraction brackets (large number), we have to fish out the
1754 number from a dummy opcode at the start. */
1755
1756 if (stack.currentFrame->locals.number > EXTRACT_BASIC_MAX)
1757 stack.currentFrame->locals.number = get2ByteValue(stack.currentFrame->args.instructionPtr + 2 + LINK_SIZE);
1758 stack.currentFrame->locals.offset = stack.currentFrame->locals.number << 1;
1759
1760#ifdef DEBUG
1761 printf("start bracket %d subject=", stack.currentFrame->locals.number);
1762 pchars(stack.currentFrame->args.subjectPtr, 16, true, md);
1763 printf("\n");
1764#endif
1765
1766 if (stack.currentFrame->locals.offset < md.offsetMax) {
1767 stack.currentFrame->locals.saveOffset1 = md.offsetVector[stack.currentFrame->locals.offset];
1768 stack.currentFrame->locals.saveOffset2 = md.offsetVector[stack.currentFrame->locals.offset + 1];
1769 stack.currentFrame->locals.saveOffset3 = md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number];
1770
1771 DPRINTF(("saving %d %d %d\n", stack.currentFrame->locals.saveOffset1, stack.currentFrame->locals.saveOffset2, stack.currentFrame->locals.saveOffset3));
1772 md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number] = stack.currentFrame->args.subjectPtr - md.startSubject;
1773
1774 do {
9dae56ea 1775 RECURSIVE_MATCH_NEW_GROUP(1, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain);
b37bf2e1
A
1776 if (isMatch)
1777 RRETURN;
1778 stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1);
1779 } while (*stack.currentFrame->args.instructionPtr == OP_ALT);
1780
1781 DPRINTF(("bracket %d failed\n", stack.currentFrame->locals.number));
1782
1783 md.offsetVector[stack.currentFrame->locals.offset] = stack.currentFrame->locals.saveOffset1;
1784 md.offsetVector[stack.currentFrame->locals.offset + 1] = stack.currentFrame->locals.saveOffset2;
1785 md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number] = stack.currentFrame->locals.saveOffset3;
1786
1787 RRETURN;
1788 }
1789
1790 /* Insufficient room for saving captured contents */
1791
1792 goto NON_CAPTURING_BRACKET;
1793 }
1794
1795 /* Do not stick any code in here without much thought; it is assumed
1796 that "continue" in the code above comes out to here to repeat the main
1797 loop. */
1798
1799 } /* End of main loop */
1800
1801 ASSERT_NOT_REACHED();
1802
1803#ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
1804
1805RRETURN_SWITCH:
1806 switch (stack.currentFrame->returnLocation) {
1807 case 0: goto RETURN;
1808 case 1: goto RRETURN_1;
1809 case 2: goto RRETURN_2;
1810 case 6: goto RRETURN_6;
1811 case 7: goto RRETURN_7;
1812 case 14: goto RRETURN_14;
1813 case 15: goto RRETURN_15;
1814 case 16: goto RRETURN_16;
1815 case 17: goto RRETURN_17;
1816 case 18: goto RRETURN_18;
1817 case 19: goto RRETURN_19;
1818 case 20: goto RRETURN_20;
1819 case 21: goto RRETURN_21;
1820 case 22: goto RRETURN_22;
1821 case 24: goto RRETURN_24;
1822 case 26: goto RRETURN_26;
1823 case 27: goto RRETURN_27;
1824 case 28: goto RRETURN_28;
1825 case 29: goto RRETURN_29;
1826 case 30: goto RRETURN_30;
1827 case 31: goto RRETURN_31;
1828 case 38: goto RRETURN_38;
1829 case 40: goto RRETURN_40;
1830 case 42: goto RRETURN_42;
1831 case 44: goto RRETURN_44;
1832 case 48: goto RRETURN_48;
1833 case 52: goto RRETURN_52;
1834 }
1835
1836 ASSERT_NOT_REACHED();
1837 return matchError(JSRegExpErrorInternal, stack);
1838
1839#endif
1840
1841RETURN:
1842 return isMatch;
1843}
1844
1845
1846/*************************************************
1847* Execute a Regular Expression *
1848*************************************************/
1849
1850/* This function applies a compiled re to a subject string and picks out
1851portions of the string if it matches. Two elements in the vector are set for
1852each substring: the offsets to the start and end of the substring.
1853
1854Arguments:
1855 re points to the compiled expression
1856 extra_data points to extra data or is NULL
1857 subject points to the subject string
1858 length length of subject string (may contain binary zeros)
1859 start_offset where to start in the subject string
1860 options option bits
1861 offsets points to a vector of ints to be filled in with offsets
9dae56ea 1862 offsetCount the number of elements in the vector
b37bf2e1
A
1863
1864Returns: > 0 => success; value is the number of elements filled in
1865 = 0 => success, but offsets is not big enough
1866 -1 => failed to match
1867 < -1 => some kind of unexpected problem
1868*/
1869
9dae56ea 1870static void tryFirstByteOptimization(const UChar*& subjectPtr, const UChar* endSubject, int firstByte, bool firstByteIsCaseless, bool useMultiLineFirstCharOptimization, const UChar* originalSubjectStart)
b37bf2e1 1871{
9dae56ea 1872 // If firstByte is set, try scanning to the first instance of that byte
b37bf2e1 1873 // no need to try and match against any earlier part of the subject string.
9dae56ea
A
1874 if (firstByte >= 0) {
1875 UChar firstChar = firstByte;
1876 if (firstByteIsCaseless)
b37bf2e1
A
1877 while (subjectPtr < endSubject) {
1878 int c = *subjectPtr;
1879 if (c > 127)
1880 break;
9dae56ea 1881 if (toLowerCase(c) == firstChar)
b37bf2e1
A
1882 break;
1883 subjectPtr++;
1884 }
1885 else {
9dae56ea 1886 while (subjectPtr < endSubject && *subjectPtr != firstChar)
b37bf2e1
A
1887 subjectPtr++;
1888 }
1889 } else if (useMultiLineFirstCharOptimization) {
1890 /* Or to just after \n for a multiline match if possible */
1891 // I'm not sure why this != originalSubjectStart check is necessary -- ecs 11/18/07
1892 if (subjectPtr > originalSubjectStart) {
1893 while (subjectPtr < endSubject && !isNewline(subjectPtr[-1]))
1894 subjectPtr++;
1895 }
1896 }
1897}
1898
9dae56ea 1899static bool tryRequiredByteOptimization(const UChar*& subjectPtr, const UChar* endSubject, int reqByte, int reqByte2, bool reqByteIsCaseless, bool hasFirstByte, const UChar*& reqBytePtr)
b37bf2e1 1900{
9dae56ea
A
1901 /* If reqByte is set, we know that that character must appear in the subject
1902 for the match to succeed. If the first character is set, reqByte must be
b37bf2e1
A
1903 later in the subject; otherwise the test starts at the match point. This
1904 optimization can save a huge amount of backtracking in patterns with nested
1905 unlimited repeats that aren't going to match. Writing separate code for
1906 cased/caseless versions makes it go faster, as does using an autoincrement
1907 and backing off on a match.
1908
1909 HOWEVER: when the subject string is very, very long, searching to its end can
1910 take a long time, and give bad performance on quite ordinary patterns. This
1911 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
1912 don't do this when the string is sufficiently long.
1913 */
1914
9dae56ea 1915 if (reqByte >= 0 && endSubject - subjectPtr < REQ_BYTE_MAX) {
b37bf2e1
A
1916 const UChar* p = subjectPtr + (hasFirstByte ? 1 : 0);
1917
1918 /* We don't need to repeat the search if we haven't yet reached the
1919 place we found it at last time. */
1920
1921 if (p > reqBytePtr) {
9dae56ea 1922 if (reqByteIsCaseless) {
b37bf2e1
A
1923 while (p < endSubject) {
1924 int pp = *p++;
9dae56ea 1925 if (pp == reqByte || pp == reqByte2) {
b37bf2e1
A
1926 p--;
1927 break;
1928 }
1929 }
1930 } else {
1931 while (p < endSubject) {
9dae56ea 1932 if (*p++ == reqByte) {
b37bf2e1
A
1933 p--;
1934 break;
1935 }
1936 }
1937 }
1938
1939 /* If we can't find the required character, break the matching loop */
1940
1941 if (p >= endSubject)
1942 return true;
1943
1944 /* If we have found the required character, save the point where we
1945 found it, so that we don't search again next time round the loop if
1946 the start hasn't passed this character yet. */
1947
1948 reqBytePtr = p;
1949 }
1950 }
1951 return false;
1952}
1953
1954int jsRegExpExecute(const JSRegExp* re,
1955 const UChar* subject, int length, int start_offset, int* offsets,
9dae56ea 1956 int offsetCount)
b37bf2e1
A
1957{
1958 ASSERT(re);
9dae56ea
A
1959 ASSERT(subject || !length);
1960 ASSERT(offsetCount >= 0);
1961 ASSERT(offsets || offsetCount == 0);
1962
1963 HistogramTimeLogger logger(re);
1964
b37bf2e1
A
1965 MatchData matchBlock;
1966 matchBlock.startSubject = subject;
1967 matchBlock.endSubject = matchBlock.startSubject + length;
1968 const UChar* endSubject = matchBlock.endSubject;
1969
1970 matchBlock.multiline = (re->options & MatchAcrossMultipleLinesOption);
1971 matchBlock.ignoreCase = (re->options & IgnoreCaseOption);
1972
1973 /* If the expression has got more back references than the offsets supplied can
1974 hold, we get a temporary chunk of working store to use during the matching.
1975 Otherwise, we can use the vector supplied, rounding down its size to a multiple
1976 of 3. */
1977
9dae56ea 1978 int ocount = offsetCount - (offsetCount % 3);
b37bf2e1
A
1979
1980 // FIXME: This is lame that we have to second-guess our caller here.
1981 // The API should change to either fail-hard when we don't have enough offset space
1982 // or that we shouldn't ask our callers to pre-allocate in the first place.
9dae56ea
A
1983 bool usingTemporaryOffsets = false;
1984 if (re->topBackref > 0 && re->topBackref >= ocount/3) {
1985 ocount = re->topBackref * 3 + 3;
b37bf2e1
A
1986 matchBlock.offsetVector = new int[ocount];
1987 if (!matchBlock.offsetVector)
1988 return JSRegExpErrorNoMemory;
9dae56ea 1989 usingTemporaryOffsets = true;
b37bf2e1
A
1990 } else
1991 matchBlock.offsetVector = offsets;
1992
1993 matchBlock.offsetEnd = ocount;
1994 matchBlock.offsetMax = (2*ocount)/3;
1995 matchBlock.offsetOverflow = false;
1996
1997 /* Compute the minimum number of offsets that we need to reset each time. Doing
1998 this makes a huge difference to execution time when there aren't many brackets
1999 in the pattern. */
2000
9dae56ea
A
2001 int resetCount = 2 + re->topBracket * 2;
2002 if (resetCount > offsetCount)
2003 resetCount = ocount;
b37bf2e1
A
2004
2005 /* Reset the working variable associated with each extraction. These should
2006 never be used unless previously set, but they get saved and restored, and so we
2007 initialize them to avoid reading uninitialized locations. */
2008
2009 if (matchBlock.offsetVector) {
2010 int* iptr = matchBlock.offsetVector + ocount;
9dae56ea 2011 int* iend = iptr - resetCount/2 + 1;
b37bf2e1
A
2012 while (--iptr >= iend)
2013 *iptr = -1;
2014 }
2015
9dae56ea 2016 /* Set up the first character to match, if available. The firstByte value is
b37bf2e1
A
2017 never set for an anchored regular expression, but the anchoring may be forced
2018 at run time, so we have to test for anchoring. The first char may be unset for
2019 an unanchored pattern, of course. If there's no first char and the pattern was
2020 studied, there may be a bitmap of possible first characters. */
2021
9dae56ea
A
2022 bool firstByteIsCaseless = false;
2023 int firstByte = -1;
b37bf2e1 2024 if (re->options & UseFirstByteOptimizationOption) {
9dae56ea
A
2025 firstByte = re->firstByte & 255;
2026 if ((firstByteIsCaseless = (re->firstByte & REQ_IGNORE_CASE)))
2027 firstByte = toLowerCase(firstByte);
b37bf2e1
A
2028 }
2029
2030 /* For anchored or unanchored matches, there may be a "last known required
2031 character" set. */
2032
9dae56ea
A
2033 bool reqByteIsCaseless = false;
2034 int reqByte = -1;
2035 int reqByte2 = -1;
b37bf2e1 2036 if (re->options & UseRequiredByteOptimizationOption) {
9dae56ea
A
2037 reqByte = re->reqByte & 255; // FIXME: This optimization could be made to work for UTF16 chars as well...
2038 reqByteIsCaseless = (re->reqByte & REQ_IGNORE_CASE);
2039 reqByte2 = flipCase(reqByte);
b37bf2e1
A
2040 }
2041
2042 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
2043 the loop runs just once. */
2044
2045 const UChar* startMatch = subject + start_offset;
2046 const UChar* reqBytePtr = startMatch - 1;
2047 bool useMultiLineFirstCharOptimization = re->options & UseMultiLineFirstByteOptimizationOption;
2048
2049 do {
2050 /* Reset the maximum number of extractions we might see. */
2051 if (matchBlock.offsetVector) {
2052 int* iptr = matchBlock.offsetVector;
9dae56ea 2053 int* iend = iptr + resetCount;
b37bf2e1
A
2054 while (iptr < iend)
2055 *iptr++ = -1;
2056 }
2057
9dae56ea
A
2058 tryFirstByteOptimization(startMatch, endSubject, firstByte, firstByteIsCaseless, useMultiLineFirstCharOptimization, matchBlock.startSubject + start_offset);
2059 if (tryRequiredByteOptimization(startMatch, endSubject, reqByte, reqByte2, reqByteIsCaseless, firstByte >= 0, reqBytePtr))
b37bf2e1
A
2060 break;
2061
2062 /* When a match occurs, substrings will be set for all internal extractions;
2063 we just need to set up the whole thing as substring 0 before returning. If
2064 there were too many extractions, set the return code to zero. In the case
2065 where we had to get some local store to hold offsets for backreferences, copy
2066 those back references that we can. In this case there need not be overflow
2067 if certain parts of the pattern were not used. */
2068
2069 /* The code starts after the JSRegExp block and the capture name table. */
2070 const unsigned char* start_code = (const unsigned char*)(re + 1);
2071
2072 int returnCode = match(startMatch, start_code, 2, matchBlock);
2073
2074 /* When the result is no match, advance the pointer to the next character
2075 and continue. */
2076 if (returnCode == 0) {
2077 startMatch++;
2078 continue;
2079 }
2080
2081 if (returnCode != 1) {
2082 ASSERT(returnCode == JSRegExpErrorHitLimit || returnCode == JSRegExpErrorNoMemory);
9dae56ea 2083 DPRINTF((">>>> error: returning %d\n", returnCode));
b37bf2e1
A
2084 return returnCode;
2085 }
2086
2087 /* We have a match! Copy the offset information from temporary store if
2088 necessary */
2089
9dae56ea
A
2090 if (usingTemporaryOffsets) {
2091 if (offsetCount >= 4) {
2092 memcpy(offsets + 2, matchBlock.offsetVector + 2, (offsetCount - 2) * sizeof(int));
b37bf2e1
A
2093 DPRINTF(("Copied offsets from temporary memory\n"));
2094 }
9dae56ea 2095 if (matchBlock.endOffsetTop > offsetCount)
b37bf2e1
A
2096 matchBlock.offsetOverflow = true;
2097
2098 DPRINTF(("Freeing temporary memory\n"));
2099 delete [] matchBlock.offsetVector;
2100 }
2101
2102 returnCode = matchBlock.offsetOverflow ? 0 : matchBlock.endOffsetTop / 2;
2103
9dae56ea 2104 if (offsetCount < 2)
b37bf2e1
A
2105 returnCode = 0;
2106 else {
2107 offsets[0] = startMatch - matchBlock.startSubject;
2108 offsets[1] = matchBlock.endMatchPtr - matchBlock.startSubject;
2109 }
2110
9dae56ea 2111 DPRINTF((">>>> returning %d\n", returnCode));
b37bf2e1 2112 return returnCode;
9dae56ea 2113 } while (!(re->options & IsAnchoredOption) && startMatch <= endSubject);
b37bf2e1 2114
9dae56ea 2115 if (usingTemporaryOffsets) {
b37bf2e1
A
2116 DPRINTF(("Freeing temporary memory\n"));
2117 delete [] matchBlock.offsetVector;
2118 }
2119
2120 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
2121 return JSRegExpErrorNoMatch;
2122}
9dae56ea
A
2123
2124#if REGEXP_HISTOGRAM
2125
2126class CompareHistogramEntries {
2127public:
2128 bool operator()(const pair<UString, double>& a, const pair<UString, double>& b)
2129 {
2130 if (a.second == b.second)
2131 return a.first < b.first;
2132 return a.second < b.second;
2133 }
2134};
2135
2136Histogram::~Histogram()
2137{
2138 Vector<pair<UString, double> > values;
2139 Map::iterator end = times.end();
2140 for (Map::iterator it = times.begin(); it != end; ++it)
2141 values.append(*it);
2142 sort(values.begin(), values.end(), CompareHistogramEntries());
2143 size_t size = values.size();
2144 printf("Regular Expressions, sorted by time spent evaluating them:\n");
2145 for (size_t i = 0; i < size; ++i)
2146 printf(" %f - %s\n", values[size - i - 1].second, values[size - i - 1].first.UTF8String().c_str());
2147}
2148
2149void Histogram::add(const JSRegExp* re, double elapsedTime)
2150{
2151 UString string(reinterpret_cast<const UChar*>(reinterpret_cast<const char*>(re) + re->stringOffset), re->stringLength);
2152 if (re->options & IgnoreCaseOption && re->options & MatchAcrossMultipleLinesOption)
2153 string += " (multi-line, ignore case)";
2154 else {
2155 if (re->options & IgnoreCaseOption)
2156 string += " (ignore case)";
2157 if (re->options & MatchAcrossMultipleLinesOption)
2158 string += " (multi-line)";
2159 }
2160 pair<Map::iterator, bool> result = times.add(string.rep(), elapsedTime);
2161 if (!result.second)
2162 result.first->second += elapsedTime;
2163}
2164
2165HistogramTimeLogger::HistogramTimeLogger(const JSRegExp* re)
2166 : m_re(re)
f9bf01c6 2167 , m_startTime(currentTimeMS())
9dae56ea
A
2168{
2169}
2170
2171HistogramTimeLogger::~HistogramTimeLogger()
2172{
2173 static Histogram histogram;
f9bf01c6 2174 histogram.add(m_re, currentTimeMS() - m_startTime);
9dae56ea
A
2175}
2176
2177#endif