]> git.saurik.com Git - apple/javascriptcore.git/blame - pcre/pcre_exec.cpp
JavaScriptCore-525.tar.gz
[apple/javascriptcore.git] / pcre / pcre_exec.cpp
CommitLineData
b37bf2e1
A
1/* This is JavaScriptCore's variant of the PCRE library. While this library
2started out as a copy of PCRE, many of the features of PCRE have been
3removed. This library now supports only the regular expression features
4required by the JavaScript language specification, and has only the functions
5needed by JavaScriptCore and the rest of WebKit.
6
7 Originally written by Philip Hazel
8 Copyright (c) 1997-2006 University of Cambridge
9dae56ea 9 Copyright (C) 2002, 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
b37bf2e1
A
10 Copyright (C) 2007 Eric Seidel <eric@webkit.org>
11
12-----------------------------------------------------------------------------
13Redistribution and use in source and binary forms, with or without
14modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37POSSIBILITY OF SUCH DAMAGE.
38-----------------------------------------------------------------------------
39*/
40
41/* This module contains jsRegExpExecute(), the externally visible function
42that does pattern matching using an NFA algorithm, following the rules from
43the JavaScript specification. There are also some supporting functions. */
44
45#include "config.h"
b37bf2e1
A
46#include "pcre_internal.h"
47
9dae56ea 48#include <limits.h>
b37bf2e1
A
49#include <wtf/ASCIICType.h>
50#include <wtf/Vector.h>
51
9dae56ea
A
52#if REGEXP_HISTOGRAM
53#include <parser/DateMath.h>
54#include <runtime/UString.h>
55#endif
b37bf2e1
A
56
57using namespace WTF;
58
59#ifdef __GNUC__
60#define USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
61//#define USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
62#endif
63
64/* Avoid warnings on Windows. */
65#undef min
66#undef max
67
68#ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
69typedef int ReturnLocation;
70#else
71typedef void* ReturnLocation;
72#endif
73
9dae56ea
A
74#if !REGEXP_HISTOGRAM
75
76class HistogramTimeLogger {
77public:
78 HistogramTimeLogger(const JSRegExp*) { }
79};
80
81#else
82
83using namespace JSC;
84
85class Histogram {
86public:
87 ~Histogram();
88 void add(const JSRegExp*, double);
89
90private:
91 typedef HashMap<RefPtr<UString::Rep>, double> Map;
92 Map times;
93};
94
95class HistogramTimeLogger {
96public:
97 HistogramTimeLogger(const JSRegExp*);
98 ~HistogramTimeLogger();
99
100private:
101 const JSRegExp* m_re;
102 double m_startTime;
103};
104
105#endif
106
b37bf2e1
A
107/* Structure for building a chain of data for holding the values of
108the subject pointer at the start of each bracket, used to detect when
109an empty string has been matched by a bracket to break infinite loops. */
110struct BracketChainNode {
111 BracketChainNode* previousBracket;
112 const UChar* bracketStart;
113};
114
115struct MatchFrame {
116 ReturnLocation returnLocation;
117 struct MatchFrame* previousFrame;
118
119 /* Function arguments that may change */
120 struct {
121 const UChar* subjectPtr;
122 const unsigned char* instructionPtr;
123 int offsetTop;
124 BracketChainNode* bracketChain;
125 } args;
126
127
128 /* PCRE uses "fake" recursion built off of gotos, thus
129 stack-based local variables are not safe to use. Instead we have to
130 store local variables on the current MatchFrame. */
131 struct {
132 const unsigned char* data;
133 const unsigned char* startOfRepeatingBracket;
134 const UChar* subjectPtrAtStartOfInstruction; // Several instrutions stash away a subjectPtr here for later compare
135 const unsigned char* instructionPtrAtStartOfOnce;
136
137 int repeatOthercase;
138
139 int ctype;
140 int fc;
141 int fi;
142 int length;
143 int max;
144 int number;
145 int offset;
146 int saveOffset1;
147 int saveOffset2;
148 int saveOffset3;
149
150 BracketChainNode bracketChainNode;
151 } locals;
152};
153
154/* Structure for passing "static" information around between the functions
155doing traditional NFA matching, so that they are thread-safe. */
156
157struct MatchData {
158 int* offsetVector; /* Offset vector */
159 int offsetEnd; /* One past the end */
160 int offsetMax; /* The maximum usable for return data */
161 bool offsetOverflow; /* Set if too many extractions */
162 const UChar* startSubject; /* Start of the subject string */
163 const UChar* endSubject; /* End of the subject string */
164 const UChar* endMatchPtr; /* Subject position at end match */
165 int endOffsetTop; /* Highwater mark at end of match */
166 bool multiline;
167 bool ignoreCase;
168};
169
170/* The maximum remaining length of subject we are prepared to search for a
9dae56ea 171reqByte match. */
b37bf2e1
A
172
173#define REQ_BYTE_MAX 1000
174
175/* The below limit restricts the number of "recursive" match calls in order to
176avoid spending exponential time on complex regular expressions. */
177
178static const unsigned matchLimit = 100000;
179
180#ifdef DEBUG
181/*************************************************
182* Debugging function to print chars *
183*************************************************/
184
185/* Print a sequence of chars in printable format, stopping at the end of the
186subject if the requested.
187
188Arguments:
189 p points to characters
190 length number to print
191 isSubject true if printing from within md.startSubject
192 md pointer to matching data block, if isSubject is true
193*/
194
195static void pchars(const UChar* p, int length, bool isSubject, const MatchData& md)
196{
197 if (isSubject && length > md.endSubject - p)
198 length = md.endSubject - p;
199 while (length-- > 0) {
200 int c;
201 if (isprint(c = *(p++)))
202 printf("%c", c);
203 else if (c < 256)
204 printf("\\x%02x", c);
205 else
206 printf("\\x{%x}", c);
207 }
208}
209#endif
210
211/*************************************************
212* Match a back-reference *
213*************************************************/
214
215/* If a back reference hasn't been set, the length that is passed is greater
216than the number of characters left in the string, so the match fails.
217
218Arguments:
219 offset index into the offset vector
220 subjectPtr points into the subject
221 length length to be matched
222 md points to match data block
223
224Returns: true if matched
225*/
226
227static bool matchRef(int offset, const UChar* subjectPtr, int length, const MatchData& md)
228{
229 const UChar* p = md.startSubject + md.offsetVector[offset];
230
231#ifdef DEBUG
232 if (subjectPtr >= md.endSubject)
233 printf("matching subject <null>");
234 else {
235 printf("matching subject ");
236 pchars(subjectPtr, length, true, md);
237 }
238 printf(" against backref ");
239 pchars(p, length, false, md);
240 printf("\n");
241#endif
242
243 /* Always fail if not enough characters left */
244
245 if (length > md.endSubject - subjectPtr)
246 return false;
247
248 /* Separate the caselesss case for speed */
249
250 if (md.ignoreCase) {
251 while (length-- > 0) {
252 UChar c = *p++;
9dae56ea 253 int othercase = jsc_pcre_ucp_othercase(c);
b37bf2e1
A
254 UChar d = *subjectPtr++;
255 if (c != d && othercase != d)
256 return false;
257 }
258 }
259 else {
260 while (length-- > 0)
261 if (*p++ != *subjectPtr++)
262 return false;
263 }
264
265 return true;
266}
267
268#ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
269
270/* Use numbered labels and switch statement at the bottom of the match function. */
271
272#define RMATCH_WHERE(num) num
273#define RRETURN_LABEL RRETURN_SWITCH
274
275#else
276
277/* Use GCC's computed goto extension. */
278
279/* For one test case this is more than 40% faster than the switch statement.
280We could avoid the use of the num argument entirely by using local labels,
281but using it for the GCC case as well as the non-GCC case allows us to share
282a bit more code and notice if we use conflicting numbers.*/
283
284#define RMATCH_WHERE(num) &&RRETURN_##num
285#define RRETURN_LABEL *stack.currentFrame->returnLocation
286
287#endif
288
289#define RECURSIVE_MATCH_COMMON(num) \
290 goto RECURSE;\
291 RRETURN_##num: \
292 stack.popCurrentFrame();
293
294#define RECURSIVE_MATCH(num, ra, rb) \
295 do { \
296 stack.pushNewFrame((ra), (rb), RMATCH_WHERE(num)); \
297 RECURSIVE_MATCH_COMMON(num) \
298 } while (0)
299
9dae56ea 300#define RECURSIVE_MATCH_NEW_GROUP(num, ra, rb) \
b37bf2e1
A
301 do { \
302 stack.pushNewFrame((ra), (rb), RMATCH_WHERE(num)); \
303 startNewGroup(stack.currentFrame); \
304 RECURSIVE_MATCH_COMMON(num) \
305 } while (0)
306
307#define RRETURN goto RRETURN_LABEL
308
309#define RRETURN_NO_MATCH do { isMatch = false; RRETURN; } while (0)
310
311/*************************************************
312* Match from current position *
313*************************************************/
314
315/* On entry instructionPtr points to the first opcode, and subjectPtr to the first character
316in the subject string, while substringStart holds the value of subjectPtr at the start of the
317last bracketed group - used for breaking infinite loops matching zero-length
318strings. This function is called recursively in many circumstances. Whenever it
319returns a negative (error) response, the outer match() call must also return the
320same response.
321
322Arguments:
323 subjectPtr pointer in subject
324 instructionPtr position in code
325 offsetTop current top pointer
326 md pointer to "static" info for the match
327
328Returns: 1 if matched ) these values are >= 0
329 0 if failed to match )
330 a negative error value if aborted by an error condition
331 (e.g. stopped by repeated call or recursion limit)
332*/
333
9dae56ea 334static const unsigned numFramesOnStack = 16;
b37bf2e1
A
335
336struct MatchStack {
337 MatchStack()
9dae56ea 338 : framesEnd(frames + numFramesOnStack)
b37bf2e1
A
339 , currentFrame(frames)
340 , size(1) // match() creates accesses the first frame w/o calling pushNewFrame
341 {
9dae56ea 342 ASSERT((sizeof(frames) / sizeof(frames[0])) == numFramesOnStack);
b37bf2e1
A
343 }
344
9dae56ea 345 MatchFrame frames[numFramesOnStack];
b37bf2e1
A
346 MatchFrame* framesEnd;
347 MatchFrame* currentFrame;
348 unsigned size;
349
350 inline bool canUseStackBufferForNextFrame()
351 {
9dae56ea 352 return size < numFramesOnStack;
b37bf2e1
A
353 }
354
355 inline MatchFrame* allocateNextFrame()
356 {
357 if (canUseStackBufferForNextFrame())
358 return currentFrame + 1;
359 return new MatchFrame;
360 }
361
362 inline void pushNewFrame(const unsigned char* instructionPtr, BracketChainNode* bracketChain, ReturnLocation returnLocation)
363 {
364 MatchFrame* newframe = allocateNextFrame();
365 newframe->previousFrame = currentFrame;
366
367 newframe->args.subjectPtr = currentFrame->args.subjectPtr;
368 newframe->args.offsetTop = currentFrame->args.offsetTop;
369 newframe->args.instructionPtr = instructionPtr;
370 newframe->args.bracketChain = bracketChain;
371 newframe->returnLocation = returnLocation;
372 size++;
373
374 currentFrame = newframe;
375 }
376
377 inline void popCurrentFrame()
378 {
379 MatchFrame* oldFrame = currentFrame;
380 currentFrame = currentFrame->previousFrame;
9dae56ea 381 if (size > numFramesOnStack)
b37bf2e1
A
382 delete oldFrame;
383 size--;
384 }
385
386 void popAllFrames()
387 {
388 while (size)
389 popCurrentFrame();
390 }
391};
392
393static int matchError(int errorCode, MatchStack& stack)
394{
395 stack.popAllFrames();
396 return errorCode;
397}
398
399/* Get the next UTF-8 character, not advancing the pointer, incrementing length
400 if there are extra bytes. This is called when we know we are in UTF-8 mode. */
401
402static inline void getUTF8CharAndIncrementLength(int& c, const unsigned char* subjectPtr, int& len)
403{
404 c = *subjectPtr;
405 if ((c & 0xc0) == 0xc0) {
9dae56ea 406 int gcaa = jsc_pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */
b37bf2e1 407 int gcss = 6 * gcaa;
9dae56ea 408 c = (c & jsc_pcre_utf8_table3[gcaa]) << gcss;
b37bf2e1
A
409 for (int gcii = 1; gcii <= gcaa; gcii++) {
410 gcss -= 6;
411 c |= (subjectPtr[gcii] & 0x3f) << gcss;
412 }
413 len += gcaa;
414 }
415}
416
417static inline void startNewGroup(MatchFrame* currentFrame)
418{
419 /* At the start of a bracketed group, add the current subject pointer to the
420 stack of such pointers, to be re-instated at the end of the group when we hit
421 the closing ket. When match() is called in other circumstances, we don't add to
422 this stack. */
423
424 currentFrame->locals.bracketChainNode.previousBracket = currentFrame->args.bracketChain;
425 currentFrame->locals.bracketChainNode.bracketStart = currentFrame->args.subjectPtr;
426 currentFrame->args.bracketChain = &currentFrame->locals.bracketChainNode;
427}
428
429// FIXME: "minimize" means "not greedy", we should invert the callers to ask for "greedy" to be less confusing
430static inline void repeatInformationFromInstructionOffset(short instructionOffset, bool& minimize, int& minimumRepeats, int& maximumRepeats)
431{
432 // Instruction offsets are based off of OP_CRSTAR, OP_STAR, OP_TYPESTAR, OP_NOTSTAR
433 static const char minimumRepeatsFromInstructionOffset[] = { 0, 0, 1, 1, 0, 0 };
434 static const int maximumRepeatsFromInstructionOffset[] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, 1, 1 };
435
436 ASSERT(instructionOffset >= 0);
437 ASSERT(instructionOffset <= (OP_CRMINQUERY - OP_CRSTAR));
438
439 minimize = (instructionOffset & 1); // this assumes ordering: Instruction, MinimizeInstruction, Instruction2, MinimizeInstruction2
440 minimumRepeats = minimumRepeatsFromInstructionOffset[instructionOffset];
441 maximumRepeats = maximumRepeatsFromInstructionOffset[instructionOffset];
442}
443
444static int match(const UChar* subjectPtr, const unsigned char* instructionPtr, int offsetTop, MatchData& md)
445{
446 bool isMatch = false;
447 int min;
448 bool minimize = false; /* Initialization not really needed, but some compilers think so. */
9dae56ea 449 unsigned remainingMatchCount = matchLimit;
b37bf2e1
A
450
451 MatchStack stack;
452
453 /* The opcode jump table. */
454#ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
455#define EMIT_JUMP_TABLE_ENTRY(opcode) &&LABEL_OP_##opcode,
456 static void* opcodeJumpTable[256] = { FOR_EACH_OPCODE(EMIT_JUMP_TABLE_ENTRY) };
457#undef EMIT_JUMP_TABLE_ENTRY
458#endif
459
460 /* One-time setup of the opcode jump table. */
461#ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
462 for (int i = 255; !opcodeJumpTable[i]; i--)
463 opcodeJumpTable[i] = &&CAPTURING_BRACKET;
464#endif
465
466#ifdef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
467 // Shark shows this as a hot line
468 // Using a static const here makes this line disappear, but makes later access hotter (not sure why)
469 stack.currentFrame->returnLocation = &&RETURN;
470#else
471 stack.currentFrame->returnLocation = 0;
472#endif
473 stack.currentFrame->args.subjectPtr = subjectPtr;
474 stack.currentFrame->args.instructionPtr = instructionPtr;
475 stack.currentFrame->args.offsetTop = offsetTop;
476 stack.currentFrame->args.bracketChain = 0;
477 startNewGroup(stack.currentFrame);
478
479 /* This is where control jumps back to to effect "recursion" */
480
481RECURSE:
9dae56ea 482 if (!--remainingMatchCount)
b37bf2e1
A
483 return matchError(JSRegExpErrorHitLimit, stack);
484
485 /* Now start processing the operations. */
486
487#ifndef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
488 while (true)
489#endif
490 {
491
492#ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
493#define BEGIN_OPCODE(opcode) LABEL_OP_##opcode
494#define NEXT_OPCODE goto *opcodeJumpTable[*stack.currentFrame->args.instructionPtr]
495#else
496#define BEGIN_OPCODE(opcode) case OP_##opcode
497#define NEXT_OPCODE continue
498#endif
499
500#ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
501 NEXT_OPCODE;
502#else
503 switch (*stack.currentFrame->args.instructionPtr)
504#endif
505 {
506 /* Non-capturing bracket: optimized */
507
508 BEGIN_OPCODE(BRA):
509 NON_CAPTURING_BRACKET:
510 DPRINTF(("start bracket 0\n"));
511 do {
9dae56ea 512 RECURSIVE_MATCH_NEW_GROUP(2, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain);
b37bf2e1
A
513 if (isMatch)
514 RRETURN;
515 stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1);
516 } while (*stack.currentFrame->args.instructionPtr == OP_ALT);
517 DPRINTF(("bracket 0 failed\n"));
518 RRETURN;
519
520 /* Skip over large extraction number data if encountered. */
521
522 BEGIN_OPCODE(BRANUMBER):
523 stack.currentFrame->args.instructionPtr += 3;
524 NEXT_OPCODE;
525
526 /* End of the pattern. */
527
528 BEGIN_OPCODE(END):
529 md.endMatchPtr = stack.currentFrame->args.subjectPtr; /* Record where we ended */
530 md.endOffsetTop = stack.currentFrame->args.offsetTop; /* and how many extracts were taken */
531 isMatch = true;
532 RRETURN;
533
534 /* Assertion brackets. Check the alternative branches in turn - the
535 matching won't pass the KET for an assertion. If any one branch matches,
536 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
537 start of each branch to move the current point backwards, so the code at
538 this level is identical to the lookahead case. */
539
540 BEGIN_OPCODE(ASSERT):
541 do {
9dae56ea 542 RECURSIVE_MATCH_NEW_GROUP(6, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, NULL);
b37bf2e1
A
543 if (isMatch)
544 break;
545 stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1);
546 } while (*stack.currentFrame->args.instructionPtr == OP_ALT);
547 if (*stack.currentFrame->args.instructionPtr == OP_KET)
548 RRETURN_NO_MATCH;
549
550 /* Continue from after the assertion, updating the offsets high water
551 mark, since extracts may have been taken during the assertion. */
552
553 advanceToEndOfBracket(stack.currentFrame->args.instructionPtr);
554 stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE;
555 stack.currentFrame->args.offsetTop = md.endOffsetTop;
556 NEXT_OPCODE;
557
558 /* Negative assertion: all branches must fail to match */
559
560 BEGIN_OPCODE(ASSERT_NOT):
561 do {
9dae56ea 562 RECURSIVE_MATCH_NEW_GROUP(7, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, NULL);
b37bf2e1
A
563 if (isMatch)
564 RRETURN_NO_MATCH;
565 stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1);
566 } while (*stack.currentFrame->args.instructionPtr == OP_ALT);
567
568 stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE;
569 NEXT_OPCODE;
570
571 /* An alternation is the end of a branch; scan along to find the end of the
572 bracketed group and go to there. */
573
574 BEGIN_OPCODE(ALT):
575 advanceToEndOfBracket(stack.currentFrame->args.instructionPtr);
576 NEXT_OPCODE;
577
578 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
579 that it may occur zero times. It may repeat infinitely, or not at all -
580 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
581 repeat limits are compiled as a number of copies, with the optional ones
582 preceded by BRAZERO or BRAMINZERO. */
583
584 BEGIN_OPCODE(BRAZERO): {
585 stack.currentFrame->locals.startOfRepeatingBracket = stack.currentFrame->args.instructionPtr + 1;
9dae56ea 586 RECURSIVE_MATCH_NEW_GROUP(14, stack.currentFrame->locals.startOfRepeatingBracket, stack.currentFrame->args.bracketChain);
b37bf2e1
A
587 if (isMatch)
588 RRETURN;
589 advanceToEndOfBracket(stack.currentFrame->locals.startOfRepeatingBracket);
590 stack.currentFrame->args.instructionPtr = stack.currentFrame->locals.startOfRepeatingBracket + 1 + LINK_SIZE;
591 NEXT_OPCODE;
592 }
593
594 BEGIN_OPCODE(BRAMINZERO): {
595 stack.currentFrame->locals.startOfRepeatingBracket = stack.currentFrame->args.instructionPtr + 1;
596 advanceToEndOfBracket(stack.currentFrame->locals.startOfRepeatingBracket);
9dae56ea 597 RECURSIVE_MATCH_NEW_GROUP(15, stack.currentFrame->locals.startOfRepeatingBracket + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain);
b37bf2e1
A
598 if (isMatch)
599 RRETURN;
600 stack.currentFrame->args.instructionPtr++;
601 NEXT_OPCODE;
602 }
603
604 /* End of a group, repeated or non-repeating. If we are at the end of
605 an assertion "group", stop matching and return 1, but record the
606 current high water mark for use by positive assertions. Do this also
607 for the "once" (not-backup up) groups. */
608
609 BEGIN_OPCODE(KET):
610 BEGIN_OPCODE(KETRMIN):
611 BEGIN_OPCODE(KETRMAX):
612 stack.currentFrame->locals.instructionPtrAtStartOfOnce = stack.currentFrame->args.instructionPtr - getLinkValue(stack.currentFrame->args.instructionPtr + 1);
613 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.bracketChain->bracketStart;
614
615 /* Back up the stack of bracket start pointers. */
616
617 stack.currentFrame->args.bracketChain = stack.currentFrame->args.bracketChain->previousBracket;
618
619 if (*stack.currentFrame->locals.instructionPtrAtStartOfOnce == OP_ASSERT || *stack.currentFrame->locals.instructionPtrAtStartOfOnce == OP_ASSERT_NOT) {
620 md.endOffsetTop = stack.currentFrame->args.offsetTop;
621 isMatch = true;
622 RRETURN;
623 }
624
625 /* In all other cases except a conditional group we have to check the
626 group number back at the start and if necessary complete handling an
627 extraction by setting the offsets and bumping the high water mark. */
628
629 stack.currentFrame->locals.number = *stack.currentFrame->locals.instructionPtrAtStartOfOnce - OP_BRA;
630
631 /* For extended extraction brackets (large number), we have to fish out
632 the number from a dummy opcode at the start. */
633
634 if (stack.currentFrame->locals.number > EXTRACT_BASIC_MAX)
635 stack.currentFrame->locals.number = get2ByteValue(stack.currentFrame->locals.instructionPtrAtStartOfOnce + 2 + LINK_SIZE);
636 stack.currentFrame->locals.offset = stack.currentFrame->locals.number << 1;
637
638#ifdef DEBUG
639 printf("end bracket %d", stack.currentFrame->locals.number);
640 printf("\n");
641#endif
642
643 /* Test for a numbered group. This includes groups called as a result
644 of recursion. Note that whole-pattern recursion is coded as a recurse
645 into group 0, so it won't be picked up here. Instead, we catch it when
646 the OP_END is reached. */
647
648 if (stack.currentFrame->locals.number > 0) {
649 if (stack.currentFrame->locals.offset >= md.offsetMax)
650 md.offsetOverflow = true;
651 else {
652 md.offsetVector[stack.currentFrame->locals.offset] =
653 md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number];
654 md.offsetVector[stack.currentFrame->locals.offset+1] = stack.currentFrame->args.subjectPtr - md.startSubject;
655 if (stack.currentFrame->args.offsetTop <= stack.currentFrame->locals.offset)
656 stack.currentFrame->args.offsetTop = stack.currentFrame->locals.offset + 2;
657 }
658 }
659
660 /* For a non-repeating ket, just continue at this level. This also
661 happens for a repeating ket if no characters were matched in the group.
662 This is the forcible breaking of infinite loops as implemented in Perl
663 5.005. If there is an options reset, it will get obeyed in the normal
664 course of events. */
665
666 if (*stack.currentFrame->args.instructionPtr == OP_KET || stack.currentFrame->args.subjectPtr == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) {
667 stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE;
668 NEXT_OPCODE;
669 }
670
671 /* The repeating kets try the rest of the pattern or restart from the
672 preceding bracket, in the appropriate order. */
673
674 if (*stack.currentFrame->args.instructionPtr == OP_KETRMIN) {
675 RECURSIVE_MATCH(16, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain);
676 if (isMatch)
677 RRETURN;
9dae56ea 678 RECURSIVE_MATCH_NEW_GROUP(17, stack.currentFrame->locals.instructionPtrAtStartOfOnce, stack.currentFrame->args.bracketChain);
b37bf2e1
A
679 if (isMatch)
680 RRETURN;
681 } else { /* OP_KETRMAX */
9dae56ea 682 RECURSIVE_MATCH_NEW_GROUP(18, stack.currentFrame->locals.instructionPtrAtStartOfOnce, stack.currentFrame->args.bracketChain);
b37bf2e1
A
683 if (isMatch)
684 RRETURN;
685 RECURSIVE_MATCH(19, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain);
686 if (isMatch)
687 RRETURN;
688 }
689 RRETURN;
690
9dae56ea
A
691 /* Start of subject. */
692
b37bf2e1 693 BEGIN_OPCODE(CIRC):
9dae56ea 694 if (stack.currentFrame->args.subjectPtr != md.startSubject)
b37bf2e1
A
695 RRETURN_NO_MATCH;
696 stack.currentFrame->args.instructionPtr++;
697 NEXT_OPCODE;
9dae56ea
A
698
699 /* After internal newline if multiline. */
700
701 BEGIN_OPCODE(BOL):
702 if (stack.currentFrame->args.subjectPtr != md.startSubject && !isNewline(stack.currentFrame->args.subjectPtr[-1]))
703 RRETURN_NO_MATCH;
704 stack.currentFrame->args.instructionPtr++;
705 NEXT_OPCODE;
706
707 /* End of subject. */
708
b37bf2e1 709 BEGIN_OPCODE(DOLL):
9dae56ea
A
710 if (stack.currentFrame->args.subjectPtr < md.endSubject)
711 RRETURN_NO_MATCH;
712 stack.currentFrame->args.instructionPtr++;
713 NEXT_OPCODE;
714
715 /* Before internal newline if multiline. */
716
717 BEGIN_OPCODE(EOL):
718 if (stack.currentFrame->args.subjectPtr < md.endSubject && !isNewline(*stack.currentFrame->args.subjectPtr))
b37bf2e1
A
719 RRETURN_NO_MATCH;
720 stack.currentFrame->args.instructionPtr++;
721 NEXT_OPCODE;
722
723 /* Word boundary assertions */
724
725 BEGIN_OPCODE(NOT_WORD_BOUNDARY):
726 BEGIN_OPCODE(WORD_BOUNDARY): {
727 bool currentCharIsWordChar = false;
728 bool previousCharIsWordChar = false;
729
730 if (stack.currentFrame->args.subjectPtr > md.startSubject)
731 previousCharIsWordChar = isWordChar(stack.currentFrame->args.subjectPtr[-1]);
732 if (stack.currentFrame->args.subjectPtr < md.endSubject)
733 currentCharIsWordChar = isWordChar(*stack.currentFrame->args.subjectPtr);
734
735 /* Now see if the situation is what we want */
736 bool wordBoundaryDesired = (*stack.currentFrame->args.instructionPtr++ == OP_WORD_BOUNDARY);
737 if (wordBoundaryDesired ? currentCharIsWordChar == previousCharIsWordChar : currentCharIsWordChar != previousCharIsWordChar)
738 RRETURN_NO_MATCH;
739 NEXT_OPCODE;
740 }
741
742 /* Match a single character type; inline for speed */
743
744 BEGIN_OPCODE(NOT_NEWLINE):
745 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
746 RRETURN_NO_MATCH;
747 if (isNewline(*stack.currentFrame->args.subjectPtr++))
748 RRETURN_NO_MATCH;
749 stack.currentFrame->args.instructionPtr++;
750 NEXT_OPCODE;
751
752 BEGIN_OPCODE(NOT_DIGIT):
753 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
754 RRETURN_NO_MATCH;
755 if (isASCIIDigit(*stack.currentFrame->args.subjectPtr++))
756 RRETURN_NO_MATCH;
757 stack.currentFrame->args.instructionPtr++;
758 NEXT_OPCODE;
759
760 BEGIN_OPCODE(DIGIT):
761 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
762 RRETURN_NO_MATCH;
763 if (!isASCIIDigit(*stack.currentFrame->args.subjectPtr++))
764 RRETURN_NO_MATCH;
765 stack.currentFrame->args.instructionPtr++;
766 NEXT_OPCODE;
767
768 BEGIN_OPCODE(NOT_WHITESPACE):
769 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
770 RRETURN_NO_MATCH;
771 if (isSpaceChar(*stack.currentFrame->args.subjectPtr++))
772 RRETURN_NO_MATCH;
773 stack.currentFrame->args.instructionPtr++;
774 NEXT_OPCODE;
775
776 BEGIN_OPCODE(WHITESPACE):
777 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
778 RRETURN_NO_MATCH;
779 if (!isSpaceChar(*stack.currentFrame->args.subjectPtr++))
780 RRETURN_NO_MATCH;
781 stack.currentFrame->args.instructionPtr++;
782 NEXT_OPCODE;
783
784 BEGIN_OPCODE(NOT_WORDCHAR):
785 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
786 RRETURN_NO_MATCH;
787 if (isWordChar(*stack.currentFrame->args.subjectPtr++))
788 RRETURN_NO_MATCH;
789 stack.currentFrame->args.instructionPtr++;
790 NEXT_OPCODE;
791
792 BEGIN_OPCODE(WORDCHAR):
793 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
794 RRETURN_NO_MATCH;
795 if (!isWordChar(*stack.currentFrame->args.subjectPtr++))
796 RRETURN_NO_MATCH;
797 stack.currentFrame->args.instructionPtr++;
798 NEXT_OPCODE;
799
800 /* Match a back reference, possibly repeatedly. Look past the end of the
801 item to see if there is repeat information following. The code is similar
802 to that for character classes, but repeated for efficiency. Then obey
803 similar code to character type repeats - written out again for speed.
804 However, if the referenced string is the empty string, always treat
805 it as matched, any number of times (otherwise there could be infinite
806 loops). */
807
808 BEGIN_OPCODE(REF):
809 stack.currentFrame->locals.offset = get2ByteValue(stack.currentFrame->args.instructionPtr + 1) << 1; /* Doubled ref number */
810 stack.currentFrame->args.instructionPtr += 3; /* Advance past item */
811
812 /* If the reference is unset, set the length to be longer than the amount
813 of subject left; this ensures that every attempt at a match fails. We
814 can't just fail here, because of the possibility of quantifiers with zero
815 minima. */
816
817 if (stack.currentFrame->locals.offset >= stack.currentFrame->args.offsetTop || md.offsetVector[stack.currentFrame->locals.offset] < 0)
818 stack.currentFrame->locals.length = 0;
819 else
820 stack.currentFrame->locals.length = md.offsetVector[stack.currentFrame->locals.offset+1] - md.offsetVector[stack.currentFrame->locals.offset];
821
822 /* Set up for repetition, or handle the non-repeated case */
823
824 switch (*stack.currentFrame->args.instructionPtr) {
825 case OP_CRSTAR:
826 case OP_CRMINSTAR:
827 case OP_CRPLUS:
828 case OP_CRMINPLUS:
829 case OP_CRQUERY:
830 case OP_CRMINQUERY:
831 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max);
832 break;
833
834 case OP_CRRANGE:
835 case OP_CRMINRANGE:
836 minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE);
837 min = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
838 stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 3);
839 if (stack.currentFrame->locals.max == 0)
840 stack.currentFrame->locals.max = INT_MAX;
841 stack.currentFrame->args.instructionPtr += 5;
842 break;
843
844 default: /* No repeat follows */
845 if (!matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md))
846 RRETURN_NO_MATCH;
847 stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length;
848 NEXT_OPCODE;
849 }
850
851 /* If the length of the reference is zero, just continue with the
852 main loop. */
853
854 if (stack.currentFrame->locals.length == 0)
855 NEXT_OPCODE;
856
857 /* First, ensure the minimum number of matches are present. */
858
859 for (int i = 1; i <= min; i++) {
860 if (!matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md))
861 RRETURN_NO_MATCH;
862 stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length;
863 }
864
865 /* If min = max, continue at the same level without recursion.
866 They are not both allowed to be zero. */
867
868 if (min == stack.currentFrame->locals.max)
869 NEXT_OPCODE;
870
871 /* If minimizing, keep trying and advancing the pointer */
872
873 if (minimize) {
874 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
875 RECURSIVE_MATCH(20, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
876 if (isMatch)
877 RRETURN;
878 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || !matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md))
879 RRETURN;
880 stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length;
881 }
882 /* Control never reaches here */
883 }
884
885 /* If maximizing, find the longest string and work backwards */
886
887 else {
888 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
889 for (int i = min; i < stack.currentFrame->locals.max; i++) {
890 if (!matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md))
891 break;
892 stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length;
893 }
894 while (stack.currentFrame->args.subjectPtr >= stack.currentFrame->locals.subjectPtrAtStartOfInstruction) {
895 RECURSIVE_MATCH(21, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
896 if (isMatch)
897 RRETURN;
898 stack.currentFrame->args.subjectPtr -= stack.currentFrame->locals.length;
899 }
900 RRETURN_NO_MATCH;
901 }
902 /* Control never reaches here */
903
904 /* Match a bit-mapped character class, possibly repeatedly. This op code is
905 used when all the characters in the class have values in the range 0-255,
906 and either the matching is caseful, or the characters are in the range
907 0-127 when UTF-8 processing is enabled. The only difference between
908 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
909 encountered.
910
911 First, look past the end of the item to see if there is repeat information
912 following. Then obey similar code to character type repeats - written out
913 again for speed. */
914
915 BEGIN_OPCODE(NCLASS):
916 BEGIN_OPCODE(CLASS):
917 stack.currentFrame->locals.data = stack.currentFrame->args.instructionPtr + 1; /* Save for matching */
918 stack.currentFrame->args.instructionPtr += 33; /* Advance past the item */
919
920 switch (*stack.currentFrame->args.instructionPtr) {
921 case OP_CRSTAR:
922 case OP_CRMINSTAR:
923 case OP_CRPLUS:
924 case OP_CRMINPLUS:
925 case OP_CRQUERY:
926 case OP_CRMINQUERY:
927 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max);
928 break;
929
930 case OP_CRRANGE:
931 case OP_CRMINRANGE:
932 minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE);
933 min = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
934 stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 3);
935 if (stack.currentFrame->locals.max == 0)
936 stack.currentFrame->locals.max = INT_MAX;
937 stack.currentFrame->args.instructionPtr += 5;
938 break;
939
940 default: /* No repeat follows */
941 min = stack.currentFrame->locals.max = 1;
942 break;
943 }
944
945 /* First, ensure the minimum number of matches are present. */
946
947 for (int i = 1; i <= min; i++) {
948 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
949 RRETURN_NO_MATCH;
950 int c = *stack.currentFrame->args.subjectPtr++;
951 if (c > 255) {
952 if (stack.currentFrame->locals.data[-1] == OP_CLASS)
953 RRETURN_NO_MATCH;
954 } else {
955 if (!(stack.currentFrame->locals.data[c / 8] & (1 << (c & 7))))
956 RRETURN_NO_MATCH;
957 }
958 }
959
960 /* If max == min we can continue with the main loop without the
961 need to recurse. */
962
963 if (min == stack.currentFrame->locals.max)
964 NEXT_OPCODE;
965
966 /* If minimizing, keep testing the rest of the expression and advancing
967 the pointer while it matches the class. */
968 if (minimize) {
969 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
970 RECURSIVE_MATCH(22, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
971 if (isMatch)
972 RRETURN;
973 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject)
974 RRETURN;
975 int c = *stack.currentFrame->args.subjectPtr++;
976 if (c > 255) {
977 if (stack.currentFrame->locals.data[-1] == OP_CLASS)
978 RRETURN;
979 } else {
980 if ((stack.currentFrame->locals.data[c/8] & (1 << (c&7))) == 0)
981 RRETURN;
982 }
983 }
984 /* Control never reaches here */
985 }
986 /* If maximizing, find the longest possible run, then work backwards. */
987 else {
988 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
989
990 for (int i = min; i < stack.currentFrame->locals.max; i++) {
991 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
992 break;
993 int c = *stack.currentFrame->args.subjectPtr;
994 if (c > 255) {
995 if (stack.currentFrame->locals.data[-1] == OP_CLASS)
996 break;
997 } else {
998 if (!(stack.currentFrame->locals.data[c / 8] & (1 << (c & 7))))
999 break;
1000 }
1001 ++stack.currentFrame->args.subjectPtr;
1002 }
1003 for (;;) {
1004 RECURSIVE_MATCH(24, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1005 if (isMatch)
1006 RRETURN;
1007 if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction)
1008 break; /* Stop if tried at original pos */
1009 }
1010
1011 RRETURN;
1012 }
1013 /* Control never reaches here */
1014
1015 /* Match an extended character class. */
1016
1017 BEGIN_OPCODE(XCLASS):
1018 stack.currentFrame->locals.data = stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE; /* Save for matching */
1019 stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1); /* Advance past the item */
1020
1021 switch (*stack.currentFrame->args.instructionPtr) {
1022 case OP_CRSTAR:
1023 case OP_CRMINSTAR:
1024 case OP_CRPLUS:
1025 case OP_CRMINPLUS:
1026 case OP_CRQUERY:
1027 case OP_CRMINQUERY:
1028 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max);
1029 break;
1030
1031 case OP_CRRANGE:
1032 case OP_CRMINRANGE:
1033 minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE);
1034 min = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
1035 stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 3);
1036 if (stack.currentFrame->locals.max == 0)
1037 stack.currentFrame->locals.max = INT_MAX;
1038 stack.currentFrame->args.instructionPtr += 5;
1039 break;
1040
1041 default: /* No repeat follows */
1042 min = stack.currentFrame->locals.max = 1;
1043 }
1044
1045 /* First, ensure the minimum number of matches are present. */
1046
1047 for (int i = 1; i <= min; i++) {
1048 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1049 RRETURN_NO_MATCH;
1050 int c = *stack.currentFrame->args.subjectPtr++;
9dae56ea 1051 if (!jsc_pcre_xclass(c, stack.currentFrame->locals.data))
b37bf2e1
A
1052 RRETURN_NO_MATCH;
1053 }
1054
1055 /* If max == min we can continue with the main loop without the
1056 need to recurse. */
1057
1058 if (min == stack.currentFrame->locals.max)
1059 NEXT_OPCODE;
1060
1061 /* If minimizing, keep testing the rest of the expression and advancing
1062 the pointer while it matches the class. */
1063
1064 if (minimize) {
1065 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
1066 RECURSIVE_MATCH(26, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1067 if (isMatch)
1068 RRETURN;
1069 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject)
1070 RRETURN;
1071 int c = *stack.currentFrame->args.subjectPtr++;
9dae56ea 1072 if (!jsc_pcre_xclass(c, stack.currentFrame->locals.data))
b37bf2e1
A
1073 RRETURN;
1074 }
1075 /* Control never reaches here */
1076 }
1077
1078 /* If maximizing, find the longest possible run, then work backwards. */
1079
1080 else {
1081 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
1082 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1083 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1084 break;
1085 int c = *stack.currentFrame->args.subjectPtr;
9dae56ea 1086 if (!jsc_pcre_xclass(c, stack.currentFrame->locals.data))
b37bf2e1
A
1087 break;
1088 ++stack.currentFrame->args.subjectPtr;
1089 }
1090 for(;;) {
1091 RECURSIVE_MATCH(27, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1092 if (isMatch)
1093 RRETURN;
1094 if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction)
1095 break; /* Stop if tried at original pos */
1096 }
1097 RRETURN;
1098 }
1099
1100 /* Control never reaches here */
1101
1102 /* Match a single character, casefully */
1103
1104 BEGIN_OPCODE(CHAR):
1105 stack.currentFrame->locals.length = 1;
1106 stack.currentFrame->args.instructionPtr++;
1107 getUTF8CharAndIncrementLength(stack.currentFrame->locals.fc, stack.currentFrame->args.instructionPtr, stack.currentFrame->locals.length);
1108 stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.length;
1109 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1110 RRETURN_NO_MATCH;
1111 if (stack.currentFrame->locals.fc != *stack.currentFrame->args.subjectPtr++)
1112 RRETURN_NO_MATCH;
1113 NEXT_OPCODE;
1114
1115 /* Match a single character, caselessly */
1116
1117 BEGIN_OPCODE(CHAR_IGNORING_CASE): {
1118 stack.currentFrame->locals.length = 1;
1119 stack.currentFrame->args.instructionPtr++;
1120 getUTF8CharAndIncrementLength(stack.currentFrame->locals.fc, stack.currentFrame->args.instructionPtr, stack.currentFrame->locals.length);
1121 stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.length;
1122 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1123 RRETURN_NO_MATCH;
1124 int dc = *stack.currentFrame->args.subjectPtr++;
9dae56ea 1125 if (stack.currentFrame->locals.fc != dc && jsc_pcre_ucp_othercase(stack.currentFrame->locals.fc) != dc)
b37bf2e1
A
1126 RRETURN_NO_MATCH;
1127 NEXT_OPCODE;
1128 }
1129
1130 /* Match a single ASCII character. */
1131
1132 BEGIN_OPCODE(ASCII_CHAR):
1133 if (md.endSubject == stack.currentFrame->args.subjectPtr)
1134 RRETURN_NO_MATCH;
1135 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->args.instructionPtr[1])
1136 RRETURN_NO_MATCH;
1137 ++stack.currentFrame->args.subjectPtr;
1138 stack.currentFrame->args.instructionPtr += 2;
1139 NEXT_OPCODE;
1140
1141 /* Match one of two cases of an ASCII letter. */
1142
1143 BEGIN_OPCODE(ASCII_LETTER_IGNORING_CASE):
1144 if (md.endSubject == stack.currentFrame->args.subjectPtr)
1145 RRETURN_NO_MATCH;
1146 if ((*stack.currentFrame->args.subjectPtr | 0x20) != stack.currentFrame->args.instructionPtr[1])
1147 RRETURN_NO_MATCH;
1148 ++stack.currentFrame->args.subjectPtr;
1149 stack.currentFrame->args.instructionPtr += 2;
1150 NEXT_OPCODE;
1151
1152 /* Match a single character repeatedly; different opcodes share code. */
1153
1154 BEGIN_OPCODE(EXACT):
1155 min = stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
1156 minimize = false;
1157 stack.currentFrame->args.instructionPtr += 3;
1158 goto REPEATCHAR;
1159
1160 BEGIN_OPCODE(UPTO):
1161 BEGIN_OPCODE(MINUPTO):
1162 min = 0;
1163 stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
1164 minimize = *stack.currentFrame->args.instructionPtr == OP_MINUPTO;
1165 stack.currentFrame->args.instructionPtr += 3;
1166 goto REPEATCHAR;
1167
1168 BEGIN_OPCODE(STAR):
1169 BEGIN_OPCODE(MINSTAR):
1170 BEGIN_OPCODE(PLUS):
1171 BEGIN_OPCODE(MINPLUS):
1172 BEGIN_OPCODE(QUERY):
1173 BEGIN_OPCODE(MINQUERY):
1174 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_STAR, minimize, min, stack.currentFrame->locals.max);
1175
1176 /* Common code for all repeated single-character matches. We can give
1177 up quickly if there are fewer than the minimum number of characters left in
1178 the subject. */
1179
1180 REPEATCHAR:
1181
1182 stack.currentFrame->locals.length = 1;
1183 getUTF8CharAndIncrementLength(stack.currentFrame->locals.fc, stack.currentFrame->args.instructionPtr, stack.currentFrame->locals.length);
1184 if (min * (stack.currentFrame->locals.fc > 0xFFFF ? 2 : 1) > md.endSubject - stack.currentFrame->args.subjectPtr)
1185 RRETURN_NO_MATCH;
1186 stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.length;
1187
1188 if (stack.currentFrame->locals.fc <= 0xFFFF) {
9dae56ea 1189 int othercase = md.ignoreCase ? jsc_pcre_ucp_othercase(stack.currentFrame->locals.fc) : -1;
b37bf2e1
A
1190
1191 for (int i = 1; i <= min; i++) {
1192 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc && *stack.currentFrame->args.subjectPtr != othercase)
1193 RRETURN_NO_MATCH;
1194 ++stack.currentFrame->args.subjectPtr;
1195 }
1196
1197 if (min == stack.currentFrame->locals.max)
1198 NEXT_OPCODE;
1199
1200 if (minimize) {
1201 stack.currentFrame->locals.repeatOthercase = othercase;
1202 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
1203 RECURSIVE_MATCH(28, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1204 if (isMatch)
1205 RRETURN;
1206 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject)
1207 RRETURN;
1208 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc && *stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.repeatOthercase)
1209 RRETURN;
1210 ++stack.currentFrame->args.subjectPtr;
1211 }
1212 /* Control never reaches here */
1213 } else {
1214 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
1215 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1216 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1217 break;
1218 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc && *stack.currentFrame->args.subjectPtr != othercase)
1219 break;
1220 ++stack.currentFrame->args.subjectPtr;
1221 }
1222 while (stack.currentFrame->args.subjectPtr >= stack.currentFrame->locals.subjectPtrAtStartOfInstruction) {
1223 RECURSIVE_MATCH(29, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1224 if (isMatch)
1225 RRETURN;
1226 --stack.currentFrame->args.subjectPtr;
1227 }
1228 RRETURN_NO_MATCH;
1229 }
1230 /* Control never reaches here */
1231 } else {
1232 /* No case on surrogate pairs, so no need to bother with "othercase". */
1233
1234 for (int i = 1; i <= min; i++) {
1235 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc)
1236 RRETURN_NO_MATCH;
1237 stack.currentFrame->args.subjectPtr += 2;
1238 }
1239
1240 if (min == stack.currentFrame->locals.max)
1241 NEXT_OPCODE;
1242
1243 if (minimize) {
1244 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
1245 RECURSIVE_MATCH(30, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1246 if (isMatch)
1247 RRETURN;
1248 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject)
1249 RRETURN;
1250 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc)
1251 RRETURN;
1252 stack.currentFrame->args.subjectPtr += 2;
1253 }
1254 /* Control never reaches here */
1255 } else {
1256 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
1257 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1258 if (stack.currentFrame->args.subjectPtr > md.endSubject - 2)
1259 break;
1260 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc)
1261 break;
1262 stack.currentFrame->args.subjectPtr += 2;
1263 }
1264 while (stack.currentFrame->args.subjectPtr >= stack.currentFrame->locals.subjectPtrAtStartOfInstruction) {
1265 RECURSIVE_MATCH(31, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1266 if (isMatch)
1267 RRETURN;
1268 stack.currentFrame->args.subjectPtr -= 2;
1269 }
1270 RRETURN_NO_MATCH;
1271 }
1272 /* Control never reaches here */
1273 }
1274 /* Control never reaches here */
1275
1276 /* Match a negated single one-byte character. */
1277
1278 BEGIN_OPCODE(NOT): {
1279 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1280 RRETURN_NO_MATCH;
9dae56ea 1281 int b = stack.currentFrame->args.instructionPtr[1];
b37bf2e1 1282 int c = *stack.currentFrame->args.subjectPtr++;
9dae56ea 1283 stack.currentFrame->args.instructionPtr += 2;
b37bf2e1
A
1284 if (md.ignoreCase) {
1285 if (c < 128)
1286 c = toLowerCase(c);
9dae56ea 1287 if (toLowerCase(b) == c)
b37bf2e1
A
1288 RRETURN_NO_MATCH;
1289 } else {
9dae56ea 1290 if (b == c)
b37bf2e1
A
1291 RRETURN_NO_MATCH;
1292 }
1293 NEXT_OPCODE;
1294 }
1295
1296 /* Match a negated single one-byte character repeatedly. This is almost a
1297 repeat of the code for a repeated single character, but I haven't found a
1298 nice way of commoning these up that doesn't require a test of the
1299 positive/negative option for each character match. Maybe that wouldn't add
1300 very much to the time taken, but character matching *is* what this is all
1301 about... */
1302
1303 BEGIN_OPCODE(NOTEXACT):
1304 min = stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
1305 minimize = false;
1306 stack.currentFrame->args.instructionPtr += 3;
1307 goto REPEATNOTCHAR;
1308
1309 BEGIN_OPCODE(NOTUPTO):
1310 BEGIN_OPCODE(NOTMINUPTO):
1311 min = 0;
1312 stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
1313 minimize = *stack.currentFrame->args.instructionPtr == OP_NOTMINUPTO;
1314 stack.currentFrame->args.instructionPtr += 3;
1315 goto REPEATNOTCHAR;
1316
1317 BEGIN_OPCODE(NOTSTAR):
1318 BEGIN_OPCODE(NOTMINSTAR):
1319 BEGIN_OPCODE(NOTPLUS):
1320 BEGIN_OPCODE(NOTMINPLUS):
1321 BEGIN_OPCODE(NOTQUERY):
1322 BEGIN_OPCODE(NOTMINQUERY):
1323 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_NOTSTAR, minimize, min, stack.currentFrame->locals.max);
1324
1325 /* Common code for all repeated single-byte matches. We can give up quickly
1326 if there are fewer than the minimum number of bytes left in the
1327 subject. */
1328
1329 REPEATNOTCHAR:
1330 if (min > md.endSubject - stack.currentFrame->args.subjectPtr)
1331 RRETURN_NO_MATCH;
1332 stack.currentFrame->locals.fc = *stack.currentFrame->args.instructionPtr++;
1333
1334 /* The code is duplicated for the caseless and caseful cases, for speed,
1335 since matching characters is likely to be quite common. First, ensure the
1336 minimum number of matches are present. If min = max, continue at the same
1337 level without recursing. Otherwise, if minimizing, keep trying the rest of
1338 the expression and advancing one matching character if failing, up to the
1339 maximum. Alternatively, if maximizing, find the maximum number of
1340 characters and work backwards. */
1341
1342 DPRINTF(("negative matching %c{%d,%d}\n", stack.currentFrame->locals.fc, min, stack.currentFrame->locals.max));
1343
1344 if (md.ignoreCase) {
1345 if (stack.currentFrame->locals.fc < 128)
1346 stack.currentFrame->locals.fc = toLowerCase(stack.currentFrame->locals.fc);
1347
1348 for (int i = 1; i <= min; i++) {
1349 int d = *stack.currentFrame->args.subjectPtr++;
1350 if (d < 128)
1351 d = toLowerCase(d);
1352 if (stack.currentFrame->locals.fc == d)
1353 RRETURN_NO_MATCH;
1354 }
1355
1356 if (min == stack.currentFrame->locals.max)
1357 NEXT_OPCODE;
1358
1359 if (minimize) {
1360 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
1361 RECURSIVE_MATCH(38, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1362 if (isMatch)
1363 RRETURN;
1364 int d = *stack.currentFrame->args.subjectPtr++;
1365 if (d < 128)
1366 d = toLowerCase(d);
1367 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject || stack.currentFrame->locals.fc == d)
1368 RRETURN;
1369 }
1370 /* Control never reaches here */
1371 }
1372
1373 /* Maximize case */
1374
1375 else {
1376 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
1377
1378 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1379 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1380 break;
1381 int d = *stack.currentFrame->args.subjectPtr;
1382 if (d < 128)
1383 d = toLowerCase(d);
1384 if (stack.currentFrame->locals.fc == d)
1385 break;
1386 ++stack.currentFrame->args.subjectPtr;
1387 }
1388 for (;;) {
1389 RECURSIVE_MATCH(40, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1390 if (isMatch)
1391 RRETURN;
1392 if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction)
1393 break; /* Stop if tried at original pos */
1394 }
1395
1396 RRETURN;
1397 }
1398 /* Control never reaches here */
1399 }
1400
1401 /* Caseful comparisons */
1402
1403 else {
1404 for (int i = 1; i <= min; i++) {
1405 int d = *stack.currentFrame->args.subjectPtr++;
1406 if (stack.currentFrame->locals.fc == d)
1407 RRETURN_NO_MATCH;
1408 }
1409
1410 if (min == stack.currentFrame->locals.max)
1411 NEXT_OPCODE;
1412
1413 if (minimize) {
1414 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
1415 RECURSIVE_MATCH(42, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1416 if (isMatch)
1417 RRETURN;
1418 int d = *stack.currentFrame->args.subjectPtr++;
1419 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject || stack.currentFrame->locals.fc == d)
1420 RRETURN;
1421 }
1422 /* Control never reaches here */
1423 }
1424
1425 /* Maximize case */
1426
1427 else {
1428 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
1429
1430 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1431 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1432 break;
1433 int d = *stack.currentFrame->args.subjectPtr;
1434 if (stack.currentFrame->locals.fc == d)
1435 break;
1436 ++stack.currentFrame->args.subjectPtr;
1437 }
1438 for (;;) {
1439 RECURSIVE_MATCH(44, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1440 if (isMatch)
1441 RRETURN;
1442 if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction)
1443 break; /* Stop if tried at original pos */
1444 }
1445
1446 RRETURN;
1447 }
1448 }
1449 /* Control never reaches here */
1450
1451 /* Match a single character type repeatedly; several different opcodes
1452 share code. This is very similar to the code for single characters, but we
1453 repeat it in the interests of efficiency. */
1454
1455 BEGIN_OPCODE(TYPEEXACT):
1456 min = stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
1457 minimize = true;
1458 stack.currentFrame->args.instructionPtr += 3;
1459 goto REPEATTYPE;
1460
1461 BEGIN_OPCODE(TYPEUPTO):
1462 BEGIN_OPCODE(TYPEMINUPTO):
1463 min = 0;
1464 stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
1465 minimize = *stack.currentFrame->args.instructionPtr == OP_TYPEMINUPTO;
1466 stack.currentFrame->args.instructionPtr += 3;
1467 goto REPEATTYPE;
1468
1469 BEGIN_OPCODE(TYPESTAR):
1470 BEGIN_OPCODE(TYPEMINSTAR):
1471 BEGIN_OPCODE(TYPEPLUS):
1472 BEGIN_OPCODE(TYPEMINPLUS):
1473 BEGIN_OPCODE(TYPEQUERY):
1474 BEGIN_OPCODE(TYPEMINQUERY):
1475 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_TYPESTAR, minimize, min, stack.currentFrame->locals.max);
1476
1477 /* Common code for all repeated single character type matches. Note that
1478 in UTF-8 mode, '.' matches a character of any length, but for the other
1479 character types, the valid characters are all one-byte long. */
1480
1481 REPEATTYPE:
1482 stack.currentFrame->locals.ctype = *stack.currentFrame->args.instructionPtr++; /* Code for the character type */
1483
1484 /* First, ensure the minimum number of matches are present. Use inline
1485 code for maximizing the speed, and do the type test once at the start
1486 (i.e. keep it out of the loop). Also we can test that there are at least
1487 the minimum number of characters before we start. */
1488
1489 if (min > md.endSubject - stack.currentFrame->args.subjectPtr)
1490 RRETURN_NO_MATCH;
1491 if (min > 0) {
1492 switch (stack.currentFrame->locals.ctype) {
1493 case OP_NOT_NEWLINE:
1494 for (int i = 1; i <= min; i++) {
1495 if (isNewline(*stack.currentFrame->args.subjectPtr))
1496 RRETURN_NO_MATCH;
1497 ++stack.currentFrame->args.subjectPtr;
1498 }
1499 break;
1500
1501 case OP_NOT_DIGIT:
1502 for (int i = 1; i <= min; i++) {
1503 if (isASCIIDigit(*stack.currentFrame->args.subjectPtr))
1504 RRETURN_NO_MATCH;
1505 ++stack.currentFrame->args.subjectPtr;
1506 }
1507 break;
1508
1509 case OP_DIGIT:
1510 for (int i = 1; i <= min; i++) {
1511 if (!isASCIIDigit(*stack.currentFrame->args.subjectPtr))
1512 RRETURN_NO_MATCH;
1513 ++stack.currentFrame->args.subjectPtr;
1514 }
1515 break;
1516
1517 case OP_NOT_WHITESPACE:
1518 for (int i = 1; i <= min; i++) {
1519 if (isSpaceChar(*stack.currentFrame->args.subjectPtr))
1520 RRETURN_NO_MATCH;
1521 ++stack.currentFrame->args.subjectPtr;
1522 }
1523 break;
1524
1525 case OP_WHITESPACE:
1526 for (int i = 1; i <= min; i++) {
1527 if (!isSpaceChar(*stack.currentFrame->args.subjectPtr))
1528 RRETURN_NO_MATCH;
1529 ++stack.currentFrame->args.subjectPtr;
1530 }
1531 break;
1532
1533 case OP_NOT_WORDCHAR:
1534 for (int i = 1; i <= min; i++) {
1535 if (isWordChar(*stack.currentFrame->args.subjectPtr))
1536 RRETURN_NO_MATCH;
1537 ++stack.currentFrame->args.subjectPtr;
1538 }
1539 break;
1540
1541 case OP_WORDCHAR:
1542 for (int i = 1; i <= min; i++) {
1543 if (!isWordChar(*stack.currentFrame->args.subjectPtr))
1544 RRETURN_NO_MATCH;
1545 ++stack.currentFrame->args.subjectPtr;
1546 }
1547 break;
1548
1549 default:
1550 ASSERT_NOT_REACHED();
1551 return matchError(JSRegExpErrorInternal, stack);
1552 } /* End switch(stack.currentFrame->locals.ctype) */
1553 }
1554
1555 /* If min = max, continue at the same level without recursing */
1556
1557 if (min == stack.currentFrame->locals.max)
1558 NEXT_OPCODE;
1559
1560 /* If minimizing, we have to test the rest of the pattern before each
1561 subsequent match. */
1562
1563 if (minimize) {
1564 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
1565 RECURSIVE_MATCH(48, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1566 if (isMatch)
1567 RRETURN;
1568 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject)
1569 RRETURN;
1570
1571 int c = *stack.currentFrame->args.subjectPtr++;
1572 switch (stack.currentFrame->locals.ctype) {
1573 case OP_NOT_NEWLINE:
1574 if (isNewline(c))
1575 RRETURN;
1576 break;
1577
1578 case OP_NOT_DIGIT:
1579 if (isASCIIDigit(c))
1580 RRETURN;
1581 break;
1582
1583 case OP_DIGIT:
1584 if (!isASCIIDigit(c))
1585 RRETURN;
1586 break;
1587
1588 case OP_NOT_WHITESPACE:
1589 if (isSpaceChar(c))
1590 RRETURN;
1591 break;
1592
1593 case OP_WHITESPACE:
1594 if (!isSpaceChar(c))
1595 RRETURN;
1596 break;
1597
1598 case OP_NOT_WORDCHAR:
1599 if (isWordChar(c))
1600 RRETURN;
1601 break;
1602
1603 case OP_WORDCHAR:
1604 if (!isWordChar(c))
1605 RRETURN;
1606 break;
1607
1608 default:
1609 ASSERT_NOT_REACHED();
1610 return matchError(JSRegExpErrorInternal, stack);
1611 }
1612 }
1613 /* Control never reaches here */
1614 }
1615
1616 /* If maximizing it is worth using inline code for speed, doing the type
1617 test once at the start (i.e. keep it out of the loop). */
1618
1619 else {
1620 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; /* Remember where we started */
1621
1622 switch (stack.currentFrame->locals.ctype) {
1623 case OP_NOT_NEWLINE:
1624 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1625 if (stack.currentFrame->args.subjectPtr >= md.endSubject || isNewline(*stack.currentFrame->args.subjectPtr))
1626 break;
1627 stack.currentFrame->args.subjectPtr++;
1628 }
1629 break;
1630
1631 case OP_NOT_DIGIT:
1632 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1633 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1634 break;
1635 int c = *stack.currentFrame->args.subjectPtr;
1636 if (isASCIIDigit(c))
1637 break;
1638 ++stack.currentFrame->args.subjectPtr;
1639 }
1640 break;
1641
1642 case OP_DIGIT:
1643 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1644 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1645 break;
1646 int c = *stack.currentFrame->args.subjectPtr;
1647 if (!isASCIIDigit(c))
1648 break;
1649 ++stack.currentFrame->args.subjectPtr;
1650 }
1651 break;
1652
1653 case OP_NOT_WHITESPACE:
1654 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1655 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1656 break;
1657 int c = *stack.currentFrame->args.subjectPtr;
1658 if (isSpaceChar(c))
1659 break;
1660 ++stack.currentFrame->args.subjectPtr;
1661 }
1662 break;
1663
1664 case OP_WHITESPACE:
1665 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1666 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1667 break;
1668 int c = *stack.currentFrame->args.subjectPtr;
1669 if (!isSpaceChar(c))
1670 break;
1671 ++stack.currentFrame->args.subjectPtr;
1672 }
1673 break;
1674
1675 case OP_NOT_WORDCHAR:
1676 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1677 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1678 break;
1679 int c = *stack.currentFrame->args.subjectPtr;
1680 if (isWordChar(c))
1681 break;
1682 ++stack.currentFrame->args.subjectPtr;
1683 }
1684 break;
1685
1686 case OP_WORDCHAR:
1687 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1688 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1689 break;
1690 int c = *stack.currentFrame->args.subjectPtr;
1691 if (!isWordChar(c))
1692 break;
1693 ++stack.currentFrame->args.subjectPtr;
1694 }
1695 break;
1696
1697 default:
1698 ASSERT_NOT_REACHED();
1699 return matchError(JSRegExpErrorInternal, stack);
1700 }
1701
1702 /* stack.currentFrame->args.subjectPtr is now past the end of the maximum run */
1703
1704 for (;;) {
1705 RECURSIVE_MATCH(52, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1706 if (isMatch)
1707 RRETURN;
1708 if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction)
1709 break; /* Stop if tried at original pos */
1710 }
1711
1712 /* Get here if we can't make it match with any permitted repetitions */
1713
1714 RRETURN;
1715 }
1716 /* Control never reaches here */
1717
1718 BEGIN_OPCODE(CRMINPLUS):
1719 BEGIN_OPCODE(CRMINQUERY):
1720 BEGIN_OPCODE(CRMINRANGE):
1721 BEGIN_OPCODE(CRMINSTAR):
1722 BEGIN_OPCODE(CRPLUS):
1723 BEGIN_OPCODE(CRQUERY):
1724 BEGIN_OPCODE(CRRANGE):
1725 BEGIN_OPCODE(CRSTAR):
1726 ASSERT_NOT_REACHED();
1727 return matchError(JSRegExpErrorInternal, stack);
1728
1729#ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
1730 CAPTURING_BRACKET:
1731#else
1732 default:
1733#endif
1734 /* Opening capturing bracket. If there is space in the offset vector, save
1735 the current subject position in the working slot at the top of the vector. We
1736 mustn't change the current values of the data slot, because they may be set
1737 from a previous iteration of this group, and be referred to by a reference
1738 inside the group.
1739
1740 If the bracket fails to match, we need to restore this value and also the
1741 values of the final offsets, in case they were set by a previous iteration of
1742 the same bracket.
1743
1744 If there isn't enough space in the offset vector, treat this as if it were a
1745 non-capturing bracket. Don't worry about setting the flag for the error case
1746 here; that is handled in the code for KET. */
1747
1748 ASSERT(*stack.currentFrame->args.instructionPtr > OP_BRA);
1749
1750 stack.currentFrame->locals.number = *stack.currentFrame->args.instructionPtr - OP_BRA;
1751
1752 /* For extended extraction brackets (large number), we have to fish out the
1753 number from a dummy opcode at the start. */
1754
1755 if (stack.currentFrame->locals.number > EXTRACT_BASIC_MAX)
1756 stack.currentFrame->locals.number = get2ByteValue(stack.currentFrame->args.instructionPtr + 2 + LINK_SIZE);
1757 stack.currentFrame->locals.offset = stack.currentFrame->locals.number << 1;
1758
1759#ifdef DEBUG
1760 printf("start bracket %d subject=", stack.currentFrame->locals.number);
1761 pchars(stack.currentFrame->args.subjectPtr, 16, true, md);
1762 printf("\n");
1763#endif
1764
1765 if (stack.currentFrame->locals.offset < md.offsetMax) {
1766 stack.currentFrame->locals.saveOffset1 = md.offsetVector[stack.currentFrame->locals.offset];
1767 stack.currentFrame->locals.saveOffset2 = md.offsetVector[stack.currentFrame->locals.offset + 1];
1768 stack.currentFrame->locals.saveOffset3 = md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number];
1769
1770 DPRINTF(("saving %d %d %d\n", stack.currentFrame->locals.saveOffset1, stack.currentFrame->locals.saveOffset2, stack.currentFrame->locals.saveOffset3));
1771 md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number] = stack.currentFrame->args.subjectPtr - md.startSubject;
1772
1773 do {
9dae56ea 1774 RECURSIVE_MATCH_NEW_GROUP(1, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain);
b37bf2e1
A
1775 if (isMatch)
1776 RRETURN;
1777 stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1);
1778 } while (*stack.currentFrame->args.instructionPtr == OP_ALT);
1779
1780 DPRINTF(("bracket %d failed\n", stack.currentFrame->locals.number));
1781
1782 md.offsetVector[stack.currentFrame->locals.offset] = stack.currentFrame->locals.saveOffset1;
1783 md.offsetVector[stack.currentFrame->locals.offset + 1] = stack.currentFrame->locals.saveOffset2;
1784 md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number] = stack.currentFrame->locals.saveOffset3;
1785
1786 RRETURN;
1787 }
1788
1789 /* Insufficient room for saving captured contents */
1790
1791 goto NON_CAPTURING_BRACKET;
1792 }
1793
1794 /* Do not stick any code in here without much thought; it is assumed
1795 that "continue" in the code above comes out to here to repeat the main
1796 loop. */
1797
1798 } /* End of main loop */
1799
1800 ASSERT_NOT_REACHED();
1801
1802#ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
1803
1804RRETURN_SWITCH:
1805 switch (stack.currentFrame->returnLocation) {
1806 case 0: goto RETURN;
1807 case 1: goto RRETURN_1;
1808 case 2: goto RRETURN_2;
1809 case 6: goto RRETURN_6;
1810 case 7: goto RRETURN_7;
1811 case 14: goto RRETURN_14;
1812 case 15: goto RRETURN_15;
1813 case 16: goto RRETURN_16;
1814 case 17: goto RRETURN_17;
1815 case 18: goto RRETURN_18;
1816 case 19: goto RRETURN_19;
1817 case 20: goto RRETURN_20;
1818 case 21: goto RRETURN_21;
1819 case 22: goto RRETURN_22;
1820 case 24: goto RRETURN_24;
1821 case 26: goto RRETURN_26;
1822 case 27: goto RRETURN_27;
1823 case 28: goto RRETURN_28;
1824 case 29: goto RRETURN_29;
1825 case 30: goto RRETURN_30;
1826 case 31: goto RRETURN_31;
1827 case 38: goto RRETURN_38;
1828 case 40: goto RRETURN_40;
1829 case 42: goto RRETURN_42;
1830 case 44: goto RRETURN_44;
1831 case 48: goto RRETURN_48;
1832 case 52: goto RRETURN_52;
1833 }
1834
1835 ASSERT_NOT_REACHED();
1836 return matchError(JSRegExpErrorInternal, stack);
1837
1838#endif
1839
1840RETURN:
1841 return isMatch;
1842}
1843
1844
1845/*************************************************
1846* Execute a Regular Expression *
1847*************************************************/
1848
1849/* This function applies a compiled re to a subject string and picks out
1850portions of the string if it matches. Two elements in the vector are set for
1851each substring: the offsets to the start and end of the substring.
1852
1853Arguments:
1854 re points to the compiled expression
1855 extra_data points to extra data or is NULL
1856 subject points to the subject string
1857 length length of subject string (may contain binary zeros)
1858 start_offset where to start in the subject string
1859 options option bits
1860 offsets points to a vector of ints to be filled in with offsets
9dae56ea 1861 offsetCount the number of elements in the vector
b37bf2e1
A
1862
1863Returns: > 0 => success; value is the number of elements filled in
1864 = 0 => success, but offsets is not big enough
1865 -1 => failed to match
1866 < -1 => some kind of unexpected problem
1867*/
1868
9dae56ea 1869static void tryFirstByteOptimization(const UChar*& subjectPtr, const UChar* endSubject, int firstByte, bool firstByteIsCaseless, bool useMultiLineFirstCharOptimization, const UChar* originalSubjectStart)
b37bf2e1 1870{
9dae56ea 1871 // If firstByte is set, try scanning to the first instance of that byte
b37bf2e1 1872 // no need to try and match against any earlier part of the subject string.
9dae56ea
A
1873 if (firstByte >= 0) {
1874 UChar firstChar = firstByte;
1875 if (firstByteIsCaseless)
b37bf2e1
A
1876 while (subjectPtr < endSubject) {
1877 int c = *subjectPtr;
1878 if (c > 127)
1879 break;
9dae56ea 1880 if (toLowerCase(c) == firstChar)
b37bf2e1
A
1881 break;
1882 subjectPtr++;
1883 }
1884 else {
9dae56ea 1885 while (subjectPtr < endSubject && *subjectPtr != firstChar)
b37bf2e1
A
1886 subjectPtr++;
1887 }
1888 } else if (useMultiLineFirstCharOptimization) {
1889 /* Or to just after \n for a multiline match if possible */
1890 // I'm not sure why this != originalSubjectStart check is necessary -- ecs 11/18/07
1891 if (subjectPtr > originalSubjectStart) {
1892 while (subjectPtr < endSubject && !isNewline(subjectPtr[-1]))
1893 subjectPtr++;
1894 }
1895 }
1896}
1897
9dae56ea 1898static bool tryRequiredByteOptimization(const UChar*& subjectPtr, const UChar* endSubject, int reqByte, int reqByte2, bool reqByteIsCaseless, bool hasFirstByte, const UChar*& reqBytePtr)
b37bf2e1 1899{
9dae56ea
A
1900 /* If reqByte is set, we know that that character must appear in the subject
1901 for the match to succeed. If the first character is set, reqByte must be
b37bf2e1
A
1902 later in the subject; otherwise the test starts at the match point. This
1903 optimization can save a huge amount of backtracking in patterns with nested
1904 unlimited repeats that aren't going to match. Writing separate code for
1905 cased/caseless versions makes it go faster, as does using an autoincrement
1906 and backing off on a match.
1907
1908 HOWEVER: when the subject string is very, very long, searching to its end can
1909 take a long time, and give bad performance on quite ordinary patterns. This
1910 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
1911 don't do this when the string is sufficiently long.
1912 */
1913
9dae56ea 1914 if (reqByte >= 0 && endSubject - subjectPtr < REQ_BYTE_MAX) {
b37bf2e1
A
1915 const UChar* p = subjectPtr + (hasFirstByte ? 1 : 0);
1916
1917 /* We don't need to repeat the search if we haven't yet reached the
1918 place we found it at last time. */
1919
1920 if (p > reqBytePtr) {
9dae56ea 1921 if (reqByteIsCaseless) {
b37bf2e1
A
1922 while (p < endSubject) {
1923 int pp = *p++;
9dae56ea 1924 if (pp == reqByte || pp == reqByte2) {
b37bf2e1
A
1925 p--;
1926 break;
1927 }
1928 }
1929 } else {
1930 while (p < endSubject) {
9dae56ea 1931 if (*p++ == reqByte) {
b37bf2e1
A
1932 p--;
1933 break;
1934 }
1935 }
1936 }
1937
1938 /* If we can't find the required character, break the matching loop */
1939
1940 if (p >= endSubject)
1941 return true;
1942
1943 /* If we have found the required character, save the point where we
1944 found it, so that we don't search again next time round the loop if
1945 the start hasn't passed this character yet. */
1946
1947 reqBytePtr = p;
1948 }
1949 }
1950 return false;
1951}
1952
1953int jsRegExpExecute(const JSRegExp* re,
1954 const UChar* subject, int length, int start_offset, int* offsets,
9dae56ea 1955 int offsetCount)
b37bf2e1
A
1956{
1957 ASSERT(re);
9dae56ea
A
1958 ASSERT(subject || !length);
1959 ASSERT(offsetCount >= 0);
1960 ASSERT(offsets || offsetCount == 0);
1961
1962 HistogramTimeLogger logger(re);
1963
b37bf2e1
A
1964 MatchData matchBlock;
1965 matchBlock.startSubject = subject;
1966 matchBlock.endSubject = matchBlock.startSubject + length;
1967 const UChar* endSubject = matchBlock.endSubject;
1968
1969 matchBlock.multiline = (re->options & MatchAcrossMultipleLinesOption);
1970 matchBlock.ignoreCase = (re->options & IgnoreCaseOption);
1971
1972 /* If the expression has got more back references than the offsets supplied can
1973 hold, we get a temporary chunk of working store to use during the matching.
1974 Otherwise, we can use the vector supplied, rounding down its size to a multiple
1975 of 3. */
1976
9dae56ea 1977 int ocount = offsetCount - (offsetCount % 3);
b37bf2e1
A
1978
1979 // FIXME: This is lame that we have to second-guess our caller here.
1980 // The API should change to either fail-hard when we don't have enough offset space
1981 // or that we shouldn't ask our callers to pre-allocate in the first place.
9dae56ea
A
1982 bool usingTemporaryOffsets = false;
1983 if (re->topBackref > 0 && re->topBackref >= ocount/3) {
1984 ocount = re->topBackref * 3 + 3;
b37bf2e1
A
1985 matchBlock.offsetVector = new int[ocount];
1986 if (!matchBlock.offsetVector)
1987 return JSRegExpErrorNoMemory;
9dae56ea 1988 usingTemporaryOffsets = true;
b37bf2e1
A
1989 } else
1990 matchBlock.offsetVector = offsets;
1991
1992 matchBlock.offsetEnd = ocount;
1993 matchBlock.offsetMax = (2*ocount)/3;
1994 matchBlock.offsetOverflow = false;
1995
1996 /* Compute the minimum number of offsets that we need to reset each time. Doing
1997 this makes a huge difference to execution time when there aren't many brackets
1998 in the pattern. */
1999
9dae56ea
A
2000 int resetCount = 2 + re->topBracket * 2;
2001 if (resetCount > offsetCount)
2002 resetCount = ocount;
b37bf2e1
A
2003
2004 /* Reset the working variable associated with each extraction. These should
2005 never be used unless previously set, but they get saved and restored, and so we
2006 initialize them to avoid reading uninitialized locations. */
2007
2008 if (matchBlock.offsetVector) {
2009 int* iptr = matchBlock.offsetVector + ocount;
9dae56ea 2010 int* iend = iptr - resetCount/2 + 1;
b37bf2e1
A
2011 while (--iptr >= iend)
2012 *iptr = -1;
2013 }
2014
9dae56ea 2015 /* Set up the first character to match, if available. The firstByte value is
b37bf2e1
A
2016 never set for an anchored regular expression, but the anchoring may be forced
2017 at run time, so we have to test for anchoring. The first char may be unset for
2018 an unanchored pattern, of course. If there's no first char and the pattern was
2019 studied, there may be a bitmap of possible first characters. */
2020
9dae56ea
A
2021 bool firstByteIsCaseless = false;
2022 int firstByte = -1;
b37bf2e1 2023 if (re->options & UseFirstByteOptimizationOption) {
9dae56ea
A
2024 firstByte = re->firstByte & 255;
2025 if ((firstByteIsCaseless = (re->firstByte & REQ_IGNORE_CASE)))
2026 firstByte = toLowerCase(firstByte);
b37bf2e1
A
2027 }
2028
2029 /* For anchored or unanchored matches, there may be a "last known required
2030 character" set. */
2031
9dae56ea
A
2032 bool reqByteIsCaseless = false;
2033 int reqByte = -1;
2034 int reqByte2 = -1;
b37bf2e1 2035 if (re->options & UseRequiredByteOptimizationOption) {
9dae56ea
A
2036 reqByte = re->reqByte & 255; // FIXME: This optimization could be made to work for UTF16 chars as well...
2037 reqByteIsCaseless = (re->reqByte & REQ_IGNORE_CASE);
2038 reqByte2 = flipCase(reqByte);
b37bf2e1
A
2039 }
2040
2041 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
2042 the loop runs just once. */
2043
2044 const UChar* startMatch = subject + start_offset;
2045 const UChar* reqBytePtr = startMatch - 1;
2046 bool useMultiLineFirstCharOptimization = re->options & UseMultiLineFirstByteOptimizationOption;
2047
2048 do {
2049 /* Reset the maximum number of extractions we might see. */
2050 if (matchBlock.offsetVector) {
2051 int* iptr = matchBlock.offsetVector;
9dae56ea 2052 int* iend = iptr + resetCount;
b37bf2e1
A
2053 while (iptr < iend)
2054 *iptr++ = -1;
2055 }
2056
9dae56ea
A
2057 tryFirstByteOptimization(startMatch, endSubject, firstByte, firstByteIsCaseless, useMultiLineFirstCharOptimization, matchBlock.startSubject + start_offset);
2058 if (tryRequiredByteOptimization(startMatch, endSubject, reqByte, reqByte2, reqByteIsCaseless, firstByte >= 0, reqBytePtr))
b37bf2e1
A
2059 break;
2060
2061 /* When a match occurs, substrings will be set for all internal extractions;
2062 we just need to set up the whole thing as substring 0 before returning. If
2063 there were too many extractions, set the return code to zero. In the case
2064 where we had to get some local store to hold offsets for backreferences, copy
2065 those back references that we can. In this case there need not be overflow
2066 if certain parts of the pattern were not used. */
2067
2068 /* The code starts after the JSRegExp block and the capture name table. */
2069 const unsigned char* start_code = (const unsigned char*)(re + 1);
2070
2071 int returnCode = match(startMatch, start_code, 2, matchBlock);
2072
2073 /* When the result is no match, advance the pointer to the next character
2074 and continue. */
2075 if (returnCode == 0) {
2076 startMatch++;
2077 continue;
2078 }
2079
2080 if (returnCode != 1) {
2081 ASSERT(returnCode == JSRegExpErrorHitLimit || returnCode == JSRegExpErrorNoMemory);
9dae56ea 2082 DPRINTF((">>>> error: returning %d\n", returnCode));
b37bf2e1
A
2083 return returnCode;
2084 }
2085
2086 /* We have a match! Copy the offset information from temporary store if
2087 necessary */
2088
9dae56ea
A
2089 if (usingTemporaryOffsets) {
2090 if (offsetCount >= 4) {
2091 memcpy(offsets + 2, matchBlock.offsetVector + 2, (offsetCount - 2) * sizeof(int));
b37bf2e1
A
2092 DPRINTF(("Copied offsets from temporary memory\n"));
2093 }
9dae56ea 2094 if (matchBlock.endOffsetTop > offsetCount)
b37bf2e1
A
2095 matchBlock.offsetOverflow = true;
2096
2097 DPRINTF(("Freeing temporary memory\n"));
2098 delete [] matchBlock.offsetVector;
2099 }
2100
2101 returnCode = matchBlock.offsetOverflow ? 0 : matchBlock.endOffsetTop / 2;
2102
9dae56ea 2103 if (offsetCount < 2)
b37bf2e1
A
2104 returnCode = 0;
2105 else {
2106 offsets[0] = startMatch - matchBlock.startSubject;
2107 offsets[1] = matchBlock.endMatchPtr - matchBlock.startSubject;
2108 }
2109
9dae56ea 2110 DPRINTF((">>>> returning %d\n", returnCode));
b37bf2e1 2111 return returnCode;
9dae56ea 2112 } while (!(re->options & IsAnchoredOption) && startMatch <= endSubject);
b37bf2e1 2113
9dae56ea 2114 if (usingTemporaryOffsets) {
b37bf2e1
A
2115 DPRINTF(("Freeing temporary memory\n"));
2116 delete [] matchBlock.offsetVector;
2117 }
2118
2119 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
2120 return JSRegExpErrorNoMatch;
2121}
9dae56ea
A
2122
2123#if REGEXP_HISTOGRAM
2124
2125class CompareHistogramEntries {
2126public:
2127 bool operator()(const pair<UString, double>& a, const pair<UString, double>& b)
2128 {
2129 if (a.second == b.second)
2130 return a.first < b.first;
2131 return a.second < b.second;
2132 }
2133};
2134
2135Histogram::~Histogram()
2136{
2137 Vector<pair<UString, double> > values;
2138 Map::iterator end = times.end();
2139 for (Map::iterator it = times.begin(); it != end; ++it)
2140 values.append(*it);
2141 sort(values.begin(), values.end(), CompareHistogramEntries());
2142 size_t size = values.size();
2143 printf("Regular Expressions, sorted by time spent evaluating them:\n");
2144 for (size_t i = 0; i < size; ++i)
2145 printf(" %f - %s\n", values[size - i - 1].second, values[size - i - 1].first.UTF8String().c_str());
2146}
2147
2148void Histogram::add(const JSRegExp* re, double elapsedTime)
2149{
2150 UString string(reinterpret_cast<const UChar*>(reinterpret_cast<const char*>(re) + re->stringOffset), re->stringLength);
2151 if (re->options & IgnoreCaseOption && re->options & MatchAcrossMultipleLinesOption)
2152 string += " (multi-line, ignore case)";
2153 else {
2154 if (re->options & IgnoreCaseOption)
2155 string += " (ignore case)";
2156 if (re->options & MatchAcrossMultipleLinesOption)
2157 string += " (multi-line)";
2158 }
2159 pair<Map::iterator, bool> result = times.add(string.rep(), elapsedTime);
2160 if (!result.second)
2161 result.first->second += elapsedTime;
2162}
2163
2164HistogramTimeLogger::HistogramTimeLogger(const JSRegExp* re)
2165 : m_re(re)
2166 , m_startTime(getCurrentUTCTimeWithMicroseconds())
2167{
2168}
2169
2170HistogramTimeLogger::~HistogramTimeLogger()
2171{
2172 static Histogram histogram;
2173 histogram.add(m_re, getCurrentUTCTimeWithMicroseconds() - m_startTime);
2174}
2175
2176#endif