1 /* This is JavaScriptCore's variant of the PCRE library. While this library
2 started out as a copy of PCRE, many of the features of PCRE have been
3 removed. This library now supports only the regular expression features
4 required by the JavaScript language specification, and has only the functions
5 needed by JavaScriptCore and the rest of WebKit.
7 Originally written by Philip Hazel
8 Copyright (c) 1997-2006 University of Cambridge
9 Copyright (C) 2002, 2004, 2006, 2007 Apple Inc. All rights reserved.
10 Copyright (C) 2007 Eric Seidel <eric@webkit.org>
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
41 /* This module contains jsRegExpExecute(), the externally visible function
42 that does pattern matching using an NFA algorithm, following the rules from
43 the JavaScript specification. There are also some supporting functions. */
47 #include "pcre_internal.h"
49 #include <wtf/ASCIICType.h>
50 #include <wtf/Vector.h>
57 #define USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
58 //#define USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
61 /* Avoid warnings on Windows. */
65 #ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
66 typedef int ReturnLocation
;
68 typedef void* ReturnLocation
;
71 /* Structure for building a chain of data for holding the values of
72 the subject pointer at the start of each bracket, used to detect when
73 an empty string has been matched by a bracket to break infinite loops. */
74 struct BracketChainNode
{
75 BracketChainNode
* previousBracket
;
76 const UChar
* bracketStart
;
80 ReturnLocation returnLocation
;
81 struct MatchFrame
* previousFrame
;
83 /* Function arguments that may change */
85 const UChar
* subjectPtr
;
86 const unsigned char* instructionPtr
;
88 BracketChainNode
* bracketChain
;
92 /* PCRE uses "fake" recursion built off of gotos, thus
93 stack-based local variables are not safe to use. Instead we have to
94 store local variables on the current MatchFrame. */
96 const unsigned char* data
;
97 const unsigned char* startOfRepeatingBracket
;
98 const UChar
* subjectPtrAtStartOfInstruction
; // Several instrutions stash away a subjectPtr here for later compare
99 const unsigned char* instructionPtrAtStartOfOnce
;
114 BracketChainNode bracketChainNode
;
118 /* Structure for passing "static" information around between the functions
119 doing traditional NFA matching, so that they are thread-safe. */
122 int* offsetVector
; /* Offset vector */
123 int offsetEnd
; /* One past the end */
124 int offsetMax
; /* The maximum usable for return data */
125 bool offsetOverflow
; /* Set if too many extractions */
126 const UChar
* startSubject
; /* Start of the subject string */
127 const UChar
* endSubject
; /* End of the subject string */
128 const UChar
* endMatchPtr
; /* Subject position at end match */
129 int endOffsetTop
; /* Highwater mark at end of match */
134 /* The maximum remaining length of subject we are prepared to search for a
137 #define REQ_BYTE_MAX 1000
139 /* The below limit restricts the number of "recursive" match calls in order to
140 avoid spending exponential time on complex regular expressions. */
142 static const unsigned matchLimit
= 100000;
145 /*************************************************
146 * Debugging function to print chars *
147 *************************************************/
149 /* Print a sequence of chars in printable format, stopping at the end of the
150 subject if the requested.
153 p points to characters
154 length number to print
155 isSubject true if printing from within md.startSubject
156 md pointer to matching data block, if isSubject is true
159 static void pchars(const UChar
* p
, int length
, bool isSubject
, const MatchData
& md
)
161 if (isSubject
&& length
> md
.endSubject
- p
)
162 length
= md
.endSubject
- p
;
163 while (length
-- > 0) {
165 if (isprint(c
= *(p
++)))
168 printf("\\x%02x", c
);
170 printf("\\x{%x}", c
);
175 /*************************************************
176 * Match a back-reference *
177 *************************************************/
179 /* If a back reference hasn't been set, the length that is passed is greater
180 than the number of characters left in the string, so the match fails.
183 offset index into the offset vector
184 subjectPtr points into the subject
185 length length to be matched
186 md points to match data block
188 Returns: true if matched
191 static bool matchRef(int offset
, const UChar
* subjectPtr
, int length
, const MatchData
& md
)
193 const UChar
* p
= md
.startSubject
+ md
.offsetVector
[offset
];
196 if (subjectPtr
>= md
.endSubject
)
197 printf("matching subject <null>");
199 printf("matching subject ");
200 pchars(subjectPtr
, length
, true, md
);
202 printf(" against backref ");
203 pchars(p
, length
, false, md
);
207 /* Always fail if not enough characters left */
209 if (length
> md
.endSubject
- subjectPtr
)
212 /* Separate the caselesss case for speed */
215 while (length
-- > 0) {
217 int othercase
= kjs_pcre_ucp_othercase(c
);
218 UChar d
= *subjectPtr
++;
219 if (c
!= d
&& othercase
!= d
)
225 if (*p
++ != *subjectPtr
++)
232 #ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
234 /* Use numbered labels and switch statement at the bottom of the match function. */
236 #define RMATCH_WHERE(num) num
237 #define RRETURN_LABEL RRETURN_SWITCH
241 /* Use GCC's computed goto extension. */
243 /* For one test case this is more than 40% faster than the switch statement.
244 We could avoid the use of the num argument entirely by using local labels,
245 but using it for the GCC case as well as the non-GCC case allows us to share
246 a bit more code and notice if we use conflicting numbers.*/
248 #define RMATCH_WHERE(num) &&RRETURN_##num
249 #define RRETURN_LABEL *stack.currentFrame->returnLocation
253 #define RECURSIVE_MATCH_COMMON(num) \
256 stack.popCurrentFrame();
258 #define RECURSIVE_MATCH(num, ra, rb) \
260 stack.pushNewFrame((ra), (rb), RMATCH_WHERE(num)); \
261 RECURSIVE_MATCH_COMMON(num) \
264 #define RECURSIVE_MATCH_STARTNG_NEW_GROUP(num, ra, rb) \
266 stack.pushNewFrame((ra), (rb), RMATCH_WHERE(num)); \
267 startNewGroup(stack.currentFrame); \
268 RECURSIVE_MATCH_COMMON(num) \
271 #define RRETURN goto RRETURN_LABEL
273 #define RRETURN_NO_MATCH do { isMatch = false; RRETURN; } while (0)
275 /*************************************************
276 * Match from current position *
277 *************************************************/
279 /* On entry instructionPtr points to the first opcode, and subjectPtr to the first character
280 in the subject string, while substringStart holds the value of subjectPtr at the start of the
281 last bracketed group - used for breaking infinite loops matching zero-length
282 strings. This function is called recursively in many circumstances. Whenever it
283 returns a negative (error) response, the outer match() call must also return the
287 subjectPtr pointer in subject
288 instructionPtr position in code
289 offsetTop current top pointer
290 md pointer to "static" info for the match
292 Returns: 1 if matched ) these values are >= 0
293 0 if failed to match )
294 a negative error value if aborted by an error condition
295 (e.g. stopped by repeated call or recursion limit)
298 static const unsigned FRAMES_ON_STACK
= 16;
302 : framesEnd(frames
+ FRAMES_ON_STACK
)
303 , currentFrame(frames
)
304 , size(1) // match() creates accesses the first frame w/o calling pushNewFrame
306 ASSERT((sizeof(frames
) / sizeof(frames
[0])) == FRAMES_ON_STACK
);
309 MatchFrame frames
[FRAMES_ON_STACK
];
310 MatchFrame
* framesEnd
;
311 MatchFrame
* currentFrame
;
314 inline bool canUseStackBufferForNextFrame()
316 return size
< FRAMES_ON_STACK
;
319 inline MatchFrame
* allocateNextFrame()
321 if (canUseStackBufferForNextFrame())
322 return currentFrame
+ 1;
323 return new MatchFrame
;
326 inline void pushNewFrame(const unsigned char* instructionPtr
, BracketChainNode
* bracketChain
, ReturnLocation returnLocation
)
328 MatchFrame
* newframe
= allocateNextFrame();
329 newframe
->previousFrame
= currentFrame
;
331 newframe
->args
.subjectPtr
= currentFrame
->args
.subjectPtr
;
332 newframe
->args
.offsetTop
= currentFrame
->args
.offsetTop
;
333 newframe
->args
.instructionPtr
= instructionPtr
;
334 newframe
->args
.bracketChain
= bracketChain
;
335 newframe
->returnLocation
= returnLocation
;
338 currentFrame
= newframe
;
341 inline void popCurrentFrame()
343 MatchFrame
* oldFrame
= currentFrame
;
344 currentFrame
= currentFrame
->previousFrame
;
345 if (size
> FRAMES_ON_STACK
)
357 static int matchError(int errorCode
, MatchStack
& stack
)
359 stack
.popAllFrames();
363 /* Get the next UTF-8 character, not advancing the pointer, incrementing length
364 if there are extra bytes. This is called when we know we are in UTF-8 mode. */
366 static inline void getUTF8CharAndIncrementLength(int& c
, const unsigned char* subjectPtr
, int& len
)
369 if ((c
& 0xc0) == 0xc0) {
370 int gcaa
= kjs_pcre_utf8_table4
[c
& 0x3f]; /* Number of additional bytes */
372 c
= (c
& kjs_pcre_utf8_table3
[gcaa
]) << gcss
;
373 for (int gcii
= 1; gcii
<= gcaa
; gcii
++) {
375 c
|= (subjectPtr
[gcii
] & 0x3f) << gcss
;
381 static inline void startNewGroup(MatchFrame
* currentFrame
)
383 /* At the start of a bracketed group, add the current subject pointer to the
384 stack of such pointers, to be re-instated at the end of the group when we hit
385 the closing ket. When match() is called in other circumstances, we don't add to
388 currentFrame
->locals
.bracketChainNode
.previousBracket
= currentFrame
->args
.bracketChain
;
389 currentFrame
->locals
.bracketChainNode
.bracketStart
= currentFrame
->args
.subjectPtr
;
390 currentFrame
->args
.bracketChain
= ¤tFrame
->locals
.bracketChainNode
;
393 // FIXME: "minimize" means "not greedy", we should invert the callers to ask for "greedy" to be less confusing
394 static inline void repeatInformationFromInstructionOffset(short instructionOffset
, bool& minimize
, int& minimumRepeats
, int& maximumRepeats
)
396 // Instruction offsets are based off of OP_CRSTAR, OP_STAR, OP_TYPESTAR, OP_NOTSTAR
397 static const char minimumRepeatsFromInstructionOffset
[] = { 0, 0, 1, 1, 0, 0 };
398 static const int maximumRepeatsFromInstructionOffset
[] = { INT_MAX
, INT_MAX
, INT_MAX
, INT_MAX
, 1, 1 };
400 ASSERT(instructionOffset
>= 0);
401 ASSERT(instructionOffset
<= (OP_CRMINQUERY
- OP_CRSTAR
));
403 minimize
= (instructionOffset
& 1); // this assumes ordering: Instruction, MinimizeInstruction, Instruction2, MinimizeInstruction2
404 minimumRepeats
= minimumRepeatsFromInstructionOffset
[instructionOffset
];
405 maximumRepeats
= maximumRepeatsFromInstructionOffset
[instructionOffset
];
408 static int match(const UChar
* subjectPtr
, const unsigned char* instructionPtr
, int offsetTop
, MatchData
& md
)
410 bool isMatch
= false;
412 bool minimize
= false; /* Initialization not really needed, but some compilers think so. */
413 unsigned matchCount
= 0;
417 /* The opcode jump table. */
418 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
419 #define EMIT_JUMP_TABLE_ENTRY(opcode) &&LABEL_OP_##opcode,
420 static void* opcodeJumpTable
[256] = { FOR_EACH_OPCODE(EMIT_JUMP_TABLE_ENTRY
) };
421 #undef EMIT_JUMP_TABLE_ENTRY
424 /* One-time setup of the opcode jump table. */
425 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
426 for (int i
= 255; !opcodeJumpTable
[i
]; i
--)
427 opcodeJumpTable
[i
] = &&CAPTURING_BRACKET
;
430 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
431 // Shark shows this as a hot line
432 // Using a static const here makes this line disappear, but makes later access hotter (not sure why)
433 stack
.currentFrame
->returnLocation
= &&RETURN
;
435 stack
.currentFrame
->returnLocation
= 0;
437 stack
.currentFrame
->args
.subjectPtr
= subjectPtr
;
438 stack
.currentFrame
->args
.instructionPtr
= instructionPtr
;
439 stack
.currentFrame
->args
.offsetTop
= offsetTop
;
440 stack
.currentFrame
->args
.bracketChain
= 0;
441 startNewGroup(stack
.currentFrame
);
443 /* This is where control jumps back to to effect "recursion" */
446 if (++matchCount
> matchLimit
)
447 return matchError(JSRegExpErrorHitLimit
, stack
);
449 /* Now start processing the operations. */
451 #ifndef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
456 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
457 #define BEGIN_OPCODE(opcode) LABEL_OP_##opcode
458 #define NEXT_OPCODE goto *opcodeJumpTable[*stack.currentFrame->args.instructionPtr]
460 #define BEGIN_OPCODE(opcode) case OP_##opcode
461 #define NEXT_OPCODE continue
464 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
467 switch (*stack
.currentFrame
->args
.instructionPtr
)
470 /* Non-capturing bracket: optimized */
473 NON_CAPTURING_BRACKET
:
474 DPRINTF(("start bracket 0\n"));
476 RECURSIVE_MATCH_STARTNG_NEW_GROUP(2, stack
.currentFrame
->args
.instructionPtr
+ 1 + LINK_SIZE
, stack
.currentFrame
->args
.bracketChain
);
479 stack
.currentFrame
->args
.instructionPtr
+= getLinkValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
480 } while (*stack
.currentFrame
->args
.instructionPtr
== OP_ALT
);
481 DPRINTF(("bracket 0 failed\n"));
484 /* Skip over large extraction number data if encountered. */
486 BEGIN_OPCODE(BRANUMBER
):
487 stack
.currentFrame
->args
.instructionPtr
+= 3;
490 /* End of the pattern. */
493 md
.endMatchPtr
= stack
.currentFrame
->args
.subjectPtr
; /* Record where we ended */
494 md
.endOffsetTop
= stack
.currentFrame
->args
.offsetTop
; /* and how many extracts were taken */
498 /* Assertion brackets. Check the alternative branches in turn - the
499 matching won't pass the KET for an assertion. If any one branch matches,
500 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
501 start of each branch to move the current point backwards, so the code at
502 this level is identical to the lookahead case. */
504 BEGIN_OPCODE(ASSERT
):
506 RECURSIVE_MATCH_STARTNG_NEW_GROUP(6, stack
.currentFrame
->args
.instructionPtr
+ 1 + LINK_SIZE
, NULL
);
509 stack
.currentFrame
->args
.instructionPtr
+= getLinkValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
510 } while (*stack
.currentFrame
->args
.instructionPtr
== OP_ALT
);
511 if (*stack
.currentFrame
->args
.instructionPtr
== OP_KET
)
514 /* Continue from after the assertion, updating the offsets high water
515 mark, since extracts may have been taken during the assertion. */
517 advanceToEndOfBracket(stack
.currentFrame
->args
.instructionPtr
);
518 stack
.currentFrame
->args
.instructionPtr
+= 1 + LINK_SIZE
;
519 stack
.currentFrame
->args
.offsetTop
= md
.endOffsetTop
;
522 /* Negative assertion: all branches must fail to match */
524 BEGIN_OPCODE(ASSERT_NOT
):
526 RECURSIVE_MATCH_STARTNG_NEW_GROUP(7, stack
.currentFrame
->args
.instructionPtr
+ 1 + LINK_SIZE
, NULL
);
529 stack
.currentFrame
->args
.instructionPtr
+= getLinkValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
530 } while (*stack
.currentFrame
->args
.instructionPtr
== OP_ALT
);
532 stack
.currentFrame
->args
.instructionPtr
+= 1 + LINK_SIZE
;
535 /* An alternation is the end of a branch; scan along to find the end of the
536 bracketed group and go to there. */
539 advanceToEndOfBracket(stack
.currentFrame
->args
.instructionPtr
);
542 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
543 that it may occur zero times. It may repeat infinitely, or not at all -
544 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
545 repeat limits are compiled as a number of copies, with the optional ones
546 preceded by BRAZERO or BRAMINZERO. */
548 BEGIN_OPCODE(BRAZERO
): {
549 stack
.currentFrame
->locals
.startOfRepeatingBracket
= stack
.currentFrame
->args
.instructionPtr
+ 1;
550 RECURSIVE_MATCH_STARTNG_NEW_GROUP(14, stack
.currentFrame
->locals
.startOfRepeatingBracket
, stack
.currentFrame
->args
.bracketChain
);
553 advanceToEndOfBracket(stack
.currentFrame
->locals
.startOfRepeatingBracket
);
554 stack
.currentFrame
->args
.instructionPtr
= stack
.currentFrame
->locals
.startOfRepeatingBracket
+ 1 + LINK_SIZE
;
558 BEGIN_OPCODE(BRAMINZERO
): {
559 stack
.currentFrame
->locals
.startOfRepeatingBracket
= stack
.currentFrame
->args
.instructionPtr
+ 1;
560 advanceToEndOfBracket(stack
.currentFrame
->locals
.startOfRepeatingBracket
);
561 RECURSIVE_MATCH_STARTNG_NEW_GROUP(15, stack
.currentFrame
->locals
.startOfRepeatingBracket
+ 1 + LINK_SIZE
, stack
.currentFrame
->args
.bracketChain
);
564 stack
.currentFrame
->args
.instructionPtr
++;
568 /* End of a group, repeated or non-repeating. If we are at the end of
569 an assertion "group", stop matching and return 1, but record the
570 current high water mark for use by positive assertions. Do this also
571 for the "once" (not-backup up) groups. */
574 BEGIN_OPCODE(KETRMIN
):
575 BEGIN_OPCODE(KETRMAX
):
576 stack
.currentFrame
->locals
.instructionPtrAtStartOfOnce
= stack
.currentFrame
->args
.instructionPtr
- getLinkValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
577 stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
= stack
.currentFrame
->args
.bracketChain
->bracketStart
;
579 /* Back up the stack of bracket start pointers. */
581 stack
.currentFrame
->args
.bracketChain
= stack
.currentFrame
->args
.bracketChain
->previousBracket
;
583 if (*stack
.currentFrame
->locals
.instructionPtrAtStartOfOnce
== OP_ASSERT
|| *stack
.currentFrame
->locals
.instructionPtrAtStartOfOnce
== OP_ASSERT_NOT
) {
584 md
.endOffsetTop
= stack
.currentFrame
->args
.offsetTop
;
589 /* In all other cases except a conditional group we have to check the
590 group number back at the start and if necessary complete handling an
591 extraction by setting the offsets and bumping the high water mark. */
593 stack
.currentFrame
->locals
.number
= *stack
.currentFrame
->locals
.instructionPtrAtStartOfOnce
- OP_BRA
;
595 /* For extended extraction brackets (large number), we have to fish out
596 the number from a dummy opcode at the start. */
598 if (stack
.currentFrame
->locals
.number
> EXTRACT_BASIC_MAX
)
599 stack
.currentFrame
->locals
.number
= get2ByteValue(stack
.currentFrame
->locals
.instructionPtrAtStartOfOnce
+ 2 + LINK_SIZE
);
600 stack
.currentFrame
->locals
.offset
= stack
.currentFrame
->locals
.number
<< 1;
603 printf("end bracket %d", stack
.currentFrame
->locals
.number
);
607 /* Test for a numbered group. This includes groups called as a result
608 of recursion. Note that whole-pattern recursion is coded as a recurse
609 into group 0, so it won't be picked up here. Instead, we catch it when
610 the OP_END is reached. */
612 if (stack
.currentFrame
->locals
.number
> 0) {
613 if (stack
.currentFrame
->locals
.offset
>= md
.offsetMax
)
614 md
.offsetOverflow
= true;
616 md
.offsetVector
[stack
.currentFrame
->locals
.offset
] =
617 md
.offsetVector
[md
.offsetEnd
- stack
.currentFrame
->locals
.number
];
618 md
.offsetVector
[stack
.currentFrame
->locals
.offset
+1] = stack
.currentFrame
->args
.subjectPtr
- md
.startSubject
;
619 if (stack
.currentFrame
->args
.offsetTop
<= stack
.currentFrame
->locals
.offset
)
620 stack
.currentFrame
->args
.offsetTop
= stack
.currentFrame
->locals
.offset
+ 2;
624 /* For a non-repeating ket, just continue at this level. This also
625 happens for a repeating ket if no characters were matched in the group.
626 This is the forcible breaking of infinite loops as implemented in Perl
627 5.005. If there is an options reset, it will get obeyed in the normal
630 if (*stack
.currentFrame
->args
.instructionPtr
== OP_KET
|| stack
.currentFrame
->args
.subjectPtr
== stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
) {
631 stack
.currentFrame
->args
.instructionPtr
+= 1 + LINK_SIZE
;
635 /* The repeating kets try the rest of the pattern or restart from the
636 preceding bracket, in the appropriate order. */
638 if (*stack
.currentFrame
->args
.instructionPtr
== OP_KETRMIN
) {
639 RECURSIVE_MATCH(16, stack
.currentFrame
->args
.instructionPtr
+ 1 + LINK_SIZE
, stack
.currentFrame
->args
.bracketChain
);
642 RECURSIVE_MATCH_STARTNG_NEW_GROUP(17, stack
.currentFrame
->locals
.instructionPtrAtStartOfOnce
, stack
.currentFrame
->args
.bracketChain
);
645 } else { /* OP_KETRMAX */
646 RECURSIVE_MATCH_STARTNG_NEW_GROUP(18, stack
.currentFrame
->locals
.instructionPtrAtStartOfOnce
, stack
.currentFrame
->args
.bracketChain
);
649 RECURSIVE_MATCH(19, stack
.currentFrame
->args
.instructionPtr
+ 1 + LINK_SIZE
, stack
.currentFrame
->args
.bracketChain
);
655 /* Start of subject, or after internal newline if multiline. */
658 if (stack
.currentFrame
->args
.subjectPtr
!= md
.startSubject
&& (!md
.multiline
|| !isNewline(stack
.currentFrame
->args
.subjectPtr
[-1])))
660 stack
.currentFrame
->args
.instructionPtr
++;
663 /* End of subject, or before internal newline if multiline. */
666 if (stack
.currentFrame
->args
.subjectPtr
< md
.endSubject
&& (!md
.multiline
|| !isNewline(*stack
.currentFrame
->args
.subjectPtr
)))
668 stack
.currentFrame
->args
.instructionPtr
++;
671 /* Word boundary assertions */
673 BEGIN_OPCODE(NOT_WORD_BOUNDARY
):
674 BEGIN_OPCODE(WORD_BOUNDARY
): {
675 bool currentCharIsWordChar
= false;
676 bool previousCharIsWordChar
= false;
678 if (stack
.currentFrame
->args
.subjectPtr
> md
.startSubject
)
679 previousCharIsWordChar
= isWordChar(stack
.currentFrame
->args
.subjectPtr
[-1]);
680 if (stack
.currentFrame
->args
.subjectPtr
< md
.endSubject
)
681 currentCharIsWordChar
= isWordChar(*stack
.currentFrame
->args
.subjectPtr
);
683 /* Now see if the situation is what we want */
684 bool wordBoundaryDesired
= (*stack
.currentFrame
->args
.instructionPtr
++ == OP_WORD_BOUNDARY
);
685 if (wordBoundaryDesired
? currentCharIsWordChar
== previousCharIsWordChar
: currentCharIsWordChar
!= previousCharIsWordChar
)
690 /* Match a single character type; inline for speed */
692 BEGIN_OPCODE(NOT_NEWLINE
):
693 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
695 if (isNewline(*stack
.currentFrame
->args
.subjectPtr
++))
697 stack
.currentFrame
->args
.instructionPtr
++;
700 BEGIN_OPCODE(NOT_DIGIT
):
701 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
703 if (isASCIIDigit(*stack
.currentFrame
->args
.subjectPtr
++))
705 stack
.currentFrame
->args
.instructionPtr
++;
709 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
711 if (!isASCIIDigit(*stack
.currentFrame
->args
.subjectPtr
++))
713 stack
.currentFrame
->args
.instructionPtr
++;
716 BEGIN_OPCODE(NOT_WHITESPACE
):
717 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
719 if (isSpaceChar(*stack
.currentFrame
->args
.subjectPtr
++))
721 stack
.currentFrame
->args
.instructionPtr
++;
724 BEGIN_OPCODE(WHITESPACE
):
725 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
727 if (!isSpaceChar(*stack
.currentFrame
->args
.subjectPtr
++))
729 stack
.currentFrame
->args
.instructionPtr
++;
732 BEGIN_OPCODE(NOT_WORDCHAR
):
733 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
735 if (isWordChar(*stack
.currentFrame
->args
.subjectPtr
++))
737 stack
.currentFrame
->args
.instructionPtr
++;
740 BEGIN_OPCODE(WORDCHAR
):
741 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
743 if (!isWordChar(*stack
.currentFrame
->args
.subjectPtr
++))
745 stack
.currentFrame
->args
.instructionPtr
++;
748 /* Match a back reference, possibly repeatedly. Look past the end of the
749 item to see if there is repeat information following. The code is similar
750 to that for character classes, but repeated for efficiency. Then obey
751 similar code to character type repeats - written out again for speed.
752 However, if the referenced string is the empty string, always treat
753 it as matched, any number of times (otherwise there could be infinite
757 stack
.currentFrame
->locals
.offset
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 1) << 1; /* Doubled ref number */
758 stack
.currentFrame
->args
.instructionPtr
+= 3; /* Advance past item */
760 /* If the reference is unset, set the length to be longer than the amount
761 of subject left; this ensures that every attempt at a match fails. We
762 can't just fail here, because of the possibility of quantifiers with zero
765 if (stack
.currentFrame
->locals
.offset
>= stack
.currentFrame
->args
.offsetTop
|| md
.offsetVector
[stack
.currentFrame
->locals
.offset
] < 0)
766 stack
.currentFrame
->locals
.length
= 0;
768 stack
.currentFrame
->locals
.length
= md
.offsetVector
[stack
.currentFrame
->locals
.offset
+1] - md
.offsetVector
[stack
.currentFrame
->locals
.offset
];
770 /* Set up for repetition, or handle the non-repeated case */
772 switch (*stack
.currentFrame
->args
.instructionPtr
) {
779 repeatInformationFromInstructionOffset(*stack
.currentFrame
->args
.instructionPtr
++ - OP_CRSTAR
, minimize
, min
, stack
.currentFrame
->locals
.max
);
784 minimize
= (*stack
.currentFrame
->args
.instructionPtr
== OP_CRMINRANGE
);
785 min
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
786 stack
.currentFrame
->locals
.max
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 3);
787 if (stack
.currentFrame
->locals
.max
== 0)
788 stack
.currentFrame
->locals
.max
= INT_MAX
;
789 stack
.currentFrame
->args
.instructionPtr
+= 5;
792 default: /* No repeat follows */
793 if (!matchRef(stack
.currentFrame
->locals
.offset
, stack
.currentFrame
->args
.subjectPtr
, stack
.currentFrame
->locals
.length
, md
))
795 stack
.currentFrame
->args
.subjectPtr
+= stack
.currentFrame
->locals
.length
;
799 /* If the length of the reference is zero, just continue with the
802 if (stack
.currentFrame
->locals
.length
== 0)
805 /* First, ensure the minimum number of matches are present. */
807 for (int i
= 1; i
<= min
; i
++) {
808 if (!matchRef(stack
.currentFrame
->locals
.offset
, stack
.currentFrame
->args
.subjectPtr
, stack
.currentFrame
->locals
.length
, md
))
810 stack
.currentFrame
->args
.subjectPtr
+= stack
.currentFrame
->locals
.length
;
813 /* If min = max, continue at the same level without recursion.
814 They are not both allowed to be zero. */
816 if (min
== stack
.currentFrame
->locals
.max
)
819 /* If minimizing, keep trying and advancing the pointer */
822 for (stack
.currentFrame
->locals
.fi
= min
;; stack
.currentFrame
->locals
.fi
++) {
823 RECURSIVE_MATCH(20, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
826 if (stack
.currentFrame
->locals
.fi
>= stack
.currentFrame
->locals
.max
|| !matchRef(stack
.currentFrame
->locals
.offset
, stack
.currentFrame
->args
.subjectPtr
, stack
.currentFrame
->locals
.length
, md
))
828 stack
.currentFrame
->args
.subjectPtr
+= stack
.currentFrame
->locals
.length
;
830 /* Control never reaches here */
833 /* If maximizing, find the longest string and work backwards */
836 stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
= stack
.currentFrame
->args
.subjectPtr
;
837 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
838 if (!matchRef(stack
.currentFrame
->locals
.offset
, stack
.currentFrame
->args
.subjectPtr
, stack
.currentFrame
->locals
.length
, md
))
840 stack
.currentFrame
->args
.subjectPtr
+= stack
.currentFrame
->locals
.length
;
842 while (stack
.currentFrame
->args
.subjectPtr
>= stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
) {
843 RECURSIVE_MATCH(21, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
846 stack
.currentFrame
->args
.subjectPtr
-= stack
.currentFrame
->locals
.length
;
850 /* Control never reaches here */
852 /* Match a bit-mapped character class, possibly repeatedly. This op code is
853 used when all the characters in the class have values in the range 0-255,
854 and either the matching is caseful, or the characters are in the range
855 0-127 when UTF-8 processing is enabled. The only difference between
856 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
859 First, look past the end of the item to see if there is repeat information
860 following. Then obey similar code to character type repeats - written out
863 BEGIN_OPCODE(NCLASS
):
865 stack
.currentFrame
->locals
.data
= stack
.currentFrame
->args
.instructionPtr
+ 1; /* Save for matching */
866 stack
.currentFrame
->args
.instructionPtr
+= 33; /* Advance past the item */
868 switch (*stack
.currentFrame
->args
.instructionPtr
) {
875 repeatInformationFromInstructionOffset(*stack
.currentFrame
->args
.instructionPtr
++ - OP_CRSTAR
, minimize
, min
, stack
.currentFrame
->locals
.max
);
880 minimize
= (*stack
.currentFrame
->args
.instructionPtr
== OP_CRMINRANGE
);
881 min
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
882 stack
.currentFrame
->locals
.max
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 3);
883 if (stack
.currentFrame
->locals
.max
== 0)
884 stack
.currentFrame
->locals
.max
= INT_MAX
;
885 stack
.currentFrame
->args
.instructionPtr
+= 5;
888 default: /* No repeat follows */
889 min
= stack
.currentFrame
->locals
.max
= 1;
893 /* First, ensure the minimum number of matches are present. */
895 for (int i
= 1; i
<= min
; i
++) {
896 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
898 int c
= *stack
.currentFrame
->args
.subjectPtr
++;
900 if (stack
.currentFrame
->locals
.data
[-1] == OP_CLASS
)
903 if (!(stack
.currentFrame
->locals
.data
[c
/ 8] & (1 << (c
& 7))))
908 /* If max == min we can continue with the main loop without the
911 if (min
== stack
.currentFrame
->locals
.max
)
914 /* If minimizing, keep testing the rest of the expression and advancing
915 the pointer while it matches the class. */
917 for (stack
.currentFrame
->locals
.fi
= min
;; stack
.currentFrame
->locals
.fi
++) {
918 RECURSIVE_MATCH(22, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
921 if (stack
.currentFrame
->locals
.fi
>= stack
.currentFrame
->locals
.max
|| stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
923 int c
= *stack
.currentFrame
->args
.subjectPtr
++;
925 if (stack
.currentFrame
->locals
.data
[-1] == OP_CLASS
)
928 if ((stack
.currentFrame
->locals
.data
[c
/8] & (1 << (c
&7))) == 0)
932 /* Control never reaches here */
934 /* If maximizing, find the longest possible run, then work backwards. */
936 stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
= stack
.currentFrame
->args
.subjectPtr
;
938 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
939 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
941 int c
= *stack
.currentFrame
->args
.subjectPtr
;
943 if (stack
.currentFrame
->locals
.data
[-1] == OP_CLASS
)
946 if (!(stack
.currentFrame
->locals
.data
[c
/ 8] & (1 << (c
& 7))))
949 ++stack
.currentFrame
->args
.subjectPtr
;
952 RECURSIVE_MATCH(24, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
955 if (stack
.currentFrame
->args
.subjectPtr
-- == stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
)
956 break; /* Stop if tried at original pos */
961 /* Control never reaches here */
963 /* Match an extended character class. */
965 BEGIN_OPCODE(XCLASS
):
966 stack
.currentFrame
->locals
.data
= stack
.currentFrame
->args
.instructionPtr
+ 1 + LINK_SIZE
; /* Save for matching */
967 stack
.currentFrame
->args
.instructionPtr
+= getLinkValue(stack
.currentFrame
->args
.instructionPtr
+ 1); /* Advance past the item */
969 switch (*stack
.currentFrame
->args
.instructionPtr
) {
976 repeatInformationFromInstructionOffset(*stack
.currentFrame
->args
.instructionPtr
++ - OP_CRSTAR
, minimize
, min
, stack
.currentFrame
->locals
.max
);
981 minimize
= (*stack
.currentFrame
->args
.instructionPtr
== OP_CRMINRANGE
);
982 min
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
983 stack
.currentFrame
->locals
.max
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 3);
984 if (stack
.currentFrame
->locals
.max
== 0)
985 stack
.currentFrame
->locals
.max
= INT_MAX
;
986 stack
.currentFrame
->args
.instructionPtr
+= 5;
989 default: /* No repeat follows */
990 min
= stack
.currentFrame
->locals
.max
= 1;
993 /* First, ensure the minimum number of matches are present. */
995 for (int i
= 1; i
<= min
; i
++) {
996 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
998 int c
= *stack
.currentFrame
->args
.subjectPtr
++;
999 if (!kjs_pcre_xclass(c
, stack
.currentFrame
->locals
.data
))
1003 /* If max == min we can continue with the main loop without the
1006 if (min
== stack
.currentFrame
->locals
.max
)
1009 /* If minimizing, keep testing the rest of the expression and advancing
1010 the pointer while it matches the class. */
1013 for (stack
.currentFrame
->locals
.fi
= min
;; stack
.currentFrame
->locals
.fi
++) {
1014 RECURSIVE_MATCH(26, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1017 if (stack
.currentFrame
->locals
.fi
>= stack
.currentFrame
->locals
.max
|| stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1019 int c
= *stack
.currentFrame
->args
.subjectPtr
++;
1020 if (!kjs_pcre_xclass(c
, stack
.currentFrame
->locals
.data
))
1023 /* Control never reaches here */
1026 /* If maximizing, find the longest possible run, then work backwards. */
1029 stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
= stack
.currentFrame
->args
.subjectPtr
;
1030 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1031 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1033 int c
= *stack
.currentFrame
->args
.subjectPtr
;
1034 if (!kjs_pcre_xclass(c
, stack
.currentFrame
->locals
.data
))
1036 ++stack
.currentFrame
->args
.subjectPtr
;
1039 RECURSIVE_MATCH(27, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1042 if (stack
.currentFrame
->args
.subjectPtr
-- == stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
)
1043 break; /* Stop if tried at original pos */
1048 /* Control never reaches here */
1050 /* Match a single character, casefully */
1053 stack
.currentFrame
->locals
.length
= 1;
1054 stack
.currentFrame
->args
.instructionPtr
++;
1055 getUTF8CharAndIncrementLength(stack
.currentFrame
->locals
.fc
, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->locals
.length
);
1056 stack
.currentFrame
->args
.instructionPtr
+= stack
.currentFrame
->locals
.length
;
1057 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1059 if (stack
.currentFrame
->locals
.fc
!= *stack
.currentFrame
->args
.subjectPtr
++)
1063 /* Match a single character, caselessly */
1065 BEGIN_OPCODE(CHAR_IGNORING_CASE
): {
1066 stack
.currentFrame
->locals
.length
= 1;
1067 stack
.currentFrame
->args
.instructionPtr
++;
1068 getUTF8CharAndIncrementLength(stack
.currentFrame
->locals
.fc
, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->locals
.length
);
1069 stack
.currentFrame
->args
.instructionPtr
+= stack
.currentFrame
->locals
.length
;
1070 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1072 int dc
= *stack
.currentFrame
->args
.subjectPtr
++;
1073 if (stack
.currentFrame
->locals
.fc
!= dc
&& kjs_pcre_ucp_othercase(stack
.currentFrame
->locals
.fc
) != dc
)
1078 /* Match a single ASCII character. */
1080 BEGIN_OPCODE(ASCII_CHAR
):
1081 if (md
.endSubject
== stack
.currentFrame
->args
.subjectPtr
)
1083 if (*stack
.currentFrame
->args
.subjectPtr
!= stack
.currentFrame
->args
.instructionPtr
[1])
1085 ++stack
.currentFrame
->args
.subjectPtr
;
1086 stack
.currentFrame
->args
.instructionPtr
+= 2;
1089 /* Match one of two cases of an ASCII letter. */
1091 BEGIN_OPCODE(ASCII_LETTER_IGNORING_CASE
):
1092 if (md
.endSubject
== stack
.currentFrame
->args
.subjectPtr
)
1094 if ((*stack
.currentFrame
->args
.subjectPtr
| 0x20) != stack
.currentFrame
->args
.instructionPtr
[1])
1096 ++stack
.currentFrame
->args
.subjectPtr
;
1097 stack
.currentFrame
->args
.instructionPtr
+= 2;
1100 /* Match a single character repeatedly; different opcodes share code. */
1102 BEGIN_OPCODE(EXACT
):
1103 min
= stack
.currentFrame
->locals
.max
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
1105 stack
.currentFrame
->args
.instructionPtr
+= 3;
1109 BEGIN_OPCODE(MINUPTO
):
1111 stack
.currentFrame
->locals
.max
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
1112 minimize
= *stack
.currentFrame
->args
.instructionPtr
== OP_MINUPTO
;
1113 stack
.currentFrame
->args
.instructionPtr
+= 3;
1117 BEGIN_OPCODE(MINSTAR
):
1119 BEGIN_OPCODE(MINPLUS
):
1120 BEGIN_OPCODE(QUERY
):
1121 BEGIN_OPCODE(MINQUERY
):
1122 repeatInformationFromInstructionOffset(*stack
.currentFrame
->args
.instructionPtr
++ - OP_STAR
, minimize
, min
, stack
.currentFrame
->locals
.max
);
1124 /* Common code for all repeated single-character matches. We can give
1125 up quickly if there are fewer than the minimum number of characters left in
1130 stack
.currentFrame
->locals
.length
= 1;
1131 getUTF8CharAndIncrementLength(stack
.currentFrame
->locals
.fc
, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->locals
.length
);
1132 if (min
* (stack
.currentFrame
->locals
.fc
> 0xFFFF ? 2 : 1) > md
.endSubject
- stack
.currentFrame
->args
.subjectPtr
)
1134 stack
.currentFrame
->args
.instructionPtr
+= stack
.currentFrame
->locals
.length
;
1136 if (stack
.currentFrame
->locals
.fc
<= 0xFFFF) {
1137 int othercase
= md
.ignoreCase
? kjs_pcre_ucp_othercase(stack
.currentFrame
->locals
.fc
) : -1;
1139 for (int i
= 1; i
<= min
; i
++) {
1140 if (*stack
.currentFrame
->args
.subjectPtr
!= stack
.currentFrame
->locals
.fc
&& *stack
.currentFrame
->args
.subjectPtr
!= othercase
)
1142 ++stack
.currentFrame
->args
.subjectPtr
;
1145 if (min
== stack
.currentFrame
->locals
.max
)
1149 stack
.currentFrame
->locals
.repeatOthercase
= othercase
;
1150 for (stack
.currentFrame
->locals
.fi
= min
;; stack
.currentFrame
->locals
.fi
++) {
1151 RECURSIVE_MATCH(28, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1154 if (stack
.currentFrame
->locals
.fi
>= stack
.currentFrame
->locals
.max
|| stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1156 if (*stack
.currentFrame
->args
.subjectPtr
!= stack
.currentFrame
->locals
.fc
&& *stack
.currentFrame
->args
.subjectPtr
!= stack
.currentFrame
->locals
.repeatOthercase
)
1158 ++stack
.currentFrame
->args
.subjectPtr
;
1160 /* Control never reaches here */
1162 stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
= stack
.currentFrame
->args
.subjectPtr
;
1163 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1164 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1166 if (*stack
.currentFrame
->args
.subjectPtr
!= stack
.currentFrame
->locals
.fc
&& *stack
.currentFrame
->args
.subjectPtr
!= othercase
)
1168 ++stack
.currentFrame
->args
.subjectPtr
;
1170 while (stack
.currentFrame
->args
.subjectPtr
>= stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
) {
1171 RECURSIVE_MATCH(29, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1174 --stack
.currentFrame
->args
.subjectPtr
;
1178 /* Control never reaches here */
1180 /* No case on surrogate pairs, so no need to bother with "othercase". */
1182 for (int i
= 1; i
<= min
; i
++) {
1183 if (*stack
.currentFrame
->args
.subjectPtr
!= stack
.currentFrame
->locals
.fc
)
1185 stack
.currentFrame
->args
.subjectPtr
+= 2;
1188 if (min
== stack
.currentFrame
->locals
.max
)
1192 for (stack
.currentFrame
->locals
.fi
= min
;; stack
.currentFrame
->locals
.fi
++) {
1193 RECURSIVE_MATCH(30, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1196 if (stack
.currentFrame
->locals
.fi
>= stack
.currentFrame
->locals
.max
|| stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1198 if (*stack
.currentFrame
->args
.subjectPtr
!= stack
.currentFrame
->locals
.fc
)
1200 stack
.currentFrame
->args
.subjectPtr
+= 2;
1202 /* Control never reaches here */
1204 stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
= stack
.currentFrame
->args
.subjectPtr
;
1205 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1206 if (stack
.currentFrame
->args
.subjectPtr
> md
.endSubject
- 2)
1208 if (*stack
.currentFrame
->args
.subjectPtr
!= stack
.currentFrame
->locals
.fc
)
1210 stack
.currentFrame
->args
.subjectPtr
+= 2;
1212 while (stack
.currentFrame
->args
.subjectPtr
>= stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
) {
1213 RECURSIVE_MATCH(31, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1216 stack
.currentFrame
->args
.subjectPtr
-= 2;
1220 /* Control never reaches here */
1222 /* Control never reaches here */
1224 /* Match a negated single one-byte character. */
1226 BEGIN_OPCODE(NOT
): {
1227 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1229 stack
.currentFrame
->args
.instructionPtr
++;
1230 int c
= *stack
.currentFrame
->args
.subjectPtr
++;
1231 if (md
.ignoreCase
) {
1234 if (toLowerCase(*stack
.currentFrame
->args
.instructionPtr
++) == c
)
1237 if (*stack
.currentFrame
->args
.instructionPtr
++ == c
)
1243 /* Match a negated single one-byte character repeatedly. This is almost a
1244 repeat of the code for a repeated single character, but I haven't found a
1245 nice way of commoning these up that doesn't require a test of the
1246 positive/negative option for each character match. Maybe that wouldn't add
1247 very much to the time taken, but character matching *is* what this is all
1250 BEGIN_OPCODE(NOTEXACT
):
1251 min
= stack
.currentFrame
->locals
.max
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
1253 stack
.currentFrame
->args
.instructionPtr
+= 3;
1256 BEGIN_OPCODE(NOTUPTO
):
1257 BEGIN_OPCODE(NOTMINUPTO
):
1259 stack
.currentFrame
->locals
.max
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
1260 minimize
= *stack
.currentFrame
->args
.instructionPtr
== OP_NOTMINUPTO
;
1261 stack
.currentFrame
->args
.instructionPtr
+= 3;
1264 BEGIN_OPCODE(NOTSTAR
):
1265 BEGIN_OPCODE(NOTMINSTAR
):
1266 BEGIN_OPCODE(NOTPLUS
):
1267 BEGIN_OPCODE(NOTMINPLUS
):
1268 BEGIN_OPCODE(NOTQUERY
):
1269 BEGIN_OPCODE(NOTMINQUERY
):
1270 repeatInformationFromInstructionOffset(*stack
.currentFrame
->args
.instructionPtr
++ - OP_NOTSTAR
, minimize
, min
, stack
.currentFrame
->locals
.max
);
1272 /* Common code for all repeated single-byte matches. We can give up quickly
1273 if there are fewer than the minimum number of bytes left in the
1277 if (min
> md
.endSubject
- stack
.currentFrame
->args
.subjectPtr
)
1279 stack
.currentFrame
->locals
.fc
= *stack
.currentFrame
->args
.instructionPtr
++;
1281 /* The code is duplicated for the caseless and caseful cases, for speed,
1282 since matching characters is likely to be quite common. First, ensure the
1283 minimum number of matches are present. If min = max, continue at the same
1284 level without recursing. Otherwise, if minimizing, keep trying the rest of
1285 the expression and advancing one matching character if failing, up to the
1286 maximum. Alternatively, if maximizing, find the maximum number of
1287 characters and work backwards. */
1289 DPRINTF(("negative matching %c{%d,%d}\n", stack
.currentFrame
->locals
.fc
, min
, stack
.currentFrame
->locals
.max
));
1291 if (md
.ignoreCase
) {
1292 if (stack
.currentFrame
->locals
.fc
< 128)
1293 stack
.currentFrame
->locals
.fc
= toLowerCase(stack
.currentFrame
->locals
.fc
);
1295 for (int i
= 1; i
<= min
; i
++) {
1296 int d
= *stack
.currentFrame
->args
.subjectPtr
++;
1299 if (stack
.currentFrame
->locals
.fc
== d
)
1303 if (min
== stack
.currentFrame
->locals
.max
)
1307 for (stack
.currentFrame
->locals
.fi
= min
;; stack
.currentFrame
->locals
.fi
++) {
1308 RECURSIVE_MATCH(38, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1311 int d
= *stack
.currentFrame
->args
.subjectPtr
++;
1314 if (stack
.currentFrame
->locals
.fi
>= stack
.currentFrame
->locals
.max
|| stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
|| stack
.currentFrame
->locals
.fc
== d
)
1317 /* Control never reaches here */
1323 stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
= stack
.currentFrame
->args
.subjectPtr
;
1325 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1326 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1328 int d
= *stack
.currentFrame
->args
.subjectPtr
;
1331 if (stack
.currentFrame
->locals
.fc
== d
)
1333 ++stack
.currentFrame
->args
.subjectPtr
;
1336 RECURSIVE_MATCH(40, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1339 if (stack
.currentFrame
->args
.subjectPtr
-- == stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
)
1340 break; /* Stop if tried at original pos */
1345 /* Control never reaches here */
1348 /* Caseful comparisons */
1351 for (int i
= 1; i
<= min
; i
++) {
1352 int d
= *stack
.currentFrame
->args
.subjectPtr
++;
1353 if (stack
.currentFrame
->locals
.fc
== d
)
1357 if (min
== stack
.currentFrame
->locals
.max
)
1361 for (stack
.currentFrame
->locals
.fi
= min
;; stack
.currentFrame
->locals
.fi
++) {
1362 RECURSIVE_MATCH(42, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1365 int d
= *stack
.currentFrame
->args
.subjectPtr
++;
1366 if (stack
.currentFrame
->locals
.fi
>= stack
.currentFrame
->locals
.max
|| stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
|| stack
.currentFrame
->locals
.fc
== d
)
1369 /* Control never reaches here */
1375 stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
= stack
.currentFrame
->args
.subjectPtr
;
1377 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1378 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1380 int d
= *stack
.currentFrame
->args
.subjectPtr
;
1381 if (stack
.currentFrame
->locals
.fc
== d
)
1383 ++stack
.currentFrame
->args
.subjectPtr
;
1386 RECURSIVE_MATCH(44, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1389 if (stack
.currentFrame
->args
.subjectPtr
-- == stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
)
1390 break; /* Stop if tried at original pos */
1396 /* Control never reaches here */
1398 /* Match a single character type repeatedly; several different opcodes
1399 share code. This is very similar to the code for single characters, but we
1400 repeat it in the interests of efficiency. */
1402 BEGIN_OPCODE(TYPEEXACT
):
1403 min
= stack
.currentFrame
->locals
.max
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
1405 stack
.currentFrame
->args
.instructionPtr
+= 3;
1408 BEGIN_OPCODE(TYPEUPTO
):
1409 BEGIN_OPCODE(TYPEMINUPTO
):
1411 stack
.currentFrame
->locals
.max
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
1412 minimize
= *stack
.currentFrame
->args
.instructionPtr
== OP_TYPEMINUPTO
;
1413 stack
.currentFrame
->args
.instructionPtr
+= 3;
1416 BEGIN_OPCODE(TYPESTAR
):
1417 BEGIN_OPCODE(TYPEMINSTAR
):
1418 BEGIN_OPCODE(TYPEPLUS
):
1419 BEGIN_OPCODE(TYPEMINPLUS
):
1420 BEGIN_OPCODE(TYPEQUERY
):
1421 BEGIN_OPCODE(TYPEMINQUERY
):
1422 repeatInformationFromInstructionOffset(*stack
.currentFrame
->args
.instructionPtr
++ - OP_TYPESTAR
, minimize
, min
, stack
.currentFrame
->locals
.max
);
1424 /* Common code for all repeated single character type matches. Note that
1425 in UTF-8 mode, '.' matches a character of any length, but for the other
1426 character types, the valid characters are all one-byte long. */
1429 stack
.currentFrame
->locals
.ctype
= *stack
.currentFrame
->args
.instructionPtr
++; /* Code for the character type */
1431 /* First, ensure the minimum number of matches are present. Use inline
1432 code for maximizing the speed, and do the type test once at the start
1433 (i.e. keep it out of the loop). Also we can test that there are at least
1434 the minimum number of characters before we start. */
1436 if (min
> md
.endSubject
- stack
.currentFrame
->args
.subjectPtr
)
1439 switch (stack
.currentFrame
->locals
.ctype
) {
1440 case OP_NOT_NEWLINE
:
1441 for (int i
= 1; i
<= min
; i
++) {
1442 if (isNewline(*stack
.currentFrame
->args
.subjectPtr
))
1444 ++stack
.currentFrame
->args
.subjectPtr
;
1449 for (int i
= 1; i
<= min
; i
++) {
1450 if (isASCIIDigit(*stack
.currentFrame
->args
.subjectPtr
))
1452 ++stack
.currentFrame
->args
.subjectPtr
;
1457 for (int i
= 1; i
<= min
; i
++) {
1458 if (!isASCIIDigit(*stack
.currentFrame
->args
.subjectPtr
))
1460 ++stack
.currentFrame
->args
.subjectPtr
;
1464 case OP_NOT_WHITESPACE
:
1465 for (int i
= 1; i
<= min
; i
++) {
1466 if (isSpaceChar(*stack
.currentFrame
->args
.subjectPtr
))
1468 ++stack
.currentFrame
->args
.subjectPtr
;
1473 for (int i
= 1; i
<= min
; i
++) {
1474 if (!isSpaceChar(*stack
.currentFrame
->args
.subjectPtr
))
1476 ++stack
.currentFrame
->args
.subjectPtr
;
1480 case OP_NOT_WORDCHAR
:
1481 for (int i
= 1; i
<= min
; i
++) {
1482 if (isWordChar(*stack
.currentFrame
->args
.subjectPtr
))
1484 ++stack
.currentFrame
->args
.subjectPtr
;
1489 for (int i
= 1; i
<= min
; i
++) {
1490 if (!isWordChar(*stack
.currentFrame
->args
.subjectPtr
))
1492 ++stack
.currentFrame
->args
.subjectPtr
;
1497 ASSERT_NOT_REACHED();
1498 return matchError(JSRegExpErrorInternal
, stack
);
1499 } /* End switch(stack.currentFrame->locals.ctype) */
1502 /* If min = max, continue at the same level without recursing */
1504 if (min
== stack
.currentFrame
->locals
.max
)
1507 /* If minimizing, we have to test the rest of the pattern before each
1508 subsequent match. */
1511 for (stack
.currentFrame
->locals
.fi
= min
;; stack
.currentFrame
->locals
.fi
++) {
1512 RECURSIVE_MATCH(48, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1515 if (stack
.currentFrame
->locals
.fi
>= stack
.currentFrame
->locals
.max
|| stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1518 int c
= *stack
.currentFrame
->args
.subjectPtr
++;
1519 switch (stack
.currentFrame
->locals
.ctype
) {
1520 case OP_NOT_NEWLINE
:
1526 if (isASCIIDigit(c
))
1531 if (!isASCIIDigit(c
))
1535 case OP_NOT_WHITESPACE
:
1541 if (!isSpaceChar(c
))
1545 case OP_NOT_WORDCHAR
:
1556 ASSERT_NOT_REACHED();
1557 return matchError(JSRegExpErrorInternal
, stack
);
1560 /* Control never reaches here */
1563 /* If maximizing it is worth using inline code for speed, doing the type
1564 test once at the start (i.e. keep it out of the loop). */
1567 stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
= stack
.currentFrame
->args
.subjectPtr
; /* Remember where we started */
1569 switch (stack
.currentFrame
->locals
.ctype
) {
1570 case OP_NOT_NEWLINE
:
1571 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1572 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
|| isNewline(*stack
.currentFrame
->args
.subjectPtr
))
1574 stack
.currentFrame
->args
.subjectPtr
++;
1579 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1580 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1582 int c
= *stack
.currentFrame
->args
.subjectPtr
;
1583 if (isASCIIDigit(c
))
1585 ++stack
.currentFrame
->args
.subjectPtr
;
1590 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1591 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1593 int c
= *stack
.currentFrame
->args
.subjectPtr
;
1594 if (!isASCIIDigit(c
))
1596 ++stack
.currentFrame
->args
.subjectPtr
;
1600 case OP_NOT_WHITESPACE
:
1601 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1602 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1604 int c
= *stack
.currentFrame
->args
.subjectPtr
;
1607 ++stack
.currentFrame
->args
.subjectPtr
;
1612 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1613 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1615 int c
= *stack
.currentFrame
->args
.subjectPtr
;
1616 if (!isSpaceChar(c
))
1618 ++stack
.currentFrame
->args
.subjectPtr
;
1622 case OP_NOT_WORDCHAR
:
1623 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1624 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1626 int c
= *stack
.currentFrame
->args
.subjectPtr
;
1629 ++stack
.currentFrame
->args
.subjectPtr
;
1634 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1635 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1637 int c
= *stack
.currentFrame
->args
.subjectPtr
;
1640 ++stack
.currentFrame
->args
.subjectPtr
;
1645 ASSERT_NOT_REACHED();
1646 return matchError(JSRegExpErrorInternal
, stack
);
1649 /* stack.currentFrame->args.subjectPtr is now past the end of the maximum run */
1652 RECURSIVE_MATCH(52, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1655 if (stack
.currentFrame
->args
.subjectPtr
-- == stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
)
1656 break; /* Stop if tried at original pos */
1659 /* Get here if we can't make it match with any permitted repetitions */
1663 /* Control never reaches here */
1665 BEGIN_OPCODE(CRMINPLUS
):
1666 BEGIN_OPCODE(CRMINQUERY
):
1667 BEGIN_OPCODE(CRMINRANGE
):
1668 BEGIN_OPCODE(CRMINSTAR
):
1669 BEGIN_OPCODE(CRPLUS
):
1670 BEGIN_OPCODE(CRQUERY
):
1671 BEGIN_OPCODE(CRRANGE
):
1672 BEGIN_OPCODE(CRSTAR
):
1673 ASSERT_NOT_REACHED();
1674 return matchError(JSRegExpErrorInternal
, stack
);
1676 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
1681 /* Opening capturing bracket. If there is space in the offset vector, save
1682 the current subject position in the working slot at the top of the vector. We
1683 mustn't change the current values of the data slot, because they may be set
1684 from a previous iteration of this group, and be referred to by a reference
1687 If the bracket fails to match, we need to restore this value and also the
1688 values of the final offsets, in case they were set by a previous iteration of
1691 If there isn't enough space in the offset vector, treat this as if it were a
1692 non-capturing bracket. Don't worry about setting the flag for the error case
1693 here; that is handled in the code for KET. */
1695 ASSERT(*stack
.currentFrame
->args
.instructionPtr
> OP_BRA
);
1697 stack
.currentFrame
->locals
.number
= *stack
.currentFrame
->args
.instructionPtr
- OP_BRA
;
1699 /* For extended extraction brackets (large number), we have to fish out the
1700 number from a dummy opcode at the start. */
1702 if (stack
.currentFrame
->locals
.number
> EXTRACT_BASIC_MAX
)
1703 stack
.currentFrame
->locals
.number
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 2 + LINK_SIZE
);
1704 stack
.currentFrame
->locals
.offset
= stack
.currentFrame
->locals
.number
<< 1;
1707 printf("start bracket %d subject=", stack
.currentFrame
->locals
.number
);
1708 pchars(stack
.currentFrame
->args
.subjectPtr
, 16, true, md
);
1712 if (stack
.currentFrame
->locals
.offset
< md
.offsetMax
) {
1713 stack
.currentFrame
->locals
.saveOffset1
= md
.offsetVector
[stack
.currentFrame
->locals
.offset
];
1714 stack
.currentFrame
->locals
.saveOffset2
= md
.offsetVector
[stack
.currentFrame
->locals
.offset
+ 1];
1715 stack
.currentFrame
->locals
.saveOffset3
= md
.offsetVector
[md
.offsetEnd
- stack
.currentFrame
->locals
.number
];
1717 DPRINTF(("saving %d %d %d\n", stack
.currentFrame
->locals
.saveOffset1
, stack
.currentFrame
->locals
.saveOffset2
, stack
.currentFrame
->locals
.saveOffset3
));
1718 md
.offsetVector
[md
.offsetEnd
- stack
.currentFrame
->locals
.number
] = stack
.currentFrame
->args
.subjectPtr
- md
.startSubject
;
1721 RECURSIVE_MATCH_STARTNG_NEW_GROUP(1, stack
.currentFrame
->args
.instructionPtr
+ 1 + LINK_SIZE
, stack
.currentFrame
->args
.bracketChain
);
1724 stack
.currentFrame
->args
.instructionPtr
+= getLinkValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
1725 } while (*stack
.currentFrame
->args
.instructionPtr
== OP_ALT
);
1727 DPRINTF(("bracket %d failed\n", stack
.currentFrame
->locals
.number
));
1729 md
.offsetVector
[stack
.currentFrame
->locals
.offset
] = stack
.currentFrame
->locals
.saveOffset1
;
1730 md
.offsetVector
[stack
.currentFrame
->locals
.offset
+ 1] = stack
.currentFrame
->locals
.saveOffset2
;
1731 md
.offsetVector
[md
.offsetEnd
- stack
.currentFrame
->locals
.number
] = stack
.currentFrame
->locals
.saveOffset3
;
1736 /* Insufficient room for saving captured contents */
1738 goto NON_CAPTURING_BRACKET
;
1741 /* Do not stick any code in here without much thought; it is assumed
1742 that "continue" in the code above comes out to here to repeat the main
1745 } /* End of main loop */
1747 ASSERT_NOT_REACHED();
1749 #ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
1752 switch (stack
.currentFrame
->returnLocation
) {
1753 case 0: goto RETURN
;
1754 case 1: goto RRETURN_1
;
1755 case 2: goto RRETURN_2
;
1756 case 6: goto RRETURN_6
;
1757 case 7: goto RRETURN_7
;
1758 case 14: goto RRETURN_14
;
1759 case 15: goto RRETURN_15
;
1760 case 16: goto RRETURN_16
;
1761 case 17: goto RRETURN_17
;
1762 case 18: goto RRETURN_18
;
1763 case 19: goto RRETURN_19
;
1764 case 20: goto RRETURN_20
;
1765 case 21: goto RRETURN_21
;
1766 case 22: goto RRETURN_22
;
1767 case 24: goto RRETURN_24
;
1768 case 26: goto RRETURN_26
;
1769 case 27: goto RRETURN_27
;
1770 case 28: goto RRETURN_28
;
1771 case 29: goto RRETURN_29
;
1772 case 30: goto RRETURN_30
;
1773 case 31: goto RRETURN_31
;
1774 case 38: goto RRETURN_38
;
1775 case 40: goto RRETURN_40
;
1776 case 42: goto RRETURN_42
;
1777 case 44: goto RRETURN_44
;
1778 case 48: goto RRETURN_48
;
1779 case 52: goto RRETURN_52
;
1782 ASSERT_NOT_REACHED();
1783 return matchError(JSRegExpErrorInternal
, stack
);
1792 /*************************************************
1793 * Execute a Regular Expression *
1794 *************************************************/
1796 /* This function applies a compiled re to a subject string and picks out
1797 portions of the string if it matches. Two elements in the vector are set for
1798 each substring: the offsets to the start and end of the substring.
1801 re points to the compiled expression
1802 extra_data points to extra data or is NULL
1803 subject points to the subject string
1804 length length of subject string (may contain binary zeros)
1805 start_offset where to start in the subject string
1807 offsets points to a vector of ints to be filled in with offsets
1808 offsetcount the number of elements in the vector
1810 Returns: > 0 => success; value is the number of elements filled in
1811 = 0 => success, but offsets is not big enough
1812 -1 => failed to match
1813 < -1 => some kind of unexpected problem
1816 static void tryFirstByteOptimization(const UChar
*& subjectPtr
, const UChar
* endSubject
, int first_byte
, bool first_byte_caseless
, bool useMultiLineFirstCharOptimization
, const UChar
* originalSubjectStart
)
1818 // If first_byte is set, try scanning to the first instance of that byte
1819 // no need to try and match against any earlier part of the subject string.
1820 if (first_byte
>= 0) {
1821 UChar first_char
= first_byte
;
1822 if (first_byte_caseless
)
1823 while (subjectPtr
< endSubject
) {
1824 int c
= *subjectPtr
;
1827 if (toLowerCase(c
) == first_char
)
1832 while (subjectPtr
< endSubject
&& *subjectPtr
!= first_char
)
1835 } else if (useMultiLineFirstCharOptimization
) {
1836 /* Or to just after \n for a multiline match if possible */
1837 // I'm not sure why this != originalSubjectStart check is necessary -- ecs 11/18/07
1838 if (subjectPtr
> originalSubjectStart
) {
1839 while (subjectPtr
< endSubject
&& !isNewline(subjectPtr
[-1]))
1845 static bool tryRequiredByteOptimization(const UChar
*& subjectPtr
, const UChar
* endSubject
, int req_byte
, int req_byte2
, bool req_byte_caseless
, bool hasFirstByte
, const UChar
*& reqBytePtr
)
1847 /* If req_byte is set, we know that that character must appear in the subject
1848 for the match to succeed. If the first character is set, req_byte must be
1849 later in the subject; otherwise the test starts at the match point. This
1850 optimization can save a huge amount of backtracking in patterns with nested
1851 unlimited repeats that aren't going to match. Writing separate code for
1852 cased/caseless versions makes it go faster, as does using an autoincrement
1853 and backing off on a match.
1855 HOWEVER: when the subject string is very, very long, searching to its end can
1856 take a long time, and give bad performance on quite ordinary patterns. This
1857 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
1858 don't do this when the string is sufficiently long.
1861 if (req_byte
>= 0 && endSubject
- subjectPtr
< REQ_BYTE_MAX
) {
1862 const UChar
* p
= subjectPtr
+ (hasFirstByte
? 1 : 0);
1864 /* We don't need to repeat the search if we haven't yet reached the
1865 place we found it at last time. */
1867 if (p
> reqBytePtr
) {
1868 if (req_byte_caseless
) {
1869 while (p
< endSubject
) {
1871 if (pp
== req_byte
|| pp
== req_byte2
) {
1877 while (p
< endSubject
) {
1878 if (*p
++ == req_byte
) {
1885 /* If we can't find the required character, break the matching loop */
1887 if (p
>= endSubject
)
1890 /* If we have found the required character, save the point where we
1891 found it, so that we don't search again next time round the loop if
1892 the start hasn't passed this character yet. */
1900 int jsRegExpExecute(const JSRegExp
* re
,
1901 const UChar
* subject
, int length
, int start_offset
, int* offsets
,
1906 ASSERT(offsetcount
>= 0);
1907 ASSERT(offsets
|| offsetcount
== 0);
1909 MatchData matchBlock
;
1910 matchBlock
.startSubject
= subject
;
1911 matchBlock
.endSubject
= matchBlock
.startSubject
+ length
;
1912 const UChar
* endSubject
= matchBlock
.endSubject
;
1914 matchBlock
.multiline
= (re
->options
& MatchAcrossMultipleLinesOption
);
1915 matchBlock
.ignoreCase
= (re
->options
& IgnoreCaseOption
);
1917 /* If the expression has got more back references than the offsets supplied can
1918 hold, we get a temporary chunk of working store to use during the matching.
1919 Otherwise, we can use the vector supplied, rounding down its size to a multiple
1922 int ocount
= offsetcount
- (offsetcount
% 3);
1924 // FIXME: This is lame that we have to second-guess our caller here.
1925 // The API should change to either fail-hard when we don't have enough offset space
1926 // or that we shouldn't ask our callers to pre-allocate in the first place.
1927 bool using_temporary_offsets
= false;
1928 if (re
->top_backref
> 0 && re
->top_backref
>= ocount
/3) {
1929 ocount
= re
->top_backref
* 3 + 3;
1930 matchBlock
.offsetVector
= new int[ocount
];
1931 if (!matchBlock
.offsetVector
)
1932 return JSRegExpErrorNoMemory
;
1933 using_temporary_offsets
= true;
1935 matchBlock
.offsetVector
= offsets
;
1937 matchBlock
.offsetEnd
= ocount
;
1938 matchBlock
.offsetMax
= (2*ocount
)/3;
1939 matchBlock
.offsetOverflow
= false;
1941 /* Compute the minimum number of offsets that we need to reset each time. Doing
1942 this makes a huge difference to execution time when there aren't many brackets
1945 int resetcount
= 2 + re
->top_bracket
* 2;
1946 if (resetcount
> offsetcount
)
1947 resetcount
= ocount
;
1949 /* Reset the working variable associated with each extraction. These should
1950 never be used unless previously set, but they get saved and restored, and so we
1951 initialize them to avoid reading uninitialized locations. */
1953 if (matchBlock
.offsetVector
) {
1954 int* iptr
= matchBlock
.offsetVector
+ ocount
;
1955 int* iend
= iptr
- resetcount
/2 + 1;
1956 while (--iptr
>= iend
)
1960 /* Set up the first character to match, if available. The first_byte value is
1961 never set for an anchored regular expression, but the anchoring may be forced
1962 at run time, so we have to test for anchoring. The first char may be unset for
1963 an unanchored pattern, of course. If there's no first char and the pattern was
1964 studied, there may be a bitmap of possible first characters. */
1966 bool first_byte_caseless
= false;
1967 int first_byte
= -1;
1968 if (re
->options
& UseFirstByteOptimizationOption
) {
1969 first_byte
= re
->first_byte
& 255;
1970 if ((first_byte_caseless
= (re
->first_byte
& REQ_IGNORE_CASE
)))
1971 first_byte
= toLowerCase(first_byte
);
1974 /* For anchored or unanchored matches, there may be a "last known required
1977 bool req_byte_caseless
= false;
1980 if (re
->options
& UseRequiredByteOptimizationOption
) {
1981 req_byte
= re
->req_byte
& 255; // FIXME: This optimization could be made to work for UTF16 chars as well...
1982 req_byte_caseless
= (re
->req_byte
& REQ_IGNORE_CASE
);
1983 req_byte2
= flipCase(req_byte
);
1986 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
1987 the loop runs just once. */
1989 const UChar
* startMatch
= subject
+ start_offset
;
1990 const UChar
* reqBytePtr
= startMatch
- 1;
1991 bool useMultiLineFirstCharOptimization
= re
->options
& UseMultiLineFirstByteOptimizationOption
;
1994 /* Reset the maximum number of extractions we might see. */
1995 if (matchBlock
.offsetVector
) {
1996 int* iptr
= matchBlock
.offsetVector
;
1997 int* iend
= iptr
+ resetcount
;
2002 tryFirstByteOptimization(startMatch
, endSubject
, first_byte
, first_byte_caseless
, useMultiLineFirstCharOptimization
, matchBlock
.startSubject
+ start_offset
);
2003 if (tryRequiredByteOptimization(startMatch
, endSubject
, req_byte
, req_byte2
, req_byte_caseless
, first_byte
>= 0, reqBytePtr
))
2006 /* When a match occurs, substrings will be set for all internal extractions;
2007 we just need to set up the whole thing as substring 0 before returning. If
2008 there were too many extractions, set the return code to zero. In the case
2009 where we had to get some local store to hold offsets for backreferences, copy
2010 those back references that we can. In this case there need not be overflow
2011 if certain parts of the pattern were not used. */
2013 /* The code starts after the JSRegExp block and the capture name table. */
2014 const unsigned char* start_code
= (const unsigned char*)(re
+ 1);
2016 int returnCode
= match(startMatch
, start_code
, 2, matchBlock
);
2018 /* When the result is no match, advance the pointer to the next character
2020 if (returnCode
== 0) {
2025 if (returnCode
!= 1) {
2026 ASSERT(returnCode
== JSRegExpErrorHitLimit
|| returnCode
== JSRegExpErrorNoMemory
);
2027 DPRINTF((">>>> error: returning %d\n", rc
));
2031 /* We have a match! Copy the offset information from temporary store if
2034 if (using_temporary_offsets
) {
2035 if (offsetcount
>= 4) {
2036 memcpy(offsets
+ 2, matchBlock
.offsetVector
+ 2, (offsetcount
- 2) * sizeof(int));
2037 DPRINTF(("Copied offsets from temporary memory\n"));
2039 if (matchBlock
.endOffsetTop
> offsetcount
)
2040 matchBlock
.offsetOverflow
= true;
2042 DPRINTF(("Freeing temporary memory\n"));
2043 delete [] matchBlock
.offsetVector
;
2046 returnCode
= matchBlock
.offsetOverflow
? 0 : matchBlock
.endOffsetTop
/ 2;
2048 if (offsetcount
< 2)
2051 offsets
[0] = startMatch
- matchBlock
.startSubject
;
2052 offsets
[1] = matchBlock
.endMatchPtr
- matchBlock
.startSubject
;
2055 DPRINTF((">>>> returning %d\n", rc
));
2057 } while (startMatch
<= endSubject
);
2059 if (using_temporary_offsets
) {
2060 DPRINTF(("Freeing temporary memory\n"));
2061 delete [] matchBlock
.offsetVector
;
2064 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
2065 return JSRegExpErrorNoMatch
;