1 /* This is JavaScriptCore's variant of the PCRE library. While this library
2 started out as a copy of PCRE, many of the features of PCRE have been
3 removed. This library now supports only the regular expression features
4 required by the JavaScript language specification, and has only the functions
5 needed by JavaScriptCore and the rest of WebKit.
7 Originally written by Philip Hazel
8 Copyright (c) 1997-2006 University of Cambridge
9 Copyright (C) 2002, 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
10 Copyright (C) 2007 Eric Seidel <eric@webkit.org>
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
41 /* This module contains jsRegExpExecute(), the externally visible function
42 that does pattern matching using an NFA algorithm, following the rules from
43 the JavaScript specification. There are also some supporting functions. */
46 #include "pcre_internal.h"
49 #include <wtf/ASCIICType.h>
50 #include <wtf/Vector.h>
53 #include <wtf/DateMath.h>
54 #include <runtime/UString.h>
60 #define USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
61 //#define USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
64 /* Avoid warnings on Windows. */
68 #ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
69 typedef int ReturnLocation
;
71 typedef void* ReturnLocation
;
76 class HistogramTimeLogger
{
78 HistogramTimeLogger(const JSRegExp
*) { }
88 void add(const JSRegExp
*, double);
91 typedef HashMap
<RefPtr
<UString::Rep
>, double> Map
;
95 class HistogramTimeLogger
{
97 HistogramTimeLogger(const JSRegExp
*);
98 ~HistogramTimeLogger();
101 const JSRegExp
* m_re
;
107 /* Structure for building a chain of data for holding the values of
108 the subject pointer at the start of each bracket, used to detect when
109 an empty string has been matched by a bracket to break infinite loops. */
110 struct BracketChainNode
{
111 BracketChainNode
* previousBracket
;
112 const UChar
* bracketStart
;
115 struct MatchFrame
: FastAllocBase
{
116 ReturnLocation returnLocation
;
117 struct MatchFrame
* previousFrame
;
119 /* Function arguments that may change */
121 const UChar
* subjectPtr
;
122 const unsigned char* instructionPtr
;
124 BracketChainNode
* bracketChain
;
128 /* PCRE uses "fake" recursion built off of gotos, thus
129 stack-based local variables are not safe to use. Instead we have to
130 store local variables on the current MatchFrame. */
132 const unsigned char* data
;
133 const unsigned char* startOfRepeatingBracket
;
134 const UChar
* subjectPtrAtStartOfInstruction
; // Several instrutions stash away a subjectPtr here for later compare
135 const unsigned char* instructionPtrAtStartOfOnce
;
150 BracketChainNode bracketChainNode
;
154 /* Structure for passing "static" information around between the functions
155 doing traditional NFA matching, so that they are thread-safe. */
158 int* offsetVector
; /* Offset vector */
159 int offsetEnd
; /* One past the end */
160 int offsetMax
; /* The maximum usable for return data */
161 bool offsetOverflow
; /* Set if too many extractions */
162 const UChar
* startSubject
; /* Start of the subject string */
163 const UChar
* endSubject
; /* End of the subject string */
164 const UChar
* endMatchPtr
; /* Subject position at end match */
165 int endOffsetTop
; /* Highwater mark at end of match */
170 /* The maximum remaining length of subject we are prepared to search for a
173 #define REQ_BYTE_MAX 1000
175 /* The below limit restricts the number of "recursive" match calls in order to
176 avoid spending exponential time on complex regular expressions. */
178 static const unsigned matchLimit
= 1000000;
181 /*************************************************
182 * Debugging function to print chars *
183 *************************************************/
185 /* Print a sequence of chars in printable format, stopping at the end of the
186 subject if the requested.
189 p points to characters
190 length number to print
191 isSubject true if printing from within md.startSubject
192 md pointer to matching data block, if isSubject is true
195 static void pchars(const UChar
* p
, int length
, bool isSubject
, const MatchData
& md
)
197 if (isSubject
&& length
> md
.endSubject
- p
)
198 length
= md
.endSubject
- p
;
199 while (length
-- > 0) {
201 if (isprint(c
= *(p
++)))
204 printf("\\x%02x", c
);
206 printf("\\x{%x}", c
);
211 /*************************************************
212 * Match a back-reference *
213 *************************************************/
215 /* If a back reference hasn't been set, the length that is passed is greater
216 than the number of characters left in the string, so the match fails.
219 offset index into the offset vector
220 subjectPtr points into the subject
221 length length to be matched
222 md points to match data block
224 Returns: true if matched
227 static bool matchRef(int offset
, const UChar
* subjectPtr
, int length
, const MatchData
& md
)
229 const UChar
* p
= md
.startSubject
+ md
.offsetVector
[offset
];
232 if (subjectPtr
>= md
.endSubject
)
233 printf("matching subject <null>");
235 printf("matching subject ");
236 pchars(subjectPtr
, length
, true, md
);
238 printf(" against backref ");
239 pchars(p
, length
, false, md
);
243 /* Always fail if not enough characters left */
245 if (length
> md
.endSubject
- subjectPtr
)
248 /* Separate the caselesss case for speed */
251 while (length
-- > 0) {
253 int othercase
= jsc_pcre_ucp_othercase(c
);
254 UChar d
= *subjectPtr
++;
255 if (c
!= d
&& othercase
!= d
)
261 if (*p
++ != *subjectPtr
++)
268 #ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
270 /* Use numbered labels and switch statement at the bottom of the match function. */
272 #define RMATCH_WHERE(num) num
273 #define RRETURN_LABEL RRETURN_SWITCH
277 /* Use GCC's computed goto extension. */
279 /* For one test case this is more than 40% faster than the switch statement.
280 We could avoid the use of the num argument entirely by using local labels,
281 but using it for the GCC case as well as the non-GCC case allows us to share
282 a bit more code and notice if we use conflicting numbers.*/
284 #define RMATCH_WHERE(num) &&RRETURN_##num
285 #define RRETURN_LABEL *stack.currentFrame->returnLocation
289 #define RECURSIVE_MATCH_COMMON(num) \
292 stack.popCurrentFrame();
294 #define RECURSIVE_MATCH(num, ra, rb) \
296 stack.pushNewFrame((ra), (rb), RMATCH_WHERE(num)); \
297 RECURSIVE_MATCH_COMMON(num) \
300 #define RECURSIVE_MATCH_NEW_GROUP(num, ra, rb) \
302 stack.pushNewFrame((ra), (rb), RMATCH_WHERE(num)); \
303 startNewGroup(stack.currentFrame); \
304 RECURSIVE_MATCH_COMMON(num) \
307 #define RRETURN goto RRETURN_LABEL
309 #define RRETURN_NO_MATCH do { isMatch = false; RRETURN; } while (0)
311 /*************************************************
312 * Match from current position *
313 *************************************************/
315 /* On entry instructionPtr points to the first opcode, and subjectPtr to the first character
316 in the subject string, while substringStart holds the value of subjectPtr at the start of the
317 last bracketed group - used for breaking infinite loops matching zero-length
318 strings. This function is called recursively in many circumstances. Whenever it
319 returns a negative (error) response, the outer match() call must also return the
323 subjectPtr pointer in subject
324 instructionPtr position in code
325 offsetTop current top pointer
326 md pointer to "static" info for the match
328 Returns: 1 if matched ) these values are >= 0
329 0 if failed to match )
330 a negative error value if aborted by an error condition
331 (e.g. stopped by repeated call or recursion limit)
334 static const unsigned numFramesOnStack
= 16;
338 : framesEnd(frames
+ numFramesOnStack
)
339 , currentFrame(frames
)
340 , size(1) // match() creates accesses the first frame w/o calling pushNewFrame
342 ASSERT((sizeof(frames
) / sizeof(frames
[0])) == numFramesOnStack
);
345 MatchFrame frames
[numFramesOnStack
];
346 MatchFrame
* framesEnd
;
347 MatchFrame
* currentFrame
;
350 inline bool canUseStackBufferForNextFrame()
352 return size
< numFramesOnStack
;
355 inline MatchFrame
* allocateNextFrame()
357 if (canUseStackBufferForNextFrame())
358 return currentFrame
+ 1;
359 return new MatchFrame
;
362 inline void pushNewFrame(const unsigned char* instructionPtr
, BracketChainNode
* bracketChain
, ReturnLocation returnLocation
)
364 MatchFrame
* newframe
= allocateNextFrame();
365 newframe
->previousFrame
= currentFrame
;
367 newframe
->args
.subjectPtr
= currentFrame
->args
.subjectPtr
;
368 newframe
->args
.offsetTop
= currentFrame
->args
.offsetTop
;
369 newframe
->args
.instructionPtr
= instructionPtr
;
370 newframe
->args
.bracketChain
= bracketChain
;
371 newframe
->returnLocation
= returnLocation
;
374 currentFrame
= newframe
;
377 inline void popCurrentFrame()
379 MatchFrame
* oldFrame
= currentFrame
;
380 currentFrame
= currentFrame
->previousFrame
;
381 if (size
> numFramesOnStack
)
393 static int matchError(int errorCode
, MatchStack
& stack
)
395 stack
.popAllFrames();
399 /* Get the next UTF-8 character, not advancing the pointer, incrementing length
400 if there are extra bytes. This is called when we know we are in UTF-8 mode. */
402 static inline void getUTF8CharAndIncrementLength(int& c
, const unsigned char* subjectPtr
, int& len
)
405 if ((c
& 0xc0) == 0xc0) {
406 int gcaa
= jsc_pcre_utf8_table4
[c
& 0x3f]; /* Number of additional bytes */
408 c
= (c
& jsc_pcre_utf8_table3
[gcaa
]) << gcss
;
409 for (int gcii
= 1; gcii
<= gcaa
; gcii
++) {
411 c
|= (subjectPtr
[gcii
] & 0x3f) << gcss
;
417 static inline void startNewGroup(MatchFrame
* currentFrame
)
419 /* At the start of a bracketed group, add the current subject pointer to the
420 stack of such pointers, to be re-instated at the end of the group when we hit
421 the closing ket. When match() is called in other circumstances, we don't add to
424 currentFrame
->locals
.bracketChainNode
.previousBracket
= currentFrame
->args
.bracketChain
;
425 currentFrame
->locals
.bracketChainNode
.bracketStart
= currentFrame
->args
.subjectPtr
;
426 currentFrame
->args
.bracketChain
= ¤tFrame
->locals
.bracketChainNode
;
429 // FIXME: "minimize" means "not greedy", we should invert the callers to ask for "greedy" to be less confusing
430 static inline void repeatInformationFromInstructionOffset(short instructionOffset
, bool& minimize
, int& minimumRepeats
, int& maximumRepeats
)
432 // Instruction offsets are based off of OP_CRSTAR, OP_STAR, OP_TYPESTAR, OP_NOTSTAR
433 static const char minimumRepeatsFromInstructionOffset
[] = { 0, 0, 1, 1, 0, 0 };
434 static const int maximumRepeatsFromInstructionOffset
[] = { INT_MAX
, INT_MAX
, INT_MAX
, INT_MAX
, 1, 1 };
436 ASSERT(instructionOffset
>= 0);
437 ASSERT(instructionOffset
<= (OP_CRMINQUERY
- OP_CRSTAR
));
439 minimize
= (instructionOffset
& 1); // this assumes ordering: Instruction, MinimizeInstruction, Instruction2, MinimizeInstruction2
440 minimumRepeats
= minimumRepeatsFromInstructionOffset
[instructionOffset
];
441 maximumRepeats
= maximumRepeatsFromInstructionOffset
[instructionOffset
];
444 static int match(const UChar
* subjectPtr
, const unsigned char* instructionPtr
, int offsetTop
, MatchData
& md
)
446 bool isMatch
= false;
448 bool minimize
= false; /* Initialization not really needed, but some compilers think so. */
449 unsigned remainingMatchCount
= matchLimit
;
450 int othercase
; /* Declare here to avoid errors during jumps */
454 /* The opcode jump table. */
455 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
456 #define EMIT_JUMP_TABLE_ENTRY(opcode) &&LABEL_OP_##opcode,
457 static void* opcodeJumpTable
[256] = { FOR_EACH_OPCODE(EMIT_JUMP_TABLE_ENTRY
) };
458 #undef EMIT_JUMP_TABLE_ENTRY
461 /* One-time setup of the opcode jump table. */
462 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
463 for (int i
= 255; !opcodeJumpTable
[i
]; i
--)
464 opcodeJumpTable
[i
] = &&CAPTURING_BRACKET
;
467 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
468 // Shark shows this as a hot line
469 // Using a static const here makes this line disappear, but makes later access hotter (not sure why)
470 stack
.currentFrame
->returnLocation
= &&RETURN
;
472 stack
.currentFrame
->returnLocation
= 0;
474 stack
.currentFrame
->args
.subjectPtr
= subjectPtr
;
475 stack
.currentFrame
->args
.instructionPtr
= instructionPtr
;
476 stack
.currentFrame
->args
.offsetTop
= offsetTop
;
477 stack
.currentFrame
->args
.bracketChain
= 0;
478 startNewGroup(stack
.currentFrame
);
480 /* This is where control jumps back to to effect "recursion" */
483 if (!--remainingMatchCount
)
484 return matchError(JSRegExpErrorHitLimit
, stack
);
486 /* Now start processing the operations. */
488 #ifndef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
493 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
494 #define BEGIN_OPCODE(opcode) LABEL_OP_##opcode
495 #define NEXT_OPCODE goto *opcodeJumpTable[*stack.currentFrame->args.instructionPtr]
497 #define BEGIN_OPCODE(opcode) case OP_##opcode
498 #define NEXT_OPCODE continue
501 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
504 switch (*stack
.currentFrame
->args
.instructionPtr
)
507 /* Non-capturing bracket: optimized */
510 NON_CAPTURING_BRACKET
:
511 DPRINTF(("start bracket 0\n"));
513 RECURSIVE_MATCH_NEW_GROUP(2, stack
.currentFrame
->args
.instructionPtr
+ 1 + LINK_SIZE
, stack
.currentFrame
->args
.bracketChain
);
516 stack
.currentFrame
->args
.instructionPtr
+= getLinkValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
517 } while (*stack
.currentFrame
->args
.instructionPtr
== OP_ALT
);
518 DPRINTF(("bracket 0 failed\n"));
521 /* Skip over large extraction number data if encountered. */
523 BEGIN_OPCODE(BRANUMBER
):
524 stack
.currentFrame
->args
.instructionPtr
+= 3;
527 /* End of the pattern. */
530 md
.endMatchPtr
= stack
.currentFrame
->args
.subjectPtr
; /* Record where we ended */
531 md
.endOffsetTop
= stack
.currentFrame
->args
.offsetTop
; /* and how many extracts were taken */
535 /* Assertion brackets. Check the alternative branches in turn - the
536 matching won't pass the KET for an assertion. If any one branch matches,
537 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
538 start of each branch to move the current point backwards, so the code at
539 this level is identical to the lookahead case. */
541 BEGIN_OPCODE(ASSERT
):
543 RECURSIVE_MATCH_NEW_GROUP(6, stack
.currentFrame
->args
.instructionPtr
+ 1 + LINK_SIZE
, NULL
);
546 stack
.currentFrame
->args
.instructionPtr
+= getLinkValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
547 } while (*stack
.currentFrame
->args
.instructionPtr
== OP_ALT
);
548 if (*stack
.currentFrame
->args
.instructionPtr
== OP_KET
)
551 /* Continue from after the assertion, updating the offsets high water
552 mark, since extracts may have been taken during the assertion. */
554 advanceToEndOfBracket(stack
.currentFrame
->args
.instructionPtr
);
555 stack
.currentFrame
->args
.instructionPtr
+= 1 + LINK_SIZE
;
556 stack
.currentFrame
->args
.offsetTop
= md
.endOffsetTop
;
559 /* Negative assertion: all branches must fail to match */
561 BEGIN_OPCODE(ASSERT_NOT
):
563 RECURSIVE_MATCH_NEW_GROUP(7, stack
.currentFrame
->args
.instructionPtr
+ 1 + LINK_SIZE
, NULL
);
566 stack
.currentFrame
->args
.instructionPtr
+= getLinkValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
567 } while (*stack
.currentFrame
->args
.instructionPtr
== OP_ALT
);
569 stack
.currentFrame
->args
.instructionPtr
+= 1 + LINK_SIZE
;
572 /* An alternation is the end of a branch; scan along to find the end of the
573 bracketed group and go to there. */
576 advanceToEndOfBracket(stack
.currentFrame
->args
.instructionPtr
);
579 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
580 that it may occur zero times. It may repeat infinitely, or not at all -
581 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
582 repeat limits are compiled as a number of copies, with the optional ones
583 preceded by BRAZERO or BRAMINZERO. */
585 BEGIN_OPCODE(BRAZERO
): {
586 stack
.currentFrame
->locals
.startOfRepeatingBracket
= stack
.currentFrame
->args
.instructionPtr
+ 1;
587 RECURSIVE_MATCH_NEW_GROUP(14, stack
.currentFrame
->locals
.startOfRepeatingBracket
, stack
.currentFrame
->args
.bracketChain
);
590 advanceToEndOfBracket(stack
.currentFrame
->locals
.startOfRepeatingBracket
);
591 stack
.currentFrame
->args
.instructionPtr
= stack
.currentFrame
->locals
.startOfRepeatingBracket
+ 1 + LINK_SIZE
;
595 BEGIN_OPCODE(BRAMINZERO
): {
596 stack
.currentFrame
->locals
.startOfRepeatingBracket
= stack
.currentFrame
->args
.instructionPtr
+ 1;
597 advanceToEndOfBracket(stack
.currentFrame
->locals
.startOfRepeatingBracket
);
598 RECURSIVE_MATCH_NEW_GROUP(15, stack
.currentFrame
->locals
.startOfRepeatingBracket
+ 1 + LINK_SIZE
, stack
.currentFrame
->args
.bracketChain
);
601 stack
.currentFrame
->args
.instructionPtr
++;
605 /* End of a group, repeated or non-repeating. If we are at the end of
606 an assertion "group", stop matching and return 1, but record the
607 current high water mark for use by positive assertions. Do this also
608 for the "once" (not-backup up) groups. */
611 BEGIN_OPCODE(KETRMIN
):
612 BEGIN_OPCODE(KETRMAX
):
613 stack
.currentFrame
->locals
.instructionPtrAtStartOfOnce
= stack
.currentFrame
->args
.instructionPtr
- getLinkValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
614 stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
= stack
.currentFrame
->args
.bracketChain
->bracketStart
;
616 /* Back up the stack of bracket start pointers. */
618 stack
.currentFrame
->args
.bracketChain
= stack
.currentFrame
->args
.bracketChain
->previousBracket
;
620 if (*stack
.currentFrame
->locals
.instructionPtrAtStartOfOnce
== OP_ASSERT
|| *stack
.currentFrame
->locals
.instructionPtrAtStartOfOnce
== OP_ASSERT_NOT
) {
621 md
.endOffsetTop
= stack
.currentFrame
->args
.offsetTop
;
626 /* In all other cases except a conditional group we have to check the
627 group number back at the start and if necessary complete handling an
628 extraction by setting the offsets and bumping the high water mark. */
630 stack
.currentFrame
->locals
.number
= *stack
.currentFrame
->locals
.instructionPtrAtStartOfOnce
- OP_BRA
;
632 /* For extended extraction brackets (large number), we have to fish out
633 the number from a dummy opcode at the start. */
635 if (stack
.currentFrame
->locals
.number
> EXTRACT_BASIC_MAX
)
636 stack
.currentFrame
->locals
.number
= get2ByteValue(stack
.currentFrame
->locals
.instructionPtrAtStartOfOnce
+ 2 + LINK_SIZE
);
637 stack
.currentFrame
->locals
.offset
= stack
.currentFrame
->locals
.number
<< 1;
640 printf("end bracket %d", stack
.currentFrame
->locals
.number
);
644 /* Test for a numbered group. This includes groups called as a result
645 of recursion. Note that whole-pattern recursion is coded as a recurse
646 into group 0, so it won't be picked up here. Instead, we catch it when
647 the OP_END is reached. */
649 if (stack
.currentFrame
->locals
.number
> 0) {
650 if (stack
.currentFrame
->locals
.offset
>= md
.offsetMax
)
651 md
.offsetOverflow
= true;
653 md
.offsetVector
[stack
.currentFrame
->locals
.offset
] =
654 md
.offsetVector
[md
.offsetEnd
- stack
.currentFrame
->locals
.number
];
655 md
.offsetVector
[stack
.currentFrame
->locals
.offset
+1] = stack
.currentFrame
->args
.subjectPtr
- md
.startSubject
;
656 if (stack
.currentFrame
->args
.offsetTop
<= stack
.currentFrame
->locals
.offset
)
657 stack
.currentFrame
->args
.offsetTop
= stack
.currentFrame
->locals
.offset
+ 2;
661 /* For a non-repeating ket, just continue at this level. This also
662 happens for a repeating ket if no characters were matched in the group.
663 This is the forcible breaking of infinite loops as implemented in Perl
664 5.005. If there is an options reset, it will get obeyed in the normal
667 if (*stack
.currentFrame
->args
.instructionPtr
== OP_KET
|| stack
.currentFrame
->args
.subjectPtr
== stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
) {
668 stack
.currentFrame
->args
.instructionPtr
+= 1 + LINK_SIZE
;
672 /* The repeating kets try the rest of the pattern or restart from the
673 preceding bracket, in the appropriate order. */
675 if (*stack
.currentFrame
->args
.instructionPtr
== OP_KETRMIN
) {
676 RECURSIVE_MATCH(16, stack
.currentFrame
->args
.instructionPtr
+ 1 + LINK_SIZE
, stack
.currentFrame
->args
.bracketChain
);
679 RECURSIVE_MATCH_NEW_GROUP(17, stack
.currentFrame
->locals
.instructionPtrAtStartOfOnce
, stack
.currentFrame
->args
.bracketChain
);
682 } else { /* OP_KETRMAX */
683 RECURSIVE_MATCH_NEW_GROUP(18, stack
.currentFrame
->locals
.instructionPtrAtStartOfOnce
, stack
.currentFrame
->args
.bracketChain
);
686 RECURSIVE_MATCH(19, stack
.currentFrame
->args
.instructionPtr
+ 1 + LINK_SIZE
, stack
.currentFrame
->args
.bracketChain
);
692 /* Start of subject. */
695 if (stack
.currentFrame
->args
.subjectPtr
!= md
.startSubject
)
697 stack
.currentFrame
->args
.instructionPtr
++;
700 /* After internal newline if multiline. */
703 if (stack
.currentFrame
->args
.subjectPtr
!= md
.startSubject
&& !isNewline(stack
.currentFrame
->args
.subjectPtr
[-1]))
705 stack
.currentFrame
->args
.instructionPtr
++;
708 /* End of subject. */
711 if (stack
.currentFrame
->args
.subjectPtr
< md
.endSubject
)
713 stack
.currentFrame
->args
.instructionPtr
++;
716 /* Before internal newline if multiline. */
719 if (stack
.currentFrame
->args
.subjectPtr
< md
.endSubject
&& !isNewline(*stack
.currentFrame
->args
.subjectPtr
))
721 stack
.currentFrame
->args
.instructionPtr
++;
724 /* Word boundary assertions */
726 BEGIN_OPCODE(NOT_WORD_BOUNDARY
):
727 BEGIN_OPCODE(WORD_BOUNDARY
): {
728 bool currentCharIsWordChar
= false;
729 bool previousCharIsWordChar
= false;
731 if (stack
.currentFrame
->args
.subjectPtr
> md
.startSubject
)
732 previousCharIsWordChar
= isWordChar(stack
.currentFrame
->args
.subjectPtr
[-1]);
733 if (stack
.currentFrame
->args
.subjectPtr
< md
.endSubject
)
734 currentCharIsWordChar
= isWordChar(*stack
.currentFrame
->args
.subjectPtr
);
736 /* Now see if the situation is what we want */
737 bool wordBoundaryDesired
= (*stack
.currentFrame
->args
.instructionPtr
++ == OP_WORD_BOUNDARY
);
738 if (wordBoundaryDesired
? currentCharIsWordChar
== previousCharIsWordChar
: currentCharIsWordChar
!= previousCharIsWordChar
)
743 /* Match a single character type; inline for speed */
745 BEGIN_OPCODE(NOT_NEWLINE
):
746 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
748 if (isNewline(*stack
.currentFrame
->args
.subjectPtr
++))
750 stack
.currentFrame
->args
.instructionPtr
++;
753 BEGIN_OPCODE(NOT_DIGIT
):
754 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
756 if (isASCIIDigit(*stack
.currentFrame
->args
.subjectPtr
++))
758 stack
.currentFrame
->args
.instructionPtr
++;
762 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
764 if (!isASCIIDigit(*stack
.currentFrame
->args
.subjectPtr
++))
766 stack
.currentFrame
->args
.instructionPtr
++;
769 BEGIN_OPCODE(NOT_WHITESPACE
):
770 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
772 if (isSpaceChar(*stack
.currentFrame
->args
.subjectPtr
++))
774 stack
.currentFrame
->args
.instructionPtr
++;
777 BEGIN_OPCODE(WHITESPACE
):
778 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
780 if (!isSpaceChar(*stack
.currentFrame
->args
.subjectPtr
++))
782 stack
.currentFrame
->args
.instructionPtr
++;
785 BEGIN_OPCODE(NOT_WORDCHAR
):
786 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
788 if (isWordChar(*stack
.currentFrame
->args
.subjectPtr
++))
790 stack
.currentFrame
->args
.instructionPtr
++;
793 BEGIN_OPCODE(WORDCHAR
):
794 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
796 if (!isWordChar(*stack
.currentFrame
->args
.subjectPtr
++))
798 stack
.currentFrame
->args
.instructionPtr
++;
801 /* Match a back reference, possibly repeatedly. Look past the end of the
802 item to see if there is repeat information following. The code is similar
803 to that for character classes, but repeated for efficiency. Then obey
804 similar code to character type repeats - written out again for speed.
805 However, if the referenced string is the empty string, always treat
806 it as matched, any number of times (otherwise there could be infinite
810 stack
.currentFrame
->locals
.offset
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 1) << 1; /* Doubled ref number */
811 stack
.currentFrame
->args
.instructionPtr
+= 3; /* Advance past item */
813 /* If the reference is unset, set the length to be longer than the amount
814 of subject left; this ensures that every attempt at a match fails. We
815 can't just fail here, because of the possibility of quantifiers with zero
818 if (stack
.currentFrame
->locals
.offset
>= stack
.currentFrame
->args
.offsetTop
|| md
.offsetVector
[stack
.currentFrame
->locals
.offset
] < 0)
819 stack
.currentFrame
->locals
.length
= 0;
821 stack
.currentFrame
->locals
.length
= md
.offsetVector
[stack
.currentFrame
->locals
.offset
+1] - md
.offsetVector
[stack
.currentFrame
->locals
.offset
];
823 /* Set up for repetition, or handle the non-repeated case */
825 switch (*stack
.currentFrame
->args
.instructionPtr
) {
832 repeatInformationFromInstructionOffset(*stack
.currentFrame
->args
.instructionPtr
++ - OP_CRSTAR
, minimize
, min
, stack
.currentFrame
->locals
.max
);
837 minimize
= (*stack
.currentFrame
->args
.instructionPtr
== OP_CRMINRANGE
);
838 min
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
839 stack
.currentFrame
->locals
.max
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 3);
840 if (stack
.currentFrame
->locals
.max
== 0)
841 stack
.currentFrame
->locals
.max
= INT_MAX
;
842 stack
.currentFrame
->args
.instructionPtr
+= 5;
845 default: /* No repeat follows */
846 if (!matchRef(stack
.currentFrame
->locals
.offset
, stack
.currentFrame
->args
.subjectPtr
, stack
.currentFrame
->locals
.length
, md
))
848 stack
.currentFrame
->args
.subjectPtr
+= stack
.currentFrame
->locals
.length
;
852 /* If the length of the reference is zero, just continue with the
855 if (stack
.currentFrame
->locals
.length
== 0)
858 /* First, ensure the minimum number of matches are present. */
860 for (int i
= 1; i
<= min
; i
++) {
861 if (!matchRef(stack
.currentFrame
->locals
.offset
, stack
.currentFrame
->args
.subjectPtr
, stack
.currentFrame
->locals
.length
, md
))
863 stack
.currentFrame
->args
.subjectPtr
+= stack
.currentFrame
->locals
.length
;
866 /* If min = max, continue at the same level without recursion.
867 They are not both allowed to be zero. */
869 if (min
== stack
.currentFrame
->locals
.max
)
872 /* If minimizing, keep trying and advancing the pointer */
875 for (stack
.currentFrame
->locals
.fi
= min
;; stack
.currentFrame
->locals
.fi
++) {
876 RECURSIVE_MATCH(20, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
879 if (stack
.currentFrame
->locals
.fi
>= stack
.currentFrame
->locals
.max
|| !matchRef(stack
.currentFrame
->locals
.offset
, stack
.currentFrame
->args
.subjectPtr
, stack
.currentFrame
->locals
.length
, md
))
881 stack
.currentFrame
->args
.subjectPtr
+= stack
.currentFrame
->locals
.length
;
883 /* Control never reaches here */
886 /* If maximizing, find the longest string and work backwards */
889 stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
= stack
.currentFrame
->args
.subjectPtr
;
890 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
891 if (!matchRef(stack
.currentFrame
->locals
.offset
, stack
.currentFrame
->args
.subjectPtr
, stack
.currentFrame
->locals
.length
, md
))
893 stack
.currentFrame
->args
.subjectPtr
+= stack
.currentFrame
->locals
.length
;
895 while (stack
.currentFrame
->args
.subjectPtr
>= stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
) {
896 RECURSIVE_MATCH(21, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
899 stack
.currentFrame
->args
.subjectPtr
-= stack
.currentFrame
->locals
.length
;
903 /* Control never reaches here */
905 /* Match a bit-mapped character class, possibly repeatedly. This op code is
906 used when all the characters in the class have values in the range 0-255,
907 and either the matching is caseful, or the characters are in the range
908 0-127 when UTF-8 processing is enabled. The only difference between
909 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
912 First, look past the end of the item to see if there is repeat information
913 following. Then obey similar code to character type repeats - written out
916 BEGIN_OPCODE(NCLASS
):
918 stack
.currentFrame
->locals
.data
= stack
.currentFrame
->args
.instructionPtr
+ 1; /* Save for matching */
919 stack
.currentFrame
->args
.instructionPtr
+= 33; /* Advance past the item */
921 switch (*stack
.currentFrame
->args
.instructionPtr
) {
928 repeatInformationFromInstructionOffset(*stack
.currentFrame
->args
.instructionPtr
++ - OP_CRSTAR
, minimize
, min
, stack
.currentFrame
->locals
.max
);
933 minimize
= (*stack
.currentFrame
->args
.instructionPtr
== OP_CRMINRANGE
);
934 min
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
935 stack
.currentFrame
->locals
.max
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 3);
936 if (stack
.currentFrame
->locals
.max
== 0)
937 stack
.currentFrame
->locals
.max
= INT_MAX
;
938 stack
.currentFrame
->args
.instructionPtr
+= 5;
941 default: /* No repeat follows */
942 min
= stack
.currentFrame
->locals
.max
= 1;
946 /* First, ensure the minimum number of matches are present. */
948 for (int i
= 1; i
<= min
; i
++) {
949 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
951 int c
= *stack
.currentFrame
->args
.subjectPtr
++;
953 if (stack
.currentFrame
->locals
.data
[-1] == OP_CLASS
)
956 if (!(stack
.currentFrame
->locals
.data
[c
/ 8] & (1 << (c
& 7))))
961 /* If max == min we can continue with the main loop without the
964 if (min
== stack
.currentFrame
->locals
.max
)
967 /* If minimizing, keep testing the rest of the expression and advancing
968 the pointer while it matches the class. */
970 for (stack
.currentFrame
->locals
.fi
= min
;; stack
.currentFrame
->locals
.fi
++) {
971 RECURSIVE_MATCH(22, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
974 if (stack
.currentFrame
->locals
.fi
>= stack
.currentFrame
->locals
.max
|| stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
976 int c
= *stack
.currentFrame
->args
.subjectPtr
++;
978 if (stack
.currentFrame
->locals
.data
[-1] == OP_CLASS
)
981 if ((stack
.currentFrame
->locals
.data
[c
/8] & (1 << (c
&7))) == 0)
985 /* Control never reaches here */
987 /* If maximizing, find the longest possible run, then work backwards. */
989 stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
= stack
.currentFrame
->args
.subjectPtr
;
991 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
992 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
994 int c
= *stack
.currentFrame
->args
.subjectPtr
;
996 if (stack
.currentFrame
->locals
.data
[-1] == OP_CLASS
)
999 if (!(stack
.currentFrame
->locals
.data
[c
/ 8] & (1 << (c
& 7))))
1002 ++stack
.currentFrame
->args
.subjectPtr
;
1005 RECURSIVE_MATCH(24, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1008 if (stack
.currentFrame
->args
.subjectPtr
-- == stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
)
1009 break; /* Stop if tried at original pos */
1014 /* Control never reaches here */
1016 /* Match an extended character class. */
1018 BEGIN_OPCODE(XCLASS
):
1019 stack
.currentFrame
->locals
.data
= stack
.currentFrame
->args
.instructionPtr
+ 1 + LINK_SIZE
; /* Save for matching */
1020 stack
.currentFrame
->args
.instructionPtr
+= getLinkValue(stack
.currentFrame
->args
.instructionPtr
+ 1); /* Advance past the item */
1022 switch (*stack
.currentFrame
->args
.instructionPtr
) {
1029 repeatInformationFromInstructionOffset(*stack
.currentFrame
->args
.instructionPtr
++ - OP_CRSTAR
, minimize
, min
, stack
.currentFrame
->locals
.max
);
1034 minimize
= (*stack
.currentFrame
->args
.instructionPtr
== OP_CRMINRANGE
);
1035 min
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
1036 stack
.currentFrame
->locals
.max
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 3);
1037 if (stack
.currentFrame
->locals
.max
== 0)
1038 stack
.currentFrame
->locals
.max
= INT_MAX
;
1039 stack
.currentFrame
->args
.instructionPtr
+= 5;
1042 default: /* No repeat follows */
1043 min
= stack
.currentFrame
->locals
.max
= 1;
1046 /* First, ensure the minimum number of matches are present. */
1048 for (int i
= 1; i
<= min
; i
++) {
1049 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1051 int c
= *stack
.currentFrame
->args
.subjectPtr
++;
1052 if (!jsc_pcre_xclass(c
, stack
.currentFrame
->locals
.data
))
1056 /* If max == min we can continue with the main loop without the
1059 if (min
== stack
.currentFrame
->locals
.max
)
1062 /* If minimizing, keep testing the rest of the expression and advancing
1063 the pointer while it matches the class. */
1066 for (stack
.currentFrame
->locals
.fi
= min
;; stack
.currentFrame
->locals
.fi
++) {
1067 RECURSIVE_MATCH(26, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1070 if (stack
.currentFrame
->locals
.fi
>= stack
.currentFrame
->locals
.max
|| stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1072 int c
= *stack
.currentFrame
->args
.subjectPtr
++;
1073 if (!jsc_pcre_xclass(c
, stack
.currentFrame
->locals
.data
))
1076 /* Control never reaches here */
1079 /* If maximizing, find the longest possible run, then work backwards. */
1082 stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
= stack
.currentFrame
->args
.subjectPtr
;
1083 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1084 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1086 int c
= *stack
.currentFrame
->args
.subjectPtr
;
1087 if (!jsc_pcre_xclass(c
, stack
.currentFrame
->locals
.data
))
1089 ++stack
.currentFrame
->args
.subjectPtr
;
1092 RECURSIVE_MATCH(27, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1095 if (stack
.currentFrame
->args
.subjectPtr
-- == stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
)
1096 break; /* Stop if tried at original pos */
1101 /* Control never reaches here */
1103 /* Match a single character, casefully */
1106 stack
.currentFrame
->locals
.length
= 1;
1107 stack
.currentFrame
->args
.instructionPtr
++;
1108 getUTF8CharAndIncrementLength(stack
.currentFrame
->locals
.fc
, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->locals
.length
);
1109 stack
.currentFrame
->args
.instructionPtr
+= stack
.currentFrame
->locals
.length
;
1110 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1112 if (stack
.currentFrame
->locals
.fc
!= *stack
.currentFrame
->args
.subjectPtr
++)
1116 /* Match a single character, caselessly */
1118 BEGIN_OPCODE(CHAR_IGNORING_CASE
): {
1119 stack
.currentFrame
->locals
.length
= 1;
1120 stack
.currentFrame
->args
.instructionPtr
++;
1121 getUTF8CharAndIncrementLength(stack
.currentFrame
->locals
.fc
, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->locals
.length
);
1122 stack
.currentFrame
->args
.instructionPtr
+= stack
.currentFrame
->locals
.length
;
1123 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1125 int dc
= *stack
.currentFrame
->args
.subjectPtr
++;
1126 if (stack
.currentFrame
->locals
.fc
!= dc
&& jsc_pcre_ucp_othercase(stack
.currentFrame
->locals
.fc
) != dc
)
1131 /* Match a single ASCII character. */
1133 BEGIN_OPCODE(ASCII_CHAR
):
1134 if (md
.endSubject
== stack
.currentFrame
->args
.subjectPtr
)
1136 if (*stack
.currentFrame
->args
.subjectPtr
!= stack
.currentFrame
->args
.instructionPtr
[1])
1138 ++stack
.currentFrame
->args
.subjectPtr
;
1139 stack
.currentFrame
->args
.instructionPtr
+= 2;
1142 /* Match one of two cases of an ASCII letter. */
1144 BEGIN_OPCODE(ASCII_LETTER_IGNORING_CASE
):
1145 if (md
.endSubject
== stack
.currentFrame
->args
.subjectPtr
)
1147 if ((*stack
.currentFrame
->args
.subjectPtr
| 0x20) != stack
.currentFrame
->args
.instructionPtr
[1])
1149 ++stack
.currentFrame
->args
.subjectPtr
;
1150 stack
.currentFrame
->args
.instructionPtr
+= 2;
1153 /* Match a single character repeatedly; different opcodes share code. */
1155 BEGIN_OPCODE(EXACT
):
1156 min
= stack
.currentFrame
->locals
.max
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
1158 stack
.currentFrame
->args
.instructionPtr
+= 3;
1162 BEGIN_OPCODE(MINUPTO
):
1164 stack
.currentFrame
->locals
.max
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
1165 minimize
= *stack
.currentFrame
->args
.instructionPtr
== OP_MINUPTO
;
1166 stack
.currentFrame
->args
.instructionPtr
+= 3;
1170 BEGIN_OPCODE(MINSTAR
):
1172 BEGIN_OPCODE(MINPLUS
):
1173 BEGIN_OPCODE(QUERY
):
1174 BEGIN_OPCODE(MINQUERY
):
1175 repeatInformationFromInstructionOffset(*stack
.currentFrame
->args
.instructionPtr
++ - OP_STAR
, minimize
, min
, stack
.currentFrame
->locals
.max
);
1177 /* Common code for all repeated single-character matches. We can give
1178 up quickly if there are fewer than the minimum number of characters left in
1183 stack
.currentFrame
->locals
.length
= 1;
1184 getUTF8CharAndIncrementLength(stack
.currentFrame
->locals
.fc
, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->locals
.length
);
1185 if (min
* (stack
.currentFrame
->locals
.fc
> 0xFFFF ? 2 : 1) > md
.endSubject
- stack
.currentFrame
->args
.subjectPtr
)
1187 stack
.currentFrame
->args
.instructionPtr
+= stack
.currentFrame
->locals
.length
;
1189 if (stack
.currentFrame
->locals
.fc
<= 0xFFFF) {
1190 othercase
= md
.ignoreCase
? jsc_pcre_ucp_othercase(stack
.currentFrame
->locals
.fc
) : -1;
1192 for (int i
= 1; i
<= min
; i
++) {
1193 if (*stack
.currentFrame
->args
.subjectPtr
!= stack
.currentFrame
->locals
.fc
&& *stack
.currentFrame
->args
.subjectPtr
!= othercase
)
1195 ++stack
.currentFrame
->args
.subjectPtr
;
1198 if (min
== stack
.currentFrame
->locals
.max
)
1202 stack
.currentFrame
->locals
.repeatOthercase
= othercase
;
1203 for (stack
.currentFrame
->locals
.fi
= min
;; stack
.currentFrame
->locals
.fi
++) {
1204 RECURSIVE_MATCH(28, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1207 if (stack
.currentFrame
->locals
.fi
>= stack
.currentFrame
->locals
.max
|| stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1209 if (*stack
.currentFrame
->args
.subjectPtr
!= stack
.currentFrame
->locals
.fc
&& *stack
.currentFrame
->args
.subjectPtr
!= stack
.currentFrame
->locals
.repeatOthercase
)
1211 ++stack
.currentFrame
->args
.subjectPtr
;
1213 /* Control never reaches here */
1215 stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
= stack
.currentFrame
->args
.subjectPtr
;
1216 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1217 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1219 if (*stack
.currentFrame
->args
.subjectPtr
!= stack
.currentFrame
->locals
.fc
&& *stack
.currentFrame
->args
.subjectPtr
!= othercase
)
1221 ++stack
.currentFrame
->args
.subjectPtr
;
1223 while (stack
.currentFrame
->args
.subjectPtr
>= stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
) {
1224 RECURSIVE_MATCH(29, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1227 --stack
.currentFrame
->args
.subjectPtr
;
1231 /* Control never reaches here */
1233 /* No case on surrogate pairs, so no need to bother with "othercase". */
1235 for (int i
= 1; i
<= min
; i
++) {
1236 if (*stack
.currentFrame
->args
.subjectPtr
!= stack
.currentFrame
->locals
.fc
)
1238 stack
.currentFrame
->args
.subjectPtr
+= 2;
1241 if (min
== stack
.currentFrame
->locals
.max
)
1245 for (stack
.currentFrame
->locals
.fi
= min
;; stack
.currentFrame
->locals
.fi
++) {
1246 RECURSIVE_MATCH(30, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1249 if (stack
.currentFrame
->locals
.fi
>= stack
.currentFrame
->locals
.max
|| stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1251 if (*stack
.currentFrame
->args
.subjectPtr
!= stack
.currentFrame
->locals
.fc
)
1253 stack
.currentFrame
->args
.subjectPtr
+= 2;
1255 /* Control never reaches here */
1257 stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
= stack
.currentFrame
->args
.subjectPtr
;
1258 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1259 if (stack
.currentFrame
->args
.subjectPtr
> md
.endSubject
- 2)
1261 if (*stack
.currentFrame
->args
.subjectPtr
!= stack
.currentFrame
->locals
.fc
)
1263 stack
.currentFrame
->args
.subjectPtr
+= 2;
1265 while (stack
.currentFrame
->args
.subjectPtr
>= stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
) {
1266 RECURSIVE_MATCH(31, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1269 stack
.currentFrame
->args
.subjectPtr
-= 2;
1273 /* Control never reaches here */
1275 /* Control never reaches here */
1277 /* Match a negated single one-byte character. */
1279 BEGIN_OPCODE(NOT
): {
1280 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1282 int b
= stack
.currentFrame
->args
.instructionPtr
[1];
1283 int c
= *stack
.currentFrame
->args
.subjectPtr
++;
1284 stack
.currentFrame
->args
.instructionPtr
+= 2;
1285 if (md
.ignoreCase
) {
1288 if (toLowerCase(b
) == c
)
1297 /* Match a negated single one-byte character repeatedly. This is almost a
1298 repeat of the code for a repeated single character, but I haven't found a
1299 nice way of commoning these up that doesn't require a test of the
1300 positive/negative option for each character match. Maybe that wouldn't add
1301 very much to the time taken, but character matching *is* what this is all
1304 BEGIN_OPCODE(NOTEXACT
):
1305 min
= stack
.currentFrame
->locals
.max
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
1307 stack
.currentFrame
->args
.instructionPtr
+= 3;
1310 BEGIN_OPCODE(NOTUPTO
):
1311 BEGIN_OPCODE(NOTMINUPTO
):
1313 stack
.currentFrame
->locals
.max
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
1314 minimize
= *stack
.currentFrame
->args
.instructionPtr
== OP_NOTMINUPTO
;
1315 stack
.currentFrame
->args
.instructionPtr
+= 3;
1318 BEGIN_OPCODE(NOTSTAR
):
1319 BEGIN_OPCODE(NOTMINSTAR
):
1320 BEGIN_OPCODE(NOTPLUS
):
1321 BEGIN_OPCODE(NOTMINPLUS
):
1322 BEGIN_OPCODE(NOTQUERY
):
1323 BEGIN_OPCODE(NOTMINQUERY
):
1324 repeatInformationFromInstructionOffset(*stack
.currentFrame
->args
.instructionPtr
++ - OP_NOTSTAR
, minimize
, min
, stack
.currentFrame
->locals
.max
);
1326 /* Common code for all repeated single-byte matches. We can give up quickly
1327 if there are fewer than the minimum number of bytes left in the
1331 if (min
> md
.endSubject
- stack
.currentFrame
->args
.subjectPtr
)
1333 stack
.currentFrame
->locals
.fc
= *stack
.currentFrame
->args
.instructionPtr
++;
1335 /* The code is duplicated for the caseless and caseful cases, for speed,
1336 since matching characters is likely to be quite common. First, ensure the
1337 minimum number of matches are present. If min = max, continue at the same
1338 level without recursing. Otherwise, if minimizing, keep trying the rest of
1339 the expression and advancing one matching character if failing, up to the
1340 maximum. Alternatively, if maximizing, find the maximum number of
1341 characters and work backwards. */
1343 DPRINTF(("negative matching %c{%d,%d}\n", stack
.currentFrame
->locals
.fc
, min
, stack
.currentFrame
->locals
.max
));
1345 if (md
.ignoreCase
) {
1346 if (stack
.currentFrame
->locals
.fc
< 128)
1347 stack
.currentFrame
->locals
.fc
= toLowerCase(stack
.currentFrame
->locals
.fc
);
1349 for (int i
= 1; i
<= min
; i
++) {
1350 int d
= *stack
.currentFrame
->args
.subjectPtr
++;
1353 if (stack
.currentFrame
->locals
.fc
== d
)
1357 if (min
== stack
.currentFrame
->locals
.max
)
1361 for (stack
.currentFrame
->locals
.fi
= min
;; stack
.currentFrame
->locals
.fi
++) {
1362 RECURSIVE_MATCH(38, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1365 int d
= *stack
.currentFrame
->args
.subjectPtr
++;
1368 if (stack
.currentFrame
->locals
.fi
>= stack
.currentFrame
->locals
.max
|| stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
|| stack
.currentFrame
->locals
.fc
== d
)
1371 /* Control never reaches here */
1377 stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
= stack
.currentFrame
->args
.subjectPtr
;
1379 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1380 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1382 int d
= *stack
.currentFrame
->args
.subjectPtr
;
1385 if (stack
.currentFrame
->locals
.fc
== d
)
1387 ++stack
.currentFrame
->args
.subjectPtr
;
1390 RECURSIVE_MATCH(40, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1393 if (stack
.currentFrame
->args
.subjectPtr
-- == stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
)
1394 break; /* Stop if tried at original pos */
1399 /* Control never reaches here */
1402 /* Caseful comparisons */
1405 for (int i
= 1; i
<= min
; i
++) {
1406 int d
= *stack
.currentFrame
->args
.subjectPtr
++;
1407 if (stack
.currentFrame
->locals
.fc
== d
)
1411 if (min
== stack
.currentFrame
->locals
.max
)
1415 for (stack
.currentFrame
->locals
.fi
= min
;; stack
.currentFrame
->locals
.fi
++) {
1416 RECURSIVE_MATCH(42, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1419 int d
= *stack
.currentFrame
->args
.subjectPtr
++;
1420 if (stack
.currentFrame
->locals
.fi
>= stack
.currentFrame
->locals
.max
|| stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
|| stack
.currentFrame
->locals
.fc
== d
)
1423 /* Control never reaches here */
1429 stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
= stack
.currentFrame
->args
.subjectPtr
;
1431 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1432 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1434 int d
= *stack
.currentFrame
->args
.subjectPtr
;
1435 if (stack
.currentFrame
->locals
.fc
== d
)
1437 ++stack
.currentFrame
->args
.subjectPtr
;
1440 RECURSIVE_MATCH(44, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1443 if (stack
.currentFrame
->args
.subjectPtr
-- == stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
)
1444 break; /* Stop if tried at original pos */
1450 /* Control never reaches here */
1452 /* Match a single character type repeatedly; several different opcodes
1453 share code. This is very similar to the code for single characters, but we
1454 repeat it in the interests of efficiency. */
1456 BEGIN_OPCODE(TYPEEXACT
):
1457 min
= stack
.currentFrame
->locals
.max
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
1459 stack
.currentFrame
->args
.instructionPtr
+= 3;
1462 BEGIN_OPCODE(TYPEUPTO
):
1463 BEGIN_OPCODE(TYPEMINUPTO
):
1465 stack
.currentFrame
->locals
.max
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
1466 minimize
= *stack
.currentFrame
->args
.instructionPtr
== OP_TYPEMINUPTO
;
1467 stack
.currentFrame
->args
.instructionPtr
+= 3;
1470 BEGIN_OPCODE(TYPESTAR
):
1471 BEGIN_OPCODE(TYPEMINSTAR
):
1472 BEGIN_OPCODE(TYPEPLUS
):
1473 BEGIN_OPCODE(TYPEMINPLUS
):
1474 BEGIN_OPCODE(TYPEQUERY
):
1475 BEGIN_OPCODE(TYPEMINQUERY
):
1476 repeatInformationFromInstructionOffset(*stack
.currentFrame
->args
.instructionPtr
++ - OP_TYPESTAR
, minimize
, min
, stack
.currentFrame
->locals
.max
);
1478 /* Common code for all repeated single character type matches. Note that
1479 in UTF-8 mode, '.' matches a character of any length, but for the other
1480 character types, the valid characters are all one-byte long. */
1483 stack
.currentFrame
->locals
.ctype
= *stack
.currentFrame
->args
.instructionPtr
++; /* Code for the character type */
1485 /* First, ensure the minimum number of matches are present. Use inline
1486 code for maximizing the speed, and do the type test once at the start
1487 (i.e. keep it out of the loop). Also we can test that there are at least
1488 the minimum number of characters before we start. */
1490 if (min
> md
.endSubject
- stack
.currentFrame
->args
.subjectPtr
)
1493 switch (stack
.currentFrame
->locals
.ctype
) {
1494 case OP_NOT_NEWLINE
:
1495 for (int i
= 1; i
<= min
; i
++) {
1496 if (isNewline(*stack
.currentFrame
->args
.subjectPtr
))
1498 ++stack
.currentFrame
->args
.subjectPtr
;
1503 for (int i
= 1; i
<= min
; i
++) {
1504 if (isASCIIDigit(*stack
.currentFrame
->args
.subjectPtr
))
1506 ++stack
.currentFrame
->args
.subjectPtr
;
1511 for (int i
= 1; i
<= min
; i
++) {
1512 if (!isASCIIDigit(*stack
.currentFrame
->args
.subjectPtr
))
1514 ++stack
.currentFrame
->args
.subjectPtr
;
1518 case OP_NOT_WHITESPACE
:
1519 for (int i
= 1; i
<= min
; i
++) {
1520 if (isSpaceChar(*stack
.currentFrame
->args
.subjectPtr
))
1522 ++stack
.currentFrame
->args
.subjectPtr
;
1527 for (int i
= 1; i
<= min
; i
++) {
1528 if (!isSpaceChar(*stack
.currentFrame
->args
.subjectPtr
))
1530 ++stack
.currentFrame
->args
.subjectPtr
;
1534 case OP_NOT_WORDCHAR
:
1535 for (int i
= 1; i
<= min
; i
++) {
1536 if (isWordChar(*stack
.currentFrame
->args
.subjectPtr
))
1538 ++stack
.currentFrame
->args
.subjectPtr
;
1543 for (int i
= 1; i
<= min
; i
++) {
1544 if (!isWordChar(*stack
.currentFrame
->args
.subjectPtr
))
1546 ++stack
.currentFrame
->args
.subjectPtr
;
1551 ASSERT_NOT_REACHED();
1552 return matchError(JSRegExpErrorInternal
, stack
);
1553 } /* End switch(stack.currentFrame->locals.ctype) */
1556 /* If min = max, continue at the same level without recursing */
1558 if (min
== stack
.currentFrame
->locals
.max
)
1561 /* If minimizing, we have to test the rest of the pattern before each
1562 subsequent match. */
1565 for (stack
.currentFrame
->locals
.fi
= min
;; stack
.currentFrame
->locals
.fi
++) {
1566 RECURSIVE_MATCH(48, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1569 if (stack
.currentFrame
->locals
.fi
>= stack
.currentFrame
->locals
.max
|| stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1572 int c
= *stack
.currentFrame
->args
.subjectPtr
++;
1573 switch (stack
.currentFrame
->locals
.ctype
) {
1574 case OP_NOT_NEWLINE
:
1580 if (isASCIIDigit(c
))
1585 if (!isASCIIDigit(c
))
1589 case OP_NOT_WHITESPACE
:
1595 if (!isSpaceChar(c
))
1599 case OP_NOT_WORDCHAR
:
1610 ASSERT_NOT_REACHED();
1611 return matchError(JSRegExpErrorInternal
, stack
);
1614 /* Control never reaches here */
1617 /* If maximizing it is worth using inline code for speed, doing the type
1618 test once at the start (i.e. keep it out of the loop). */
1621 stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
= stack
.currentFrame
->args
.subjectPtr
; /* Remember where we started */
1623 switch (stack
.currentFrame
->locals
.ctype
) {
1624 case OP_NOT_NEWLINE
:
1625 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1626 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
|| isNewline(*stack
.currentFrame
->args
.subjectPtr
))
1628 stack
.currentFrame
->args
.subjectPtr
++;
1633 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1634 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1636 int c
= *stack
.currentFrame
->args
.subjectPtr
;
1637 if (isASCIIDigit(c
))
1639 ++stack
.currentFrame
->args
.subjectPtr
;
1644 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1645 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1647 int c
= *stack
.currentFrame
->args
.subjectPtr
;
1648 if (!isASCIIDigit(c
))
1650 ++stack
.currentFrame
->args
.subjectPtr
;
1654 case OP_NOT_WHITESPACE
:
1655 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1656 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1658 int c
= *stack
.currentFrame
->args
.subjectPtr
;
1661 ++stack
.currentFrame
->args
.subjectPtr
;
1666 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1667 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1669 int c
= *stack
.currentFrame
->args
.subjectPtr
;
1670 if (!isSpaceChar(c
))
1672 ++stack
.currentFrame
->args
.subjectPtr
;
1676 case OP_NOT_WORDCHAR
:
1677 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1678 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1680 int c
= *stack
.currentFrame
->args
.subjectPtr
;
1683 ++stack
.currentFrame
->args
.subjectPtr
;
1688 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1689 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1691 int c
= *stack
.currentFrame
->args
.subjectPtr
;
1694 ++stack
.currentFrame
->args
.subjectPtr
;
1699 ASSERT_NOT_REACHED();
1700 return matchError(JSRegExpErrorInternal
, stack
);
1703 /* stack.currentFrame->args.subjectPtr is now past the end of the maximum run */
1706 RECURSIVE_MATCH(52, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1709 if (stack
.currentFrame
->args
.subjectPtr
-- == stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
)
1710 break; /* Stop if tried at original pos */
1713 /* Get here if we can't make it match with any permitted repetitions */
1717 /* Control never reaches here */
1719 BEGIN_OPCODE(CRMINPLUS
):
1720 BEGIN_OPCODE(CRMINQUERY
):
1721 BEGIN_OPCODE(CRMINRANGE
):
1722 BEGIN_OPCODE(CRMINSTAR
):
1723 BEGIN_OPCODE(CRPLUS
):
1724 BEGIN_OPCODE(CRQUERY
):
1725 BEGIN_OPCODE(CRRANGE
):
1726 BEGIN_OPCODE(CRSTAR
):
1727 ASSERT_NOT_REACHED();
1728 return matchError(JSRegExpErrorInternal
, stack
);
1730 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
1735 /* Opening capturing bracket. If there is space in the offset vector, save
1736 the current subject position in the working slot at the top of the vector. We
1737 mustn't change the current values of the data slot, because they may be set
1738 from a previous iteration of this group, and be referred to by a reference
1741 If the bracket fails to match, we need to restore this value and also the
1742 values of the final offsets, in case they were set by a previous iteration of
1745 If there isn't enough space in the offset vector, treat this as if it were a
1746 non-capturing bracket. Don't worry about setting the flag for the error case
1747 here; that is handled in the code for KET. */
1749 ASSERT(*stack
.currentFrame
->args
.instructionPtr
> OP_BRA
);
1751 stack
.currentFrame
->locals
.number
= *stack
.currentFrame
->args
.instructionPtr
- OP_BRA
;
1753 /* For extended extraction brackets (large number), we have to fish out the
1754 number from a dummy opcode at the start. */
1756 if (stack
.currentFrame
->locals
.number
> EXTRACT_BASIC_MAX
)
1757 stack
.currentFrame
->locals
.number
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 2 + LINK_SIZE
);
1758 stack
.currentFrame
->locals
.offset
= stack
.currentFrame
->locals
.number
<< 1;
1761 printf("start bracket %d subject=", stack
.currentFrame
->locals
.number
);
1762 pchars(stack
.currentFrame
->args
.subjectPtr
, 16, true, md
);
1766 if (stack
.currentFrame
->locals
.offset
< md
.offsetMax
) {
1767 stack
.currentFrame
->locals
.saveOffset1
= md
.offsetVector
[stack
.currentFrame
->locals
.offset
];
1768 stack
.currentFrame
->locals
.saveOffset2
= md
.offsetVector
[stack
.currentFrame
->locals
.offset
+ 1];
1769 stack
.currentFrame
->locals
.saveOffset3
= md
.offsetVector
[md
.offsetEnd
- stack
.currentFrame
->locals
.number
];
1771 DPRINTF(("saving %d %d %d\n", stack
.currentFrame
->locals
.saveOffset1
, stack
.currentFrame
->locals
.saveOffset2
, stack
.currentFrame
->locals
.saveOffset3
));
1772 md
.offsetVector
[md
.offsetEnd
- stack
.currentFrame
->locals
.number
] = stack
.currentFrame
->args
.subjectPtr
- md
.startSubject
;
1775 RECURSIVE_MATCH_NEW_GROUP(1, stack
.currentFrame
->args
.instructionPtr
+ 1 + LINK_SIZE
, stack
.currentFrame
->args
.bracketChain
);
1778 stack
.currentFrame
->args
.instructionPtr
+= getLinkValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
1779 } while (*stack
.currentFrame
->args
.instructionPtr
== OP_ALT
);
1781 DPRINTF(("bracket %d failed\n", stack
.currentFrame
->locals
.number
));
1783 md
.offsetVector
[stack
.currentFrame
->locals
.offset
] = stack
.currentFrame
->locals
.saveOffset1
;
1784 md
.offsetVector
[stack
.currentFrame
->locals
.offset
+ 1] = stack
.currentFrame
->locals
.saveOffset2
;
1785 md
.offsetVector
[md
.offsetEnd
- stack
.currentFrame
->locals
.number
] = stack
.currentFrame
->locals
.saveOffset3
;
1790 /* Insufficient room for saving captured contents */
1792 goto NON_CAPTURING_BRACKET
;
1795 /* Do not stick any code in here without much thought; it is assumed
1796 that "continue" in the code above comes out to here to repeat the main
1799 } /* End of main loop */
1801 ASSERT_NOT_REACHED();
1803 #ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
1806 switch (stack
.currentFrame
->returnLocation
) {
1807 case 0: goto RETURN
;
1808 case 1: goto RRETURN_1
;
1809 case 2: goto RRETURN_2
;
1810 case 6: goto RRETURN_6
;
1811 case 7: goto RRETURN_7
;
1812 case 14: goto RRETURN_14
;
1813 case 15: goto RRETURN_15
;
1814 case 16: goto RRETURN_16
;
1815 case 17: goto RRETURN_17
;
1816 case 18: goto RRETURN_18
;
1817 case 19: goto RRETURN_19
;
1818 case 20: goto RRETURN_20
;
1819 case 21: goto RRETURN_21
;
1820 case 22: goto RRETURN_22
;
1821 case 24: goto RRETURN_24
;
1822 case 26: goto RRETURN_26
;
1823 case 27: goto RRETURN_27
;
1824 case 28: goto RRETURN_28
;
1825 case 29: goto RRETURN_29
;
1826 case 30: goto RRETURN_30
;
1827 case 31: goto RRETURN_31
;
1828 case 38: goto RRETURN_38
;
1829 case 40: goto RRETURN_40
;
1830 case 42: goto RRETURN_42
;
1831 case 44: goto RRETURN_44
;
1832 case 48: goto RRETURN_48
;
1833 case 52: goto RRETURN_52
;
1836 ASSERT_NOT_REACHED();
1837 return matchError(JSRegExpErrorInternal
, stack
);
1846 /*************************************************
1847 * Execute a Regular Expression *
1848 *************************************************/
1850 /* This function applies a compiled re to a subject string and picks out
1851 portions of the string if it matches. Two elements in the vector are set for
1852 each substring: the offsets to the start and end of the substring.
1855 re points to the compiled expression
1856 extra_data points to extra data or is NULL
1857 subject points to the subject string
1858 length length of subject string (may contain binary zeros)
1859 start_offset where to start in the subject string
1861 offsets points to a vector of ints to be filled in with offsets
1862 offsetCount the number of elements in the vector
1864 Returns: > 0 => success; value is the number of elements filled in
1865 = 0 => success, but offsets is not big enough
1866 -1 => failed to match
1867 < -1 => some kind of unexpected problem
1870 static void tryFirstByteOptimization(const UChar
*& subjectPtr
, const UChar
* endSubject
, int firstByte
, bool firstByteIsCaseless
, bool useMultiLineFirstCharOptimization
, const UChar
* originalSubjectStart
)
1872 // If firstByte is set, try scanning to the first instance of that byte
1873 // no need to try and match against any earlier part of the subject string.
1874 if (firstByte
>= 0) {
1875 UChar firstChar
= firstByte
;
1876 if (firstByteIsCaseless
)
1877 while (subjectPtr
< endSubject
) {
1878 int c
= *subjectPtr
;
1881 if (toLowerCase(c
) == firstChar
)
1886 while (subjectPtr
< endSubject
&& *subjectPtr
!= firstChar
)
1889 } else if (useMultiLineFirstCharOptimization
) {
1890 /* Or to just after \n for a multiline match if possible */
1891 // I'm not sure why this != originalSubjectStart check is necessary -- ecs 11/18/07
1892 if (subjectPtr
> originalSubjectStart
) {
1893 while (subjectPtr
< endSubject
&& !isNewline(subjectPtr
[-1]))
1899 static bool tryRequiredByteOptimization(const UChar
*& subjectPtr
, const UChar
* endSubject
, int reqByte
, int reqByte2
, bool reqByteIsCaseless
, bool hasFirstByte
, const UChar
*& reqBytePtr
)
1901 /* If reqByte is set, we know that that character must appear in the subject
1902 for the match to succeed. If the first character is set, reqByte must be
1903 later in the subject; otherwise the test starts at the match point. This
1904 optimization can save a huge amount of backtracking in patterns with nested
1905 unlimited repeats that aren't going to match. Writing separate code for
1906 cased/caseless versions makes it go faster, as does using an autoincrement
1907 and backing off on a match.
1909 HOWEVER: when the subject string is very, very long, searching to its end can
1910 take a long time, and give bad performance on quite ordinary patterns. This
1911 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
1912 don't do this when the string is sufficiently long.
1915 if (reqByte
>= 0 && endSubject
- subjectPtr
< REQ_BYTE_MAX
) {
1916 const UChar
* p
= subjectPtr
+ (hasFirstByte
? 1 : 0);
1918 /* We don't need to repeat the search if we haven't yet reached the
1919 place we found it at last time. */
1921 if (p
> reqBytePtr
) {
1922 if (reqByteIsCaseless
) {
1923 while (p
< endSubject
) {
1925 if (pp
== reqByte
|| pp
== reqByte2
) {
1931 while (p
< endSubject
) {
1932 if (*p
++ == reqByte
) {
1939 /* If we can't find the required character, break the matching loop */
1941 if (p
>= endSubject
)
1944 /* If we have found the required character, save the point where we
1945 found it, so that we don't search again next time round the loop if
1946 the start hasn't passed this character yet. */
1954 int jsRegExpExecute(const JSRegExp
* re
,
1955 const UChar
* subject
, int length
, int start_offset
, int* offsets
,
1959 ASSERT(subject
|| !length
);
1960 ASSERT(offsetCount
>= 0);
1961 ASSERT(offsets
|| offsetCount
== 0);
1963 HistogramTimeLogger
logger(re
);
1965 MatchData matchBlock
;
1966 matchBlock
.startSubject
= subject
;
1967 matchBlock
.endSubject
= matchBlock
.startSubject
+ length
;
1968 const UChar
* endSubject
= matchBlock
.endSubject
;
1970 matchBlock
.multiline
= (re
->options
& MatchAcrossMultipleLinesOption
);
1971 matchBlock
.ignoreCase
= (re
->options
& IgnoreCaseOption
);
1973 /* If the expression has got more back references than the offsets supplied can
1974 hold, we get a temporary chunk of working store to use during the matching.
1975 Otherwise, we can use the vector supplied, rounding down its size to a multiple
1978 int ocount
= offsetCount
- (offsetCount
% 3);
1980 // FIXME: This is lame that we have to second-guess our caller here.
1981 // The API should change to either fail-hard when we don't have enough offset space
1982 // or that we shouldn't ask our callers to pre-allocate in the first place.
1983 bool usingTemporaryOffsets
= false;
1984 if (re
->topBackref
> 0 && re
->topBackref
>= ocount
/3) {
1985 ocount
= re
->topBackref
* 3 + 3;
1986 matchBlock
.offsetVector
= new int[ocount
];
1987 if (!matchBlock
.offsetVector
)
1988 return JSRegExpErrorNoMemory
;
1989 usingTemporaryOffsets
= true;
1991 matchBlock
.offsetVector
= offsets
;
1993 matchBlock
.offsetEnd
= ocount
;
1994 matchBlock
.offsetMax
= (2*ocount
)/3;
1995 matchBlock
.offsetOverflow
= false;
1997 /* Compute the minimum number of offsets that we need to reset each time. Doing
1998 this makes a huge difference to execution time when there aren't many brackets
2001 int resetCount
= 2 + re
->topBracket
* 2;
2002 if (resetCount
> offsetCount
)
2003 resetCount
= ocount
;
2005 /* Reset the working variable associated with each extraction. These should
2006 never be used unless previously set, but they get saved and restored, and so we
2007 initialize them to avoid reading uninitialized locations. */
2009 if (matchBlock
.offsetVector
) {
2010 int* iptr
= matchBlock
.offsetVector
+ ocount
;
2011 int* iend
= iptr
- resetCount
/2 + 1;
2012 while (--iptr
>= iend
)
2016 /* Set up the first character to match, if available. The firstByte value is
2017 never set for an anchored regular expression, but the anchoring may be forced
2018 at run time, so we have to test for anchoring. The first char may be unset for
2019 an unanchored pattern, of course. If there's no first char and the pattern was
2020 studied, there may be a bitmap of possible first characters. */
2022 bool firstByteIsCaseless
= false;
2024 if (re
->options
& UseFirstByteOptimizationOption
) {
2025 firstByte
= re
->firstByte
& 255;
2026 if ((firstByteIsCaseless
= (re
->firstByte
& REQ_IGNORE_CASE
)))
2027 firstByte
= toLowerCase(firstByte
);
2030 /* For anchored or unanchored matches, there may be a "last known required
2033 bool reqByteIsCaseless
= false;
2036 if (re
->options
& UseRequiredByteOptimizationOption
) {
2037 reqByte
= re
->reqByte
& 255; // FIXME: This optimization could be made to work for UTF16 chars as well...
2038 reqByteIsCaseless
= (re
->reqByte
& REQ_IGNORE_CASE
);
2039 reqByte2
= flipCase(reqByte
);
2042 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
2043 the loop runs just once. */
2045 const UChar
* startMatch
= subject
+ start_offset
;
2046 const UChar
* reqBytePtr
= startMatch
- 1;
2047 bool useMultiLineFirstCharOptimization
= re
->options
& UseMultiLineFirstByteOptimizationOption
;
2050 /* Reset the maximum number of extractions we might see. */
2051 if (matchBlock
.offsetVector
) {
2052 int* iptr
= matchBlock
.offsetVector
;
2053 int* iend
= iptr
+ resetCount
;
2058 tryFirstByteOptimization(startMatch
, endSubject
, firstByte
, firstByteIsCaseless
, useMultiLineFirstCharOptimization
, matchBlock
.startSubject
+ start_offset
);
2059 if (tryRequiredByteOptimization(startMatch
, endSubject
, reqByte
, reqByte2
, reqByteIsCaseless
, firstByte
>= 0, reqBytePtr
))
2062 /* When a match occurs, substrings will be set for all internal extractions;
2063 we just need to set up the whole thing as substring 0 before returning. If
2064 there were too many extractions, set the return code to zero. In the case
2065 where we had to get some local store to hold offsets for backreferences, copy
2066 those back references that we can. In this case there need not be overflow
2067 if certain parts of the pattern were not used. */
2069 /* The code starts after the JSRegExp block and the capture name table. */
2070 const unsigned char* start_code
= (const unsigned char*)(re
+ 1);
2072 int returnCode
= match(startMatch
, start_code
, 2, matchBlock
);
2074 /* When the result is no match, advance the pointer to the next character
2076 if (returnCode
== 0) {
2081 if (returnCode
!= 1) {
2082 ASSERT(returnCode
== JSRegExpErrorHitLimit
|| returnCode
== JSRegExpErrorNoMemory
);
2083 DPRINTF((">>>> error: returning %d\n", returnCode
));
2087 /* We have a match! Copy the offset information from temporary store if
2090 if (usingTemporaryOffsets
) {
2091 if (offsetCount
>= 4) {
2092 memcpy(offsets
+ 2, matchBlock
.offsetVector
+ 2, (offsetCount
- 2) * sizeof(int));
2093 DPRINTF(("Copied offsets from temporary memory\n"));
2095 if (matchBlock
.endOffsetTop
> offsetCount
)
2096 matchBlock
.offsetOverflow
= true;
2098 DPRINTF(("Freeing temporary memory\n"));
2099 delete [] matchBlock
.offsetVector
;
2102 returnCode
= matchBlock
.offsetOverflow
? 0 : matchBlock
.endOffsetTop
/ 2;
2104 if (offsetCount
< 2)
2107 offsets
[0] = startMatch
- matchBlock
.startSubject
;
2108 offsets
[1] = matchBlock
.endMatchPtr
- matchBlock
.startSubject
;
2111 DPRINTF((">>>> returning %d\n", returnCode
));
2113 } while (!(re
->options
& IsAnchoredOption
) && startMatch
<= endSubject
);
2115 if (usingTemporaryOffsets
) {
2116 DPRINTF(("Freeing temporary memory\n"));
2117 delete [] matchBlock
.offsetVector
;
2120 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
2121 return JSRegExpErrorNoMatch
;
2124 #if REGEXP_HISTOGRAM
2126 class CompareHistogramEntries
{
2128 bool operator()(const pair
<UString
, double>& a
, const pair
<UString
, double>& b
)
2130 if (a
.second
== b
.second
)
2131 return a
.first
< b
.first
;
2132 return a
.second
< b
.second
;
2136 Histogram::~Histogram()
2138 Vector
<pair
<UString
, double> > values
;
2139 Map::iterator end
= times
.end();
2140 for (Map::iterator it
= times
.begin(); it
!= end
; ++it
)
2142 sort(values
.begin(), values
.end(), CompareHistogramEntries());
2143 size_t size
= values
.size();
2144 printf("Regular Expressions, sorted by time spent evaluating them:\n");
2145 for (size_t i
= 0; i
< size
; ++i
)
2146 printf(" %f - %s\n", values
[size
- i
- 1].second
, values
[size
- i
- 1].first
.UTF8String().c_str());
2149 void Histogram::add(const JSRegExp
* re
, double elapsedTime
)
2151 UString
string(reinterpret_cast<const UChar
*>(reinterpret_cast<const char*>(re
) + re
->stringOffset
), re
->stringLength
);
2152 if (re
->options
& IgnoreCaseOption
&& re
->options
& MatchAcrossMultipleLinesOption
)
2153 string
+= " (multi-line, ignore case)";
2155 if (re
->options
& IgnoreCaseOption
)
2156 string
+= " (ignore case)";
2157 if (re
->options
& MatchAcrossMultipleLinesOption
)
2158 string
+= " (multi-line)";
2160 pair
<Map::iterator
, bool> result
= times
.add(string
.rep(), elapsedTime
);
2162 result
.first
->second
+= elapsedTime
;
2165 HistogramTimeLogger::HistogramTimeLogger(const JSRegExp
* re
)
2167 , m_startTime(currentTimeMS())
2171 HistogramTimeLogger::~HistogramTimeLogger()
2173 static Histogram histogram
;
2174 histogram
.add(m_re
, currentTimeMS() - m_startTime
);