1 /* This is JavaScriptCore's variant of the PCRE library. While this library
2 started out as a copy of PCRE, many of the features of PCRE have been
3 removed. This library now supports only the regular expression features
4 required by the JavaScript language specification, and has only the functions
5 needed by JavaScriptCore and the rest of WebKit.
7 Originally written by Philip Hazel
8 Copyright (c) 1997-2006 University of Cambridge
9 Copyright (C) 2002, 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
10 Copyright (C) 2007 Eric Seidel <eric@webkit.org>
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
41 /* This module contains jsRegExpExecute(), the externally visible function
42 that does pattern matching using an NFA algorithm, following the rules from
43 the JavaScript specification. There are also some supporting functions. */
46 #include "pcre_internal.h"
49 #include <wtf/ASCIICType.h>
50 #include <wtf/Vector.h>
53 #include <parser/DateMath.h>
54 #include <runtime/UString.h>
60 #define USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
61 //#define USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
64 /* Avoid warnings on Windows. */
68 #ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
69 typedef int ReturnLocation
;
71 typedef void* ReturnLocation
;
76 class HistogramTimeLogger
{
78 HistogramTimeLogger(const JSRegExp
*) { }
88 void add(const JSRegExp
*, double);
91 typedef HashMap
<RefPtr
<UString::Rep
>, double> Map
;
95 class HistogramTimeLogger
{
97 HistogramTimeLogger(const JSRegExp
*);
98 ~HistogramTimeLogger();
101 const JSRegExp
* m_re
;
107 /* Structure for building a chain of data for holding the values of
108 the subject pointer at the start of each bracket, used to detect when
109 an empty string has been matched by a bracket to break infinite loops. */
110 struct BracketChainNode
{
111 BracketChainNode
* previousBracket
;
112 const UChar
* bracketStart
;
116 ReturnLocation returnLocation
;
117 struct MatchFrame
* previousFrame
;
119 /* Function arguments that may change */
121 const UChar
* subjectPtr
;
122 const unsigned char* instructionPtr
;
124 BracketChainNode
* bracketChain
;
128 /* PCRE uses "fake" recursion built off of gotos, thus
129 stack-based local variables are not safe to use. Instead we have to
130 store local variables on the current MatchFrame. */
132 const unsigned char* data
;
133 const unsigned char* startOfRepeatingBracket
;
134 const UChar
* subjectPtrAtStartOfInstruction
; // Several instrutions stash away a subjectPtr here for later compare
135 const unsigned char* instructionPtrAtStartOfOnce
;
150 BracketChainNode bracketChainNode
;
154 /* Structure for passing "static" information around between the functions
155 doing traditional NFA matching, so that they are thread-safe. */
158 int* offsetVector
; /* Offset vector */
159 int offsetEnd
; /* One past the end */
160 int offsetMax
; /* The maximum usable for return data */
161 bool offsetOverflow
; /* Set if too many extractions */
162 const UChar
* startSubject
; /* Start of the subject string */
163 const UChar
* endSubject
; /* End of the subject string */
164 const UChar
* endMatchPtr
; /* Subject position at end match */
165 int endOffsetTop
; /* Highwater mark at end of match */
170 /* The maximum remaining length of subject we are prepared to search for a
173 #define REQ_BYTE_MAX 1000
175 /* The below limit restricts the number of "recursive" match calls in order to
176 avoid spending exponential time on complex regular expressions. */
178 static const unsigned matchLimit
= 100000;
181 /*************************************************
182 * Debugging function to print chars *
183 *************************************************/
185 /* Print a sequence of chars in printable format, stopping at the end of the
186 subject if the requested.
189 p points to characters
190 length number to print
191 isSubject true if printing from within md.startSubject
192 md pointer to matching data block, if isSubject is true
195 static void pchars(const UChar
* p
, int length
, bool isSubject
, const MatchData
& md
)
197 if (isSubject
&& length
> md
.endSubject
- p
)
198 length
= md
.endSubject
- p
;
199 while (length
-- > 0) {
201 if (isprint(c
= *(p
++)))
204 printf("\\x%02x", c
);
206 printf("\\x{%x}", c
);
211 /*************************************************
212 * Match a back-reference *
213 *************************************************/
215 /* If a back reference hasn't been set, the length that is passed is greater
216 than the number of characters left in the string, so the match fails.
219 offset index into the offset vector
220 subjectPtr points into the subject
221 length length to be matched
222 md points to match data block
224 Returns: true if matched
227 static bool matchRef(int offset
, const UChar
* subjectPtr
, int length
, const MatchData
& md
)
229 const UChar
* p
= md
.startSubject
+ md
.offsetVector
[offset
];
232 if (subjectPtr
>= md
.endSubject
)
233 printf("matching subject <null>");
235 printf("matching subject ");
236 pchars(subjectPtr
, length
, true, md
);
238 printf(" against backref ");
239 pchars(p
, length
, false, md
);
243 /* Always fail if not enough characters left */
245 if (length
> md
.endSubject
- subjectPtr
)
248 /* Separate the caselesss case for speed */
251 while (length
-- > 0) {
253 int othercase
= jsc_pcre_ucp_othercase(c
);
254 UChar d
= *subjectPtr
++;
255 if (c
!= d
&& othercase
!= d
)
261 if (*p
++ != *subjectPtr
++)
268 #ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
270 /* Use numbered labels and switch statement at the bottom of the match function. */
272 #define RMATCH_WHERE(num) num
273 #define RRETURN_LABEL RRETURN_SWITCH
277 /* Use GCC's computed goto extension. */
279 /* For one test case this is more than 40% faster than the switch statement.
280 We could avoid the use of the num argument entirely by using local labels,
281 but using it for the GCC case as well as the non-GCC case allows us to share
282 a bit more code and notice if we use conflicting numbers.*/
284 #define RMATCH_WHERE(num) &&RRETURN_##num
285 #define RRETURN_LABEL *stack.currentFrame->returnLocation
289 #define RECURSIVE_MATCH_COMMON(num) \
292 stack.popCurrentFrame();
294 #define RECURSIVE_MATCH(num, ra, rb) \
296 stack.pushNewFrame((ra), (rb), RMATCH_WHERE(num)); \
297 RECURSIVE_MATCH_COMMON(num) \
300 #define RECURSIVE_MATCH_NEW_GROUP(num, ra, rb) \
302 stack.pushNewFrame((ra), (rb), RMATCH_WHERE(num)); \
303 startNewGroup(stack.currentFrame); \
304 RECURSIVE_MATCH_COMMON(num) \
307 #define RRETURN goto RRETURN_LABEL
309 #define RRETURN_NO_MATCH do { isMatch = false; RRETURN; } while (0)
311 /*************************************************
312 * Match from current position *
313 *************************************************/
315 /* On entry instructionPtr points to the first opcode, and subjectPtr to the first character
316 in the subject string, while substringStart holds the value of subjectPtr at the start of the
317 last bracketed group - used for breaking infinite loops matching zero-length
318 strings. This function is called recursively in many circumstances. Whenever it
319 returns a negative (error) response, the outer match() call must also return the
323 subjectPtr pointer in subject
324 instructionPtr position in code
325 offsetTop current top pointer
326 md pointer to "static" info for the match
328 Returns: 1 if matched ) these values are >= 0
329 0 if failed to match )
330 a negative error value if aborted by an error condition
331 (e.g. stopped by repeated call or recursion limit)
334 static const unsigned numFramesOnStack
= 16;
338 : framesEnd(frames
+ numFramesOnStack
)
339 , currentFrame(frames
)
340 , size(1) // match() creates accesses the first frame w/o calling pushNewFrame
342 ASSERT((sizeof(frames
) / sizeof(frames
[0])) == numFramesOnStack
);
345 MatchFrame frames
[numFramesOnStack
];
346 MatchFrame
* framesEnd
;
347 MatchFrame
* currentFrame
;
350 inline bool canUseStackBufferForNextFrame()
352 return size
< numFramesOnStack
;
355 inline MatchFrame
* allocateNextFrame()
357 if (canUseStackBufferForNextFrame())
358 return currentFrame
+ 1;
359 return new MatchFrame
;
362 inline void pushNewFrame(const unsigned char* instructionPtr
, BracketChainNode
* bracketChain
, ReturnLocation returnLocation
)
364 MatchFrame
* newframe
= allocateNextFrame();
365 newframe
->previousFrame
= currentFrame
;
367 newframe
->args
.subjectPtr
= currentFrame
->args
.subjectPtr
;
368 newframe
->args
.offsetTop
= currentFrame
->args
.offsetTop
;
369 newframe
->args
.instructionPtr
= instructionPtr
;
370 newframe
->args
.bracketChain
= bracketChain
;
371 newframe
->returnLocation
= returnLocation
;
374 currentFrame
= newframe
;
377 inline void popCurrentFrame()
379 MatchFrame
* oldFrame
= currentFrame
;
380 currentFrame
= currentFrame
->previousFrame
;
381 if (size
> numFramesOnStack
)
393 static int matchError(int errorCode
, MatchStack
& stack
)
395 stack
.popAllFrames();
399 /* Get the next UTF-8 character, not advancing the pointer, incrementing length
400 if there are extra bytes. This is called when we know we are in UTF-8 mode. */
402 static inline void getUTF8CharAndIncrementLength(int& c
, const unsigned char* subjectPtr
, int& len
)
405 if ((c
& 0xc0) == 0xc0) {
406 int gcaa
= jsc_pcre_utf8_table4
[c
& 0x3f]; /* Number of additional bytes */
408 c
= (c
& jsc_pcre_utf8_table3
[gcaa
]) << gcss
;
409 for (int gcii
= 1; gcii
<= gcaa
; gcii
++) {
411 c
|= (subjectPtr
[gcii
] & 0x3f) << gcss
;
417 static inline void startNewGroup(MatchFrame
* currentFrame
)
419 /* At the start of a bracketed group, add the current subject pointer to the
420 stack of such pointers, to be re-instated at the end of the group when we hit
421 the closing ket. When match() is called in other circumstances, we don't add to
424 currentFrame
->locals
.bracketChainNode
.previousBracket
= currentFrame
->args
.bracketChain
;
425 currentFrame
->locals
.bracketChainNode
.bracketStart
= currentFrame
->args
.subjectPtr
;
426 currentFrame
->args
.bracketChain
= ¤tFrame
->locals
.bracketChainNode
;
429 // FIXME: "minimize" means "not greedy", we should invert the callers to ask for "greedy" to be less confusing
430 static inline void repeatInformationFromInstructionOffset(short instructionOffset
, bool& minimize
, int& minimumRepeats
, int& maximumRepeats
)
432 // Instruction offsets are based off of OP_CRSTAR, OP_STAR, OP_TYPESTAR, OP_NOTSTAR
433 static const char minimumRepeatsFromInstructionOffset
[] = { 0, 0, 1, 1, 0, 0 };
434 static const int maximumRepeatsFromInstructionOffset
[] = { INT_MAX
, INT_MAX
, INT_MAX
, INT_MAX
, 1, 1 };
436 ASSERT(instructionOffset
>= 0);
437 ASSERT(instructionOffset
<= (OP_CRMINQUERY
- OP_CRSTAR
));
439 minimize
= (instructionOffset
& 1); // this assumes ordering: Instruction, MinimizeInstruction, Instruction2, MinimizeInstruction2
440 minimumRepeats
= minimumRepeatsFromInstructionOffset
[instructionOffset
];
441 maximumRepeats
= maximumRepeatsFromInstructionOffset
[instructionOffset
];
444 static int match(const UChar
* subjectPtr
, const unsigned char* instructionPtr
, int offsetTop
, MatchData
& md
)
446 bool isMatch
= false;
448 bool minimize
= false; /* Initialization not really needed, but some compilers think so. */
449 unsigned remainingMatchCount
= matchLimit
;
453 /* The opcode jump table. */
454 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
455 #define EMIT_JUMP_TABLE_ENTRY(opcode) &&LABEL_OP_##opcode,
456 static void* opcodeJumpTable
[256] = { FOR_EACH_OPCODE(EMIT_JUMP_TABLE_ENTRY
) };
457 #undef EMIT_JUMP_TABLE_ENTRY
460 /* One-time setup of the opcode jump table. */
461 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
462 for (int i
= 255; !opcodeJumpTable
[i
]; i
--)
463 opcodeJumpTable
[i
] = &&CAPTURING_BRACKET
;
466 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
467 // Shark shows this as a hot line
468 // Using a static const here makes this line disappear, but makes later access hotter (not sure why)
469 stack
.currentFrame
->returnLocation
= &&RETURN
;
471 stack
.currentFrame
->returnLocation
= 0;
473 stack
.currentFrame
->args
.subjectPtr
= subjectPtr
;
474 stack
.currentFrame
->args
.instructionPtr
= instructionPtr
;
475 stack
.currentFrame
->args
.offsetTop
= offsetTop
;
476 stack
.currentFrame
->args
.bracketChain
= 0;
477 startNewGroup(stack
.currentFrame
);
479 /* This is where control jumps back to to effect "recursion" */
482 if (!--remainingMatchCount
)
483 return matchError(JSRegExpErrorHitLimit
, stack
);
485 /* Now start processing the operations. */
487 #ifndef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
492 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
493 #define BEGIN_OPCODE(opcode) LABEL_OP_##opcode
494 #define NEXT_OPCODE goto *opcodeJumpTable[*stack.currentFrame->args.instructionPtr]
496 #define BEGIN_OPCODE(opcode) case OP_##opcode
497 #define NEXT_OPCODE continue
500 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
503 switch (*stack
.currentFrame
->args
.instructionPtr
)
506 /* Non-capturing bracket: optimized */
509 NON_CAPTURING_BRACKET
:
510 DPRINTF(("start bracket 0\n"));
512 RECURSIVE_MATCH_NEW_GROUP(2, stack
.currentFrame
->args
.instructionPtr
+ 1 + LINK_SIZE
, stack
.currentFrame
->args
.bracketChain
);
515 stack
.currentFrame
->args
.instructionPtr
+= getLinkValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
516 } while (*stack
.currentFrame
->args
.instructionPtr
== OP_ALT
);
517 DPRINTF(("bracket 0 failed\n"));
520 /* Skip over large extraction number data if encountered. */
522 BEGIN_OPCODE(BRANUMBER
):
523 stack
.currentFrame
->args
.instructionPtr
+= 3;
526 /* End of the pattern. */
529 md
.endMatchPtr
= stack
.currentFrame
->args
.subjectPtr
; /* Record where we ended */
530 md
.endOffsetTop
= stack
.currentFrame
->args
.offsetTop
; /* and how many extracts were taken */
534 /* Assertion brackets. Check the alternative branches in turn - the
535 matching won't pass the KET for an assertion. If any one branch matches,
536 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
537 start of each branch to move the current point backwards, so the code at
538 this level is identical to the lookahead case. */
540 BEGIN_OPCODE(ASSERT
):
542 RECURSIVE_MATCH_NEW_GROUP(6, stack
.currentFrame
->args
.instructionPtr
+ 1 + LINK_SIZE
, NULL
);
545 stack
.currentFrame
->args
.instructionPtr
+= getLinkValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
546 } while (*stack
.currentFrame
->args
.instructionPtr
== OP_ALT
);
547 if (*stack
.currentFrame
->args
.instructionPtr
== OP_KET
)
550 /* Continue from after the assertion, updating the offsets high water
551 mark, since extracts may have been taken during the assertion. */
553 advanceToEndOfBracket(stack
.currentFrame
->args
.instructionPtr
);
554 stack
.currentFrame
->args
.instructionPtr
+= 1 + LINK_SIZE
;
555 stack
.currentFrame
->args
.offsetTop
= md
.endOffsetTop
;
558 /* Negative assertion: all branches must fail to match */
560 BEGIN_OPCODE(ASSERT_NOT
):
562 RECURSIVE_MATCH_NEW_GROUP(7, stack
.currentFrame
->args
.instructionPtr
+ 1 + LINK_SIZE
, NULL
);
565 stack
.currentFrame
->args
.instructionPtr
+= getLinkValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
566 } while (*stack
.currentFrame
->args
.instructionPtr
== OP_ALT
);
568 stack
.currentFrame
->args
.instructionPtr
+= 1 + LINK_SIZE
;
571 /* An alternation is the end of a branch; scan along to find the end of the
572 bracketed group and go to there. */
575 advanceToEndOfBracket(stack
.currentFrame
->args
.instructionPtr
);
578 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
579 that it may occur zero times. It may repeat infinitely, or not at all -
580 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
581 repeat limits are compiled as a number of copies, with the optional ones
582 preceded by BRAZERO or BRAMINZERO. */
584 BEGIN_OPCODE(BRAZERO
): {
585 stack
.currentFrame
->locals
.startOfRepeatingBracket
= stack
.currentFrame
->args
.instructionPtr
+ 1;
586 RECURSIVE_MATCH_NEW_GROUP(14, stack
.currentFrame
->locals
.startOfRepeatingBracket
, stack
.currentFrame
->args
.bracketChain
);
589 advanceToEndOfBracket(stack
.currentFrame
->locals
.startOfRepeatingBracket
);
590 stack
.currentFrame
->args
.instructionPtr
= stack
.currentFrame
->locals
.startOfRepeatingBracket
+ 1 + LINK_SIZE
;
594 BEGIN_OPCODE(BRAMINZERO
): {
595 stack
.currentFrame
->locals
.startOfRepeatingBracket
= stack
.currentFrame
->args
.instructionPtr
+ 1;
596 advanceToEndOfBracket(stack
.currentFrame
->locals
.startOfRepeatingBracket
);
597 RECURSIVE_MATCH_NEW_GROUP(15, stack
.currentFrame
->locals
.startOfRepeatingBracket
+ 1 + LINK_SIZE
, stack
.currentFrame
->args
.bracketChain
);
600 stack
.currentFrame
->args
.instructionPtr
++;
604 /* End of a group, repeated or non-repeating. If we are at the end of
605 an assertion "group", stop matching and return 1, but record the
606 current high water mark for use by positive assertions. Do this also
607 for the "once" (not-backup up) groups. */
610 BEGIN_OPCODE(KETRMIN
):
611 BEGIN_OPCODE(KETRMAX
):
612 stack
.currentFrame
->locals
.instructionPtrAtStartOfOnce
= stack
.currentFrame
->args
.instructionPtr
- getLinkValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
613 stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
= stack
.currentFrame
->args
.bracketChain
->bracketStart
;
615 /* Back up the stack of bracket start pointers. */
617 stack
.currentFrame
->args
.bracketChain
= stack
.currentFrame
->args
.bracketChain
->previousBracket
;
619 if (*stack
.currentFrame
->locals
.instructionPtrAtStartOfOnce
== OP_ASSERT
|| *stack
.currentFrame
->locals
.instructionPtrAtStartOfOnce
== OP_ASSERT_NOT
) {
620 md
.endOffsetTop
= stack
.currentFrame
->args
.offsetTop
;
625 /* In all other cases except a conditional group we have to check the
626 group number back at the start and if necessary complete handling an
627 extraction by setting the offsets and bumping the high water mark. */
629 stack
.currentFrame
->locals
.number
= *stack
.currentFrame
->locals
.instructionPtrAtStartOfOnce
- OP_BRA
;
631 /* For extended extraction brackets (large number), we have to fish out
632 the number from a dummy opcode at the start. */
634 if (stack
.currentFrame
->locals
.number
> EXTRACT_BASIC_MAX
)
635 stack
.currentFrame
->locals
.number
= get2ByteValue(stack
.currentFrame
->locals
.instructionPtrAtStartOfOnce
+ 2 + LINK_SIZE
);
636 stack
.currentFrame
->locals
.offset
= stack
.currentFrame
->locals
.number
<< 1;
639 printf("end bracket %d", stack
.currentFrame
->locals
.number
);
643 /* Test for a numbered group. This includes groups called as a result
644 of recursion. Note that whole-pattern recursion is coded as a recurse
645 into group 0, so it won't be picked up here. Instead, we catch it when
646 the OP_END is reached. */
648 if (stack
.currentFrame
->locals
.number
> 0) {
649 if (stack
.currentFrame
->locals
.offset
>= md
.offsetMax
)
650 md
.offsetOverflow
= true;
652 md
.offsetVector
[stack
.currentFrame
->locals
.offset
] =
653 md
.offsetVector
[md
.offsetEnd
- stack
.currentFrame
->locals
.number
];
654 md
.offsetVector
[stack
.currentFrame
->locals
.offset
+1] = stack
.currentFrame
->args
.subjectPtr
- md
.startSubject
;
655 if (stack
.currentFrame
->args
.offsetTop
<= stack
.currentFrame
->locals
.offset
)
656 stack
.currentFrame
->args
.offsetTop
= stack
.currentFrame
->locals
.offset
+ 2;
660 /* For a non-repeating ket, just continue at this level. This also
661 happens for a repeating ket if no characters were matched in the group.
662 This is the forcible breaking of infinite loops as implemented in Perl
663 5.005. If there is an options reset, it will get obeyed in the normal
666 if (*stack
.currentFrame
->args
.instructionPtr
== OP_KET
|| stack
.currentFrame
->args
.subjectPtr
== stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
) {
667 stack
.currentFrame
->args
.instructionPtr
+= 1 + LINK_SIZE
;
671 /* The repeating kets try the rest of the pattern or restart from the
672 preceding bracket, in the appropriate order. */
674 if (*stack
.currentFrame
->args
.instructionPtr
== OP_KETRMIN
) {
675 RECURSIVE_MATCH(16, stack
.currentFrame
->args
.instructionPtr
+ 1 + LINK_SIZE
, stack
.currentFrame
->args
.bracketChain
);
678 RECURSIVE_MATCH_NEW_GROUP(17, stack
.currentFrame
->locals
.instructionPtrAtStartOfOnce
, stack
.currentFrame
->args
.bracketChain
);
681 } else { /* OP_KETRMAX */
682 RECURSIVE_MATCH_NEW_GROUP(18, stack
.currentFrame
->locals
.instructionPtrAtStartOfOnce
, stack
.currentFrame
->args
.bracketChain
);
685 RECURSIVE_MATCH(19, stack
.currentFrame
->args
.instructionPtr
+ 1 + LINK_SIZE
, stack
.currentFrame
->args
.bracketChain
);
691 /* Start of subject. */
694 if (stack
.currentFrame
->args
.subjectPtr
!= md
.startSubject
)
696 stack
.currentFrame
->args
.instructionPtr
++;
699 /* After internal newline if multiline. */
702 if (stack
.currentFrame
->args
.subjectPtr
!= md
.startSubject
&& !isNewline(stack
.currentFrame
->args
.subjectPtr
[-1]))
704 stack
.currentFrame
->args
.instructionPtr
++;
707 /* End of subject. */
710 if (stack
.currentFrame
->args
.subjectPtr
< md
.endSubject
)
712 stack
.currentFrame
->args
.instructionPtr
++;
715 /* Before internal newline if multiline. */
718 if (stack
.currentFrame
->args
.subjectPtr
< md
.endSubject
&& !isNewline(*stack
.currentFrame
->args
.subjectPtr
))
720 stack
.currentFrame
->args
.instructionPtr
++;
723 /* Word boundary assertions */
725 BEGIN_OPCODE(NOT_WORD_BOUNDARY
):
726 BEGIN_OPCODE(WORD_BOUNDARY
): {
727 bool currentCharIsWordChar
= false;
728 bool previousCharIsWordChar
= false;
730 if (stack
.currentFrame
->args
.subjectPtr
> md
.startSubject
)
731 previousCharIsWordChar
= isWordChar(stack
.currentFrame
->args
.subjectPtr
[-1]);
732 if (stack
.currentFrame
->args
.subjectPtr
< md
.endSubject
)
733 currentCharIsWordChar
= isWordChar(*stack
.currentFrame
->args
.subjectPtr
);
735 /* Now see if the situation is what we want */
736 bool wordBoundaryDesired
= (*stack
.currentFrame
->args
.instructionPtr
++ == OP_WORD_BOUNDARY
);
737 if (wordBoundaryDesired
? currentCharIsWordChar
== previousCharIsWordChar
: currentCharIsWordChar
!= previousCharIsWordChar
)
742 /* Match a single character type; inline for speed */
744 BEGIN_OPCODE(NOT_NEWLINE
):
745 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
747 if (isNewline(*stack
.currentFrame
->args
.subjectPtr
++))
749 stack
.currentFrame
->args
.instructionPtr
++;
752 BEGIN_OPCODE(NOT_DIGIT
):
753 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
755 if (isASCIIDigit(*stack
.currentFrame
->args
.subjectPtr
++))
757 stack
.currentFrame
->args
.instructionPtr
++;
761 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
763 if (!isASCIIDigit(*stack
.currentFrame
->args
.subjectPtr
++))
765 stack
.currentFrame
->args
.instructionPtr
++;
768 BEGIN_OPCODE(NOT_WHITESPACE
):
769 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
771 if (isSpaceChar(*stack
.currentFrame
->args
.subjectPtr
++))
773 stack
.currentFrame
->args
.instructionPtr
++;
776 BEGIN_OPCODE(WHITESPACE
):
777 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
779 if (!isSpaceChar(*stack
.currentFrame
->args
.subjectPtr
++))
781 stack
.currentFrame
->args
.instructionPtr
++;
784 BEGIN_OPCODE(NOT_WORDCHAR
):
785 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
787 if (isWordChar(*stack
.currentFrame
->args
.subjectPtr
++))
789 stack
.currentFrame
->args
.instructionPtr
++;
792 BEGIN_OPCODE(WORDCHAR
):
793 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
795 if (!isWordChar(*stack
.currentFrame
->args
.subjectPtr
++))
797 stack
.currentFrame
->args
.instructionPtr
++;
800 /* Match a back reference, possibly repeatedly. Look past the end of the
801 item to see if there is repeat information following. The code is similar
802 to that for character classes, but repeated for efficiency. Then obey
803 similar code to character type repeats - written out again for speed.
804 However, if the referenced string is the empty string, always treat
805 it as matched, any number of times (otherwise there could be infinite
809 stack
.currentFrame
->locals
.offset
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 1) << 1; /* Doubled ref number */
810 stack
.currentFrame
->args
.instructionPtr
+= 3; /* Advance past item */
812 /* If the reference is unset, set the length to be longer than the amount
813 of subject left; this ensures that every attempt at a match fails. We
814 can't just fail here, because of the possibility of quantifiers with zero
817 if (stack
.currentFrame
->locals
.offset
>= stack
.currentFrame
->args
.offsetTop
|| md
.offsetVector
[stack
.currentFrame
->locals
.offset
] < 0)
818 stack
.currentFrame
->locals
.length
= 0;
820 stack
.currentFrame
->locals
.length
= md
.offsetVector
[stack
.currentFrame
->locals
.offset
+1] - md
.offsetVector
[stack
.currentFrame
->locals
.offset
];
822 /* Set up for repetition, or handle the non-repeated case */
824 switch (*stack
.currentFrame
->args
.instructionPtr
) {
831 repeatInformationFromInstructionOffset(*stack
.currentFrame
->args
.instructionPtr
++ - OP_CRSTAR
, minimize
, min
, stack
.currentFrame
->locals
.max
);
836 minimize
= (*stack
.currentFrame
->args
.instructionPtr
== OP_CRMINRANGE
);
837 min
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
838 stack
.currentFrame
->locals
.max
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 3);
839 if (stack
.currentFrame
->locals
.max
== 0)
840 stack
.currentFrame
->locals
.max
= INT_MAX
;
841 stack
.currentFrame
->args
.instructionPtr
+= 5;
844 default: /* No repeat follows */
845 if (!matchRef(stack
.currentFrame
->locals
.offset
, stack
.currentFrame
->args
.subjectPtr
, stack
.currentFrame
->locals
.length
, md
))
847 stack
.currentFrame
->args
.subjectPtr
+= stack
.currentFrame
->locals
.length
;
851 /* If the length of the reference is zero, just continue with the
854 if (stack
.currentFrame
->locals
.length
== 0)
857 /* First, ensure the minimum number of matches are present. */
859 for (int i
= 1; i
<= min
; i
++) {
860 if (!matchRef(stack
.currentFrame
->locals
.offset
, stack
.currentFrame
->args
.subjectPtr
, stack
.currentFrame
->locals
.length
, md
))
862 stack
.currentFrame
->args
.subjectPtr
+= stack
.currentFrame
->locals
.length
;
865 /* If min = max, continue at the same level without recursion.
866 They are not both allowed to be zero. */
868 if (min
== stack
.currentFrame
->locals
.max
)
871 /* If minimizing, keep trying and advancing the pointer */
874 for (stack
.currentFrame
->locals
.fi
= min
;; stack
.currentFrame
->locals
.fi
++) {
875 RECURSIVE_MATCH(20, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
878 if (stack
.currentFrame
->locals
.fi
>= stack
.currentFrame
->locals
.max
|| !matchRef(stack
.currentFrame
->locals
.offset
, stack
.currentFrame
->args
.subjectPtr
, stack
.currentFrame
->locals
.length
, md
))
880 stack
.currentFrame
->args
.subjectPtr
+= stack
.currentFrame
->locals
.length
;
882 /* Control never reaches here */
885 /* If maximizing, find the longest string and work backwards */
888 stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
= stack
.currentFrame
->args
.subjectPtr
;
889 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
890 if (!matchRef(stack
.currentFrame
->locals
.offset
, stack
.currentFrame
->args
.subjectPtr
, stack
.currentFrame
->locals
.length
, md
))
892 stack
.currentFrame
->args
.subjectPtr
+= stack
.currentFrame
->locals
.length
;
894 while (stack
.currentFrame
->args
.subjectPtr
>= stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
) {
895 RECURSIVE_MATCH(21, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
898 stack
.currentFrame
->args
.subjectPtr
-= stack
.currentFrame
->locals
.length
;
902 /* Control never reaches here */
904 /* Match a bit-mapped character class, possibly repeatedly. This op code is
905 used when all the characters in the class have values in the range 0-255,
906 and either the matching is caseful, or the characters are in the range
907 0-127 when UTF-8 processing is enabled. The only difference between
908 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
911 First, look past the end of the item to see if there is repeat information
912 following. Then obey similar code to character type repeats - written out
915 BEGIN_OPCODE(NCLASS
):
917 stack
.currentFrame
->locals
.data
= stack
.currentFrame
->args
.instructionPtr
+ 1; /* Save for matching */
918 stack
.currentFrame
->args
.instructionPtr
+= 33; /* Advance past the item */
920 switch (*stack
.currentFrame
->args
.instructionPtr
) {
927 repeatInformationFromInstructionOffset(*stack
.currentFrame
->args
.instructionPtr
++ - OP_CRSTAR
, minimize
, min
, stack
.currentFrame
->locals
.max
);
932 minimize
= (*stack
.currentFrame
->args
.instructionPtr
== OP_CRMINRANGE
);
933 min
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
934 stack
.currentFrame
->locals
.max
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 3);
935 if (stack
.currentFrame
->locals
.max
== 0)
936 stack
.currentFrame
->locals
.max
= INT_MAX
;
937 stack
.currentFrame
->args
.instructionPtr
+= 5;
940 default: /* No repeat follows */
941 min
= stack
.currentFrame
->locals
.max
= 1;
945 /* First, ensure the minimum number of matches are present. */
947 for (int i
= 1; i
<= min
; i
++) {
948 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
950 int c
= *stack
.currentFrame
->args
.subjectPtr
++;
952 if (stack
.currentFrame
->locals
.data
[-1] == OP_CLASS
)
955 if (!(stack
.currentFrame
->locals
.data
[c
/ 8] & (1 << (c
& 7))))
960 /* If max == min we can continue with the main loop without the
963 if (min
== stack
.currentFrame
->locals
.max
)
966 /* If minimizing, keep testing the rest of the expression and advancing
967 the pointer while it matches the class. */
969 for (stack
.currentFrame
->locals
.fi
= min
;; stack
.currentFrame
->locals
.fi
++) {
970 RECURSIVE_MATCH(22, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
973 if (stack
.currentFrame
->locals
.fi
>= stack
.currentFrame
->locals
.max
|| stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
975 int c
= *stack
.currentFrame
->args
.subjectPtr
++;
977 if (stack
.currentFrame
->locals
.data
[-1] == OP_CLASS
)
980 if ((stack
.currentFrame
->locals
.data
[c
/8] & (1 << (c
&7))) == 0)
984 /* Control never reaches here */
986 /* If maximizing, find the longest possible run, then work backwards. */
988 stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
= stack
.currentFrame
->args
.subjectPtr
;
990 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
991 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
993 int c
= *stack
.currentFrame
->args
.subjectPtr
;
995 if (stack
.currentFrame
->locals
.data
[-1] == OP_CLASS
)
998 if (!(stack
.currentFrame
->locals
.data
[c
/ 8] & (1 << (c
& 7))))
1001 ++stack
.currentFrame
->args
.subjectPtr
;
1004 RECURSIVE_MATCH(24, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1007 if (stack
.currentFrame
->args
.subjectPtr
-- == stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
)
1008 break; /* Stop if tried at original pos */
1013 /* Control never reaches here */
1015 /* Match an extended character class. */
1017 BEGIN_OPCODE(XCLASS
):
1018 stack
.currentFrame
->locals
.data
= stack
.currentFrame
->args
.instructionPtr
+ 1 + LINK_SIZE
; /* Save for matching */
1019 stack
.currentFrame
->args
.instructionPtr
+= getLinkValue(stack
.currentFrame
->args
.instructionPtr
+ 1); /* Advance past the item */
1021 switch (*stack
.currentFrame
->args
.instructionPtr
) {
1028 repeatInformationFromInstructionOffset(*stack
.currentFrame
->args
.instructionPtr
++ - OP_CRSTAR
, minimize
, min
, stack
.currentFrame
->locals
.max
);
1033 minimize
= (*stack
.currentFrame
->args
.instructionPtr
== OP_CRMINRANGE
);
1034 min
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
1035 stack
.currentFrame
->locals
.max
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 3);
1036 if (stack
.currentFrame
->locals
.max
== 0)
1037 stack
.currentFrame
->locals
.max
= INT_MAX
;
1038 stack
.currentFrame
->args
.instructionPtr
+= 5;
1041 default: /* No repeat follows */
1042 min
= stack
.currentFrame
->locals
.max
= 1;
1045 /* First, ensure the minimum number of matches are present. */
1047 for (int i
= 1; i
<= min
; i
++) {
1048 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1050 int c
= *stack
.currentFrame
->args
.subjectPtr
++;
1051 if (!jsc_pcre_xclass(c
, stack
.currentFrame
->locals
.data
))
1055 /* If max == min we can continue with the main loop without the
1058 if (min
== stack
.currentFrame
->locals
.max
)
1061 /* If minimizing, keep testing the rest of the expression and advancing
1062 the pointer while it matches the class. */
1065 for (stack
.currentFrame
->locals
.fi
= min
;; stack
.currentFrame
->locals
.fi
++) {
1066 RECURSIVE_MATCH(26, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1069 if (stack
.currentFrame
->locals
.fi
>= stack
.currentFrame
->locals
.max
|| stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1071 int c
= *stack
.currentFrame
->args
.subjectPtr
++;
1072 if (!jsc_pcre_xclass(c
, stack
.currentFrame
->locals
.data
))
1075 /* Control never reaches here */
1078 /* If maximizing, find the longest possible run, then work backwards. */
1081 stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
= stack
.currentFrame
->args
.subjectPtr
;
1082 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1083 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1085 int c
= *stack
.currentFrame
->args
.subjectPtr
;
1086 if (!jsc_pcre_xclass(c
, stack
.currentFrame
->locals
.data
))
1088 ++stack
.currentFrame
->args
.subjectPtr
;
1091 RECURSIVE_MATCH(27, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1094 if (stack
.currentFrame
->args
.subjectPtr
-- == stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
)
1095 break; /* Stop if tried at original pos */
1100 /* Control never reaches here */
1102 /* Match a single character, casefully */
1105 stack
.currentFrame
->locals
.length
= 1;
1106 stack
.currentFrame
->args
.instructionPtr
++;
1107 getUTF8CharAndIncrementLength(stack
.currentFrame
->locals
.fc
, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->locals
.length
);
1108 stack
.currentFrame
->args
.instructionPtr
+= stack
.currentFrame
->locals
.length
;
1109 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1111 if (stack
.currentFrame
->locals
.fc
!= *stack
.currentFrame
->args
.subjectPtr
++)
1115 /* Match a single character, caselessly */
1117 BEGIN_OPCODE(CHAR_IGNORING_CASE
): {
1118 stack
.currentFrame
->locals
.length
= 1;
1119 stack
.currentFrame
->args
.instructionPtr
++;
1120 getUTF8CharAndIncrementLength(stack
.currentFrame
->locals
.fc
, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->locals
.length
);
1121 stack
.currentFrame
->args
.instructionPtr
+= stack
.currentFrame
->locals
.length
;
1122 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1124 int dc
= *stack
.currentFrame
->args
.subjectPtr
++;
1125 if (stack
.currentFrame
->locals
.fc
!= dc
&& jsc_pcre_ucp_othercase(stack
.currentFrame
->locals
.fc
) != dc
)
1130 /* Match a single ASCII character. */
1132 BEGIN_OPCODE(ASCII_CHAR
):
1133 if (md
.endSubject
== stack
.currentFrame
->args
.subjectPtr
)
1135 if (*stack
.currentFrame
->args
.subjectPtr
!= stack
.currentFrame
->args
.instructionPtr
[1])
1137 ++stack
.currentFrame
->args
.subjectPtr
;
1138 stack
.currentFrame
->args
.instructionPtr
+= 2;
1141 /* Match one of two cases of an ASCII letter. */
1143 BEGIN_OPCODE(ASCII_LETTER_IGNORING_CASE
):
1144 if (md
.endSubject
== stack
.currentFrame
->args
.subjectPtr
)
1146 if ((*stack
.currentFrame
->args
.subjectPtr
| 0x20) != stack
.currentFrame
->args
.instructionPtr
[1])
1148 ++stack
.currentFrame
->args
.subjectPtr
;
1149 stack
.currentFrame
->args
.instructionPtr
+= 2;
1152 /* Match a single character repeatedly; different opcodes share code. */
1154 BEGIN_OPCODE(EXACT
):
1155 min
= stack
.currentFrame
->locals
.max
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
1157 stack
.currentFrame
->args
.instructionPtr
+= 3;
1161 BEGIN_OPCODE(MINUPTO
):
1163 stack
.currentFrame
->locals
.max
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
1164 minimize
= *stack
.currentFrame
->args
.instructionPtr
== OP_MINUPTO
;
1165 stack
.currentFrame
->args
.instructionPtr
+= 3;
1169 BEGIN_OPCODE(MINSTAR
):
1171 BEGIN_OPCODE(MINPLUS
):
1172 BEGIN_OPCODE(QUERY
):
1173 BEGIN_OPCODE(MINQUERY
):
1174 repeatInformationFromInstructionOffset(*stack
.currentFrame
->args
.instructionPtr
++ - OP_STAR
, minimize
, min
, stack
.currentFrame
->locals
.max
);
1176 /* Common code for all repeated single-character matches. We can give
1177 up quickly if there are fewer than the minimum number of characters left in
1182 stack
.currentFrame
->locals
.length
= 1;
1183 getUTF8CharAndIncrementLength(stack
.currentFrame
->locals
.fc
, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->locals
.length
);
1184 if (min
* (stack
.currentFrame
->locals
.fc
> 0xFFFF ? 2 : 1) > md
.endSubject
- stack
.currentFrame
->args
.subjectPtr
)
1186 stack
.currentFrame
->args
.instructionPtr
+= stack
.currentFrame
->locals
.length
;
1188 if (stack
.currentFrame
->locals
.fc
<= 0xFFFF) {
1189 int othercase
= md
.ignoreCase
? jsc_pcre_ucp_othercase(stack
.currentFrame
->locals
.fc
) : -1;
1191 for (int i
= 1; i
<= min
; i
++) {
1192 if (*stack
.currentFrame
->args
.subjectPtr
!= stack
.currentFrame
->locals
.fc
&& *stack
.currentFrame
->args
.subjectPtr
!= othercase
)
1194 ++stack
.currentFrame
->args
.subjectPtr
;
1197 if (min
== stack
.currentFrame
->locals
.max
)
1201 stack
.currentFrame
->locals
.repeatOthercase
= othercase
;
1202 for (stack
.currentFrame
->locals
.fi
= min
;; stack
.currentFrame
->locals
.fi
++) {
1203 RECURSIVE_MATCH(28, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1206 if (stack
.currentFrame
->locals
.fi
>= stack
.currentFrame
->locals
.max
|| stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1208 if (*stack
.currentFrame
->args
.subjectPtr
!= stack
.currentFrame
->locals
.fc
&& *stack
.currentFrame
->args
.subjectPtr
!= stack
.currentFrame
->locals
.repeatOthercase
)
1210 ++stack
.currentFrame
->args
.subjectPtr
;
1212 /* Control never reaches here */
1214 stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
= stack
.currentFrame
->args
.subjectPtr
;
1215 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1216 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1218 if (*stack
.currentFrame
->args
.subjectPtr
!= stack
.currentFrame
->locals
.fc
&& *stack
.currentFrame
->args
.subjectPtr
!= othercase
)
1220 ++stack
.currentFrame
->args
.subjectPtr
;
1222 while (stack
.currentFrame
->args
.subjectPtr
>= stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
) {
1223 RECURSIVE_MATCH(29, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1226 --stack
.currentFrame
->args
.subjectPtr
;
1230 /* Control never reaches here */
1232 /* No case on surrogate pairs, so no need to bother with "othercase". */
1234 for (int i
= 1; i
<= min
; i
++) {
1235 if (*stack
.currentFrame
->args
.subjectPtr
!= stack
.currentFrame
->locals
.fc
)
1237 stack
.currentFrame
->args
.subjectPtr
+= 2;
1240 if (min
== stack
.currentFrame
->locals
.max
)
1244 for (stack
.currentFrame
->locals
.fi
= min
;; stack
.currentFrame
->locals
.fi
++) {
1245 RECURSIVE_MATCH(30, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1248 if (stack
.currentFrame
->locals
.fi
>= stack
.currentFrame
->locals
.max
|| stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1250 if (*stack
.currentFrame
->args
.subjectPtr
!= stack
.currentFrame
->locals
.fc
)
1252 stack
.currentFrame
->args
.subjectPtr
+= 2;
1254 /* Control never reaches here */
1256 stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
= stack
.currentFrame
->args
.subjectPtr
;
1257 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1258 if (stack
.currentFrame
->args
.subjectPtr
> md
.endSubject
- 2)
1260 if (*stack
.currentFrame
->args
.subjectPtr
!= stack
.currentFrame
->locals
.fc
)
1262 stack
.currentFrame
->args
.subjectPtr
+= 2;
1264 while (stack
.currentFrame
->args
.subjectPtr
>= stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
) {
1265 RECURSIVE_MATCH(31, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1268 stack
.currentFrame
->args
.subjectPtr
-= 2;
1272 /* Control never reaches here */
1274 /* Control never reaches here */
1276 /* Match a negated single one-byte character. */
1278 BEGIN_OPCODE(NOT
): {
1279 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1281 int b
= stack
.currentFrame
->args
.instructionPtr
[1];
1282 int c
= *stack
.currentFrame
->args
.subjectPtr
++;
1283 stack
.currentFrame
->args
.instructionPtr
+= 2;
1284 if (md
.ignoreCase
) {
1287 if (toLowerCase(b
) == c
)
1296 /* Match a negated single one-byte character repeatedly. This is almost a
1297 repeat of the code for a repeated single character, but I haven't found a
1298 nice way of commoning these up that doesn't require a test of the
1299 positive/negative option for each character match. Maybe that wouldn't add
1300 very much to the time taken, but character matching *is* what this is all
1303 BEGIN_OPCODE(NOTEXACT
):
1304 min
= stack
.currentFrame
->locals
.max
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
1306 stack
.currentFrame
->args
.instructionPtr
+= 3;
1309 BEGIN_OPCODE(NOTUPTO
):
1310 BEGIN_OPCODE(NOTMINUPTO
):
1312 stack
.currentFrame
->locals
.max
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
1313 minimize
= *stack
.currentFrame
->args
.instructionPtr
== OP_NOTMINUPTO
;
1314 stack
.currentFrame
->args
.instructionPtr
+= 3;
1317 BEGIN_OPCODE(NOTSTAR
):
1318 BEGIN_OPCODE(NOTMINSTAR
):
1319 BEGIN_OPCODE(NOTPLUS
):
1320 BEGIN_OPCODE(NOTMINPLUS
):
1321 BEGIN_OPCODE(NOTQUERY
):
1322 BEGIN_OPCODE(NOTMINQUERY
):
1323 repeatInformationFromInstructionOffset(*stack
.currentFrame
->args
.instructionPtr
++ - OP_NOTSTAR
, minimize
, min
, stack
.currentFrame
->locals
.max
);
1325 /* Common code for all repeated single-byte matches. We can give up quickly
1326 if there are fewer than the minimum number of bytes left in the
1330 if (min
> md
.endSubject
- stack
.currentFrame
->args
.subjectPtr
)
1332 stack
.currentFrame
->locals
.fc
= *stack
.currentFrame
->args
.instructionPtr
++;
1334 /* The code is duplicated for the caseless and caseful cases, for speed,
1335 since matching characters is likely to be quite common. First, ensure the
1336 minimum number of matches are present. If min = max, continue at the same
1337 level without recursing. Otherwise, if minimizing, keep trying the rest of
1338 the expression and advancing one matching character if failing, up to the
1339 maximum. Alternatively, if maximizing, find the maximum number of
1340 characters and work backwards. */
1342 DPRINTF(("negative matching %c{%d,%d}\n", stack
.currentFrame
->locals
.fc
, min
, stack
.currentFrame
->locals
.max
));
1344 if (md
.ignoreCase
) {
1345 if (stack
.currentFrame
->locals
.fc
< 128)
1346 stack
.currentFrame
->locals
.fc
= toLowerCase(stack
.currentFrame
->locals
.fc
);
1348 for (int i
= 1; i
<= min
; i
++) {
1349 int d
= *stack
.currentFrame
->args
.subjectPtr
++;
1352 if (stack
.currentFrame
->locals
.fc
== d
)
1356 if (min
== stack
.currentFrame
->locals
.max
)
1360 for (stack
.currentFrame
->locals
.fi
= min
;; stack
.currentFrame
->locals
.fi
++) {
1361 RECURSIVE_MATCH(38, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1364 int d
= *stack
.currentFrame
->args
.subjectPtr
++;
1367 if (stack
.currentFrame
->locals
.fi
>= stack
.currentFrame
->locals
.max
|| stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
|| stack
.currentFrame
->locals
.fc
== d
)
1370 /* Control never reaches here */
1376 stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
= stack
.currentFrame
->args
.subjectPtr
;
1378 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1379 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1381 int d
= *stack
.currentFrame
->args
.subjectPtr
;
1384 if (stack
.currentFrame
->locals
.fc
== d
)
1386 ++stack
.currentFrame
->args
.subjectPtr
;
1389 RECURSIVE_MATCH(40, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1392 if (stack
.currentFrame
->args
.subjectPtr
-- == stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
)
1393 break; /* Stop if tried at original pos */
1398 /* Control never reaches here */
1401 /* Caseful comparisons */
1404 for (int i
= 1; i
<= min
; i
++) {
1405 int d
= *stack
.currentFrame
->args
.subjectPtr
++;
1406 if (stack
.currentFrame
->locals
.fc
== d
)
1410 if (min
== stack
.currentFrame
->locals
.max
)
1414 for (stack
.currentFrame
->locals
.fi
= min
;; stack
.currentFrame
->locals
.fi
++) {
1415 RECURSIVE_MATCH(42, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1418 int d
= *stack
.currentFrame
->args
.subjectPtr
++;
1419 if (stack
.currentFrame
->locals
.fi
>= stack
.currentFrame
->locals
.max
|| stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
|| stack
.currentFrame
->locals
.fc
== d
)
1422 /* Control never reaches here */
1428 stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
= stack
.currentFrame
->args
.subjectPtr
;
1430 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1431 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1433 int d
= *stack
.currentFrame
->args
.subjectPtr
;
1434 if (stack
.currentFrame
->locals
.fc
== d
)
1436 ++stack
.currentFrame
->args
.subjectPtr
;
1439 RECURSIVE_MATCH(44, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1442 if (stack
.currentFrame
->args
.subjectPtr
-- == stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
)
1443 break; /* Stop if tried at original pos */
1449 /* Control never reaches here */
1451 /* Match a single character type repeatedly; several different opcodes
1452 share code. This is very similar to the code for single characters, but we
1453 repeat it in the interests of efficiency. */
1455 BEGIN_OPCODE(TYPEEXACT
):
1456 min
= stack
.currentFrame
->locals
.max
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
1458 stack
.currentFrame
->args
.instructionPtr
+= 3;
1461 BEGIN_OPCODE(TYPEUPTO
):
1462 BEGIN_OPCODE(TYPEMINUPTO
):
1464 stack
.currentFrame
->locals
.max
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
1465 minimize
= *stack
.currentFrame
->args
.instructionPtr
== OP_TYPEMINUPTO
;
1466 stack
.currentFrame
->args
.instructionPtr
+= 3;
1469 BEGIN_OPCODE(TYPESTAR
):
1470 BEGIN_OPCODE(TYPEMINSTAR
):
1471 BEGIN_OPCODE(TYPEPLUS
):
1472 BEGIN_OPCODE(TYPEMINPLUS
):
1473 BEGIN_OPCODE(TYPEQUERY
):
1474 BEGIN_OPCODE(TYPEMINQUERY
):
1475 repeatInformationFromInstructionOffset(*stack
.currentFrame
->args
.instructionPtr
++ - OP_TYPESTAR
, minimize
, min
, stack
.currentFrame
->locals
.max
);
1477 /* Common code for all repeated single character type matches. Note that
1478 in UTF-8 mode, '.' matches a character of any length, but for the other
1479 character types, the valid characters are all one-byte long. */
1482 stack
.currentFrame
->locals
.ctype
= *stack
.currentFrame
->args
.instructionPtr
++; /* Code for the character type */
1484 /* First, ensure the minimum number of matches are present. Use inline
1485 code for maximizing the speed, and do the type test once at the start
1486 (i.e. keep it out of the loop). Also we can test that there are at least
1487 the minimum number of characters before we start. */
1489 if (min
> md
.endSubject
- stack
.currentFrame
->args
.subjectPtr
)
1492 switch (stack
.currentFrame
->locals
.ctype
) {
1493 case OP_NOT_NEWLINE
:
1494 for (int i
= 1; i
<= min
; i
++) {
1495 if (isNewline(*stack
.currentFrame
->args
.subjectPtr
))
1497 ++stack
.currentFrame
->args
.subjectPtr
;
1502 for (int i
= 1; i
<= min
; i
++) {
1503 if (isASCIIDigit(*stack
.currentFrame
->args
.subjectPtr
))
1505 ++stack
.currentFrame
->args
.subjectPtr
;
1510 for (int i
= 1; i
<= min
; i
++) {
1511 if (!isASCIIDigit(*stack
.currentFrame
->args
.subjectPtr
))
1513 ++stack
.currentFrame
->args
.subjectPtr
;
1517 case OP_NOT_WHITESPACE
:
1518 for (int i
= 1; i
<= min
; i
++) {
1519 if (isSpaceChar(*stack
.currentFrame
->args
.subjectPtr
))
1521 ++stack
.currentFrame
->args
.subjectPtr
;
1526 for (int i
= 1; i
<= min
; i
++) {
1527 if (!isSpaceChar(*stack
.currentFrame
->args
.subjectPtr
))
1529 ++stack
.currentFrame
->args
.subjectPtr
;
1533 case OP_NOT_WORDCHAR
:
1534 for (int i
= 1; i
<= min
; i
++) {
1535 if (isWordChar(*stack
.currentFrame
->args
.subjectPtr
))
1537 ++stack
.currentFrame
->args
.subjectPtr
;
1542 for (int i
= 1; i
<= min
; i
++) {
1543 if (!isWordChar(*stack
.currentFrame
->args
.subjectPtr
))
1545 ++stack
.currentFrame
->args
.subjectPtr
;
1550 ASSERT_NOT_REACHED();
1551 return matchError(JSRegExpErrorInternal
, stack
);
1552 } /* End switch(stack.currentFrame->locals.ctype) */
1555 /* If min = max, continue at the same level without recursing */
1557 if (min
== stack
.currentFrame
->locals
.max
)
1560 /* If minimizing, we have to test the rest of the pattern before each
1561 subsequent match. */
1564 for (stack
.currentFrame
->locals
.fi
= min
;; stack
.currentFrame
->locals
.fi
++) {
1565 RECURSIVE_MATCH(48, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1568 if (stack
.currentFrame
->locals
.fi
>= stack
.currentFrame
->locals
.max
|| stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1571 int c
= *stack
.currentFrame
->args
.subjectPtr
++;
1572 switch (stack
.currentFrame
->locals
.ctype
) {
1573 case OP_NOT_NEWLINE
:
1579 if (isASCIIDigit(c
))
1584 if (!isASCIIDigit(c
))
1588 case OP_NOT_WHITESPACE
:
1594 if (!isSpaceChar(c
))
1598 case OP_NOT_WORDCHAR
:
1609 ASSERT_NOT_REACHED();
1610 return matchError(JSRegExpErrorInternal
, stack
);
1613 /* Control never reaches here */
1616 /* If maximizing it is worth using inline code for speed, doing the type
1617 test once at the start (i.e. keep it out of the loop). */
1620 stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
= stack
.currentFrame
->args
.subjectPtr
; /* Remember where we started */
1622 switch (stack
.currentFrame
->locals
.ctype
) {
1623 case OP_NOT_NEWLINE
:
1624 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1625 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
|| isNewline(*stack
.currentFrame
->args
.subjectPtr
))
1627 stack
.currentFrame
->args
.subjectPtr
++;
1632 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1633 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1635 int c
= *stack
.currentFrame
->args
.subjectPtr
;
1636 if (isASCIIDigit(c
))
1638 ++stack
.currentFrame
->args
.subjectPtr
;
1643 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1644 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1646 int c
= *stack
.currentFrame
->args
.subjectPtr
;
1647 if (!isASCIIDigit(c
))
1649 ++stack
.currentFrame
->args
.subjectPtr
;
1653 case OP_NOT_WHITESPACE
:
1654 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1655 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1657 int c
= *stack
.currentFrame
->args
.subjectPtr
;
1660 ++stack
.currentFrame
->args
.subjectPtr
;
1665 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1666 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1668 int c
= *stack
.currentFrame
->args
.subjectPtr
;
1669 if (!isSpaceChar(c
))
1671 ++stack
.currentFrame
->args
.subjectPtr
;
1675 case OP_NOT_WORDCHAR
:
1676 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1677 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1679 int c
= *stack
.currentFrame
->args
.subjectPtr
;
1682 ++stack
.currentFrame
->args
.subjectPtr
;
1687 for (int i
= min
; i
< stack
.currentFrame
->locals
.max
; i
++) {
1688 if (stack
.currentFrame
->args
.subjectPtr
>= md
.endSubject
)
1690 int c
= *stack
.currentFrame
->args
.subjectPtr
;
1693 ++stack
.currentFrame
->args
.subjectPtr
;
1698 ASSERT_NOT_REACHED();
1699 return matchError(JSRegExpErrorInternal
, stack
);
1702 /* stack.currentFrame->args.subjectPtr is now past the end of the maximum run */
1705 RECURSIVE_MATCH(52, stack
.currentFrame
->args
.instructionPtr
, stack
.currentFrame
->args
.bracketChain
);
1708 if (stack
.currentFrame
->args
.subjectPtr
-- == stack
.currentFrame
->locals
.subjectPtrAtStartOfInstruction
)
1709 break; /* Stop if tried at original pos */
1712 /* Get here if we can't make it match with any permitted repetitions */
1716 /* Control never reaches here */
1718 BEGIN_OPCODE(CRMINPLUS
):
1719 BEGIN_OPCODE(CRMINQUERY
):
1720 BEGIN_OPCODE(CRMINRANGE
):
1721 BEGIN_OPCODE(CRMINSTAR
):
1722 BEGIN_OPCODE(CRPLUS
):
1723 BEGIN_OPCODE(CRQUERY
):
1724 BEGIN_OPCODE(CRRANGE
):
1725 BEGIN_OPCODE(CRSTAR
):
1726 ASSERT_NOT_REACHED();
1727 return matchError(JSRegExpErrorInternal
, stack
);
1729 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
1734 /* Opening capturing bracket. If there is space in the offset vector, save
1735 the current subject position in the working slot at the top of the vector. We
1736 mustn't change the current values of the data slot, because they may be set
1737 from a previous iteration of this group, and be referred to by a reference
1740 If the bracket fails to match, we need to restore this value and also the
1741 values of the final offsets, in case they were set by a previous iteration of
1744 If there isn't enough space in the offset vector, treat this as if it were a
1745 non-capturing bracket. Don't worry about setting the flag for the error case
1746 here; that is handled in the code for KET. */
1748 ASSERT(*stack
.currentFrame
->args
.instructionPtr
> OP_BRA
);
1750 stack
.currentFrame
->locals
.number
= *stack
.currentFrame
->args
.instructionPtr
- OP_BRA
;
1752 /* For extended extraction brackets (large number), we have to fish out the
1753 number from a dummy opcode at the start. */
1755 if (stack
.currentFrame
->locals
.number
> EXTRACT_BASIC_MAX
)
1756 stack
.currentFrame
->locals
.number
= get2ByteValue(stack
.currentFrame
->args
.instructionPtr
+ 2 + LINK_SIZE
);
1757 stack
.currentFrame
->locals
.offset
= stack
.currentFrame
->locals
.number
<< 1;
1760 printf("start bracket %d subject=", stack
.currentFrame
->locals
.number
);
1761 pchars(stack
.currentFrame
->args
.subjectPtr
, 16, true, md
);
1765 if (stack
.currentFrame
->locals
.offset
< md
.offsetMax
) {
1766 stack
.currentFrame
->locals
.saveOffset1
= md
.offsetVector
[stack
.currentFrame
->locals
.offset
];
1767 stack
.currentFrame
->locals
.saveOffset2
= md
.offsetVector
[stack
.currentFrame
->locals
.offset
+ 1];
1768 stack
.currentFrame
->locals
.saveOffset3
= md
.offsetVector
[md
.offsetEnd
- stack
.currentFrame
->locals
.number
];
1770 DPRINTF(("saving %d %d %d\n", stack
.currentFrame
->locals
.saveOffset1
, stack
.currentFrame
->locals
.saveOffset2
, stack
.currentFrame
->locals
.saveOffset3
));
1771 md
.offsetVector
[md
.offsetEnd
- stack
.currentFrame
->locals
.number
] = stack
.currentFrame
->args
.subjectPtr
- md
.startSubject
;
1774 RECURSIVE_MATCH_NEW_GROUP(1, stack
.currentFrame
->args
.instructionPtr
+ 1 + LINK_SIZE
, stack
.currentFrame
->args
.bracketChain
);
1777 stack
.currentFrame
->args
.instructionPtr
+= getLinkValue(stack
.currentFrame
->args
.instructionPtr
+ 1);
1778 } while (*stack
.currentFrame
->args
.instructionPtr
== OP_ALT
);
1780 DPRINTF(("bracket %d failed\n", stack
.currentFrame
->locals
.number
));
1782 md
.offsetVector
[stack
.currentFrame
->locals
.offset
] = stack
.currentFrame
->locals
.saveOffset1
;
1783 md
.offsetVector
[stack
.currentFrame
->locals
.offset
+ 1] = stack
.currentFrame
->locals
.saveOffset2
;
1784 md
.offsetVector
[md
.offsetEnd
- stack
.currentFrame
->locals
.number
] = stack
.currentFrame
->locals
.saveOffset3
;
1789 /* Insufficient room for saving captured contents */
1791 goto NON_CAPTURING_BRACKET
;
1794 /* Do not stick any code in here without much thought; it is assumed
1795 that "continue" in the code above comes out to here to repeat the main
1798 } /* End of main loop */
1800 ASSERT_NOT_REACHED();
1802 #ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
1805 switch (stack
.currentFrame
->returnLocation
) {
1806 case 0: goto RETURN
;
1807 case 1: goto RRETURN_1
;
1808 case 2: goto RRETURN_2
;
1809 case 6: goto RRETURN_6
;
1810 case 7: goto RRETURN_7
;
1811 case 14: goto RRETURN_14
;
1812 case 15: goto RRETURN_15
;
1813 case 16: goto RRETURN_16
;
1814 case 17: goto RRETURN_17
;
1815 case 18: goto RRETURN_18
;
1816 case 19: goto RRETURN_19
;
1817 case 20: goto RRETURN_20
;
1818 case 21: goto RRETURN_21
;
1819 case 22: goto RRETURN_22
;
1820 case 24: goto RRETURN_24
;
1821 case 26: goto RRETURN_26
;
1822 case 27: goto RRETURN_27
;
1823 case 28: goto RRETURN_28
;
1824 case 29: goto RRETURN_29
;
1825 case 30: goto RRETURN_30
;
1826 case 31: goto RRETURN_31
;
1827 case 38: goto RRETURN_38
;
1828 case 40: goto RRETURN_40
;
1829 case 42: goto RRETURN_42
;
1830 case 44: goto RRETURN_44
;
1831 case 48: goto RRETURN_48
;
1832 case 52: goto RRETURN_52
;
1835 ASSERT_NOT_REACHED();
1836 return matchError(JSRegExpErrorInternal
, stack
);
1845 /*************************************************
1846 * Execute a Regular Expression *
1847 *************************************************/
1849 /* This function applies a compiled re to a subject string and picks out
1850 portions of the string if it matches. Two elements in the vector are set for
1851 each substring: the offsets to the start and end of the substring.
1854 re points to the compiled expression
1855 extra_data points to extra data or is NULL
1856 subject points to the subject string
1857 length length of subject string (may contain binary zeros)
1858 start_offset where to start in the subject string
1860 offsets points to a vector of ints to be filled in with offsets
1861 offsetCount the number of elements in the vector
1863 Returns: > 0 => success; value is the number of elements filled in
1864 = 0 => success, but offsets is not big enough
1865 -1 => failed to match
1866 < -1 => some kind of unexpected problem
1869 static void tryFirstByteOptimization(const UChar
*& subjectPtr
, const UChar
* endSubject
, int firstByte
, bool firstByteIsCaseless
, bool useMultiLineFirstCharOptimization
, const UChar
* originalSubjectStart
)
1871 // If firstByte is set, try scanning to the first instance of that byte
1872 // no need to try and match against any earlier part of the subject string.
1873 if (firstByte
>= 0) {
1874 UChar firstChar
= firstByte
;
1875 if (firstByteIsCaseless
)
1876 while (subjectPtr
< endSubject
) {
1877 int c
= *subjectPtr
;
1880 if (toLowerCase(c
) == firstChar
)
1885 while (subjectPtr
< endSubject
&& *subjectPtr
!= firstChar
)
1888 } else if (useMultiLineFirstCharOptimization
) {
1889 /* Or to just after \n for a multiline match if possible */
1890 // I'm not sure why this != originalSubjectStart check is necessary -- ecs 11/18/07
1891 if (subjectPtr
> originalSubjectStart
) {
1892 while (subjectPtr
< endSubject
&& !isNewline(subjectPtr
[-1]))
1898 static bool tryRequiredByteOptimization(const UChar
*& subjectPtr
, const UChar
* endSubject
, int reqByte
, int reqByte2
, bool reqByteIsCaseless
, bool hasFirstByte
, const UChar
*& reqBytePtr
)
1900 /* If reqByte is set, we know that that character must appear in the subject
1901 for the match to succeed. If the first character is set, reqByte must be
1902 later in the subject; otherwise the test starts at the match point. This
1903 optimization can save a huge amount of backtracking in patterns with nested
1904 unlimited repeats that aren't going to match. Writing separate code for
1905 cased/caseless versions makes it go faster, as does using an autoincrement
1906 and backing off on a match.
1908 HOWEVER: when the subject string is very, very long, searching to its end can
1909 take a long time, and give bad performance on quite ordinary patterns. This
1910 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
1911 don't do this when the string is sufficiently long.
1914 if (reqByte
>= 0 && endSubject
- subjectPtr
< REQ_BYTE_MAX
) {
1915 const UChar
* p
= subjectPtr
+ (hasFirstByte
? 1 : 0);
1917 /* We don't need to repeat the search if we haven't yet reached the
1918 place we found it at last time. */
1920 if (p
> reqBytePtr
) {
1921 if (reqByteIsCaseless
) {
1922 while (p
< endSubject
) {
1924 if (pp
== reqByte
|| pp
== reqByte2
) {
1930 while (p
< endSubject
) {
1931 if (*p
++ == reqByte
) {
1938 /* If we can't find the required character, break the matching loop */
1940 if (p
>= endSubject
)
1943 /* If we have found the required character, save the point where we
1944 found it, so that we don't search again next time round the loop if
1945 the start hasn't passed this character yet. */
1953 int jsRegExpExecute(const JSRegExp
* re
,
1954 const UChar
* subject
, int length
, int start_offset
, int* offsets
,
1958 ASSERT(subject
|| !length
);
1959 ASSERT(offsetCount
>= 0);
1960 ASSERT(offsets
|| offsetCount
== 0);
1962 HistogramTimeLogger
logger(re
);
1964 MatchData matchBlock
;
1965 matchBlock
.startSubject
= subject
;
1966 matchBlock
.endSubject
= matchBlock
.startSubject
+ length
;
1967 const UChar
* endSubject
= matchBlock
.endSubject
;
1969 matchBlock
.multiline
= (re
->options
& MatchAcrossMultipleLinesOption
);
1970 matchBlock
.ignoreCase
= (re
->options
& IgnoreCaseOption
);
1972 /* If the expression has got more back references than the offsets supplied can
1973 hold, we get a temporary chunk of working store to use during the matching.
1974 Otherwise, we can use the vector supplied, rounding down its size to a multiple
1977 int ocount
= offsetCount
- (offsetCount
% 3);
1979 // FIXME: This is lame that we have to second-guess our caller here.
1980 // The API should change to either fail-hard when we don't have enough offset space
1981 // or that we shouldn't ask our callers to pre-allocate in the first place.
1982 bool usingTemporaryOffsets
= false;
1983 if (re
->topBackref
> 0 && re
->topBackref
>= ocount
/3) {
1984 ocount
= re
->topBackref
* 3 + 3;
1985 matchBlock
.offsetVector
= new int[ocount
];
1986 if (!matchBlock
.offsetVector
)
1987 return JSRegExpErrorNoMemory
;
1988 usingTemporaryOffsets
= true;
1990 matchBlock
.offsetVector
= offsets
;
1992 matchBlock
.offsetEnd
= ocount
;
1993 matchBlock
.offsetMax
= (2*ocount
)/3;
1994 matchBlock
.offsetOverflow
= false;
1996 /* Compute the minimum number of offsets that we need to reset each time. Doing
1997 this makes a huge difference to execution time when there aren't many brackets
2000 int resetCount
= 2 + re
->topBracket
* 2;
2001 if (resetCount
> offsetCount
)
2002 resetCount
= ocount
;
2004 /* Reset the working variable associated with each extraction. These should
2005 never be used unless previously set, but they get saved and restored, and so we
2006 initialize them to avoid reading uninitialized locations. */
2008 if (matchBlock
.offsetVector
) {
2009 int* iptr
= matchBlock
.offsetVector
+ ocount
;
2010 int* iend
= iptr
- resetCount
/2 + 1;
2011 while (--iptr
>= iend
)
2015 /* Set up the first character to match, if available. The firstByte value is
2016 never set for an anchored regular expression, but the anchoring may be forced
2017 at run time, so we have to test for anchoring. The first char may be unset for
2018 an unanchored pattern, of course. If there's no first char and the pattern was
2019 studied, there may be a bitmap of possible first characters. */
2021 bool firstByteIsCaseless
= false;
2023 if (re
->options
& UseFirstByteOptimizationOption
) {
2024 firstByte
= re
->firstByte
& 255;
2025 if ((firstByteIsCaseless
= (re
->firstByte
& REQ_IGNORE_CASE
)))
2026 firstByte
= toLowerCase(firstByte
);
2029 /* For anchored or unanchored matches, there may be a "last known required
2032 bool reqByteIsCaseless
= false;
2035 if (re
->options
& UseRequiredByteOptimizationOption
) {
2036 reqByte
= re
->reqByte
& 255; // FIXME: This optimization could be made to work for UTF16 chars as well...
2037 reqByteIsCaseless
= (re
->reqByte
& REQ_IGNORE_CASE
);
2038 reqByte2
= flipCase(reqByte
);
2041 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
2042 the loop runs just once. */
2044 const UChar
* startMatch
= subject
+ start_offset
;
2045 const UChar
* reqBytePtr
= startMatch
- 1;
2046 bool useMultiLineFirstCharOptimization
= re
->options
& UseMultiLineFirstByteOptimizationOption
;
2049 /* Reset the maximum number of extractions we might see. */
2050 if (matchBlock
.offsetVector
) {
2051 int* iptr
= matchBlock
.offsetVector
;
2052 int* iend
= iptr
+ resetCount
;
2057 tryFirstByteOptimization(startMatch
, endSubject
, firstByte
, firstByteIsCaseless
, useMultiLineFirstCharOptimization
, matchBlock
.startSubject
+ start_offset
);
2058 if (tryRequiredByteOptimization(startMatch
, endSubject
, reqByte
, reqByte2
, reqByteIsCaseless
, firstByte
>= 0, reqBytePtr
))
2061 /* When a match occurs, substrings will be set for all internal extractions;
2062 we just need to set up the whole thing as substring 0 before returning. If
2063 there were too many extractions, set the return code to zero. In the case
2064 where we had to get some local store to hold offsets for backreferences, copy
2065 those back references that we can. In this case there need not be overflow
2066 if certain parts of the pattern were not used. */
2068 /* The code starts after the JSRegExp block and the capture name table. */
2069 const unsigned char* start_code
= (const unsigned char*)(re
+ 1);
2071 int returnCode
= match(startMatch
, start_code
, 2, matchBlock
);
2073 /* When the result is no match, advance the pointer to the next character
2075 if (returnCode
== 0) {
2080 if (returnCode
!= 1) {
2081 ASSERT(returnCode
== JSRegExpErrorHitLimit
|| returnCode
== JSRegExpErrorNoMemory
);
2082 DPRINTF((">>>> error: returning %d\n", returnCode
));
2086 /* We have a match! Copy the offset information from temporary store if
2089 if (usingTemporaryOffsets
) {
2090 if (offsetCount
>= 4) {
2091 memcpy(offsets
+ 2, matchBlock
.offsetVector
+ 2, (offsetCount
- 2) * sizeof(int));
2092 DPRINTF(("Copied offsets from temporary memory\n"));
2094 if (matchBlock
.endOffsetTop
> offsetCount
)
2095 matchBlock
.offsetOverflow
= true;
2097 DPRINTF(("Freeing temporary memory\n"));
2098 delete [] matchBlock
.offsetVector
;
2101 returnCode
= matchBlock
.offsetOverflow
? 0 : matchBlock
.endOffsetTop
/ 2;
2103 if (offsetCount
< 2)
2106 offsets
[0] = startMatch
- matchBlock
.startSubject
;
2107 offsets
[1] = matchBlock
.endMatchPtr
- matchBlock
.startSubject
;
2110 DPRINTF((">>>> returning %d\n", returnCode
));
2112 } while (!(re
->options
& IsAnchoredOption
) && startMatch
<= endSubject
);
2114 if (usingTemporaryOffsets
) {
2115 DPRINTF(("Freeing temporary memory\n"));
2116 delete [] matchBlock
.offsetVector
;
2119 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
2120 return JSRegExpErrorNoMatch
;
2123 #if REGEXP_HISTOGRAM
2125 class CompareHistogramEntries
{
2127 bool operator()(const pair
<UString
, double>& a
, const pair
<UString
, double>& b
)
2129 if (a
.second
== b
.second
)
2130 return a
.first
< b
.first
;
2131 return a
.second
< b
.second
;
2135 Histogram::~Histogram()
2137 Vector
<pair
<UString
, double> > values
;
2138 Map::iterator end
= times
.end();
2139 for (Map::iterator it
= times
.begin(); it
!= end
; ++it
)
2141 sort(values
.begin(), values
.end(), CompareHistogramEntries());
2142 size_t size
= values
.size();
2143 printf("Regular Expressions, sorted by time spent evaluating them:\n");
2144 for (size_t i
= 0; i
< size
; ++i
)
2145 printf(" %f - %s\n", values
[size
- i
- 1].second
, values
[size
- i
- 1].first
.UTF8String().c_str());
2148 void Histogram::add(const JSRegExp
* re
, double elapsedTime
)
2150 UString
string(reinterpret_cast<const UChar
*>(reinterpret_cast<const char*>(re
) + re
->stringOffset
), re
->stringLength
);
2151 if (re
->options
& IgnoreCaseOption
&& re
->options
& MatchAcrossMultipleLinesOption
)
2152 string
+= " (multi-line, ignore case)";
2154 if (re
->options
& IgnoreCaseOption
)
2155 string
+= " (ignore case)";
2156 if (re
->options
& MatchAcrossMultipleLinesOption
)
2157 string
+= " (multi-line)";
2159 pair
<Map::iterator
, bool> result
= times
.add(string
.rep(), elapsedTime
);
2161 result
.first
->second
+= elapsedTime
;
2164 HistogramTimeLogger::HistogramTimeLogger(const JSRegExp
* re
)
2166 , m_startTime(getCurrentUTCTimeWithMicroseconds())
2170 HistogramTimeLogger::~HistogramTimeLogger()
2172 static Histogram histogram
;
2173 histogram
.add(m_re
, getCurrentUTCTimeWithMicroseconds() - m_startTime
);