2 **************************************************************************
3 * Copyright (C) 2002-2010 International Business Machines Corporation *
4 * and others. All rights reserved. *
5 **************************************************************************
10 // Contains the implementation of class RegexMatcher,
11 // which is one of the main API classes for the ICU regular expression package.
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
17 #include "unicode/regex.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ustring.h"
21 #include "unicode/rbbi.h"
32 // #include <malloc.h> // Needed for heapcheck testing
35 // Find progress callback
36 // ----------------------
37 // Macro to inline test & call to ReportFindProgress(). Eliminates unnecessary function call.
// The expression is true only when a find-progress callback is installed
// (fFindProgressCallbackFn != NULL) and ReportFindProgress(pos, status)
// returned FALSE. When no callback is set, the && short-circuits and
// ReportFindProgress() is not called at all.
39 #define REGEXFINDPROGRESS_INTERRUPT(pos, status) \
40 (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE)
45 // When a failure would go back to a LOOP_C instruction,
46 // strings, characters, and setrefs scan backwards for a valid start
47 // character themselves, pop the stack, and save state, emulating the
48 // LOOP_C's effect but assured that the next character of input is a
49 // possible matching character.
51 // Good idea in theory; unfortunately it only helps out a few specific
52 // cases and slows the engine down a little in the rest.
54 //#define REGEX_SMART_BACKTRACKING 1
58 // Default limit for the size of the back track stack, to avoid system
59 // failures caused by heap exhaustion. Units are in 32 bit words, not bytes.
60 // This value puts ICU's limits higher than most other regexp implementations,
61 // which use recursion rather than the heap, and take more storage per
// backtrack point.
// Used as the initial fStackLimit and as the argument to setStackLimit().
64 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY
= 8000000;
66 // Time limit counter constant.
67 // Time limits for expression evaluation are in terms of quanta of work by
68 // the engine, each of which is 10,000 state saves.
69 // This constant sets the number of state saves that make up one tick.
70 static const int32_t TIMER_INITIAL_VALUE
= 10000;
72 //-----------------------------------------------------------------------------
74 // Constructor and Destructor
76 //-----------------------------------------------------------------------------
// Construct a matcher for an already compiled pattern.
// This constructor has no UErrorCode parameter, so every problem is recorded
// in fDeferredStatus instead of being reported to the caller:
//   - init() puts all fields into a known state;
//   - U_ILLEGAL_ARGUMENT_ERROR is stored on the visible error path
//     (NOTE(review): presumably when pat is NULL -- confirm);
//   - init2() finishes setup with the shared empty text as the initial input.
77 RegexMatcher::RegexMatcher(const RegexPattern
*pat
) {
78 fDeferredStatus
= U_ZERO_ERROR
;
79 init(fDeferredStatus
);
80 if (U_FAILURE(fDeferredStatus
)) {
84 fDeferredStatus
= U_ILLEGAL_ARGUMENT_ERROR
;
88 init2(RegexStaticSets::gStaticSets
->fEmptyText
, fDeferredStatus
);
// Construct a matcher from a pattern string and an input string.
//   regexp  pattern source, compiled here with RegexPattern::compile()
//   input   text to match against
//   flags   pattern compile flags, passed through to compile()
//   status  in/out ICU error code; checked on entry
// The compiled pattern is stored in fPatternOwned (the destructor deletes it)
// and also becomes fPattern.
93 RegexMatcher::RegexMatcher(const UnicodeString
®exp
, const UnicodeString
&input
,
94 uint32_t flags
, UErrorCode
&status
) {
96 if (U_FAILURE(status
)) {
100 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
101 fPattern
= fPatternOwned
;
// Wrap the input string in a stack UText just long enough to hand it to
// init2(); the wrapper itself is closed again right afterwards.
103 UText inputText
= UTEXT_INITIALIZER
;
104 utext_openConstUnicodeString(&inputText
, &input
, &status
);
105 init2(&inputText
, status
);
106 utext_close(&inputText
);
// init() set this flag to FALSE; it is switched on only for this
// UnicodeString-input constructor.
108 fInputUniStrMaybeMutable
= TRUE
;
// Construct a matcher from a pattern and an input that are both UText.
//   regexp  pattern source, compiled here with RegexPattern::compile()
//   input   text to match against, handed directly to init2()
//   flags   pattern compile flags
//   status  in/out ICU error code; checked on entry and again after compile()
// The compiled pattern is owned by this matcher (fPatternOwned).
112 RegexMatcher::RegexMatcher(UText
*regexp
, UText
*input
,
113 uint32_t flags
, UErrorCode
&status
) {
115 if (U_FAILURE(status
)) {
119 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
120 if (U_FAILURE(status
)) {
124 fPattern
= fPatternOwned
;
125 init2(input
, status
);
// Construct a matcher from a pattern string, with no input text yet.
//   regexp  pattern source, compiled here with RegexPattern::compile()
//   flags   pattern compile flags
//   status  in/out ICU error code; checked on entry and again after compile()
// The matcher owns the compiled pattern (fPatternOwned) and starts out
// matching against the shared empty text.
129 RegexMatcher::RegexMatcher(const UnicodeString
®exp
,
130 uint32_t flags
, UErrorCode
&status
) {
132 if (U_FAILURE(status
)) {
136 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
137 if (U_FAILURE(status
)) {
140 fPattern
= fPatternOwned
;
141 init2(RegexStaticSets::gStaticSets
->fEmptyText
, status
);
// Construct a matcher from a UText pattern, with no input text yet.
//   regexp  pattern source, compiled here with RegexPattern::compile()
//   flags   pattern compile flags
//   status  in/out ICU error code; checked on entry and again after compile()
// The matcher owns the compiled pattern (fPatternOwned) and starts out
// matching against the shared empty text.
144 RegexMatcher::RegexMatcher(UText
*regexp
,
145 uint32_t flags
, UErrorCode
&status
) {
147 if (U_FAILURE(status
)) {
151 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
152 if (U_FAILURE(status
)) {
156 fPattern
= fPatternOwned
;
157 init2(RegexStaticSets::gStaticSets
->fEmptyText
, status
);
// Destructor. Cleanup visible here:
//   - fData is compared against the inline fSmallData buffer; init2() only
//     heap-allocates fData when the pattern needs more room than fSmallData;
//   - the pattern compiled by this matcher (fPatternOwned) is deleted;
//   - both UTexts (fInputText, fAltInputText) are closed;
//   - the word break iterator is deleted when break iteration is compiled in.
163 RegexMatcher::~RegexMatcher() {
165 if (fData
!= fSmallData
) {
170 delete fPatternOwned
;
171 fPatternOwned
= NULL
;
179 utext_close(fInputText
);
182 utext_close(fAltInputText
);
185 #if UCONFIG_NO_BREAK_ITERATION==0
186 delete fWordBreakItr
;
191 // init() common initialization for use by all constructors.
192 // Initialize all fields, get the object into a consistent state.
193 // This must be done even when the initial status shows an error,
194 // so that the object is initialized sufficiently well for the destructor
// to run safely (the destructor deletes / closes several of these fields).
//
// status  in/out ICU error code. Its incoming value is copied to
//         fDeferredStatus, and it is also passed to the UVector64
//         constructor used for the backtrack stack.
197 void RegexMatcher::init(UErrorCode
&status
) {
199 fPatternOwned
= NULL
;
209 fTransparentBounds
= FALSE
;
210 fAnchoringBounds
= TRUE
;
223 fStackLimit
= DEFAULT_BACKTRACK_STACK_CAPACITY
;
225 fCallbackContext
= NULL
;
226 fFindProgressCallbackFn
= NULL
;
227 fFindProgressCallbackContext
= NULL
;
229 fDeferredStatus
= status
;
231 fWordBreakItr
= NULL
;
233 fStack
= new UVector64(status
);
235 fAltInputText
= NULL
;
238 fInputUniStrMaybeMutable
= FALSE
;
// Record any failure (incoming, or raised while creating fStack) so that
// later API calls can report it.
240 if (U_FAILURE(status
)) {
241 fDeferredStatus
= status
;
246 // init2() Common initialization for use by RegexMatcher constructors, part 2.
247 // This handles the common setup to be done after the Pattern is available.
//
// input   text the matcher will initially operate on
// status  in/out ICU error code; every failure seen here is also copied
//         into fDeferredStatus
249 void RegexMatcher::init2(UText
*input
, UErrorCode
&status
) {
250 if (U_FAILURE(status
)) {
251 fDeferredStatus
= status
;
// The pattern's data area only moves to the heap when it needs more slots
// than the inline fSmallData array provides.
255 if (fPattern
->fDataSize
> (int32_t)(sizeof(fSmallData
)/sizeof(fSmallData
[0]))) {
256 fData
= (int64_t *)uprv_malloc(fPattern
->fDataSize
* sizeof(int64_t));
// NOTE(review): presumably guarded by a NULL check on fData -- confirm.
258 status
= fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
264 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY
, status
);
265 if (U_FAILURE(status
)) {
266 fDeferredStatus
= status
;
// Code unit values (0x5c and 0x24) that appendReplacement() compares against
// while scanning replacement text for escapes and group references.
272 static const UChar BACKSLASH
= 0x5c;
273 static const UChar DOLLARSIGN
= 0x24;
274 //--------------------------------------------------------------------------------
278 //--------------------------------------------------------------------------------
// appendReplacement, UnicodeString mode.
//   dest         string that receives the appended text
//   replacement  replacement text to process
//   status       in/out ICU error code
// Thin wrapper: opens stack UTexts over replacement (read-only) and dest
// (writable), forwards to the UText overload of appendReplacement(), and
// closes each UText that was successfully opened.
279 RegexMatcher
&RegexMatcher::appendReplacement(UnicodeString
&dest
,
280 const UnicodeString
&replacement
,
281 UErrorCode
&status
) {
282 UText replacementText
= UTEXT_INITIALIZER
;
284 utext_openConstUnicodeString(&replacementText
, &replacement
, &status
);
285 if (U_SUCCESS(status
)) {
286 UText resultText
= UTEXT_INITIALIZER
;
287 utext_openUnicodeString(&resultText
, &dest
, &status
);
289 if (U_SUCCESS(status
)) {
290 appendReplacement(&resultText
, &replacementText
, status
);
291 utext_close(&resultText
);
293 utext_close(&replacementText
);
300 // appendReplacement, UText mode
//   dest    UText that receives the appended text
//   status  in/out ICU error code
// Two phases are visible:
//   1. Append the input between fAppendPosition and fMatchStart to dest,
//      then move fAppendPosition up to fMatchEnd.
//   2. Scan the replacement text one code point at a time, copying ordinary
//      characters, resolving backslash escapes (including \u / \U), and
//      substituting capture groups for $n via appendGroup().
// A failed fDeferredStatus is copied into status, and status is set to
// U_REGEX_INVALID_STATE when there is no current match (fMatch == FALSE).
302 RegexMatcher
&RegexMatcher::appendReplacement(UText
*dest
,
304 UErrorCode
&status
) {
305 if (U_FAILURE(status
)) {
308 if (U_FAILURE(fDeferredStatus
)) {
309 status
= fDeferredStatus
;
312 if (fMatch
== FALSE
) {
313 status
= U_REGEX_INVALID_STATE
;
317 // Copy input string from the end of previous match to start of current match
318 int64_t destLen
= utext_nativeLength(dest
);
319 if (fMatchStart
> fAppendPosition
) {
// Fast path: the whole input sits in the UText chunk buffer, so the gap can
// be appended straight from chunkContents.
320 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
321 destLen
+= utext_replace(dest
, destLen
, destLen
, fInputText
->chunkContents
+fAppendPosition
,
322 (int32_t)(fMatchStart
-fAppendPosition
), &status
);
// Otherwise find the gap's length in UTF-16 units: directly from the index
// difference for UTF-16 based text, or through a utext_extract() call with
// a NULL buffer whose status goes to a throw-away lengthStatus.
325 if (UTEXT_USES_U16(fInputText
)) {
326 len16
= (int32_t)(fMatchStart
-fAppendPosition
);
328 UErrorCode lengthStatus
= U_ZERO_ERROR
;
329 len16
= utext_extract(fInputText
, fAppendPosition
, fMatchStart
, NULL
, 0, &lengthStatus
);
// Extract into a temporary buffer of len16 + 1 units, append it to dest,
// and free the buffer.
331 UChar
*inputChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(len16
+1));
332 if (inputChars
== NULL
) {
333 status
= U_MEMORY_ALLOCATION_ERROR
;
336 utext_extract(fInputText
, fAppendPosition
, fMatchStart
, inputChars
, len16
+1, &status
);
337 destLen
+= utext_replace(dest
, destLen
, destLen
, inputChars
, len16
, &status
);
338 uprv_free(inputChars
);
341 fAppendPosition
= fMatchEnd
;
344 // scan the replacement text, looking for substitutions ($n) and \escapes.
345 // TODO: optimize this loop by efficiently scanning for '$' or '\',
346 // move entire ranges not containing substitutions.
347 UTEXT_SETNATIVEINDEX(replacement
, 0);
348 UChar32 c
= UTEXT_NEXT32(replacement
);
349 while (c
!= U_SENTINEL
) {
350 if (c
== BACKSLASH
) {
351 // Backslash Escape. Copy the following char out without further checks.
352 // Note: Surrogate pairs don't need any special handling
353 // The second half won't be a '$' or a '\', and
354 // will move to the dest normally on the next
// loop iteration.
356 c
= UTEXT_CURRENT32(replacement
);
357 if (c
== U_SENTINEL
) {
361 if (c
==0x55/*U*/ || c
==0x75/*u*/) {
362 // We have a \udddd or \Udddddddd escape sequence.
364 struct URegexUTextUnescapeCharContext context
= U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement
);
365 UChar32 escapedChar
= u_unescapeAt(uregex_utext_unescape_charAt
, &offset
, INT32_MAX
, &context
);
// (UChar32)0xFFFFFFFF is the value this code treats as a failed unescape.
366 if (escapedChar
!= (UChar32
)0xFFFFFFFF) {
// BMP code points go out as a single UTF-16 unit, anything else as a
// lead/trail surrogate pair.
367 if (U_IS_BMP(escapedChar
)) {
368 UChar c16
= (UChar
)escapedChar
;
369 destLen
+= utext_replace(dest
, destLen
, destLen
, &c16
, 1, &status
);
372 surrogate
[0] = U16_LEAD(escapedChar
);
373 surrogate
[1] = U16_TRAIL(escapedChar
);
374 if (U_SUCCESS(status
)) {
375 destLen
+= utext_replace(dest
, destLen
, destLen
, surrogate
, 2, &status
);
378 // TODO: Report errors for mal-formed \u escapes?
379 // As this is, the original sequence is output, which may be OK.
// Bring the replacement iterator back in line with the offset that
// u_unescapeAt() reported, relative to the last offset the context saw.
380 if (context
.lastOffset
== offset
) {
381 UTEXT_PREVIOUS32(replacement
);
382 } else if (context
.lastOffset
!= offset
-1) {
383 utext_moveIndex32(replacement
, offset
- context
.lastOffset
- 1);
387 UTEXT_NEXT32(replacement
);
388 // Plain backslash escape. Just put out the escaped character.
390 UChar c16
= (UChar
)c
;
391 destLen
+= utext_replace(dest
, destLen
, destLen
, &c16
, 1, &status
);
394 surrogate
[0] = U16_LEAD(c
);
395 surrogate
[1] = U16_TRAIL(c
);
396 if (U_SUCCESS(status
)) {
397 destLen
+= utext_replace(dest
, destLen
, destLen
, surrogate
, 2, &status
);
401 } else if (c
!= DOLLARSIGN
) {
402 // Normal char, not a $. Copy it out without further checks.
404 UChar c16
= (UChar
)c
;
405 destLen
+= utext_replace(dest
, destLen
, destLen
, &c16
, 1, &status
);
408 surrogate
[0] = U16_LEAD(c
);
409 surrogate
[1] = U16_TRAIL(c
);
410 if (U_SUCCESS(status
)) {
411 destLen
+= utext_replace(dest
, destLen
, destLen
, surrogate
, 2, &status
);
415 // We've got a $. Pick up a capture group number if one follows.
416 // Consume at most the number of digits necessary for the largest capture
417 // number that is valid for this pattern.
419 int32_t numDigits
= 0;
420 int32_t groupNum
= 0;
// Accumulate decimal digits into groupNum, peeking with UTEXT_CURRENT32 and
// consuming with UTEXT_NEXT32 only once the character is known to be a digit.
423 digitC
= UTEXT_CURRENT32(replacement
);
424 if (digitC
== U_SENTINEL
) {
427 if (u_isdigit(digitC
) == FALSE
) {
430 UTEXT_NEXT32(replacement
);
431 groupNum
=groupNum
*10 + u_charDigitValue(digitC
);
433 if (numDigits
>= fPattern
->fMaxCaptureDigits
) {
439 if (numDigits
== 0) {
440 // The $ didn't introduce a group number at all.
441 // Treat it as just part of the substitution text.
442 UChar c16
= DOLLARSIGN
;
443 destLen
+= utext_replace(dest
, destLen
, destLen
, &c16
, 1, &status
);
445 // Finally, append the capture group data to the destination.
446 destLen
+= appendGroup(groupNum
, dest
, status
);
447 if (U_FAILURE(status
)) {
448 // Can fail if group number is out of range.
454 if (U_FAILURE(status
)) {
457 c
= UTEXT_NEXT32(replacement
);
466 //--------------------------------------------------------------------------------
468 // appendTail Intended to be used in conjunction with appendReplacement()
469 // To the destination string, append everything following
470 // the last match position from the input string.
472 // Note: Match ranges do not affect appendTail or appendReplacement
474 //--------------------------------------------------------------------------------
// appendTail, UnicodeString mode.
//   dest  string that receives the appended text
// Thin wrapper: opens a writable stack UText over dest, forwards to the
// UText overload of appendTail(), and closes the UText again. It works with
// a local UErrorCode, so this overload has no way to hand errors back
// through a status parameter.
475 UnicodeString
&RegexMatcher::appendTail(UnicodeString
&dest
) {
476 UErrorCode status
= U_ZERO_ERROR
;
477 UText resultText
= UTEXT_INITIALIZER
;
478 utext_openUnicodeString(&resultText
, &dest
, &status
);
480 if (U_SUCCESS(status
)) {
481 appendTail(&resultText
, status
);
482 utext_close(&resultText
);
489 // appendTail, UText mode
//   dest    UText that receives the appended text
//   status  in/out ICU error code; a failed fDeferredStatus is copied into it
// Appends the input from fAppendPosition up to fInputLength to the end of
// dest. Nothing is copied unless fInputLength > fAppendPosition.
491 UText
*RegexMatcher::appendTail(UText
*dest
, UErrorCode
&status
) {
492 UBool bailOut
= FALSE
;
493 if (U_FAILURE(status
)) {
496 if (U_FAILURE(fDeferredStatus
)) {
497 status
= fDeferredStatus
;
502 // dest must not be NULL
// A zero-length replace at the end of dest: no text is added by this call.
504 utext_replace(dest
, utext_nativeLength(dest
), utext_nativeLength(dest
), NULL
, 0, &status
);
509 if (fInputLength
> fAppendPosition
) {
// Fast path: append straight from the chunk buffer when it holds the whole input.
510 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
511 int64_t destLen
= utext_nativeLength(dest
);
512 utext_replace(dest
, destLen
, destLen
, fInputText
->chunkContents
+fAppendPosition
,
513 (int32_t)(fInputLength
-fAppendPosition
), &status
);
// Otherwise work out the tail's length in UTF-16 units first. The
// length-only utext_extract() call writes to status, which is then cleared.
516 if (UTEXT_USES_U16(fInputText
)) {
517 len16
= (int32_t)(fInputLength
-fAppendPosition
);
519 len16
= utext_extract(fInputText
, fAppendPosition
, fInputLength
, NULL
, 0, &status
);
520 status
= U_ZERO_ERROR
; // buffer overflow
// The buffer holds exactly len16 units, with no room for a terminator.
523 UChar
*inputChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(len16
));
524 if (inputChars
== NULL
) {
// NOTE(review): the allocation failure is stored in fDeferredStatus rather
// than in status, unlike the same situation in appendReplacement() --
// confirm that this is intended.
525 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
527 utext_extract(fInputText
, fAppendPosition
, fInputLength
, inputChars
, len16
, &status
); // unterminated
528 int64_t destLen
= utext_nativeLength(dest
);
529 utext_replace(dest
, destLen
, destLen
, inputChars
, len16
, &status
);
530 uprv_free(inputChars
);
539 //--------------------------------------------------------------------------------
543 //--------------------------------------------------------------------------------
// end(err): end-position accessor returning int32_t.
// NOTE(review): the body is not visible in this view; presumably it reports
// the end of the whole match in the same way as end64(err) -- confirm.
544 int32_t RegexMatcher::end(UErrorCode
&err
) const {
// end64(err): forwards to end64(group, err) with group number 0.
//   err  in/out ICU error code, passed through unchanged
548 int64_t RegexMatcher::end64(UErrorCode
&err
) const {
549 return end64(0, err
);
// end64(group, err): 64-bit end position for a capture group of the current match.
//   group  capture group number; must be in 0 .. fPattern->fGroupMap->size()
//   err    in/out ICU error code. Set to U_REGEX_INVALID_STATE when there is
//          no current match (fMatch == FALSE) and to U_INDEX_OUTOFBOUNDS_ERROR
//          when group is out of range.
552 int64_t RegexMatcher::end64(int32_t group
, UErrorCode
&err
) const {
553 if (U_FAILURE(err
)) {
556 if (fMatch
== FALSE
) {
557 err
= U_REGEX_INVALID_STATE
;
560 if (group
< 0 || group
> fPattern
->fGroupMap
->size()) {
561 err
= U_INDEX_OUTOFBOUNDS_ERROR
;
568 // Get the position within the stack frame of the variables for
569 // this capture group.
// fGroupMap is indexed from zero, hence group-1. The end position is read
// from the slot at groupOffset + 1.
570 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(group
-1);
571 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
572 U_ASSERT(groupOffset
>= 0);
573 e
= fFrame
->fExtra
[groupOffset
+ 1];
// end(group, err): 32-bit convenience form of end64(group, err).
// The 64-bit result is cast to int32_t without a range check here.
579 int32_t RegexMatcher::end(int32_t group
, UErrorCode
&err
) const {
580 return (int32_t)end64(group
, err
);
584 //--------------------------------------------------------------------------------
588 //--------------------------------------------------------------------------------
// find(): look for the next match, starting from the end of the previous one.
//   - fDeferredStatus is checked first.
//   - When the whole input is available in the UText chunk buffer, the work
//     is handed to findUsingChunk().
//   - Otherwise candidate start positions are tried with MatchAt(), using a
//     scanning strategy chosen by fPattern->fStartType.
// Errors raised while matching are recorded in fDeferredStatus, and the
// find-progress callback is polled through REGEXFINDPROGRESS_INTERRUPT
// between attempts.
589 UBool
RegexMatcher::find() {
590 // Start at the position of the last match end. (Will be zero if the
591 // matcher has been reset.)
593 if (U_FAILURE(fDeferredStatus
)) {
597 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
598 return findUsingChunk();
601 int64_t startPos
= fMatchEnd
;
603 startPos
= fActiveStart
;
607 // Save the position of any previous successful match.
608 fLastMatchEnd
= fMatchEnd
;
610 if (fMatchStart
== fMatchEnd
) {
611 // Previous match had zero length. Move start position up one position
612 // to avoid sending find() into a loop on zero-length matches.
613 if (startPos
>= fActiveLimit
) {
// Step over one code point and read back the resulting native index.
618 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
619 UTEXT_NEXT32(fInputText
);
620 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
623 if (fLastMatchEnd
>= 0) {
624 // A previous find() failed to match. Don't try again.
625 // (without this test, a pattern with a zero-length match
626 // could match again at the end of an input string.)
633 // Compute the position in the input string beyond which a match can not begin, because
634 // the minimum length match would extend past the end of the input.
635 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
636 // Be aware of possible overflows if making changes here.
637 int64_t testStartLimit
;
638 if (UTEXT_USES_U16(fInputText
)) {
639 testStartLimit
= fActiveLimit
- fPattern
->fMinMatchLen
;
640 if (startPos
> testStartLimit
) {
646 // For now, let the matcher discover that it can't match on its own
647 // We don't know how long the match len is in native characters
648 testStartLimit
= fActiveLimit
;
652 U_ASSERT(startPos
>= 0);
// Each branch of this switch is one start-position scanning strategy.
654 switch (fPattern
->fStartType
) {
656 // No optimization was found.
657 // Try a match at each input position.
659 MatchAt(startPos
, FALSE
, fDeferredStatus
);
660 if (U_FAILURE(fDeferredStatus
)) {
666 if (startPos
>= testStartLimit
) {
670 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
671 UTEXT_NEXT32(fInputText
);
672 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
673 // Note that it's perfectly OK for a pattern to have a zero-length
674 // match at the end of a string, so we must make sure that the loop
675 // runs with startPos == testStartLimit the last time through.
676 if (REGEXFINDPROGRESS_INTERRUPT(startPos
, fDeferredStatus
))
682 // Matches are only possible at the start of the input string
683 // (pattern begins with ^ or \A)
684 if (startPos
> fActiveStart
) {
688 MatchAt(startPos
, FALSE
, fDeferredStatus
);
689 if (U_FAILURE(fDeferredStatus
)) {
697 // Match may start on any char from a pre-computed set.
698 U_ASSERT(fPattern
->fMinMatchLen
> 0);
700 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
702 c
= UTEXT_NEXT32(fInputText
);
703 pos
= UTEXT_GETNATIVEINDEX(fInputText
);
704 // c will be -1 (U_SENTINEL) at end of text, in which case we
705 // skip this next block (so we don't have a negative array index)
706 // and handle end of text in the following block.
// Code points below 256 are tested against fInitialChars8, all others
// against fInitialChars.
707 if (c
>= 0 && ((c
<256 && fPattern
->fInitialChars8
->contains(c
)) ||
708 (c
>=256 && fPattern
->fInitialChars
->contains(c
)))) {
709 MatchAt(startPos
, FALSE
, fDeferredStatus
);
710 if (U_FAILURE(fDeferredStatus
)) {
// Put the text iterator back on the position that follows the tested character.
716 UTEXT_SETNATIVEINDEX(fInputText
, pos
);
718 if (startPos
>= testStartLimit
) {
724 if (REGEXFINDPROGRESS_INTERRUPT(startPos
, fDeferredStatus
))
733 // Match starts on exactly one char.
734 U_ASSERT(fPattern
->fMinMatchLen
> 0);
735 UChar32 theChar
= fPattern
->fInitialChar
;
737 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
739 c
= UTEXT_NEXT32(fInputText
);
740 pos
= UTEXT_GETNATIVEINDEX(fInputText
);
742 MatchAt(startPos
, FALSE
, fDeferredStatus
);
743 if (U_FAILURE(fDeferredStatus
)) {
749 UTEXT_SETNATIVEINDEX(fInputText
, pos
);
751 if (startPos
>= testStartLimit
) {
757 if (REGEXFINDPROGRESS_INTERRUPT(startPos
, fDeferredStatus
))
// Line-start strategy: a match is tried at fAnchorStart and then, as the
// code below shows, at positions that follow a line terminator.
766 if (startPos
== fAnchorStart
) {
767 MatchAt(startPos
, FALSE
, fDeferredStatus
);
768 if (U_FAILURE(fDeferredStatus
)) {
774 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
775 c
= UTEXT_NEXT32(fInputText
);
776 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
// Read the character just before startPos, then restore the iterator position.
778 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
779 c
= UTEXT_PREVIOUS32(fInputText
);
780 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
783 if (fPattern
->fFlags
& UREGEX_UNIX_LINES
) {
786 MatchAt(startPos
, FALSE
, fDeferredStatus
);
787 if (U_FAILURE(fDeferredStatus
)) {
793 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
795 if (startPos
>= testStartLimit
) {
800 c
= UTEXT_NEXT32(fInputText
);
801 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
802 // Note that it's perfectly OK for a pattern to have a zero-length
803 // match at the end of a string, so we must make sure that the loop
804 // runs with startPos == testStartLimit the last time through.
805 if (REGEXFINDPROGRESS_INTERRUPT(startPos
, fDeferredStatus
))
// Without UNIX_LINES the terminators tested are 0x0a-0x0d, 0x85, 0x2028 and 0x2029.
810 if (((c
& 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
811 ((c
<=0x0d && c
>=0x0a) || c
==0x85 ||c
==0x2028 || c
==0x2029 )) {
// A 0x0d immediately followed by 0x0a is stepped over as one line ending.
812 if (c
== 0x0d && startPos
< fActiveLimit
&& UTEXT_CURRENT32(fInputText
) == 0x0a) {
813 UTEXT_NEXT32(fInputText
);
814 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
816 MatchAt(startPos
, FALSE
, fDeferredStatus
);
817 if (U_FAILURE(fDeferredStatus
)) {
823 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
825 if (startPos
>= testStartLimit
) {
830 c
= UTEXT_NEXT32(fInputText
);
831 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
832 // Note that it's perfectly OK for a pattern to have a zero-length
833 // match at the end of a string, so we must make sure that the loop
834 // runs with startPos == testStartLimit the last time through.
835 if (REGEXFINDPROGRESS_INTERRUPT(startPos
, fDeferredStatus
))
// find(start, status): restart searching at a given input position.
//   start   native index at which the search should begin
//   status  in/out ICU error code. A failed fDeferredStatus is copied into
//           it; U_INDEX_OUTOFBOUNDS_ERROR is set when start is rejected.
// The matcher is reset first, start is validated against
// [fActiveStart, fActiveLimit], and fMatchEnd is set to start. fMatchEnd is
// the position from which find() begins scanning.
851 UBool
RegexMatcher::find(int64_t start
, UErrorCode
&status
) {
852 if (U_FAILURE(status
)) {
855 if (U_FAILURE(fDeferredStatus
)) {
856 status
= fDeferredStatus
;
859 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
860 // This will reset the region to be the full input length.
// NOTE(review): the condition guarding this assignment is not visible here;
// presumably an early range check on start -- confirm.
862 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
866 int64_t nativeStart
= start
;
867 if (nativeStart
< fActiveStart
|| nativeStart
> fActiveLimit
) {
868 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
871 fMatchEnd
= nativeStart
;
876 //--------------------------------------------------------------------------------
878 // findUsingChunk() -- like find(), but with the advance knowledge that the
879 // entire string is available in the UText's chunk buffer.
881 //--------------------------------------------------------------------------------
// findUsingChunk(): same search as find(), working directly on the UTF-16
// chunk buffer (fInputText->chunkContents). Positions are int32_t indexes
// into that buffer, stepping uses the U16_* macros, and match attempts go
// through MatchChunkAt(). Errors raised while matching are recorded in
// fDeferredStatus.
882 UBool
RegexMatcher::findUsingChunk() {
883 // Start at the position of the last match end. (Will be zero if the
884 // matcher has been reset.)
887 int32_t startPos
= (int32_t)fMatchEnd
;
889 startPos
= (int32_t)fActiveStart
;
892 const UChar
*inputBuf
= fInputText
->chunkContents
;
895 // Save the position of any previous successful match.
896 fLastMatchEnd
= fMatchEnd
;
898 if (fMatchStart
== fMatchEnd
) {
899 // Previous match had zero length. Move start position up one position
900 // to avoid sending find() into a loop on zero-length matches.
901 if (startPos
>= fActiveLimit
) {
906 U16_FWD_1(inputBuf
, startPos
, fInputLength
);
909 if (fLastMatchEnd
>= 0) {
910 // A previous find() failed to match. Don't try again.
911 // (without this test, a pattern with a zero-length match
912 // could match again at the end of an input string.)
919 // Compute the position in the input string beyond which a match can not begin, because
920 // the minimum length match would extend past the end of the input.
921 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
922 // Be aware of possible overflows if making changes here.
923 int32_t testLen
= (int32_t)(fActiveLimit
- fPattern
->fMinMatchLen
);
924 if (startPos
> testLen
) {
931 U_ASSERT(startPos
>= 0);
// Each branch of this switch is one start-position scanning strategy.
933 switch (fPattern
->fStartType
) {
935 // No optimization was found.
936 // Try a match at each input position.
938 MatchChunkAt(startPos
, FALSE
, fDeferredStatus
);
939 if (U_FAILURE(fDeferredStatus
)) {
945 if (startPos
>= testLen
) {
949 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
950 // Note that it's perfectly OK for a pattern to have a zero-length
951 // match at the end of a string, so we must make sure that the loop
952 // runs with startPos == testLen the last time through.
953 if (REGEXFINDPROGRESS_INTERRUPT(startPos
, fDeferredStatus
))
959 // Matches are only possible at the start of the input string
960 // (pattern begins with ^ or \A)
961 if (startPos
> fActiveStart
) {
965 MatchChunkAt(startPos
, FALSE
, fDeferredStatus
);
966 if (U_FAILURE(fDeferredStatus
)) {
974 // Match may start on any char from a pre-computed set.
975 U_ASSERT(fPattern
->fMinMatchLen
> 0);
// pos keeps the index of the character under test; U16_NEXT advances
// startPos past that character.
977 int32_t pos
= startPos
;
978 U16_NEXT(inputBuf
, startPos
, fActiveLimit
, c
); // like c = inputBuf[startPos++];
979 if ((c
<256 && fPattern
->fInitialChars8
->contains(c
)) ||
980 (c
>=256 && fPattern
->fInitialChars
->contains(c
))) {
981 MatchChunkAt(pos
, FALSE
, fDeferredStatus
);
982 if (U_FAILURE(fDeferredStatus
)) {
989 if (pos
>= testLen
) {
994 if (REGEXFINDPROGRESS_INTERRUPT(startPos
, fDeferredStatus
))
1003 // Match starts on exactly one char.
1004 U_ASSERT(fPattern
->fMinMatchLen
> 0);
1005 UChar32 theChar
= fPattern
->fInitialChar
;
1007 int32_t pos
= startPos
;
1008 U16_NEXT(inputBuf
, startPos
, fActiveLimit
, c
); // like c = inputBuf[startPos++];
1010 MatchChunkAt(pos
, FALSE
, fDeferredStatus
);
1011 if (U_FAILURE(fDeferredStatus
)) {
1018 if (pos
>= testLen
) {
1023 if (REGEXFINDPROGRESS_INTERRUPT(startPos
, fDeferredStatus
))
// Line-start strategy: a match is tried at fAnchorStart and then, as the
// code below shows, at positions that follow a line terminator.
1032 if (startPos
== fAnchorStart
) {
1033 MatchChunkAt(startPos
, FALSE
, fDeferredStatus
);
1034 if (U_FAILURE(fDeferredStatus
)) {
1040 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
1043 if (fPattern
->fFlags
& UREGEX_UNIX_LINES
) {
// The code unit just before startPos is the candidate line terminator.
1045 c
= inputBuf
[startPos
-1];
1047 MatchChunkAt(startPos
, FALSE
, fDeferredStatus
);
1048 if (U_FAILURE(fDeferredStatus
)) {
1055 if (startPos
>= testLen
) {
1060 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
1061 // Note that it's perfectly OK for a pattern to have a zero-length
1062 // match at the end of a string, so we must make sure that the loop
1063 // runs with startPos == testLen the last time through.
1064 if (REGEXFINDPROGRESS_INTERRUPT(startPos
, fDeferredStatus
))
1069 c
= inputBuf
[startPos
-1];
// Without UNIX_LINES the terminators tested are 0x0a-0x0d, 0x85, 0x2028 and 0x2029.
1070 if (((c
& 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
1071 ((c
<=0x0d && c
>=0x0a) || c
==0x85 ||c
==0x2028 || c
==0x2029 )) {
1072 if (c
== 0x0d && startPos
< fActiveLimit
&& inputBuf
[startPos
] == 0x0a) {
1075 MatchChunkAt(startPos
, FALSE
, fDeferredStatus
);
1076 if (U_FAILURE(fDeferredStatus
)) {
1083 if (startPos
>= testLen
) {
1088 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
1089 // Note that it's perfectly OK for a pattern to have a zero-length
1090 // match at the end of a string, so we must make sure that the loop
1091 // runs with startPos == testLen the last time through.
1092 if (REGEXFINDPROGRESS_INTERRUPT(startPos
, fDeferredStatus
))
1108 //--------------------------------------------------------------------------------
1112 //--------------------------------------------------------------------------------
// group(status): forwards to group(groupNum, status) with group number 0
// and returns its UnicodeString result.
//   status  in/out ICU error code, passed through unchanged
1113 UnicodeString
RegexMatcher::group(UErrorCode
&status
) const {
1114 return group(0, status
);
1117 // Return immutable shallow clone
// Forwards to group(groupNum, dest, group_len, status) with group number 0.
//   dest       UText passed through as the clone destination
//   group_len  passed through by reference to the overload called
//   status     in/out ICU error code
1118 UText
*RegexMatcher::group(UText
*dest
, int64_t &group_len
, UErrorCode
&status
) const {
1119 return group(0, dest
, group_len
, status
);
1122 // Return immutable shallow clone
//   groupNum   capture group number; must be in 0 .. fPattern->fGroupMap->size()
//   dest       UText handed to utext_clone() as the clone destination
//   group_len  output parameter (NOTE(review): its assignment is not visible
//              in this view; presumably the group's length -- confirm)
//   status     in/out ICU error code. A failed fDeferredStatus is copied into
//              it; U_REGEX_INVALID_STATE is set when there is no current
//              match and U_INDEX_OUTOFBOUNDS_ERROR when groupNum is out of range.
// The result is a shallow, read-only clone of fInputText with its native
// index set to the start of the group.
1123 UText
*RegexMatcher::group(int32_t groupNum
, UText
*dest
, int64_t &group_len
, UErrorCode
&status
) const {
1125 UBool bailOut
= FALSE
;
1126 if (U_FAILURE(status
)) {
1129 if (U_FAILURE(fDeferredStatus
)) {
1130 status
= fDeferredStatus
;
1133 if (fMatch
== FALSE
) {
1134 status
= U_REGEX_INVALID_STATE
;
1137 if (groupNum
< 0 || groupNum
> fPattern
->fGroupMap
->size()) {
1138 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
// Hand back the caller's dest when one was supplied, otherwise an empty UText.
1143 return (dest
) ? dest
: utext_openUChars(NULL
, NULL
, 0, &status
);
1147 if (groupNum
== 0) {
// fGroupMap is indexed from zero, hence groupNum-1. The group's start and
// end are held in two consecutive fExtra slots.
1151 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(groupNum
-1);
1152 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
1153 U_ASSERT(groupOffset
>= 0);
1154 s
= fFrame
->fExtra
[groupOffset
];
1155 e
= fFrame
->fExtra
[groupOffset
+1];
1159 // A capture group wasn't part of the match
1160 return utext_clone(dest
, fInputText
, FALSE
, TRUE
, &status
);
// utext_clone arguments: deep = FALSE, readOnly = TRUE.
1165 dest
= utext_clone(dest
, fInputText
, FALSE
, TRUE
, &status
);
1167 UTEXT_SETNATIVEINDEX(dest
, s
);
// group(groupNum, status): capture group text as a UnicodeString.
//   groupNum  capture group number
//   status    in/out ICU error code; checked on entry
// Opens a writable stack UText over a local result string, lets the
// deep-copy UText overload of group() fill it, then closes the UText.
1171 UnicodeString
RegexMatcher::group(int32_t groupNum
, UErrorCode
&status
) const {
1172 UnicodeString result
;
1173 if (U_FAILURE(status
)) {
1176 UText resultText
= UTEXT_INITIALIZER
;
1177 utext_openUnicodeString(&resultText
, &result
, &status
);
1178 group(groupNum
, &resultText
, status
);
1179 utext_close(&resultText
);
1184 // Return deep (mutable) clone
1185 // Technology Preview (as an API), but note that the UnicodeString API is implemented
1186 // using this function.
//   groupNum  capture group number; must be in 0 .. fPattern->fGroupMap->size()
//   dest      UText whose whole content is replaced by the group text. Each
//             result path below has two forms: replace dest's content, or
//             create a new UText (NOTE(review): the selecting condition is
//             not visible here; presumably whether dest is NULL -- confirm).
//   status    in/out ICU error code. A failed fDeferredStatus is copied into
//             it; U_REGEX_INVALID_STATE is set when there is no current
//             match and U_INDEX_OUTOFBOUNDS_ERROR when groupNum is out of range.
1187 UText
*RegexMatcher::group(int32_t groupNum
, UText
*dest
, UErrorCode
&status
) const {
1188 UBool bailOut
= FALSE
;
1189 if (U_FAILURE(status
)) {
1192 if (U_FAILURE(fDeferredStatus
)) {
1193 status
= fDeferredStatus
;
1197 if (fMatch
== FALSE
) {
1198 status
= U_REGEX_INVALID_STATE
;
1201 if (groupNum
< 0 || groupNum
> fPattern
->fGroupMap
->size()) {
1202 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
// Error result: either empty dest's content or create an empty UText.
1208 utext_replace(dest
, 0, utext_nativeLength(dest
), NULL
, 0, &status
);
1211 return utext_openUChars(NULL
, NULL
, 0, &status
);
1216 if (groupNum
== 0) {
// fGroupMap is indexed from zero, hence groupNum-1. The group's start and
// end are held in two consecutive fExtra slots.
1220 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(groupNum
-1);
1221 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
1222 U_ASSERT(groupOffset
>= 0);
1223 s
= fFrame
->fExtra
[groupOffset
];
1224 e
= fFrame
->fExtra
[groupOffset
+1];
1228 // A capture group wasn't part of the match
1230 utext_replace(dest
, 0, utext_nativeLength(dest
), NULL
, 0, &status
);
1233 return utext_openUChars(NULL
, NULL
, 0, &status
);
// Fast path: the group text can be taken straight from the chunk buffer.
1238 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1239 U_ASSERT(e
<= fInputLength
);
1241 utext_replace(dest
, 0, utext_nativeLength(dest
), fInputText
->chunkContents
+s
, (int32_t)(e
-s
), &status
);
// New-UText form: wrap the chunk slice in a temporary UText and deep-clone
// it (deep = TRUE, readOnly = FALSE).
1243 UText groupText
= UTEXT_INITIALIZER
;
1244 utext_openUChars(&groupText
, fInputText
->chunkContents
+s
, e
-s
, &status
);
1245 dest
= utext_clone(NULL
, &groupText
, TRUE
, FALSE
, &status
);
1246 utext_close(&groupText
);
// Slow path: find the group's length in UTF-16 units (directly for UTF-16
// based text, otherwise through a utext_extract() call with a NULL buffer),
// then extract into a temporary buffer.
1250 if (UTEXT_USES_U16(fInputText
)) {
1251 len16
= (int32_t)(e
-s
);
1253 UErrorCode lengthStatus
= U_ZERO_ERROR
;
1254 len16
= utext_extract(fInputText
, s
, e
, NULL
, 0, &lengthStatus
);
1256 UChar
*groupChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(len16
+1));
1257 if (groupChars
== NULL
) {
1258 status
= U_MEMORY_ALLOCATION_ERROR
;
1261 utext_extract(fInputText
, s
, e
, groupChars
, len16
+1, &status
);
1264 utext_replace(dest
, 0, utext_nativeLength(dest
), groupChars
, len16
, &status
);
1266 UText groupText
= UTEXT_INITIALIZER
;
1267 utext_openUChars(&groupText
, groupChars
, len16
, &status
);
1268 dest
= utext_clone(NULL
, &groupText
, TRUE
, FALSE
, &status
);
1269 utext_close(&groupText
);
// The temporary buffer is freed after the replace / deep clone calls above.
1272 uprv_free(groupChars
);
1277 //--------------------------------------------------------------------------------
1279 // appendGroup() -- currently internal only, appends a group to a UText rather
1280 // than replacing its contents
1282 //--------------------------------------------------------------------------------
// appendGroup(groupNum, dest, status): append a capture group's text to the
// end of dest.
//   groupNum  capture group number; must be in 0 .. fPattern->fGroupMap->size()
//   dest      UText appended to (text is inserted at its current native length)
//   status    in/out ICU error code. A failed fDeferredStatus is copied into
//             it; U_REGEX_INVALID_STATE is set when there is no current
//             match and U_INDEX_OUTOFBOUNDS_ERROR when groupNum is out of range.
// Error and no-capture paths return the result of a zero-length
// utext_replace(). appendReplacement() adds this function's return value to
// its running destination length.
1284 int64_t RegexMatcher::appendGroup(int32_t groupNum
, UText
*dest
, UErrorCode
&status
) const {
1285 if (U_FAILURE(status
)) {
1288 if (U_FAILURE(fDeferredStatus
)) {
1289 status
= fDeferredStatus
;
1292 int64_t destLen
= utext_nativeLength(dest
);
1294 if (fMatch
== FALSE
) {
1295 status
= U_REGEX_INVALID_STATE
;
1296 return utext_replace(dest
, destLen
, destLen
, NULL
, 0, &status
);
1298 if (groupNum
< 0 || groupNum
> fPattern
->fGroupMap
->size()) {
1299 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1300 return utext_replace(dest
, destLen
, destLen
, NULL
, 0, &status
);
1304 if (groupNum
== 0) {
// fGroupMap is indexed from zero, hence groupNum-1. The group's start and
// end are held in two consecutive fExtra slots.
1308 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(groupNum
-1);
1309 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
1310 U_ASSERT(groupOffset
>= 0);
1311 s
= fFrame
->fExtra
[groupOffset
];
1312 e
= fFrame
->fExtra
[groupOffset
+1];
1316 // A capture group wasn't part of the match
1317 return utext_replace(dest
, destLen
, destLen
, NULL
, 0, &status
);
// Fast path: append straight from the chunk buffer.
1322 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1323 U_ASSERT(e
<= fInputLength
);
1324 deltaLen
= utext_replace(dest
, destLen
, destLen
, fInputText
->chunkContents
+s
, (int32_t)(e
-s
), &status
);
// Slow path: find the group's length in UTF-16 units, extract it into a
// temporary buffer of len16 + 1 units, append that buffer, and free it.
1327 if (UTEXT_USES_U16(fInputText
)) {
1328 len16
= (int32_t)(e
-s
);
1330 UErrorCode lengthStatus
= U_ZERO_ERROR
;
1331 len16
= utext_extract(fInputText
, s
, e
, NULL
, 0, &lengthStatus
);
1333 UChar
*groupChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(len16
+1));
1334 if (groupChars
== NULL
) {
1335 status
= U_MEMORY_ALLOCATION_ERROR
;
1338 utext_extract(fInputText
, s
, e
, groupChars
, len16
+1, &status
);
1340 deltaLen
= utext_replace(dest
, destLen
, destLen
, groupChars
, len16
, &status
);
1341 uprv_free(groupChars
);
1348 //--------------------------------------------------------------------------------
1352 //--------------------------------------------------------------------------------
// groupCount(): number of entries in the pattern's group map
// (fPattern->fGroupMap). The other accessors in this file accept group
// numbers from 0 up to and including this value.
1353 int32_t RegexMatcher::groupCount() const {
1354 return fPattern
->fGroupMap
->size();
1359 //--------------------------------------------------------------------------------
1361 // hasAnchoringBounds()
1363 //--------------------------------------------------------------------------------
1364 UBool
RegexMatcher::hasAnchoringBounds() const {
1365 return fAnchoringBounds
;
1369 //--------------------------------------------------------------------------------
1371 // hasTransparentBounds()
1373 //--------------------------------------------------------------------------------
1374 UBool
RegexMatcher::hasTransparentBounds() const {
1375 return fTransparentBounds
;
1380 //--------------------------------------------------------------------------------
1384 //--------------------------------------------------------------------------------
1385 UBool
RegexMatcher::hitEnd() const {
1390 //--------------------------------------------------------------------------------
1394 //--------------------------------------------------------------------------------
1395 const UnicodeString
&RegexMatcher::input() const {
1397 UErrorCode status
= U_ZERO_ERROR
;
1399 if (UTEXT_USES_U16(fInputText
)) {
1400 len16
= (int32_t)fInputLength
;
1402 len16
= utext_extract(fInputText
, 0, fInputLength
, NULL
, 0, &status
);
1403 status
= U_ZERO_ERROR
; // overflow, length status
1405 UnicodeString
*result
= new UnicodeString(len16
, 0, 0);
1407 UChar
*inputChars
= result
->getBuffer(len16
);
1408 utext_extract(fInputText
, 0, fInputLength
, inputChars
, len16
, &status
); // unterminated warning
1409 result
->releaseBuffer(len16
);
1411 (*(const UnicodeString
**)&fInput
) = result
; // pointer assignment, rather than operator=
1417 //--------------------------------------------------------------------------------
1421 //--------------------------------------------------------------------------------
1422 UText
*RegexMatcher::inputText() const {
1427 //--------------------------------------------------------------------------------
1429 // getInput() -- like inputText(), but makes a clone or copies into another UText
1431 //--------------------------------------------------------------------------------
1432 UText
*RegexMatcher::getInput (UText
*dest
, UErrorCode
&status
) const {
1433 UBool bailOut
= FALSE
;
1434 if (U_FAILURE(status
)) {
1437 if (U_FAILURE(fDeferredStatus
)) {
1438 status
= fDeferredStatus
;
1444 utext_replace(dest
, 0, utext_nativeLength(dest
), NULL
, 0, &status
);
1447 return utext_clone(NULL
, fInputText
, FALSE
, TRUE
, &status
);
1452 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1453 utext_replace(dest
, 0, utext_nativeLength(dest
), fInputText
->chunkContents
, (int32_t)fInputLength
, &status
);
1456 if (UTEXT_USES_U16(fInputText
)) {
1457 input16Len
= (int32_t)fInputLength
;
1459 UErrorCode lengthStatus
= U_ZERO_ERROR
;
1460 input16Len
= utext_extract(fInputText
, 0, fInputLength
, NULL
, 0, &lengthStatus
); // buffer overflow error
1462 UChar
*inputChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(input16Len
));
1463 if (inputChars
== NULL
) {
1467 status
= U_ZERO_ERROR
;
1468 utext_extract(fInputText
, 0, fInputLength
, inputChars
, input16Len
, &status
); // not terminated warning
1469 status
= U_ZERO_ERROR
;
1470 utext_replace(dest
, 0, utext_nativeLength(dest
), inputChars
, input16Len
, &status
);
1472 uprv_free(inputChars
);
1476 return utext_clone(NULL
, fInputText
, FALSE
, TRUE
, &status
);
1481 static UBool
compat_SyncMutableUTextContents(UText
*ut
);
1482 static UBool
compat_SyncMutableUTextContents(UText
*ut
) {
1483 UBool retVal
= FALSE
;
1485 // In the following test, we're really only interested in whether the UText should switch
1486 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
1487 // will still point to the correct data.
1488 if (utext_nativeLength(ut
) != ut
->nativeIndexingLimit
) {
1489 UnicodeString
*us
=(UnicodeString
*)ut
->context
;
1491 // Update to the latest length.
1492 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1493 int32_t newLength
= us
->length();
1495 // Update the chunk description.
1496 // The buffer may have switched between stack- and heap-based.
1497 ut
->chunkContents
= us
->getBuffer();
1498 ut
->chunkLength
= newLength
;
1499 ut
->chunkNativeLimit
= newLength
;
1500 ut
->nativeIndexingLimit
= newLength
;
1507 //--------------------------------------------------------------------------------
1511 //--------------------------------------------------------------------------------
1512 UBool
RegexMatcher::lookingAt(UErrorCode
&status
) {
1513 if (U_FAILURE(status
)) {
1516 if (U_FAILURE(fDeferredStatus
)) {
1517 status
= fDeferredStatus
;
1521 if (fInputUniStrMaybeMutable
) {
1522 if (compat_SyncMutableUTextContents(fInputText
)) {
1523 fInputLength
= utext_nativeLength(fInputText
);
1528 resetPreserveRegion();
1530 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1531 MatchChunkAt((int32_t)fActiveStart
, FALSE
, status
);
1533 MatchAt(fActiveStart
, FALSE
, status
);
1539 UBool
RegexMatcher::lookingAt(int64_t start
, UErrorCode
&status
) {
1540 if (U_FAILURE(status
)) {
1543 if (U_FAILURE(fDeferredStatus
)) {
1544 status
= fDeferredStatus
;
1550 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1554 if (fInputUniStrMaybeMutable
) {
1555 if (compat_SyncMutableUTextContents(fInputText
)) {
1556 fInputLength
= utext_nativeLength(fInputText
);
1561 int64_t nativeStart
;
1562 nativeStart
= start
;
1563 if (nativeStart
< fActiveStart
|| nativeStart
> fActiveLimit
) {
1564 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1568 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1569 MatchChunkAt((int32_t)nativeStart
, FALSE
, status
);
1571 MatchAt(nativeStart
, FALSE
, status
);
1578 //--------------------------------------------------------------------------------
1582 //--------------------------------------------------------------------------------
1583 UBool
RegexMatcher::matches(UErrorCode
&status
) {
1584 if (U_FAILURE(status
)) {
1587 if (U_FAILURE(fDeferredStatus
)) {
1588 status
= fDeferredStatus
;
1592 if (fInputUniStrMaybeMutable
) {
1593 if (compat_SyncMutableUTextContents(fInputText
)) {
1594 fInputLength
= utext_nativeLength(fInputText
);
1599 resetPreserveRegion();
1602 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1603 MatchChunkAt((int32_t)fActiveStart
, TRUE
, status
);
1605 MatchAt(fActiveStart
, TRUE
, status
);
1611 UBool
RegexMatcher::matches(int64_t start
, UErrorCode
&status
) {
1612 if (U_FAILURE(status
)) {
1615 if (U_FAILURE(fDeferredStatus
)) {
1616 status
= fDeferredStatus
;
1622 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1626 if (fInputUniStrMaybeMutable
) {
1627 if (compat_SyncMutableUTextContents(fInputText
)) {
1628 fInputLength
= utext_nativeLength(fInputText
);
1633 int64_t nativeStart
;
1634 nativeStart
= start
;
1635 if (nativeStart
< fActiveStart
|| nativeStart
> fActiveLimit
) {
1636 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1640 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1641 MatchChunkAt((int32_t)nativeStart
, TRUE
, status
);
1643 MatchAt(nativeStart
, TRUE
, status
);
1650 //--------------------------------------------------------------------------------
1654 //--------------------------------------------------------------------------------
1655 const RegexPattern
&RegexMatcher::pattern() const {
1661 //--------------------------------------------------------------------------------
1665 //--------------------------------------------------------------------------------
1666 RegexMatcher
&RegexMatcher::region(int64_t regionStart
, int64_t regionLimit
, int64_t startIndex
, UErrorCode
&status
) {
1667 if (U_FAILURE(status
)) {
1671 if (regionStart
>regionLimit
|| regionStart
<0 || regionLimit
<0) {
1672 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1675 int64_t nativeStart
= regionStart
;
1676 int64_t nativeLimit
= regionLimit
;
1677 if (nativeStart
> fInputLength
|| nativeLimit
> fInputLength
) {
1678 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1681 if (startIndex
== -1)
1684 resetPreserveRegion();
1686 fRegionStart
= nativeStart
;
1687 fRegionLimit
= nativeLimit
;
1688 fActiveStart
= nativeStart
;
1689 fActiveLimit
= nativeLimit
;
1691 if (startIndex
!= -1) {
1692 if (startIndex
< fActiveStart
|| startIndex
> fActiveLimit
) {
1693 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1695 fMatchEnd
= startIndex
;
1698 if (!fTransparentBounds
) {
1699 fLookStart
= nativeStart
;
1700 fLookLimit
= nativeLimit
;
1702 if (fAnchoringBounds
) {
1703 fAnchorStart
= nativeStart
;
1704 fAnchorLimit
= nativeLimit
;
1709 RegexMatcher
&RegexMatcher::region(int64_t start
, int64_t limit
, UErrorCode
&status
) {
1710 return region(start
, limit
, -1, status
);
1713 //--------------------------------------------------------------------------------
1717 //--------------------------------------------------------------------------------
1718 int32_t RegexMatcher::regionEnd() const {
1719 return (int32_t)fRegionLimit
;
1722 int64_t RegexMatcher::regionEnd64() const {
1723 return fRegionLimit
;
1726 //--------------------------------------------------------------------------------
1730 //--------------------------------------------------------------------------------
1731 int32_t RegexMatcher::regionStart() const {
1732 return (int32_t)fRegionStart
;
1735 int64_t RegexMatcher::regionStart64() const {
1736 return fRegionStart
;
1740 //--------------------------------------------------------------------------------
1744 //--------------------------------------------------------------------------------
1745 UnicodeString
RegexMatcher::replaceAll(const UnicodeString
&replacement
, UErrorCode
&status
) {
1746 UText replacementText
= UTEXT_INITIALIZER
;
1747 UText resultText
= UTEXT_INITIALIZER
;
1748 UnicodeString resultString
;
1749 if (U_FAILURE(status
)) {
1750 return resultString
;
1753 utext_openConstUnicodeString(&replacementText
, &replacement
, &status
);
1754 utext_openUnicodeString(&resultText
, &resultString
, &status
);
1756 replaceAll(&replacementText
, &resultText
, status
);
1758 utext_close(&resultText
);
1759 utext_close(&replacementText
);
1761 return resultString
;
1766 // replaceAll, UText mode
1768 UText
*RegexMatcher::replaceAll(UText
*replacement
, UText
*dest
, UErrorCode
&status
) {
1769 if (U_FAILURE(status
)) {
1772 if (U_FAILURE(fDeferredStatus
)) {
1773 status
= fDeferredStatus
;
1778 UnicodeString emptyString
;
1779 UText empty
= UTEXT_INITIALIZER
;
1781 utext_openUnicodeString(&empty
, &emptyString
, &status
);
1782 dest
= utext_clone(NULL
, &empty
, TRUE
, FALSE
, &status
);
1783 utext_close(&empty
);
1786 if (U_SUCCESS(status
)) {
1789 appendReplacement(dest
, replacement
, status
);
1790 if (U_FAILURE(status
)) {
1794 appendTail(dest
, status
);
1801 //--------------------------------------------------------------------------------
1805 //--------------------------------------------------------------------------------
1806 UnicodeString
RegexMatcher::replaceFirst(const UnicodeString
&replacement
, UErrorCode
&status
) {
1807 UText replacementText
= UTEXT_INITIALIZER
;
1808 UText resultText
= UTEXT_INITIALIZER
;
1809 UnicodeString resultString
;
1811 utext_openConstUnicodeString(&replacementText
, &replacement
, &status
);
1812 utext_openUnicodeString(&resultText
, &resultString
, &status
);
1814 replaceFirst(&replacementText
, &resultText
, status
);
1816 utext_close(&resultText
);
1817 utext_close(&replacementText
);
1819 return resultString
;
1823 // replaceFirst, UText mode
1825 UText
*RegexMatcher::replaceFirst(UText
*replacement
, UText
*dest
, UErrorCode
&status
) {
1826 if (U_FAILURE(status
)) {
1829 if (U_FAILURE(fDeferredStatus
)) {
1830 status
= fDeferredStatus
;
1836 return getInput(dest
, status
);
1840 UnicodeString emptyString
;
1841 UText empty
= UTEXT_INITIALIZER
;
1843 utext_openUnicodeString(&empty
, &emptyString
, &status
);
1844 dest
= utext_clone(NULL
, &empty
, TRUE
, FALSE
, &status
);
1845 utext_close(&empty
);
1848 appendReplacement(dest
, replacement
, status
);
1849 appendTail(dest
, status
);
1855 //--------------------------------------------------------------------------------
1859 //--------------------------------------------------------------------------------
1860 UBool
RegexMatcher::requireEnd() const {
1865 //--------------------------------------------------------------------------------
1869 //--------------------------------------------------------------------------------
1870 RegexMatcher
&RegexMatcher::reset() {
1872 fRegionLimit
= fInputLength
;
1874 fActiveLimit
= fInputLength
;
1876 fAnchorLimit
= fInputLength
;
1878 fLookLimit
= fInputLength
;
1879 resetPreserveRegion();
1885 void RegexMatcher::resetPreserveRegion() {
1889 fAppendPosition
= 0;
1892 fRequireEnd
= FALSE
;
1894 fTickCounter
= TIMER_INITIAL_VALUE
;
1895 //resetStack(); // more expensive than it looks...
1899 RegexMatcher
&RegexMatcher::reset(const UnicodeString
&input
) {
1900 fInputText
= utext_openConstUnicodeString(fInputText
, &input
, &fDeferredStatus
);
1901 if (fPattern
->fNeedsAltInput
) {
1902 fAltInputText
= utext_clone(fAltInputText
, fInputText
, FALSE
, TRUE
, &fDeferredStatus
);
1904 fInputLength
= utext_nativeLength(fInputText
);
1910 // Do the following for any UnicodeString.
1911 // This is for compatibility for those clients who modify the input string "live" during regex operations.
1912 fInputUniStrMaybeMutable
= TRUE
;
1914 if (fWordBreakItr
!= NULL
) {
1915 #if UCONFIG_NO_BREAK_ITERATION==0
1916 UErrorCode status
= U_ZERO_ERROR
;
1917 fWordBreakItr
->setText(fInputText
, status
);
1924 RegexMatcher
&RegexMatcher::reset(UText
*input
) {
1925 if (fInputText
!= input
) {
1926 fInputText
= utext_clone(fInputText
, input
, FALSE
, TRUE
, &fDeferredStatus
);
1927 if (fPattern
->fNeedsAltInput
) fAltInputText
= utext_clone(fAltInputText
, fInputText
, FALSE
, TRUE
, &fDeferredStatus
);
1928 fInputLength
= utext_nativeLength(fInputText
);
1933 if (fWordBreakItr
!= NULL
) {
1934 #if UCONFIG_NO_BREAK_ITERATION==0
1935 UErrorCode status
= U_ZERO_ERROR
;
1936 fWordBreakItr
->setText(input
, status
);
1941 fInputUniStrMaybeMutable
= FALSE
;
1946 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1947 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1951 RegexMatcher
&RegexMatcher::reset(int64_t position
, UErrorCode
&status
) {
1952 if (U_FAILURE(status
)) {
1955 reset(); // Reset also resets the region to be the entire string.
1957 if (position
< 0 || position
> fActiveLimit
) {
1958 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1961 fMatchEnd
= position
;
1969 //--------------------------------------------------------------------------------
1973 //--------------------------------------------------------------------------------
1974 void RegexMatcher::setTrace(UBool state
) {
1975 fTraceDebug
= state
;
1980 //---------------------------------------------------------------------
1984 //---------------------------------------------------------------------
1985 int32_t RegexMatcher::split(const UnicodeString
&input
,
1986 UnicodeString dest
[],
1987 int32_t destCapacity
,
1990 UText inputText
= UTEXT_INITIALIZER
;
1991 utext_openConstUnicodeString(&inputText
, &input
, &status
);
1992 if (U_FAILURE(status
)) {
1996 UText
**destText
= (UText
**)uprv_malloc(sizeof(UText
*)*destCapacity
);
1997 if (destText
== NULL
) {
1998 status
= U_MEMORY_ALLOCATION_ERROR
;
2002 for (i
= 0; i
< destCapacity
; i
++) {
2003 destText
[i
] = utext_openUnicodeString(NULL
, &dest
[i
], &status
);
2006 int32_t fieldCount
= split(&inputText
, destText
, destCapacity
, status
);
2008 for (i
= 0; i
< destCapacity
; i
++) {
2009 utext_close(destText
[i
]);
2012 uprv_free(destText
);
2013 utext_close(&inputText
);
2018 // split, UText mode
2020 int32_t RegexMatcher::split(UText
*input
,
2022 int32_t destCapacity
,
2026 // Check arguements for validity
2028 if (U_FAILURE(status
)) {
2032 if (destCapacity
< 1) {
2033 status
= U_ILLEGAL_ARGUMENT_ERROR
;
2038 // Reset for the input text
2041 int64_t nextOutputStringStart
= 0;
2042 if (fActiveLimit
== 0) {
2047 // Loop through the input text, searching for the delimiter pattern
2050 int32_t numCaptureGroups
= fPattern
->fGroupMap
->size();
2052 if (i
>=destCapacity
-1) {
2053 // There is one or zero output string left.
2054 // Fill the last output string with whatever is left from the input, then exit the loop.
2055 // ( i will be == destCapacity if we filled the output array while processing
2056 // capture groups of the delimiter expression, in which case we will discard the
2057 // last capture group saved in favor of the unprocessed remainder of the
2060 if (fActiveLimit
> nextOutputStringStart
) {
2061 if (UTEXT_FULL_TEXT_IN_CHUNK(input
, fInputLength
)) {
2063 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]),
2064 input
->chunkContents
+nextOutputStringStart
,
2065 (int32_t)(fActiveLimit
-nextOutputStringStart
), &status
);
2067 UText remainingText
= UTEXT_INITIALIZER
;
2068 utext_openUChars(&remainingText
, input
->chunkContents
+nextOutputStringStart
,
2069 fActiveLimit
-nextOutputStringStart
, &status
);
2070 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2071 utext_close(&remainingText
);
2074 UErrorCode lengthStatus
= U_ZERO_ERROR
;
2075 int32_t remaining16Length
=
2076 utext_extract(input
, nextOutputStringStart
, fActiveLimit
, NULL
, 0, &lengthStatus
);
2077 UChar
*remainingChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(remaining16Length
+1));
2078 if (remainingChars
== NULL
) {
2079 status
= U_MEMORY_ALLOCATION_ERROR
;
2083 utext_extract(input
, nextOutputStringStart
, fActiveLimit
, remainingChars
, remaining16Length
+1, &status
);
2085 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), remainingChars
, remaining16Length
, &status
);
2087 UText remainingText
= UTEXT_INITIALIZER
;
2088 utext_openUChars(&remainingText
, remainingChars
, remaining16Length
, &status
);
2089 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2090 utext_close(&remainingText
);
2093 uprv_free(remainingChars
);
2099 // We found another delimiter. Move everything from where we started looking
2100 // up until the start of the delimiter into the next output string.
2101 if (UTEXT_FULL_TEXT_IN_CHUNK(input
, fInputLength
)) {
2103 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]),
2104 input
->chunkContents
+nextOutputStringStart
,
2105 (int32_t)(fMatchStart
-nextOutputStringStart
), &status
);
2107 UText remainingText
= UTEXT_INITIALIZER
;
2108 utext_openUChars(&remainingText
, input
->chunkContents
+nextOutputStringStart
,
2109 fMatchStart
-nextOutputStringStart
, &status
);
2110 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2111 utext_close(&remainingText
);
2114 UErrorCode lengthStatus
= U_ZERO_ERROR
;
2115 int32_t remaining16Length
= utext_extract(input
, nextOutputStringStart
, fMatchStart
, NULL
, 0, &lengthStatus
);
2116 UChar
*remainingChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(remaining16Length
+1));
2117 if (remainingChars
== NULL
) {
2118 status
= U_MEMORY_ALLOCATION_ERROR
;
2121 utext_extract(input
, nextOutputStringStart
, fMatchStart
, remainingChars
, remaining16Length
+1, &status
);
2123 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), remainingChars
, remaining16Length
, &status
);
2125 UText remainingText
= UTEXT_INITIALIZER
;
2126 utext_openUChars(&remainingText
, remainingChars
, remaining16Length
, &status
);
2127 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2128 utext_close(&remainingText
);
2131 uprv_free(remainingChars
);
2133 nextOutputStringStart
= fMatchEnd
;
2135 // If the delimiter pattern has capturing parentheses, the captured
2136 // text goes out into the next n destination strings.
2138 UBool lastGroupWasNullUText
= FALSE
;
2139 for (groupNum
=1; groupNum
<=numCaptureGroups
; groupNum
++) {
2140 if (i
==destCapacity
-1) {
2144 lastGroupWasNullUText
= (dest
[i
] == NULL
? TRUE
: FALSE
);
2145 dest
[i
] = group(groupNum
, dest
[i
], status
);
2148 if (nextOutputStringStart
== fActiveLimit
) {
2149 // The delimiter was at the end of the string. We're done.
2151 } else if (i
== destCapacity
-1) {
2152 // We're out of capture groups, and the rest of the string is more important
2153 if (lastGroupWasNullUText
) {
2154 utext_close(dest
[i
]);
2162 // We ran off the end of the input while looking for the next delimiter.
2163 // All the remaining text goes into the current output string.
2164 if (UTEXT_FULL_TEXT_IN_CHUNK(input
, fInputLength
)) {
2166 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]),
2167 input
->chunkContents
+nextOutputStringStart
,
2168 (int32_t)(fActiveLimit
-nextOutputStringStart
), &status
);
2170 UText remainingText
= UTEXT_INITIALIZER
;
2171 utext_openUChars(&remainingText
, input
->chunkContents
+nextOutputStringStart
,
2172 fActiveLimit
-nextOutputStringStart
, &status
);
2173 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2174 utext_close(&remainingText
);
2177 UErrorCode lengthStatus
= U_ZERO_ERROR
;
2178 int32_t remaining16Length
= utext_extract(input
, nextOutputStringStart
, fActiveLimit
, NULL
, 0, &lengthStatus
);
2179 UChar
*remainingChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(remaining16Length
+1));
2180 if (remainingChars
== NULL
) {
2181 status
= U_MEMORY_ALLOCATION_ERROR
;
2185 utext_extract(input
, nextOutputStringStart
, fActiveLimit
, remainingChars
, remaining16Length
+1, &status
);
2187 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), remainingChars
, remaining16Length
, &status
);
2189 UText remainingText
= UTEXT_INITIALIZER
;
2190 utext_openUChars(&remainingText
, remainingChars
, remaining16Length
, &status
);
2191 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2192 utext_close(&remainingText
);
2195 uprv_free(remainingChars
);
2199 if (U_FAILURE(status
)) {
2202 } // end of for loop
2207 //--------------------------------------------------------------------------------
2211 //--------------------------------------------------------------------------------
2212 int32_t RegexMatcher::start(UErrorCode
&status
) const {
2213 return start(0, status
);
2216 int64_t RegexMatcher::start64(UErrorCode
&status
) const {
2217 return start64(0, status
);
2220 //--------------------------------------------------------------------------------
2222 // start(int32_t group, UErrorCode &status)
2224 //--------------------------------------------------------------------------------
2226 int64_t RegexMatcher::start64(int32_t group
, UErrorCode
&status
) const {
2227 if (U_FAILURE(status
)) {
2230 if (U_FAILURE(fDeferredStatus
)) {
2231 status
= fDeferredStatus
;
2234 if (fMatch
== FALSE
) {
2235 status
= U_REGEX_INVALID_STATE
;
2238 if (group
< 0 || group
> fPattern
->fGroupMap
->size()) {
2239 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
2246 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(group
-1);
2247 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
2248 U_ASSERT(groupOffset
>= 0);
2249 s
= fFrame
->fExtra
[groupOffset
];
2256 int32_t RegexMatcher::start(int32_t group
, UErrorCode
&status
) const {
2257 return (int32_t)start64(group
, status
);
2260 //--------------------------------------------------------------------------------
2262 // useAnchoringBounds
2264 //--------------------------------------------------------------------------------
2265 RegexMatcher
&RegexMatcher::useAnchoringBounds(UBool b
) {
2266 fAnchoringBounds
= b
;
2267 fAnchorStart
= (fAnchoringBounds
? fRegionStart
: 0);
2268 fAnchorLimit
= (fAnchoringBounds
? fRegionLimit
: fInputLength
);
2273 //--------------------------------------------------------------------------------
2275 // useTransparentBounds
2277 //--------------------------------------------------------------------------------
2278 RegexMatcher
&RegexMatcher::useTransparentBounds(UBool b
) {
2279 fTransparentBounds
= b
;
2280 fLookStart
= (fTransparentBounds
? 0 : fRegionStart
);
2281 fLookLimit
= (fTransparentBounds
? fInputLength
: fRegionLimit
);
2285 //--------------------------------------------------------------------------------
2289 //--------------------------------------------------------------------------------
2290 void RegexMatcher::setTimeLimit(int32_t limit
, UErrorCode
&status
) {
2291 if (U_FAILURE(status
)) {
2294 if (U_FAILURE(fDeferredStatus
)) {
2295 status
= fDeferredStatus
;
2299 status
= U_ILLEGAL_ARGUMENT_ERROR
;
2306 //--------------------------------------------------------------------------------
2310 //--------------------------------------------------------------------------------
2311 int32_t RegexMatcher::getTimeLimit() const {
2316 //--------------------------------------------------------------------------------
2320 //--------------------------------------------------------------------------------
2321 void RegexMatcher::setStackLimit(int32_t limit
, UErrorCode
&status
) {
2322 if (U_FAILURE(status
)) {
2325 if (U_FAILURE(fDeferredStatus
)) {
2326 status
= fDeferredStatus
;
2330 status
= U_ILLEGAL_ARGUMENT_ERROR
;
2334 // Reset the matcher. This is needed here in case there is a current match
2335 // whose final stack frame (containing the match results, pointed to by fFrame)
2336 // would be lost by resizing to a smaller stack size.
2340 // Unlimited stack expansion
2341 fStack
->setMaxCapacity(0);
2343 // Change the units of the limit from bytes to ints, and bump the size up
2344 // to be big enough to hold at least one stack frame for the pattern,
2345 // if it isn't there already.
2346 int32_t adjustedLimit
= limit
/ sizeof(int32_t);
2347 if (adjustedLimit
< fPattern
->fFrameSize
) {
2348 adjustedLimit
= fPattern
->fFrameSize
;
2350 fStack
->setMaxCapacity(adjustedLimit
);
2352 fStackLimit
= limit
;
2356 //--------------------------------------------------------------------------------
2360 //--------------------------------------------------------------------------------
2361 int32_t RegexMatcher::getStackLimit() const {
2366 //--------------------------------------------------------------------------------
2370 //--------------------------------------------------------------------------------
2371 void RegexMatcher::setMatchCallback(URegexMatchCallback
*callback
,
2372 const void *context
,
2373 UErrorCode
&status
) {
2374 if (U_FAILURE(status
)) {
2377 fCallbackFn
= callback
;
2378 fCallbackContext
= context
;
2382 //--------------------------------------------------------------------------------
2386 //--------------------------------------------------------------------------------
2387 void RegexMatcher::getMatchCallback(URegexMatchCallback
*&callback
,
2388 const void *&context
,
2389 UErrorCode
&status
) {
2390 if (U_FAILURE(status
)) {
2393 callback
= fCallbackFn
;
2394 context
= fCallbackContext
;
2398 //--------------------------------------------------------------------------------
2402 //--------------------------------------------------------------------------------
2403 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback
*callback
,
2404 const void *context
,
2405 UErrorCode
&status
) {
2406 if (U_FAILURE(status
)) {
2409 fFindProgressCallbackFn
= callback
;
2410 fFindProgressCallbackContext
= context
;
2414 //--------------------------------------------------------------------------------
2418 //--------------------------------------------------------------------------------
2419 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback
*&callback
,
2420 const void *&context
,
2421 UErrorCode
&status
) {
2422 if (U_FAILURE(status
)) {
2425 callback
= fFindProgressCallbackFn
;
2426 context
= fFindProgressCallbackContext
;
2430 //================================================================================
2432 // Code following this point in this file is the internal
2433 // Match Engine Implementation.
2435 //================================================================================
2438 //--------------------------------------------------------------------------------
2441 // Discard any previous contents of the state save stack, and initialize a
2442 // new stack frame to all -1. The -1s are needed for capture group limits,
2443 // where they indicate that a group has not yet matched anything.
2444 //--------------------------------------------------------------------------------
2445 REStackFrame
*RegexMatcher::resetStack() {
2446 // Discard any previous contents of the state save stack, and initialize a
2447 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2448 // where they indicate that a group has not yet matched anything.
2449 fStack
->removeAllElements();
2451 REStackFrame
*iFrame
= (REStackFrame
*)fStack
->reserveBlock(fPattern
->fFrameSize
, fDeferredStatus
);
2453 for (i
=0; i
<fPattern
->fFrameSize
-RESTACKFRAME_HDRCOUNT
; i
++) {
2454 iFrame
->fExtra
[i
] = -1;
2461 //--------------------------------------------------------------------------------
2464 // in perl, "xab..cd..", \b is true at positions 0,3,5,7
2466 // If the current char is a combining mark,
2468 // Else Scan backwards to the first non-combining char.
2469 // We are at a boundary if this char and the original char are
2470 // opposite in membership in the \w set
2472 // parameters: pos - the current position in the input buffer
2474 // TODO: double-check edge cases at region boundaries.
2476 //--------------------------------------------------------------------------------
2477 UBool
RegexMatcher::isWordBoundary(int64_t pos
) {
2478 UBool isBoundary
= FALSE
;
2479 UBool cIsWord
= FALSE
;
2481 if (pos
>= fLookLimit
) {
2484 // Determine whether char c at current position is a member of the word set of chars.
2485 // If we're off the end of the string, behave as though we're not at a word char.
2486 UTEXT_SETNATIVEINDEX(fInputText
, pos
);
2487 UChar32 c
= UTEXT_CURRENT32(fInputText
);
2488 if (u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
) || u_charType(c
) == U_FORMAT_CHAR
) {
2489 // Current char is a combining one. Not a boundary.
2492 cIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(c
);
2495 // Back up until we come to a non-combining char, determine whether
2496 // that char is a word char.
2497 UBool prevCIsWord
= FALSE
;
2499 if (UTEXT_GETNATIVEINDEX(fInputText
) <= fLookStart
) {
2502 UChar32 prevChar
= UTEXT_PREVIOUS32(fInputText
);
2503 if (!(u_hasBinaryProperty(prevChar
, UCHAR_GRAPHEME_EXTEND
)
2504 || u_charType(prevChar
) == U_FORMAT_CHAR
)) {
2505 prevCIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(prevChar
);
2509 isBoundary
= cIsWord
^ prevCIsWord
;
2513 UBool
RegexMatcher::isChunkWordBoundary(int32_t pos
) {
2514 UBool isBoundary
= FALSE
;
2515 UBool cIsWord
= FALSE
;
2517 const UChar
*inputBuf
= fInputText
->chunkContents
;
2519 if (pos
>= fLookLimit
) {
2522 // Determine whether char c at current position is a member of the word set of chars.
2523 // If we're off the end of the string, behave as though we're not at a word char.
2525 U16_GET(inputBuf
, fLookStart
, pos
, fLookLimit
, c
);
2526 if (u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
) || u_charType(c
) == U_FORMAT_CHAR
) {
2527 // Current char is a combining one. Not a boundary.
2530 cIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(c
);
2533 // Back up until we come to a non-combining char, determine whether
2534 // that char is a word char.
2535 UBool prevCIsWord
= FALSE
;
2537 if (pos
<= fLookStart
) {
2541 U16_PREV(inputBuf
, fLookStart
, pos
, prevChar
);
2542 if (!(u_hasBinaryProperty(prevChar
, UCHAR_GRAPHEME_EXTEND
)
2543 || u_charType(prevChar
) == U_FORMAT_CHAR
)) {
2544 prevCIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(prevChar
);
2548 isBoundary
= cIsWord
^ prevCIsWord
;
2552 //--------------------------------------------------------------------------------
2556 // Test for a word boundary using RBBI word break.
2558 // parameters: pos - the current position in the input buffer
//
//     The word break iterator (fWordBreakItr) is created on first use, with
//     the English locale, and is given fInputText as its text. Failures from
//     creating it or from setText() are recorded in fDeferredStatus; this
//     function has no status parameter of its own.
//
//     When UCONFIG_NO_BREAK_ITERATION is non-zero the iterator code below is
//     compiled out and returnVal keeps its initial value, FALSE.
2560 //--------------------------------------------------------------------------------
2561 UBool
RegexMatcher::isUWordBoundary(int64_t pos
) {
2562 UBool returnVal
= FALSE
;
2563 #if UCONFIG_NO_BREAK_ITERATION==0
2565 // If we haven't yet created a break iterator for this matcher, do it now.
2566 if (fWordBreakItr
== NULL
) {
2568 (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus
);
// Creation failed: the error stays in fDeferredStatus.
2569 if (U_FAILURE(fDeferredStatus
)) {
2572 fWordBreakItr
->setText(fInputText
, fDeferredStatus
);
// At or beyond the look-ahead limit the position is always reported as a boundary.
2575 if (pos
>= fLookLimit
) {
2577 returnVal
= TRUE
; // With Unicode word rules, only positions within the interior of "real"
2578 // words are not boundaries. All non-word chars stand by themselves,
2579 // with word boundaries on both sides.
// For a UText whose native indexes are not UTF-16 offsets, pos is replaced
// by the value utext_extract() returns for the range [0, pos) when called
// with a NULL destination and zero capacity.
// NOTE(review): presumably a pre-flight that yields the UTF-16 length of
// the text before pos; the local status it sets is never examined.
2581 if (!UTEXT_USES_U16(fInputText
)) {
2582 // !!!: Would like a better way to do this!
2583 UErrorCode status
= U_ZERO_ERROR
;
2584 pos
= utext_extract(fInputText
, 0, pos
, NULL
, 0, &status
);
// pos is narrowed from int64_t to int32_t for the break iterator call.
2586 returnVal
= fWordBreakItr
->isBoundary((int32_t)pos
);
2592 //--------------------------------------------------------------------------------
2594 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state
2595 // saves. Increment the "time" counter, and call the
2596 // user callback function if there is one installed.
2598 // If the match operation needs to be aborted, either for a time-out
2599 // or because the user callback asked for it, just set an error status.
2600 // The engine will pick that up and stop in its outer loop.
//
//     status   - in/out error code. Set to U_REGEX_STOPPED_BY_CALLER when the
//                user callback returns FALSE, or to U_REGEX_TIME_OUT when a
//                time limit is in force and has been reached.
2602 //--------------------------------------------------------------------------------
2603 void RegexMatcher::IncrementTime(UErrorCode
&status
) {
// Re-arm the countdown that triggers the next call to this function.
2604 fTickCounter
= TIMER_INITIAL_VALUE
;
// The user callback receives its context pointer and the current time count;
// a FALSE return is a request to abandon the match.
2606 if (fCallbackFn
!= NULL
) {
2607 if ((*fCallbackFn
)(fCallbackContext
, fTime
) == FALSE
) {
2608 status
= U_REGEX_STOPPED_BY_CALLER
;
// A time limit of zero (or less) means no limit is enforced.
2612 if (fTimeLimit
> 0 && fTime
>= fTimeLimit
) {
2613 status
= U_REGEX_TIME_OUT
;
2617 //--------------------------------------------------------------------------------
2619 // ReportFindProgress This function is called once for each advance in the target
2620 // string from the find() function, and calls the user progress callback
2621 // function if there is one installed.
2625 // If the match operation needs to be aborted because the user
2626 // callback asked for it, just set an error status.
2627 // The engine will pick that up and stop in its outer loop.
//
//     matchIndex - position in the input that is passed through to the
//                  progress callback.
//     status     - in/out error code; see the review note below.
//
//     NOTE(review): the text above says an error status is set when the
//     callback asks for an abort, but the code assigns U_ZERO_ERROR and has
//     U_REGEX_STOPPED_BY_CALLER commented out. As written, a callback-requested
//     stop clears status rather than flagging an error - confirm which of the
//     two behaviors is intended.
2629 //--------------------------------------------------------------------------------
2630 UBool
RegexMatcher::ReportFindProgress(int64_t matchIndex
, UErrorCode
&status
) {
// Nothing to report unless a find-progress callback has been installed.
2631 if (fFindProgressCallbackFn
!= NULL
) {
// The callback receives its context pointer and matchIndex; FALSE means "stop".
2632 if ((*fFindProgressCallbackFn
)(fFindProgressCallbackContext
, matchIndex
) == FALSE
) {
2633 status
= U_ZERO_ERROR
/*U_REGEX_STOPPED_BY_CALLER*/;
2640 //--------------------------------------------------------------------------------
2643 // Make a new stack frame, initialized as a copy of the current stack frame.
2644 // Set the pattern index in the original stack frame from the operand value
2645 // in the opcode. Execution of the engine continues with the state in
2646 // the newly created stack frame
2648 // Note that reserveBlock() may grow the stack, resulting in the
2649 // whole thing being relocated in memory.
//
//     Parameters:
2652 // fp The top frame pointer when called. At return, a new
2653 // frame will be present
2654 // savePatIdx An index into the compiled pattern. Goes into the original
2655 // (not new) frame. If execution ever back-tracks out of the
2656 // new frame, this will be where we continue from in the pattern.
//     status      In/out error code. Set to U_REGEX_STACK_OVERFLOW when the
//                 stack cannot supply another frame; may also be set by
//                 IncrementTime() when the tick counter has run out.
//
//     Returns:
2658 // The new frame pointer.
2660 //--------------------------------------------------------------------------------
2661 inline REStackFrame
*RegexMatcher::StateSave(REStackFrame
*fp
, int64_t savePatIdx
, UErrorCode
&status
) {
2662 // push storage for a new frame.
2663 int64_t *newFP
= fStack
->reserveBlock(fFrameSize
, status
);
2664 if (newFP
== NULL
) {
2665 // Failure on attempted stack expansion.
2666 // Stack function set some other error code, change it to a more
2667 // specific one for regular expressions.
2668 status
= U_REGEX_STACK_OVERFLOW
;
2669 // We need to return a writable stack frame, so just return the
2670 // previous frame. The match operation will stop quickly
2671 // because of the error status, after which the frame will never
2672 // be looked at again.
// The incoming fp may be stale if the stack storage moved, so the old top
// frame is re-derived as the fFrameSize words immediately below the new one.
2675 fp
= (REStackFrame
*)(newFP
- fFrameSize
); // in case of realloc of stack.
2677 // New stack frame = copy of old top frame.
2678 int64_t *source
= (int64_t *)fp
;
2679 int64_t *dest
= newFP
;
// Word-by-word copy; it is complete when source reaches the start of the
// new frame, i.e. after fFrameSize words.
2681 *dest
++ = *source
++;
2682 if (source
== newFP
) {
// Once the tick counter has run out, account for elapsed "time" and give the
// user callback / time limit a chance to stop the match.
2688 if (fTickCounter
<= 0) {
2689 IncrementTime(status
); // Re-initializes fTickCounter
// The resume point is recorded in the old frame; the caller continues in the new one.
2691 fp
->fPatIdx
= savePatIdx
;
2692 return (REStackFrame
*)newFP
;
2696 //--------------------------------------------------------------------------------
2698 // MatchAt This is the actual matching engine.
2700 // startIdx: begin matching at this index.
2701 // toEnd: if true, match must extend to end of the input region
2703 //--------------------------------------------------------------------------------
2704 void RegexMatcher::MatchAt(int64_t startIdx
, UBool toEnd
, UErrorCode
&status
) {
2705 UBool isMatch
= FALSE
; // True if the we have a match.
2707 int64_t backSearchIndex
= U_INT64_MAX
; // used after greedy single-character matches for searching backwards
2709 int32_t op
; // Operation from the compiled pattern, split into
2710 int32_t opType
; // the opcode
2711 int32_t opValue
; // and the operand value.
2713 #ifdef REGEX_RUN_DEBUG
2716 printf("MatchAt(startIdx=%ld)\n", startIdx
);
2717 printf("Original Pattern: ");
2718 UChar32 c
= utext_next32From(fPattern
->fPattern
, 0);
2719 while (c
!= U_SENTINEL
) {
2720 if (c
<32 || c
>256) {
2723 REGEX_DUMP_DEBUG_PRINTF(("%c", c
));
2725 c
= UTEXT_NEXT32(fPattern
->fPattern
);
2728 printf("Input String: ");
2729 c
= utext_next32From(fInputText
, 0);
2730 while (c
!= U_SENTINEL
) {
2731 if (c
<32 || c
>256) {
2736 c
= UTEXT_NEXT32(fInputText
);
2743 if (U_FAILURE(status
)) {
2747 // Cache frequently referenced items from the compiled pattern
2749 int64_t *pat
= fPattern
->fCompiledPat
->getBuffer();
2751 const UChar
*litText
= fPattern
->fLiteralText
.getBuffer();
2752 UVector
*sets
= fPattern
->fSets
;
2754 fFrameSize
= fPattern
->fFrameSize
;
2755 REStackFrame
*fp
= resetStack();
2758 fp
->fInputIdx
= startIdx
;
2760 // Zero out the pattern's static data
2762 for (i
= 0; i
<fPattern
->fDataSize
; i
++) {
2767 // Main loop for interpreting the compiled pattern.
2768 // One iteration of the loop per pattern operation performed.
2772 if (_heapchk() != _HEAPOK
) {
2773 fprintf(stderr
, "Heap Trouble\n");
2777 op
= (int32_t)pat
[fp
->fPatIdx
];
2778 opType
= URX_TYPE(op
);
2779 opValue
= URX_VAL(op
);
2780 #ifdef REGEX_RUN_DEBUG
2782 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2783 printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp
->fInputIdx
,
2784 UTEXT_CURRENT32(fInputText
), (int64_t *)fp
-fStack
->getBuffer(), fActiveLimit
);
2785 fPattern
->dumpOp(fp
->fPatIdx
);
2798 // Force a backtrack. In some circumstances, the pattern compiler
2799 // will notice that the pattern can't possibly match anything, and will
2800 // emit one of these at that point.
2801 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2806 if (fp
->fInputIdx
< fActiveLimit
) {
2807 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2808 UChar32 c
= UTEXT_NEXT32(fInputText
);
2810 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
2817 #ifdef REGEX_SMART_BACKTRACKING
2818 if (fp
->fInputIdx
> backSearchIndex
&& fStack
->size() > fFrameSize
) {
2819 REStackFrame
*prevFrame
= (REStackFrame
*)fStack
->peekFrame(fFrameSize
);
2820 if (URX_LOOP_C
== URX_TYPE(pat
[prevFrame
->fPatIdx
]) && fp
->fInputIdx
<= prevFrame
->fInputIdx
) {
2821 UBool success
= FALSE
;
2822 UChar32 c
= UTEXT_PREVIOUS32(fInputText
);
2823 while (UTEXT_GETNATIVEINDEX(fInputText
) >= backSearchIndex
) {
2827 } else if (c
== U_SENTINEL
) {
2830 c
= UTEXT_PREVIOUS32(fInputText
);
2834 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2835 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
2836 if (fp
->fInputIdx
> backSearchIndex
) {
2837 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
2839 fp
->fPatIdx
++; // Skip the LOOP_C, we just did that
2846 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2852 // Test input against a literal string.
2853 // Strings require two slots in the compiled pattern, one for the
2854 // offset to the string text, and one for the length.
2855 int32_t stringStartIdx
= opValue
;
2858 op
= (int32_t)pat
[fp
->fPatIdx
]; // Fetch the second operand
2860 opType
= URX_TYPE(op
);
2861 stringLen
= URX_VAL(op
);
2862 U_ASSERT(opType
== URX_STRING_LEN
);
2863 U_ASSERT(stringLen
>= 2);
2865 const UChar
*patternChars
= litText
+stringStartIdx
;
2866 const UChar
*patternEnd
= patternChars
+stringLen
;
2868 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2870 UBool success
= TRUE
;
2872 while (patternChars
< patternEnd
&& success
) {
2873 c
= UTEXT_NEXT32(fInputText
);
2875 if (c
!= U_SENTINEL
&& UTEXT_GETNATIVEINDEX(fInputText
) <= fActiveLimit
) {
2877 success
= (*patternChars
== c
);
2879 } else if (patternChars
+1 < patternEnd
) {
2880 success
= (*patternChars
== U16_LEAD(c
) && *(patternChars
+1) == U16_TRAIL(c
));
2885 fHitEnd
= TRUE
; // TODO: See ticket 6074
2890 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
2892 #ifdef REGEX_SMART_BACKTRACKING
2893 if (fp
->fInputIdx
> backSearchIndex
&& fStack
->size()) {
2894 REStackFrame
*prevFrame
= (REStackFrame
*)fStack
->peekFrame(fFrameSize
);
2895 if (URX_LOOP_C
== URX_TYPE(pat
[prevFrame
->fPatIdx
]) && fp
->fInputIdx
<= prevFrame
->fInputIdx
) {
2896 // Reset to last start point
2897 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2898 patternChars
= litText
+stringStartIdx
;
2900 // Search backwards for a possible start
2902 c
= UTEXT_PREVIOUS32(fInputText
);
2903 if (c
== U_SENTINEL
) {
2905 } else if ((U_IS_BMP(c
) && *patternChars
== c
) ||
2906 (*patternChars
== U16_LEAD(c
) && *(patternChars
+1) == U16_TRAIL(c
))) {
2910 } while (UTEXT_GETNATIVEINDEX(fInputText
) >= backSearchIndex
);
2914 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2915 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
2916 if (fp
->fInputIdx
> backSearchIndex
) {
2917 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
2919 fp
->fPatIdx
++; // Skip the LOOP_C, we just did that
2925 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2931 case URX_STATE_SAVE
:
2932 fp
= StateSave(fp
, opValue
, status
);
2937 // The match loop will exit via this path on a successful match,
2938 // when we reach the end of the pattern.
2939 if (toEnd
&& fp
->fInputIdx
!= fActiveLimit
) {
2940 // The pattern matched, but not to the end of input. Try some more.
2941 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2947 // Start and End Capture stack frame variables are laid out like this:
2948 // fp->fExtra[opValue] - The start of a completed capture group
2949 // opValue+1 - The end of a completed capture group
2950 // opValue+2 - the start of a capture group whose end
2951 // has not yet been reached (and might not ever be).
2952 case URX_START_CAPTURE
:
2953 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
2954 fp
->fExtra
[opValue
+2] = fp
->fInputIdx
;
2958 case URX_END_CAPTURE
:
2959 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
2960 U_ASSERT(fp
->fExtra
[opValue
+2] >= 0); // Start pos for this group must be set.
2961 fp
->fExtra
[opValue
] = fp
->fExtra
[opValue
+2]; // Tentative start becomes real.
2962 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // End position
2963 U_ASSERT(fp
->fExtra
[opValue
] <= fp
->fExtra
[opValue
+1]);
2967 case URX_DOLLAR
: // $, test for End of line
2968 // or for position before new line at end of input
2970 if (fp
->fInputIdx
>= fAnchorLimit
) {
2971 // We really are at the end of input. Success.
2977 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2979 // If we are positioned just before a new-line that is located at the
2980 // end of input, succeed.
2981 UChar32 c
= UTEXT_NEXT32(fInputText
);
2982 if (UTEXT_GETNATIVEINDEX(fInputText
) >= fAnchorLimit
) {
2983 if ((c
>=0x0a && c
<=0x0d) || c
==0x85 || c
==0x2028 || c
==0x2029) {
2984 // If not in the middle of a CR/LF sequence
2985 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& (UTEXT_PREVIOUS32(fInputText
), UTEXT_PREVIOUS32(fInputText
))==0x0d)) {
2986 // At new-line at end of input. Success
2994 UChar32 nextC
= UTEXT_NEXT32(fInputText
);
2995 if (c
== 0x0d && nextC
== 0x0a && UTEXT_GETNATIVEINDEX(fInputText
) >= fAnchorLimit
) {
2998 break; // At CR/LF at end of input. Success
3002 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3007 case URX_DOLLAR_D
: // $, test for End of Line, in UNIX_LINES mode.
3008 if (fp
->fInputIdx
>= fAnchorLimit
) {
3009 // Off the end of input. Success.
3014 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3015 UChar32 c
= UTEXT_NEXT32(fInputText
);
3016 // Either at the last character of input, or off the end.
3017 if (c
== 0x0a && UTEXT_GETNATIVEINDEX(fInputText
) == fAnchorLimit
) {
3024 // Not at end of input. Back-track out.
3025 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3029 case URX_DOLLAR_M
: // $, test for End of line in multi-line mode
3031 if (fp
->fInputIdx
>= fAnchorLimit
) {
3032 // We really are at the end of input. Success.
3037 // If we are positioned just before a new-line, succeed.
3038 // It makes no difference where the new-line is within the input.
3039 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3040 UChar32 c
= UTEXT_CURRENT32(fInputText
);
3041 if ((c
>=0x0a && c
<=0x0d) || c
==0x85 ||c
==0x2028 || c
==0x2029) {
3042 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
3043 // In multi-line mode, hitting a new-line just before the end of input does not
3044 // set the hitEnd or requireEnd flags
3045 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& UTEXT_PREVIOUS32(fInputText
)==0x0d)) {
3049 // not at a new line. Fail.
3050 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3055 case URX_DOLLAR_MD
: // $, test for End of line in multi-line and UNIX_LINES mode
3057 if (fp
->fInputIdx
>= fAnchorLimit
) {
3058 // We really are at the end of input. Success.
3060 fRequireEnd
= TRUE
; // Java set requireEnd in this case, even though
3061 break; // adding a new-line would not lose the match.
3063 // If we are not positioned just before a new-line, the test fails; backtrack out.
3064 // It makes no difference where the new-line is within the input.
3065 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3066 if (UTEXT_CURRENT32(fInputText
) != 0x0a) {
3067 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3073 case URX_CARET
: // ^, test for start of line
3074 if (fp
->fInputIdx
!= fAnchorStart
) {
3075 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3080 case URX_CARET_M
: // ^, test for start of line in mulit-line mode
3082 if (fp
->fInputIdx
== fAnchorStart
) {
3083 // We are at the start input. Success.
3086 // Check whether character just before the current pos is a new-line
3087 // unless we are at the end of input
3088 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3089 UChar32 c
= UTEXT_PREVIOUS32(fInputText
);
3090 if ((fp
->fInputIdx
< fAnchorLimit
) &&
3091 ((c
<=0x0d && c
>=0x0a) || c
==0x85 ||c
==0x2028 || c
==0x2029)) {
3092 // It's a new-line. ^ is true. Success.
3093 // TODO: what should be done with positions between a CR and LF?
3096 // Not at the start of a line. Fail.
3097 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3102 case URX_CARET_M_UNIX
: // ^, test for start of line in mulit-line + Unix-line mode
3104 U_ASSERT(fp
->fInputIdx
>= fAnchorStart
);
3105 if (fp
->fInputIdx
<= fAnchorStart
) {
3106 // We are at the start input. Success.
3109 // Check whether character just before the current pos is a new-line
3110 U_ASSERT(fp
->fInputIdx
<= fAnchorLimit
);
3111 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3112 UChar32 c
= UTEXT_PREVIOUS32(fInputText
);
3114 // Not at the start of a line. Back-track out.
3115 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3120 case URX_BACKSLASH_B
: // Test for word boundaries
3122 UBool success
= isWordBoundary(fp
->fInputIdx
);
3123 success
^= (opValue
!= 0); // flip sense for \B
3125 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3131 case URX_BACKSLASH_BU
: // Test for word boundaries, Unicode-style
3133 UBool success
= isUWordBoundary(fp
->fInputIdx
);
3134 success
^= (opValue
!= 0); // flip sense for \B
3136 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3142 case URX_BACKSLASH_D
: // Test for decimal digit
3144 if (fp
->fInputIdx
>= fActiveLimit
) {
3146 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3150 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3152 UChar32 c
= UTEXT_NEXT32(fInputText
);
3153 int8_t ctype
= u_charType(c
); // TODO: make a unicode set for this. Will be faster.
3154 UBool success
= (ctype
== U_DECIMAL_DIGIT_NUMBER
);
3155 success
^= (opValue
!= 0); // flip sense for \D
3157 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3159 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3165 case URX_BACKSLASH_G
: // Test for position at end of previous match
3166 if (!((fMatch
&& fp
->fInputIdx
==fMatchEnd
) || (fMatch
==FALSE
&& fp
->fInputIdx
==fActiveStart
))) {
3167 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3172 case URX_BACKSLASH_X
:
3173 // Match a Grapheme, as defined by Unicode TR 29.
3174 // Differs slightly from Perl, which consumes combining marks independently
3178 // Fail if at end of input
3179 if (fp
->fInputIdx
>= fActiveLimit
) {
3181 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3185 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3187 // Examine (and consume) the current char.
3188 // Dispatch into a little state machine, based on the char.
3190 c
= UTEXT_NEXT32(fInputText
);
3191 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3192 UnicodeSet
**sets
= fPattern
->fStaticSets
;
3193 if (sets
[URX_GC_NORMAL
]->contains(c
)) goto GC_Extend
;
3194 if (sets
[URX_GC_CONTROL
]->contains(c
)) goto GC_Control
;
3195 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
3196 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
3197 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
3198 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
3199 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
3205 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
3206 c
= UTEXT_NEXT32(fInputText
);
3207 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3208 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
3209 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
3210 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
3211 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
3212 UTEXT_PREVIOUS32(fInputText
);
3213 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3217 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
3218 c
= UTEXT_NEXT32(fInputText
);
3219 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3220 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
3221 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
3222 UTEXT_PREVIOUS32(fInputText
);
3223 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3227 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
3228 c
= UTEXT_NEXT32(fInputText
);
3229 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3230 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
3231 UTEXT_PREVIOUS32(fInputText
);
3232 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3236 // Combining characters are consumed here
3238 if (fp
->fInputIdx
>= fActiveLimit
) {
3241 c
= UTEXT_CURRENT32(fInputText
);
3242 if (sets
[URX_GC_EXTEND
]->contains(c
) == FALSE
) {
3245 UTEXT_NEXT32(fInputText
);
3246 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3251 // Most control chars stand alone (don't combine with combining chars),
3252 // except for that CR/LF sequence is a single grapheme cluster.
3253 if (c
== 0x0d && fp
->fInputIdx
< fActiveLimit
&& UTEXT_CURRENT32(fInputText
) == 0x0a) {
3254 c
= UTEXT_NEXT32(fInputText
);
3255 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3259 if (fp
->fInputIdx
>= fActiveLimit
) {
3268 case URX_BACKSLASH_Z
: // Test for end of Input
3269 if (fp
->fInputIdx
< fAnchorLimit
) {
3270 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3279 case URX_STATIC_SETREF
:
3281 // Test input character against one of the predefined sets
3282 // (Word Characters, for example)
3283 // The high bit of the op value is a flag for the match polarity.
3284 // 0: success if input char is in set.
3285 // 1: success if input char is not in set.
3286 if (fp
->fInputIdx
>= fActiveLimit
) {
3288 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3292 UBool success
= ((opValue
& URX_NEG_SET
) == URX_NEG_SET
);
3293 opValue
&= ~URX_NEG_SET
;
3294 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
3296 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3297 UChar32 c
= UTEXT_NEXT32(fInputText
);
3299 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
3300 if (s8
->contains(c
)) {
3304 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
3305 if (s
->contains(c
)) {
3310 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3312 // the character wasn't in the set.
3313 #ifdef REGEX_SMART_BACKTRACKING
3314 if (fp
->fInputIdx
> backSearchIndex
&& fStack
->size() > fFrameSize
) {
3315 REStackFrame
*prevFrame
= (REStackFrame
*)fStack
->peekFrame(fFrameSize
);
3316 if (URX_LOOP_C
== URX_TYPE(pat
[prevFrame
->fPatIdx
]) && fp
->fInputIdx
<= prevFrame
->fInputIdx
) {
3317 // Try to find it, backwards
3318 UTEXT_PREVIOUS32(fInputText
); // skip the first character we tried
3319 success
= ((opValue
& URX_NEG_SET
) == URX_NEG_SET
); // reset
3321 c
= UTEXT_PREVIOUS32(fInputText
);
3322 if (c
== U_SENTINEL
) {
3324 } else if (c
< 256) {
3325 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
3326 if (s8
->contains(c
)) {
3330 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
3331 if (s
->contains(c
)) {
3335 } while (UTEXT_GETNATIVEINDEX(fInputText
) >= backSearchIndex
&& !success
);
3337 if (success
&& c
!= U_SENTINEL
) {
3338 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3339 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3340 if (fp
->fInputIdx
> backSearchIndex
) {
3341 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
3343 fp
->fPatIdx
++; // Skip the LOOP_C, we just did that
3349 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3355 case URX_STAT_SETREF_N
:
3357 // Test input character for NOT being a member of one of
3358 // the predefined sets (Word Characters, for example)
3359 if (fp
->fInputIdx
>= fActiveLimit
) {
3361 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3365 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
3367 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3369 UChar32 c
= UTEXT_NEXT32(fInputText
);
3371 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
3372 if (s8
->contains(c
) == FALSE
) {
3373 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3377 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
3378 if (s
->contains(c
) == FALSE
) {
3379 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3383 // the character wasn't in the set.
3384 #ifdef REGEX_SMART_BACKTRACKING
3385 if (fp
->fInputIdx
> backSearchIndex
&& fStack
->size() > fFrameSize
) {
3386 REStackFrame
*prevFrame
= (REStackFrame
*)fStack
->peekFrame(fFrameSize
);
3387 if (URX_LOOP_C
== URX_TYPE(pat
[prevFrame
->fPatIdx
]) && fp
->fInputIdx
<= prevFrame
->fInputIdx
) {
3388 // Try to find it, backwards
3389 UTEXT_PREVIOUS32(fInputText
); // skip the first character we tried
3390 UBool success
= FALSE
;
3392 c
= UTEXT_PREVIOUS32(fInputText
);
3393 if (c
== U_SENTINEL
) {
3395 } else if (c
< 256) {
3396 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
3397 if (s8
->contains(c
) == FALSE
) {
3402 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
3403 if (s
->contains(c
) == FALSE
) {
3408 } while (UTEXT_GETNATIVEINDEX(fInputText
) >= backSearchIndex
);
3411 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3412 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3413 if (fp
->fInputIdx
> backSearchIndex
) {
3414 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
3416 fp
->fPatIdx
++; // Skip the LOOP_C, we just did that
3422 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3428 if (fp
->fInputIdx
>= fActiveLimit
) {
3430 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3433 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3435 // There is input left. Pick up one char and test it for set membership.
3436 UChar32 c
= UTEXT_NEXT32(fInputText
);
3437 U_ASSERT(opValue
> 0 && opValue
< sets
->size());
3439 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
3440 if (s8
->contains(c
)) {
3441 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3445 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
3446 if (s
->contains(c
)) {
3447 // The character is in the set. A Match.
3448 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3453 // the character wasn't in the set.
3454 #ifdef REGEX_SMART_BACKTRACKING
3455 if (fp
->fInputIdx
> backSearchIndex
&& fStack
->size() > fFrameSize
) {
3456 REStackFrame
*prevFrame
= (REStackFrame
*)fStack
->peekFrame(fFrameSize
);
3457 if (URX_LOOP_C
== URX_TYPE(pat
[prevFrame
->fPatIdx
]) && fp
->fInputIdx
<= prevFrame
->fInputIdx
) {
3458 // Try to find it, backwards
3459 UTEXT_PREVIOUS32(fInputText
); // skip the first character we tried
3460 UBool success
= FALSE
;
3462 c
= UTEXT_PREVIOUS32(fInputText
);
3463 if (c
== U_SENTINEL
) {
3465 } else if (c
< 256) {
3466 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
3467 if (s8
->contains(c
)) {
3472 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
3473 if (s
->contains(c
)) {
3478 } while (UTEXT_GETNATIVEINDEX(fInputText
) >= backSearchIndex
);
3481 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3482 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3483 if (fp
->fInputIdx
> backSearchIndex
) {
3484 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
3486 fp
->fPatIdx
++; // Skip the LOOP_C, we just did that
3492 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3499 // . matches anything, but stops at end-of-line.
3500 if (fp
->fInputIdx
>= fActiveLimit
) {
3501 // At end of input. Match failed. Backtrack out.
3503 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3507 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3509 // There is input left. Advance over one char, unless we've hit end-of-line
3510 UChar32 c
= UTEXT_NEXT32(fInputText
);
3511 if (((c
& 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
3512 ((c
<=0x0d && c
>=0x0a) || c
==0x85 ||c
==0x2028 || c
==0x2029)) {
3513 // End of line in normal mode. . does not match.
3514 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3517 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3522 case URX_DOTANY_ALL
:
3524 // ., in dot-matches-all (including new lines) mode
3525 if (fp
->fInputIdx
>= fActiveLimit
) {
3526 // At end of input. Match failed. Backtrack out.
3528 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3532 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3534 // There is input left. Advance over one char, except if we are
3535 // at a cr/lf, advance over both of them.
3537 c
= UTEXT_NEXT32(fInputText
);
3538 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3539 if (c
==0x0d && fp
->fInputIdx
< fActiveLimit
) {
3540 // In the case of a CR/LF, we need to advance over both.
3541 UChar32 nextc
= UTEXT_CURRENT32(fInputText
);
3542 if (nextc
== 0x0a) {
3543 UTEXT_NEXT32(fInputText
);
3544 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3551 case URX_DOTANY_UNIX
:
3553 // '.' operator, matches all, but stops at end-of-line.
3554 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
3555 if (fp
->fInputIdx
>= fActiveLimit
) {
3556 // At end of input. Match failed. Backtrack out.
3558 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3562 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3564 // There is input left. Advance over one char, unless we've hit end-of-line
3565 UChar32 c
= UTEXT_NEXT32(fInputText
);
3567 // End of line in normal mode. '.' does not match the \n
3568 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3570 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3577 fp
->fPatIdx
= opValue
;
3585 U_ASSERT(opValue
< fPattern
->fCompiledPat
->size());
3586 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
3587 fp
->fPatIdx
= opValue
; // Then JMP.
3591 // This opcode is used with (x)+, when x can match a zero length string.
3592 // Same as JMP_SAV, except conditional on the match having made forward progress.
3593 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3594 // data address of the input position at the start of the loop.
3596 U_ASSERT(opValue
> 0 && opValue
< fPattern
->fCompiledPat
->size());
3597 int32_t stoOp
= (int32_t)pat
[opValue
-1];
3598 U_ASSERT(URX_TYPE(stoOp
) == URX_STO_INP_LOC
);
3599 int32_t frameLoc
= URX_VAL(stoOp
);
3600 U_ASSERT(frameLoc
>= 0 && frameLoc
< fFrameSize
);
3601 int64_t prevInputIdx
= fp
->fExtra
[frameLoc
];
3602 U_ASSERT(prevInputIdx
<= fp
->fInputIdx
);
3603 if (prevInputIdx
< fp
->fInputIdx
) {
3604 // The match did make progress. Repeat the loop.
3605 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
3606 fp
->fPatIdx
= opValue
;
3607 fp
->fExtra
[frameLoc
] = fp
->fInputIdx
;
3609 // If the input position did not advance, we do nothing here,
3610 // execution will fall out of the loop.
3616 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
3617 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
3619 // Pick up the three extra operands that CTR_INIT has, and
3620 // skip the pattern location counter past
3621 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
3623 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
3624 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
3625 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
3626 U_ASSERT(minCount
>=0);
3627 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
3628 U_ASSERT(loopLoc
>fp
->fPatIdx
);
3630 if (minCount
== 0) {
3631 fp
= StateSave(fp
, loopLoc
+1, status
);
3633 if (maxCount
== 0) {
3634 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3641 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
3642 int32_t initOp
= (int32_t)pat
[opValue
];
3643 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT
);
3644 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
3645 int32_t minCount
= (int32_t)pat
[opValue
+2];
3646 int32_t maxCount
= (int32_t)pat
[opValue
+3];
3647 // Increment the counter. Note: we DIDN'T worry about counter
3648 // overflow, since the data comes from UnicodeStrings, which
3649 // stores its length in an int32_t. Do we have to think about
3650 // this now that we're using UText? Probably not, since the length
3651 // in UChar32s is still an int32_t.
3653 U_ASSERT(*pCounter
> 0);
3654 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
) {
3655 U_ASSERT(*pCounter
== maxCount
|| maxCount
== -1);
3658 if (*pCounter
>= minCount
) {
3659 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
3661 fp
->fPatIdx
= opValue
+ 4; // Loop back.
3665 case URX_CTR_INIT_NG
:
3667 // Initialize a non-greedy loop
3668 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
3669 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
3671 // Pick up the three extra operands that CTR_INIT has, and
3672 // skip the pattern location counter past
3673 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
3675 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
3676 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
3677 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
3678 U_ASSERT(minCount
>=0);
3679 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
3680 U_ASSERT(loopLoc
>fp
->fPatIdx
);
3682 if (minCount
== 0) {
3683 if (maxCount
!= 0) {
3684 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
3686 fp
->fPatIdx
= loopLoc
+1; // Continue with stuff after repeated block
3691 case URX_CTR_LOOP_NG
:
3693 // Non-greedy {min, max} loops
3694 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
3695 int32_t initOp
= (int32_t)pat
[opValue
];
3696 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT_NG
);
3697 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
3698 int32_t minCount
= (int32_t)pat
[opValue
+2];
3699 int32_t maxCount
= (int32_t)pat
[opValue
+3];
3700 // Increment the counter. Note: we DIDN'T worry about counter
3701 // overflow, since the data comes from UnicodeStrings, which
3702 // stores its length in an int32_t. Do we have to think about
3703 // this now that we're using UText? Probably not, since the length
3704 // in UChar32s is still an int32_t.
3706 U_ASSERT(*pCounter
> 0);
3708 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
) {
3709 // The loop has matched the maximum permitted number of times.
3710 // Break out of here with no action. Matching will
3711 // continue with the following pattern.
3712 U_ASSERT(*pCounter
== maxCount
|| maxCount
== -1);
3716 if (*pCounter
< minCount
) {
3717 // We haven't met the minimum number of matches yet.
3718 // Loop back for another one.
3719 fp
->fPatIdx
= opValue
+ 4; // Loop back.
3721 // We do have the minimum number of matches.
3722 // Fall into the following pattern, but first do
3723 // a state save to the top of the loop, so that a failure
3724 // in the following pattern will try another iteration of the loop.
3725 fp
= StateSave(fp
, opValue
+ 4, status
);
3731 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
3732 fData
[opValue
] = fStack
->size();
3737 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
3738 int32_t newStackSize
= (int32_t)fData
[opValue
];
3739 U_ASSERT(newStackSize
<= fStack
->size());
3740 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
3741 if (newFP
== (int64_t *)fp
) {
3745 for (i
=0; i
<fFrameSize
; i
++) {
3746 newFP
[i
] = ((int64_t *)fp
)[i
];
3748 fp
= (REStackFrame
*)newFP
;
3749 fStack
->setSize(newStackSize
);
3756 U_ASSERT(opValue
< fFrameSize
);
3757 int64_t groupStartIdx
= fp
->fExtra
[opValue
];
3758 int64_t groupEndIdx
= fp
->fExtra
[opValue
+1];
3759 U_ASSERT(groupStartIdx
<= groupEndIdx
);
3760 if (groupStartIdx
< 0) {
3761 // This capture group has not participated in the match thus far,
3762 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
3765 if (groupEndIdx
== groupStartIdx
) {
3766 // The capture group match was of an empty string.
3767 // Verified by testing: Perl matches succeed in this case, so
3772 UTEXT_SETNATIVEINDEX(fAltInputText
, groupStartIdx
);
3773 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3775 UBool haveMatch
= (opType
== URX_BACKREF
?
3776 (0 == utext_compareNativeLimit(fAltInputText
, groupEndIdx
, fInputText
, -1)) :
3777 (0 == utext_caseCompareNativeLimit(fAltInputText
, groupEndIdx
, fInputText
, -1, U_FOLD_CASE_DEFAULT
, &status
)));
3778 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3780 if (fp
->fInputIdx
> fActiveLimit
) {
3782 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
3783 } else if (!haveMatch
) {
3784 if (fp
->fInputIdx
== fActiveLimit
) {
3787 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
3792 case URX_STO_INP_LOC
:
3794 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
);
3795 fp
->fExtra
[opValue
] = fp
->fInputIdx
;
3801 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
3803 int32_t dataLoc
= URX_VAL(pat
[instrOperandLoc
]);
3804 U_ASSERT(dataLoc
>= 0 && dataLoc
< fFrameSize
);
3805 int64_t savedInputIdx
= fp
->fExtra
[dataLoc
];
3806 U_ASSERT(savedInputIdx
<= fp
->fInputIdx
);
3807 if (savedInputIdx
< fp
->fInputIdx
) {
3808 fp
->fPatIdx
= opValue
; // JMP
3810 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no progress in loop.
3817 // Entering a lookahead block.
3818 // Save Stack Ptr, Input Pos.
3819 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3820 fData
[opValue
] = fStack
->size();
3821 fData
[opValue
+1] = fp
->fInputIdx
;
3822 fActiveStart
= fLookStart
; // Set the match region change for
3823 fActiveLimit
= fLookLimit
; // transparent bounds.
3829 // Leaving a look-ahead block.
3830 // restore Stack Ptr, Input Pos to positions they had on entry to block.
3831 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3832 int32_t stackSize
= fStack
->size();
3833 int32_t newStackSize
=(int32_t)fData
[opValue
];
3834 U_ASSERT(stackSize
>= newStackSize
);
3835 if (stackSize
> newStackSize
) {
3836 // Copy the current top frame back to the new (cut back) top frame.
3837 // This makes the capture groups from within the look-ahead
3838 // expression available.
3839 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
3841 for (i
=0; i
<fFrameSize
; i
++) {
3842 newFP
[i
] = ((int64_t *)fp
)[i
];
3844 fp
= (REStackFrame
*)newFP
;
3845 fStack
->setSize(newStackSize
);
3847 fp
->fInputIdx
= fData
[opValue
+1];
3849 // Restore the active region bounds in the input string; they may have
3850 // been changed because of transparent bounds on a Region.
3851 fActiveStart
= fRegionStart
;
3852 fActiveLimit
= fRegionLimit
;
3857 if (fp
->fInputIdx
< fActiveLimit
) {
3858 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3860 UChar32 c
= UTEXT_NEXT32(fInputText
);
3861 if (u_foldCase(c
, U_FOLD_CASE_DEFAULT
) == opValue
) {
3862 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3869 #ifdef REGEX_SMART_BACKTRACKING
3870 if (fp
->fInputIdx
> backSearchIndex
&& fStack
->size() > fFrameSize
) {
3871 REStackFrame
*prevFrame
= (REStackFrame
*)fStack
->peekFrame(fFrameSize
);
3872 if (URX_LOOP_C
== URX_TYPE(pat
[prevFrame
->fPatIdx
]) && fp
->fInputIdx
<= prevFrame
->fInputIdx
) {
3873 UBool success
= FALSE
;
3874 UChar32 c
= UTEXT_PREVIOUS32(fInputText
);
3875 while (UTEXT_GETNATIVEINDEX(fInputText
) >= backSearchIndex
) {
3876 if (u_foldCase(c
, U_FOLD_CASE_DEFAULT
) == opValue
) {
3879 } else if (c
== U_SENTINEL
) {
3882 c
= UTEXT_PREVIOUS32(fInputText
);
3886 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3887 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3888 if (fp
->fInputIdx
> backSearchIndex
) {
3889 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
3891 fp
->fPatIdx
++; // Skip the LOOP_C, we just did that
3898 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3903 // Test input against a literal string.
3904 // Strings require two slots in the compiled pattern, one for the
3905 // offset to the string text, and one for the length.
3906 const UCaseProps
*csp
= ucase_getSingleton();
3908 int32_t stringStartIdx
, stringLen
;
3909 stringStartIdx
= opValue
;
3911 op
= (int32_t)pat
[fp
->fPatIdx
];
3913 opType
= URX_TYPE(op
);
3914 opValue
= URX_VAL(op
);
3915 U_ASSERT(opType
== URX_STRING_LEN
);
3916 stringLen
= opValue
;
3918 const UChar
*patternChars
= litText
+stringStartIdx
;
3919 const UChar
*patternEnd
= patternChars
+stringLen
;
3921 const UChar
*foldChars
= NULL
;
3922 int32_t foldOffset
, foldLength
;
3925 foldOffset
= foldLength
= 0;
3926 UBool success
= TRUE
;
3928 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3929 while (patternChars
< patternEnd
&& success
) {
3930 if(foldOffset
< foldLength
) {
3931 U16_NEXT_UNSAFE(foldChars
, foldOffset
, c
);
3933 c
= UTEXT_NEXT32(fInputText
);
3934 if (c
!= U_SENTINEL
) {
3935 foldLength
= ucase_toFullFolding(csp
, c
, &foldChars
, U_FOLD_CASE_DEFAULT
);
3936 if(foldLength
>= 0) {
3937 if(foldLength
<= UCASE_MAX_STRING_LENGTH
) { // !!!: Does not correctly handle chars that fold to 0-length strings
3939 U16_NEXT_UNSAFE(foldChars
, foldOffset
, c
);
3942 foldLength
= foldOffset
; // to avoid reading chars from the folding buffer
3947 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3951 if (c
!= U_SENTINEL
&& (fp
->fInputIdx
<= fActiveLimit
)) {
3953 success
= (*patternChars
== c
);
3955 } else if (patternChars
+1 < patternEnd
) {
3956 success
= (*patternChars
== U16_LEAD(c
) && *(patternChars
+1) == U16_TRAIL(c
));
3960 fHitEnd
= TRUE
; // TODO: See ticket 6074
3965 #ifdef REGEX_SMART_BACKTRACKING
3966 if (fp
->fInputIdx
> backSearchIndex
&& fStack
->size()) {
3967 REStackFrame
*prevFrame
= (REStackFrame
*)fStack
->peekFrame(fFrameSize
);
3968 if (URX_LOOP_C
== URX_TYPE(pat
[prevFrame
->fPatIdx
]) && fp
->fInputIdx
<= prevFrame
->fInputIdx
) {
3969 // Reset to last start point
3970 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3971 patternChars
= litText
+stringStartIdx
;
3973 // Search backwards for a possible start
3975 c
= UTEXT_PREVIOUS32(fInputText
);
3976 if (c
== U_SENTINEL
) {
3979 foldLength
= ucase_toFullFolding(csp
, c
, &foldChars
, U_FOLD_CASE_DEFAULT
);
3980 if(foldLength
>= 0) {
3981 if(foldLength
<= UCASE_MAX_STRING_LENGTH
) { // !!!: Does not correctly handle chars that fold to 0-length strings
3983 U16_NEXT_UNSAFE(foldChars
, foldOffset
, c
);
3986 foldLength
= foldOffset
; // to avoid reading chars from the folding buffer
3990 if ((U_IS_BMP(c
) && *patternChars
== c
) ||
3991 (*patternChars
== U16_LEAD(c
) && *(patternChars
+1) == U16_TRAIL(c
))) {
3996 } while (UTEXT_GETNATIVEINDEX(fInputText
) >= backSearchIndex
);
4000 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4001 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
4002 if (fp
->fInputIdx
> backSearchIndex
) {
4003 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
4005 fp
->fPatIdx
++; // Skip the LOOP_C, we just did that
4011 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4019 // Entering a look-behind block.
4020 // Save Stack Ptr, Input Pos.
4021 // TODO: implement transparent bounds. Ticket #6067
4022 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
4023 fData
[opValue
] = fStack
->size();
4024 fData
[opValue
+1] = fp
->fInputIdx
;
4025 // Init the variable containing the start index for attempted matches.
4026 fData
[opValue
+2] = -1;
4027 // Save input string length, then reset to pin any matches to end at
4028 // the current position.
4029 fData
[opValue
+3] = fActiveLimit
;
4030 fActiveLimit
= fp
->fInputIdx
;
4037 // Positive Look-Behind, at top of loop checking for matches of LB expression
4038 // at all possible input starting positions.
4040 // Fetch the min and max possible match lengths. They are the operands
4041 // of this op in the pattern.
4042 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
4043 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
4044 U_ASSERT(minML
<= maxML
);
4045 U_ASSERT(minML
>= 0);
4047 // Fetch (from data) the last input index where a match was attempted.
4048 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
4049 int64_t *lbStartIdx
= &fData
[opValue
+2];
4050 if (*lbStartIdx
< 0) {
4051 // First time through loop.
4052 *lbStartIdx
= fp
->fInputIdx
- minML
;
4054 // 2nd through nth time through the loop.
4055 // Back up start position for match by one.
4056 if (*lbStartIdx
== 0) {
4059 UTEXT_SETNATIVEINDEX(fInputText
, *lbStartIdx
);
4060 UTEXT_PREVIOUS32(fInputText
);
4061 *lbStartIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
4065 if (*lbStartIdx
< 0 || *lbStartIdx
< fp
->fInputIdx
- maxML
) {
4066 // We have tried all potential match starting points without
4067 // getting a match. Backtrack out, and out of the
4068 // Look Behind altogether.
4069 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4070 int64_t restoreInputLen
= fData
[opValue
+3];
4071 U_ASSERT(restoreInputLen
>= fActiveLimit
);
4072 U_ASSERT(restoreInputLen
<= fInputLength
);
4073 fActiveLimit
= restoreInputLen
;
4077 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4078 // (successful match will fall off the end of the loop.)
4079 fp
= StateSave(fp
, fp
->fPatIdx
-3, status
);
4080 fp
->fInputIdx
= *lbStartIdx
;
4085 // End of a look-behind block, after a successful match.
4087 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
4088 if (fp
->fInputIdx
!= fActiveLimit
) {
4089 // The look-behind expression matched, but the match did not
4090 // extend all the way to the point that we are looking behind from.
4091 // FAIL out of here, which will take us back to the LB_CONT, which
4092 // will retry the match starting at another position or fail
4093 // the look-behind altogether, whichever is appropriate.
4094 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4098 // Look-behind match is good. Restore the original input string length,
4099 // which had been truncated to pin the end of the lookbehind match to the
4100 // position being looked-behind.
4101 int64_t originalInputLen
= fData
[opValue
+3];
4102 U_ASSERT(originalInputLen
>= fActiveLimit
);
4103 U_ASSERT(originalInputLen
<= fInputLength
);
4104 fActiveLimit
= originalInputLen
;
4111 // Negative Look-Behind, at top of loop checking for matches of LB expression
4112 // at all possible input starting positions.
4114 // Fetch the extra parameters of this op.
4115 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
4116 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
4117 int32_t continueLoc
= (int32_t)pat
[fp
->fPatIdx
++];
4118 continueLoc
= URX_VAL(continueLoc
);
4119 U_ASSERT(minML
<= maxML
);
4120 U_ASSERT(minML
>= 0);
4121 U_ASSERT(continueLoc
> fp
->fPatIdx
);
4123 // Fetch (from data) the last input index where a match was attempted.
4124 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
4125 int64_t *lbStartIdx
= &fData
[opValue
+2];
4126 if (*lbStartIdx
< 0) {
4127 // First time through loop.
4128 *lbStartIdx
= fp
->fInputIdx
- minML
;
4130 // 2nd through nth time through the loop.
4131 // Back up start position for match by one.
4132 if (*lbStartIdx
== 0) {
4135 UTEXT_SETNATIVEINDEX(fInputText
, *lbStartIdx
);
4136 UTEXT_PREVIOUS32(fInputText
);
4137 *lbStartIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
4141 if (*lbStartIdx
< 0 || *lbStartIdx
< fp
->fInputIdx
- maxML
) {
4142 // We have tried all potential match starting points without
4143 // getting a match, which means that the negative lookbehind as
4144 // a whole has succeeded. Jump forward to the continue location
4145 int64_t restoreInputLen
= fData
[opValue
+3];
4146 U_ASSERT(restoreInputLen
>= fActiveLimit
);
4147 U_ASSERT(restoreInputLen
<= fInputLength
);
4148 fActiveLimit
= restoreInputLen
;
4149 fp
->fPatIdx
= continueLoc
;
4153 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4154 // (successful match will cause a FAIL out of the loop altogether.)
4155 fp
= StateSave(fp
, fp
->fPatIdx
-4, status
);
4156 fp
->fInputIdx
= *lbStartIdx
;
4161 // End of a negative look-behind block, after a successful match.
4163 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
4164 if (fp
->fInputIdx
!= fActiveLimit
) {
4165 // The look-behind expression matched, but the match did not
4166 // extend all the way to the point that we are looking behind from.
4167 // FAIL out of here, which will take us back to the LB_CONT, which
4168 // will retry the match starting at another position or succeed
4169 // the look-behind altogether, whichever is appropriate.
4170 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4174 // Look-behind expression matched, which means look-behind test as
4177 // Restore the original input string length, which had been truncated
4178 // in order to pin the end of the lookbehind match
4179 // to the position being looked-behind.
4180 int64_t originalInputLen
= fData
[opValue
+3];
4181 U_ASSERT(originalInputLen
>= fActiveLimit
);
4182 U_ASSERT(originalInputLen
<= fInputLength
);
4183 fActiveLimit
= originalInputLen
;
4185 // Restore original stack position, discarding any state saved
4186 // by the successful pattern match.
4187 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
4188 int32_t newStackSize
= (int32_t)fData
[opValue
];
4189 U_ASSERT(fStack
->size() > newStackSize
);
4190 fStack
->setSize(newStackSize
);
4192 // FAIL, which will take control back to someplace
4193 // prior to entering the look-behind test.
4194 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4200 // Loop Initialization for the optimized implementation of
4201 // [some character set]*
4202 // This op scans through all matching input.
4203 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4205 U_ASSERT(opValue
> 0 && opValue
< sets
->size());
4206 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
4207 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
4209 // Loop through input, until either the input is exhausted or
4210 // we reach a character that is not a member of the set.
4211 int64_t ix
= fp
->fInputIdx
;
4212 UTEXT_SETNATIVEINDEX(fInputText
, ix
);
4214 if (ix
>= fActiveLimit
) {
4218 UChar32 c
= UTEXT_NEXT32(fInputText
);
4220 if (s8
->contains(c
) == FALSE
) {
4224 if (s
->contains(c
) == FALSE
) {
4228 ix
= UTEXT_GETNATIVEINDEX(fInputText
);
4231 // If there were no matching characters, skip over the loop altogether.
4232 // The loop doesn't run at all, a * op always succeeds.
4233 if (ix
== fp
->fInputIdx
) {
4234 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
4238 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4239 // must follow. Its operand is the stack location
4240 // that holds the starting input index for the match of this [set]*
4241 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
4242 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
4243 int32_t stackLoc
= URX_VAL(loopcOp
);
4244 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
4245 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
4246 #ifdef REGEX_SMART_BACKTRACKING
4247 backSearchIndex
= fp
->fInputIdx
;
4251 // Save State to the URX_LOOP_C op that follows this one,
4252 // so that match failures in the following code will return to there.
4253 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4254 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
4260 case URX_LOOP_DOT_I
:
4261 // Loop Initialization for the optimized implementation of .*
4262 // This op scans through all remaining input.
4263 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4265 // Loop through input until the input is exhausted (we reach an end-of-line)
4266 // In DOTALL mode, we can just go straight to the end of the input.
4268 if ((opValue
& 1) == 1) {
4269 // Dot-matches-All mode. Jump straight to the end of the string.
4273 // NOT DOT ALL mode. Line endings do not match '.'
4274 // Scan forward until a line ending or end of input.
4276 UTEXT_SETNATIVEINDEX(fInputText
, ix
);
4278 if (ix
>= fActiveLimit
) {
4282 UChar32 c
= UTEXT_NEXT32(fInputText
);
4283 if ((c
& 0x7f) <= 0x29) { // Fast filter of non-new-line-s
4284 if ((c
== 0x0a) || // 0x0a is newline in both modes.
4285 (((opValue
& 2) == 0) && // IF not UNIX_LINES mode
4286 (c
<=0x0d && c
>=0x0a)) || c
==0x85 ||c
==0x2028 || c
==0x2029) {
4287 // char is a line ending. Exit the scanning loop.
4291 ix
= UTEXT_GETNATIVEINDEX(fInputText
);
4295 // If there were no matching characters, skip over the loop altogether.
4296 // The loop doesn't run at all, a * op always succeeds.
4297 if (ix
== fp
->fInputIdx
) {
4298 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
4302 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4303 // must follow. Its operand is the stack location
4304 // that holds the starting input index for the match of this .*
4305 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
4306 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
4307 int32_t stackLoc
= URX_VAL(loopcOp
);
4308 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
4309 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
4310 #ifdef REGEX_SMART_BACKTRACKING
4311 backSearchIndex
= fp
->fInputIdx
;
4315 // Save State to the URX_LOOP_C op that follows this one,
4316 // so that match failures in the following code will return to there.
4317 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4318 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
4326 U_ASSERT(opValue
>=0 && opValue
<fFrameSize
);
4327 backSearchIndex
= fp
->fExtra
[opValue
];
4328 U_ASSERT(backSearchIndex
<= fp
->fInputIdx
);
4329 if (backSearchIndex
== fp
->fInputIdx
) {
4330 // We've backed up the input idx to the point that the loop started.
4331 // The loop is done. Leave here without saving state.
4332 // Subsequent failures won't come back here.
4335 // Set up for the next iteration of the loop, with input index
4336 // backed up by one from the last time through,
4337 // and a state save to this instruction in case the following code fails again.
4338 // (We're going backwards because this loop emulates stack unwinding, not
4339 // the initial scan forward.)
4340 U_ASSERT(fp
->fInputIdx
> 0);
4341 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
4342 UChar32 prevC
= UTEXT_PREVIOUS32(fInputText
);
4343 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
4345 UChar32 twoPrevC
= UTEXT_PREVIOUS32(fInputText
);
4346 if (prevC
== 0x0a &&
4347 fp
->fInputIdx
> backSearchIndex
&&
4349 int32_t prevOp
= (int32_t)pat
[fp
->fPatIdx
-2];
4350 if (URX_TYPE(prevOp
) == URX_LOOP_DOT_I
) {
4351 // .*, stepping back over CRLF pair.
4352 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
4357 fp
= StateSave(fp
, fp
->fPatIdx
-1, status
);
4364 // Trouble. The compiled pattern contains an entry with an
4365 // unrecognized type tag.
4369 if (U_FAILURE(status
)) {
4378 fLastMatchEnd
= fMatchEnd
;
4379 fMatchStart
= startIdx
;
4380 fMatchEnd
= fp
->fInputIdx
;
4382 REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart
, fMatchEnd
));
4388 REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
4392 fFrame
= fp
; // The active stack frame when the engine stopped.
4393 // Contains the capture group results that we need to
4399 //--------------------------------------------------------------------------------
4401 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
4402 // assumption that the entire string is available in the UText's
4403 // chunk buffer. For now, that means we can use int32_t indexes,
4404 // except for anything that needs to be saved (like group starts
4407 // startIdx: begin matching at this index.
4408 // toEnd: if true, match must extend to end of the input region
4410 //--------------------------------------------------------------------------------
4411 void RegexMatcher::MatchChunkAt(int32_t startIdx
, UBool toEnd
, UErrorCode
&status
) {
4412 UBool isMatch
= FALSE
; // True if the we have a match.
4414 int32_t backSearchIndex
= INT32_MAX
; // used after greedy single-character matches for searching backwards
4416 int32_t op
; // Operation from the compiled pattern, split into
4417 int32_t opType
; // the opcode
4418 int32_t opValue
; // and the operand value.
4420 #ifdef REGEX_RUN_DEBUG
4423 printf("MatchAt(startIdx=%ld)\n", startIdx
);
4424 printf("Original Pattern: ");
4425 UChar32 c
= utext_next32From(fPattern
->fPattern
, 0);
4426 while (c
!= U_SENTINEL
) {
4427 if (c
<32 || c
>256) {
4430 REGEX_DUMP_DEBUG_PRINTF(("%c", c
));
4432 c
= UTEXT_NEXT32(fPattern
->fPattern
);
4435 printf("Input String: ");
4436 c
= utext_next32From(fInputText
, 0);
4437 while (c
!= U_SENTINEL
) {
4438 if (c
<32 || c
>256) {
4443 c
= UTEXT_NEXT32(fInputText
);
4450 if (U_FAILURE(status
)) {
4454 // Cache frequently referenced items from the compiled pattern
4456 int64_t *pat
= fPattern
->fCompiledPat
->getBuffer();
4458 const UChar
*litText
= fPattern
->fLiteralText
.getBuffer();
4459 UVector
*sets
= fPattern
->fSets
;
4461 const UChar
*inputBuf
= fInputText
->chunkContents
;
4463 fFrameSize
= fPattern
->fFrameSize
;
4464 REStackFrame
*fp
= resetStack();
4467 fp
->fInputIdx
= startIdx
;
4469 // Zero out the pattern's static data
4471 for (i
= 0; i
<fPattern
->fDataSize
; i
++) {
4476 // Main loop for interpreting the compiled pattern.
4477 // One iteration of the loop per pattern operation performed.
4481 if (_heapchk() != _HEAPOK
) {
4482 fprintf(stderr
, "Heap Trouble\n");
4486 op
= (int32_t)pat
[fp
->fPatIdx
];
4487 opType
= URX_TYPE(op
);
4488 opValue
= URX_VAL(op
);
4489 #ifdef REGEX_RUN_DEBUG
4491 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
4492 printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp
->fInputIdx
,
4493 UTEXT_CURRENT32(fInputText
), (int64_t *)fp
-fStack
->getBuffer(), fActiveLimit
);
4494 fPattern
->dumpOp(fp
->fPatIdx
);
4507 // Force a backtrack. In some circumstances, the pattern compiler
4508 // will notice that the pattern can't possibly match anything, and will
4509 // emit one of these at that point.
4510 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4515 if (fp
->fInputIdx
< fActiveLimit
) {
4517 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4525 #ifdef REGEX_SMART_BACKTRACKING
4526 if (fp
->fInputIdx
> backSearchIndex
&& fStack
->size() > fFrameSize
) {
4527 REStackFrame
*prevFrame
= (REStackFrame
*)fStack
->peekFrame(fFrameSize
);
4528 if (URX_LOOP_C
== URX_TYPE(pat
[prevFrame
->fPatIdx
]) && fp
->fInputIdx
<= prevFrame
->fInputIdx
) {
4529 int64_t reverseIndex
= fp
->fInputIdx
;
4532 U16_PREV(inputBuf
, backSearchIndex
, reverseIndex
, c
);
4536 } while (reverseIndex
> backSearchIndex
);
4539 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4540 fp
->fInputIdx
= reverseIndex
;
4541 if (fp
->fInputIdx
> backSearchIndex
) {
4542 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
4544 fp
->fPatIdx
++; // Skip the LOOP_C, we just did that
4551 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4557 // Test input against a literal string.
4558 // Strings require two slots in the compiled pattern, one for the
4559 // offset to the string text, and one for the length.
4560 int32_t stringStartIdx
= opValue
;
4563 op
= (int32_t)pat
[fp
->fPatIdx
]; // Fetch the second operand
4565 opType
= URX_TYPE(op
);
4566 stringLen
= URX_VAL(op
);
4567 U_ASSERT(opType
== URX_STRING_LEN
);
4568 U_ASSERT(stringLen
>= 2);
4570 if (fp
->fInputIdx
+ stringLen
> fActiveLimit
) {
4571 // No match. String is longer than the remaining input text.
4572 fHitEnd
= TRUE
; // TODO: See ticket 6074
4573 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4577 const UChar
* pInp
= inputBuf
+ fp
->fInputIdx
;
4578 const UChar
* pPat
= litText
+stringStartIdx
;
4579 const UChar
* pEnd
= pInp
+ stringLen
;
4580 UBool success
= FALSE
;
4582 if (*pInp
== *pPat
) {
4586 // Successful Match.
4597 fp
->fInputIdx
+= stringLen
;
4599 #ifdef REGEX_SMART_BACKTRACKING
4600 if (fp
->fInputIdx
> backSearchIndex
&& fStack
->size()) {
4601 REStackFrame
*prevFrame
= (REStackFrame
*)fStack
->peekFrame(fFrameSize
);
4602 if (URX_LOOP_C
== URX_TYPE(pat
[prevFrame
->fPatIdx
]) && fp
->fInputIdx
<= prevFrame
->fInputIdx
) {
4603 // Reset to last start point
4604 int64_t reverseIndex
= fp
->fInputIdx
;
4606 pPat
= litText
+stringStartIdx
;
4608 // Search backwards for a possible start
4610 U16_PREV(inputBuf
, backSearchIndex
, reverseIndex
, c
);
4611 if ((U_IS_BMP(c
) && *pPat
== c
) ||
4612 (*pPat
== U16_LEAD(c
) && *(pPat
+1) == U16_TRAIL(c
))) {
4616 } while (reverseIndex
> backSearchIndex
);
4620 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4621 fp
->fInputIdx
= reverseIndex
;
4622 if (fp
->fInputIdx
> backSearchIndex
) {
4623 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
4625 fp
->fPatIdx
++; // Skip the LOOP_C, we just did that
4631 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4637 case URX_STATE_SAVE
:
4638 fp
= StateSave(fp
, opValue
, status
);
4643 // The match loop will exit via this path on a successful match,
4644 // when we reach the end of the pattern.
4645 if (toEnd
&& fp
->fInputIdx
!= fActiveLimit
) {
4646 // The pattern matched, but not to the end of input. Try some more.
4647 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4653 // Start and End Capture stack frame variables are laid out like this:
4654 // fp->fExtra[opValue] - The start of a completed capture group
4655 // opValue+1 - The end of a completed capture group
4656 // opValue+2 - the start of a capture group whose end
4657 // has not yet been reached (and might not ever be).
4658 case URX_START_CAPTURE
:
4659 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
4660 fp
->fExtra
[opValue
+2] = fp
->fInputIdx
;
4664 case URX_END_CAPTURE
:
4665 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
4666 U_ASSERT(fp
->fExtra
[opValue
+2] >= 0); // Start pos for this group must be set.
4667 fp
->fExtra
[opValue
] = fp
->fExtra
[opValue
+2]; // Tentative start becomes real.
4668 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // End position
4669 U_ASSERT(fp
->fExtra
[opValue
] <= fp
->fExtra
[opValue
+1]);
4673 case URX_DOLLAR
: // $, test for End of line
4674 // or for position before new line at end of input
4675 if (fp
->fInputIdx
< fAnchorLimit
-2) {
4676 // We are no where near the end of input. Fail.
4677 // This is the common case. Keep it first.
4678 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4681 if (fp
->fInputIdx
>= fAnchorLimit
) {
4682 // We really are at the end of input. Success.
4688 // If we are positioned just before a new-line that is located at the
4689 // end of input, succeed.
4690 if (fp
->fInputIdx
== fAnchorLimit
-1) {
4692 U16_GET(inputBuf
, fAnchorStart
, fp
->fInputIdx
, fAnchorLimit
, c
);
4694 if ((c
>=0x0a && c
<=0x0d) || c
==0x85 || c
==0x2028 || c
==0x2029) {
4695 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& inputBuf
[fp
->fInputIdx
-1]==0x0d)) {
4696 // At new-line at end of input. Success
4702 } else if (fp
->fInputIdx
== fAnchorLimit
-2 &&
4703 inputBuf
[fp
->fInputIdx
]==0x0d && inputBuf
[fp
->fInputIdx
+1]==0x0a) {
4706 break; // At CR/LF at end of input. Success
4709 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4714 case URX_DOLLAR_D
: // $, test for End of Line, in UNIX_LINES mode.
4715 if (fp
->fInputIdx
>= fAnchorLimit
-1) {
4716 // Either at the last character of input, or off the end.
4717 if (fp
->fInputIdx
== fAnchorLimit
-1) {
4718 // At last char of input. Success if it's a new line.
4719 if (inputBuf
[fp
->fInputIdx
] == 0x0a) {
4725 // Off the end of input. Success.
4732 // Not at end of input. Back-track out.
4733 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4737 case URX_DOLLAR_M
: // $, test for End of line in multi-line mode
4739 if (fp
->fInputIdx
>= fAnchorLimit
) {
4740 // We really are at the end of input. Success.
4745 // If we are positioned just before a new-line, succeed.
4746 // It makes no difference where the new-line is within the input.
4747 UChar32 c
= inputBuf
[fp
->fInputIdx
];
4748 if ((c
>=0x0a && c
<=0x0d) || c
==0x85 ||c
==0x2028 || c
==0x2029) {
4749 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4750 // In multi-line mode, hitting a new-line just before the end of input does not
4751 // set the hitEnd or requireEnd flags
4752 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& inputBuf
[fp
->fInputIdx
-1]==0x0d)) {
4756 // not at a new line. Fail.
4757 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4762 case URX_DOLLAR_MD
: // $, test for End of line in multi-line and UNIX_LINES mode
4764 if (fp
->fInputIdx
>= fAnchorLimit
) {
4765 // We really are at the end of input. Success.
4767 fRequireEnd
= TRUE
; // Java set requireEnd in this case, even though
4768 break; // adding a new-line would not lose the match.
4770 // If we are not positioned just before a new-line, the test fails; backtrack out.
4771 // It makes no difference where the new-line is within the input.
4772 if (inputBuf
[fp
->fInputIdx
] != 0x0a) {
4773 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4779 case URX_CARET
: // ^, test for start of line
4780 if (fp
->fInputIdx
!= fAnchorStart
) {
4781 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4786 case URX_CARET_M
: // ^, test for start of line in mulit-line mode
4788 if (fp
->fInputIdx
== fAnchorStart
) {
4789 // We are at the start input. Success.
4792 // Check whether character just before the current pos is a new-line
4793 // unless we are at the end of input
4794 UChar c
= inputBuf
[fp
->fInputIdx
- 1];
4795 if ((fp
->fInputIdx
< fAnchorLimit
) &&
4796 ((c
<=0x0d && c
>=0x0a) || c
==0x85 ||c
==0x2028 || c
==0x2029)) {
4797 // It's a new-line. ^ is true. Success.
4798 // TODO: what should be done with positions between a CR and LF?
4801 // Not at the start of a line. Fail.
4802 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4807 case URX_CARET_M_UNIX
: // ^, test for start of line in multi-line + Unix-line mode
4809 U_ASSERT(fp
->fInputIdx
>= fAnchorStart
);
4810 if (fp
->fInputIdx
<= fAnchorStart
) {
4811 // We are at the start of input. Success.
4814 // Check whether character just before the current pos is a new-line
4815 U_ASSERT(fp
->fInputIdx
<= fAnchorLimit
);
4816 UChar c
= inputBuf
[fp
->fInputIdx
- 1];
4818 // Not at the start of a line. Back-track out.
4819 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4824 case URX_BACKSLASH_B
: // Test for word boundaries
4826 UBool success
= isChunkWordBoundary((int32_t)fp
->fInputIdx
);
4827 success
^= (opValue
!= 0); // flip sense for \B
4829 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4835 case URX_BACKSLASH_BU
: // Test for word boundaries, Unicode-style
4837 UBool success
= isUWordBoundary(fp
->fInputIdx
);
4838 success
^= (opValue
!= 0); // flip sense for \B
4840 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4846 case URX_BACKSLASH_D
: // Test for decimal digit
4848 if (fp
->fInputIdx
>= fActiveLimit
) {
4850 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4855 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4856 int8_t ctype
= u_charType(c
); // TODO: make a unicode set for this. Will be faster.
4857 UBool success
= (ctype
== U_DECIMAL_DIGIT_NUMBER
);
4858 success
^= (opValue
!= 0); // flip sense for \D
4860 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4866 case URX_BACKSLASH_G
: // Test for position at end of previous match
4867 if (!((fMatch
&& fp
->fInputIdx
==fMatchEnd
) || (fMatch
==FALSE
&& fp
->fInputIdx
==fActiveStart
))) {
4868 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4873 case URX_BACKSLASH_X
:
4874 // Match a Grapheme, as defined by Unicode TR 29.
4875 // Differs slightly from Perl, which consumes combining marks independently
4879 // Fail if at end of input
4880 if (fp
->fInputIdx
>= fActiveLimit
) {
4882 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4886 // Examine (and consume) the current char.
4887 // Dispatch into a little state machine, based on the char.
4889 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4890 UnicodeSet
**sets
= fPattern
->fStaticSets
;
4891 if (sets
[URX_GC_NORMAL
]->contains(c
)) goto GC_Extend
;
4892 if (sets
[URX_GC_CONTROL
]->contains(c
)) goto GC_Control
;
4893 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
4894 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
4895 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
4896 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
4897 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
4903 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
4904 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4905 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
4906 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
4907 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
4908 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
4909 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
4913 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
4914 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4915 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
4916 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
4917 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
4921 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
4922 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4923 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
4924 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
4928 // Combining characters are consumed here
4930 if (fp
->fInputIdx
>= fActiveLimit
) {
4933 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4934 if (sets
[URX_GC_EXTEND
]->contains(c
) == FALSE
) {
4935 U16_BACK_1(inputBuf
, 0, fp
->fInputIdx
);
4942 // Most control chars stand alone (don't combine with combining chars),
4943 // except that a CR/LF sequence is a single grapheme cluster.
4944 if (c
== 0x0d && fp
->fInputIdx
< fActiveLimit
&& inputBuf
[fp
->fInputIdx
] == 0x0a) {
4949 if (fp
->fInputIdx
>= fActiveLimit
) {
4958 case URX_BACKSLASH_Z
: // Test for end of Input
4959 if (fp
->fInputIdx
< fAnchorLimit
) {
4960 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4969 case URX_STATIC_SETREF
:
4971 // Test input character against one of the predefined sets
4972 // (Word Characters, for example)
4973 // The high bit of the op value is a flag for the match polarity.
4974 // 0: success if input char is in set.
4975 // 1: success if input char is not in set.
4976 if (fp
->fInputIdx
>= fActiveLimit
) {
4978 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4982 UBool success
= ((opValue
& URX_NEG_SET
) == URX_NEG_SET
);
4983 opValue
&= ~URX_NEG_SET
;
4984 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
4987 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4989 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
4990 if (s8
->contains(c
)) {
4994 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
4995 if (s
->contains(c
)) {
5000 #ifdef REGEX_SMART_BACKTRACKING
5001 if (fp
->fInputIdx
> backSearchIndex
&& fStack
->size() > fFrameSize
) {
5002 REStackFrame
*prevFrame
= (REStackFrame
*)fStack
->peekFrame(fFrameSize
);
5003 if (URX_LOOP_C
== URX_TYPE(pat
[prevFrame
->fPatIdx
]) && fp
->fInputIdx
<= prevFrame
->fInputIdx
) {
5004 // Try to find it, backwards
5005 int64_t reverseIndex
= fp
->fInputIdx
;
5006 U16_BACK_1(inputBuf
, backSearchIndex
, reverseIndex
); // skip the first character we tried
5007 success
= ((opValue
& URX_NEG_SET
) == URX_NEG_SET
); // reset
5009 U16_PREV(inputBuf
, backSearchIndex
, reverseIndex
, c
);
5011 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
5012 if (s8
->contains(c
)) {
5016 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
5017 if (s
->contains(c
)) {
5021 } while (reverseIndex
> backSearchIndex
&& !success
);
5024 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5025 fp
->fInputIdx
= reverseIndex
;
5026 if (fp
->fInputIdx
> backSearchIndex
) {
5027 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5029 fp
->fPatIdx
++; // Skip the LOOP_C, we just did that
5035 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5041 case URX_STAT_SETREF_N
:
5043 // Test input character for NOT being a member of one of
5044 // the predefined sets (Word Characters, for example)
5045 if (fp
->fInputIdx
>= fActiveLimit
) {
5047 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5051 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
5054 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
5056 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
5057 if (s8
->contains(c
) == FALSE
) {
5061 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
5062 if (s
->contains(c
) == FALSE
) {
5067 #ifdef REGEX_SMART_BACKTRACKING
5068 if (fp
->fInputIdx
> backSearchIndex
&& fStack
->size() > fFrameSize
) {
5069 REStackFrame
*prevFrame
= (REStackFrame
*)fStack
->peekFrame(fFrameSize
);
5070 if (URX_LOOP_C
== URX_TYPE(pat
[prevFrame
->fPatIdx
]) && fp
->fInputIdx
<= prevFrame
->fInputIdx
) {
5071 // Try to find it, backwards
5072 int64_t reverseIndex
= fp
->fInputIdx
;
5073 U16_BACK_1(inputBuf
, backSearchIndex
, reverseIndex
); // skip the first character we tried
5074 UBool success
= FALSE
;
5076 U16_PREV(inputBuf
, backSearchIndex
, reverseIndex
, c
);
5078 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
5079 if (s8
->contains(c
) == FALSE
) {
5084 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
5085 if (s
->contains(c
) == FALSE
) {
5090 } while (reverseIndex
> backSearchIndex
);
5093 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5094 fp
->fInputIdx
= reverseIndex
;
5095 if (fp
->fInputIdx
> backSearchIndex
) {
5096 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5098 fp
->fPatIdx
++; // Skip the LOOP_C, we just did that
5104 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5111 if (fp
->fInputIdx
>= fActiveLimit
) {
5113 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5117 U_ASSERT(opValue
> 0 && opValue
< sets
->size());
5119 // There is input left. Pick up one char and test it for set membership.
5121 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
5123 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
5124 if (s8
->contains(c
)) {
5125 // The character is in the set. A Match.
5129 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
5130 if (s
->contains(c
)) {
5131 // The character is in the set. A Match.
5136 // the character wasn't in the set.
5137 #ifdef REGEX_SMART_BACKTRACKING
5138 if (fp
->fInputIdx
> backSearchIndex
&& fStack
->size() > fFrameSize
) {
5139 REStackFrame
*prevFrame
= (REStackFrame
*)fStack
->peekFrame(fFrameSize
);
5140 if (URX_LOOP_C
== URX_TYPE(pat
[prevFrame
->fPatIdx
]) && fp
->fInputIdx
<= prevFrame
->fInputIdx
) {
5141 // Try to find it, backwards
5142 int64_t reverseIndex
= fp
->fInputIdx
;
5143 U16_BACK_1(inputBuf
, backSearchIndex
, reverseIndex
); // skip the first character we tried
5144 UBool success
= FALSE
;
5146 U16_PREV(inputBuf
, backSearchIndex
, reverseIndex
, c
);
5148 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
5149 if (s8
->contains(c
)) {
5154 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
5155 if (s
->contains(c
)) {
5160 } while (reverseIndex
> backSearchIndex
);
5163 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5164 fp
->fInputIdx
= reverseIndex
;
5165 if (fp
->fInputIdx
> reverseIndex
) {
5166 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5168 fp
->fPatIdx
++; // Skip the LOOP_C, we just did that
5174 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5181 // . matches anything, but stops at end-of-line.
5182 if (fp
->fInputIdx
>= fActiveLimit
) {
5183 // At end of input. Match failed. Backtrack out.
5185 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5189 // There is input left. Advance over one char, unless we've hit end-of-line
5191 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
5192 if (((c
& 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
5193 ((c
<=0x0d && c
>=0x0a) || c
==0x85 ||c
==0x2028 || c
==0x2029)) {
5194 // End of line in normal mode. . does not match.
5195 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5202 case URX_DOTANY_ALL
:
5204 // . in dot-matches-all (including new lines) mode
5205 if (fp
->fInputIdx
>= fActiveLimit
) {
5206 // At end of input. Match failed. Backtrack out.
5208 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5212 // There is input left. Advance over one char, except if we are
5213 // at a cr/lf, advance over both of them.
5215 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
5216 if (c
==0x0d && fp
->fInputIdx
< fActiveLimit
) {
5217 // In the case of a CR/LF, we need to advance over both.
5218 if (inputBuf
[fp
->fInputIdx
] == 0x0a) {
5219 U16_FWD_1(inputBuf
, fp
->fInputIdx
, fActiveLimit
);
5226 case URX_DOTANY_UNIX
:
5228 // '.' operator, matches all, but stops at end-of-line.
5229 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
5230 if (fp
->fInputIdx
>= fActiveLimit
) {
5231 // At end of input. Match failed. Backtrack out.
5233 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5237 // There is input left. Advance over one char, unless we've hit end-of-line
5239 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
5241 // End of line in normal mode. '.' does not match the \n
5242 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5249 fp
->fPatIdx
= opValue
;
5257 U_ASSERT(opValue
< fPattern
->fCompiledPat
->size());
5258 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
5259 fp
->fPatIdx
= opValue
; // Then JMP.
5263 // This opcode is used with (x)+, when x can match a zero length string.
5264 // Same as JMP_SAV, except conditional on the match having made forward progress.
5265 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
5266 // data address of the input position at the start of the loop.
5268 U_ASSERT(opValue
> 0 && opValue
< fPattern
->fCompiledPat
->size());
5269 int32_t stoOp
= (int32_t)pat
[opValue
-1];
5270 U_ASSERT(URX_TYPE(stoOp
) == URX_STO_INP_LOC
);
5271 int32_t frameLoc
= URX_VAL(stoOp
);
5272 U_ASSERT(frameLoc
>= 0 && frameLoc
< fFrameSize
);
5273 int32_t prevInputIdx
= (int32_t)fp
->fExtra
[frameLoc
];
5274 U_ASSERT(prevInputIdx
<= fp
->fInputIdx
);
5275 if (prevInputIdx
< fp
->fInputIdx
) {
5276 // The match did make progress. Repeat the loop.
5277 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
5278 fp
->fPatIdx
= opValue
;
5279 fp
->fExtra
[frameLoc
] = fp
->fInputIdx
;
5281 // If the input position did not advance, we do nothing here,
5282 // execution will fall out of the loop.
5288 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
5289 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
5291 // Pick up the three extra operands that CTR_INIT has, and
5292 // skip the pattern location counter past
5293 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
5295 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
5296 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
5297 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
5298 U_ASSERT(minCount
>=0);
5299 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
5300 U_ASSERT(loopLoc
>fp
->fPatIdx
);
5302 if (minCount
== 0) {
5303 fp
= StateSave(fp
, loopLoc
+1, status
);
5305 if (maxCount
== 0) {
5306 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5313 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
5314 int32_t initOp
= (int32_t)pat
[opValue
];
5315 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT
);
5316 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
5317 int32_t minCount
= (int32_t)pat
[opValue
+2];
5318 int32_t maxCount
= (int32_t)pat
[opValue
+3];
5319 // Increment the counter. Note: we DIDN'T worry about counter
5320 // overflow, since the data comes from UnicodeStrings, which
5321 // stores its length in an int32_t. Do we have to think about
5322 // this now that we're using UText? Probably not, since the length
5323 // in UChar32s is still an int32_t.
5325 U_ASSERT(*pCounter
> 0);
5326 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
) {
5327 U_ASSERT(*pCounter
== maxCount
|| maxCount
== -1);
5330 if (*pCounter
>= minCount
) {
5331 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5333 fp
->fPatIdx
= opValue
+ 4; // Loop back.
5337 case URX_CTR_INIT_NG
:
5339 // Initialize a non-greedy loop
5340 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
5341 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
5343 // Pick up the three extra operands that CTR_INIT has, and
5344 // skip the pattern location counter past
5345 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
5347 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
5348 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
5349 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
5350 U_ASSERT(minCount
>=0);
5351 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
5352 U_ASSERT(loopLoc
>fp
->fPatIdx
);
5354 if (minCount
== 0) {
5355 if (maxCount
!= 0) {
5356 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5358 fp
->fPatIdx
= loopLoc
+1; // Continue with stuff after repeated block
5363 case URX_CTR_LOOP_NG
:
5365 // Non-greedy {min, max} loops
5366 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
5367 int32_t initOp
= (int32_t)pat
[opValue
];
5368 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT_NG
);
5369 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
5370 int32_t minCount
= (int32_t)pat
[opValue
+2];
5371 int32_t maxCount
= (int32_t)pat
[opValue
+3];
5372 // Increment the counter. Note: we DIDN'T worry about counter
5373 // overflow, since the data comes from UnicodeStrings, which
5374 // stores its length in an int32_t. Do we have to think about
5375 // this now that we're using UText? Probably not, since the length
5376 // in UChar32s is still an int32_t.
5378 U_ASSERT(*pCounter
> 0);
5380 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
) {
5381 // The loop has matched the maximum permitted number of times.
5382 // Break out of here with no action. Matching will
5383 // continue with the following pattern.
5384 U_ASSERT(*pCounter
== maxCount
|| maxCount
== -1);
5388 if (*pCounter
< minCount
) {
5389 // We haven't met the minimum number of matches yet.
5390 // Loop back for another one.
5391 fp
->fPatIdx
= opValue
+ 4; // Loop back.
5393 // We do have the minimum number of matches.
5394 // Fall into the following pattern, but first do
5395 // a state save to the top of the loop, so that a failure
5396 // in the following pattern will try another iteration of the loop.
5397 fp
= StateSave(fp
, opValue
+ 4, status
);
5403 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
5404 fData
[opValue
] = fStack
->size();
5409 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
5410 int32_t newStackSize
= (int32_t)fData
[opValue
];
5411 U_ASSERT(newStackSize
<= fStack
->size());
5412 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
5413 if (newFP
== (int64_t *)fp
) {
5417 for (i
=0; i
<fFrameSize
; i
++) {
5418 newFP
[i
] = ((int64_t *)fp
)[i
];
5420 fp
= (REStackFrame
*)newFP
;
5421 fStack
->setSize(newStackSize
);
5428 U_ASSERT(opValue
< fFrameSize
);
5429 int64_t groupStartIdx
= fp
->fExtra
[opValue
];
5430 int64_t groupEndIdx
= fp
->fExtra
[opValue
+1];
5431 U_ASSERT(groupStartIdx
<= groupEndIdx
);
5432 int64_t len
= groupEndIdx
-groupStartIdx
;
5433 if (groupStartIdx
< 0) {
5434 // This capture group has not participated in the match thus far,
5435 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
5439 // The capture group match was of an empty string.
5440 // Verified by testing: Perl matches succeed in this case, so
5445 UBool haveMatch
= FALSE
;
5446 if (fp
->fInputIdx
+ len
<= fActiveLimit
) {
5447 if (opType
== URX_BACKREF
) {
5448 if (u_strncmp(inputBuf
+groupStartIdx
, inputBuf
+fp
->fInputIdx
, (int32_t)len
) == 0) {
5452 if (u_strncasecmp(inputBuf
+groupStartIdx
, inputBuf
+fp
->fInputIdx
,
5453 (int32_t)len
, U_FOLD_CASE_DEFAULT
) == 0) {
5458 // TODO: probably need to do a partial string comparison, and only
5459 // set HitEnd if the available input matched. Ticket #6074
5463 fp
->fInputIdx
+= len
; // Match. Advance current input position.
5465 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
5470 case URX_STO_INP_LOC
:
5472 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
);
5473 fp
->fExtra
[opValue
] = fp
->fInputIdx
;
5479 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
5481 int32_t dataLoc
= URX_VAL(pat
[instrOperandLoc
]);
5482 U_ASSERT(dataLoc
>= 0 && dataLoc
< fFrameSize
);
5483 int32_t savedInputIdx
= (int32_t)fp
->fExtra
[dataLoc
];
5484 U_ASSERT(savedInputIdx
<= fp
->fInputIdx
);
5485 if (savedInputIdx
< fp
->fInputIdx
) {
5486 fp
->fPatIdx
= opValue
; // JMP
5488 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no progress in loop.
5495 // Entering a lookahead block.
5496 // Save Stack Ptr, Input Pos.
5497 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5498 fData
[opValue
] = fStack
->size();
5499 fData
[opValue
+1] = fp
->fInputIdx
;
5500 fActiveStart
= fLookStart
; // Set the match region change for
5501 fActiveLimit
= fLookLimit
; // transparent bounds.
5507 // Leaving a look-ahead block.
5508 // restore Stack Ptr, Input Pos to positions they had on entry to block.
5509 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5510 int32_t stackSize
= fStack
->size();
5511 int32_t newStackSize
= (int32_t)fData
[opValue
];
5512 U_ASSERT(stackSize
>= newStackSize
);
5513 if (stackSize
> newStackSize
) {
5514 // Copy the current top frame back to the new (cut back) top frame.
5515 // This makes the capture groups from within the look-ahead
5516 // expression available.
5517 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
5519 for (i
=0; i
<fFrameSize
; i
++) {
5520 newFP
[i
] = ((int64_t *)fp
)[i
];
5522 fp
= (REStackFrame
*)newFP
;
5523 fStack
->setSize(newStackSize
);
5525 fp
->fInputIdx
= fData
[opValue
+1];
5527 // Restore the active region bounds in the input string; they may have
5528 // been changed because of transparent bounds on a Region.
5529 fActiveStart
= fRegionStart
;
5530 fActiveLimit
= fRegionLimit
;
5535 if (fp
->fInputIdx
< fActiveLimit
) {
5537 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
5538 if (u_foldCase(c
, U_FOLD_CASE_DEFAULT
) == opValue
) {
5545 #ifdef REGEX_SMART_BACKTRACKING
5546 if (fp
->fInputIdx
> backSearchIndex
&& fStack
->size() > fFrameSize
) {
5547 REStackFrame
*prevFrame
= (REStackFrame
*)fStack
->peekFrame(fFrameSize
);
5548 if (URX_LOOP_C
== URX_TYPE(pat
[prevFrame
->fPatIdx
]) && fp
->fInputIdx
<= prevFrame
->fInputIdx
) {
5549 UBool success
= FALSE
;
5550 int64_t reverseIndex
= fp
->fInputIdx
;
5552 while (reverseIndex
> backSearchIndex
) {
5553 U16_PREV(inputBuf
, backSearchIndex
, reverseIndex
, c
);
5554 if (u_foldCase(c
, U_FOLD_CASE_DEFAULT
) == opValue
) {
5557 } else if (c
== U_SENTINEL
) {
5563 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5564 fp
->fInputIdx
= reverseIndex
;
5565 if (fp
->fInputIdx
> backSearchIndex
) {
5566 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5568 fp
->fPatIdx
++; // Skip the LOOP_C, we just did that
5575 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5580 // Test input against a literal string.
5581 // Strings require two slots in the compiled pattern, one for the
5582 // offset to the string text, and one for the length.
5583 const UCaseProps
*csp
= ucase_getSingleton();
5585 int32_t stringStartIdx
, stringLen
;
5586 stringStartIdx
= opValue
;
5588 op
= (int32_t)pat
[fp
->fPatIdx
];
5590 opType
= URX_TYPE(op
);
5591 opValue
= URX_VAL(op
);
5592 U_ASSERT(opType
== URX_STRING_LEN
);
5593 stringLen
= opValue
;
5595 const UChar
*patternChars
= litText
+stringStartIdx
;
5596 const UChar
*patternEnd
= patternChars
+stringLen
;
5598 const UChar
*foldChars
= NULL
;
5599 int32_t foldOffset
, foldLength
;
5602 #ifdef REGEX_SMART_BACKTRACKING
5603 int32_t originalInputIdx
= fp
->fInputIdx
;
5605 UBool success
= TRUE
;
5607 foldOffset
= foldLength
= 0;
5609 while (patternChars
< patternEnd
&& success
) {
5610 if(foldOffset
< foldLength
) {
5611 U16_NEXT_UNSAFE(foldChars
, foldOffset
, c
);
5613 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
5614 foldLength
= ucase_toFullFolding(csp
, c
, &foldChars
, U_FOLD_CASE_DEFAULT
);
5615 if(foldLength
>= 0) {
5616 if(foldLength
<= UCASE_MAX_STRING_LENGTH
) { // !!!: Does not correctly handle chars that fold to 0-length strings
5618 U16_NEXT_UNSAFE(foldChars
, foldOffset
, c
);
5621 foldLength
= foldOffset
; // to avoid reading chars from the folding buffer
5626 if (fp
->fInputIdx
<= fActiveLimit
) {
5628 success
= (*patternChars
== c
);
5630 } else if (patternChars
+1 < patternEnd
) {
5631 success
= (*patternChars
== U16_LEAD(c
) && *(patternChars
+1) == U16_TRAIL(c
));
5636 fHitEnd
= TRUE
; // TODO: See ticket 6074
5641 #ifdef REGEX_SMART_BACKTRACKING
5642 if (fp
->fInputIdx
> backSearchIndex
&& fStack
->size()) {
5643 REStackFrame
*prevFrame
= (REStackFrame
*)fStack
->peekFrame(fFrameSize
);
5644 if (URX_LOOP_C
== URX_TYPE(pat
[prevFrame
->fPatIdx
]) && fp
->fInputIdx
<= prevFrame
->fInputIdx
) {
5645 // Reset to last start point
5646 int64_t reverseIndex
= originalInputIdx
;
5647 patternChars
= litText
+stringStartIdx
;
5649 // Search backwards for a possible start
5651 U16_PREV(inputBuf
, backSearchIndex
, reverseIndex
, c
);
5652 foldLength
= ucase_toFullFolding(csp
, c
, &foldChars
, U_FOLD_CASE_DEFAULT
);
5653 if(foldLength
>= 0) {
5654 if(foldLength
<= UCASE_MAX_STRING_LENGTH
) { // !!!: Does not correctly handle chars that fold to 0-length strings
5656 U16_NEXT_UNSAFE(foldChars
, foldOffset
, c
);
5659 foldLength
= foldOffset
; // to avoid reading chars from the folding buffer
5663 if ((U_IS_BMP(c
) && *patternChars
== c
) ||
5664 (*patternChars
== U16_LEAD(c
) && *(patternChars
+1) == U16_TRAIL(c
))) {
5668 } while (reverseIndex
> backSearchIndex
);
5672 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5673 fp
->fInputIdx
= reverseIndex
;
5674 if (fp
->fInputIdx
> backSearchIndex
) {
5675 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5677 fp
->fPatIdx
++; // Skip the LOOP_C, we just did that
5683 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5691 // Entering a look-behind block.
5692 // Save Stack Ptr, Input Pos.
5693 // TODO: implement transparent bounds. Ticket #6067
5694 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5695 fData
[opValue
] = fStack
->size();
5696 fData
[opValue
+1] = fp
->fInputIdx
;
5697 // Init the variable containing the start index for attempted matches.
5698 fData
[opValue
+2] = -1;
5699 // Save input string length, then reset to pin any matches to end at
5700 // the current position.
5701 fData
[opValue
+3] = fActiveLimit
;
5702 fActiveLimit
= fp
->fInputIdx
;
5709 // Positive Look-Behind, at top of loop checking for matches of LB expression
5710 // at all possible input starting positions.
5712 // Fetch the min and max possible match lengths. They are the operands
5713 // of this op in the pattern.
5714 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
5715 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
5716 U_ASSERT(minML
<= maxML
);
5717 U_ASSERT(minML
>= 0);
5719 // Fetch (from data) the last input index where a match was attempted.
5720 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5721 int64_t *lbStartIdx
= &fData
[opValue
+2];
5722 if (*lbStartIdx
< 0) {
5723 // First time through loop.
5724 *lbStartIdx
= fp
->fInputIdx
- minML
;
5726 // 2nd through nth time through the loop.
5727 // Back up start position for match by one.
5728 if (*lbStartIdx
== 0) {
5731 U16_BACK_1(inputBuf
, 0, *lbStartIdx
);
5735 if (*lbStartIdx
< 0 || *lbStartIdx
< fp
->fInputIdx
- maxML
) {
5736 // We have tried all potential match starting points without
5737 // getting a match. Backtrack out, and out of the
5738 // Look Behind altogether.
5739 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5740 int64_t restoreInputLen
= fData
[opValue
+3];
5741 U_ASSERT(restoreInputLen
>= fActiveLimit
);
5742 U_ASSERT(restoreInputLen
<= fInputLength
);
5743 fActiveLimit
= restoreInputLen
;
5747 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5748 // (successful match will fall off the end of the loop.)
5749 fp
= StateSave(fp
, fp
->fPatIdx
-3, status
);
5750 fp
->fInputIdx
= *lbStartIdx
;
5755 // End of a look-behind block, after a successful match.
5757 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5758 if (fp
->fInputIdx
!= fActiveLimit
) {
5759 // The look-behind expression matched, but the match did not
5760 // extend all the way to the point that we are looking behind from.
5761 // FAIL out of here, which will take us back to the LB_CONT, which
5762 // will retry the match starting at another position or fail
5763 // the look-behind altogether, whichever is appropriate.
5764 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5768 // Look-behind match is good. Restore the original input string length,
5769 // which had been truncated to pin the end of the lookbehind match to the
5770 // position being looked-behind.
5771 int64_t originalInputLen
= fData
[opValue
+3];
5772 U_ASSERT(originalInputLen
>= fActiveLimit
);
5773 U_ASSERT(originalInputLen
<= fInputLength
);
5774 fActiveLimit
= originalInputLen
;
5781 // Negative Look-Behind, at top of loop checking for matches of LB expression
5782 // at all possible input starting positions.
5784 // Fetch the extra parameters of this op.
5785 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
5786 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
5787 int32_t continueLoc
= (int32_t)pat
[fp
->fPatIdx
++];
5788 continueLoc
= URX_VAL(continueLoc
);
5789 U_ASSERT(minML
<= maxML
);
5790 U_ASSERT(minML
>= 0);
5791 U_ASSERT(continueLoc
> fp
->fPatIdx
);
5793 // Fetch (from data) the last input index where a match was attempted.
5794 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5795 int64_t *lbStartIdx
= &fData
[opValue
+2];
5796 if (*lbStartIdx
< 0) {
5797 // First time through loop.
5798 *lbStartIdx
= fp
->fInputIdx
- minML
;
5800 // 2nd through nth time through the loop.
5801 // Back up start position for match by one.
5802 if (*lbStartIdx
== 0) {
5803 (*lbStartIdx
)--; // Because U16_BACK is unsafe starting at 0.
5805 U16_BACK_1(inputBuf
, 0, *lbStartIdx
);
5809 if (*lbStartIdx
< 0 || *lbStartIdx
< fp
->fInputIdx
- maxML
) {
5810 // We have tried all potential match starting points without
5811 // getting a match, which means that the negative lookbehind as
5812 // a whole has succeeded. Jump forward to the continue location
5813 int64_t restoreInputLen
= fData
[opValue
+3];
5814 U_ASSERT(restoreInputLen
>= fActiveLimit
);
5815 U_ASSERT(restoreInputLen
<= fInputLength
);
5816 fActiveLimit
= restoreInputLen
;
5817 fp
->fPatIdx
= continueLoc
;
5821 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5822 // (successful match will cause a FAIL out of the loop altogether.)
5823 fp
= StateSave(fp
, fp
->fPatIdx
-4, status
);
5824 fp
->fInputIdx
= *lbStartIdx
;
5829 // End of a negative look-behind block, after a successful match.
5831 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5832 if (fp
->fInputIdx
!= fActiveLimit
) {
5833 // The look-behind expression matched, but the match did not
5834 // extend all the way to the point that we are looking behind from.
5835 // FAIL out of here, which will take us back to the LB_CONT, which
5836 // will retry the match starting at another position or succeed
5837 // the look-behind altogether, whichever is appropriate.
5838 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5842 // Look-behind expression matched, which means look-behind test as
5845 // Restore the original input string length, which had been truncated
5846 // in order to pin the end of the lookbehind match
5847 // to the position being looked-behind.
5848 int64_t originalInputLen
= fData
[opValue
+3];
5849 U_ASSERT(originalInputLen
>= fActiveLimit
);
5850 U_ASSERT(originalInputLen
<= fInputLength
);
5851 fActiveLimit
= originalInputLen
;
5853 // Restore original stack position, discarding any state saved
5854 // by the successful pattern match.
5855 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5856 int32_t newStackSize
= (int32_t)fData
[opValue
];
5857 U_ASSERT(fStack
->size() > newStackSize
);
5858 fStack
->setSize(newStackSize
);
5860 // FAIL, which will take control back to someplace
5861 // prior to entering the look-behind test.
5862 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5868 // Loop Initialization for the optimized implementation of
5869 // [some character set]*
5870 // This op scans through all matching input.
5871 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5873 U_ASSERT(opValue
> 0 && opValue
< sets
->size());
5874 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
5875 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
5877 // Loop through input, until either the input is exhausted or
5878 // we reach a character that is not a member of the set.
5879 int32_t ix
= (int32_t)fp
->fInputIdx
;
5881 if (ix
>= fActiveLimit
) {
5886 U16_NEXT(inputBuf
, ix
, fActiveLimit
, c
);
5888 if (s8
->contains(c
) == FALSE
) {
5889 U16_BACK_1(inputBuf
, 0, ix
);
5893 if (s
->contains(c
) == FALSE
) {
5894 U16_BACK_1(inputBuf
, 0, ix
);
5900 // If there were no matching characters, skip over the loop altogether.
5901 // The loop doesn't run at all, a * op always succeeds.
5902 if (ix
== fp
->fInputIdx
) {
5903 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
5907 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5908 // must follow. Its operand is the stack location
5909 // that holds the starting input index for the match of this [set]*
5910 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
5911 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
5912 int32_t stackLoc
= URX_VAL(loopcOp
);
5913 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
5914 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
5915 #ifdef REGEX_SMART_BACKTRACKING
5916 backSearchIndex
= fp
->fInputIdx
;
5920 // Save State to the URX_LOOP_C op that follows this one,
5921 // so that match failures in the following code will return to there.
5922 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5923 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5929 case URX_LOOP_DOT_I
:
5930 // Loop Initialization for the optimized implementation of .*
5931 // This op scans through all remaining input.
5932 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5934 // Loop through input until the input is exhausted (we reach an end-of-line)
5935 // In DOTALL mode, we can just go straight to the end of the input.
5937 if ((opValue
& 1) == 1) {
5938 // Dot-matches-All mode. Jump straight to the end of the string.
5939 ix
= (int32_t)fActiveLimit
;
5942 // NOT DOT ALL mode. Line endings do not match '.'
5943 // Scan forward until a line ending or end of input.
5944 ix
= (int32_t)fp
->fInputIdx
;
5946 if (ix
>= fActiveLimit
) {
5951 U16_NEXT(inputBuf
, ix
, fActiveLimit
, c
); // c = inputBuf[ix++]
5952 if ((c
& 0x7f) <= 0x29) { // Fast filter of non-new-line-s
5953 if ((c
== 0x0a) || // 0x0a is newline in both modes.
5954 (((opValue
& 2) == 0) && // IF not UNIX_LINES mode
5955 ((c
<=0x0d && c
>=0x0a) || c
==0x85 || c
==0x2028 || c
==0x2029))) {
5956 // char is a line ending. Put the input pos back to the
5957 // line ending char, and exit the scanning loop.
5958 U16_BACK_1(inputBuf
, 0, ix
);
5965 // If there were no matching characters, skip over the loop altogether.
5966 // The loop doesn't run at all, a * op always succeeds.
5967 if (ix
== fp
->fInputIdx
) {
5968 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
5972 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5973 // must follow. Its operand is the stack location
5974 // that holds the starting input index for the match of this .*
5975 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
5976 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
5977 int32_t stackLoc
= URX_VAL(loopcOp
);
5978 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
5979 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
5980 #ifdef REGEX_SMART_BACKTRACKING
5981 backSearchIndex
= fp
->fInputIdx
;
5985 // Save State to the URX_LOOP_C op that follows this one,
5986 // so that match failures in the following code will return to there.
5987 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5988 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5996 U_ASSERT(opValue
>=0 && opValue
<fFrameSize
);
5997 backSearchIndex
= (int32_t)fp
->fExtra
[opValue
];
5998 U_ASSERT(backSearchIndex
<= fp
->fInputIdx
);
5999 if (backSearchIndex
== fp
->fInputIdx
) {
6000 // We've backed up the input idx to the point that the loop started.
6001 // The loop is done. Leave here without saving state.
6002 // Subsequent failures won't come back here.
6005 // Set up for the next iteration of the loop, with input index
6006 // backed up by one from the last time through,
6007 // and a state save to this instruction in case the following code fails again.
6008 // (We're going backwards because this loop emulates stack unwinding, not
6009 // the initial scan forward.)
6010 U_ASSERT(fp
->fInputIdx
> 0);
6012 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, prevC
); // !!!: should this 0 be one of f*Limit?
6014 if (prevC
== 0x0a &&
6015 fp
->fInputIdx
> backSearchIndex
&&
6016 inputBuf
[fp
->fInputIdx
-1] == 0x0d) {
6017 int32_t prevOp
= (int32_t)pat
[fp
->fPatIdx
-2];
6018 if (URX_TYPE(prevOp
) == URX_LOOP_DOT_I
) {
6019 // .*, stepping back over CRLF pair.
6020 U16_BACK_1(inputBuf
, 0, fp
->fInputIdx
);
6025 fp
= StateSave(fp
, fp
->fPatIdx
-1, status
);
6032 // Trouble. The compiled pattern contains an entry with an
6033 // unrecognized type tag.
6037 if (U_FAILURE(status
)) {
6046 fLastMatchEnd
= fMatchEnd
;
6047 fMatchStart
= startIdx
;
6048 fMatchEnd
= fp
->fInputIdx
;
6050 REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart
, fMatchEnd
));
6056 REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
6060 fFrame
= fp
; // The active stack frame when the engine stopped.
6061 // Contains the capture group results that we need to
// NOTE(review): presumably expands to ICU's class-ID (RTTI) boilerplate for
// RegexMatcher; the macro is defined outside this file, so confirm against its definition.
6068 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher
)
6072 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS