2 **************************************************************************
3 * Copyright (C) 2002-2013 International Business Machines Corporation *
4 * and others. All rights reserved. *
5 **************************************************************************
10 // Contains the implementation of class RegexMatcher,
11 // which is one of the main API classes for the ICU regular expression package.
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
17 #include "unicode/regex.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ustring.h"
21 #include "unicode/rbbi.h"
22 #include "unicode/utf.h"
23 #include "unicode/utf16.h"
34 // #include <malloc.h> // Needed for heapcheck testing
37 // Find progress callback
38 // ----------------------
39 // Macro to inline test & call to ReportFindProgress(). Eliminates unnecessary function call.
41 #define REGEXFINDPROGRESS_INTERRUPT(pos, status) \
42 (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE)
47 // When a failure would go back to a LOOP_C instruction,
48 // strings, characters, and setrefs scan backwards for a valid start
49 // character themselves, pop the stack, and save state, emulating the
50 // LOOP_C's effect but assured that the next character of input is a
51 // possible matching character.
53 // Good idea in theory; unfortunately it only helps out a few specific
54 // cases and slows the engine down a little in the rest.
58 // Default limit for the size of the back track stack, to avoid system
59 // failures caused by heap exhaustion. Units are in 32 bit words, not bytes.
60 // This value puts ICU's limits higher than most other regexp implementations,
61 // which use recursion rather than the heap, and take more storage per
64 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY
= 8000000;
66 // Time limit counter constant.
67 // Time limits for expression evaluation are in terms of quanta of work by
68 // the engine, each of which is 10,000 state saves.
69 // This constant determines the number of state saves per tick.
70 static const int32_t TIMER_INITIAL_VALUE
= 10000;
72 //-----------------------------------------------------------------------------
74 // Constructor and Destructor
76 //-----------------------------------------------------------------------------
77 RegexMatcher::RegexMatcher(const RegexPattern
*pat
) {
78 fDeferredStatus
= U_ZERO_ERROR
;
79 init(fDeferredStatus
);
80 if (U_FAILURE(fDeferredStatus
)) {
84 fDeferredStatus
= U_ILLEGAL_ARGUMENT_ERROR
;
88 init2(RegexStaticSets::gStaticSets
->fEmptyText
, fDeferredStatus
);
93 RegexMatcher::RegexMatcher(const UnicodeString
®exp
, const UnicodeString
&input
,
94 uint32_t flags
, UErrorCode
&status
) {
96 if (U_FAILURE(status
)) {
100 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
101 fPattern
= fPatternOwned
;
103 UText inputText
= UTEXT_INITIALIZER
;
104 utext_openConstUnicodeString(&inputText
, &input
, &status
);
105 init2(&inputText
, status
);
106 utext_close(&inputText
);
108 fInputUniStrMaybeMutable
= TRUE
;
112 RegexMatcher::RegexMatcher(UText
*regexp
, UText
*input
,
113 uint32_t flags
, UErrorCode
&status
) {
115 if (U_FAILURE(status
)) {
119 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
120 if (U_FAILURE(status
)) {
124 fPattern
= fPatternOwned
;
125 init2(input
, status
);
129 RegexMatcher::RegexMatcher(const UnicodeString
®exp
,
130 uint32_t flags
, UErrorCode
&status
) {
132 if (U_FAILURE(status
)) {
136 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
137 if (U_FAILURE(status
)) {
140 fPattern
= fPatternOwned
;
141 init2(RegexStaticSets::gStaticSets
->fEmptyText
, status
);
144 RegexMatcher::RegexMatcher(UText
*regexp
,
145 uint32_t flags
, UErrorCode
&status
) {
147 if (U_FAILURE(status
)) {
151 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
152 if (U_FAILURE(status
)) {
156 fPattern
= fPatternOwned
;
157 init2(RegexStaticSets::gStaticSets
->fEmptyText
, status
);
163 RegexMatcher::~RegexMatcher() {
165 if (fData
!= fSmallData
) {
170 delete fPatternOwned
;
171 fPatternOwned
= NULL
;
179 utext_close(fInputText
);
182 utext_close(fAltInputText
);
185 #if UCONFIG_NO_BREAK_ITERATION==0
186 delete fWordBreakItr
;
191 // init() common initialization for use by all constructors.
192 // Initialize all fields, get the object into a consistent state.
193 // This must be done even when the initial status shows an error,
194 // so that the object is initialized sufficiently well for the destructor
197 void RegexMatcher::init(UErrorCode
&status
) {
199 fPatternOwned
= NULL
;
209 fTransparentBounds
= FALSE
;
210 fAnchoringBounds
= TRUE
;
223 fStackLimit
= DEFAULT_BACKTRACK_STACK_CAPACITY
;
225 fCallbackContext
= NULL
;
226 fFindProgressCallbackFn
= NULL
;
227 fFindProgressCallbackContext
= NULL
;
229 fDeferredStatus
= status
;
231 fWordBreakItr
= NULL
;
235 fAltInputText
= NULL
;
238 fInputUniStrMaybeMutable
= FALSE
;
240 if (U_FAILURE(status
)) {
241 fDeferredStatus
= status
;
246 // init2() Common initialization for use by RegexMatcher constructors, part 2.
247 // This handles the common setup to be done after the Pattern is available.
249 void RegexMatcher::init2(UText
*input
, UErrorCode
&status
) {
250 if (U_FAILURE(status
)) {
251 fDeferredStatus
= status
;
255 if (fPattern
->fDataSize
> (int32_t)(sizeof(fSmallData
)/sizeof(fSmallData
[0]))) {
256 fData
= (int64_t *)uprv_malloc(fPattern
->fDataSize
* sizeof(int64_t));
258 status
= fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
263 fStack
= new UVector64(status
);
264 if (fStack
== NULL
) {
265 status
= fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
270 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY
, status
);
271 if (U_FAILURE(status
)) {
272 fDeferredStatus
= status
;
278 static const UChar BACKSLASH
= 0x5c;
279 static const UChar DOLLARSIGN
= 0x24;
280 //--------------------------------------------------------------------------------
284 //--------------------------------------------------------------------------------
285 RegexMatcher
&RegexMatcher::appendReplacement(UnicodeString
&dest
,
286 const UnicodeString
&replacement
,
287 UErrorCode
&status
) {
288 UText replacementText
= UTEXT_INITIALIZER
;
290 utext_openConstUnicodeString(&replacementText
, &replacement
, &status
);
291 if (U_SUCCESS(status
)) {
292 UText resultText
= UTEXT_INITIALIZER
;
293 utext_openUnicodeString(&resultText
, &dest
, &status
);
295 if (U_SUCCESS(status
)) {
296 appendReplacement(&resultText
, &replacementText
, status
);
297 utext_close(&resultText
);
299 utext_close(&replacementText
);
306 // appendReplacement, UText mode
308 RegexMatcher
&RegexMatcher::appendReplacement(UText
*dest
,
310 UErrorCode
&status
) {
311 if (U_FAILURE(status
)) {
314 if (U_FAILURE(fDeferredStatus
)) {
315 status
= fDeferredStatus
;
318 if (fMatch
== FALSE
) {
319 status
= U_REGEX_INVALID_STATE
;
323 // Copy input string from the end of previous match to start of current match
324 int64_t destLen
= utext_nativeLength(dest
);
325 if (fMatchStart
> fAppendPosition
) {
326 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
327 destLen
+= utext_replace(dest
, destLen
, destLen
, fInputText
->chunkContents
+fAppendPosition
,
328 (int32_t)(fMatchStart
-fAppendPosition
), &status
);
331 if (UTEXT_USES_U16(fInputText
)) {
332 len16
= (int32_t)(fMatchStart
-fAppendPosition
);
334 UErrorCode lengthStatus
= U_ZERO_ERROR
;
335 len16
= utext_extract(fInputText
, fAppendPosition
, fMatchStart
, NULL
, 0, &lengthStatus
);
337 UChar
*inputChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(len16
+1));
338 if (inputChars
== NULL
) {
339 status
= U_MEMORY_ALLOCATION_ERROR
;
342 utext_extract(fInputText
, fAppendPosition
, fMatchStart
, inputChars
, len16
+1, &status
);
343 destLen
+= utext_replace(dest
, destLen
, destLen
, inputChars
, len16
, &status
);
344 uprv_free(inputChars
);
347 fAppendPosition
= fMatchEnd
;
350 // scan the replacement text, looking for substitutions ($n) and \escapes.
351 // TODO: optimize this loop by efficiently scanning for '$' or '\',
352 // move entire ranges not containing substitutions.
353 UTEXT_SETNATIVEINDEX(replacement
, 0);
354 UChar32 c
= UTEXT_NEXT32(replacement
);
355 while (c
!= U_SENTINEL
) {
356 if (c
== BACKSLASH
) {
357 // Backslash Escape. Copy the following char out without further checks.
358 // Note: Surrogate pairs don't need any special handling
359 // The second half won't be a '$' or a '\', and
360 // will move to the dest normally on the next
362 c
= UTEXT_CURRENT32(replacement
);
363 if (c
== U_SENTINEL
) {
367 if (c
==0x55/*U*/ || c
==0x75/*u*/) {
368 // We have a \udddd or \Udddddddd escape sequence.
370 struct URegexUTextUnescapeCharContext context
= U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement
);
371 UChar32 escapedChar
= u_unescapeAt(uregex_utext_unescape_charAt
, &offset
, INT32_MAX
, &context
);
372 if (escapedChar
!= (UChar32
)0xFFFFFFFF) {
373 if (U_IS_BMP(escapedChar
)) {
374 UChar c16
= (UChar
)escapedChar
;
375 destLen
+= utext_replace(dest
, destLen
, destLen
, &c16
, 1, &status
);
378 surrogate
[0] = U16_LEAD(escapedChar
);
379 surrogate
[1] = U16_TRAIL(escapedChar
);
380 if (U_SUCCESS(status
)) {
381 destLen
+= utext_replace(dest
, destLen
, destLen
, surrogate
, 2, &status
);
384 // TODO: Report errors for malformed \u escapes?
385 // As this is, the original sequence is output, which may be OK.
386 if (context
.lastOffset
== offset
) {
387 (void)UTEXT_PREVIOUS32(replacement
);
388 } else if (context
.lastOffset
!= offset
-1) {
389 utext_moveIndex32(replacement
, offset
- context
.lastOffset
- 1);
393 (void)UTEXT_NEXT32(replacement
);
394 // Plain backslash escape. Just put out the escaped character.
396 UChar c16
= (UChar
)c
;
397 destLen
+= utext_replace(dest
, destLen
, destLen
, &c16
, 1, &status
);
400 surrogate
[0] = U16_LEAD(c
);
401 surrogate
[1] = U16_TRAIL(c
);
402 if (U_SUCCESS(status
)) {
403 destLen
+= utext_replace(dest
, destLen
, destLen
, surrogate
, 2, &status
);
407 } else if (c
!= DOLLARSIGN
) {
408 // Normal char, not a $. Copy it out without further checks.
410 UChar c16
= (UChar
)c
;
411 destLen
+= utext_replace(dest
, destLen
, destLen
, &c16
, 1, &status
);
414 surrogate
[0] = U16_LEAD(c
);
415 surrogate
[1] = U16_TRAIL(c
);
416 if (U_SUCCESS(status
)) {
417 destLen
+= utext_replace(dest
, destLen
, destLen
, surrogate
, 2, &status
);
421 // We've got a $. Pick up a capture group number if one follows.
422 // Consume at most the number of digits necessary for the largest capture
423 // number that is valid for this pattern.
425 int32_t numDigits
= 0;
426 int32_t groupNum
= 0;
429 digitC
= UTEXT_CURRENT32(replacement
);
430 if (digitC
== U_SENTINEL
) {
433 if (u_isdigit(digitC
) == FALSE
) {
436 (void)UTEXT_NEXT32(replacement
);
437 groupNum
=groupNum
*10 + u_charDigitValue(digitC
);
439 if (numDigits
>= fPattern
->fMaxCaptureDigits
) {
445 if (numDigits
== 0) {
446 // The $ didn't introduce a group number at all.
447 // Treat it as just part of the substitution text.
448 UChar c16
= DOLLARSIGN
;
449 destLen
+= utext_replace(dest
, destLen
, destLen
, &c16
, 1, &status
);
451 // Finally, append the capture group data to the destination.
452 destLen
+= appendGroup(groupNum
, dest
, status
);
453 if (U_FAILURE(status
)) {
454 // Can fail if group number is out of range.
460 if (U_FAILURE(status
)) {
463 c
= UTEXT_NEXT32(replacement
);
472 //--------------------------------------------------------------------------------
474 // appendTail Intended to be used in conjunction with appendReplacement()
475 // To the destination string, append everything following
476 // the last match position from the input string.
478 // Note: Match ranges do not affect appendTail or appendReplacement
480 //--------------------------------------------------------------------------------
481 UnicodeString
&RegexMatcher::appendTail(UnicodeString
&dest
) {
482 UErrorCode status
= U_ZERO_ERROR
;
483 UText resultText
= UTEXT_INITIALIZER
;
484 utext_openUnicodeString(&resultText
, &dest
, &status
);
486 if (U_SUCCESS(status
)) {
487 appendTail(&resultText
, status
);
488 utext_close(&resultText
);
495 // appendTail, UText mode
497 UText
*RegexMatcher::appendTail(UText
*dest
, UErrorCode
&status
) {
498 if (U_FAILURE(status
)) {
501 if (U_FAILURE(fDeferredStatus
)) {
502 status
= fDeferredStatus
;
506 if (fInputLength
> fAppendPosition
) {
507 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
508 int64_t destLen
= utext_nativeLength(dest
);
509 utext_replace(dest
, destLen
, destLen
, fInputText
->chunkContents
+fAppendPosition
,
510 (int32_t)(fInputLength
-fAppendPosition
), &status
);
513 if (UTEXT_USES_U16(fInputText
)) {
514 len16
= (int32_t)(fInputLength
-fAppendPosition
);
516 len16
= utext_extract(fInputText
, fAppendPosition
, fInputLength
, NULL
, 0, &status
);
517 status
= U_ZERO_ERROR
; // buffer overflow
520 UChar
*inputChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(len16
));
521 if (inputChars
== NULL
) {
522 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
524 utext_extract(fInputText
, fAppendPosition
, fInputLength
, inputChars
, len16
, &status
); // unterminated
525 int64_t destLen
= utext_nativeLength(dest
);
526 utext_replace(dest
, destLen
, destLen
, inputChars
, len16
, &status
);
527 uprv_free(inputChars
);
536 //--------------------------------------------------------------------------------
540 //--------------------------------------------------------------------------------
541 int32_t RegexMatcher::end(UErrorCode
&err
) const {
// end64(err): convenience overload that forwards to end64(group, err)
// with group number 0. The error code is passed through unchanged, so
// any failure reported by the group overload is visible to the caller.
545 int64_t RegexMatcher::end64(UErrorCode
&err
) const {
546 return end64(0, err
);
549 int64_t RegexMatcher::end64(int32_t group
, UErrorCode
&err
) const {
550 if (U_FAILURE(err
)) {
553 if (fMatch
== FALSE
) {
554 err
= U_REGEX_INVALID_STATE
;
557 if (group
< 0 || group
> fPattern
->fGroupMap
->size()) {
558 err
= U_INDEX_OUTOFBOUNDS_ERROR
;
565 // Get the position within the stack frame of the variables for
566 // this capture group.
567 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(group
-1);
568 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
569 U_ASSERT(groupOffset
>= 0);
570 e
= fFrame
->fExtra
[groupOffset
+ 1];
// end(group, err): 32-bit variant of end64(group, err).
// Delegates all checking and lookup to end64() and narrows the 64-bit
// result with a plain cast, so a value that does not fit in int32_t is
// truncated rather than reported as an error.
576 int32_t RegexMatcher::end(int32_t group
, UErrorCode
&err
) const {
577 return (int32_t)end64(group
, err
);
581 //--------------------------------------------------------------------------------
585 //--------------------------------------------------------------------------------
586 UBool
RegexMatcher::find() {
587 // Start at the position of the last match end. (Will be zero if the
588 // matcher has been reset.)
590 if (U_FAILURE(fDeferredStatus
)) {
594 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
595 return findUsingChunk();
598 int64_t startPos
= fMatchEnd
;
600 startPos
= fActiveStart
;
604 // Save the position of any previous successful match.
605 fLastMatchEnd
= fMatchEnd
;
607 if (fMatchStart
== fMatchEnd
) {
608 // Previous match had zero length. Move start position up one position
609 // to avoid sending find() into a loop on zero-length matches.
610 if (startPos
>= fActiveLimit
) {
615 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
616 (void)UTEXT_NEXT32(fInputText
);
617 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
620 if (fLastMatchEnd
>= 0) {
621 // A previous find() failed to match. Don't try again.
622 // (without this test, a pattern with a zero-length match
623 // could match again at the end of an input string.)
630 // Compute the position in the input string beyond which a match can not begin, because
631 // the minimum length match would extend past the end of the input.
632 // Note: some patterns that cannot match anything will have fMinMatchLen==Max Int.
633 // Be aware of possible overflows if making changes here.
634 int64_t testStartLimit
;
635 if (UTEXT_USES_U16(fInputText
)) {
636 testStartLimit
= fActiveLimit
- fPattern
->fMinMatchLen
;
637 if (startPos
> testStartLimit
) {
643 // For now, let the matcher discover that it can't match on its own
644 // We don't know how long the match len is in native characters
645 testStartLimit
= fActiveLimit
;
649 U_ASSERT(startPos
>= 0);
651 switch (fPattern
->fStartType
) {
653 // No optimization was found.
654 // Try a match at each input position.
656 MatchAt(startPos
, FALSE
, fDeferredStatus
);
657 if (U_FAILURE(fDeferredStatus
)) {
663 if (startPos
>= testStartLimit
) {
667 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
668 (void)UTEXT_NEXT32(fInputText
);
669 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
670 // Note that it's perfectly OK for a pattern to have a zero-length
671 // match at the end of a string, so we must make sure that the loop
672 // runs with startPos == testStartLimit the last time through.
673 if (REGEXFINDPROGRESS_INTERRUPT(startPos
, fDeferredStatus
))
679 // Matches are only possible at the start of the input string
680 // (pattern begins with ^ or \A)
681 if (startPos
> fActiveStart
) {
685 MatchAt(startPos
, FALSE
, fDeferredStatus
);
686 if (U_FAILURE(fDeferredStatus
)) {
694 // Match may start on any char from a pre-computed set.
695 U_ASSERT(fPattern
->fMinMatchLen
> 0);
697 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
699 c
= UTEXT_NEXT32(fInputText
);
700 pos
= UTEXT_GETNATIVEINDEX(fInputText
);
701 // c will be -1 (U_SENTINEL) at end of text, in which case we
702 // skip this next block (so we don't have a negative array index)
703 // and handle end of text in the following block.
704 if (c
>= 0 && ((c
<256 && fPattern
->fInitialChars8
->contains(c
)) ||
705 (c
>=256 && fPattern
->fInitialChars
->contains(c
)))) {
706 MatchAt(startPos
, FALSE
, fDeferredStatus
);
707 if (U_FAILURE(fDeferredStatus
)) {
713 UTEXT_SETNATIVEINDEX(fInputText
, pos
);
715 if (startPos
>= testStartLimit
) {
721 if (REGEXFINDPROGRESS_INTERRUPT(startPos
, fDeferredStatus
))
730 // Match starts on exactly one char.
731 U_ASSERT(fPattern
->fMinMatchLen
> 0);
732 UChar32 theChar
= fPattern
->fInitialChar
;
734 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
736 c
= UTEXT_NEXT32(fInputText
);
737 pos
= UTEXT_GETNATIVEINDEX(fInputText
);
739 MatchAt(startPos
, FALSE
, fDeferredStatus
);
740 if (U_FAILURE(fDeferredStatus
)) {
746 UTEXT_SETNATIVEINDEX(fInputText
, pos
);
748 if (startPos
>= testStartLimit
) {
754 if (REGEXFINDPROGRESS_INTERRUPT(startPos
, fDeferredStatus
))
763 if (startPos
== fAnchorStart
) {
764 MatchAt(startPos
, FALSE
, fDeferredStatus
);
765 if (U_FAILURE(fDeferredStatus
)) {
771 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
772 c
= UTEXT_NEXT32(fInputText
);
773 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
775 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
776 c
= UTEXT_PREVIOUS32(fInputText
);
777 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
780 if (fPattern
->fFlags
& UREGEX_UNIX_LINES
) {
783 MatchAt(startPos
, FALSE
, fDeferredStatus
);
784 if (U_FAILURE(fDeferredStatus
)) {
790 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
792 if (startPos
>= testStartLimit
) {
797 c
= UTEXT_NEXT32(fInputText
);
798 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
799 // Note that it's perfectly OK for a pattern to have a zero-length
800 // match at the end of a string, so we must make sure that the loop
801 // runs with startPos == testStartLimit the last time through.
802 if (REGEXFINDPROGRESS_INTERRUPT(startPos
, fDeferredStatus
))
807 if (((c
& 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
808 ((c
<=0x0d && c
>=0x0a) || c
==0x85 ||c
==0x2028 || c
==0x2029 )) {
809 if (c
== 0x0d && startPos
< fActiveLimit
&& UTEXT_CURRENT32(fInputText
) == 0x0a) {
810 (void)UTEXT_NEXT32(fInputText
);
811 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
813 MatchAt(startPos
, FALSE
, fDeferredStatus
);
814 if (U_FAILURE(fDeferredStatus
)) {
820 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
822 if (startPos
>= testStartLimit
) {
827 c
= UTEXT_NEXT32(fInputText
);
828 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
829 // Note that it's perfectly OK for a pattern to have a zero-length
830 // match at the end of a string, so we must make sure that the loop
831 // runs with startPos == testStartLimit the last time through.
832 if (REGEXFINDPROGRESS_INTERRUPT(startPos
, fDeferredStatus
))
848 UBool
RegexMatcher::find(int64_t start
, UErrorCode
&status
) {
849 if (U_FAILURE(status
)) {
852 if (U_FAILURE(fDeferredStatus
)) {
853 status
= fDeferredStatus
;
856 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
857 // This will reset the region to be the full input length.
859 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
863 int64_t nativeStart
= start
;
864 if (nativeStart
< fActiveStart
|| nativeStart
> fActiveLimit
) {
865 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
868 fMatchEnd
= nativeStart
;
873 //--------------------------------------------------------------------------------
875 // findUsingChunk() -- like find(), but with the advance knowledge that the
876 // entire string is available in the UText's chunk buffer.
878 //--------------------------------------------------------------------------------
879 UBool
RegexMatcher::findUsingChunk() {
880 // Start at the position of the last match end. (Will be zero if the
881 // matcher has been reset.)
884 int32_t startPos
= (int32_t)fMatchEnd
;
886 startPos
= (int32_t)fActiveStart
;
889 const UChar
*inputBuf
= fInputText
->chunkContents
;
892 // Save the position of any previous successful match.
893 fLastMatchEnd
= fMatchEnd
;
895 if (fMatchStart
== fMatchEnd
) {
896 // Previous match had zero length. Move start position up one position
897 // to avoid sending find() into a loop on zero-length matches.
898 if (startPos
>= fActiveLimit
) {
903 U16_FWD_1(inputBuf
, startPos
, fInputLength
);
906 if (fLastMatchEnd
>= 0) {
907 // A previous find() failed to match. Don't try again.
908 // (without this test, a pattern with a zero-length match
909 // could match again at the end of an input string.)
916 // Compute the position in the input string beyond which a match can not begin, because
917 // the minimum length match would extend past the end of the input.
918 // Note: some patterns that cannot match anything will have fMinMatchLen==Max Int.
919 // Be aware of possible overflows if making changes here.
920 int32_t testLen
= (int32_t)(fActiveLimit
- fPattern
->fMinMatchLen
);
921 if (startPos
> testLen
) {
928 U_ASSERT(startPos
>= 0);
930 switch (fPattern
->fStartType
) {
932 // No optimization was found.
933 // Try a match at each input position.
935 MatchChunkAt(startPos
, FALSE
, fDeferredStatus
);
936 if (U_FAILURE(fDeferredStatus
)) {
942 if (startPos
>= testLen
) {
946 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
947 // Note that it's perfectly OK for a pattern to have a zero-length
948 // match at the end of a string, so we must make sure that the loop
949 // runs with startPos == testLen the last time through.
950 if (REGEXFINDPROGRESS_INTERRUPT(startPos
, fDeferredStatus
))
956 // Matches are only possible at the start of the input string
957 // (pattern begins with ^ or \A)
958 if (startPos
> fActiveStart
) {
962 MatchChunkAt(startPos
, FALSE
, fDeferredStatus
);
963 if (U_FAILURE(fDeferredStatus
)) {
971 // Match may start on any char from a pre-computed set.
972 U_ASSERT(fPattern
->fMinMatchLen
> 0);
974 int32_t pos
= startPos
;
975 U16_NEXT(inputBuf
, startPos
, fActiveLimit
, c
); // like c = inputBuf[startPos++];
976 if ((c
<256 && fPattern
->fInitialChars8
->contains(c
)) ||
977 (c
>=256 && fPattern
->fInitialChars
->contains(c
))) {
978 MatchChunkAt(pos
, FALSE
, fDeferredStatus
);
979 if (U_FAILURE(fDeferredStatus
)) {
986 if (pos
>= testLen
) {
991 if (REGEXFINDPROGRESS_INTERRUPT(startPos
, fDeferredStatus
))
1000 // Match starts on exactly one char.
1001 U_ASSERT(fPattern
->fMinMatchLen
> 0);
1002 UChar32 theChar
= fPattern
->fInitialChar
;
1004 int32_t pos
= startPos
;
1005 U16_NEXT(inputBuf
, startPos
, fActiveLimit
, c
); // like c = inputBuf[startPos++];
1007 MatchChunkAt(pos
, FALSE
, fDeferredStatus
);
1008 if (U_FAILURE(fDeferredStatus
)) {
1015 if (pos
>= testLen
) {
1020 if (REGEXFINDPROGRESS_INTERRUPT(startPos
, fDeferredStatus
))
1029 if (startPos
== fAnchorStart
) {
1030 MatchChunkAt(startPos
, FALSE
, fDeferredStatus
);
1031 if (U_FAILURE(fDeferredStatus
)) {
1037 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
1040 if (fPattern
->fFlags
& UREGEX_UNIX_LINES
) {
1042 c
= inputBuf
[startPos
-1];
1044 MatchChunkAt(startPos
, FALSE
, fDeferredStatus
);
1045 if (U_FAILURE(fDeferredStatus
)) {
1052 if (startPos
>= testLen
) {
1057 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
1058 // Note that it's perfectly OK for a pattern to have a zero-length
1059 // match at the end of a string, so we must make sure that the loop
1060 // runs with startPos == testLen the last time through.
1061 if (REGEXFINDPROGRESS_INTERRUPT(startPos
, fDeferredStatus
))
1066 c
= inputBuf
[startPos
-1];
1067 if (((c
& 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
1068 ((c
<=0x0d && c
>=0x0a) || c
==0x85 ||c
==0x2028 || c
==0x2029 )) {
1069 if (c
== 0x0d && startPos
< fActiveLimit
&& inputBuf
[startPos
] == 0x0a) {
1072 MatchChunkAt(startPos
, FALSE
, fDeferredStatus
);
1073 if (U_FAILURE(fDeferredStatus
)) {
1080 if (startPos
>= testLen
) {
1085 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
1086 // Note that it's perfectly OK for a pattern to have a zero-length
1087 // match at the end of a string, so we must make sure that the loop
1088 // runs with startPos == testLen the last time through.
1089 if (REGEXFINDPROGRESS_INTERRUPT(startPos
, fDeferredStatus
))
1105 //--------------------------------------------------------------------------------
1109 //--------------------------------------------------------------------------------
// group(status): convenience overload returning a UnicodeString.
// Forwards to group(groupNum, status) with group number 0; status is
// passed through unchanged.
1110 UnicodeString
RegexMatcher::group(UErrorCode
&status
) const {
1111 return group(0, status
);
1114 // Return immutable shallow clone
// group(dest, group_len, status): UText overload without a group number.
// Forwards to group(groupNum, dest, group_len, status) with group number 0;
// dest, group_len and status are handed to that overload untouched.
1115 UText
*RegexMatcher::group(UText
*dest
, int64_t &group_len
, UErrorCode
&status
) const {
1116 return group(0, dest
, group_len
, status
);
1119 // Return immutable shallow clone
1120 UText
*RegexMatcher::group(int32_t groupNum
, UText
*dest
, int64_t &group_len
, UErrorCode
&status
) const {
1122 if (U_FAILURE(status
)) {
1125 if (U_FAILURE(fDeferredStatus
)) {
1126 status
= fDeferredStatus
;
1127 } else if (fMatch
== FALSE
) {
1128 status
= U_REGEX_INVALID_STATE
;
1129 } else if (groupNum
< 0 || groupNum
> fPattern
->fGroupMap
->size()) {
1130 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1133 if (U_FAILURE(status
)) {
1138 if (groupNum
== 0) {
1142 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(groupNum
-1);
1143 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
1144 U_ASSERT(groupOffset
>= 0);
1145 s
= fFrame
->fExtra
[groupOffset
];
1146 e
= fFrame
->fExtra
[groupOffset
+1];
1150 // A capture group wasn't part of the match
1151 return utext_clone(dest
, fInputText
, FALSE
, TRUE
, &status
);
1156 dest
= utext_clone(dest
, fInputText
, FALSE
, TRUE
, &status
);
1158 UTEXT_SETNATIVEINDEX(dest
, s
);
1162 UnicodeString
RegexMatcher::group(int32_t groupNum
, UErrorCode
&status
) const {
1163 UnicodeString result
;
1164 if (U_FAILURE(status
)) {
1167 UText resultText
= UTEXT_INITIALIZER
;
1168 utext_openUnicodeString(&resultText
, &result
, &status
);
1169 group(groupNum
, &resultText
, status
);
1170 utext_close(&resultText
);
1175 // Return deep (mutable) clone
1176 // Technology Preview (as an API), but note that the UnicodeString API is implemented
1177 // using this function.
1178 UText
*RegexMatcher::group(int32_t groupNum
, UText
*dest
, UErrorCode
&status
) const {
1179 if (U_FAILURE(status
)) {
1183 if (U_FAILURE(fDeferredStatus
)) {
1184 status
= fDeferredStatus
;
1185 } else if (fMatch
== FALSE
) {
1186 status
= U_REGEX_INVALID_STATE
;
1187 } else if (groupNum
< 0 || groupNum
> fPattern
->fGroupMap
->size()) {
1188 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1190 if (U_FAILURE(status
)) {
1195 if (groupNum
== 0) {
1199 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(groupNum
-1);
1200 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
1201 U_ASSERT(groupOffset
>= 0);
1202 s
= fFrame
->fExtra
[groupOffset
];
1203 e
= fFrame
->fExtra
[groupOffset
+1];
1207 // A capture group wasn't part of the match
1209 utext_replace(dest
, 0, utext_nativeLength(dest
), NULL
, 0, &status
);
1212 return utext_openUChars(NULL
, NULL
, 0, &status
);
1217 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1218 U_ASSERT(e
<= fInputLength
);
1220 utext_replace(dest
, 0, utext_nativeLength(dest
), fInputText
->chunkContents
+s
, (int32_t)(e
-s
), &status
);
1222 UText groupText
= UTEXT_INITIALIZER
;
1223 utext_openUChars(&groupText
, fInputText
->chunkContents
+s
, e
-s
, &status
);
1224 dest
= utext_clone(NULL
, &groupText
, TRUE
, FALSE
, &status
);
1225 utext_close(&groupText
);
1229 if (UTEXT_USES_U16(fInputText
)) {
1230 len16
= (int32_t)(e
-s
);
1232 UErrorCode lengthStatus
= U_ZERO_ERROR
;
1233 len16
= utext_extract(fInputText
, s
, e
, NULL
, 0, &lengthStatus
);
1235 UChar
*groupChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(len16
+1));
1236 if (groupChars
== NULL
) {
1237 status
= U_MEMORY_ALLOCATION_ERROR
;
1240 utext_extract(fInputText
, s
, e
, groupChars
, len16
+1, &status
);
1243 utext_replace(dest
, 0, utext_nativeLength(dest
), groupChars
, len16
, &status
);
1245 UText groupText
= UTEXT_INITIALIZER
;
1246 utext_openUChars(&groupText
, groupChars
, len16
, &status
);
1247 dest
= utext_clone(NULL
, &groupText
, TRUE
, FALSE
, &status
);
1248 utext_close(&groupText
);
1251 uprv_free(groupChars
);
1256 //--------------------------------------------------------------------------------
1258 // appendGroup() -- currently internal only, appends a group to a UText rather
1259 // than replacing its contents
1261 //--------------------------------------------------------------------------------
1263 int64_t RegexMatcher::appendGroup(int32_t groupNum
, UText
*dest
, UErrorCode
&status
) const {
1264 if (U_FAILURE(status
)) {
1267 if (U_FAILURE(fDeferredStatus
)) {
1268 status
= fDeferredStatus
;
1271 int64_t destLen
= utext_nativeLength(dest
);
1273 if (fMatch
== FALSE
) {
1274 status
= U_REGEX_INVALID_STATE
;
1275 return utext_replace(dest
, destLen
, destLen
, NULL
, 0, &status
);
1277 if (groupNum
< 0 || groupNum
> fPattern
->fGroupMap
->size()) {
1278 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1279 return utext_replace(dest
, destLen
, destLen
, NULL
, 0, &status
);
1283 if (groupNum
== 0) {
1287 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(groupNum
-1);
1288 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
1289 U_ASSERT(groupOffset
>= 0);
1290 s
= fFrame
->fExtra
[groupOffset
];
1291 e
= fFrame
->fExtra
[groupOffset
+1];
1295 // A capture group wasn't part of the match
1296 return utext_replace(dest
, destLen
, destLen
, NULL
, 0, &status
);
1301 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1302 U_ASSERT(e
<= fInputLength
);
1303 deltaLen
= utext_replace(dest
, destLen
, destLen
, fInputText
->chunkContents
+s
, (int32_t)(e
-s
), &status
);
1306 if (UTEXT_USES_U16(fInputText
)) {
1307 len16
= (int32_t)(e
-s
);
1309 UErrorCode lengthStatus
= U_ZERO_ERROR
;
1310 len16
= utext_extract(fInputText
, s
, e
, NULL
, 0, &lengthStatus
);
1312 UChar
*groupChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(len16
+1));
1313 if (groupChars
== NULL
) {
1314 status
= U_MEMORY_ALLOCATION_ERROR
;
1317 utext_extract(fInputText
, s
, e
, groupChars
, len16
+1, &status
);
1319 deltaLen
= utext_replace(dest
, destLen
, destLen
, groupChars
, len16
, &status
);
1320 uprv_free(groupChars
);
1327 //--------------------------------------------------------------------------------
1331 //--------------------------------------------------------------------------------
// groupCount(): returns the number of entries in the pattern's capture
// group map (fPattern->fGroupMap). This is the same size() value that the
// group accessors in this file use as the upper bound for a group number.
1332 int32_t RegexMatcher::groupCount() const {
1333 return fPattern
->fGroupMap
->size();
1338 //--------------------------------------------------------------------------------
1340 // hasAnchoringBounds()
1342 //--------------------------------------------------------------------------------
1343 UBool
RegexMatcher::hasAnchoringBounds() const {
1344 return fAnchoringBounds
;
1348 //--------------------------------------------------------------------------------
1350 // hasTransparentBounds()
1352 //--------------------------------------------------------------------------------
1353 UBool
RegexMatcher::hasTransparentBounds() const {
1354 return fTransparentBounds
;
1359 //--------------------------------------------------------------------------------
1363 //--------------------------------------------------------------------------------
1364 UBool
RegexMatcher::hitEnd() const {
1369 //--------------------------------------------------------------------------------
1373 //--------------------------------------------------------------------------------
1374 const UnicodeString
&RegexMatcher::input() const {
1376 UErrorCode status
= U_ZERO_ERROR
;
1378 if (UTEXT_USES_U16(fInputText
)) {
1379 len16
= (int32_t)fInputLength
;
1381 len16
= utext_extract(fInputText
, 0, fInputLength
, NULL
, 0, &status
);
1382 status
= U_ZERO_ERROR
; // overflow, length status
1384 UnicodeString
*result
= new UnicodeString(len16
, 0, 0);
1386 UChar
*inputChars
= result
->getBuffer(len16
);
1387 utext_extract(fInputText
, 0, fInputLength
, inputChars
, len16
, &status
); // unterminated warning
1388 result
->releaseBuffer(len16
);
1390 (*(const UnicodeString
**)&fInput
) = result
; // pointer assignment, rather than operator=
1396 //--------------------------------------------------------------------------------
1400 //--------------------------------------------------------------------------------
1401 UText
*RegexMatcher::inputText() const {
1406 //--------------------------------------------------------------------------------
1408 // getInput() -- like inputText(), but makes a clone or copies into another UText
1410 //--------------------------------------------------------------------------------
1411 UText
*RegexMatcher::getInput (UText
*dest
, UErrorCode
&status
) const {
1412 if (U_FAILURE(status
)) {
1415 if (U_FAILURE(fDeferredStatus
)) {
1416 status
= fDeferredStatus
;
1421 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1422 utext_replace(dest
, 0, utext_nativeLength(dest
), fInputText
->chunkContents
, (int32_t)fInputLength
, &status
);
1425 if (UTEXT_USES_U16(fInputText
)) {
1426 input16Len
= (int32_t)fInputLength
;
1428 UErrorCode lengthStatus
= U_ZERO_ERROR
;
1429 input16Len
= utext_extract(fInputText
, 0, fInputLength
, NULL
, 0, &lengthStatus
); // buffer overflow error
1431 UChar
*inputChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(input16Len
));
1432 if (inputChars
== NULL
) {
1436 status
= U_ZERO_ERROR
;
1437 utext_extract(fInputText
, 0, fInputLength
, inputChars
, input16Len
, &status
); // not terminated warning
1438 status
= U_ZERO_ERROR
;
1439 utext_replace(dest
, 0, utext_nativeLength(dest
), inputChars
, input16Len
, &status
);
1441 uprv_free(inputChars
);
1445 return utext_clone(NULL
, fInputText
, FALSE
, TRUE
, &status
);
1450 static UBool
compat_SyncMutableUTextContents(UText
*ut
);
1451 static UBool
compat_SyncMutableUTextContents(UText
*ut
) {
1452 UBool retVal
= FALSE
;
1454 // In the following test, we're really only interested in whether the UText should switch
1455 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
1456 // will still point to the correct data.
1457 if (utext_nativeLength(ut
) != ut
->nativeIndexingLimit
) {
1458 UnicodeString
*us
=(UnicodeString
*)ut
->context
;
1460 // Update to the latest length.
1461 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1462 int32_t newLength
= us
->length();
1464 // Update the chunk description.
1465 // The buffer may have switched between stack- and heap-based.
1466 ut
->chunkContents
= us
->getBuffer();
1467 ut
->chunkLength
= newLength
;
1468 ut
->chunkNativeLimit
= newLength
;
1469 ut
->nativeIndexingLimit
= newLength
;
1476 //--------------------------------------------------------------------------------
1480 //--------------------------------------------------------------------------------
1481 UBool
RegexMatcher::lookingAt(UErrorCode
&status
) {
1482 if (U_FAILURE(status
)) {
1485 if (U_FAILURE(fDeferredStatus
)) {
1486 status
= fDeferredStatus
;
1490 if (fInputUniStrMaybeMutable
) {
1491 if (compat_SyncMutableUTextContents(fInputText
)) {
1492 fInputLength
= utext_nativeLength(fInputText
);
1497 resetPreserveRegion();
1499 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1500 MatchChunkAt((int32_t)fActiveStart
, FALSE
, status
);
1502 MatchAt(fActiveStart
, FALSE
, status
);
1508 UBool
RegexMatcher::lookingAt(int64_t start
, UErrorCode
&status
) {
1509 if (U_FAILURE(status
)) {
1512 if (U_FAILURE(fDeferredStatus
)) {
1513 status
= fDeferredStatus
;
1519 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1523 if (fInputUniStrMaybeMutable
) {
1524 if (compat_SyncMutableUTextContents(fInputText
)) {
1525 fInputLength
= utext_nativeLength(fInputText
);
1530 int64_t nativeStart
;
1531 nativeStart
= start
;
1532 if (nativeStart
< fActiveStart
|| nativeStart
> fActiveLimit
) {
1533 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1537 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1538 MatchChunkAt((int32_t)nativeStart
, FALSE
, status
);
1540 MatchAt(nativeStart
, FALSE
, status
);
1547 //--------------------------------------------------------------------------------
1551 //--------------------------------------------------------------------------------
1552 UBool
RegexMatcher::matches(UErrorCode
&status
) {
1553 if (U_FAILURE(status
)) {
1556 if (U_FAILURE(fDeferredStatus
)) {
1557 status
= fDeferredStatus
;
1561 if (fInputUniStrMaybeMutable
) {
1562 if (compat_SyncMutableUTextContents(fInputText
)) {
1563 fInputLength
= utext_nativeLength(fInputText
);
1568 resetPreserveRegion();
1571 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1572 MatchChunkAt((int32_t)fActiveStart
, TRUE
, status
);
1574 MatchAt(fActiveStart
, TRUE
, status
);
1580 UBool
RegexMatcher::matches(int64_t start
, UErrorCode
&status
) {
1581 if (U_FAILURE(status
)) {
1584 if (U_FAILURE(fDeferredStatus
)) {
1585 status
= fDeferredStatus
;
1591 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1595 if (fInputUniStrMaybeMutable
) {
1596 if (compat_SyncMutableUTextContents(fInputText
)) {
1597 fInputLength
= utext_nativeLength(fInputText
);
1602 int64_t nativeStart
;
1603 nativeStart
= start
;
1604 if (nativeStart
< fActiveStart
|| nativeStart
> fActiveLimit
) {
1605 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1609 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1610 MatchChunkAt((int32_t)nativeStart
, TRUE
, status
);
1612 MatchAt(nativeStart
, TRUE
, status
);
1619 //--------------------------------------------------------------------------------
1623 //--------------------------------------------------------------------------------
1624 const RegexPattern
&RegexMatcher::pattern() const {
1630 //--------------------------------------------------------------------------------
1634 //--------------------------------------------------------------------------------
1635 RegexMatcher
&RegexMatcher::region(int64_t regionStart
, int64_t regionLimit
, int64_t startIndex
, UErrorCode
&status
) {
1636 if (U_FAILURE(status
)) {
1640 if (regionStart
>regionLimit
|| regionStart
<0 || regionLimit
<0) {
1641 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1644 int64_t nativeStart
= regionStart
;
1645 int64_t nativeLimit
= regionLimit
;
1646 if (nativeStart
> fInputLength
|| nativeLimit
> fInputLength
) {
1647 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1650 if (startIndex
== -1)
1653 resetPreserveRegion();
1655 fRegionStart
= nativeStart
;
1656 fRegionLimit
= nativeLimit
;
1657 fActiveStart
= nativeStart
;
1658 fActiveLimit
= nativeLimit
;
1660 if (startIndex
!= -1) {
1661 if (startIndex
< fActiveStart
|| startIndex
> fActiveLimit
) {
1662 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1664 fMatchEnd
= startIndex
;
1667 if (!fTransparentBounds
) {
1668 fLookStart
= nativeStart
;
1669 fLookLimit
= nativeLimit
;
1671 if (fAnchoringBounds
) {
1672 fAnchorStart
= nativeStart
;
1673 fAnchorLimit
= nativeLimit
;
1678 RegexMatcher
&RegexMatcher::region(int64_t start
, int64_t limit
, UErrorCode
&status
) {
1679 return region(start
, limit
, -1, status
);
1682 //--------------------------------------------------------------------------------
1686 //--------------------------------------------------------------------------------
1687 int32_t RegexMatcher::regionEnd() const {
1688 return (int32_t)fRegionLimit
;
1691 int64_t RegexMatcher::regionEnd64() const {
1692 return fRegionLimit
;
1695 //--------------------------------------------------------------------------------
1699 //--------------------------------------------------------------------------------
1700 int32_t RegexMatcher::regionStart() const {
1701 return (int32_t)fRegionStart
;
1704 int64_t RegexMatcher::regionStart64() const {
1705 return fRegionStart
;
1709 //--------------------------------------------------------------------------------
1713 //--------------------------------------------------------------------------------
1714 UnicodeString
RegexMatcher::replaceAll(const UnicodeString
&replacement
, UErrorCode
&status
) {
1715 UText replacementText
= UTEXT_INITIALIZER
;
1716 UText resultText
= UTEXT_INITIALIZER
;
1717 UnicodeString resultString
;
1718 if (U_FAILURE(status
)) {
1719 return resultString
;
1722 utext_openConstUnicodeString(&replacementText
, &replacement
, &status
);
1723 utext_openUnicodeString(&resultText
, &resultString
, &status
);
1725 replaceAll(&replacementText
, &resultText
, status
);
1727 utext_close(&resultText
);
1728 utext_close(&replacementText
);
1730 return resultString
;
1735 // replaceAll, UText mode
1737 UText
*RegexMatcher::replaceAll(UText
*replacement
, UText
*dest
, UErrorCode
&status
) {
1738 if (U_FAILURE(status
)) {
1741 if (U_FAILURE(fDeferredStatus
)) {
1742 status
= fDeferredStatus
;
1747 UnicodeString emptyString
;
1748 UText empty
= UTEXT_INITIALIZER
;
1750 utext_openUnicodeString(&empty
, &emptyString
, &status
);
1751 dest
= utext_clone(NULL
, &empty
, TRUE
, FALSE
, &status
);
1752 utext_close(&empty
);
1755 if (U_SUCCESS(status
)) {
1758 appendReplacement(dest
, replacement
, status
);
1759 if (U_FAILURE(status
)) {
1763 appendTail(dest
, status
);
1770 //--------------------------------------------------------------------------------
1774 //--------------------------------------------------------------------------------
1775 UnicodeString
RegexMatcher::replaceFirst(const UnicodeString
&replacement
, UErrorCode
&status
) {
1776 UText replacementText
= UTEXT_INITIALIZER
;
1777 UText resultText
= UTEXT_INITIALIZER
;
1778 UnicodeString resultString
;
1780 utext_openConstUnicodeString(&replacementText
, &replacement
, &status
);
1781 utext_openUnicodeString(&resultText
, &resultString
, &status
);
1783 replaceFirst(&replacementText
, &resultText
, status
);
1785 utext_close(&resultText
);
1786 utext_close(&replacementText
);
1788 return resultString
;
1792 // replaceFirst, UText mode
1794 UText
*RegexMatcher::replaceFirst(UText
*replacement
, UText
*dest
, UErrorCode
&status
) {
1795 if (U_FAILURE(status
)) {
1798 if (U_FAILURE(fDeferredStatus
)) {
1799 status
= fDeferredStatus
;
1805 return getInput(dest
, status
);
1809 UnicodeString emptyString
;
1810 UText empty
= UTEXT_INITIALIZER
;
1812 utext_openUnicodeString(&empty
, &emptyString
, &status
);
1813 dest
= utext_clone(NULL
, &empty
, TRUE
, FALSE
, &status
);
1814 utext_close(&empty
);
1817 appendReplacement(dest
, replacement
, status
);
1818 appendTail(dest
, status
);
1824 //--------------------------------------------------------------------------------
1828 //--------------------------------------------------------------------------------
1829 UBool
RegexMatcher::requireEnd() const {
1834 //--------------------------------------------------------------------------------
1838 //--------------------------------------------------------------------------------
1839 RegexMatcher
&RegexMatcher::reset() {
1841 fRegionLimit
= fInputLength
;
1843 fActiveLimit
= fInputLength
;
1845 fAnchorLimit
= fInputLength
;
1847 fLookLimit
= fInputLength
;
1848 resetPreserveRegion();
1854 void RegexMatcher::resetPreserveRegion() {
1858 fAppendPosition
= 0;
1861 fRequireEnd
= FALSE
;
1863 fTickCounter
= TIMER_INITIAL_VALUE
;
1864 //resetStack(); // more expensive than it looks...
1868 RegexMatcher
&RegexMatcher::reset(const UnicodeString
&input
) {
1869 fInputText
= utext_openConstUnicodeString(fInputText
, &input
, &fDeferredStatus
);
1870 if (fPattern
->fNeedsAltInput
) {
1871 fAltInputText
= utext_clone(fAltInputText
, fInputText
, FALSE
, TRUE
, &fDeferredStatus
);
1873 fInputLength
= utext_nativeLength(fInputText
);
1879 // Do the following for any UnicodeString.
1880 // This is for compatibility for those clients who modify the input string "live" during regex operations.
1881 fInputUniStrMaybeMutable
= TRUE
;
1883 if (fWordBreakItr
!= NULL
) {
1884 #if UCONFIG_NO_BREAK_ITERATION==0
1885 UErrorCode status
= U_ZERO_ERROR
;
1886 fWordBreakItr
->setText(fInputText
, status
);
1893 RegexMatcher
&RegexMatcher::reset(UText
*input
) {
1894 if (fInputText
!= input
) {
1895 fInputText
= utext_clone(fInputText
, input
, FALSE
, TRUE
, &fDeferredStatus
);
1896 if (fPattern
->fNeedsAltInput
) fAltInputText
= utext_clone(fAltInputText
, fInputText
, FALSE
, TRUE
, &fDeferredStatus
);
1897 fInputLength
= utext_nativeLength(fInputText
);
1902 if (fWordBreakItr
!= NULL
) {
1903 #if UCONFIG_NO_BREAK_ITERATION==0
1904 UErrorCode status
= U_ZERO_ERROR
;
1905 fWordBreakItr
->setText(input
, status
);
1910 fInputUniStrMaybeMutable
= FALSE
;
1915 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1916 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1920 RegexMatcher
&RegexMatcher::reset(int64_t position
, UErrorCode
&status
) {
1921 if (U_FAILURE(status
)) {
1924 reset(); // Reset also resets the region to be the entire string.
1926 if (position
< 0 || position
> fActiveLimit
) {
1927 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1930 fMatchEnd
= position
;
1935 //--------------------------------------------------------------------------------
1939 //--------------------------------------------------------------------------------
1940 RegexMatcher
&RegexMatcher::refreshInputText(UText
*input
, UErrorCode
&status
) {
1941 if (U_FAILURE(status
)) {
1944 if (input
== NULL
) {
1945 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1948 if (utext_nativeLength(fInputText
) != utext_nativeLength(input
)) {
1949 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1952 int64_t pos
= utext_getNativeIndex(fInputText
);
1953 // Shallow read-only clone of the new UText into the existing input UText
1954 fInputText
= utext_clone(fInputText
, input
, FALSE
, TRUE
, &status
);
1955 if (U_FAILURE(status
)) {
1958 utext_setNativeIndex(fInputText
, pos
);
1960 if (fAltInputText
!= NULL
) {
1961 pos
= utext_getNativeIndex(fAltInputText
);
1962 fAltInputText
= utext_clone(fAltInputText
, input
, FALSE
, TRUE
, &status
);
1963 if (U_FAILURE(status
)) {
1966 utext_setNativeIndex(fAltInputText
, pos
);
1973 //--------------------------------------------------------------------------------
1977 //--------------------------------------------------------------------------------
1978 void RegexMatcher::setTrace(UBool state
) {
1979 fTraceDebug
= state
;
1984 //---------------------------------------------------------------------
1988 //---------------------------------------------------------------------
1989 int32_t RegexMatcher::split(const UnicodeString
&input
,
1990 UnicodeString dest
[],
1991 int32_t destCapacity
,
1994 UText inputText
= UTEXT_INITIALIZER
;
1995 utext_openConstUnicodeString(&inputText
, &input
, &status
);
1996 if (U_FAILURE(status
)) {
2000 UText
**destText
= (UText
**)uprv_malloc(sizeof(UText
*)*destCapacity
);
2001 if (destText
== NULL
) {
2002 status
= U_MEMORY_ALLOCATION_ERROR
;
2006 for (i
= 0; i
< destCapacity
; i
++) {
2007 destText
[i
] = utext_openUnicodeString(NULL
, &dest
[i
], &status
);
2010 int32_t fieldCount
= split(&inputText
, destText
, destCapacity
, status
);
2012 for (i
= 0; i
< destCapacity
; i
++) {
2013 utext_close(destText
[i
]);
2016 uprv_free(destText
);
2017 utext_close(&inputText
);
2022 // split, UText mode
2024 int32_t RegexMatcher::split(UText
*input
,
2026 int32_t destCapacity
,
2030 // Check arguments for validity
2032 if (U_FAILURE(status
)) {
2036 if (destCapacity
< 1) {
2037 status
= U_ILLEGAL_ARGUMENT_ERROR
;
2042 // Reset for the input text
2045 int64_t nextOutputStringStart
= 0;
2046 if (fActiveLimit
== 0) {
2051 // Loop through the input text, searching for the delimiter pattern
2054 int32_t numCaptureGroups
= fPattern
->fGroupMap
->size();
2056 if (i
>=destCapacity
-1) {
2057 // There is one or zero output string left.
2058 // Fill the last output string with whatever is left from the input, then exit the loop.
2059 // ( i will be == destCapacity if we filled the output array while processing
2060 // capture groups of the delimiter expression, in which case we will discard the
2061 // last capture group saved in favor of the unprocessed remainder of the
2064 if (fActiveLimit
> nextOutputStringStart
) {
2065 if (UTEXT_FULL_TEXT_IN_CHUNK(input
, fInputLength
)) {
2067 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]),
2068 input
->chunkContents
+nextOutputStringStart
,
2069 (int32_t)(fActiveLimit
-nextOutputStringStart
), &status
);
2071 UText remainingText
= UTEXT_INITIALIZER
;
2072 utext_openUChars(&remainingText
, input
->chunkContents
+nextOutputStringStart
,
2073 fActiveLimit
-nextOutputStringStart
, &status
);
2074 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2075 utext_close(&remainingText
);
2078 UErrorCode lengthStatus
= U_ZERO_ERROR
;
2079 int32_t remaining16Length
=
2080 utext_extract(input
, nextOutputStringStart
, fActiveLimit
, NULL
, 0, &lengthStatus
);
2081 UChar
*remainingChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(remaining16Length
+1));
2082 if (remainingChars
== NULL
) {
2083 status
= U_MEMORY_ALLOCATION_ERROR
;
2087 utext_extract(input
, nextOutputStringStart
, fActiveLimit
, remainingChars
, remaining16Length
+1, &status
);
2089 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), remainingChars
, remaining16Length
, &status
);
2091 UText remainingText
= UTEXT_INITIALIZER
;
2092 utext_openUChars(&remainingText
, remainingChars
, remaining16Length
, &status
);
2093 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2094 utext_close(&remainingText
);
2097 uprv_free(remainingChars
);
2103 // We found another delimiter. Move everything from where we started looking
2104 // up until the start of the delimiter into the next output string.
2105 if (UTEXT_FULL_TEXT_IN_CHUNK(input
, fInputLength
)) {
2107 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]),
2108 input
->chunkContents
+nextOutputStringStart
,
2109 (int32_t)(fMatchStart
-nextOutputStringStart
), &status
);
2111 UText remainingText
= UTEXT_INITIALIZER
;
2112 utext_openUChars(&remainingText
, input
->chunkContents
+nextOutputStringStart
,
2113 fMatchStart
-nextOutputStringStart
, &status
);
2114 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2115 utext_close(&remainingText
);
2118 UErrorCode lengthStatus
= U_ZERO_ERROR
;
2119 int32_t remaining16Length
= utext_extract(input
, nextOutputStringStart
, fMatchStart
, NULL
, 0, &lengthStatus
);
2120 UChar
*remainingChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(remaining16Length
+1));
2121 if (remainingChars
== NULL
) {
2122 status
= U_MEMORY_ALLOCATION_ERROR
;
2125 utext_extract(input
, nextOutputStringStart
, fMatchStart
, remainingChars
, remaining16Length
+1, &status
);
2127 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), remainingChars
, remaining16Length
, &status
);
2129 UText remainingText
= UTEXT_INITIALIZER
;
2130 utext_openUChars(&remainingText
, remainingChars
, remaining16Length
, &status
);
2131 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2132 utext_close(&remainingText
);
2135 uprv_free(remainingChars
);
2137 nextOutputStringStart
= fMatchEnd
;
2139 // If the delimiter pattern has capturing parentheses, the captured
2140 // text goes out into the next n destination strings.
2142 for (groupNum
=1; groupNum
<=numCaptureGroups
; groupNum
++) {
2143 if (i
>= destCapacity
-2) {
2144 // Never fill the last available output string with capture group text.
2145 // It will be filled with the last field, the remainder of the
2146 // unsplit input text.
2150 dest
[i
] = group(groupNum
, dest
[i
], status
);
2153 if (nextOutputStringStart
== fActiveLimit
) {
2154 // The delimiter was at the end of the string. We're done, but first
2155 // we output one last empty string, for the empty field following
2156 // the delimiter at the end of input.
2157 if (i
+1 < destCapacity
) {
2159 if (dest
[i
] == NULL
) {
2160 dest
[i
] = utext_openUChars(NULL
, NULL
, 0, &status
);
2162 static UChar emptyString
[] = {(UChar
)0};
2163 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), emptyString
, 0, &status
);
2172 // We ran off the end of the input while looking for the next delimiter.
2173 // All the remaining text goes into the current output string.
2174 if (UTEXT_FULL_TEXT_IN_CHUNK(input
, fInputLength
)) {
2176 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]),
2177 input
->chunkContents
+nextOutputStringStart
,
2178 (int32_t)(fActiveLimit
-nextOutputStringStart
), &status
);
2180 UText remainingText
= UTEXT_INITIALIZER
;
2181 utext_openUChars(&remainingText
, input
->chunkContents
+nextOutputStringStart
,
2182 fActiveLimit
-nextOutputStringStart
, &status
);
2183 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2184 utext_close(&remainingText
);
2187 UErrorCode lengthStatus
= U_ZERO_ERROR
;
2188 int32_t remaining16Length
= utext_extract(input
, nextOutputStringStart
, fActiveLimit
, NULL
, 0, &lengthStatus
);
2189 UChar
*remainingChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(remaining16Length
+1));
2190 if (remainingChars
== NULL
) {
2191 status
= U_MEMORY_ALLOCATION_ERROR
;
2195 utext_extract(input
, nextOutputStringStart
, fActiveLimit
, remainingChars
, remaining16Length
+1, &status
);
2197 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), remainingChars
, remaining16Length
, &status
);
2199 UText remainingText
= UTEXT_INITIALIZER
;
2200 utext_openUChars(&remainingText
, remainingChars
, remaining16Length
, &status
);
2201 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2202 utext_close(&remainingText
);
2205 uprv_free(remainingChars
);
2209 if (U_FAILURE(status
)) {
2212 } // end of for loop
2217 //--------------------------------------------------------------------------------
2221 //--------------------------------------------------------------------------------
2222 int32_t RegexMatcher::start(UErrorCode
&status
) const {
2223 return start(0, status
);
2226 int64_t RegexMatcher::start64(UErrorCode
&status
) const {
2227 return start64(0, status
);
2230 //--------------------------------------------------------------------------------
2232 // start(int32_t group, UErrorCode &status)
2234 //--------------------------------------------------------------------------------
2236 int64_t RegexMatcher::start64(int32_t group
, UErrorCode
&status
) const {
2237 if (U_FAILURE(status
)) {
2240 if (U_FAILURE(fDeferredStatus
)) {
2241 status
= fDeferredStatus
;
2244 if (fMatch
== FALSE
) {
2245 status
= U_REGEX_INVALID_STATE
;
2248 if (group
< 0 || group
> fPattern
->fGroupMap
->size()) {
2249 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
2256 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(group
-1);
2257 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
2258 U_ASSERT(groupOffset
>= 0);
2259 s
= fFrame
->fExtra
[groupOffset
];
2266 int32_t RegexMatcher::start(int32_t group
, UErrorCode
&status
) const {
2267 return (int32_t)start64(group
, status
);
2270 //--------------------------------------------------------------------------------
2272 // useAnchoringBounds
2274 //--------------------------------------------------------------------------------
2275 RegexMatcher
&RegexMatcher::useAnchoringBounds(UBool b
) {
2276 fAnchoringBounds
= b
;
2277 fAnchorStart
= (fAnchoringBounds
? fRegionStart
: 0);
2278 fAnchorLimit
= (fAnchoringBounds
? fRegionLimit
: fInputLength
);
2283 //--------------------------------------------------------------------------------
2285 // useTransparentBounds
2287 //--------------------------------------------------------------------------------
2288 RegexMatcher
&RegexMatcher::useTransparentBounds(UBool b
) {
2289 fTransparentBounds
= b
;
2290 fLookStart
= (fTransparentBounds
? 0 : fRegionStart
);
2291 fLookLimit
= (fTransparentBounds
? fInputLength
: fRegionLimit
);
2295 //--------------------------------------------------------------------------------
2299 //--------------------------------------------------------------------------------
2300 void RegexMatcher::setTimeLimit(int32_t limit
, UErrorCode
&status
) {
2301 if (U_FAILURE(status
)) {
2304 if (U_FAILURE(fDeferredStatus
)) {
2305 status
= fDeferredStatus
;
2309 status
= U_ILLEGAL_ARGUMENT_ERROR
;
2316 //--------------------------------------------------------------------------------
2320 //--------------------------------------------------------------------------------
2321 int32_t RegexMatcher::getTimeLimit() const {
2326 //--------------------------------------------------------------------------------
2330 //--------------------------------------------------------------------------------
2331 void RegexMatcher::setStackLimit(int32_t limit
, UErrorCode
&status
) {
2332 if (U_FAILURE(status
)) {
2335 if (U_FAILURE(fDeferredStatus
)) {
2336 status
= fDeferredStatus
;
2340 status
= U_ILLEGAL_ARGUMENT_ERROR
;
2344 // Reset the matcher. This is needed here in case there is a current match
2345 // whose final stack frame (containing the match results, pointed to by fFrame)
2346 // would be lost by resizing to a smaller stack size.
2350 // Unlimited stack expansion
2351 fStack
->setMaxCapacity(0);
2353 // Change the units of the limit from bytes to ints, and bump the size up
2354 // to be big enough to hold at least one stack frame for the pattern,
2355 // if it isn't there already.
2356 int32_t adjustedLimit
= limit
/ sizeof(int32_t);
2357 if (adjustedLimit
< fPattern
->fFrameSize
) {
2358 adjustedLimit
= fPattern
->fFrameSize
;
2360 fStack
->setMaxCapacity(adjustedLimit
);
2362 fStackLimit
= limit
;
2366 //--------------------------------------------------------------------------------
2370 //--------------------------------------------------------------------------------
2371 int32_t RegexMatcher::getStackLimit() const {
2376 //--------------------------------------------------------------------------------
2380 //--------------------------------------------------------------------------------
2381 void RegexMatcher::setMatchCallback(URegexMatchCallback
*callback
,
2382 const void *context
,
2383 UErrorCode
&status
) {
2384 if (U_FAILURE(status
)) {
2387 fCallbackFn
= callback
;
2388 fCallbackContext
= context
;
2392 //--------------------------------------------------------------------------------
2396 //--------------------------------------------------------------------------------
2397 void RegexMatcher::getMatchCallback(URegexMatchCallback
*&callback
,
2398 const void *&context
,
2399 UErrorCode
&status
) {
2400 if (U_FAILURE(status
)) {
2403 callback
= fCallbackFn
;
2404 context
= fCallbackContext
;
2408 //--------------------------------------------------------------------------------
2412 //--------------------------------------------------------------------------------
2413 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback
*callback
,
2414 const void *context
,
2415 UErrorCode
&status
) {
2416 if (U_FAILURE(status
)) {
2419 fFindProgressCallbackFn
= callback
;
2420 fFindProgressCallbackContext
= context
;
2424 //--------------------------------------------------------------------------------
2428 //--------------------------------------------------------------------------------
2429 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback
*&callback
,
2430 const void *&context
,
2431 UErrorCode
&status
) {
2432 if (U_FAILURE(status
)) {
2435 callback
= fFindProgressCallbackFn
;
2436 context
= fFindProgressCallbackContext
;
2440 //================================================================================
2442 // Code following this point in this file is the internal
2443 // Match Engine Implementation.
2445 //================================================================================
2448 //--------------------------------------------------------------------------------
2451 // Discard any previous contents of the state save stack, and initialize a
2452 // new stack frame to all -1. The -1s are needed for capture group limits,
2453 // where they indicate that a group has not yet matched anything.
2454 //--------------------------------------------------------------------------------
2455 REStackFrame
*RegexMatcher::resetStack() {
2456 // Discard any previous contents of the state save stack, and initialize a
2457 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2458 // where they indicate that a group has not yet matched anything.
2459 fStack
->removeAllElements();
2461 REStackFrame
*iFrame
= (REStackFrame
*)fStack
->reserveBlock(fPattern
->fFrameSize
, fDeferredStatus
);
2463 for (i
=0; i
<fPattern
->fFrameSize
-RESTACKFRAME_HDRCOUNT
; i
++) {
2464 iFrame
->fExtra
[i
] = -1;
2471 //--------------------------------------------------------------------------------
2474 // in perl, "xab..cd..", \b is true at positions 0,3,5,7
2476 // If the current char is a combining mark,
2478 // Else Scan backwards to the first non-combining char.
2479 // We are at a boundary if this char and the original char are
2480 // opposite in membership in the \w set
2482 // parameters: pos - the current position in the input buffer
2484 // TODO: double-check edge cases at region boundaries.
2486 //--------------------------------------------------------------------------------
2487 UBool
RegexMatcher::isWordBoundary(int64_t pos
) {
2488 UBool isBoundary
= FALSE
;
2489 UBool cIsWord
= FALSE
;
2491 if (pos
>= fLookLimit
) {
2494 // Determine whether char c at current position is a member of the word set of chars.
2495 // If we're off the end of the string, behave as though we're not at a word char.
2496 UTEXT_SETNATIVEINDEX(fInputText
, pos
);
2497 UChar32 c
= UTEXT_CURRENT32(fInputText
);
2498 if (u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
) || u_charType(c
) == U_FORMAT_CHAR
) {
2499 // Current char is a combining one. Not a boundary.
2502 cIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(c
);
2505 // Back up until we come to a non-combining char, determine whether
2506 // that char is a word char.
2507 UBool prevCIsWord
= FALSE
;
2509 if (UTEXT_GETNATIVEINDEX(fInputText
) <= fLookStart
) {
2512 UChar32 prevChar
= UTEXT_PREVIOUS32(fInputText
);
2513 if (!(u_hasBinaryProperty(prevChar
, UCHAR_GRAPHEME_EXTEND
)
2514 || u_charType(prevChar
) == U_FORMAT_CHAR
)) {
2515 prevCIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(prevChar
);
2519 isBoundary
= cIsWord
^ prevCIsWord
;
// isChunkWordBoundary: word-boundary test at position `pos`, following the same
// steps as isWordBoundary but reading UTF-16 code units directly from
// fInputText->chunkContents with the U16_GET / U16_PREV macros instead of the
// UText access macros. Note that `pos` is an int32_t here, whereas
// isWordBoundary takes an int64_t.
// NOTE(review): several statements of this function (declarations of c and
// prevChar, branch bodies, the loop header, the return) are not visible in
// this listing; the comments below describe only what is shown.
2523 UBool
RegexMatcher::isChunkWordBoundary(int32_t pos
) {
2524 UBool isBoundary
= FALSE
;
2525 UBool cIsWord
= FALSE
;
// Raw UTF-16 buffer of the input text's chunk; indexed directly by pos below.
2527 const UChar
*inputBuf
= fInputText
->chunkContents
;
// Positions at or beyond fLookLimit take this branch; otherwise the current
// character is examined below.
2529 if (pos
>= fLookLimit
) {
2532 // Determine whether char c at current position is a member of the word set of chars.
2533 // If we're off the end of the string, behave as though we're not at a word char.
// Read the code point at pos, with fLookStart / fLookLimit passed as the
// buffer bounds for surrogate-pair handling.
2535 U16_GET(inputBuf
, fLookStart
, pos
, fLookLimit
, c
);
// Grapheme-extend and format characters are treated as "combining" here.
2536 if (u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
) || u_charType(c
) == U_FORMAT_CHAR
) {
2537 // Current char is a combining one. Not a boundary.
2540 cIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(c
);
2543 // Back up until we come to a non-combining char, determine whether
2544 // that char is a word char.
2545 UBool prevCIsWord
= FALSE
;
// The backward scan is bounded by fLookStart.
2547 if (pos
<= fLookStart
) {
// presumably U16_PREV moves pos back over one code point and stores it in
// prevChar — confirm against the utf16.h macro documentation.
2551 U16_PREV(inputBuf
, fLookStart
, pos
, prevChar
);
2552 if (!(u_hasBinaryProperty(prevChar
, UCHAR_GRAPHEME_EXTEND
)
2553 || u_charType(prevChar
) == U_FORMAT_CHAR
)) {
2554 prevCIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(prevChar
);
// XOR: a boundary exists exactly when the two word-membership flags differ.
2558 isBoundary
= cIsWord
^ prevCIsWord
;
2562 //--------------------------------------------------------------------------------
2566 // Test for a word boundary using RBBI word break.
2568 // parameters: pos - the current position in the input buffer
2570 //--------------------------------------------------------------------------------
// isUWordBoundary: word-boundary test at `pos` that delegates to a word
// BreakIterator (fWordBreakItr) rather than to the \w character set.
// The iterator is created on first use and attached to fInputText; creation
// and setText errors are recorded in fDeferredStatus.
// NOTE(review): some statements of this function (the assignment target of the
// created iterator, branch bodies, the #endif, the return) are not visible in
// this listing; the comments below describe only what is shown.
2571 UBool
RegexMatcher::isUWordBoundary(int64_t pos
) {
2572 UBool returnVal
= FALSE
;
// The code below is compiled only when break iteration is configured in.
2573 #if UCONFIG_NO_BREAK_ITERATION==0
2575 // If we haven't yet created a break iterator for this matcher, do it now.
2576 if (fWordBreakItr
== NULL
) {
// The word iterator is always created for the English locale and cast to
// RuleBasedBreakIterator*.
2578 (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus
);
2579 if (U_FAILURE(fDeferredStatus
)) {
2582 fWordBreakItr
->setText(fInputText
, fDeferredStatus
);
// Positions at or beyond fLookLimit are handled separately from the
// break-iterator query below.
2585 if (pos
>= fLookLimit
) {
2587 returnVal
= TRUE
; // With Unicode word rules, only positions within the interior of "real"
2588 // words are not boundaries. All non-word chars stand by themselves,
2589 // with word boundaries on both sides.
// When the text's native indexes are not UTF-16 indexes, pos is converted first.
2591 if (!UTEXT_USES_U16(fInputText
)) {
2592 // !!!: Would like a better way to do this!
2593 UErrorCode status
= U_ZERO_ERROR
;
// presumably a preflight call (NULL destination, zero capacity) whose return
// value is the UTF-16 length of native range [0, pos), i.e. the UTF-16 index
// corresponding to pos — confirm against the utext_extract documentation.
2594 pos
= utext_extract(fInputText
, 0, pos
, NULL
, 0, &status
);
// The 64-bit position is narrowed to int32_t for the BreakIterator query.
2596 returnVal
= fWordBreakItr
->isBoundary((int32_t)pos
);
2602 //--------------------------------------------------------------------------------
2604 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state
2605 // saves. Increment the "time" counter, and call the
2606 // user callback function if there is one installed.
2608 // If the match operation needs to be aborted, either for a time-out
2609 // or because the user callback asked for it, just set an error status.
2610 // The engine will pick that up and stop in its outer loop.
2612 //--------------------------------------------------------------------------------
// IncrementTime: timer tick handler. Re-arms fTickCounter, gives the installed
// match callback (if any) a chance to abort, and enforces fTimeLimit.
//   status - set to U_REGEX_STOPPED_BY_CALLER when the callback returns FALSE,
//            or to U_REGEX_TIME_OUT when a positive fTimeLimit has been reached.
// NOTE(review): some statements of this function (for example wherever fTime
// is advanced, and the closing braces) are not visible in this listing.
2613 void RegexMatcher::IncrementTime(UErrorCode
&status
) {
// Restart the countdown to the next tick.
2614 fTickCounter
= TIMER_INITIAL_VALUE
;
2616 if (fCallbackFn
!= NULL
) {
// The callback receives its context and the current fTime; a FALSE return
// requests that matching stop.
2617 if ((*fCallbackFn
)(fCallbackContext
, fTime
) == FALSE
) {
2618 status
= U_REGEX_STOPPED_BY_CALLER
;
// A time limit of zero or less disables the time-out check.
2622 if (fTimeLimit
> 0 && fTime
>= fTimeLimit
) {
2623 status
= U_REGEX_TIME_OUT
;
2627 //--------------------------------------------------------------------------------
2629 // ReportFindProgress This function is called once for each advance in the target
2630 // string from the find() function, and calls the user progress callback
2631 // function if there is one installed.
2635 // If the match operation needs to be aborted because the user
2636 // callback asked for it, just set an error status.
2637 // The engine will pick that up and stop in its outer loop.
2639 //--------------------------------------------------------------------------------
// ReportFindProgress: invokes the installed find-progress callback, if any,
// passing it the callback context and the current match index.
//   matchIndex - position reported to the callback.
//   status     - written when the callback returns FALSE (see note below).
// NOTE(review): the return statements are not visible in this listing; the
// REGEXFINDPROGRESS_INTERRUPT macro at the top of the file compares this
// function's result against FALSE.
2640 UBool
RegexMatcher::ReportFindProgress(int64_t matchIndex
, UErrorCode
&status
) {
2641 if (fFindProgressCallbackFn
!= NULL
) {
2642 if ((*fFindProgressCallbackFn
)(fFindProgressCallbackContext
, matchIndex
) == FALSE
) {
// NOTE(review): status is assigned U_ZERO_ERROR here, with
// U_REGEX_STOPPED_BY_CALLER left commented out, so no error is recorded even
// though the callback asked to stop — confirm this is intentional.
2643 status
= U_ZERO_ERROR
/*U_REGEX_STOPPED_BY_CALLER*/;
2650 //--------------------------------------------------------------------------------
2653 // Make a new stack frame, initialized as a copy of the current stack frame.
2654 // Set the pattern index in the original stack frame from the operand value
2655 // in the opcode. Execution of the engine continues with the state in
2656 // the newly created stack frame
2658 // Note that reserveBlock() may grow the stack, resulting in the
2659 // whole thing being relocated in memory.
2662 // fp The top frame pointer when called. At return, a new
2663 // frame will be present
2664 // savePatIdx An index into the compiled pattern. Goes into the original
2665 // (not new) frame. If execution ever back-tracks out of the
2666 // new frame, this will be where we continue from in the pattern.
2668 // The new frame pointer.
2670 //--------------------------------------------------------------------------------
// StateSave: pushes a new backtrack frame that starts as a copy of the current
// top frame, records savePatIdx in the old frame, and returns the new frame.
//   fp         - current top frame on entry.
//   savePatIdx - pattern index stored into the old frame's fPatIdx.
//   status     - set to U_REGEX_STACK_OVERFLOW when reserveBlock() returns NULL;
//                may also be updated by IncrementTime().
//   returns    - pointer to the newly reserved frame.
// NOTE(review): some statements of this function (the failure-path return, the
// copy loop header, the tick-counter update, closing braces) are not visible
// in this listing; the comments below describe only what is shown.
2671 inline REStackFrame
*RegexMatcher::StateSave(REStackFrame
*fp
, int64_t savePatIdx
, UErrorCode
&status
) {
2672 // push storage for a new frame.
2673 int64_t *newFP
= fStack
->reserveBlock(fFrameSize
, status
);
2674 if (newFP
== NULL
) {
2675 // Failure on attempted stack expansion.
2676 // Stack function set some other error code, change it to a more
2677 // specific one for regular expressions.
2678 status
= U_REGEX_STACK_OVERFLOW
;
2679 // We need to return a writable stack frame, so just return the
2680 // previous frame. The match operation will stop quickly
2681 // because of the error status, after which the frame will never
2682 // be looked at again.
// Recompute the old frame's address from the new one: the previous top frame
// sits fFrameSize 64-bit slots below newFP.
2685 fp
= (REStackFrame
*)(newFP
- fFrameSize
); // in case of realloc of stack.
2687 // New stack frame = copy of old top frame.
2688 int64_t *source
= (int64_t *)fp
;
2689 int64_t *dest
= newFP
;
// Slot-by-slot copy; source reaching newFP marks the end of the old frame.
2691 *dest
++ = *source
++;
2692 if (source
== newFP
) {
// When the tick counter has run down, run the timer/callback check.
2698 if (fTickCounter
<= 0) {
2699 IncrementTime(status
); // Re-initializes fTickCounter
// The old frame remembers where to resume if the new frame is backtracked out of.
2701 fp
->fPatIdx
= savePatIdx
;
2702 return (REStackFrame
*)newFP
;
2706 //--------------------------------------------------------------------------------
2708 // MatchAt This is the actual matching engine.
2710 // startIdx: begin matching at this index.
2711 // toEnd: if true, match must extend to end of the input region
2713 //--------------------------------------------------------------------------------
2714 void RegexMatcher::MatchAt(int64_t startIdx
, UBool toEnd
, UErrorCode
&status
) {
2715 UBool isMatch
= FALSE
; // True if the we have a match.
2717 int64_t backSearchIndex
= U_INT64_MAX
; // used after greedy single-character matches for searching backwards
2719 int32_t op
; // Operation from the compiled pattern, split into
2720 int32_t opType
; // the opcode
2721 int32_t opValue
; // and the operand value.
2723 #ifdef REGEX_RUN_DEBUG
2726 printf("MatchAt(startIdx=%ld)\n", startIdx
);
2727 printf("Original Pattern: ");
2728 UChar32 c
= utext_next32From(fPattern
->fPattern
, 0);
2729 while (c
!= U_SENTINEL
) {
2730 if (c
<32 || c
>256) {
2735 c
= UTEXT_NEXT32(fPattern
->fPattern
);
2738 printf("Input String: ");
2739 c
= utext_next32From(fInputText
, 0);
2740 while (c
!= U_SENTINEL
) {
2741 if (c
<32 || c
>256) {
2746 c
= UTEXT_NEXT32(fInputText
);
2753 if (U_FAILURE(status
)) {
2757 // Cache frequently referenced items from the compiled pattern
2759 int64_t *pat
= fPattern
->fCompiledPat
->getBuffer();
2761 const UChar
*litText
= fPattern
->fLiteralText
.getBuffer();
2762 UVector
*sets
= fPattern
->fSets
;
2764 fFrameSize
= fPattern
->fFrameSize
;
2765 REStackFrame
*fp
= resetStack();
2768 fp
->fInputIdx
= startIdx
;
2770 // Zero out the pattern's static data
2772 for (i
= 0; i
<fPattern
->fDataSize
; i
++) {
2777 // Main loop for interpreting the compiled pattern.
2778 // One iteration of the loop per pattern operation performed.
2781 op
= (int32_t)pat
[fp
->fPatIdx
];
2782 opType
= URX_TYPE(op
);
2783 opValue
= URX_VAL(op
);
2784 #ifdef REGEX_RUN_DEBUG
2786 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2787 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp
->fInputIdx
,
2788 UTEXT_CURRENT32(fInputText
), (int64_t *)fp
-fStack
->getBuffer(), fActiveLimit
);
2789 fPattern
->dumpOp(fp
->fPatIdx
);
2802 // Force a backtrack. In some circumstances, the pattern compiler
2803 // will notice that the pattern can't possibly match anything, and will
2804 // emit one of these at that point.
2805 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2810 if (fp
->fInputIdx
< fActiveLimit
) {
2811 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2812 UChar32 c
= UTEXT_NEXT32(fInputText
);
2814 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
2820 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2826 // Test input against a literal string.
2827 // Strings require two slots in the compiled pattern, one for the
2828 // offset to the string text, and one for the length.
2830 int32_t stringStartIdx
= opValue
;
2831 op
= (int32_t)pat
[fp
->fPatIdx
]; // Fetch the second operand
2833 opType
= URX_TYPE(op
);
2834 int32_t stringLen
= URX_VAL(op
);
2835 U_ASSERT(opType
== URX_STRING_LEN
);
2836 U_ASSERT(stringLen
>= 2);
2838 const UChar
*patternString
= litText
+stringStartIdx
;
2839 int32_t patternStringIndex
= 0;
2840 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2842 UChar32 patternChar
;
2843 UBool success
= TRUE
;
2844 while (patternStringIndex
< stringLen
) {
2845 if (UTEXT_GETNATIVEINDEX(fInputText
) >= fActiveLimit
) {
2850 inputChar
= UTEXT_NEXT32(fInputText
);
2851 U16_NEXT(patternString
, patternStringIndex
, stringLen
, patternChar
);
2852 if (patternChar
!= inputChar
) {
2859 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
2861 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2867 case URX_STATE_SAVE
:
2868 fp
= StateSave(fp
, opValue
, status
);
2873 // The match loop will exit via this path on a successful match,
2874 // when we reach the end of the pattern.
2875 if (toEnd
&& fp
->fInputIdx
!= fActiveLimit
) {
2876 // The pattern matched, but not to the end of input. Try some more.
2877 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2883 // Start and End Capture stack frame variables are laid out like this:
2884 // fp->fExtra[opValue] - The start of a completed capture group
2885 // opValue+1 - The end of a completed capture group
2886 // opValue+2 - the start of a capture group whose end
2887 // has not yet been reached (and might not ever be).
2888 case URX_START_CAPTURE
:
2889 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
2890 fp
->fExtra
[opValue
+2] = fp
->fInputIdx
;
2894 case URX_END_CAPTURE
:
2895 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
2896 U_ASSERT(fp
->fExtra
[opValue
+2] >= 0); // Start pos for this group must be set.
2897 fp
->fExtra
[opValue
] = fp
->fExtra
[opValue
+2]; // Tentative start becomes real.
2898 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // End position
2899 U_ASSERT(fp
->fExtra
[opValue
] <= fp
->fExtra
[opValue
+1]);
2903 case URX_DOLLAR
: // $, test for End of line
2904 // or for position before new line at end of input
2906 if (fp
->fInputIdx
>= fAnchorLimit
) {
2907 // We really are at the end of input. Success.
2913 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2915 // If we are positioned just before a new-line that is located at the
2916 // end of input, succeed.
2917 UChar32 c
= UTEXT_NEXT32(fInputText
);
2918 if (UTEXT_GETNATIVEINDEX(fInputText
) >= fAnchorLimit
) {
2919 if ((c
>=0x0a && c
<=0x0d) || c
==0x85 || c
==0x2028 || c
==0x2029) {
2920 // If not in the middle of a CR/LF sequence
2921 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& ((void)UTEXT_PREVIOUS32(fInputText
), UTEXT_PREVIOUS32(fInputText
))==0x0d)) {
2922 // At new-line at end of input. Success
2930 UChar32 nextC
= UTEXT_NEXT32(fInputText
);
2931 if (c
== 0x0d && nextC
== 0x0a && UTEXT_GETNATIVEINDEX(fInputText
) >= fAnchorLimit
) {
2934 break; // At CR/LF at end of input. Success
2938 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2943 case URX_DOLLAR_D
: // $, test for End of Line, in UNIX_LINES mode.
2944 if (fp
->fInputIdx
>= fAnchorLimit
) {
2945 // Off the end of input. Success.
2950 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2951 UChar32 c
= UTEXT_NEXT32(fInputText
);
2952 // Either at the last character of input, or off the end.
2953 if (c
== 0x0a && UTEXT_GETNATIVEINDEX(fInputText
) == fAnchorLimit
) {
2960 // Not at end of input. Back-track out.
2961 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2965 case URX_DOLLAR_M
: // $, test for End of line in multi-line mode
2967 if (fp
->fInputIdx
>= fAnchorLimit
) {
2968 // We really are at the end of input. Success.
2973 // If we are positioned just before a new-line, succeed.
2974 // It makes no difference where the new-line is within the input.
2975 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2976 UChar32 c
= UTEXT_CURRENT32(fInputText
);
2977 if ((c
>=0x0a && c
<=0x0d) || c
==0x85 ||c
==0x2028 || c
==0x2029) {
2978 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
2979 // In multi-line mode, hitting a new-line just before the end of input does not
2980 // set the hitEnd or requireEnd flags
2981 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& UTEXT_PREVIOUS32(fInputText
)==0x0d)) {
2985 // not at a new line. Fail.
2986 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2991 case URX_DOLLAR_MD
: // $, test for End of line in multi-line and UNIX_LINES mode
2993 if (fp
->fInputIdx
>= fAnchorLimit
) {
2994 // We really are at the end of input. Success.
2996 fRequireEnd
= TRUE
; // Java set requireEnd in this case, even though
2997 break; // adding a new-line would not lose the match.
2999 // If we are not positioned just before a new-line, the test fails; backtrack out.
3000 // It makes no difference where the new-line is within the input.
3001 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3002 if (UTEXT_CURRENT32(fInputText
) != 0x0a) {
3003 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3009 case URX_CARET
: // ^, test for start of line
3010 if (fp
->fInputIdx
!= fAnchorStart
) {
3011 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3016 case URX_CARET_M
: // ^, test for start of line in mulit-line mode
3018 if (fp
->fInputIdx
== fAnchorStart
) {
3019 // We are at the start input. Success.
3022 // Check whether character just before the current pos is a new-line
3023 // unless we are at the end of input
3024 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3025 UChar32 c
= UTEXT_PREVIOUS32(fInputText
);
3026 if ((fp
->fInputIdx
< fAnchorLimit
) &&
3027 ((c
<=0x0d && c
>=0x0a) || c
==0x85 ||c
==0x2028 || c
==0x2029)) {
3028 // It's a new-line. ^ is true. Success.
3029 // TODO: what should be done with positions between a CR and LF?
3032 // Not at the start of a line. Fail.
3033 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3038 case URX_CARET_M_UNIX
: // ^, test for start of line in mulit-line + Unix-line mode
3040 U_ASSERT(fp
->fInputIdx
>= fAnchorStart
);
3041 if (fp
->fInputIdx
<= fAnchorStart
) {
3042 // We are at the start input. Success.
3045 // Check whether character just before the current pos is a new-line
3046 U_ASSERT(fp
->fInputIdx
<= fAnchorLimit
);
3047 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3048 UChar32 c
= UTEXT_PREVIOUS32(fInputText
);
3050 // Not at the start of a line. Back-track out.
3051 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3056 case URX_BACKSLASH_B
: // Test for word boundaries
3058 UBool success
= isWordBoundary(fp
->fInputIdx
);
3059 success
^= (UBool
)(opValue
!= 0); // flip sense for \B
3061 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3067 case URX_BACKSLASH_BU
: // Test for word boundaries, Unicode-style
3069 UBool success
= isUWordBoundary(fp
->fInputIdx
);
3070 success
^= (UBool
)(opValue
!= 0); // flip sense for \B
3072 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3078 case URX_BACKSLASH_D
: // Test for decimal digit
3080 if (fp
->fInputIdx
>= fActiveLimit
) {
3082 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3086 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3088 UChar32 c
= UTEXT_NEXT32(fInputText
);
3089 int8_t ctype
= u_charType(c
); // TODO: make a unicode set for this. Will be faster.
3090 UBool success
= (ctype
== U_DECIMAL_DIGIT_NUMBER
);
3091 success
^= (UBool
)(opValue
!= 0); // flip sense for \D
3093 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3095 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3101 case URX_BACKSLASH_G
: // Test for position at end of previous match
3102 if (!((fMatch
&& fp
->fInputIdx
==fMatchEnd
) || (fMatch
==FALSE
&& fp
->fInputIdx
==fActiveStart
))) {
3103 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3108 case URX_BACKSLASH_X
:
3109 // Match a Grapheme, as defined by Unicode TR 29.
3110 // Differs slightly from Perl, which consumes combining marks independently
3114 // Fail if at end of input
3115 if (fp
->fInputIdx
>= fActiveLimit
) {
3117 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3121 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3123 // Examine (and consume) the current char.
3124 // Dispatch into a little state machine, based on the char.
3126 c
= UTEXT_NEXT32(fInputText
);
3127 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3128 UnicodeSet
**sets
= fPattern
->fStaticSets
;
3129 if (sets
[URX_GC_NORMAL
]->contains(c
)) goto GC_Extend
;
3130 if (sets
[URX_GC_CONTROL
]->contains(c
)) goto GC_Control
;
3131 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
3132 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
3133 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
3134 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
3135 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
3141 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
3142 c
= UTEXT_NEXT32(fInputText
);
3143 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3144 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
3145 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
3146 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
3147 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
3148 (void)UTEXT_PREVIOUS32(fInputText
);
3149 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3153 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
3154 c
= UTEXT_NEXT32(fInputText
);
3155 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3156 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
3157 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
3158 (void)UTEXT_PREVIOUS32(fInputText
);
3159 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3163 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
3164 c
= UTEXT_NEXT32(fInputText
);
3165 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3166 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
3167 (void)UTEXT_PREVIOUS32(fInputText
);
3168 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3172 // Combining characters are consumed here
3174 if (fp
->fInputIdx
>= fActiveLimit
) {
3177 c
= UTEXT_CURRENT32(fInputText
);
3178 if (sets
[URX_GC_EXTEND
]->contains(c
) == FALSE
) {
3181 (void)UTEXT_NEXT32(fInputText
);
3182 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3187 // Most control chars stand alone (don't combine with combining chars),
3188 // except for that CR/LF sequence is a single grapheme cluster.
3189 if (c
== 0x0d && fp
->fInputIdx
< fActiveLimit
&& UTEXT_CURRENT32(fInputText
) == 0x0a) {
3190 c
= UTEXT_NEXT32(fInputText
);
3191 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3195 if (fp
->fInputIdx
>= fActiveLimit
) {
3204 case URX_BACKSLASH_Z
: // Test for end of Input
3205 if (fp
->fInputIdx
< fAnchorLimit
) {
3206 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3215 case URX_STATIC_SETREF
:
3217 // Test input character against one of the predefined sets
3218 // (Word Characters, for example)
3219 // The high bit of the op value is a flag for the match polarity.
3220 // 0: success if input char is in set.
3221 // 1: success if input char is not in set.
3222 if (fp
->fInputIdx
>= fActiveLimit
) {
3224 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3228 UBool success
= ((opValue
& URX_NEG_SET
) == URX_NEG_SET
);
3229 opValue
&= ~URX_NEG_SET
;
3230 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
3232 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3233 UChar32 c
= UTEXT_NEXT32(fInputText
);
3235 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
3236 if (s8
->contains(c
)) {
3240 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
3241 if (s
->contains(c
)) {
3246 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3248 // the character wasn't in the set.
3249 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3255 case URX_STAT_SETREF_N
:
3257 // Test input character for NOT being a member of one of
3258 // the predefined sets (Word Characters, for example)
3259 if (fp
->fInputIdx
>= fActiveLimit
) {
3261 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3265 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
3267 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3269 UChar32 c
= UTEXT_NEXT32(fInputText
);
3271 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
3272 if (s8
->contains(c
) == FALSE
) {
3273 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3277 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
3278 if (s
->contains(c
) == FALSE
) {
3279 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3283 // the character wasn't in the set.
3284 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3290 if (fp
->fInputIdx
>= fActiveLimit
) {
3292 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3295 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3297 // There is input left. Pick up one char and test it for set membership.
3298 UChar32 c
= UTEXT_NEXT32(fInputText
);
3299 U_ASSERT(opValue
> 0 && opValue
< sets
->size());
3301 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
3302 if (s8
->contains(c
)) {
3303 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3307 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
3308 if (s
->contains(c
)) {
3309 // The character is in the set. A Match.
3310 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3315 // the character wasn't in the set.
3316 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3323 // . matches anything, but stops at end-of-line.
3324 if (fp
->fInputIdx
>= fActiveLimit
) {
3325 // At end of input. Match failed. Backtrack out.
3327 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3331 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3333 // There is input left. Advance over one char, unless we've hit end-of-line
3334 UChar32 c
= UTEXT_NEXT32(fInputText
);
3335 if (((c
& 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
3336 ((c
<=0x0d && c
>=0x0a) || c
==0x85 ||c
==0x2028 || c
==0x2029)) {
3337 // End of line in normal mode. . does not match.
3338 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3341 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3346 case URX_DOTANY_ALL
:
3348 // ., in dot-matches-all (including new lines) mode
3349 if (fp
->fInputIdx
>= fActiveLimit
) {
3350 // At end of input. Match failed. Backtrack out.
3352 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3356 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3358 // There is input left. Advance over one char, except if we are
3359 // at a cr/lf, advance over both of them.
3361 c
= UTEXT_NEXT32(fInputText
);
3362 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3363 if (c
==0x0d && fp
->fInputIdx
< fActiveLimit
) {
3364 // In the case of a CR/LF, we need to advance over both.
3365 UChar32 nextc
= UTEXT_CURRENT32(fInputText
);
3366 if (nextc
== 0x0a) {
3367 (void)UTEXT_NEXT32(fInputText
);
3368 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3375 case URX_DOTANY_UNIX
:
3377 // '.' operator, matches all, but stops at end-of-line.
3378 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
3379 if (fp
->fInputIdx
>= fActiveLimit
) {
3380 // At end of input. Match failed. Backtrack out.
3382 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3386 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3388 // There is input left. Advance over one char, unless we've hit end-of-line
3389 UChar32 c
= UTEXT_NEXT32(fInputText
);
3391 // End of line in normal mode. '.' does not match the \n
3392 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3394 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3401 fp
->fPatIdx
= opValue
;
3409 U_ASSERT(opValue
< fPattern
->fCompiledPat
->size());
3410 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
3411 fp
->fPatIdx
= opValue
; // Then JMP.
3415 // This opcode is used with (x)+, when x can match a zero length string.
3416 // Same as JMP_SAV, except conditional on the match having made forward progress.
3417 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3418 // data address of the input position at the start of the loop.
3420 U_ASSERT(opValue
> 0 && opValue
< fPattern
->fCompiledPat
->size());
3421 int32_t stoOp
= (int32_t)pat
[opValue
-1];
3422 U_ASSERT(URX_TYPE(stoOp
) == URX_STO_INP_LOC
);
3423 int32_t frameLoc
= URX_VAL(stoOp
);
3424 U_ASSERT(frameLoc
>= 0 && frameLoc
< fFrameSize
);
3425 int64_t prevInputIdx
= fp
->fExtra
[frameLoc
];
3426 U_ASSERT(prevInputIdx
<= fp
->fInputIdx
);
3427 if (prevInputIdx
< fp
->fInputIdx
) {
3428 // The match did make progress. Repeat the loop.
3429 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
3430 fp
->fPatIdx
= opValue
;
3431 fp
->fExtra
[frameLoc
] = fp
->fInputIdx
;
3433 // If the input position did not advance, we do nothing here,
3434 // execution will fall out of the loop.
3440 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
3441 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
3443 // Pick up the three extra operands that CTR_INIT has, and
3444 // skip the pattern location counter past
3445 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
3447 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
3448 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
3449 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
3450 U_ASSERT(minCount
>=0);
3451 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
3452 U_ASSERT(loopLoc
>=fp
->fPatIdx
);
3454 if (minCount
== 0) {
3455 fp
= StateSave(fp
, loopLoc
+1, status
);
3457 if (maxCount
== -1) {
3458 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // For loop breaking.
3459 } else if (maxCount
== 0) {
3460 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3467 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
3468 int32_t initOp
= (int32_t)pat
[opValue
];
3469 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT
);
3470 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
3471 int32_t minCount
= (int32_t)pat
[opValue
+2];
3472 int32_t maxCount
= (int32_t)pat
[opValue
+3];
3474 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
&& maxCount
!= -1) {
3475 U_ASSERT(*pCounter
== maxCount
);
3478 if (*pCounter
>= minCount
) {
3479 if (maxCount
== -1) {
3480 // Loop has no hard upper bound.
3481 // Check that it is progressing through the input, break if it is not.
3482 int64_t *pLastInputIdx
= &fp
->fExtra
[URX_VAL(initOp
) + 1];
3483 if (fp
->fInputIdx
== *pLastInputIdx
) {
3486 *pLastInputIdx
= fp
->fInputIdx
;
3489 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
3491 fp
->fPatIdx
= opValue
+ 4; // Loop back.
3495 case URX_CTR_INIT_NG
:
3497 // Initialize a non-greedy loop
3498 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
3499 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
3501 // Pick up the three extra operands that CTR_INIT_NG has, and
3502 // skip the pattern location counter past
3503 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
3505 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
3506 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
3507 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
3508 U_ASSERT(minCount
>=0);
3509 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
3510 U_ASSERT(loopLoc
>fp
->fPatIdx
);
3511 if (maxCount
== -1) {
3512 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // Save initial input index for loop breaking.
3515 if (minCount
== 0) {
3516 if (maxCount
!= 0) {
3517 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
3519 fp
->fPatIdx
= loopLoc
+1; // Continue with stuff after repeated block
3524 case URX_CTR_LOOP_NG
:
3526 // Non-greedy {min, max} loops
3527 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
3528 int32_t initOp
= (int32_t)pat
[opValue
];
3529 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT_NG
);
3530 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
3531 int32_t minCount
= (int32_t)pat
[opValue
+2];
3532 int32_t maxCount
= (int32_t)pat
[opValue
+3];
3535 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
&& maxCount
!= -1) {
3536 // The loop has matched the maximum permitted number of times.
3537 // Break out of here with no action. Matching will
3538 // continue with the following pattern.
3539 U_ASSERT(*pCounter
== maxCount
);
3543 if (*pCounter
< minCount
) {
3544 // We haven't met the minimum number of matches yet.
3545 // Loop back for another one.
3546 fp
->fPatIdx
= opValue
+ 4; // Loop back.
3548 // We do have the minimum number of matches.
3550 // If there is no upper bound on the loop iterations, check that the input index
3551 // is progressing, and stop the loop if it is not.
3552 if (maxCount
== -1) {
3553 int64_t *pLastInputIdx
= &fp
->fExtra
[URX_VAL(initOp
) + 1];
3554 if (fp
->fInputIdx
== *pLastInputIdx
) {
3557 *pLastInputIdx
= fp
->fInputIdx
;
3560 // Loop Continuation: we will fall into the pattern following the loop
3561 // (non-greedy, don't execute loop body first), but first do
3562 // a state save to the top of the loop, so that a match failure
3563 // in the following pattern will try another iteration of the loop.
3564 fp
= StateSave(fp
, opValue
+ 4, status
);
3570 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
3571 fData
[opValue
] = fStack
->size();
3576 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
3577 int32_t newStackSize
= (int32_t)fData
[opValue
];
3578 U_ASSERT(newStackSize
<= fStack
->size());
3579 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
3580 if (newFP
== (int64_t *)fp
) {
3584 for (i
=0; i
<fFrameSize
; i
++) {
3585 newFP
[i
] = ((int64_t *)fp
)[i
];
3587 fp
= (REStackFrame
*)newFP
;
3588 fStack
->setSize(newStackSize
);
3594 U_ASSERT(opValue
< fFrameSize
);
3595 int64_t groupStartIdx
= fp
->fExtra
[opValue
];
3596 int64_t groupEndIdx
= fp
->fExtra
[opValue
+1];
3597 U_ASSERT(groupStartIdx
<= groupEndIdx
);
3598 if (groupStartIdx
< 0) {
3599 // This capture group has not participated in the match thus far,
3600 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
3603 UTEXT_SETNATIVEINDEX(fAltInputText
, groupStartIdx
);
3604 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3606 // Note: if the capture group match was of an empty string the backref
3607 // match succeeds. Verified by testing: Perl matches succeed
3608 // in this case, so we do too.
3610 UBool success
= TRUE
;
3612 if (utext_getNativeIndex(fAltInputText
) >= groupEndIdx
) {
3616 if (utext_getNativeIndex(fInputText
) >= fActiveLimit
) {
3621 UChar32 captureGroupChar
= utext_next32(fAltInputText
);
3622 UChar32 inputChar
= utext_next32(fInputText
);
3623 if (inputChar
!= captureGroupChar
) {
3630 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3632 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3641 U_ASSERT(opValue
< fFrameSize
);
3642 int64_t groupStartIdx
= fp
->fExtra
[opValue
];
3643 int64_t groupEndIdx
= fp
->fExtra
[opValue
+1];
3644 U_ASSERT(groupStartIdx
<= groupEndIdx
);
3645 if (groupStartIdx
< 0) {
3646 // This capture group has not participated in the match thus far,
3647 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
3650 utext_setNativeIndex(fAltInputText
, groupStartIdx
);
3651 utext_setNativeIndex(fInputText
, fp
->fInputIdx
);
3652 CaseFoldingUTextIterator
captureGroupItr(*fAltInputText
);
3653 CaseFoldingUTextIterator
inputItr(*fInputText
);
3655 // Note: if the capture group match was of an empty string the backref
3656 // match succeeds. Verified by testing: Perl matches succeed
3657 // in this case, so we do too.
3659 UBool success
= TRUE
;
3661 if (!captureGroupItr
.inExpansion() && utext_getNativeIndex(fAltInputText
) >= groupEndIdx
) {
3665 if (!inputItr
.inExpansion() && utext_getNativeIndex(fInputText
) >= fActiveLimit
) {
3670 UChar32 captureGroupChar
= captureGroupItr
.next();
3671 UChar32 inputChar
= inputItr
.next();
3672 if (inputChar
!= captureGroupChar
) {
3678 if (success
&& inputItr
.inExpansion()) {
3679 // We otained a match by consuming part of a string obtained from
3680 // case-folding a single code point of the input text.
3681 // This does not count as an overall match.
3686 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3688 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3694 case URX_STO_INP_LOC
:
3696 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
);
3697 fp
->fExtra
[opValue
] = fp
->fInputIdx
;
3703 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
3705 int32_t dataLoc
= URX_VAL(pat
[instrOperandLoc
]);
3706 U_ASSERT(dataLoc
>= 0 && dataLoc
< fFrameSize
);
3707 int64_t savedInputIdx
= fp
->fExtra
[dataLoc
];
3708 U_ASSERT(savedInputIdx
<= fp
->fInputIdx
);
3709 if (savedInputIdx
< fp
->fInputIdx
) {
3710 fp
->fPatIdx
= opValue
; // JMP
3712 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no progress in loop.
3719 // Entering a lookahead block.
3720 // Save Stack Ptr, Input Pos.
3721 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3722 fData
[opValue
] = fStack
->size();
3723 fData
[opValue
+1] = fp
->fInputIdx
;
3724 fActiveStart
= fLookStart
; // Set the match region change for
3725 fActiveLimit
= fLookLimit
; // transparent bounds.
3731 // Leaving a look-ahead block.
3732 // restore Stack Ptr, Input Pos to positions they had on entry to block.
3733 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3734 int32_t stackSize
= fStack
->size();
3735 int32_t newStackSize
=(int32_t)fData
[opValue
];
3736 U_ASSERT(stackSize
>= newStackSize
);
3737 if (stackSize
> newStackSize
) {
3738 // Copy the current top frame back to the new (cut back) top frame.
3739 // This makes the capture groups from within the look-ahead
3740 // expression available.
3741 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
3743 for (i
=0; i
<fFrameSize
; i
++) {
3744 newFP
[i
] = ((int64_t *)fp
)[i
];
3746 fp
= (REStackFrame
*)newFP
;
3747 fStack
->setSize(newStackSize
);
3749 fp
->fInputIdx
= fData
[opValue
+1];
3751 // Restore the active region bounds in the input string; they may have
3752 // been changed because of transparent bounds on a Region.
3753 fActiveStart
= fRegionStart
;
3754 fActiveLimit
= fRegionLimit
;
3759 // Case insensitive one char. The char from the pattern is already case folded.
3760 // Input text is not, but case folding the input can not reduce two or more code
3762 if (fp
->fInputIdx
< fActiveLimit
) {
3763 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3765 UChar32 c
= UTEXT_NEXT32(fInputText
);
3766 if (u_foldCase(c
, U_FOLD_CASE_DEFAULT
) == opValue
) {
3767 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3774 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3779 // Case-insensitive test input against a literal string.
3780 // Strings require two slots in the compiled pattern, one for the
3781 // offset to the string text, and one for the length.
3782 // The compiled string has already been case folded.
3784 const UChar
*patternString
= litText
+ opValue
;
3785 int32_t patternStringIdx
= 0;
3787 op
= (int32_t)pat
[fp
->fPatIdx
];
3789 opType
= URX_TYPE(op
);
3790 opValue
= URX_VAL(op
);
3791 U_ASSERT(opType
== URX_STRING_LEN
);
3792 int32_t patternStringLen
= opValue
; // Length of the string from the pattern.
3797 UBool success
= TRUE
;
3799 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3800 CaseFoldingUTextIterator
inputIterator(*fInputText
);
3801 while (patternStringIdx
< patternStringLen
) {
3802 if (!inputIterator
.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText
) >= fActiveLimit
) {
3807 U16_NEXT(patternString
, patternStringIdx
, patternStringLen
, cPattern
);
3808 cText
= inputIterator
.next();
3809 if (cText
!= cPattern
) {
3814 if (inputIterator
.inExpansion()) {
3819 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3821 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3829 // Entering a look-behind block.
3830 // Save Stack Ptr, Input Pos.
3831 // TODO: implement transparent bounds. Ticket #6067
3832 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3833 fData
[opValue
] = fStack
->size();
3834 fData
[opValue
+1] = fp
->fInputIdx
;
3835 // Init the variable containing the start index for attempted matches.
3836 fData
[opValue
+2] = -1;
3837 // Save input string length, then reset to pin any matches to end at
3838 // the current position.
3839 fData
[opValue
+3] = fActiveLimit
;
3840 fActiveLimit
= fp
->fInputIdx
;
3847 // Positive Look-Behind, at top of loop checking for matches of LB expression
3848 // at all possible input starting positions.
3850 // Fetch the min and max possible match lengths. They are the operands
3851 // of this op in the pattern.
3852 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
3853 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
3854 U_ASSERT(minML
<= maxML
);
3855 U_ASSERT(minML
>= 0);
3857 // Fetch (from data) the last input index where a match was attempted.
3858 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3859 int64_t *lbStartIdx
= &fData
[opValue
+2];
3860 if (*lbStartIdx
< 0) {
3861 // First time through loop.
3862 *lbStartIdx
= fp
->fInputIdx
- minML
;
3864 // 2nd through nth time through the loop.
3865 // Back up start position for match by one.
3866 if (*lbStartIdx
== 0) {
3869 UTEXT_SETNATIVEINDEX(fInputText
, *lbStartIdx
);
3870 (void)UTEXT_PREVIOUS32(fInputText
);
3871 *lbStartIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3875 if (*lbStartIdx
< 0 || *lbStartIdx
< fp
->fInputIdx
- maxML
) {
3876 // We have tried all potential match starting points without
3877 // getting a match. Backtrack out, and out of the
3878 // Look Behind altogether.
3879 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3880 int64_t restoreInputLen
= fData
[opValue
+3];
3881 U_ASSERT(restoreInputLen
>= fActiveLimit
);
3882 U_ASSERT(restoreInputLen
<= fInputLength
);
3883 fActiveLimit
= restoreInputLen
;
3887 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3888 // (successful match will fall off the end of the loop.)
3889 fp
= StateSave(fp
, fp
->fPatIdx
-3, status
);
3890 fp
->fInputIdx
= *lbStartIdx
;
3895 // End of a look-behind block, after a successful match.
3897 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3898 if (fp
->fInputIdx
!= fActiveLimit
) {
3899 // The look-behind expression matched, but the match did not
3900 // extend all the way to the point that we are looking behind from.
3901 // FAIL out of here, which will take us back to the LB_CONT, which
3902 // will retry the match starting at another position or fail
3903 // the look-behind altogether, whichever is appropriate.
3904 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3908 // Look-behind match is good. Restore the orignal input string length,
3909 // which had been truncated to pin the end of the lookbehind match to the
3910 // position being looked-behind.
3911 int64_t originalInputLen
= fData
[opValue
+3];
3912 U_ASSERT(originalInputLen
>= fActiveLimit
);
3913 U_ASSERT(originalInputLen
<= fInputLength
);
3914 fActiveLimit
= originalInputLen
;
3921 // Negative Look-Behind, at top of loop checking for matches of LB expression
3922 // at all possible input starting positions.
3924 // Fetch the extra parameters of this op.
3925 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
3926 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
3927 int32_t continueLoc
= (int32_t)pat
[fp
->fPatIdx
++];
3928 continueLoc
= URX_VAL(continueLoc
);
3929 U_ASSERT(minML
<= maxML
);
3930 U_ASSERT(minML
>= 0);
3931 U_ASSERT(continueLoc
> fp
->fPatIdx
);
3933 // Fetch (from data) the last input index where a match was attempted.
3934 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3935 int64_t *lbStartIdx
= &fData
[opValue
+2];
3936 if (*lbStartIdx
< 0) {
3937 // First time through loop.
3938 *lbStartIdx
= fp
->fInputIdx
- minML
;
3940 // 2nd through nth time through the loop.
3941 // Back up start position for match by one.
3942 if (*lbStartIdx
== 0) {
3945 UTEXT_SETNATIVEINDEX(fInputText
, *lbStartIdx
);
3946 (void)UTEXT_PREVIOUS32(fInputText
);
3947 *lbStartIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3951 if (*lbStartIdx
< 0 || *lbStartIdx
< fp
->fInputIdx
- maxML
) {
3952 // We have tried all potential match starting points without
3953 // getting a match, which means that the negative lookbehind as
3954 // a whole has succeeded. Jump forward to the continue location
3955 int64_t restoreInputLen
= fData
[opValue
+3];
3956 U_ASSERT(restoreInputLen
>= fActiveLimit
);
3957 U_ASSERT(restoreInputLen
<= fInputLength
);
3958 fActiveLimit
= restoreInputLen
;
3959 fp
->fPatIdx
= continueLoc
;
3963 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3964 // (successful match will cause a FAIL out of the loop altogether.)
3965 fp
= StateSave(fp
, fp
->fPatIdx
-4, status
);
3966 fp
->fInputIdx
= *lbStartIdx
;
3971 // End of a negative look-behind block, after a successful match.
3973 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3974 if (fp
->fInputIdx
!= fActiveLimit
) {
3975 // The look-behind expression matched, but the match did not
3976 // extend all the way to the point that we are looking behind from.
3977 // FAIL out of here, which will take us back to the LB_CONT, which
3978 // will retry the match starting at another position or succeed
3979 // the look-behind altogether, whichever is appropriate.
3980 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3984 // Look-behind expression matched, which means look-behind test as
3987 // Restore the orignal input string length, which had been truncated
3988 // inorder to pin the end of the lookbehind match
3989 // to the position being looked-behind.
3990 int64_t originalInputLen
= fData
[opValue
+3];
3991 U_ASSERT(originalInputLen
>= fActiveLimit
);
3992 U_ASSERT(originalInputLen
<= fInputLength
);
3993 fActiveLimit
= originalInputLen
;
3995 // Restore original stack position, discarding any state saved
3996 // by the successful pattern match.
3997 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3998 int32_t newStackSize
= (int32_t)fData
[opValue
];
3999 U_ASSERT(fStack
->size() > newStackSize
);
4000 fStack
->setSize(newStackSize
);
4002 // FAIL, which will take control back to someplace
4003 // prior to entering the look-behind test.
4004 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4010 // Loop Initialization for the optimized implementation of
4011 // [some character set]*
4012 // This op scans through all matching input.
4013 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4015 U_ASSERT(opValue
> 0 && opValue
< sets
->size());
4016 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
4017 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
4019 // Loop through input, until either the input is exhausted or
4020 // we reach a character that is not a member of the set.
4021 int64_t ix
= fp
->fInputIdx
;
4022 UTEXT_SETNATIVEINDEX(fInputText
, ix
);
4024 if (ix
>= fActiveLimit
) {
4028 UChar32 c
= UTEXT_NEXT32(fInputText
);
4030 if (s8
->contains(c
) == FALSE
) {
4034 if (s
->contains(c
) == FALSE
) {
4038 ix
= UTEXT_GETNATIVEINDEX(fInputText
);
4041 // If there were no matching characters, skip over the loop altogether.
4042 // The loop doesn't run at all, a * op always succeeds.
4043 if (ix
== fp
->fInputIdx
) {
4044 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
4048 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4049 // must follow. It's operand is the stack location
4050 // that holds the starting input index for the match of this [set]*
4051 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
4052 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
4053 int32_t stackLoc
= URX_VAL(loopcOp
);
4054 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
4055 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
4058 // Save State to the URX_LOOP_C op that follows this one,
4059 // so that match failures in the following code will return to there.
4060 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4061 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
4067 case URX_LOOP_DOT_I
:
4068 // Loop Initialization for the optimized implementation of .*
4069 // This op scans through all remaining input.
4070 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4072 // Loop through input until the input is exhausted (we reach an end-of-line)
4073 // In DOTALL mode, we can just go straight to the end of the input.
4075 if ((opValue
& 1) == 1) {
4076 // Dot-matches-All mode. Jump straight to the end of the string.
4080 // NOT DOT ALL mode. Line endings do not match '.'
4081 // Scan forward until a line ending or end of input.
4083 UTEXT_SETNATIVEINDEX(fInputText
, ix
);
4085 if (ix
>= fActiveLimit
) {
4089 UChar32 c
= UTEXT_NEXT32(fInputText
);
4090 if ((c
& 0x7f) <= 0x29) { // Fast filter of non-new-line-s
4091 if ((c
== 0x0a) || // 0x0a is newline in both modes.
4092 (((opValue
& 2) == 0) && // IF not UNIX_LINES mode
4093 (c
<=0x0d && c
>=0x0a)) || c
==0x85 ||c
==0x2028 || c
==0x2029) {
4094 // char is a line ending. Exit the scanning loop.
4098 ix
= UTEXT_GETNATIVEINDEX(fInputText
);
4102 // If there were no matching characters, skip over the loop altogether.
4103 // The loop doesn't run at all, a * op always succeeds.
4104 if (ix
== fp
->fInputIdx
) {
4105 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
4109 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4110 // must follow. It's operand is the stack location
4111 // that holds the starting input index for the match of this .*
4112 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
4113 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
4114 int32_t stackLoc
= URX_VAL(loopcOp
);
4115 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
4116 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
4119 // Save State to the URX_LOOP_C op that follows this one,
4120 // so that match failures in the following code will return to there.
4121 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4122 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
4130 U_ASSERT(opValue
>=0 && opValue
<fFrameSize
);
4131 backSearchIndex
= fp
->fExtra
[opValue
];
4132 U_ASSERT(backSearchIndex
<= fp
->fInputIdx
);
4133 if (backSearchIndex
== fp
->fInputIdx
) {
4134 // We've backed up the input idx to the point that the loop started.
4135 // The loop is done. Leave here without saving state.
4136 // Subsequent failures won't come back here.
4139 // Set up for the next iteration of the loop, with input index
4140 // backed up by one from the last time through,
4141 // and a state save to this instruction in case the following code fails again.
4142 // (We're going backwards because this loop emulates stack unwinding, not
4143 // the initial scan forward.)
4144 U_ASSERT(fp
->fInputIdx
> 0);
4145 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
4146 UChar32 prevC
= UTEXT_PREVIOUS32(fInputText
);
4147 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
4149 UChar32 twoPrevC
= UTEXT_PREVIOUS32(fInputText
);
4150 if (prevC
== 0x0a &&
4151 fp
->fInputIdx
> backSearchIndex
&&
4153 int32_t prevOp
= (int32_t)pat
[fp
->fPatIdx
-2];
4154 if (URX_TYPE(prevOp
) == URX_LOOP_DOT_I
) {
4155 // .*, stepping back over CRLF pair.
4156 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
4161 fp
= StateSave(fp
, fp
->fPatIdx
-1, status
);
4168 // Trouble. The compiled pattern contains an entry with an
4169 // unrecognized type tag.
4173 if (U_FAILURE(status
)) {
4182 fLastMatchEnd
= fMatchEnd
;
4183 fMatchStart
= startIdx
;
4184 fMatchEnd
= fp
->fInputIdx
;
4187 #ifdef REGEX_RUN_DEBUG
4190 printf("Match. start=%ld end=%ld\n\n", fMatchStart
, fMatchEnd
);
4192 printf("No match\n\n");
4197 fFrame
= fp
; // The active stack frame when the engine stopped.
4198 // Contains the capture group results that we need to
4204 //--------------------------------------------------------------------------------
4206 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
4207 // assumption that the entire string is available in the UText's
4208 // chunk buffer. For now, that means we can use int32_t indexes,
4209 // except for anything that needs to be saved (like group starts and ends).
4212 // startIdx: begin matching at this index.
4213 // toEnd: if true, match must extend to end of the input region
4215 //--------------------------------------------------------------------------------
4216 void RegexMatcher::MatchChunkAt(int32_t startIdx
, UBool toEnd
, UErrorCode
&status
) {
4217 UBool isMatch
= FALSE
; // True if the we have a match.
4219 int32_t backSearchIndex
= INT32_MAX
; // used after greedy single-character matches for searching backwards
4221 int32_t op
; // Operation from the compiled pattern, split into
4222 int32_t opType
; // the opcode
4223 int32_t opValue
; // and the operand value.
4225 #ifdef REGEX_RUN_DEBUG
4227 printf("MatchAt(startIdx=%d)\n", startIdx
);
4228 printf("Original Pattern: ");
4229 UChar32 c
= utext_next32From(fPattern
->fPattern
, 0);
4230 while (c
!= U_SENTINEL
) {
4231 if (c
<32 || c
>256) {
4236 c
= UTEXT_NEXT32(fPattern
->fPattern
);
4239 printf("Input String: ");
4240 c
= utext_next32From(fInputText
, 0);
4241 while (c
!= U_SENTINEL
) {
4242 if (c
<32 || c
>256) {
4247 c
= UTEXT_NEXT32(fInputText
);
4254 if (U_FAILURE(status
)) {
4258 // Cache frequently referenced items from the compiled pattern
4260 int64_t *pat
= fPattern
->fCompiledPat
->getBuffer();
4262 const UChar
*litText
= fPattern
->fLiteralText
.getBuffer();
4263 UVector
*sets
= fPattern
->fSets
;
4265 const UChar
*inputBuf
= fInputText
->chunkContents
;
4267 fFrameSize
= fPattern
->fFrameSize
;
4268 REStackFrame
*fp
= resetStack();
4271 fp
->fInputIdx
= startIdx
;
4273 // Zero out the pattern's static data
4275 for (i
= 0; i
<fPattern
->fDataSize
; i
++) {
4280 // Main loop for interpreting the compiled pattern.
4281 // One iteration of the loop per pattern operation performed.
4284 op
= (int32_t)pat
[fp
->fPatIdx
];
4285 opType
= URX_TYPE(op
);
4286 opValue
= URX_VAL(op
);
4287 #ifdef REGEX_RUN_DEBUG
4289 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
4290 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp
->fInputIdx
,
4291 UTEXT_CURRENT32(fInputText
), (int64_t *)fp
-fStack
->getBuffer(), fActiveLimit
);
4292 fPattern
->dumpOp(fp
->fPatIdx
);
4305 // Force a backtrack. In some circumstances, the pattern compiler
4306 // will notice that the pattern can't possibly match anything, and will
4307 // emit one of these at that point.
4308 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4313 if (fp
->fInputIdx
< fActiveLimit
) {
4315 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4322 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4328 // Test input against a literal string.
4329 // Strings require two slots in the compiled pattern, one for the
4330 // offset to the string text, and one for the length.
4331 int32_t stringStartIdx
= opValue
;
4334 op
= (int32_t)pat
[fp
->fPatIdx
]; // Fetch the second operand
4336 opType
= URX_TYPE(op
);
4337 stringLen
= URX_VAL(op
);
4338 U_ASSERT(opType
== URX_STRING_LEN
);
4339 U_ASSERT(stringLen
>= 2);
4341 const UChar
* pInp
= inputBuf
+ fp
->fInputIdx
;
4342 const UChar
* pInpLimit
= inputBuf
+ fActiveLimit
;
4343 const UChar
* pPat
= litText
+stringStartIdx
;
4344 const UChar
* pEnd
= pInp
+ stringLen
;
4345 UBool success
= TRUE
;
4346 while (pInp
< pEnd
) {
4347 if (pInp
>= pInpLimit
) {
4352 if (*pInp
++ != *pPat
++) {
4359 fp
->fInputIdx
+= stringLen
;
4361 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4367 case URX_STATE_SAVE
:
4368 fp
= StateSave(fp
, opValue
, status
);
4373 // The match loop will exit via this path on a successful match,
4374 // when we reach the end of the pattern.
4375 if (toEnd
&& fp
->fInputIdx
!= fActiveLimit
) {
4376 // The pattern matched, but not to the end of input. Try some more.
4377 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4383 // Start and End Capture stack frame variables are laid out out like this:
4384 // fp->fExtra[opValue] - The start of a completed capture group
4385 // opValue+1 - The end of a completed capture group
4386 // opValue+2 - the start of a capture group whose end
4387 // has not yet been reached (and might not ever be).
4388 case URX_START_CAPTURE
:
4389 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
4390 fp
->fExtra
[opValue
+2] = fp
->fInputIdx
;
4394 case URX_END_CAPTURE
:
4395 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
4396 U_ASSERT(fp
->fExtra
[opValue
+2] >= 0); // Start pos for this group must be set.
4397 fp
->fExtra
[opValue
] = fp
->fExtra
[opValue
+2]; // Tentative start becomes real.
4398 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // End position
4399 U_ASSERT(fp
->fExtra
[opValue
] <= fp
->fExtra
[opValue
+1]);
4403 case URX_DOLLAR
: // $, test for End of line
4404 // or for position before new line at end of input
4405 if (fp
->fInputIdx
< fAnchorLimit
-2) {
4406 // We are no where near the end of input. Fail.
4407 // This is the common case. Keep it first.
4408 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4411 if (fp
->fInputIdx
>= fAnchorLimit
) {
4412 // We really are at the end of input. Success.
4418 // If we are positioned just before a new-line that is located at the
4419 // end of input, succeed.
4420 if (fp
->fInputIdx
== fAnchorLimit
-1) {
4422 U16_GET(inputBuf
, fAnchorStart
, fp
->fInputIdx
, fAnchorLimit
, c
);
4424 if ((c
>=0x0a && c
<=0x0d) || c
==0x85 || c
==0x2028 || c
==0x2029) {
4425 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& inputBuf
[fp
->fInputIdx
-1]==0x0d)) {
4426 // At new-line at end of input. Success
4432 } else if (fp
->fInputIdx
== fAnchorLimit
-2 &&
4433 inputBuf
[fp
->fInputIdx
]==0x0d && inputBuf
[fp
->fInputIdx
+1]==0x0a) {
4436 break; // At CR/LF at end of input. Success
4439 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4444 case URX_DOLLAR_D
: // $, test for End of Line, in UNIX_LINES mode.
4445 if (fp
->fInputIdx
>= fAnchorLimit
-1) {
4446 // Either at the last character of input, or off the end.
4447 if (fp
->fInputIdx
== fAnchorLimit
-1) {
4448 // At last char of input. Success if it's a new line.
4449 if (inputBuf
[fp
->fInputIdx
] == 0x0a) {
4455 // Off the end of input. Success.
4462 // Not at end of input. Back-track out.
4463 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4467 case URX_DOLLAR_M
: // $, test for End of line in multi-line mode
4469 if (fp
->fInputIdx
>= fAnchorLimit
) {
4470 // We really are at the end of input. Success.
4475 // If we are positioned just before a new-line, succeed.
4476 // It makes no difference where the new-line is within the input.
4477 UChar32 c
= inputBuf
[fp
->fInputIdx
];
4478 if ((c
>=0x0a && c
<=0x0d) || c
==0x85 ||c
==0x2028 || c
==0x2029) {
4479 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4480 // In multi-line mode, hitting a new-line just before the end of input does not
4481 // set the hitEnd or requireEnd flags
4482 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& inputBuf
[fp
->fInputIdx
-1]==0x0d)) {
4486 // not at a new line. Fail.
4487 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4492 case URX_DOLLAR_MD
: // $, test for End of line in multi-line and UNIX_LINES mode
4494 if (fp
->fInputIdx
>= fAnchorLimit
) {
4495 // We really are at the end of input. Success.
4497 fRequireEnd
= TRUE
; // Java set requireEnd in this case, even though
4498 break; // adding a new-line would not lose the match.
4500 // If we are not positioned just before a new-line, the test fails; backtrack out.
4501 // It makes no difference where the new-line is within the input.
4502 if (inputBuf
[fp
->fInputIdx
] != 0x0a) {
4503 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4509 case URX_CARET
: // ^, test for start of line
4510 if (fp
->fInputIdx
!= fAnchorStart
) {
4511 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4516 case URX_CARET_M
: // ^, test for start of line in mulit-line mode
4518 if (fp
->fInputIdx
== fAnchorStart
) {
4519 // We are at the start input. Success.
4522 // Check whether character just before the current pos is a new-line
4523 // unless we are at the end of input
4524 UChar c
= inputBuf
[fp
->fInputIdx
- 1];
4525 if ((fp
->fInputIdx
< fAnchorLimit
) &&
4526 ((c
<=0x0d && c
>=0x0a) || c
==0x85 ||c
==0x2028 || c
==0x2029)) {
4527 // It's a new-line. ^ is true. Success.
4528 // TODO: what should be done with positions between a CR and LF?
4531 // Not at the start of a line. Fail.
4532 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4537 case URX_CARET_M_UNIX
: // ^, test for start of line in mulit-line + Unix-line mode
4539 U_ASSERT(fp
->fInputIdx
>= fAnchorStart
);
4540 if (fp
->fInputIdx
<= fAnchorStart
) {
4541 // We are at the start input. Success.
4544 // Check whether character just before the current pos is a new-line
4545 U_ASSERT(fp
->fInputIdx
<= fAnchorLimit
);
4546 UChar c
= inputBuf
[fp
->fInputIdx
- 1];
4548 // Not at the start of a line. Back-track out.
4549 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4554 case URX_BACKSLASH_B
: // Test for word boundaries
4556 UBool success
= isChunkWordBoundary((int32_t)fp
->fInputIdx
);
4557 success
^= (UBool
)(opValue
!= 0); // flip sense for \B
4559 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4565 case URX_BACKSLASH_BU
: // Test for word boundaries, Unicode-style
4567 UBool success
= isUWordBoundary(fp
->fInputIdx
);
4568 success
^= (UBool
)(opValue
!= 0); // flip sense for \B
4570 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4576 case URX_BACKSLASH_D
: // Test for decimal digit
4578 if (fp
->fInputIdx
>= fActiveLimit
) {
4580 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4585 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4586 int8_t ctype
= u_charType(c
); // TODO: make a unicode set for this. Will be faster.
4587 UBool success
= (ctype
== U_DECIMAL_DIGIT_NUMBER
);
4588 success
^= (UBool
)(opValue
!= 0); // flip sense for \D
4590 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4596 case URX_BACKSLASH_G
: // Test for position at end of previous match
4597 if (!((fMatch
&& fp
->fInputIdx
==fMatchEnd
) || (fMatch
==FALSE
&& fp
->fInputIdx
==fActiveStart
))) {
4598 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4603 case URX_BACKSLASH_X
:
4604 // Match a Grapheme, as defined by Unicode TR 29.
4605 // Differs slightly from Perl, which consumes combining marks independently
4609 // Fail if at end of input
4610 if (fp
->fInputIdx
>= fActiveLimit
) {
4612 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4616 // Examine (and consume) the current char.
4617 // Dispatch into a little state machine, based on the char.
4619 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4620 UnicodeSet
**sets
= fPattern
->fStaticSets
;
4621 if (sets
[URX_GC_NORMAL
]->contains(c
)) goto GC_Extend
;
4622 if (sets
[URX_GC_CONTROL
]->contains(c
)) goto GC_Control
;
4623 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
4624 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
4625 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
4626 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
4627 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
4633 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
4634 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4635 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
4636 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
4637 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
4638 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
4639 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
4643 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
4644 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4645 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
4646 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
4647 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
4651 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
4652 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4653 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
4654 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
4658 // Combining characters are consumed here
4660 if (fp
->fInputIdx
>= fActiveLimit
) {
4663 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4664 if (sets
[URX_GC_EXTEND
]->contains(c
) == FALSE
) {
4665 U16_BACK_1(inputBuf
, 0, fp
->fInputIdx
);
4672 // Most control chars stand alone (don't combine with combining chars),
4673 // except for that CR/LF sequence is a single grapheme cluster.
4674 if (c
== 0x0d && fp
->fInputIdx
< fActiveLimit
&& inputBuf
[fp
->fInputIdx
] == 0x0a) {
4679 if (fp
->fInputIdx
>= fActiveLimit
) {
4688 case URX_BACKSLASH_Z
: // Test for end of Input
4689 if (fp
->fInputIdx
< fAnchorLimit
) {
4690 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4699 case URX_STATIC_SETREF
:
4701 // Test input character against one of the predefined sets
4702 // (Word Characters, for example)
4703 // The high bit of the op value is a flag for the match polarity.
4704 // 0: success if input char is in set.
4705 // 1: success if input char is not in set.
4706 if (fp
->fInputIdx
>= fActiveLimit
) {
4708 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4712 UBool success
= ((opValue
& URX_NEG_SET
) == URX_NEG_SET
);
4713 opValue
&= ~URX_NEG_SET
;
4714 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
4717 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4719 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
4720 if (s8
->contains(c
)) {
4724 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
4725 if (s
->contains(c
)) {
4730 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4736 case URX_STAT_SETREF_N
:
4738 // Test input character for NOT being a member of one of
4739 // the predefined sets (Word Characters, for example)
4740 if (fp
->fInputIdx
>= fActiveLimit
) {
4742 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4746 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
4749 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4751 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
4752 if (s8
->contains(c
) == FALSE
) {
4756 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
4757 if (s
->contains(c
) == FALSE
) {
4761 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4768 if (fp
->fInputIdx
>= fActiveLimit
) {
4770 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4774 U_ASSERT(opValue
> 0 && opValue
< sets
->size());
4776 // There is input left. Pick up one char and test it for set membership.
4778 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4780 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
4781 if (s8
->contains(c
)) {
4782 // The character is in the set. A Match.
4786 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
4787 if (s
->contains(c
)) {
4788 // The character is in the set. A Match.
4793 // the character wasn't in the set.
4794 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4801 // . matches anything, but stops at end-of-line.
4802 if (fp
->fInputIdx
>= fActiveLimit
) {
4803 // At end of input. Match failed. Backtrack out.
4805 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4809 // There is input left. Advance over one char, unless we've hit end-of-line
4811 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4812 if (((c
& 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
4813 ((c
<=0x0d && c
>=0x0a) || c
==0x85 ||c
==0x2028 || c
==0x2029)) {
4814 // End of line in normal mode. . does not match.
4815 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4822 case URX_DOTANY_ALL
:
4824 // . in dot-matches-all (including new lines) mode
4825 if (fp
->fInputIdx
>= fActiveLimit
) {
4826 // At end of input. Match failed. Backtrack out.
4828 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4832 // There is input left. Advance over one char, except if we are
4833 // at a cr/lf, advance over both of them.
4835 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4836 if (c
==0x0d && fp
->fInputIdx
< fActiveLimit
) {
4837 // In the case of a CR/LF, we need to advance over both.
4838 if (inputBuf
[fp
->fInputIdx
] == 0x0a) {
4839 U16_FWD_1(inputBuf
, fp
->fInputIdx
, fActiveLimit
);
4846 case URX_DOTANY_UNIX
:
4848 // '.' operator, matches all, but stops at end-of-line.
4849 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
4850 if (fp
->fInputIdx
>= fActiveLimit
) {
4851 // At end of input. Match failed. Backtrack out.
4853 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4857 // There is input left. Advance over one char, unless we've hit end-of-line
4859 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4861 // End of line in UNIX_LINES mode. '.' does not match the \n
4862 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4869 fp
->fPatIdx
= opValue
;
4877 U_ASSERT(opValue
< fPattern
->fCompiledPat
->size());
4878 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
4879 fp
->fPatIdx
= opValue
; // Then JMP.
4883 // This opcode is used with (x)+, when x can match a zero length string.
4884 // Same as JMP_SAV, except conditional on the match having made forward progress.
4885 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
4886 // data address of the input position at the start of the loop.
4888 U_ASSERT(opValue
> 0 && opValue
< fPattern
->fCompiledPat
->size());
4889 int32_t stoOp
= (int32_t)pat
[opValue
-1];
4890 U_ASSERT(URX_TYPE(stoOp
) == URX_STO_INP_LOC
);
4891 int32_t frameLoc
= URX_VAL(stoOp
);
4892 U_ASSERT(frameLoc
>= 0 && frameLoc
< fFrameSize
);
4893 int32_t prevInputIdx
= (int32_t)fp
->fExtra
[frameLoc
];
4894 U_ASSERT(prevInputIdx
<= fp
->fInputIdx
);
4895 if (prevInputIdx
< fp
->fInputIdx
) {
4896 // The match did make progress. Repeat the loop.
4897 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
4898 fp
->fPatIdx
= opValue
;
4899 fp
->fExtra
[frameLoc
] = fp
->fInputIdx
;
4901 // If the input position did not advance, we do nothing here,
4902 // execution will fall out of the loop.
4908 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
4909 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
4911 // Pick up the three extra operands that CTR_INIT has, and
4912 // skip the pattern location counter past
4913 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
4915 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
4916 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
4917 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
4918 U_ASSERT(minCount
>=0);
4919 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
4920 U_ASSERT(loopLoc
>=fp
->fPatIdx
);
4922 if (minCount
== 0) {
4923 fp
= StateSave(fp
, loopLoc
+1, status
);
4925 if (maxCount
== -1) {
4926 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // For loop breaking.
4927 } else if (maxCount
== 0) {
4928 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4935 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
4936 int32_t initOp
= (int32_t)pat
[opValue
];
4937 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT
);
4938 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
4939 int32_t minCount
= (int32_t)pat
[opValue
+2];
4940 int32_t maxCount
= (int32_t)pat
[opValue
+3];
4942 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
&& maxCount
!= -1) {
4943 U_ASSERT(*pCounter
== maxCount
);
4946 if (*pCounter
>= minCount
) {
4947 if (maxCount
== -1) {
4948 // Loop has no hard upper bound.
4949 // Check that it is progressing through the input, break if it is not.
4950 int64_t *pLastInputIdx
= &fp
->fExtra
[URX_VAL(initOp
) + 1];
4951 if (fp
->fInputIdx
== *pLastInputIdx
) {
4954 *pLastInputIdx
= fp
->fInputIdx
;
4957 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
4959 fp
->fPatIdx
= opValue
+ 4; // Loop back.
4963 case URX_CTR_INIT_NG
:
4965 // Initialize a non-greedy loop
4966 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
4967 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
4969 // Pick up the three extra operands that CTR_INIT_NG has, and
4970 // skip the pattern location counter past
4971 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
4973 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
4974 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
4975 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
4976 U_ASSERT(minCount
>=0);
4977 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
4978 U_ASSERT(loopLoc
>fp
->fPatIdx
);
4979 if (maxCount
== -1) {
4980 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // Save initial input index for loop breaking.
4983 if (minCount
== 0) {
4984 if (maxCount
!= 0) {
4985 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
4987 fp
->fPatIdx
= loopLoc
+1; // Continue with stuff after repeated block
4992 case URX_CTR_LOOP_NG
:
4994 // Non-greedy {min, max} loops
4995 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
4996 int32_t initOp
= (int32_t)pat
[opValue
];
4997 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT_NG
);
4998 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
4999 int32_t minCount
= (int32_t)pat
[opValue
+2];
5000 int32_t maxCount
= (int32_t)pat
[opValue
+3];
5003 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
&& maxCount
!= -1) {
5004 // The loop has matched the maximum permitted number of times.
5005 // Break out of here with no action. Matching will
5006 // continue with the following pattern.
5007 U_ASSERT(*pCounter
== maxCount
);
5011 if (*pCounter
< minCount
) {
5012 // We haven't met the minimum number of matches yet.
5013 // Loop back for another one.
5014 fp
->fPatIdx
= opValue
+ 4; // Loop back.
5016 // We do have the minimum number of matches.
5018 // If there is no upper bound on the loop iterations, check that the input index
5019 // is progressing, and stop the loop if it is not.
5020 if (maxCount
== -1) {
5021 int64_t *pLastInputIdx
= &fp
->fExtra
[URX_VAL(initOp
) + 1];
5022 if (fp
->fInputIdx
== *pLastInputIdx
) {
5025 *pLastInputIdx
= fp
->fInputIdx
;
5028 // Loop Continuation: we will fall into the pattern following the loop
5029 // (non-greedy, don't execute loop body first), but first do
5030 // a state save to the top of the loop, so that a match failure
5031 // in the following pattern will try another iteration of the loop.
5032 fp
= StateSave(fp
, opValue
+ 4, status
);
5038 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
5039 fData
[opValue
] = fStack
->size();
5044 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
5045 int32_t newStackSize
= (int32_t)fData
[opValue
];
5046 U_ASSERT(newStackSize
<= fStack
->size());
5047 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
5048 if (newFP
== (int64_t *)fp
) {
5052 for (i
=0; i
<fFrameSize
; i
++) {
5053 newFP
[i
] = ((int64_t *)fp
)[i
];
5055 fp
= (REStackFrame
*)newFP
;
5056 fStack
->setSize(newStackSize
);
5062 U_ASSERT(opValue
< fFrameSize
);
5063 int64_t groupStartIdx
= fp
->fExtra
[opValue
];
5064 int64_t groupEndIdx
= fp
->fExtra
[opValue
+1];
5065 U_ASSERT(groupStartIdx
<= groupEndIdx
);
5066 int64_t inputIndex
= fp
->fInputIdx
;
5067 if (groupStartIdx
< 0) {
5068 // This capture group has not participated in the match thus far,
5069 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
5072 UBool success
= TRUE
;
5073 for (int64_t groupIndex
= groupStartIdx
; groupIndex
< groupEndIdx
; ++groupIndex
,++inputIndex
) {
5074 if (inputIndex
>= fActiveLimit
) {
5079 if (inputBuf
[groupIndex
] != inputBuf
[inputIndex
]) {
5085 fp
->fInputIdx
= inputIndex
;
5087 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5094 U_ASSERT(opValue
< fFrameSize
);
5095 int64_t groupStartIdx
= fp
->fExtra
[opValue
];
5096 int64_t groupEndIdx
= fp
->fExtra
[opValue
+1];
5097 U_ASSERT(groupStartIdx
<= groupEndIdx
);
5098 if (groupStartIdx
< 0) {
5099 // This capture group has not participated in the match thus far,
5100 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
5103 CaseFoldingUCharIterator
captureGroupItr(inputBuf
, groupStartIdx
, groupEndIdx
);
5104 CaseFoldingUCharIterator
inputItr(inputBuf
, fp
->fInputIdx
, fActiveLimit
);
5106 // Note: if the capture group match was of an empty string the backref
5107 // match succeeds. Verified by testing: Perl matches succeed
5108 // in this case, so we do too.
5110 UBool success
= TRUE
;
5112 UChar32 captureGroupChar
= captureGroupItr
.next();
5113 if (captureGroupChar
== U_SENTINEL
) {
5117 UChar32 inputChar
= inputItr
.next();
5118 if (inputChar
== U_SENTINEL
) {
5123 if (inputChar
!= captureGroupChar
) {
5129 if (success
&& inputItr
.inExpansion()) {
5130 // We obtained a match by consuming part of a string obtained from
5131 // case-folding a single code point of the input text.
5132 // This does not count as an overall match.
5137 fp
->fInputIdx
= inputItr
.getIndex();
5139 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5144 case URX_STO_INP_LOC
:
5146 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
);
5147 fp
->fExtra
[opValue
] = fp
->fInputIdx
;
5153 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
5155 int32_t dataLoc
= URX_VAL(pat
[instrOperandLoc
]);
5156 U_ASSERT(dataLoc
>= 0 && dataLoc
< fFrameSize
);
5157 int32_t savedInputIdx
= (int32_t)fp
->fExtra
[dataLoc
];
5158 U_ASSERT(savedInputIdx
<= fp
->fInputIdx
);
5159 if (savedInputIdx
< fp
->fInputIdx
) {
5160 fp
->fPatIdx
= opValue
; // JMP
5162 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no progress in loop.
5169 // Entering a lookahead block.
5170 // Save Stack Ptr, Input Pos.
5171 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5172 fData
[opValue
] = fStack
->size();
5173 fData
[opValue
+1] = fp
->fInputIdx
;
5174 fActiveStart
= fLookStart
; // Set the match region change for
5175 fActiveLimit
= fLookLimit
; // transparent bounds.
5181 // Leaving a look-ahead block.
5182 // restore Stack Ptr, Input Pos to positions they had on entry to block.
5183 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5184 int32_t stackSize
= fStack
->size();
5185 int32_t newStackSize
= (int32_t)fData
[opValue
];
5186 U_ASSERT(stackSize
>= newStackSize
);
5187 if (stackSize
> newStackSize
) {
5188 // Copy the current top frame back to the new (cut back) top frame.
5189 // This makes the capture groups from within the look-ahead
5190 // expression available.
5191 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
5193 for (i
=0; i
<fFrameSize
; i
++) {
5194 newFP
[i
] = ((int64_t *)fp
)[i
];
5196 fp
= (REStackFrame
*)newFP
;
5197 fStack
->setSize(newStackSize
);
5199 fp
->fInputIdx
= fData
[opValue
+1];
5201 // Restore the active region bounds in the input string; they may have
5202 // been changed because of transparent bounds on a Region.
5203 fActiveStart
= fRegionStart
;
5204 fActiveLimit
= fRegionLimit
;
5209 if (fp
->fInputIdx
< fActiveLimit
) {
5211 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
5212 if (u_foldCase(c
, U_FOLD_CASE_DEFAULT
) == opValue
) {
5218 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5222 // Case-insensitive test input against a literal string.
5223 // Strings require two slots in the compiled pattern, one for the
5224 // offset to the string text, and one for the length.
5225 // The compiled string has already been case folded.
5227 const UChar
*patternString
= litText
+ opValue
;
5229 op
= (int32_t)pat
[fp
->fPatIdx
];
5231 opType
= URX_TYPE(op
);
5232 opValue
= URX_VAL(op
);
5233 U_ASSERT(opType
== URX_STRING_LEN
);
5234 int32_t patternStringLen
= opValue
; // Length of the string from the pattern.
5238 UBool success
= TRUE
;
5239 int32_t patternStringIdx
= 0;
5240 CaseFoldingUCharIterator
inputIterator(inputBuf
, fp
->fInputIdx
, fActiveLimit
);
5241 while (patternStringIdx
< patternStringLen
) {
5242 U16_NEXT(patternString
, patternStringIdx
, patternStringLen
, cPattern
);
5243 cText
= inputIterator
.next();
5244 if (cText
!= cPattern
) {
5246 if (cText
== U_SENTINEL
) {
5252 if (inputIterator
.inExpansion()) {
5257 fp
->fInputIdx
= inputIterator
.getIndex();
5259 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5266 // Entering a look-behind block.
5267 // Save Stack Ptr, Input Pos.
5268 // TODO: implement transparent bounds. Ticket #6067
5269 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5270 fData
[opValue
] = fStack
->size();
5271 fData
[opValue
+1] = fp
->fInputIdx
;
5272 // Init the variable containing the start index for attempted matches.
5273 fData
[opValue
+2] = -1;
5274 // Save input string length, then reset to pin any matches to end at
5275 // the current position.
5276 fData
[opValue
+3] = fActiveLimit
;
5277 fActiveLimit
= fp
->fInputIdx
;
5284 // Positive Look-Behind, at top of loop checking for matches of LB expression
5285 // at all possible input starting positions.
5287 // Fetch the min and max possible match lengths. They are the operands
5288 // of this op in the pattern.
5289 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
5290 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
5291 U_ASSERT(minML
<= maxML
);
5292 U_ASSERT(minML
>= 0);
5294 // Fetch (from data) the last input index where a match was attempted.
5295 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5296 int64_t *lbStartIdx
= &fData
[opValue
+2];
5297 if (*lbStartIdx
< 0) {
5298 // First time through loop.
5299 *lbStartIdx
= fp
->fInputIdx
- minML
;
5301 // 2nd through nth time through the loop.
5302 // Back up start position for match by one.
5303 if (*lbStartIdx
== 0) {
5306 U16_BACK_1(inputBuf
, 0, *lbStartIdx
);
5310 if (*lbStartIdx
< 0 || *lbStartIdx
< fp
->fInputIdx
- maxML
) {
5311 // We have tried all potential match starting points without
5312 // getting a match. Backtrack out, and out of the
5313 // Look Behind altogether.
5314 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5315 int64_t restoreInputLen
= fData
[opValue
+3];
5316 U_ASSERT(restoreInputLen
>= fActiveLimit
);
5317 U_ASSERT(restoreInputLen
<= fInputLength
);
5318 fActiveLimit
= restoreInputLen
;
5322 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5323 // (successful match will fall off the end of the loop.)
5324 fp
= StateSave(fp
, fp
->fPatIdx
-3, status
);
5325 fp
->fInputIdx
= *lbStartIdx
;
5330 // End of a look-behind block, after a successful match.
5332 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5333 if (fp
->fInputIdx
!= fActiveLimit
) {
5334 // The look-behind expression matched, but the match did not
5335 // extend all the way to the point that we are looking behind from.
5336 // FAIL out of here, which will take us back to the LB_CONT, which
5337 // will retry the match starting at another position or fail
5338 // the look-behind altogether, whichever is appropriate.
5339 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5343 // Look-behind match is good. Restore the original input string length,
5344 // which had been truncated to pin the end of the lookbehind match to the
5345 // position being looked-behind.
5346 int64_t originalInputLen
= fData
[opValue
+3];
5347 U_ASSERT(originalInputLen
>= fActiveLimit
);
5348 U_ASSERT(originalInputLen
<= fInputLength
);
5349 fActiveLimit
= originalInputLen
;
5356 // Negative Look-Behind, at top of loop checking for matches of LB expression
5357 // at all possible input starting positions.
5359 // Fetch the extra parameters of this op.
5360 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
5361 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
5362 int32_t continueLoc
= (int32_t)pat
[fp
->fPatIdx
++];
5363 continueLoc
= URX_VAL(continueLoc
);
5364 U_ASSERT(minML
<= maxML
);
5365 U_ASSERT(minML
>= 0);
5366 U_ASSERT(continueLoc
> fp
->fPatIdx
);
5368 // Fetch (from data) the last input index where a match was attempted.
5369 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5370 int64_t *lbStartIdx
= &fData
[opValue
+2];
5371 if (*lbStartIdx
< 0) {
5372 // First time through loop.
5373 *lbStartIdx
= fp
->fInputIdx
- minML
;
5375 // 2nd through nth time through the loop.
5376 // Back up start position for match by one.
5377 if (*lbStartIdx
== 0) {
5378 (*lbStartIdx
)--; // Because U16_BACK is unsafe starting at 0.
5380 U16_BACK_1(inputBuf
, 0, *lbStartIdx
);
5384 if (*lbStartIdx
< 0 || *lbStartIdx
< fp
->fInputIdx
- maxML
) {
5385 // We have tried all potential match starting points without
5386 // getting a match, which means that the negative lookbehind as
5387 // a whole has succeeded. Jump forward to the continue location
5388 int64_t restoreInputLen
= fData
[opValue
+3];
5389 U_ASSERT(restoreInputLen
>= fActiveLimit
);
5390 U_ASSERT(restoreInputLen
<= fInputLength
);
5391 fActiveLimit
= restoreInputLen
;
5392 fp
->fPatIdx
= continueLoc
;
5396 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5397 // (successful match will cause a FAIL out of the loop altogether.)
5398 fp
= StateSave(fp
, fp
->fPatIdx
-4, status
);
5399 fp
->fInputIdx
= *lbStartIdx
;
5404 // End of a negative look-behind block, after a successful match.
5406 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5407 if (fp
->fInputIdx
!= fActiveLimit
) {
5408 // The look-behind expression matched, but the match did not
5409 // extend all the way to the point that we are looking behind from.
5410 // FAIL out of here, which will take us back to the LB_CONT, which
5411 // will retry the match starting at another position or succeed
5412 // the look-behind altogether, whichever is appropriate.
5413 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5417 // Look-behind expression matched, which means the negative look-behind test as a whole fails.
5420 // Restore the original input string length, which had been truncated
5421 // in order to pin the end of the lookbehind match
5422 // to the position being looked-behind.
5423 int64_t originalInputLen
= fData
[opValue
+3];
5424 U_ASSERT(originalInputLen
>= fActiveLimit
);
5425 U_ASSERT(originalInputLen
<= fInputLength
);
5426 fActiveLimit
= originalInputLen
;
5428 // Restore original stack position, discarding any state saved
5429 // by the successful pattern match.
5430 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5431 int32_t newStackSize
= (int32_t)fData
[opValue
];
5432 U_ASSERT(fStack
->size() > newStackSize
);
5433 fStack
->setSize(newStackSize
);
5435 // FAIL, which will take control back to someplace
5436 // prior to entering the look-behind test.
5437 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5443 // Loop Initialization for the optimized implementation of
5444 // [some character set]*
5445 // This op scans through all matching input.
5446 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5448 U_ASSERT(opValue
> 0 && opValue
< sets
->size());
5449 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
5450 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
5452 // Loop through input, until either the input is exhausted or
5453 // we reach a character that is not a member of the set.
5454 int32_t ix
= (int32_t)fp
->fInputIdx
;
5456 if (ix
>= fActiveLimit
) {
5461 U16_NEXT(inputBuf
, ix
, fActiveLimit
, c
);
5463 if (s8
->contains(c
) == FALSE
) {
5464 U16_BACK_1(inputBuf
, 0, ix
);
5468 if (s
->contains(c
) == FALSE
) {
5469 U16_BACK_1(inputBuf
, 0, ix
);
5475 // If there were no matching characters, skip over the loop altogether.
5476 // The loop doesn't run at all, a * op always succeeds.
5477 if (ix
== fp
->fInputIdx
) {
5478 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
5482 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5483 // must follow. Its operand is the stack location
5484 // that holds the starting input index for the match of this [set]*
5485 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
5486 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
5487 int32_t stackLoc
= URX_VAL(loopcOp
);
5488 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
5489 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
5492 // Save State to the URX_LOOP_C op that follows this one,
5493 // so that match failures in the following code will return to there.
5494 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5495 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5501 case URX_LOOP_DOT_I
:
5502 // Loop Initialization for the optimized implementation of .*
5503 // This op scans through all remaining input.
5504 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5506 // Loop through input until the input is exhausted or we reach an end-of-line.
5507 // In DOTALL mode, we can just go straight to the end of the input.
5509 if ((opValue
& 1) == 1) {
5510 // Dot-matches-All mode. Jump straight to the end of the string.
5511 ix
= (int32_t)fActiveLimit
;
5514 // NOT DOT ALL mode. Line endings do not match '.'
5515 // Scan forward until a line ending or end of input.
5516 ix
= (int32_t)fp
->fInputIdx
;
5518 if (ix
>= fActiveLimit
) {
5523 U16_NEXT(inputBuf
, ix
, fActiveLimit
, c
); // c = inputBuf[ix++]
5524 if ((c
& 0x7f) <= 0x29) { // Fast filter of non-new-line-s
5525 if ((c
== 0x0a) || // 0x0a is newline in both modes.
5526 (((opValue
& 2) == 0) && // IF not UNIX_LINES mode
5527 ((c
<=0x0d && c
>=0x0a) || c
==0x85 || c
==0x2028 || c
==0x2029))) {
5528 // char is a line ending. Put the input pos back to the
5529 // line ending char, and exit the scanning loop.
5530 U16_BACK_1(inputBuf
, 0, ix
);
5537 // If there were no matching characters, skip over the loop altogether.
5538 // The loop doesn't run at all, a * op always succeeds.
5539 if (ix
== fp
->fInputIdx
) {
5540 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
5544 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5545 // must follow. Its operand is the stack location
5546 // that holds the starting input index for the match of this .*
5547 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
5548 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
5549 int32_t stackLoc
= URX_VAL(loopcOp
);
5550 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
5551 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
5554 // Save State to the URX_LOOP_C op that follows this one,
5555 // so that match failures in the following code will return to there.
5556 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5557 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5565 U_ASSERT(opValue
>=0 && opValue
<fFrameSize
);
5566 backSearchIndex
= (int32_t)fp
->fExtra
[opValue
];
5567 U_ASSERT(backSearchIndex
<= fp
->fInputIdx
);
5568 if (backSearchIndex
== fp
->fInputIdx
) {
5569 // We've backed up the input idx to the point that the loop started.
5570 // The loop is done. Leave here without saving state.
5571 // Subsequent failures won't come back here.
5574 // Set up for the next iteration of the loop, with input index
5575 // backed up by one from the last time through,
5576 // and a state save to this instruction in case the following code fails again.
5577 // (We're going backwards because this loop emulates stack unwinding, not
5578 // the initial scan forward.)
5579 U_ASSERT(fp
->fInputIdx
> 0);
5581 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, prevC
); // !!!: should this 0 be one of f*Limit?
5583 if (prevC
== 0x0a &&
5584 fp
->fInputIdx
> backSearchIndex
&&
5585 inputBuf
[fp
->fInputIdx
-1] == 0x0d) {
5586 int32_t prevOp
= (int32_t)pat
[fp
->fPatIdx
-2];
5587 if (URX_TYPE(prevOp
) == URX_LOOP_DOT_I
) {
5588 // .*, stepping back over CRLF pair.
5589 U16_BACK_1(inputBuf
, 0, fp
->fInputIdx
);
5594 fp
= StateSave(fp
, fp
->fPatIdx
-1, status
);
5601 // Trouble. The compiled pattern contains an entry with an
5602 // unrecognized type tag.
5606 if (U_FAILURE(status
)) {
5615 fLastMatchEnd
= fMatchEnd
;
5616 fMatchStart
= startIdx
;
5617 fMatchEnd
= fp
->fInputIdx
;
5620 #ifdef REGEX_RUN_DEBUG
5623 printf("Match. start=%ld end=%ld\n\n", fMatchStart
, fMatchEnd
);
5625 printf("No match\n\n");
5630 fFrame
= fp
; // The active stack frame when the engine stopped.
5631 // Contains the capture group results that we need to access later.
// Expands ICU's UObject RTTI boilerplate for class RegexMatcher.
// NOTE(review): presumably this supplies the class-ID accessor definitions
// declared in unicode/regex.h -- confirm against unicode/uobject.h.
5638 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher
)
5642 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS