2 **************************************************************************
3 * Copyright (C) 2002-2008 International Business Machines Corporation *
4 * and others. All rights reserved. *
5 **************************************************************************
10 // Contains the implementation of class RegexMatcher,
11 // which is one of the main API classes for the ICU regular expression package.
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
17 #include "unicode/regex.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ustring.h"
21 #include "unicode/rbbi.h"
29 // #include <malloc.h> // Needed for heapcheck testing
33 // Default limit for the size of the back track stack, to avoid system
34 // failures causedby heap exhaustion. Units are in 32 bit words, not bytes.
35 // This value puts ICU's limits higher than most other regexp implementations,
36 // which use recursion rather than the heap, and take more storage per
39 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY
= 8000000;
41 // Time limit counter constant.
42 // Time limits for expression evaluation are in terms of quanta of work by
43 // the engine, each of which is 10,000 state saves.
44 // This constant determines that state saves per tick number.
45 static const int32_t TIMER_INITIAL_VALUE
= 10000;
47 //-----------------------------------------------------------------------------
49 // Constructor and Destructor
51 //-----------------------------------------------------------------------------
52 RegexMatcher::RegexMatcher(const RegexPattern
*pat
) {
53 fDeferredStatus
= U_ZERO_ERROR
;
54 init(fDeferredStatus
);
55 if (U_FAILURE(fDeferredStatus
)) {
59 fDeferredStatus
= U_ILLEGAL_ARGUMENT_ERROR
;
63 init2(RegexStaticSets::gStaticSets
->fEmptyString
, fDeferredStatus
);
68 RegexMatcher::RegexMatcher(const UnicodeString
®exp
, const UnicodeString
&input
,
69 uint32_t flags
, UErrorCode
&status
) {
71 if (U_FAILURE(status
)) {
75 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
76 fPattern
= fPatternOwned
;
81 RegexMatcher::RegexMatcher(const UnicodeString
®exp
,
82 uint32_t flags
, UErrorCode
&status
) {
84 if (U_FAILURE(status
)) {
88 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
89 fPattern
= fPatternOwned
;
90 init2(RegexStaticSets::gStaticSets
->fEmptyString
, status
);
96 RegexMatcher::~RegexMatcher() {
98 if (fData
!= fSmallData
) {
103 delete fPatternOwned
;
104 fPatternOwned
= NULL
;
107 #if UCONFIG_NO_BREAK_ITERATION==0
108 delete fWordBreakItr
;
113 // init() common initialization for use by all constructors.
114 // Initialize all fields, get the object into a consistent state.
115 // This must be done even when the initial status shows an error,
116 // so that the object is initialized sufficiently well for the destructor
119 void RegexMatcher::init(UErrorCode
&status
) {
121 fPatternOwned
= NULL
;
132 fTransparentBounds
= FALSE
;
133 fAnchoringBounds
= TRUE
;
146 fStackLimit
= DEFAULT_BACKTRACK_STACK_CAPACITY
;
148 fCallbackContext
= NULL
;
150 fDeferredStatus
= status
;
152 fWordBreakItr
= NULL
;
154 fStack
= new UVector32(status
);
155 if (U_FAILURE(status
)) {
156 fDeferredStatus
= status
;
161 // init2() Common initialization for use by RegexMatcher constructors, part 2.
162 // This handles the common setup to be done after the Pattern is available.
164 void RegexMatcher::init2(const UnicodeString
&input
, UErrorCode
&status
) {
165 if (U_FAILURE(status
)) {
166 fDeferredStatus
= status
;
170 if (fPattern
->fDataSize
> (int32_t)(sizeof(fSmallData
)/sizeof(int32_t))) {
171 fData
= (int32_t *)uprv_malloc(fPattern
->fDataSize
* sizeof(int32_t));
173 status
= fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
179 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY
, status
);
180 if (U_FAILURE(status
)) {
181 fDeferredStatus
= status
;
187 static const UChar BACKSLASH
= 0x5c;
188 static const UChar DOLLARSIGN
= 0x24;
189 //--------------------------------------------------------------------------------
193 //--------------------------------------------------------------------------------
194 RegexMatcher
&RegexMatcher::appendReplacement(UnicodeString
&dest
,
195 const UnicodeString
&replacement
,
196 UErrorCode
&status
) {
197 if (U_FAILURE(status
)) {
200 if (U_FAILURE(fDeferredStatus
)) {
201 status
= fDeferredStatus
;
204 if (fMatch
== FALSE
) {
205 status
= U_REGEX_INVALID_STATE
;
209 // Copy input string from the end of previous match to start of current match
210 int32_t len
= fMatchStart
-fAppendPosition
;
212 dest
.append(*fInput
, fAppendPosition
, len
);
214 fAppendPosition
= fMatchEnd
;
217 // scan the replacement text, looking for substitutions ($n) and \escapes.
218 // TODO: optimize this loop by efficiently scanning for '$' or '\',
219 // move entire ranges not containing substitutions.
220 int32_t replLen
= replacement
.length();
222 while (replIdx
<replLen
) {
223 UChar c
= replacement
.charAt(replIdx
);
225 if (c
== BACKSLASH
) {
226 // Backslash Escape. Copy the following char out without further checks.
227 // Note: Surrogate pairs don't need any special handling
228 // The second half wont be a '$' or a '\', and
229 // will move to the dest normally on the next
231 if (replIdx
>= replLen
) {
234 c
= replacement
.charAt(replIdx
);
236 if (c
==0x55/*U*/ || c
==0x75/*u*/) {
237 // We have a \udddd or \Udddddddd escape sequence.
238 UChar32 escapedChar
= replacement
.unescapeAt(replIdx
);
239 if (escapedChar
!= (UChar32
)0xFFFFFFFF) {
240 dest
.append(escapedChar
);
241 // TODO: Report errors for mal-formed \u escapes?
242 // As this is, the original sequence is output, which may be OK.
247 // Plain backslash escape. Just put out the escaped character.
253 if (c
!= DOLLARSIGN
) {
254 // Normal char, not a $. Copy it out without further checks.
259 // We've got a $. Pick up a capture group number if one follows.
260 // Consume at most the number of digits necessary for the largest capture
261 // number that is valid for this pattern.
263 int32_t numDigits
= 0;
264 int32_t groupNum
= 0;
267 if (replIdx
>= replLen
) {
270 digitC
= replacement
.char32At(replIdx
);
271 if (u_isdigit(digitC
) == FALSE
) {
274 replIdx
= replacement
.moveIndex32(replIdx
, 1);
275 groupNum
=groupNum
*10 + u_charDigitValue(digitC
);
277 if (numDigits
>= fPattern
->fMaxCaptureDigits
) {
283 if (numDigits
== 0) {
284 // The $ didn't introduce a group number at all.
285 // Treat it as just part of the substitution text.
286 dest
.append(DOLLARSIGN
);
290 // Finally, append the capture group data to the destination.
291 dest
.append(group(groupNum
, status
));
292 if (U_FAILURE(status
)) {
293 // Can fail if group number is out of range.
304 //--------------------------------------------------------------------------------
306 // appendTail Intended to be used in conjunction with appendReplacement()
307 // To the destination string, append everything following
308 // the last match position from the input string.
310 // Note: Match ranges do not affect appendTail or appendReplacement
312 //--------------------------------------------------------------------------------
313 UnicodeString
&RegexMatcher::appendTail(UnicodeString
&dest
) {
314 int32_t len
= fInput
->length() - fAppendPosition
;
316 dest
.append(*fInput
, fAppendPosition
, len
);
323 //--------------------------------------------------------------------------------
327 //--------------------------------------------------------------------------------
328 int32_t RegexMatcher::end(UErrorCode
&err
) const {
334 int32_t RegexMatcher::end(int32_t group
, UErrorCode
&err
) const {
335 if (U_FAILURE(err
)) {
338 if (fMatch
== FALSE
) {
339 err
= U_REGEX_INVALID_STATE
;
342 if (group
< 0 || group
> fPattern
->fGroupMap
->size()) {
343 err
= U_INDEX_OUTOFBOUNDS_ERROR
;
350 // Get the position within the stack frame of the variables for
351 // this capture group.
352 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(group
-1);
353 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
354 U_ASSERT(groupOffset
>= 0);
355 e
= fFrame
->fExtra
[groupOffset
+ 1];
362 //--------------------------------------------------------------------------------
366 //--------------------------------------------------------------------------------
367 UBool
RegexMatcher::find() {
368 // Start at the position of the last match end. (Will be zero if the
369 // matcher has been reset.
371 if (U_FAILURE(fDeferredStatus
)) {
375 int32_t startPos
= fMatchEnd
;
377 startPos
= fActiveStart
;
381 // Save the position of any previous successful match.
382 fLastMatchEnd
= fMatchEnd
;
384 if (fMatchStart
== fMatchEnd
) {
385 // Previous match had zero length. Move start position up one position
386 // to avoid sending find() into a loop on zero-length matches.
387 if (startPos
>= fActiveLimit
) {
392 startPos
= fInput
->moveIndex32(startPos
, 1);
395 if (fLastMatchEnd
>= 0) {
396 // A previous find() failed to match. Don't try again.
397 // (without this test, a pattern with a zero-length match
398 // could match again at the end of an input string.)
405 // Compute the position in the input string beyond which a match can not begin, because
406 // the minimum length match would extend past the end of the input.
407 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
408 // Be aware of possible overflows if making changes here.
409 int32_t testLen
= fActiveLimit
- fPattern
->fMinMatchLen
;
410 if (startPos
> testLen
) {
416 const UChar
*inputBuf
= fInput
->getBuffer();
418 U_ASSERT(startPos
>= 0);
420 switch (fPattern
->fStartType
) {
422 // No optimization was found.
423 // Try a match at each input position.
425 MatchAt(startPos
, FALSE
, fDeferredStatus
);
426 if (U_FAILURE(fDeferredStatus
)) {
432 if (startPos
>= testLen
) {
436 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
437 // Note that it's perfectly OK for a pattern to have a zero-length
438 // match at the end of a string, so we must make sure that the loop
439 // runs with startPos == testLen the last time through.
444 // Matches are only possible at the start of the input string
445 // (pattern begins with ^ or \A)
446 if (startPos
> fActiveStart
) {
450 MatchAt(startPos
, FALSE
, fDeferredStatus
);
451 if (U_FAILURE(fDeferredStatus
)) {
459 // Match may start on any char from a pre-computed set.
460 U_ASSERT(fPattern
->fMinMatchLen
> 0);
462 int32_t pos
= startPos
;
463 U16_NEXT(inputBuf
, startPos
, fActiveLimit
, c
); // like c = inputBuf[startPos++];
464 if (c
<256 && fPattern
->fInitialChars8
->contains(c
) ||
465 c
>=256 && fPattern
->fInitialChars
->contains(c
)) {
466 MatchAt(pos
, FALSE
, fDeferredStatus
);
467 if (U_FAILURE(fDeferredStatus
)) {
474 if (pos
>= testLen
) {
486 // Match starts on exactly one char.
487 U_ASSERT(fPattern
->fMinMatchLen
> 0);
488 UChar32 theChar
= fPattern
->fInitialChar
;
490 int32_t pos
= startPos
;
491 U16_NEXT(inputBuf
, startPos
, fActiveLimit
, c
); // like c = inputBuf[startPos++];
493 MatchAt(pos
, FALSE
, fDeferredStatus
);
494 if (U_FAILURE(fDeferredStatus
)) {
501 if (pos
>= testLen
) {
513 if (startPos
== fAnchorStart
) {
514 MatchAt(startPos
, FALSE
, fDeferredStatus
);
515 if (U_FAILURE(fDeferredStatus
)) {
521 U16_NEXT(inputBuf
, startPos
, fActiveLimit
, c
); // like c = inputBuf[startPos++];
524 if (fPattern
->fFlags
& UREGEX_UNIX_LINES
) {
526 c
= inputBuf
[startPos
-1];
528 MatchAt(startPos
, FALSE
, fDeferredStatus
);
529 if (U_FAILURE(fDeferredStatus
)) {
536 if (startPos
>= testLen
) {
541 U16_NEXT(inputBuf
, startPos
, fActiveLimit
, c
); // like c = inputBuf[startPos++];
542 // Note that it's perfectly OK for a pattern to have a zero-length
543 // match at the end of a string, so we must make sure that the loop
544 // runs with startPos == testLen the last time through.
548 c
= inputBuf
[startPos
-1];
549 if (((c
& 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
550 ((c
<=0x0d && c
>=0x0a) || c
==0x85 ||c
==0x2028 || c
==0x2029 )) {
551 if (c
== 0x0d && startPos
< fActiveLimit
&& inputBuf
[startPos
] == 0x0a) {
554 MatchAt(startPos
, FALSE
, fDeferredStatus
);
555 if (U_FAILURE(fDeferredStatus
)) {
562 if (startPos
>= testLen
) {
567 U16_NEXT(inputBuf
, startPos
, fActiveLimit
, c
); // like c = inputBuf[startPos++];
568 // Note that it's perfectly OK for a pattern to have a zero-length
569 // match at the end of a string, so we must make sure that the loop
570 // runs with startPos == testLen the last time through.
585 UBool
RegexMatcher::find(int32_t start
, UErrorCode
&status
) {
586 if (U_FAILURE(status
)) {
589 if (U_FAILURE(fDeferredStatus
)) {
590 status
= fDeferredStatus
;
593 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
594 // This will reset the region to be the full input length.
595 if (start
< fActiveStart
|| start
> fActiveLimit
) {
596 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
605 //--------------------------------------------------------------------------------
609 //--------------------------------------------------------------------------------
610 UnicodeString
RegexMatcher::group(UErrorCode
&status
) const {
611 return group(0, status
);
616 UnicodeString
RegexMatcher::group(int32_t groupNum
, UErrorCode
&status
) const {
617 int32_t s
= start(groupNum
, status
);
618 int32_t e
= end(groupNum
, status
);
620 // Note: calling start() and end() above will do all necessary checking that
621 // the group number is OK and that a match exists. status will be set.
622 if (U_FAILURE(status
)) {
623 return UnicodeString();
625 if (U_FAILURE(fDeferredStatus
)) {
626 status
= fDeferredStatus
;
627 return UnicodeString();
631 // A capture group wasn't part of the match
632 return UnicodeString();
635 return UnicodeString(*fInput
, s
, e
-s
);
641 int32_t RegexMatcher::groupCount() const {
642 return fPattern
->fGroupMap
->size();
647 const UnicodeString
&RegexMatcher::input() const {
652 //--------------------------------------------------------------------------------
654 // hasAnchoringBounds()
656 //--------------------------------------------------------------------------------
657 UBool
RegexMatcher::hasAnchoringBounds() const {
658 return fAnchoringBounds
;
662 //--------------------------------------------------------------------------------
664 // hasTransparentBounds()
666 //--------------------------------------------------------------------------------
667 UBool
RegexMatcher::hasTransparentBounds() const {
668 return fTransparentBounds
;
673 //--------------------------------------------------------------------------------
677 //--------------------------------------------------------------------------------
678 UBool
RegexMatcher::hitEnd() const {
682 //--------------------------------------------------------------------------------
686 //--------------------------------------------------------------------------------
687 UBool
RegexMatcher::lookingAt(UErrorCode
&status
) {
688 if (U_FAILURE(status
)) {
691 if (U_FAILURE(fDeferredStatus
)) {
692 status
= fDeferredStatus
;
695 resetPreserveRegion();
696 MatchAt(fActiveStart
, FALSE
, status
);
701 UBool
RegexMatcher::lookingAt(int32_t start
, UErrorCode
&status
) {
702 if (U_FAILURE(status
)) {
705 if (U_FAILURE(fDeferredStatus
)) {
706 status
= fDeferredStatus
;
710 if (start
< fActiveStart
|| start
> fActiveLimit
) {
711 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
714 MatchAt(start
, FALSE
, status
);
720 //--------------------------------------------------------------------------------
724 //--------------------------------------------------------------------------------
725 UBool
RegexMatcher::matches(UErrorCode
&status
) {
726 if (U_FAILURE(status
)) {
729 if (U_FAILURE(fDeferredStatus
)) {
730 status
= fDeferredStatus
;
733 resetPreserveRegion();
734 MatchAt(fActiveStart
, TRUE
, status
);
739 UBool
RegexMatcher::matches(int32_t start
, UErrorCode
&status
) {
740 if (U_FAILURE(status
)) {
743 if (U_FAILURE(fDeferredStatus
)) {
744 status
= fDeferredStatus
;
748 if (start
< fActiveStart
|| start
> fActiveLimit
) {
749 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
752 MatchAt(start
, TRUE
, status
);
758 //--------------------------------------------------------------------------------
762 //--------------------------------------------------------------------------------
763 const RegexPattern
&RegexMatcher::pattern() const {
769 //--------------------------------------------------------------------------------
773 //--------------------------------------------------------------------------------
774 RegexMatcher
&RegexMatcher::region(int32_t start
, int32_t limit
, UErrorCode
&status
) {
775 if (U_FAILURE(status
)) {
778 if (start
>limit
|| start
<0 || limit
<0 || limit
>fInput
->length()) {
779 status
= U_ILLEGAL_ARGUMENT_ERROR
;
782 fRegionStart
= start
;
783 fRegionLimit
= limit
;
784 fActiveStart
= start
;
785 fActiveLimit
= limit
;
786 if (!fTransparentBounds
) {
790 if (fAnchoringBounds
) {
791 fAnchorStart
= start
;
792 fAnchorLimit
= limit
;
799 //--------------------------------------------------------------------------------
803 //--------------------------------------------------------------------------------
804 int32_t RegexMatcher::regionEnd() const {
809 //--------------------------------------------------------------------------------
813 //--------------------------------------------------------------------------------
814 int32_t RegexMatcher::regionStart() const {
819 //--------------------------------------------------------------------------------
823 //--------------------------------------------------------------------------------
824 UnicodeString
RegexMatcher::replaceAll(const UnicodeString
&replacement
, UErrorCode
&status
) {
825 if (U_FAILURE(status
)) {
828 if (U_FAILURE(fDeferredStatus
)) {
829 status
= fDeferredStatus
;
832 UnicodeString destString
;
835 appendReplacement(destString
, replacement
, status
);
836 if (U_FAILURE(status
)) {
840 appendTail(destString
);
847 //--------------------------------------------------------------------------------
851 //--------------------------------------------------------------------------------
852 UnicodeString
RegexMatcher::replaceFirst(const UnicodeString
&replacement
, UErrorCode
&status
) {
853 if (U_FAILURE(status
)) {
856 if (U_FAILURE(fDeferredStatus
)) {
857 status
= fDeferredStatus
;
866 UnicodeString destString
;
867 appendReplacement(destString
, replacement
, status
);
868 appendTail(destString
);
873 //--------------------------------------------------------------------------------
877 //--------------------------------------------------------------------------------
878 UBool
RegexMatcher::requireEnd() const {
883 //--------------------------------------------------------------------------------
887 //--------------------------------------------------------------------------------
888 RegexMatcher
&RegexMatcher::reset() {
890 fRegionLimit
= fInput
->length();
892 fActiveLimit
= fRegionLimit
;
894 fAnchorLimit
= fRegionLimit
;
896 fLookLimit
= fRegionLimit
;
897 resetPreserveRegion();
903 void RegexMatcher::resetPreserveRegion() {
912 fTickCounter
= TIMER_INITIAL_VALUE
;
917 RegexMatcher
&RegexMatcher::reset(const UnicodeString
&input
) {
920 if (fWordBreakItr
!= NULL
) {
921 #if UCONFIG_NO_BREAK_ITERATION==0
922 fWordBreakItr
->setText(input
);
928 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
929 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
934 RegexMatcher
&RegexMatcher::reset(int32_t position
, UErrorCode
&status
) {
935 if (U_FAILURE(status
)) {
938 reset(); // Reset also resets the region to be the entire string.
939 if (position
< 0 || position
>= fActiveLimit
) {
940 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
943 fMatchEnd
= position
;
951 //--------------------------------------------------------------------------------
955 //--------------------------------------------------------------------------------
956 void RegexMatcher::setTrace(UBool state
) {
962 //---------------------------------------------------------------------
966 //---------------------------------------------------------------------
967 int32_t RegexMatcher::split(const UnicodeString
&input
,
968 UnicodeString dest
[],
969 int32_t destCapacity
,
973 // Check arguements for validity
975 if (U_FAILURE(status
)) {
979 if (destCapacity
< 1) {
980 status
= U_ILLEGAL_ARGUMENT_ERROR
;
985 // Reset for the input text
988 int32_t nextOutputStringStart
= 0;
989 if (fActiveLimit
== 0) {
994 // Loop through the input text, searching for the delimiter pattern
997 int32_t numCaptureGroups
= fPattern
->fGroupMap
->size();
999 if (i
>=destCapacity
-1) {
1000 // There is one or zero output string left.
1001 // Fill the last output string with whatever is left from the input, then exit the loop.
1002 // ( i will be == destCapicity if we filled the output array while processing
1003 // capture groups of the delimiter expression, in which case we will discard the
1004 // last capture group saved in favor of the unprocessed remainder of the
1007 int32_t remainingLength
= fActiveLimit
-nextOutputStringStart
;
1008 if (remainingLength
> 0) {
1009 dest
[i
].setTo(input
, nextOutputStringStart
, remainingLength
);
1014 // We found another delimiter. Move everything from where we started looking
1015 // up until the start of the delimiter into the next output string.
1016 int32_t fieldLen
= fMatchStart
- nextOutputStringStart
;
1017 dest
[i
].setTo(input
, nextOutputStringStart
, fieldLen
);
1018 nextOutputStringStart
= fMatchEnd
;
1020 // If the delimiter pattern has capturing parentheses, the captured
1021 // text goes out into the next n destination strings.
1023 for (groupNum
=1; groupNum
<=numCaptureGroups
; groupNum
++) {
1024 if (i
==destCapacity
-1) {
1028 dest
[i
] = group(groupNum
, status
);
1031 if (nextOutputStringStart
== fActiveLimit
) {
1032 // The delimiter was at the end of the string. We're done.
1039 // We ran off the end of the input while looking for the next delimiter.
1040 // All the remaining text goes into the current output string.
1041 dest
[i
].setTo(input
, nextOutputStringStart
, fActiveLimit
-nextOutputStringStart
);
1050 //--------------------------------------------------------------------------------
1054 //--------------------------------------------------------------------------------
1055 int32_t RegexMatcher::start(UErrorCode
&status
) const {
1056 return start(0, status
);
1062 //--------------------------------------------------------------------------------
1064 // start(int32_t group, UErrorCode &status)
1066 //--------------------------------------------------------------------------------
1067 int32_t RegexMatcher::start(int32_t group
, UErrorCode
&status
) const {
1068 if (U_FAILURE(status
)) {
1071 if (U_FAILURE(fDeferredStatus
)) {
1072 status
= fDeferredStatus
;
1075 if (fMatch
== FALSE
) {
1076 status
= U_REGEX_INVALID_STATE
;
1079 if (group
< 0 || group
> fPattern
->fGroupMap
->size()) {
1080 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1087 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(group
-1);
1088 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
1089 U_ASSERT(groupOffset
>= 0);
1090 s
= fFrame
->fExtra
[groupOffset
];
1097 //--------------------------------------------------------------------------------
1099 // useAnchoringBounds
1101 //--------------------------------------------------------------------------------
1102 RegexMatcher
&RegexMatcher::useAnchoringBounds(UBool b
) {
1103 fAnchoringBounds
= b
;
1104 UErrorCode status
= U_ZERO_ERROR
;
1105 region(fRegionStart
, fRegionLimit
, status
);
1106 U_ASSERT(U_SUCCESS(status
));
1111 //--------------------------------------------------------------------------------
1113 // useTransparentBounds
1115 //--------------------------------------------------------------------------------
1116 RegexMatcher
&RegexMatcher::useTransparentBounds(UBool b
) {
1117 fTransparentBounds
= b
;
1118 UErrorCode status
= U_ZERO_ERROR
;
1119 region(fRegionStart
, fRegionLimit
, status
);
1120 U_ASSERT(U_SUCCESS(status
));
1124 //--------------------------------------------------------------------------------
1128 //--------------------------------------------------------------------------------
1129 void RegexMatcher::setTimeLimit(int32_t limit
, UErrorCode
&status
) {
1130 if (U_FAILURE(status
)) {
1133 if (U_FAILURE(fDeferredStatus
)) {
1134 status
= fDeferredStatus
;
1138 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1145 //--------------------------------------------------------------------------------
1149 //--------------------------------------------------------------------------------
1150 int32_t RegexMatcher::getTimeLimit() const {
1155 //--------------------------------------------------------------------------------
1159 //--------------------------------------------------------------------------------
1160 void RegexMatcher::setStackLimit(int32_t limit
, UErrorCode
&status
) {
1161 if (U_FAILURE(status
)) {
1164 if (U_FAILURE(fDeferredStatus
)) {
1165 status
= fDeferredStatus
;
1169 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1173 // Reset the matcher. This is needed here in case there is a current match
1174 // whose final stack frame (containing the match results, pointed to by fFrame)
1175 // would be lost by resizing to a smaller stack size.
1179 // Unlimited stack expansion
1180 fStack
->setMaxCapacity(0);
1182 // Change the units of the limit from bytes to ints, and bump the size up
1183 // to be big enough to hold at least one stack frame for the pattern,
1184 // if it isn't there already.
1185 int32_t adjustedLimit
= limit
/ sizeof(int32_t);
1186 if (adjustedLimit
< fPattern
->fFrameSize
) {
1187 adjustedLimit
= fPattern
->fFrameSize
;
1189 fStack
->setMaxCapacity(adjustedLimit
);
1191 fStackLimit
= limit
;
1195 //--------------------------------------------------------------------------------
1199 //--------------------------------------------------------------------------------
1200 int32_t RegexMatcher::getStackLimit() const {
1205 //--------------------------------------------------------------------------------
1209 //--------------------------------------------------------------------------------
1210 void RegexMatcher::setMatchCallback(URegexMatchCallback
*callback
,
1211 const void *context
,
1212 UErrorCode
&status
) {
1213 if (U_FAILURE(status
)) {
1216 fCallbackFn
= callback
;
1217 fCallbackContext
= context
;
1221 //--------------------------------------------------------------------------------
1225 //--------------------------------------------------------------------------------
1226 void RegexMatcher::getMatchCallback(URegexMatchCallback
*&callback
,
1227 const void *&context
,
1228 UErrorCode
&status
) {
1229 if (U_FAILURE(status
)) {
1232 callback
= fCallbackFn
;
1233 context
= fCallbackContext
;
1237 //================================================================================
1239 // Code following this point in this file is the internal
1240 // Match Engine Implementation.
1242 //================================================================================
1245 //--------------------------------------------------------------------------------
1248 // Discard any previous contents of the state save stack, and initialize a
1249 // new stack frame to all -1. The -1s are needed for capture group limits,
1250 // where they indicate that a group has not yet matched anything.
1251 //--------------------------------------------------------------------------------
1252 REStackFrame
*RegexMatcher::resetStack() {
1253 // Discard any previous contents of the state save stack, and initialize a
1254 // new stack frame to all -1. The -1s are needed for capture group limits, where
1255 // they indicate that a group has not yet matched anything.
1256 fStack
->removeAllElements();
1258 int32_t *iFrame
= fStack
->reserveBlock(fPattern
->fFrameSize
, fDeferredStatus
);
1260 for (i
=0; i
<fPattern
->fFrameSize
; i
++) {
1263 return (REStackFrame
*)iFrame
;
1268 //--------------------------------------------------------------------------------
1271 // in perl, "xab..cd..", \b is true at positions 0,3,5,7
1273 // If the current char is a combining mark,
1275 // Else Scan backwards to the first non-combining char.
1276 // We are at a boundary if the this char and the original chars are
1277 // opposite in membership in \w set
1279 // parameters: pos - the current position in the input buffer
1281 // TODO: double-check edge cases at region boundaries.
1283 //--------------------------------------------------------------------------------
1284 UBool
RegexMatcher::isWordBoundary(int32_t pos
) {
1285 UBool isBoundary
= FALSE
;
1286 UBool cIsWord
= FALSE
;
1288 if (pos
>= fLookLimit
) {
1291 // Determine whether char c at current position is a member of the word set of chars.
1292 // If we're off the end of the string, behave as though we're not at a word char.
1293 UChar32 c
= fInput
->char32At(pos
);
1294 if (u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
) || u_charType(c
) == U_FORMAT_CHAR
) {
1295 // Current char is a combining one. Not a boundary.
1298 cIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(c
);
1301 // Back up until we come to a non-combining char, determine whether
1302 // that char is a word char.
1303 UBool prevCIsWord
= FALSE
;
1304 int32_t prevPos
= pos
;
1306 if (prevPos
<= fLookStart
) {
1309 prevPos
= fInput
->moveIndex32(prevPos
, -1);
1310 UChar32 prevChar
= fInput
->char32At(prevPos
);
1311 if (!(u_hasBinaryProperty(prevChar
, UCHAR_GRAPHEME_EXTEND
)
1312 || u_charType(prevChar
) == U_FORMAT_CHAR
)) {
1313 prevCIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(prevChar
);
1317 isBoundary
= cIsWord
^ prevCIsWord
;
1321 //--------------------------------------------------------------------------------
1325 // Test for a word boundary using RBBI word break.
1327 // parameters: pos - the current position in the input buffer
1329 //--------------------------------------------------------------------------------
1330 UBool
RegexMatcher::isUWordBoundary(int32_t pos
) {
1331 UBool returnVal
= FALSE
;
1332 #if UCONFIG_NO_BREAK_ITERATION==0
1334 // If we haven't yet created a break iterator for this matcher, do it now.
1335 if (fWordBreakItr
== NULL
) {
1337 (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus
);
1338 if (U_FAILURE(fDeferredStatus
)) {
1341 fWordBreakItr
->setText(*fInput
);
1344 if (pos
>= fLookLimit
) {
1346 returnVal
= TRUE
; // With Unicode word rules, only positions within the interior of "real"
1347 // words are not boundaries. All non-word chars stand by themselves,
1348 // with word boundaries on both sides.
1350 returnVal
= fWordBreakItr
->isBoundary(pos
);
1356 //--------------------------------------------------------------------------------
1358 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state
1359 // saves. Increment the "time" counter, and call the
1360 // user callback function if there is one installed.
1362 // If the match operation needs to be aborted, either for a time-out
1363 // or because the user callback asked for it, just set an error status.
1364 // The engine will pick that up and stop in its outer loop.
1366 //--------------------------------------------------------------------------------
1367 void RegexMatcher::IncrementTime(UErrorCode
&status
) {
1368 fTickCounter
= TIMER_INITIAL_VALUE
;
1370 if (fCallbackFn
!= NULL
) {
1371 if ((*fCallbackFn
)(fCallbackContext
, fTime
) == FALSE
) {
1372 status
= U_REGEX_STOPPED_BY_CALLER
;
1376 if (fTimeLimit
> 0 && fTime
>= fTimeLimit
) {
1377 status
= U_REGEX_TIME_OUT
;
1381 //--------------------------------------------------------------------------------
1384 // Make a new stack frame, initialized as a copy of the current stack frame.
1385 // Set the pattern index in the original stack frame from the operand value
1386 // in the opcode. Execution of the engine continues with the state in
1387 // the newly created stack frame
1389 // Note that reserveBlock() may grow the stack, resulting in the
1390 // whole thing being relocated in memory.
1393 // fp The top frame pointer when called. At return, a new
1394 // fame will be present
1395 // savePatIdx An index into the compiled pattern. Goes into the original
1396 // (not new) frame. If execution ever back-tracks out of the
1397 // new frame, this will be where we continue from in the pattern.
1399 // The new frame pointer.
1401 //--------------------------------------------------------------------------------
1402 inline REStackFrame
*RegexMatcher::StateSave(REStackFrame
*fp
, int32_t savePatIdx
, UErrorCode
&status
) {
1403 // push storage for a new frame.
1404 int32_t *newFP
= fStack
->reserveBlock(fFrameSize
, status
);
1405 if (newFP
== NULL
) {
1406 // Failure on attempted stack expansion.
1407 // Stack function set some other error code, change it to a more
1408 // specific one for regular expressions.
1409 status
= U_REGEX_STACK_OVERFLOW
;
1410 // We need to return a writable stack frame, so just return the
1411 // previous frame. The match operation will stop quickly
1412 // because of the error status, after which the frame will never
1413 // be looked at again.
1416 fp
= (REStackFrame
*)(newFP
- fFrameSize
); // in case of realloc of stack.
1418 // New stack frame = copy of old top frame.
1419 int32_t *source
= (int32_t *)fp
;
1420 int32_t *dest
= newFP
;
1422 *dest
++ = *source
++;
1423 if (source
== newFP
) {
1429 if (fTickCounter
<= 0) {
1430 IncrementTime(status
); // Re-initializes fTickCounter
1432 fp
->fPatIdx
= savePatIdx
;
1433 return (REStackFrame
*)newFP
;
1437 //--------------------------------------------------------------------------------
1439 // MatchAt This is the actual matching engine.
1441 // startIdx: begin matching a this index.
1442 // toEnd: if true, match must extend to end of the input region
1444 //--------------------------------------------------------------------------------
1445 void RegexMatcher::MatchAt(int32_t startIdx
, UBool toEnd
, UErrorCode
&status
) {
1446 UBool isMatch
= FALSE
; // True if the we have a match.
1448 int32_t op
; // Operation from the compiled pattern, split into
1449 int32_t opType
; // the opcode
1450 int32_t opValue
; // and the operand value.
1452 #ifdef REGEX_RUN_DEBUG
1455 printf("MatchAt(startIdx=%d)\n", startIdx
);
1456 printf("Original Pattern: ");
1458 for (i
=0; i
<fPattern
->fPattern
.length(); i
++) {
1459 printf("%c", fPattern
->fPattern
.charAt(i
));
1462 printf("Input String: ");
1463 for (i
=0; i
<fInput
->length(); i
++) {
1464 UChar c
= fInput
->charAt(i
);
1465 if (c
<32 || c
>256) {
1475 if (U_FAILURE(status
)) {
1479 // Cache frequently referenced items from the compiled pattern
1481 int32_t *pat
= fPattern
->fCompiledPat
->getBuffer();
1483 const UChar
*litText
= fPattern
->fLiteralText
.getBuffer();
1484 UVector
*sets
= fPattern
->fSets
;
1486 const UChar
*inputBuf
= fInput
->getBuffer();
1488 fFrameSize
= fPattern
->fFrameSize
;
1489 REStackFrame
*fp
= resetStack();
1492 fp
->fInputIdx
= startIdx
;
1494 // Zero out the pattern's static data
1496 for (i
= 0; i
<fPattern
->fDataSize
; i
++) {
1501 // Main loop for interpreting the compiled pattern.
1502 // One iteration of the loop per pattern operation performed.
1506 if (_heapchk() != _HEAPOK
) {
1507 fprintf(stderr
, "Heap Trouble\n");
1510 op
= pat
[fp
->fPatIdx
];
1511 opType
= URX_TYPE(op
);
1512 opValue
= URX_VAL(op
);
1513 #ifdef REGEX_RUN_DEBUG
1515 printf("inputIdx=%d inputChar=%c sp=%3d ", fp
->fInputIdx
,
1516 fInput
->char32At(fp
->fInputIdx
), (int32_t *)fp
-fStack
->getBuffer());
1517 fPattern
->dumpOp(fp
->fPatIdx
);
1530 // Force a backtrack. In some circumstances, the pattern compiler
1531 // will notice that the pattern can't possibly match anything, and will
1532 // emit one of these at that point.
1533 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
1538 if (fp
->fInputIdx
< fActiveLimit
) {
1540 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
1547 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
1553 // Test input against a literal string.
1554 // Strings require two slots in the compiled pattern, one for the
1555 // offset to the string text, and one for the length.
1556 int32_t stringStartIdx
= opValue
;
1559 op
= pat
[fp
->fPatIdx
]; // Fetch the second operand
1561 opType
= URX_TYPE(op
);
1562 stringLen
= URX_VAL(op
);
1563 U_ASSERT(opType
== URX_STRING_LEN
);
1564 U_ASSERT(stringLen
>= 2);
1566 if (fp
->fInputIdx
+ stringLen
> fActiveLimit
) {
1567 // No match. String is longer than the remaining input text.
1568 fHitEnd
= TRUE
; // TODO: See ticket 6074
1569 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
1573 const UChar
* pInp
= inputBuf
+ fp
->fInputIdx
;
1574 const UChar
* pPat
= litText
+stringStartIdx
;
1575 const UChar
* pEnd
= pInp
+ stringLen
;
1577 if (*pInp
== *pPat
) {
1581 // Successful Match.
1582 fp
->fInputIdx
+= stringLen
;
1587 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
1596 case URX_STATE_SAVE
:
1597 fp
= StateSave(fp
, opValue
, status
);
1602 // The match loop will exit via this path on a successful match,
1603 // when we reach the end of the pattern.
1604 if (toEnd
&& fp
->fInputIdx
!= fActiveLimit
) {
1605 // The pattern matched, but not to the end of input. Try some more.
1606 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
1612 // Start and End Capture stack frame variables are layout out like this:
1613 // fp->fExtra[opValue] - The start of a completed capture group
1614 // opValue+1 - The end of a completed capture group
1615 // opValue+2 - the start of a capture group whose end
1616 // has not yet been reached (and might not ever be).
1617 case URX_START_CAPTURE
:
1618 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
1619 fp
->fExtra
[opValue
+2] = fp
->fInputIdx
;
1623 case URX_END_CAPTURE
:
1624 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
1625 U_ASSERT(fp
->fExtra
[opValue
+2] >= 0); // Start pos for this group must be set.
1626 fp
->fExtra
[opValue
] = fp
->fExtra
[opValue
+2]; // Tentative start becomes real.
1627 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // End position
1628 U_ASSERT(fp
->fExtra
[opValue
] <= fp
->fExtra
[opValue
+1]);
1632 case URX_DOLLAR
: // $, test for End of line
1633 // or for position before new line at end of input
1634 if (fp
->fInputIdx
< fAnchorLimit
-2) {
1635 // We are no where near the end of input. Fail.
1636 // This is the common case. Keep it first.
1637 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
1640 if (fp
->fInputIdx
>= fAnchorLimit
) {
1641 // We really are at the end of input. Success.
1646 // If we are positioned just before a new-line that is located at the
1647 // end of input, succeed.
1648 if (fp
->fInputIdx
== fAnchorLimit
-1) {
1649 UChar32 c
= fInput
->char32At(fp
->fInputIdx
);
1650 if ((c
>=0x0a && c
<=0x0d) || c
==0x85 || c
==0x2028 || c
==0x2029) {
1651 // If not in the middle of a CR/LF sequence
1652 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& inputBuf
[fp
->fInputIdx
-1]==0x0d)) {
1653 // At new-line at end of input. Success
1661 if (fp
->fInputIdx
== fAnchorLimit
-2 &&
1662 fInput
->char32At(fp
->fInputIdx
) == 0x0d && fInput
->char32At(fp
->fInputIdx
+1) == 0x0a) {
1665 break; // At CR/LF at end of input. Success
1668 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
1673 case URX_DOLLAR_D
: // $, test for End of Line, in UNIX_LINES mode.
1674 if (fp
->fInputIdx
>= fAnchorLimit
-1) {
1675 // Either at the last character of input, or off the end.
1676 if (fp
->fInputIdx
== fAnchorLimit
-1) {
1677 // At last char of input. Success if it's a new line.
1678 if (fInput
->char32At(fp
->fInputIdx
) == 0x0a) {
1684 // Off the end of input. Success.
1691 // Not at end of input. Back-track out.
1692 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
1696 case URX_DOLLAR_M
: // $, test for End of line in multi-line mode
1698 if (fp
->fInputIdx
>= fAnchorLimit
) {
1699 // We really are at the end of input. Success.
1704 // If we are positioned just before a new-line, succeed.
1705 // It makes no difference where the new-line is within the input.
1706 UChar32 c
= inputBuf
[fp
->fInputIdx
];
1707 if ((c
>=0x0a && c
<=0x0d) || c
==0x85 ||c
==0x2028 || c
==0x2029) {
1708 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
1709 // In multi-line mode, hitting a new-line just before the end of input does not
1710 // set the hitEnd or requireEnd flags
1711 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& inputBuf
[fp
->fInputIdx
-1]==0x0d)) {
1715 // not at a new line. Fail.
1716 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
1721 case URX_DOLLAR_MD
: // $, test for End of line in multi-line and UNIX_LINES mode
1723 if (fp
->fInputIdx
>= fAnchorLimit
) {
1724 // We really are at the end of input. Success.
1726 fRequireEnd
= TRUE
; // Java set requireEnd in this case, even though
1727 break; // adding a new-line would not lose the match.
1729 // If we are not positioned just before a new-line, the test fails; backtrack out.
1730 // It makes no difference where the new-line is within the input.
1731 if (inputBuf
[fp
->fInputIdx
] != 0x0a) {
1732 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
1738 case URX_CARET
: // ^, test for start of line
1739 if (fp
->fInputIdx
!= fAnchorStart
) {
1740 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
1745 case URX_CARET_M
: // ^, test for start of line in mulit-line mode
1747 if (fp
->fInputIdx
== fAnchorStart
) {
1748 // We are at the start input. Success.
1751 // Check whether character just before the current pos is a new-line
1752 // unless we are at the end of input
1753 UChar c
= inputBuf
[fp
->fInputIdx
- 1];
1754 if ((fp
->fInputIdx
< fAnchorLimit
) &&
1755 ((c
<=0x0d && c
>=0x0a) || c
==0x85 ||c
==0x2028 || c
==0x2029)) {
1756 // It's a new-line. ^ is true. Success.
1757 // TODO: what should be done with positions between a CR and LF?
1760 // Not at the start of a line. Fail.
1761 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
1766 case URX_CARET_M_UNIX
: // ^, test for start of line in mulit-line + Unix-line mode
1768 U_ASSERT(fp
->fInputIdx
>= fAnchorStart
);
1769 if (fp
->fInputIdx
<= fAnchorStart
) {
1770 // We are at the start input. Success.
1773 // Check whether character just before the current pos is a new-line
1774 U_ASSERT(fp
->fInputIdx
<= fAnchorLimit
);
1775 UChar c
= inputBuf
[fp
->fInputIdx
- 1];
1777 // Not at the start of a line. Back-track out.
1778 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
1783 case URX_BACKSLASH_B
: // Test for word boundaries
1785 UBool success
= isWordBoundary(fp
->fInputIdx
);
1786 success
^= (opValue
!= 0); // flip sense for \B
1788 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
1794 case URX_BACKSLASH_BU
: // Test for word boundaries, Unicode-style
1796 UBool success
= isUWordBoundary(fp
->fInputIdx
);
1797 success
^= (opValue
!= 0); // flip sense for \B
1799 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
1805 case URX_BACKSLASH_D
: // Test for decimal digit
1807 if (fp
->fInputIdx
>= fActiveLimit
) {
1809 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
1813 UChar32 c
= fInput
->char32At(fp
->fInputIdx
);
1814 int8_t ctype
= u_charType(c
); // TODO: make a unicode set for this. Will be faster.
1815 UBool success
= (ctype
== U_DECIMAL_DIGIT_NUMBER
);
1816 success
^= (opValue
!= 0); // flip sense for \D
1818 fp
->fInputIdx
= fInput
->moveIndex32(fp
->fInputIdx
, 1);
1820 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
1826 case URX_BACKSLASH_G
: // Test for position at end of previous match
1827 if (!((fMatch
&& fp
->fInputIdx
==fMatchEnd
) || fMatch
==FALSE
&& fp
->fInputIdx
==fActiveStart
)) {
1828 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
1833 case URX_BACKSLASH_X
:
1834 // Match a Grapheme, as defined by Unicode TR 29.
1835 // Differs slightly from Perl, which consumes combining marks independently
1839 // Fail if at end of input
1840 if (fp
->fInputIdx
>= fActiveLimit
) {
1842 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
1846 // Examine (and consume) the current char.
1847 // Dispatch into a little state machine, based on the char.
1849 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
1850 UnicodeSet
**sets
= fPattern
->fStaticSets
;
1851 if (sets
[URX_GC_NORMAL
]->contains(c
)) goto GC_Extend
;
1852 if (sets
[URX_GC_CONTROL
]->contains(c
)) goto GC_Control
;
1853 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
1854 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
1855 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
1856 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
1857 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
1863 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
1864 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
1865 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
1866 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
1867 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
1868 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
1869 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
1873 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
1874 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
1875 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
1876 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
1877 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
1881 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
1882 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
1883 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
1884 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
1888 // Combining characters are consumed here
1890 if (fp
->fInputIdx
>= fActiveLimit
) {
1893 U16_GET(inputBuf
, 0, fp
->fInputIdx
, fActiveLimit
, c
);
1894 if (sets
[URX_GC_EXTEND
]->contains(c
) == FALSE
) {
1897 U16_FWD_1(inputBuf
, fp
->fInputIdx
, fActiveLimit
);
1902 // Most control chars stand alone (don't combine with combining chars),
1903 // except for that CR/LF sequence is a single grapheme cluster.
1904 if (c
== 0x0d && fp
->fInputIdx
< fActiveLimit
&& inputBuf
[fp
->fInputIdx
] == 0x0a) {
1909 if (fp
->fInputIdx
>= fActiveLimit
) {
1918 case URX_BACKSLASH_Z
: // Test for end of Input
1919 if (fp
->fInputIdx
< fAnchorLimit
) {
1920 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
1929 case URX_STATIC_SETREF
:
1931 // Test input character against one of the predefined sets
1932 // (Word Characters, for example)
1933 // The high bit of the op value is a flag for the match polarity.
1934 // 0: success if input char is in set.
1935 // 1: success if input char is not in set.
1936 if (fp
->fInputIdx
>= fActiveLimit
) {
1938 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
1942 UBool success
= ((opValue
& URX_NEG_SET
) == URX_NEG_SET
);
1943 opValue
&= ~URX_NEG_SET
;
1944 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
1946 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
1948 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
1949 if (s8
->contains(c
)) {
1953 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
1954 if (s
->contains(c
)) {
1959 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
1965 case URX_STAT_SETREF_N
:
1967 // Test input character for NOT being a member of one of
1968 // the predefined sets (Word Characters, for example)
1969 if (fp
->fInputIdx
>= fActiveLimit
) {
1971 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
1975 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
1977 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
1979 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
1980 if (s8
->contains(c
) == FALSE
) {
1984 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
1985 if (s
->contains(c
) == FALSE
) {
1990 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
1996 if (fp
->fInputIdx
>= fActiveLimit
) {
1998 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2001 // There is input left. Pick up one char and test it for set membership.
2003 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
2004 U_ASSERT(opValue
> 0 && opValue
< sets
->size());
2006 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
2007 if (s8
->contains(c
)) {
2011 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
2012 if (s
->contains(c
)) {
2013 // The character is in the set. A Match.
2017 // the character wasn't in the set. Back track out.
2018 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2024 // . matches anything, but stops at end-of-line.
2025 if (fp
->fInputIdx
>= fActiveLimit
) {
2026 // At end of input. Match failed. Backtrack out.
2028 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2031 // There is input left. Advance over one char, unless we've hit end-of-line
2033 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
2034 if (((c
& 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
2035 ((c
<=0x0d && c
>=0x0a) || c
==0x85 ||c
==0x2028 || c
==0x2029)) {
2036 // End of line in normal mode. . does not match.
2037 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2044 case URX_DOTANY_ALL
:
2046 // ., in dot-matches-all (including new lines) mode
2047 if (fp
->fInputIdx
>= fActiveLimit
) {
2048 // At end of input. Match failed. Backtrack out.
2050 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2053 // There is input left. Advance over one char, except if we are
2054 // at a cr/lf, advance over both of them.
2056 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
2057 if (c
==0x0d && fp
->fInputIdx
< fActiveLimit
) {
2058 // In the case of a CR/LF, we need to advance over both.
2059 UChar nextc
= inputBuf
[fp
->fInputIdx
];
2060 if (nextc
== 0x0a) {
2068 case URX_DOTANY_UNIX
:
2070 // '.' operator, matches all, but stops at end-of-line.
2071 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
2072 if (fp
->fInputIdx
>= fActiveLimit
) {
2073 // At end of input. Match failed. Backtrack out.
2075 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2078 // There is input left. Advance over one char, unless we've hit end-of-line
2080 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
2082 // End of line in normal mode. '.' does not match the \n
2083 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2090 fp
->fPatIdx
= opValue
;
2098 U_ASSERT(opValue
< fPattern
->fCompiledPat
->size());
2099 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
2100 fp
->fPatIdx
= opValue
; // Then JMP.
2104 // This opcode is used with (x)+, when x can match a zero length string.
2105 // Same as JMP_SAV, except conditional on the match having made forward progress.
2106 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
2107 // data address of the input position at the start of the loop.
2109 U_ASSERT(opValue
> 0 && opValue
< fPattern
->fCompiledPat
->size());
2110 int32_t stoOp
= pat
[opValue
-1];
2111 U_ASSERT(URX_TYPE(stoOp
) == URX_STO_INP_LOC
);
2112 int32_t frameLoc
= URX_VAL(stoOp
);
2113 U_ASSERT(frameLoc
>= 0 && frameLoc
< fFrameSize
);
2114 int32_t prevInputIdx
= fp
->fExtra
[frameLoc
];
2115 U_ASSERT(prevInputIdx
<= fp
->fInputIdx
);
2116 if (prevInputIdx
< fp
->fInputIdx
) {
2117 // The match did make progress. Repeat the loop.
2118 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
2119 fp
->fPatIdx
= opValue
;
2120 fp
->fExtra
[frameLoc
] = fp
->fInputIdx
;
2122 // If the input position did not advance, we do nothing here,
2123 // execution will fall out of the loop.
2129 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
2130 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
2132 // Pick up the three extra operands that CTR_INIT has, and
2133 // skip the pattern location counter past
2134 int32_t instrOperandLoc
= fp
->fPatIdx
;
2136 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
2137 int32_t minCount
= pat
[instrOperandLoc
+1];
2138 int32_t maxCount
= pat
[instrOperandLoc
+2];
2139 U_ASSERT(minCount
>=0);
2140 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
2141 U_ASSERT(loopLoc
>fp
->fPatIdx
);
2143 if (minCount
== 0) {
2144 fp
= StateSave(fp
, loopLoc
+1, status
);
2146 if (maxCount
== 0) {
2147 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2154 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
2155 int32_t initOp
= pat
[opValue
];
2156 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT
);
2157 int32_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
2158 int32_t minCount
= pat
[opValue
+2];
2159 int32_t maxCount
= pat
[opValue
+3];
2160 // Increment the counter. Note: we're not worrying about counter
2161 // overflow, since the data comes from UnicodeStrings, which
2162 // stores its length in an int32_t.
2164 U_ASSERT(*pCounter
> 0);
2165 if ((uint32_t)*pCounter
>= (uint32_t)maxCount
) {
2166 U_ASSERT(*pCounter
== maxCount
|| maxCount
== -1);
2169 if (*pCounter
>= minCount
) {
2170 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
2172 fp
->fPatIdx
= opValue
+ 4; // Loop back.
2176 case URX_CTR_INIT_NG
:
2178 // Initialize a non-greedy loop
2179 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
2180 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
2182 // Pick up the three extra operands that CTR_INIT has, and
2183 // skip the pattern location counter past
2184 int32_t instrOperandLoc
= fp
->fPatIdx
;
2186 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
2187 int32_t minCount
= pat
[instrOperandLoc
+1];
2188 int32_t maxCount
= pat
[instrOperandLoc
+2];
2189 U_ASSERT(minCount
>=0);
2190 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
2191 U_ASSERT(loopLoc
>fp
->fPatIdx
);
2193 if (minCount
== 0) {
2194 if (maxCount
!= 0) {
2195 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
2197 fp
->fPatIdx
= loopLoc
+1; // Continue with stuff after repeated block
2202 case URX_CTR_LOOP_NG
:
2204 // Non-greedy {min, max} loops
2205 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
2206 int32_t initOp
= pat
[opValue
];
2207 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT_NG
);
2208 int32_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
2209 int32_t minCount
= pat
[opValue
+2];
2210 int32_t maxCount
= pat
[opValue
+3];
2211 // Increment the counter. Note: we're not worrying about counter
2212 // overflow, since the data comes from UnicodeStrings, which
2213 // stores its length in an int32_t.
2215 U_ASSERT(*pCounter
> 0);
2217 if ((uint32_t)*pCounter
>= (uint32_t)maxCount
) {
2218 // The loop has matched the maximum permitted number of times.
2219 // Break out of here with no action. Matching will
2220 // continue with the following pattern.
2221 U_ASSERT(*pCounter
== maxCount
|| maxCount
== -1);
2225 if (*pCounter
< minCount
) {
2226 // We haven't met the minimum number of matches yet.
2227 // Loop back for another one.
2228 fp
->fPatIdx
= opValue
+ 4; // Loop back.
2230 // We do have the minimum number of matches.
2231 // Fall into the following pattern, but first do
2232 // a state save to the top of the loop, so that a failure
2233 // in the following pattern will try another iteration of the loop.
2234 fp
= StateSave(fp
, opValue
+ 4, status
);
2240 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
2241 fData
[opValue
] = fStack
->size();
2246 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
2247 int32_t newStackSize
= fData
[opValue
];
2248 U_ASSERT(newStackSize
<= fStack
->size());
2249 int32_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
2250 if (newFP
== (int32_t *)fp
) {
2254 for (i
=0; i
<fFrameSize
; i
++) {
2255 newFP
[i
] = ((int32_t *)fp
)[i
];
2257 fp
= (REStackFrame
*)newFP
;
2258 fStack
->setSize(newStackSize
);
2265 U_ASSERT(opValue
< fFrameSize
);
2266 int32_t groupStartIdx
= fp
->fExtra
[opValue
];
2267 int32_t groupEndIdx
= fp
->fExtra
[opValue
+1];
2268 U_ASSERT(groupStartIdx
<= groupEndIdx
);
2269 int32_t len
= groupEndIdx
-groupStartIdx
;
2270 if (groupStartIdx
< 0) {
2271 // This capture group has not participated in the match thus far,
2272 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
2276 // The capture group match was of an empty string.
2277 // Verified by testing: Perl matches succeed in this case, so
2282 UBool haveMatch
= FALSE
;
2283 if (fp
->fInputIdx
+ len
<= fActiveLimit
) {
2284 if (opType
== URX_BACKREF
) {
2285 if (u_strncmp(inputBuf
+groupStartIdx
, inputBuf
+fp
->fInputIdx
, len
) == 0) {
2289 if (u_strncasecmp(inputBuf
+groupStartIdx
, inputBuf
+fp
->fInputIdx
,
2290 len
, U_FOLD_CASE_DEFAULT
) == 0) {
2295 // TODO: probably need to do a partial string comparison, and only
2296 // set HitEnd if the available input matched. Ticket #6074
2300 fp
->fInputIdx
+= len
; // Match. Advance current input position.
2302 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
2307 case URX_STO_INP_LOC
:
2309 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
);
2310 fp
->fExtra
[opValue
] = fp
->fInputIdx
;
2316 int32_t instrOperandLoc
= fp
->fPatIdx
;
2318 int32_t dataLoc
= URX_VAL(pat
[instrOperandLoc
]);
2319 U_ASSERT(dataLoc
>= 0 && dataLoc
< fFrameSize
);
2320 int32_t savedInputIdx
= fp
->fExtra
[dataLoc
];
2321 U_ASSERT(savedInputIdx
<= fp
->fInputIdx
);
2322 if (savedInputIdx
< fp
->fInputIdx
) {
2323 fp
->fPatIdx
= opValue
; // JMP
2325 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no progress in loop.
2332 // Entering a lookahead block.
2333 // Save Stack Ptr, Input Pos.
2334 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
2335 fData
[opValue
] = fStack
->size();
2336 fData
[opValue
+1] = fp
->fInputIdx
;
2337 fActiveStart
= fLookStart
; // Set the match region change for
2338 fActiveLimit
= fLookLimit
; // transparent bounds.
2344 // Leaving a look-ahead block.
2345 // restore Stack Ptr, Input Pos to positions they had on entry to block.
2346 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
2347 int32_t stackSize
= fStack
->size();
2348 int32_t newStackSize
= fData
[opValue
];
2349 U_ASSERT(stackSize
>= newStackSize
);
2350 if (stackSize
> newStackSize
) {
2351 // Copy the current top frame back to the new (cut back) top frame.
2352 // This makes the capture groups from within the look-ahead
2353 // expression available.
2354 int32_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
2356 for (i
=0; i
<fFrameSize
; i
++) {
2357 newFP
[i
] = ((int32_t *)fp
)[i
];
2359 fp
= (REStackFrame
*)newFP
;
2360 fStack
->setSize(newStackSize
);
2362 fp
->fInputIdx
= fData
[opValue
+1];
2364 // Restore the active region bounds in the input string; they may have
2365 // been changed because of transparent bounds on a Region.
2366 fActiveStart
= fRegionStart
;
2367 fActiveLimit
= fRegionLimit
;
2372 if (fp
->fInputIdx
< fActiveLimit
) {
2374 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
2375 if (u_foldCase(c
, U_FOLD_CASE_DEFAULT
) == opValue
) {
2381 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2386 // Test input against a literal string.
2387 // Strings require two slots in the compiled pattern, one for the
2388 // offset to the string text, and one for the length.
2389 int32_t stringStartIdx
, stringLen
;
2390 stringStartIdx
= opValue
;
2392 op
= pat
[fp
->fPatIdx
];
2394 opType
= URX_TYPE(op
);
2395 opValue
= URX_VAL(op
);
2396 U_ASSERT(opType
== URX_STRING_LEN
);
2397 stringLen
= opValue
;
2399 int32_t stringEndIndex
= fp
->fInputIdx
+ stringLen
;
2400 if (stringEndIndex
<= fActiveLimit
) {
2401 if (u_strncasecmp(inputBuf
+fp
->fInputIdx
, litText
+stringStartIdx
,
2402 stringLen
, U_FOLD_CASE_DEFAULT
) == 0) {
2403 // Success. Advance the current input position.
2404 fp
->fInputIdx
= stringEndIndex
;
2408 // Insufficent input left for a match.
2409 fHitEnd
= TRUE
; // See ticket 6074
2411 // No match. Back up matching to a saved state
2412 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2418 // Entering a look-behind block.
2419 // Save Stack Ptr, Input Pos.
2420 // TODO: implement transparent bounds. Ticket #6067
2421 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
2422 fData
[opValue
] = fStack
->size();
2423 fData
[opValue
+1] = fp
->fInputIdx
;
2424 // Init the variable containing the start index for attempted matches.
2425 fData
[opValue
+2] = -1;
2426 // Save input string length, then reset to pin any matches to end at
2427 // the current position.
2428 fData
[opValue
+3] = fActiveLimit
;
2429 fActiveLimit
= fp
->fInputIdx
;
2436 // Positive Look-Behind, at top of loop checking for matches of LB expression
2437 // at all possible input starting positions.
2439 // Fetch the min and max possible match lengths. They are the operands
2440 // of this op in the pattern.
2441 int32_t minML
= pat
[fp
->fPatIdx
++];
2442 int32_t maxML
= pat
[fp
->fPatIdx
++];
2443 U_ASSERT(minML
<= maxML
);
2444 U_ASSERT(minML
>= 0);
2446 // Fetch (from data) the last input index where a match was attempted.
2447 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
2448 int32_t *lbStartIdx
= &fData
[opValue
+2];
2449 if (*lbStartIdx
< 0) {
2450 // First time through loop.
2451 *lbStartIdx
= fp
->fInputIdx
- minML
;
2453 // 2nd through nth time through the loop.
2454 // Back up start position for match by one.
2455 if (*lbStartIdx
== 0) {
2456 (*lbStartIdx
)--; // Because U16_BACK is unsafe starting at 0.
2458 U16_BACK_1(inputBuf
, 0, *lbStartIdx
);
2462 if (*lbStartIdx
< 0 || *lbStartIdx
< fp
->fInputIdx
- maxML
) {
2463 // We have tried all potential match starting points without
2464 // getting a match. Backtrack out, and out of the
2465 // Look Behind altogether.
2466 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2467 int32_t restoreInputLen
= fData
[opValue
+3];
2468 U_ASSERT(restoreInputLen
>= fActiveLimit
);
2469 U_ASSERT(restoreInputLen
<= fInput
->length());
2470 fActiveLimit
= restoreInputLen
;
2474 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
2475 // (successful match will fall off the end of the loop.)
2476 fp
= StateSave(fp
, fp
->fPatIdx
-3, status
);
2477 fp
->fInputIdx
= *lbStartIdx
;
2482 // End of a look-behind block, after a successful match.
2484 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
2485 if (fp
->fInputIdx
!= fActiveLimit
) {
2486 // The look-behind expression matched, but the match did not
2487 // extend all the way to the point that we are looking behind from.
2488 // FAIL out of here, which will take us back to the LB_CONT, which
2489 // will retry the match starting at another position or fail
2490 // the look-behind altogether, whichever is appropriate.
2491 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2495 // Look-behind match is good. Restore the orignal input string length,
2496 // which had been truncated to pin the end of the lookbehind match to the
2497 // position being looked-behind.
2498 int32_t originalInputLen
= fData
[opValue
+3];
2499 U_ASSERT(originalInputLen
>= fActiveLimit
);
2500 U_ASSERT(originalInputLen
<= fInput
->length());
2501 fActiveLimit
= originalInputLen
;
2508 // Negative Look-Behind, at top of loop checking for matches of LB expression
2509 // at all possible input starting positions.
2511 // Fetch the extra parameters of this op.
2512 int32_t minML
= pat
[fp
->fPatIdx
++];
2513 int32_t maxML
= pat
[fp
->fPatIdx
++];
2514 int32_t continueLoc
= pat
[fp
->fPatIdx
++];
2515 continueLoc
= URX_VAL(continueLoc
);
2516 U_ASSERT(minML
<= maxML
);
2517 U_ASSERT(minML
>= 0);
2518 U_ASSERT(continueLoc
> fp
->fPatIdx
);
2520 // Fetch (from data) the last input index where a match was attempted.
2521 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
2522 int32_t *lbStartIdx
= &fData
[opValue
+2];
2523 if (*lbStartIdx
< 0) {
2524 // First time through loop.
2525 *lbStartIdx
= fp
->fInputIdx
- minML
;
2527 // 2nd through nth time through the loop.
2528 // Back up start position for match by one.
2529 if (*lbStartIdx
== 0) {
2530 (*lbStartIdx
)--; // Because U16_BACK is unsafe starting at 0.
2532 U16_BACK_1(inputBuf
, 0, *lbStartIdx
);
2536 if (*lbStartIdx
< 0 || *lbStartIdx
< fp
->fInputIdx
- maxML
) {
2537 // We have tried all potential match starting points without
2538 // getting a match, which means that the negative lookbehind as
2539 // a whole has succeeded. Jump forward to the continue location
2540 int32_t restoreInputLen
= fData
[opValue
+3];
2541 U_ASSERT(restoreInputLen
>= fActiveLimit
);
2542 U_ASSERT(restoreInputLen
<= fInput
->length());
2543 fActiveLimit
= restoreInputLen
;
2544 fp
->fPatIdx
= continueLoc
;
2548 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
2549 // (successful match will cause a FAIL out of the loop altogether.)
2550 fp
= StateSave(fp
, fp
->fPatIdx
-4, status
);
2551 fp
->fInputIdx
= *lbStartIdx
;
2556 // End of a negative look-behind block, after a successful match.
2558 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
2559 if (fp
->fInputIdx
!= fActiveLimit
) {
2560 // The look-behind expression matched, but the match did not
2561 // extend all the way to the point that we are looking behind from.
2562 // FAIL out of here, which will take us back to the LB_CONT, which
2563 // will retry the match starting at another position or succeed
2564 // the look-behind altogether, whichever is appropriate.
2565 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2569 // Look-behind expression matched, which means look-behind test as
2572 // Restore the orignal input string length, which had been truncated
2573 // inorder to pin the end of the lookbehind match
2574 // to the position being looked-behind.
2575 int32_t originalInputLen
= fData
[opValue
+3];
2576 U_ASSERT(originalInputLen
>= fActiveLimit
);
2577 U_ASSERT(originalInputLen
<= fInput
->length());
2578 fActiveLimit
= originalInputLen
;
2580 // Restore original stack position, discarding any state saved
2581 // by the successful pattern match.
2582 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
2583 int32_t newStackSize
= fData
[opValue
];
2584 U_ASSERT(fStack
->size() > newStackSize
);
2585 fStack
->setSize(newStackSize
);
2587 // FAIL, which will take control back to someplace
2588 // prior to entering the look-behind test.
2589 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2595 // Loop Initialization for the optimized implementation of
2596 // [some character set]*
2597 // This op scans through all matching input.
2598 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
2600 U_ASSERT(opValue
> 0 && opValue
< sets
->size());
2601 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
2602 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
2604 // Loop through input, until either the input is exhausted or
2605 // we reach a character that is not a member of the set.
2606 int32_t ix
= fp
->fInputIdx
;
2608 if (ix
>= fActiveLimit
) {
2613 U16_NEXT(inputBuf
, ix
, fActiveLimit
, c
);
2615 if (s8
->contains(c
) == FALSE
) {
2616 U16_BACK_1(inputBuf
, 0, ix
);
2620 if (s
->contains(c
) == FALSE
) {
2621 U16_BACK_1(inputBuf
, 0, ix
);
2627 // If there were no matching characters, skip over the loop altogether.
2628 // The loop doesn't run at all, a * op always succeeds.
2629 if (ix
== fp
->fInputIdx
) {
2630 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
2634 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
2635 // must follow. It's operand is the stack location
2636 // that holds the starting input index for the match of this [set]*
2637 int32_t loopcOp
= pat
[fp
->fPatIdx
];
2638 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
2639 int32_t stackLoc
= URX_VAL(loopcOp
);
2640 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
2641 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
2644 // Save State to the URX_LOOP_C op that follows this one,
2645 // so that match failures in the following code will return to there.
2646 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
2647 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
2653 case URX_LOOP_DOT_I
:
2654 // Loop Initialization for the optimized implementation of .*
2655 // This op scans through all remaining input.
2656 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
2658 // Loop through input until the input is exhausted (we reach an end-of-line)
2659 // In DOTALL mode, we can just go straight to the end of the input.
2661 if ((opValue
& 1) == 1) {
2662 // Dot-matches-All mode. Jump straight to the end of the string.
2666 // NOT DOT ALL mode. Line endings do not match '.'
2667 // Scan forward until a line ending or end of input.
2670 if (ix
>= fActiveLimit
) {
2676 U16_NEXT(inputBuf
, ix
, fActiveLimit
, c
); // c = inputBuf[ix++]
2677 if ((c
& 0x7f) <= 0x29) { // Fast filter of non-new-line-s
2678 if ((c
== 0x0a) || // 0x0a is newline in both modes.
2679 ((opValue
& 2) == 0) && // IF not UNIX_LINES mode
2680 (c
<=0x0d && c
>=0x0a) || c
==0x85 ||c
==0x2028 || c
==0x2029) {
2681 // char is a line ending. Put the input pos back to the
2682 // line ending char, and exit the scanning loop.
2683 U16_BACK_1(inputBuf
, 0, ix
);
2690 // If there were no matching characters, skip over the loop altogether.
2691 // The loop doesn't run at all, a * op always succeeds.
2692 if (ix
== fp
->fInputIdx
) {
2693 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
2697 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
2698 // must follow. It's operand is the stack location
2699 // that holds the starting input index for the match of this .*
2700 int32_t loopcOp
= pat
[fp
->fPatIdx
];
2701 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
2702 int32_t stackLoc
= URX_VAL(loopcOp
);
2703 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
2704 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
2707 // Save State to the URX_LOOP_C op that follows this one,
2708 // so that match failures in the following code will return to there.
2709 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
2710 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
2718 U_ASSERT(opValue
>=0 && opValue
<fFrameSize
);
2719 int32_t terminalIdx
= fp
->fExtra
[opValue
];
2720 U_ASSERT(terminalIdx
<= fp
->fInputIdx
);
2721 if (terminalIdx
== fp
->fInputIdx
) {
2722 // We've backed up the input idx to the point that the loop started.
2723 // The loop is done. Leave here without saving state.
2724 // Subsequent failures won't come back here.
2727 // Set up for the next iteration of the loop, with input index
2728 // backed up by one from the last time through,
2729 // and a state save to this instruction in case the following code fails again.
2730 // (We're going backwards because this loop emulates stack unwinding, not
2731 // the initial scan forward.)
2732 U_ASSERT(fp
->fInputIdx
> 0);
2733 U16_BACK_1(inputBuf
, 0, fp
->fInputIdx
);
2734 if (inputBuf
[fp
->fInputIdx
] == 0x0a &&
2735 fp
->fInputIdx
> terminalIdx
&&
2736 inputBuf
[fp
->fInputIdx
-1] == 0x0d) {
2737 int32_t prevOp
= pat
[fp
->fPatIdx
-2];
2738 if (URX_TYPE(prevOp
) == URX_LOOP_DOT_I
) {
2739 // .*, stepping back over CRLF pair.
2745 fp
= StateSave(fp
, fp
->fPatIdx
-1, status
);
2752 // Trouble. The compiled pattern contains an entry with an
2753 // unrecognized type tag.
2757 if (U_FAILURE(status
)) {
2766 fLastMatchEnd
= fMatchEnd
;
2767 fMatchStart
= startIdx
;
2768 fMatchEnd
= fp
->fInputIdx
;
2770 REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart
, fMatchEnd
));
2776 REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
2780 fFrame
= fp
; // The active stack frame when the engine stopped.
2781 // Contains the capture group results that we need to
2789 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher
)
2793 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS