4 // Contains the implementation of class RegexMatcher,
5 // which is one of the main API classes for the ICU regular expression package.
8 **************************************************************************
9 * Copyright (C) 2002-2005 International Business Machines Corporation *
10 * and others. All rights reserved. *
11 **************************************************************************
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
17 #include "unicode/regex.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ustring.h"
21 #include "unicode/rbbi.h"
29 // #include <malloc.h> // Needed for heapcheck testing
33 //-----------------------------------------------------------------------------
35 // Constructor and Destructor
37 //-----------------------------------------------------------------------------
38 RegexMatcher::RegexMatcher(const RegexPattern
*pat
) {
43 fDeferredStatus
= U_ZERO_ERROR
;
44 fStack
= new UVector32(fDeferredStatus
);
48 fDeferredStatus
= U_ILLEGAL_ARGUMENT_ERROR
;
51 if (pat
->fDataSize
> (int32_t)(sizeof(fSmallData
)/sizeof(int32_t))) {
52 fData
= (int32_t *)uprv_malloc(pat
->fDataSize
* sizeof(int32_t));
54 if (fStack
== NULL
|| fData
== NULL
) {
55 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
58 reset(*RegexStaticSets::gStaticSets
->fEmptyString
);
63 RegexMatcher::RegexMatcher(const UnicodeString
®exp
, const UnicodeString
&input
,
64 uint32_t flags
, UErrorCode
&status
) {
66 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
67 fPattern
= fPatternOwned
;
69 fDeferredStatus
= U_ZERO_ERROR
;
70 fStack
= new UVector32(status
);
73 if (U_FAILURE(status
)) {
76 if (fPattern
->fDataSize
> (int32_t)(sizeof(fSmallData
)/sizeof(int32_t))) {
77 fData
= (int32_t *)uprv_malloc(fPattern
->fDataSize
* sizeof(int32_t));
79 if (fStack
== NULL
|| fData
== NULL
) {
80 status
= U_MEMORY_ALLOCATION_ERROR
;
86 RegexMatcher::RegexMatcher(const UnicodeString
®exp
,
87 uint32_t flags
, UErrorCode
&status
) {
90 fDeferredStatus
= U_ZERO_ERROR
;
91 fStack
= new UVector32(status
);
93 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
94 fPattern
= fPatternOwned
;
96 if (U_FAILURE(status
)) {
100 if (fPattern
->fDataSize
> (int32_t)(sizeof(fSmallData
)/sizeof(int32_t))) {
101 fData
= (int32_t *)uprv_malloc(fPattern
->fDataSize
* sizeof(int32_t));
103 if (fStack
== NULL
|| fData
== NULL
) {
104 status
= U_MEMORY_ALLOCATION_ERROR
;
106 reset(*RegexStaticSets::gStaticSets
->fEmptyString
);
111 RegexMatcher::~RegexMatcher() {
113 if (fData
!= fSmallData
) {
118 delete fPatternOwned
;
119 fPatternOwned
= NULL
;
122 #if UCONFIG_NO_BREAK_ITERATION==0
123 delete fWordBreakItr
;
129 static const UChar BACKSLASH
= 0x5c;
130 static const UChar DOLLARSIGN
= 0x24;
131 //--------------------------------------------------------------------------------
135 //--------------------------------------------------------------------------------
136 RegexMatcher
&RegexMatcher::appendReplacement(UnicodeString
&dest
,
137 const UnicodeString
&replacement
,
138 UErrorCode
&status
) {
139 if (U_FAILURE(status
)) {
142 if (U_FAILURE(fDeferredStatus
)) {
143 status
= fDeferredStatus
;
146 if (fMatch
== FALSE
) {
147 status
= U_REGEX_INVALID_STATE
;
151 // Copy input string from the end of previous match to start of current match
152 int32_t len
= fMatchStart
-fLastReplaceEnd
;
154 dest
.append(*fInput
, fLastReplaceEnd
, len
);
156 fLastReplaceEnd
= fMatchEnd
;
159 // scan the replacement text, looking for substitutions ($n) and \escapes.
160 // TODO: optimize this loop by efficiently scanning for '$' or '\',
161 // move entire ranges not containing substitutions.
162 int32_t replLen
= replacement
.length();
164 while (replIdx
<replLen
) {
165 UChar c
= replacement
.charAt(replIdx
);
167 if (c
== BACKSLASH
) {
168 // Backslash Escape. Copy the following char out without further checks.
169 // Note: Surrogate pairs don't need any special handling
170 // The second half wont be a '$' or a '\', and
171 // will move to the dest normally on the next
173 if (replIdx
>= replLen
) {
176 c
= replacement
.charAt(replIdx
);
178 if (c
==0x55/*U*/ || c
==0x75/*u*/) {
179 // We have a \udddd or \Udddddddd escape sequence.
180 UChar32 escapedChar
= replacement
.unescapeAt(replIdx
);
181 if (escapedChar
!= (UChar32
)0xFFFFFFFF) {
182 dest
.append(escapedChar
);
183 // TODO: Report errors for mal-formed \u escapes?
184 // As this is, the original sequence is output, which may be OK.
189 // Plain backslash escape. Just put out the escaped character.
195 if (c
!= DOLLARSIGN
) {
196 // Normal char, not a $. Copy it out without further checks.
201 // We've got a $. Pick up a capture group number if one follows.
202 // Consume at most the number of digits necessary for the largest capture
203 // number that is valid for this pattern.
205 int32_t numDigits
= 0;
206 int32_t groupNum
= 0;
209 if (replIdx
>= replLen
) {
212 digitC
= replacement
.char32At(replIdx
);
213 if (u_isdigit(digitC
) == FALSE
) {
216 replIdx
= replacement
.moveIndex32(replIdx
, 1);
217 groupNum
=groupNum
*10 + u_charDigitValue(digitC
);
219 if (numDigits
>= fPattern
->fMaxCaptureDigits
) {
225 if (numDigits
== 0) {
226 // The $ didn't introduce a group number at all.
227 // Treat it as just part of the substitution text.
228 dest
.append(DOLLARSIGN
);
232 // Finally, append the capture group data to the destination.
233 dest
.append(group(groupNum
, status
));
234 if (U_FAILURE(status
)) {
235 // Can fail if group number is out of range.
246 //--------------------------------------------------------------------------------
248 // appendTail Intended to be used in conjunction with appendReplacement()
249 // To the destination string, append everything following
250 // the last match position from the input string.
252 //--------------------------------------------------------------------------------
253 UnicodeString
&RegexMatcher::appendTail(UnicodeString
&dest
) {
254 int32_t len
= fInput
->length()-fMatchEnd
;
256 dest
.append(*fInput
, fMatchEnd
, len
);
263 //--------------------------------------------------------------------------------
267 //--------------------------------------------------------------------------------
268 int32_t RegexMatcher::end(UErrorCode
&err
) const {
274 int32_t RegexMatcher::end(int32_t group
, UErrorCode
&err
) const {
275 if (U_FAILURE(err
)) {
278 if (fMatch
== FALSE
) {
279 err
= U_REGEX_INVALID_STATE
;
282 if (group
< 0 || group
> fPattern
->fGroupMap
->size()) {
283 err
= U_INDEX_OUTOFBOUNDS_ERROR
;
290 // Get the position within the stack frame of the variables for
291 // this capture group.
292 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(group
-1);
293 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
294 U_ASSERT(groupOffset
>= 0);
295 e
= fFrame
->fExtra
[groupOffset
+ 1];
302 //--------------------------------------------------------------------------------
306 //--------------------------------------------------------------------------------
307 UBool
RegexMatcher::find() {
308 // Start at the position of the last match end. (Will be zero if the
309 // matcher has been reset.
311 if (U_FAILURE(fDeferredStatus
)) {
315 int32_t startPos
= fMatchEnd
;
318 // Save the position of any previous successful match.
319 fLastMatchEnd
= fMatchEnd
;
321 if (fMatchStart
== fMatchEnd
) {
322 // Previous match had zero length. Move start position up one position
323 // to avoid sending find() into a loop on zero-length matches.
324 if (startPos
== fInput
->length()) {
328 startPos
= fInput
->moveIndex32(startPos
, 1);
331 if (fLastMatchEnd
>= 0) {
332 // A previous find() failed to match. Don't try again.
333 // (without this test, a pattern with a zero-length match
334 // could match again at the end of an input string.)
339 int32_t inputLen
= fInput
->length();
341 // Compute the position in the input string beyond which a match can not begin, because
342 // the minimum length match would extend past the end of the input.
343 int32_t testLen
= inputLen
- fPattern
->fMinMatchLen
;
344 if (startPos
> testLen
) {
349 const UChar
*inputBuf
= fInput
->getBuffer();
351 U_ASSERT(startPos
>= 0);
353 switch (fPattern
->fStartType
) {
355 // No optimization was found.
356 // Try a match at each input position.
358 MatchAt(startPos
, fDeferredStatus
);
359 if (U_FAILURE(fDeferredStatus
)) {
365 if (startPos
>= testLen
) {
368 U16_FWD_1(inputBuf
, startPos
, inputLen
);
369 // Note that it's perfectly OK for a pattern to have a zero-length
370 // match at the end of a string, so we must make sure that the loop
371 // runs with startPos == testLen the last time through.
376 // Matches are only possible at the start of the input string
377 // (pattern begins with ^ or \A)
382 MatchAt(startPos
, fDeferredStatus
);
383 if (U_FAILURE(fDeferredStatus
)) {
391 // Match may start on any char from a pre-computed set.
392 U_ASSERT(fPattern
->fMinMatchLen
> 0);
394 int32_t pos
= startPos
;
395 U16_NEXT(inputBuf
, startPos
, inputLen
, c
); // like c = inputBuf[startPos++];
396 if (c
<256 && fPattern
->fInitialChars8
->contains(c
) ||
397 c
>=256 && fPattern
->fInitialChars
->contains(c
)) {
398 MatchAt(pos
, fDeferredStatus
);
399 if (U_FAILURE(fDeferredStatus
)) {
406 if (pos
>= testLen
) {
417 // Match starts on exactly one char.
418 U_ASSERT(fPattern
->fMinMatchLen
> 0);
419 UChar32 theChar
= fPattern
->fInitialChar
;
421 int32_t pos
= startPos
;
422 U16_NEXT(inputBuf
, startPos
, inputLen
, c
); // like c = inputBuf[startPos++];
424 MatchAt(pos
, fDeferredStatus
);
425 if (U_FAILURE(fDeferredStatus
)) {
432 if (pos
>= testLen
) {
444 MatchAt(startPos
, fDeferredStatus
);
445 if (U_FAILURE(fDeferredStatus
)) {
451 U16_NEXT(inputBuf
, startPos
, inputLen
, c
); // like c = inputBuf[startPos++];
455 c
= inputBuf
[startPos
-1];
456 if (((c
& 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
457 ((c
<=0x0d && c
>=0x0a) || c
==0x85 ||c
==0x2028 || c
==0x2029 )) {
458 if (c
== 0x0d && startPos
< inputLen
&& inputBuf
[startPos
] == 0x0a) {
461 MatchAt(startPos
, fDeferredStatus
);
462 if (U_FAILURE(fDeferredStatus
)) {
469 if (startPos
>= testLen
) {
473 U16_NEXT(inputBuf
, startPos
, inputLen
, c
); // like c = inputBuf[startPos++];
474 // Note that it's perfectly OK for a pattern to have a zero-length
475 // match at the end of a string, so we must make sure that the loop
476 // runs with startPos == testLen the last time through.
490 UBool
RegexMatcher::find(int32_t start
, UErrorCode
&status
) {
491 if (U_FAILURE(status
)) {
494 if (U_FAILURE(fDeferredStatus
)) {
495 status
= fDeferredStatus
;
498 int32_t inputLen
= fInput
->length();
499 if (start
< 0 || start
> inputLen
) {
500 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
510 //--------------------------------------------------------------------------------
514 //--------------------------------------------------------------------------------
515 UnicodeString
RegexMatcher::group(UErrorCode
&status
) const {
516 return group(0, status
);
521 UnicodeString
RegexMatcher::group(int32_t groupNum
, UErrorCode
&status
) const {
522 int32_t s
= start(groupNum
, status
);
523 int32_t e
= end(groupNum
, status
);
525 // Note: calling start() and end() above will do all necessary checking that
526 // the group number is OK and that a match exists. status will be set.
527 if (U_FAILURE(status
)) {
528 return UnicodeString();
530 if (U_FAILURE(fDeferredStatus
)) {
531 status
= fDeferredStatus
;
532 return UnicodeString();
536 // A capture group wasn't part of the match
537 return UnicodeString();
540 return UnicodeString(*fInput
, s
, e
-s
);
546 int32_t RegexMatcher::groupCount() const {
547 return fPattern
->fGroupMap
->size();
552 const UnicodeString
&RegexMatcher::input() const {
559 //--------------------------------------------------------------------------------
563 //--------------------------------------------------------------------------------
564 UBool
RegexMatcher::lookingAt(UErrorCode
&status
) {
565 if (U_FAILURE(status
)) {
568 if (U_FAILURE(fDeferredStatus
)) {
569 status
= fDeferredStatus
;
578 UBool
RegexMatcher::lookingAt(int32_t start
, UErrorCode
&status
) {
579 if (U_FAILURE(status
)) {
582 if (U_FAILURE(fDeferredStatus
)) {
583 status
= fDeferredStatus
;
586 if (start
< 0 || start
> fInput
->length()) {
587 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
591 MatchAt(start
, status
);
597 //--------------------------------------------------------------------------------
601 //--------------------------------------------------------------------------------
602 UBool
RegexMatcher::matches(UErrorCode
&status
) {
603 if (U_FAILURE(status
)) {
606 if (U_FAILURE(fDeferredStatus
)) {
607 status
= fDeferredStatus
;
612 UBool success
= (fMatch
&& fMatchEnd
==fInput
->length());
617 UBool
RegexMatcher::matches(int32_t start
, UErrorCode
&status
) {
618 if (U_FAILURE(status
)) {
621 if (U_FAILURE(fDeferredStatus
)) {
622 status
= fDeferredStatus
;
625 if (start
< 0 || start
> fInput
->length()) {
626 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
630 MatchAt(start
, status
);
631 UBool success
= (fMatch
&& fMatchEnd
==fInput
->length());
637 const RegexPattern
&RegexMatcher::pattern() const {
643 //--------------------------------------------------------------------------------
647 //--------------------------------------------------------------------------------
648 UnicodeString
RegexMatcher::replaceAll(const UnicodeString
&replacement
, UErrorCode
&status
) {
649 if (U_FAILURE(status
)) {
652 if (U_FAILURE(fDeferredStatus
)) {
653 status
= fDeferredStatus
;
656 UnicodeString destString
;
657 for (reset(); find(); ) {
658 appendReplacement(destString
, replacement
, status
);
659 if (U_FAILURE(status
)) {
663 appendTail(destString
);
670 //--------------------------------------------------------------------------------
674 //--------------------------------------------------------------------------------
675 UnicodeString
RegexMatcher::replaceFirst(const UnicodeString
&replacement
, UErrorCode
&status
) {
676 if (U_FAILURE(status
)) {
679 if (U_FAILURE(fDeferredStatus
)) {
680 status
= fDeferredStatus
;
689 UnicodeString destString
;
690 appendReplacement(destString
, replacement
, status
);
691 appendTail(destString
);
697 //--------------------------------------------------------------------------------
701 //--------------------------------------------------------------------------------
702 RegexMatcher
&RegexMatcher::reset() {
714 RegexMatcher
&RegexMatcher::reset(const UnicodeString
&input
) {
717 if (fWordBreakItr
!= NULL
) {
718 #if UCONFIG_NO_BREAK_ITERATION==0
719 fWordBreakItr
->setText(input
);
725 RegexMatcher
&RegexMatcher::reset(const UChar
*) {
726 fDeferredStatus
= U_INTERNAL_PROGRAM_ERROR
;
731 RegexMatcher
&RegexMatcher::reset(int32_t position
, UErrorCode
&status
) {
732 if (U_FAILURE(status
)) {
736 if (position
< 0 || position
>= fInput
->length()) {
737 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
740 fMatchEnd
= position
;
748 //--------------------------------------------------------------------------------
752 //--------------------------------------------------------------------------------
753 void RegexMatcher::setTrace(UBool state
) {
759 //---------------------------------------------------------------------
763 //---------------------------------------------------------------------
764 int32_t RegexMatcher::split(const UnicodeString
&input
,
765 UnicodeString dest
[],
766 int32_t destCapacity
,
770 // Check arguements for validity
772 if (U_FAILURE(status
)) {
776 if (destCapacity
< 1) {
777 status
= U_ILLEGAL_ARGUMENT_ERROR
;
783 // Reset for the input text
786 int32_t inputLen
= input
.length();
787 int32_t nextOutputStringStart
= 0;
794 // Loop through the input text, searching for the delimiter pattern
797 int32_t numCaptureGroups
= fPattern
->fGroupMap
->size();
799 if (i
>=destCapacity
-1) {
800 // There is one or zero output string left.
801 // Fill the last output string with whatever is left from the input, then exit the loop.
802 // ( i will be == destCapicity if we filled the output array while processing
803 // capture groups of the delimiter expression, in which case we will discard the
804 // last capture group saved in favor of the unprocessed remainder of the
807 int32_t remainingLength
= inputLen
-nextOutputStringStart
;
808 if (remainingLength
> 0) {
809 dest
[i
].setTo(input
, nextOutputStringStart
, remainingLength
);
814 // We found another delimiter. Move everything from where we started looking
815 // up until the start of the delimiter into the next output string.
816 int32_t fieldLen
= fMatchStart
- nextOutputStringStart
;
817 dest
[i
].setTo(input
, nextOutputStringStart
, fieldLen
);
818 nextOutputStringStart
= fMatchEnd
;
820 // If the delimiter pattern has capturing parentheses, the captured
821 // text goes out into the next n destination strings.
823 for (groupNum
=1; groupNum
<=numCaptureGroups
; groupNum
++) {
824 if (i
==destCapacity
-1) {
828 dest
[i
] = group(groupNum
, status
);
831 if (nextOutputStringStart
== inputLen
) {
832 // The delimiter was at the end of the string. We're done.
839 // We ran off the end of the input while looking for the next delimiter.
840 // All the remaining text goes into the current output string.
841 dest
[i
].setTo(input
, nextOutputStringStart
, inputLen
-nextOutputStringStart
);
850 //--------------------------------------------------------------------------------
854 //--------------------------------------------------------------------------------
855 int32_t RegexMatcher::start(UErrorCode
&status
) const {
856 return start(0, status
);
862 int32_t RegexMatcher::start(int32_t group
, UErrorCode
&status
) const {
863 if (U_FAILURE(status
)) {
866 if (U_FAILURE(fDeferredStatus
)) {
867 status
= fDeferredStatus
;
870 if (fMatch
== FALSE
) {
871 status
= U_REGEX_INVALID_STATE
;
874 if (group
< 0 || group
> fPattern
->fGroupMap
->size()) {
875 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
882 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(group
-1);
883 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
884 U_ASSERT(groupOffset
>= 0);
885 s
= fFrame
->fExtra
[groupOffset
];
892 //================================================================================
894 // Code following this point in this file is the internal
895 // Match Engine Implementation.
897 //================================================================================
900 //--------------------------------------------------------------------------------
903 // Discard any previous contents of the state save stack, and initialize a
904 // new stack frame to all -1. The -1s are needed for capture group limits,
905 // where they indicate that a group has not yet matched anything.
906 //--------------------------------------------------------------------------------
907 REStackFrame
*RegexMatcher::resetStack() {
908 // Discard any previous contents of the state save stack, and initialize a
909 // new stack frame to all -1. The -1s are needed for capture group limits, where
910 // they indicate that a group has not yet matched anything.
911 fStack
->removeAllElements();
913 int32_t *iFrame
= fStack
->reserveBlock(fPattern
->fFrameSize
, fDeferredStatus
);
915 for (i
=0; i
<fPattern
->fFrameSize
; i
++) {
918 return (REStackFrame
*)iFrame
;
923 //--------------------------------------------------------------------------------
926 // in perl, "xab..cd..", \b is true at positions 0,3,5,7
928 // If the current char is a combining mark,
930 // Else Scan backwards to the first non-combining char.
931 // We are at a boundary if the this char and the original chars are
932 // opposite in membership in \w set
934 // parameters: pos - the current position in the input buffer
936 //--------------------------------------------------------------------------------
937 UBool
RegexMatcher::isWordBoundary(int32_t pos
) {
938 UBool isBoundary
= FALSE
;
939 UBool cIsWord
= FALSE
;
941 // Determine whether char c at current position is a member of the word set of chars.
942 // If we're off the end of the string, behave as though we're not at a word char.
943 if (pos
< fInput
->length()) {
944 UChar32 c
= fInput
->char32At(pos
);
945 int8_t ctype
= u_charType(c
);
946 if (ctype
==U_NON_SPACING_MARK
|| ctype
==U_ENCLOSING_MARK
) {
947 // Current char is a combining one. Not a boundary.
950 cIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(c
);
953 // Back up until we come to a non-combining char, determine whether
954 // that char is a word char.
955 UBool prevCIsWord
= FALSE
;
956 int32_t prevPos
= pos
;
961 prevPos
= fInput
->moveIndex32(prevPos
, -1);
962 UChar32 prevChar
= fInput
->char32At(prevPos
);
963 int8_t prevCType
= u_charType(prevChar
);
964 if (!(prevCType
==U_NON_SPACING_MARK
|| prevCType
==U_ENCLOSING_MARK
)) {
965 prevCIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(prevChar
);
969 isBoundary
= cIsWord
^ prevCIsWord
;
973 //--------------------------------------------------------------------------------
977 // Test for a word boundary using RBBI word break.
979 // parameters: pos - the current position in the input buffer
981 //--------------------------------------------------------------------------------
982 UBool
RegexMatcher::isUWordBoundary(int32_t pos
) {
983 UBool returnVal
= FALSE
;
984 #if UCONFIG_NO_BREAK_ITERATION==0
985 UErrorCode status
= U_ZERO_ERROR
;
987 // If we haven't yet created a break iterator for this matcher, do it now.
988 if (fWordBreakItr
== NULL
) {
990 (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getEnglish(), status
);
991 if (U_FAILURE(status
)) {
992 // TODO: reliable error reporting for BI failures.
995 fWordBreakItr
->setText(*fInput
);
998 returnVal
= fWordBreakItr
->isBoundary(pos
);
1003 //--------------------------------------------------------------------------------
1006 // Make a new stack frame, initialized as a copy of the current stack frame.
1007 // Set the pattern index in the original stack frame from the operand value
1008 // in the opcode. Execution of the engine continues with the state in
1009 // the newly created stack frame
1011 // Note that reserveBlock() may grow the stack, resulting in the
1012 // whole thing being relocated in memory.
1014 //--------------------------------------------------------------------------------
1015 inline REStackFrame
*RegexMatcher::StateSave(REStackFrame
*fp
, int32_t savePatIdx
, int32_t frameSize
, UErrorCode
&status
) {
1016 // push storage for a new frame.
1017 int32_t *newFP
= fStack
->reserveBlock(frameSize
, status
);
1018 fp
= (REStackFrame
*)(newFP
- frameSize
); // in case of realloc of stack.
1020 // New stack frame = copy of old top frame.
1021 int32_t *source
= (int32_t *)fp
;
1022 int32_t *dest
= newFP
;
1024 *dest
++ = *source
++;
1025 if (source
== newFP
) {
1030 fp
->fPatIdx
= savePatIdx
;
1031 return (REStackFrame
*)newFP
;
1035 //--------------------------------------------------------------------------------
1037 // MatchAt This is the actual matching engine.
1039 //--------------------------------------------------------------------------------
1040 void RegexMatcher::MatchAt(int32_t startIdx
, UErrorCode
&status
) {
1041 UBool isMatch
= FALSE
; // True if the we have a match.
1043 int32_t op
; // Operation from the compiled pattern, split into
1044 int32_t opType
; // the opcode
1045 int32_t opValue
; // and the operand value.
1047 #ifdef REGEX_RUN_DEBUG
1050 printf("MatchAt(startIdx=%d)\n", startIdx
);
1051 printf("Original Pattern: ");
1053 for (i
=0; i
<fPattern
->fPattern
.length(); i
++) {
1054 printf("%c", fPattern
->fPattern
.charAt(i
));
1057 printf("Input String: ");
1058 for (i
=0; i
<fInput
->length(); i
++) {
1059 UChar c
= fInput
->charAt(i
);
1060 if (c
<32 || c
>256) {
1070 if (U_FAILURE(status
)) {
1074 // Cache frequently referenced items from the compiled pattern
1075 // in local variables.
1077 int32_t *pat
= fPattern
->fCompiledPat
->getBuffer();
1079 const UChar
*litText
= fPattern
->fLiteralText
.getBuffer();
1080 UVector
*sets
= fPattern
->fSets
;
1081 int32_t inputLen
= fInput
->length();
1082 const UChar
*inputBuf
= fInput
->getBuffer();
1084 REStackFrame
*fp
= resetStack();
1085 int32_t frameSize
= fPattern
->fFrameSize
;
1088 fp
->fInputIdx
= startIdx
;
1090 // Zero out the pattern's static data
1092 for (i
= 0; i
<fPattern
->fDataSize
; i
++) {
1097 // Main loop for interpreting the compiled pattern.
1098 // One iteration of the loop per pattern operation performed.
1102 if (_heapchk() != _HEAPOK
) {
1103 fprintf(stderr
, "Heap Trouble\n");
1106 op
= pat
[fp
->fPatIdx
];
1107 opType
= URX_TYPE(op
);
1108 opValue
= URX_VAL(op
);
1109 #ifdef REGEX_RUN_DEBUG
1111 printf("inputIdx=%d inputChar=%c sp=%3d ", fp
->fInputIdx
,
1112 fInput
->char32At(fp
->fInputIdx
), (int32_t *)fp
-fStack
->getBuffer());
1113 fPattern
->dumpOp(fp
->fPatIdx
);
1126 // Force a backtrack. In some circumstances, the pattern compiler
1127 // will notice that the pattern can't possibly match anything, and will
1128 // emit one of these at that point.
1129 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1134 if (fp
->fInputIdx
< inputLen
) {
1136 U16_NEXT(inputBuf
, fp
->fInputIdx
, inputLen
, c
);
1141 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1147 // Test input against a literal string.
1148 // Strings require two slots in the compiled pattern, one for the
1149 // offset to the string text, and one for the length.
1150 int32_t stringStartIdx
= opValue
;
1153 op
= pat
[fp
->fPatIdx
]; // Fetch the second operand
1155 opType
= URX_TYPE(op
);
1156 stringLen
= URX_VAL(op
);
1157 U_ASSERT(opType
== URX_STRING_LEN
);
1158 U_ASSERT(stringLen
>= 2);
1160 if (fp
->fInputIdx
+ stringLen
> inputLen
) {
1161 // No match. String is longer than the remaining input text.
1162 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1166 const UChar
* pInp
= inputBuf
+ fp
->fInputIdx
;
1167 const UChar
* pPat
= litText
+stringStartIdx
;
1168 const UChar
* pEnd
= pInp
+ stringLen
;
1170 if (*pInp
== *pPat
) {
1174 // Successful Match.
1175 fp
->fInputIdx
+= stringLen
;
1180 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1189 case URX_STATE_SAVE
:
1190 fp
= StateSave(fp
, opValue
, frameSize
, status
);
1195 // The match loop will exit via this path on a successful match,
1196 // when we reach the end of the pattern.
1200 // Start and End Capture stack frame variables are layout out like this:
1201 // fp->fExtra[opValue] - The start of a completed capture group
1202 // opValue+1 - The end of a completed capture group
1203 // opValue+2 - the start of a capture group whose end
1204 // has not yet been reached (and might not ever be).
1205 case URX_START_CAPTURE
:
1206 U_ASSERT(opValue
>= 0 && opValue
< frameSize
-3);
1207 fp
->fExtra
[opValue
+2] = fp
->fInputIdx
;
1211 case URX_END_CAPTURE
:
1212 U_ASSERT(opValue
>= 0 && opValue
< frameSize
-3);
1213 U_ASSERT(fp
->fExtra
[opValue
+2] >= 0); // Start pos for this group must be set.
1214 fp
->fExtra
[opValue
] = fp
->fExtra
[opValue
+2]; // Tentative start becomes real.
1215 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // End position
1216 U_ASSERT(fp
->fExtra
[opValue
] <= fp
->fExtra
[opValue
+1]);
1220 case URX_DOLLAR
: // $, test for End of line
1221 // or for position before new line at end of input
1222 if (fp
->fInputIdx
< inputLen
-2) {
1223 // We are no where near the end of input. Fail.
1224 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1227 if (fp
->fInputIdx
>= inputLen
) {
1228 // We really are at the end of input. Success.
1231 // If we are positioned just before a new-line that is located at the
1232 // end of input, succeed.
1233 if (fp
->fInputIdx
== inputLen
-1) {
1234 UChar32 c
= fInput
->char32At(fp
->fInputIdx
);
1235 if ((c
<=0x0d && c
>=0x0a) || c
==0x85 ||c
==0x2028 || c
==0x2029) {
1236 // If not in the middle of a CR/LF sequence
1237 if ( !(c
==0x0a && fp
->fInputIdx
>0 && inputBuf
[fp
->fInputIdx
-1]==0x0d)) {
1239 // At new-line at end of input. Success
1244 if (fp
->fInputIdx
== inputLen
-2) {
1245 if (fInput
->char32At(fp
->fInputIdx
) == 0x0d && fInput
->char32At(fp
->fInputIdx
+1) == 0x0a) {
1246 break; // At CR/LF at end of input. Success
1250 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1255 case URX_DOLLAR_M
: // $, test for End of line in multi-line mode
1257 if (fp
->fInputIdx
>= inputLen
) {
1258 // We really are at the end of input. Success.
1261 // If we are positioned just before a new-line, succeed.
1262 // It makes no difference where the new-line is within the input.
1263 UChar32 c
= inputBuf
[fp
->fInputIdx
];
1264 if ((c
<=0x0d && c
>=0x0a) || c
==0x85 ||c
==0x2028 || c
==0x2029) {
1265 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
1266 if ( !(c
==0x0a && fp
->fInputIdx
>0 && inputBuf
[fp
->fInputIdx
-1]==0x0d)) {
1267 break; // At new-line at end of input. Success
1271 // not at a new line. Fail.
1272 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1277 case URX_CARET
: // ^, test for start of line
1278 if (fp
->fInputIdx
!= 0) {
1279 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1284 case URX_CARET_M
: // ^, test for start of line in mulit-line mode
1286 if (fp
->fInputIdx
== 0) {
1287 // We are at the start input. Success.
1290 // Check whether character just before the current pos is a new-line
1291 // unless we are at the end of input
1292 UChar c
= inputBuf
[fp
->fInputIdx
- 1];
1293 if ((fp
->fInputIdx
< inputLen
) &&
1294 ((c
<=0x0d && c
>=0x0a) || c
==0x85 ||c
==0x2028 || c
==0x2029)) {
1295 // It's a new-line. ^ is true. Success.
1298 // Not at the start of a line. Fail.
1299 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1304 case URX_BACKSLASH_B
: // Test for word boundaries
1306 UBool success
= isWordBoundary(fp
->fInputIdx
);
1307 success
^= (opValue
!= 0); // flip sense for \B
1309 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1315 case URX_BACKSLASH_BU
: // Test for word boundaries, Unicode-style
1317 UBool success
= isUWordBoundary(fp
->fInputIdx
);
1318 success
^= (opValue
!= 0); // flip sense for \B
1320 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1326 case URX_BACKSLASH_D
: // Test for decimal digit
1328 if (fp
->fInputIdx
>= inputLen
) {
1329 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1333 UChar32 c
= fInput
->char32At(fp
->fInputIdx
);
1334 int8_t ctype
= u_charType(c
);
1335 UBool success
= (ctype
== U_DECIMAL_DIGIT_NUMBER
);
1336 success
^= (opValue
!= 0); // flip sense for \D
1338 fp
->fInputIdx
= fInput
->moveIndex32(fp
->fInputIdx
, 1);
1340 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1348 case URX_BACKSLASH_G
: // Test for position at end of previous match
1349 if (!((fMatch
&& fp
->fInputIdx
==fMatchEnd
) || fMatch
==FALSE
&& fp
->fInputIdx
==0)) {
1350 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1355 case URX_BACKSLASH_X
:
1356 // Match a Grapheme, as defined by Unicode TR 29.
1357 // Differs slightly from Perl, which consumes combining marks independently
1361 // Fail if at end of input
1362 if (fp
->fInputIdx
>= inputLen
) {
1363 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1367 // Examine (and consume) the current char.
1368 // Dispatch into a little state machine, based on the char.
1370 U16_NEXT(inputBuf
, fp
->fInputIdx
, inputLen
, c
);
1371 UnicodeSet
**sets
= fPattern
->fStaticSets
;
1372 if (sets
[URX_GC_NORMAL
]->contains(c
)) goto GC_Extend
;
1373 if (sets
[URX_GC_CONTROL
]->contains(c
)) goto GC_Control
;
1374 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
1375 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
1376 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
1377 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
1378 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
1384 if (fp
->fInputIdx
>= inputLen
) goto GC_Done
;
1385 U16_NEXT(inputBuf
, fp
->fInputIdx
, inputLen
, c
);
1386 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
1387 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
1388 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
1389 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
1390 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
1394 if (fp
->fInputIdx
>= inputLen
) goto GC_Done
;
1395 U16_NEXT(inputBuf
, fp
->fInputIdx
, inputLen
, c
);
1396 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
1397 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
1398 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
1402 if (fp
->fInputIdx
>= inputLen
) goto GC_Done
;
1403 U16_NEXT(inputBuf
, fp
->fInputIdx
, inputLen
, c
);
1404 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
1405 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
1409 // Combining characters are consumed here
1411 if (fp
->fInputIdx
>= inputLen
) {
1414 U16_GET(inputBuf
, 0, fp
->fInputIdx
, inputLen
, c
);
1415 if (sets
[URX_GC_EXTEND
]->contains(c
) == FALSE
) {
1418 U16_FWD_1(inputBuf
, fp
->fInputIdx
, inputLen
);
1423 // Most control chars stand alone (don't combine with combining chars),
1424 // except for that CR/LF sequence is a single grapheme cluster.
1425 if (c
== 0x0d && fp
->fInputIdx
< inputLen
&& inputBuf
[fp
->fInputIdx
] == 0x0a) {
1436 case URX_BACKSLASH_Z
: // Test for end of line
1437 if (fp
->fInputIdx
< inputLen
) {
1438 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1444 case URX_STATIC_SETREF
:
1446 // Test input character against one of the predefined sets
1447 // (Word Characters, for example)
1448 // The high bit of the op value is a flag for the match polarity.
1449 // 0: success if input char is in set.
1450 // 1: success if input char is not in set.
1451 if (fp
->fInputIdx
>= inputLen
) {
1452 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1456 UBool success
= ((opValue
& URX_NEG_SET
) == URX_NEG_SET
);
1457 opValue
&= ~URX_NEG_SET
;
1458 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
1460 U16_NEXT(inputBuf
, fp
->fInputIdx
, inputLen
, c
);
1462 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
1463 if (s8
->contains(c
)) {
1467 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
1468 if (s
->contains(c
)) {
1473 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1479 case URX_STAT_SETREF_N
:
1481 // Test input character for NOT being a member of one of
1482 // the predefined sets (Word Characters, for example)
1483 if (fp
->fInputIdx
>= inputLen
) {
1484 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1488 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
1490 U16_NEXT(inputBuf
, fp
->fInputIdx
, inputLen
, c
);
1492 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
1493 if (s8
->contains(c
) == FALSE
) {
1497 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
1498 if (s
->contains(c
) == FALSE
) {
1503 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1509 if (fp
->fInputIdx
< inputLen
) {
1510 // There is input left. Pick up one char and test it for set membership.
1512 U16_NEXT(inputBuf
, fp
->fInputIdx
, inputLen
, c
);
1513 U_ASSERT(opValue
> 0 && opValue
< sets
->size());
1515 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
1516 if (s8
->contains(c
)) {
1521 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
1522 if (s
->contains(c
)) {
1523 // The character is in the set. A Match.
1528 // Either at end of input, or the character wasn't in the set.
1529 // Either way, we need to back track out.
1530 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1536 // . matches anything, but stops at end-of-line.
1537 if (fp
->fInputIdx
>= inputLen
) {
1538 // At end of input. Match failed. Backtrack out.
1539 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1542 // There is input left. Advance over one char, unless we've hit end-of-line
1544 U16_NEXT(inputBuf
, fp
->fInputIdx
, inputLen
, c
);
1545 if (((c
& 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
1546 ((c
<=0x0d && c
>=0x0a) || c
==0x85 ||c
==0x2028 || c
==0x2029)) {
1547 // End of line in normal mode. . does not match.
1548 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1555 case URX_DOTANY_ALL
:
1557 // ., in dot-matches-all (including new lines) mode
1558 if (fp
->fInputIdx
>= inputLen
) {
1559 // At end of input. Match failed. Backtrack out.
1560 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1563 // There is input left. Advance over one char, except if we are
1564 // at a cr/lf, advance over both of them.
1566 U16_NEXT(inputBuf
, fp
->fInputIdx
, inputLen
, c
);
1568 // In the case of a CR/LF, we need to advance over both.
1569 UChar nextc
= inputBuf
[fp
->fInputIdx
];
1570 if (nextc
== 0x0a) {
1578 // Match all up to and end-of-line or end-of-input.
1580 // Fail if input already exhausted.
1581 if (fp
->fInputIdx
>= inputLen
) {
1582 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1586 // There is input left. Fail if we are at the end of a line.
1588 U16_NEXT(inputBuf
, fp
->fInputIdx
, inputLen
, c
);
1589 if (((c
& 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
1590 ((c
<=0x0d && c
>=0x0a) || c
==0x85 ||c
==0x2028 || c
==0x2029)) {
1591 // End of line in normal mode. . does not match.
1592 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1596 // There was input left. Consume it until we hit the end of a line,
1597 // or until it's exhausted.
1598 while (fp
->fInputIdx
< inputLen
) {
1599 U16_NEXT(inputBuf
, fp
->fInputIdx
, inputLen
, c
);
1600 if (((c
& 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
1601 ((c
<=0x0d && c
>=0x0a) || c
==0x85 ||c
==0x2028 || c
==0x2029)) {
1602 U16_BACK_1(inputBuf
, 0, fp
->fInputIdx
)
1603 // Scan has reached a line-end. We are done.
1610 case URX_DOTANY_ALL_PL
:
1612 // Match up to end of input. Fail if already at end of input.
1613 if (fp
->fInputIdx
>= inputLen
) {
1614 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1616 fp
->fInputIdx
= inputLen
;
1623 fp
->fPatIdx
= opValue
;
1631 U_ASSERT(opValue
< fPattern
->fCompiledPat
->size());
1632 fp
= StateSave(fp
, fp
->fPatIdx
, frameSize
, status
); // State save to loc following current
1633 fp
->fPatIdx
= opValue
; // Then JMP.
1637 // This opcode is used with (x)+, when x can match a zero length string.
1638 // Same as JMP_SAV, except conditional on the match having made forward progress.
1639 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
1640 // data address of the input position at the start of the loop.
1642 U_ASSERT(opValue
> 0 && opValue
< fPattern
->fCompiledPat
->size());
1643 int32_t stoOp
= pat
[opValue
-1];
1644 U_ASSERT(URX_TYPE(stoOp
) == URX_STO_INP_LOC
);
1645 int32_t frameLoc
= URX_VAL(stoOp
);
1646 U_ASSERT(frameLoc
>= 0 && frameLoc
< frameSize
);
1647 int32_t prevInputIdx
= fp
->fExtra
[frameLoc
];
1648 U_ASSERT(prevInputIdx
<= fp
->fInputIdx
);
1649 if (prevInputIdx
< fp
->fInputIdx
) {
1650 // The match did make progress. Repeat the loop.
1651 fp
= StateSave(fp
, fp
->fPatIdx
, frameSize
, status
); // State save to loc following current
1652 fp
->fPatIdx
= opValue
;
1653 fp
->fExtra
[frameLoc
] = fp
->fInputIdx
;
1655 // If the input position did not advance, we do nothing here,
1656 // execution will fall out of the loop.
1662 U_ASSERT(opValue
>= 0 && opValue
< frameSize
-2);
1663 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
1665 // Pick up the three extra operands that CTR_INIT has, and
1666 // skip the pattern location counter past
1667 int32_t instrOperandLoc
= fp
->fPatIdx
;
1669 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
1670 int32_t minCount
= pat
[instrOperandLoc
+1];
1671 int32_t maxCount
= pat
[instrOperandLoc
+2];
1672 U_ASSERT(minCount
>=0);
1673 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
1674 U_ASSERT(loopLoc
>fp
->fPatIdx
);
1676 if (minCount
== 0) {
1677 fp
= StateSave(fp
, loopLoc
+1, frameSize
, status
);
1679 if (maxCount
== 0) {
1680 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1687 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
1688 int32_t initOp
= pat
[opValue
];
1689 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT
);
1690 int32_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
1691 int32_t minCount
= pat
[opValue
+2];
1692 int32_t maxCount
= pat
[opValue
+3];
1693 // Increment the counter. Note: we're not worrying about counter
1694 // overflow, since the data comes from UnicodeStrings, which
1695 // stores its length in an int32_t.
1697 U_ASSERT(*pCounter
> 0);
1698 if ((uint32_t)*pCounter
>= (uint32_t)maxCount
) {
1699 U_ASSERT(*pCounter
== maxCount
|| maxCount
== -1);
1702 if (*pCounter
>= minCount
) {
1703 fp
= StateSave(fp
, fp
->fPatIdx
, frameSize
, status
);
1705 fp
->fPatIdx
= opValue
+ 4; // Loop back.
1709 case URX_CTR_INIT_NG
:
1711 U_ASSERT(opValue
>= 0 && opValue
< frameSize
-2);
1712 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
1714 // Pick up the three extra operands that CTR_INIT has, and
1715 // skip the pattern location counter past
1716 int32_t instrOperandLoc
= fp
->fPatIdx
;
1718 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
1719 int32_t minCount
= pat
[instrOperandLoc
+1];
1720 int32_t maxCount
= pat
[instrOperandLoc
+2];
1721 U_ASSERT(minCount
>=0);
1722 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
1723 U_ASSERT(loopLoc
>fp
->fPatIdx
);
1725 if (minCount
== 0) {
1726 if (maxCount
!= 0) {
1727 fp
= StateSave(fp
, fp
->fPatIdx
, frameSize
, status
);
1729 fp
->fPatIdx
= loopLoc
+1; // Continue with stuff after repeated block
1734 case URX_CTR_LOOP_NG
:
1736 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
1737 int32_t initOp
= pat
[opValue
];
1738 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT_NG
);
1739 int32_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
1740 int32_t minCount
= pat
[opValue
+2];
1741 int32_t maxCount
= pat
[opValue
+3];
1742 // Increment the counter. Note: we're not worrying about counter
1743 // overflow, since the data comes from UnicodeStrings, which
1744 // stores its length in an int32_t.
1746 U_ASSERT(*pCounter
> 0);
1748 if ((uint32_t)*pCounter
>= (uint32_t)maxCount
) {
1749 // The loop has matched the maximum permitted number of times.
1750 // Break out of here with no action. Matching will
1751 // continue with the following pattern.
1752 U_ASSERT(*pCounter
== maxCount
|| maxCount
== -1);
1756 if (*pCounter
< minCount
) {
1757 // We haven't met the minimum number of matches yet.
1758 // Loop back for another one.
1759 fp
->fPatIdx
= opValue
+ 4; // Loop back.
1761 // We do have the minimum number of matches.
1762 // Fall into the following pattern, but first do
1763 // a state save to the top of the loop, so that a failure
1764 // in the following pattern will try another iteration of the loop.
1765 fp
= StateSave(fp
, opValue
+ 4, frameSize
, status
);
1770 // TODO: Possessive flavor of loop ops, or take them out if no longer needed.
1773 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
1774 fData
[opValue
] = fStack
->size();
1779 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
1780 int32_t newStackSize
= fData
[opValue
];
1781 U_ASSERT(newStackSize
<= fStack
->size());
1782 int32_t *newFP
= fStack
->getBuffer() + newStackSize
- frameSize
;
1783 if (newFP
== (int32_t *)fp
) {
1787 for (i
=0; i
<frameSize
; i
++) {
1788 newFP
[i
] = ((int32_t *)fp
)[i
];
1790 fp
= (REStackFrame
*)newFP
;
1791 fStack
->setSize(newStackSize
);
1798 U_ASSERT(opValue
< frameSize
);
1799 int32_t groupStartIdx
= fp
->fExtra
[opValue
];
1800 int32_t groupEndIdx
= fp
->fExtra
[opValue
+1];
1801 U_ASSERT(groupStartIdx
<= groupEndIdx
);
1802 int32_t len
= groupEndIdx
-groupStartIdx
;
1803 if (groupStartIdx
< 0) {
1804 // This capture group has not participated in the match thus far,
1805 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
); // FAIL, no match.
1809 // The capture group match was of an empty string.
1810 // Verified by testing: Perl matches succeed in this case, so
1815 UBool haveMatch
= FALSE
;
1816 if (fp
->fInputIdx
+ len
<= inputLen
) {
1817 if (opType
== URX_BACKREF
) {
1818 if (u_strncmp(inputBuf
+groupStartIdx
, inputBuf
+fp
->fInputIdx
, len
) == 0) {
1822 if (u_strncasecmp(inputBuf
+groupStartIdx
, inputBuf
+fp
->fInputIdx
,
1823 len
, U_FOLD_CASE_DEFAULT
) == 0) {
1829 fp
->fInputIdx
+= len
; // Match. Advance current input position.
1831 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
); // FAIL, no match.
1836 case URX_STO_INP_LOC
:
1838 U_ASSERT(opValue
>= 0 && opValue
< frameSize
);
1839 fp
->fExtra
[opValue
] = fp
->fInputIdx
;
1845 int32_t instrOperandLoc
= fp
->fPatIdx
;
1847 int32_t dataLoc
= URX_VAL(pat
[instrOperandLoc
]);
1848 U_ASSERT(dataLoc
>= 0 && dataLoc
< frameSize
);
1849 int32_t savedInputIdx
= fp
->fExtra
[dataLoc
];
1850 U_ASSERT(savedInputIdx
<= fp
->fInputIdx
);
1851 if (savedInputIdx
< fp
->fInputIdx
) {
1852 fp
->fPatIdx
= opValue
; // JMP
1854 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
); // FAIL, no progress in loop.
1861 // Entering a lookahead block.
1862 // Save Stack Ptr, Input Pos.
1863 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
1864 fData
[opValue
] = fStack
->size();
1865 fData
[opValue
+1] = fp
->fInputIdx
;
1871 // Leaving a look-ahead block.
1872 // restore Stack Ptr, Input Pos to positions they had on entry to block.
1873 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
1874 int32_t stackSize
= fStack
->size();
1875 int32_t newStackSize
= fData
[opValue
];
1876 U_ASSERT(stackSize
>= newStackSize
);
1877 if (stackSize
> newStackSize
) {
1878 int32_t *newFP
= fStack
->getBuffer() + newStackSize
- frameSize
;
1880 for (i
=0; i
<frameSize
; i
++) {
1881 newFP
[i
] = ((int32_t *)fp
)[i
];
1883 fp
= (REStackFrame
*)newFP
;
1884 fStack
->setSize(newStackSize
);
1886 fp
->fInputIdx
= fData
[opValue
+1];
1891 if (fp
->fInputIdx
< inputLen
) {
1893 U16_NEXT(inputBuf
, fp
->fInputIdx
, inputLen
, c
);
1894 if (u_foldCase(c
, U_FOLD_CASE_DEFAULT
) == opValue
) {
1898 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1903 // Test input against a literal string.
1904 // Strings require two slots in the compiled pattern, one for the
1905 // offset to the string text, and one for the length.
1906 int32_t stringStartIdx
, stringLen
;
1907 stringStartIdx
= opValue
;
1909 op
= pat
[fp
->fPatIdx
];
1911 opType
= URX_TYPE(op
);
1912 opValue
= URX_VAL(op
);
1913 U_ASSERT(opType
== URX_STRING_LEN
);
1914 stringLen
= opValue
;
1916 int32_t stringEndIndex
= fp
->fInputIdx
+ stringLen
;
1917 if (stringEndIndex
<= inputLen
) {
1918 if (u_strncasecmp(inputBuf
+fp
->fInputIdx
, litText
+stringStartIdx
,
1919 stringLen
, U_FOLD_CASE_DEFAULT
) == 0) {
1920 // Success. Advance the current input position.
1921 fp
->fInputIdx
= stringEndIndex
;
1925 // No match. Back up matching to a saved state
1926 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1932 // Entering a look-behind block.
1933 // Save Stack Ptr, Input Pos.
1934 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
1935 fData
[opValue
] = fStack
->size();
1936 fData
[opValue
+1] = fp
->fInputIdx
;
1937 // Init the variable containing the start index for attempted matches.
1938 fData
[opValue
+2] = -1;
1939 // Save input string length, then reset to pin any matches to end at
1940 // the current position.
1941 fData
[opValue
+3] = inputLen
;
1942 inputLen
= fp
->fInputIdx
;
1949 // Positive Look-Behind, at top of loop checking for matches of LB expression
1950 // at all possible input starting positions.
1952 // Fetch the min and max possible match lengths. They are the operands
1953 // of this op in the pattern.
1954 int32_t minML
= pat
[fp
->fPatIdx
++];
1955 int32_t maxML
= pat
[fp
->fPatIdx
++];
1956 U_ASSERT(minML
<= maxML
);
1957 U_ASSERT(minML
>= 0);
1959 // Fetch (from data) the last input index where a match was attempted.
1960 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
1961 int32_t *lbStartIdx
= &fData
[opValue
+2];
1962 if (*lbStartIdx
< 0) {
1963 // First time through loop.
1964 *lbStartIdx
= fp
->fInputIdx
- minML
;
1966 // 2nd through nth time through the loop.
1967 // Back up start position for match by one.
1968 if (*lbStartIdx
== 0) {
1969 (*lbStartIdx
)--; // Because U16_BACK is unsafe starting at 0.
1971 U16_BACK_1(inputBuf
, 0, *lbStartIdx
);
1975 if (*lbStartIdx
< 0 || *lbStartIdx
< fp
->fInputIdx
- maxML
) {
1976 // We have tried all potential match starting points without
1977 // getting a match. Backtrack out, and out of the
1978 // Look Behind altogether.
1979 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1980 int32_t restoreInputLen
= fData
[opValue
+3];
1981 U_ASSERT(restoreInputLen
>= inputLen
);
1982 U_ASSERT(restoreInputLen
<= fInput
->length());
1983 inputLen
= restoreInputLen
;
1987 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
1988 // (successful match will fall off the end of the loop.)
1989 fp
= StateSave(fp
, fp
->fPatIdx
-3, frameSize
, status
);
1990 fp
->fInputIdx
= *lbStartIdx
;
1995 // End of a look-behind block, after a successful match.
1997 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
1998 if (fp
->fInputIdx
!= inputLen
) {
1999 // The look-behind expression matched, but the match did not
2000 // extend all the way to the point that we are looking behind from.
2001 // FAIL out of here, which will take us back to the LB_CONT, which
2002 // will retry the match starting at another position or fail
2003 // the look-behind altogether, whichever is appropriate.
2004 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
2008 // Look-behind match is good. Restore the orignal input string length,
2009 // which had been truncated to pin the end of the lookbehind match to the
2010 // position being looked-behind.
2011 int32_t originalInputLen
= fData
[opValue
+3];
2012 U_ASSERT(originalInputLen
>= inputLen
);
2013 U_ASSERT(originalInputLen
<= fInput
->length());
2014 inputLen
= originalInputLen
;
2021 // Negative Look-Behind, at top of loop checking for matches of LB expression
2022 // at all possible input starting positions.
2024 // Fetch the extra parameters of this op.
2025 int32_t minML
= pat
[fp
->fPatIdx
++];
2026 int32_t maxML
= pat
[fp
->fPatIdx
++];
2027 int32_t continueLoc
= pat
[fp
->fPatIdx
++];
2028 continueLoc
= URX_VAL(continueLoc
);
2029 U_ASSERT(minML
<= maxML
);
2030 U_ASSERT(minML
>= 0);
2031 U_ASSERT(continueLoc
> fp
->fPatIdx
);
2033 // Fetch (from data) the last input index where a match was attempted.
2034 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
2035 int32_t *lbStartIdx
= &fData
[opValue
+2];
2036 if (*lbStartIdx
< 0) {
2037 // First time through loop.
2038 *lbStartIdx
= fp
->fInputIdx
- minML
;
2040 // 2nd through nth time through the loop.
2041 // Back up start position for match by one.
2042 if (*lbStartIdx
== 0) {
2043 (*lbStartIdx
)--; // Because U16_BACK is unsafe starting at 0.
2045 U16_BACK_1(inputBuf
, 0, *lbStartIdx
);
2049 if (*lbStartIdx
< 0 || *lbStartIdx
< fp
->fInputIdx
- maxML
) {
2050 // We have tried all potential match starting points without
2051 // getting a match, which means that the negative lookbehind as
2052 // a whole has succeeded. Jump forward to the continue location
2053 int32_t restoreInputLen
= fData
[opValue
+3];
2054 U_ASSERT(restoreInputLen
>= inputLen
);
2055 U_ASSERT(restoreInputLen
<= fInput
->length());
2056 inputLen
= restoreInputLen
;
2057 fp
->fPatIdx
= continueLoc
;
2061 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
2062 // (successful match will cause a FAIL out of the loop altogether.)
2063 fp
= StateSave(fp
, fp
->fPatIdx
-4, frameSize
, status
);
2064 fp
->fInputIdx
= *lbStartIdx
;
2069 // End of a negative look-behind block, after a successful match.
2071 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
2072 if (fp
->fInputIdx
!= inputLen
) {
2073 // The look-behind expression matched, but the match did not
2074 // extend all the way to the point that we are looking behind from.
2075 // FAIL out of here, which will take us back to the LB_CONT, which
2076 // will retry the match starting at another position or succeed
2077 // the look-behind altogether, whichever is appropriate.
2078 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
2082 // Look-behind expression matched, which means look-behind test as
2085 // Restore the orignal input string length, which had been truncated
2086 // inorder to pin the end of the lookbehind match
2087 // to the position being looked-behind.
2088 int32_t originalInputLen
= fData
[opValue
+3];
2089 U_ASSERT(originalInputLen
>= inputLen
);
2090 U_ASSERT(originalInputLen
<= fInput
->length());
2091 inputLen
= originalInputLen
;
2093 // Restore original stack position, discarding any state saved
2094 // by the successful pattern match.
2095 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
2096 int32_t newStackSize
= fData
[opValue
];
2097 U_ASSERT(fStack
->size() > newStackSize
);
2098 fStack
->setSize(newStackSize
);
2100 // FAIL, which will take control back to someplace
2101 // prior to entering the look-behind test.
2102 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
2108 // Loop Initialization for the optimized implementation of
2109 // [some character set]*
2110 // This op scans through all matching input.
2111 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
2113 U_ASSERT(opValue
> 0 && opValue
< sets
->size());
2114 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
2115 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
2117 // Loop through input, until either the input is exhausted or
2118 // we reach a character that is not a member of the set.
2119 int32_t ix
= fp
->fInputIdx
;
2121 if (ix
>= inputLen
) {
2125 U16_NEXT(inputBuf
, ix
, inputLen
, c
);
2127 if (s8
->contains(c
) == FALSE
) {
2128 U16_BACK_1(inputBuf
, 0, ix
);
2132 if (s
->contains(c
) == FALSE
) {
2133 U16_BACK_1(inputBuf
, 0, ix
);
2139 // If there were no matching characters, skip over the loop altogether.
2140 // The loop doesn't run at all, a * op always succeeds.
2141 if (ix
== fp
->fInputIdx
) {
2142 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
2146 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
2147 // must follow. It's operand is the stack location
2148 // that holds the starting input index for the match of this [set]*
2149 int32_t loopcOp
= pat
[fp
->fPatIdx
];
2150 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
2151 int32_t stackLoc
= URX_VAL(loopcOp
);
2152 U_ASSERT(stackLoc
>= 0 && stackLoc
< frameSize
);
2153 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
2156 // Save State to the URX_LOOP_C op that follows this one,
2157 // so that match failures in the following code will return to there.
2158 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
2159 fp
= StateSave(fp
, fp
->fPatIdx
, frameSize
, status
);
2165 case URX_LOOP_DOT_I
:
2166 // Loop Initialization for the optimized implementation of .*
2167 // This op scans through all remaining input.
2168 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
2170 // Loop through input until the input is exhausted (we reach an end-of-line)
2171 // In multi-line mode, we can just go straight to the end of the input.
2177 // NOT multi-line mode. Line endings do not match '.'
2178 // Scan forward until a line ending or end of input.
2181 if (ix
>= inputLen
) {
2186 U16_NEXT(inputBuf
, ix
, inputLen
, c
); // c = inputBuf[ix++]
2187 if (((c
& 0x7f) <= 0x29) &&
2188 ((c
<=0x0d && c
>=0x0a) || c
==0x85 ||c
==0x2028 || c
==0x2029)) {
2189 // char is a line ending. Put the input pos back to the
2190 // line ending char, and exit the scanning loop.
2191 U16_BACK_1(inputBuf
, 0, ix
);
2197 // If there were no matching characters, skip over the loop altogether.
2198 // The loop doesn't run at all, a * op always succeeds.
2199 if (ix
== fp
->fInputIdx
) {
2200 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
2204 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
2205 // must follow. It's operand is the stack location
2206 // that holds the starting input index for the match of this [set]*
2207 int32_t loopcOp
= pat
[fp
->fPatIdx
];
2208 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
2209 int32_t stackLoc
= URX_VAL(loopcOp
);
2210 U_ASSERT(stackLoc
>= 0 && stackLoc
< frameSize
);
2211 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
2214 // Save State to the URX_LOOP_C op that follows this one,
2215 // so that match failures in the following code will return to there.
2216 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
2217 fp
= StateSave(fp
, fp
->fPatIdx
, frameSize
, status
);
2225 U_ASSERT(opValue
>=0 && opValue
<frameSize
);
2226 int32_t terminalIdx
= fp
->fExtra
[opValue
];
2227 U_ASSERT(terminalIdx
<= fp
->fInputIdx
);
2228 if (terminalIdx
== fp
->fInputIdx
) {
2229 // We've backed up the input idx to the point that the loop started.
2230 // The loop is done. Leave here without saving state.
2231 // Subsequent failures won't come back here.
2234 // Set up for the next iteration of the loop, with input index
2235 // backed up by one from the last time through,
2236 // and a state save to this instruction in case the following code fails again.
2237 // (We're going backwards because this loop emulates stack unwinding, not
2238 // the initial scan forward.)
2239 U_ASSERT(fp
->fInputIdx
> 0);
2240 U16_BACK_1(inputBuf
, 0, fp
->fInputIdx
);
2241 if (inputBuf
[fp
->fInputIdx
] == 0x0a &&
2242 fp
->fInputIdx
> terminalIdx
&&
2243 inputBuf
[fp
->fInputIdx
-1] == 0x0d) {
2244 int32_t prevOp
= pat
[fp
->fPatIdx
-2];
2245 if (URX_TYPE(prevOp
) == URX_LOOP_DOT_I
) {
2246 // .*, stepping back over CRLF pair.
2252 fp
= StateSave(fp
, fp
->fPatIdx
-1, frameSize
, status
);
2259 // Trouble. The compiled pattern contains an entry with an
2260 // unrecognized type tag.
2264 if (U_FAILURE(status
)) {
2272 fLastMatchEnd
= fMatchEnd
;
2273 fMatchStart
= startIdx
;
2274 fMatchEnd
= fp
->fInputIdx
;
2276 REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart
, fMatchEnd
));
2282 REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
2286 fFrame
= fp
; // The active stack frame when the engine stopped.
2287 // Contains the capture group results that we need to
2295 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher
)
2299 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS