4 // Contains the implementation of class RegexMatcher,
5 // which is one of the main API classes for the ICU regular expression package.
8 **************************************************************************
9 * Copyright (C) 2002-2004 International Business Machines Corporation *
10 * and others. All rights reserved. *
11 **************************************************************************
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
17 #include "unicode/regex.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ustring.h"
21 #include "unicode/rbbi.h"
29 // #include <malloc.h> // Needed for heapcheck testing
33 //-----------------------------------------------------------------------------
35 // Constructor and Destructor
37 //-----------------------------------------------------------------------------
38 RegexMatcher::RegexMatcher(const RegexPattern
*pat
) {
43 fDeferredStatus
= U_ZERO_ERROR
;
44 fStack
= new UVector32(fDeferredStatus
);
48 fDeferredStatus
= U_ILLEGAL_ARGUMENT_ERROR
;
51 if (pat
->fDataSize
> (int32_t)(sizeof(fSmallData
)/sizeof(int32_t))) {
52 fData
= (int32_t *)uprv_malloc(pat
->fDataSize
* sizeof(int32_t));
54 if (fStack
== NULL
|| fData
== NULL
) {
55 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
58 reset(*RegexStaticSets::gStaticSets
->fEmptyString
);
63 RegexMatcher::RegexMatcher(const UnicodeString
®exp
, const UnicodeString
&input
,
64 uint32_t flags
, UErrorCode
&status
) {
66 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
67 fPattern
= fPatternOwned
;
69 fDeferredStatus
= U_ZERO_ERROR
;
70 fStack
= new UVector32(status
);
73 if (U_FAILURE(status
)) {
76 if (fPattern
->fDataSize
> (int32_t)(sizeof(fSmallData
)/sizeof(int32_t))) {
77 fData
= (int32_t *)uprv_malloc(fPattern
->fDataSize
* sizeof(int32_t));
79 if (fStack
== NULL
|| fData
== NULL
) {
80 status
= U_MEMORY_ALLOCATION_ERROR
;
86 RegexMatcher::RegexMatcher(const UnicodeString
®exp
,
87 uint32_t flags
, UErrorCode
&status
) {
90 fDeferredStatus
= U_ZERO_ERROR
;
91 fStack
= new UVector32(status
);
93 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
94 fPattern
= fPatternOwned
;
96 if (U_FAILURE(status
)) {
100 if (fPattern
->fDataSize
> (int32_t)(sizeof(fSmallData
)/sizeof(int32_t))) {
101 fData
= (int32_t *)uprv_malloc(fPattern
->fDataSize
* sizeof(int32_t));
103 if (fStack
== NULL
|| fData
== NULL
) {
104 status
= U_MEMORY_ALLOCATION_ERROR
;
106 reset(*RegexStaticSets::gStaticSets
->fEmptyString
);
111 RegexMatcher::~RegexMatcher() {
113 if (fData
!= fSmallData
) {
118 delete fPatternOwned
;
119 fPatternOwned
= NULL
;
122 #if UCONFIG_NO_BREAK_ITERATION==0
123 delete fWordBreakItr
;
129 static const UChar BACKSLASH
= 0x5c;
130 static const UChar DOLLARSIGN
= 0x24;
131 //--------------------------------------------------------------------------------
135 //--------------------------------------------------------------------------------
136 RegexMatcher
&RegexMatcher::appendReplacement(UnicodeString
&dest
,
137 const UnicodeString
&replacement
,
138 UErrorCode
&status
) {
139 if (U_FAILURE(status
)) {
142 if (U_FAILURE(fDeferredStatus
)) {
143 status
= fDeferredStatus
;
146 if (fMatch
== FALSE
) {
147 status
= U_REGEX_INVALID_STATE
;
151 // Copy input string from the end of previous match to start of current match
152 int32_t len
= fMatchStart
-fLastReplaceEnd
;
154 dest
.append(*fInput
, fLastReplaceEnd
, len
);
156 fLastReplaceEnd
= fMatchEnd
;
159 // scan the replacement text, looking for substitutions ($n) and \escapes.
160 // TODO: optimize this loop by efficiently scanning for '$' or '\',
161 // move entire ranges not containing substitutions.
162 int32_t replLen
= replacement
.length();
164 while (replIdx
<replLen
) {
165 UChar c
= replacement
.charAt(replIdx
);
167 if (c
== BACKSLASH
) {
168 // Backslash Escape. Copy the following char out without further checks.
169 // Note: Surrogate pairs don't need any special handling
170 // The second half wont be a '$' or a '\', and
171 // will move to the dest normally on the next
173 if (replIdx
>= replLen
) {
176 c
= replacement
.charAt(replIdx
);
178 if (c
==0x55/*U*/ || c
==0x75/*u*/) {
179 // We have a \udddd or \Udddddddd escape sequence.
180 UChar32 escapedChar
= replacement
.unescapeAt(replIdx
);
181 if (escapedChar
!= (UChar32
)0xFFFFFFFF) {
182 dest
.append(escapedChar
);
183 // TODO: Report errors for mal-formed \u escapes?
184 // As this is, the original sequence is output, which may be OK.
189 // Plain backslash escape. Just put out the escaped character.
195 if (c
!= DOLLARSIGN
) {
196 // Normal char, not a $. Copy it out without further checks.
201 // We've got a $. Pick up a capture group number if one follows.
202 // Consume at most the number of digits necessary for the largest capture
203 // number that is valid for this pattern.
205 int32_t numDigits
= 0;
206 int32_t groupNum
= 0;
209 if (replIdx
>= replLen
) {
212 digitC
= replacement
.char32At(replIdx
);
213 if (u_isdigit(digitC
) == FALSE
) {
216 replIdx
= replacement
.moveIndex32(replIdx
, 1);
217 groupNum
=groupNum
*10 + u_charDigitValue(digitC
);
219 if (numDigits
>= fPattern
->fMaxCaptureDigits
) {
225 if (numDigits
== 0) {
226 // The $ didn't introduce a group number at all.
227 // Treat it as just part of the substitution text.
228 dest
.append(DOLLARSIGN
);
232 // Finally, append the capture group data to the destination.
233 dest
.append(group(groupNum
, status
));
234 if (U_FAILURE(status
)) {
235 // Can fail if group number is out of range.
246 //--------------------------------------------------------------------------------
248 // appendTail Intended to be used in conjunction with appendReplacement()
249 // To the destination string, append everything following
250 // the last match position from the input string.
252 //--------------------------------------------------------------------------------
253 UnicodeString
&RegexMatcher::appendTail(UnicodeString
&dest
) {
254 int32_t len
= fInput
->length()-fMatchEnd
;
256 dest
.append(*fInput
, fMatchEnd
, len
);
263 //--------------------------------------------------------------------------------
267 //--------------------------------------------------------------------------------
268 int32_t RegexMatcher::end(UErrorCode
&err
) const {
274 int32_t RegexMatcher::end(int group
, UErrorCode
&err
) const {
275 if (U_FAILURE(err
)) {
278 if (fMatch
== FALSE
) {
279 err
= U_REGEX_INVALID_STATE
;
282 if (group
< 0 || group
> fPattern
->fGroupMap
->size()) {
283 err
= U_INDEX_OUTOFBOUNDS_ERROR
;
290 // Get the position within the stack frame of the variables for
291 // this capture group.
292 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(group
-1);
293 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
294 U_ASSERT(groupOffset
>= 0);
295 e
= fFrame
->fExtra
[groupOffset
+ 1];
302 //--------------------------------------------------------------------------------
306 //--------------------------------------------------------------------------------
307 UBool
RegexMatcher::find() {
308 // Start at the position of the last match end. (Will be zero if the
309 // matcher has been reset.
311 if (U_FAILURE(fDeferredStatus
)) {
315 int32_t startPos
= fMatchEnd
;
318 // Save the position of any previous successful match.
319 fLastMatchEnd
= fMatchEnd
;
321 if (fMatchStart
== fMatchEnd
) {
322 // Previous match had zero length. Move start position up one position
323 // to avoid sending find() into a loop on zero-length matches.
324 if (startPos
== fInput
->length()) {
328 startPos
= fInput
->moveIndex32(startPos
, 1);
331 if (fLastMatchEnd
>= 0) {
332 // A previous find() failed to match. Don't try again.
333 // (without this test, a pattern with a zero-length match
334 // could match again at the end of an input string.)
339 int32_t inputLen
= fInput
->length();
341 // Compute the position in the input string beyond which a match can not begin, because
342 // the minimum length match would extend past the end of the input.
343 int32_t testLen
= inputLen
- fPattern
->fMinMatchLen
;
344 if (startPos
> testLen
) {
349 const UChar
*inputBuf
= fInput
->getBuffer();
351 U_ASSERT(startPos
>= 0);
353 switch (fPattern
->fStartType
) {
355 // No optimization was found.
356 // Try a match at each input position.
358 MatchAt(startPos
, fDeferredStatus
);
359 if (U_FAILURE(fDeferredStatus
)) {
365 if (startPos
>= testLen
) {
368 U16_FWD_1(inputBuf
, startPos
, inputLen
);
369 // Note that it's perfectly OK for a pattern to have a zero-length
370 // match at the end of a string, so we must make sure that the loop
371 // runs with startPos == testLen the last time through.
376 // Matches are only possible at the start of the input string
377 // (pattern begins with ^ or \A)
382 MatchAt(startPos
, fDeferredStatus
);
383 if (U_FAILURE(fDeferredStatus
)) {
391 // Match may start on any char from a pre-computed set.
392 U_ASSERT(fPattern
->fMinMatchLen
> 0);
394 int32_t pos
= startPos
;
395 U16_NEXT(inputBuf
, startPos
, inputLen
, c
); // like c = inputBuf[startPos++];
396 if (c
<256 && fPattern
->fInitialChars8
->contains(c
) ||
397 c
>=256 && fPattern
->fInitialChars
->contains(c
)) {
398 MatchAt(pos
, fDeferredStatus
);
399 if (U_FAILURE(fDeferredStatus
)) {
406 if (pos
>= testLen
) {
417 // Match starts on exactly one char.
418 U_ASSERT(fPattern
->fMinMatchLen
> 0);
419 UChar32 theChar
= fPattern
->fInitialChar
;
421 int32_t pos
= startPos
;
422 U16_NEXT(inputBuf
, startPos
, inputLen
, c
); // like c = inputBuf[startPos++];
424 MatchAt(pos
, fDeferredStatus
);
425 if (U_FAILURE(fDeferredStatus
)) {
432 if (pos
>= testLen
) {
444 MatchAt(startPos
, fDeferredStatus
);
445 if (U_FAILURE(fDeferredStatus
)) {
451 U16_NEXT(inputBuf
, startPos
, inputLen
, c
); // like c = inputBuf[startPos++];
455 c
= inputBuf
[startPos
-1];
456 if (((c
& 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
457 (c
== 0x0a || c
== 0x0d || c
==0x0c || c
==0x85 ||c
==0x2028 || c
==0x2029 )) {
458 if (c
== 0x0d && startPos
< inputLen
&& inputBuf
[startPos
] == 0x0a) {
461 MatchAt(startPos
, fDeferredStatus
);
462 if (U_FAILURE(fDeferredStatus
)) {
469 if (startPos
>= testLen
) {
473 U16_NEXT(inputBuf
, startPos
, inputLen
, c
); // like c = inputBuf[startPos++];
474 // Note that it's perfectly OK for a pattern to have a zero-length
475 // match at the end of a string, so we must make sure that the loop
476 // runs with startPos == testLen the last time through.
490 UBool
RegexMatcher::find(int32_t start
, UErrorCode
&status
) {
491 if (U_FAILURE(status
)) {
494 if (U_FAILURE(fDeferredStatus
)) {
495 status
= fDeferredStatus
;
498 int32_t inputLen
= fInput
->length();
499 if (start
< 0 || start
> inputLen
) {
500 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
510 //--------------------------------------------------------------------------------
514 //--------------------------------------------------------------------------------
515 UnicodeString
RegexMatcher::group(UErrorCode
&status
) const {
516 return group(0, status
);
521 UnicodeString
RegexMatcher::group(int32_t groupNum
, UErrorCode
&status
) const {
522 int32_t s
= start(groupNum
, status
);
523 int32_t e
= end(groupNum
, status
);
525 // Note: calling start() and end() above will do all necessary checking that
526 // the group number is OK and that a match exists. status will be set.
527 if (U_FAILURE(status
)) {
528 return UnicodeString();
530 if (U_FAILURE(fDeferredStatus
)) {
531 status
= fDeferredStatus
;
532 return UnicodeString();
536 // A capture group wasn't part of the match
537 return UnicodeString();
540 return UnicodeString(*fInput
, s
, e
-s
);
546 int32_t RegexMatcher::groupCount() const {
547 return fPattern
->fGroupMap
->size();
552 const UnicodeString
&RegexMatcher::input() const {
559 //--------------------------------------------------------------------------------
563 //--------------------------------------------------------------------------------
564 UBool
RegexMatcher::lookingAt(UErrorCode
&status
) {
565 if (U_FAILURE(status
)) {
568 if (U_FAILURE(fDeferredStatus
)) {
569 status
= fDeferredStatus
;
578 UBool
RegexMatcher::lookingAt(int32_t start
, UErrorCode
&status
) {
579 if (U_FAILURE(status
)) {
582 if (U_FAILURE(fDeferredStatus
)) {
583 status
= fDeferredStatus
;
586 if (start
< 0 || start
> fInput
->length()) {
587 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
591 MatchAt(start
, status
);
597 //--------------------------------------------------------------------------------
601 //--------------------------------------------------------------------------------
602 UBool
RegexMatcher::matches(UErrorCode
&status
) {
603 if (U_FAILURE(status
)) {
606 if (U_FAILURE(fDeferredStatus
)) {
607 status
= fDeferredStatus
;
612 UBool success
= (fMatch
&& fMatchEnd
==fInput
->length());
617 UBool
RegexMatcher::matches(int32_t start
, UErrorCode
&status
) {
618 if (U_FAILURE(status
)) {
621 if (U_FAILURE(fDeferredStatus
)) {
622 status
= fDeferredStatus
;
625 if (start
< 0 || start
> fInput
->length()) {
626 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
630 MatchAt(start
, status
);
631 UBool success
= (fMatch
&& fMatchEnd
==fInput
->length());
637 const RegexPattern
&RegexMatcher::pattern() const {
643 //--------------------------------------------------------------------------------
647 //--------------------------------------------------------------------------------
648 UnicodeString
RegexMatcher::replaceAll(const UnicodeString
&replacement
, UErrorCode
&status
) {
649 if (U_FAILURE(status
)) {
652 if (U_FAILURE(fDeferredStatus
)) {
653 status
= fDeferredStatus
;
656 UnicodeString destString
;
657 for (reset(); find(); ) {
658 appendReplacement(destString
, replacement
, status
);
659 if (U_FAILURE(status
)) {
663 appendTail(destString
);
670 //--------------------------------------------------------------------------------
674 //--------------------------------------------------------------------------------
675 UnicodeString
RegexMatcher::replaceFirst(const UnicodeString
&replacement
, UErrorCode
&status
) {
676 if (U_FAILURE(status
)) {
679 if (U_FAILURE(fDeferredStatus
)) {
680 status
= fDeferredStatus
;
689 UnicodeString destString
;
690 appendReplacement(destString
, replacement
, status
);
691 appendTail(destString
);
697 //--------------------------------------------------------------------------------
701 //--------------------------------------------------------------------------------
702 RegexMatcher
&RegexMatcher::reset() {
714 RegexMatcher
&RegexMatcher::reset(const UnicodeString
&input
) {
717 if (fWordBreakItr
!= NULL
) {
718 #if UCONFIG_NO_BREAK_ITERATION==0
719 fWordBreakItr
->setText(input
);
725 RegexMatcher
&RegexMatcher::reset(const UChar
*) {
726 fDeferredStatus
= U_INTERNAL_PROGRAM_ERROR
;
731 RegexMatcher
&RegexMatcher::reset(int32_t position
, UErrorCode
&status
) {
732 if (U_FAILURE(status
)) {
736 if (position
< 0 || position
>= fInput
->length()) {
737 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
740 fMatchEnd
= position
;
748 //--------------------------------------------------------------------------------
752 //--------------------------------------------------------------------------------
753 void RegexMatcher::setTrace(UBool state
) {
759 //---------------------------------------------------------------------
763 //---------------------------------------------------------------------
764 int32_t RegexMatcher::split(const UnicodeString
&input
,
765 UnicodeString dest
[],
766 int32_t destCapacity
,
770 // Check arguements for validity
772 if (U_FAILURE(status
)) {
776 if (destCapacity
< 1) {
777 status
= U_ILLEGAL_ARGUMENT_ERROR
;
783 // Reset for the input text
786 int32_t inputLen
= input
.length();
787 int32_t nextOutputStringStart
= 0;
794 // Loop through the input text, searching for the delimiter pattern
797 int32_t numCaptureGroups
= fPattern
->fGroupMap
->size();
799 if (i
>=destCapacity
-1) {
800 // There is one or zero output string left.
801 // Fill the last output string with whatever is left from the input, then exit the loop.
802 // ( i will be == destCapicity if we filled the output array while processing
803 // capture groups of the delimiter expression, in which case we will discard the
804 // last capture group saved in favor of the unprocessed remainder of the
807 int32_t remainingLength
= inputLen
-nextOutputStringStart
;
808 if (remainingLength
> 0) {
809 dest
[i
].setTo(input
, nextOutputStringStart
, remainingLength
);
814 // We found another delimiter. Move everything from where we started looking
815 // up until the start of the delimiter into the next output string.
816 int32_t fieldLen
= fMatchStart
- nextOutputStringStart
;
817 dest
[i
].setTo(input
, nextOutputStringStart
, fieldLen
);
818 nextOutputStringStart
= fMatchEnd
;
820 // If the delimiter pattern has capturing parentheses, the captured
821 // text goes out into the next n destination strings.
823 for (groupNum
=1; groupNum
<=numCaptureGroups
; groupNum
++) {
824 if (i
==destCapacity
-1) {
828 dest
[i
] = group(groupNum
, status
);
831 if (nextOutputStringStart
== inputLen
) {
832 // The delimiter was at the end of the string. We're done.
839 // We ran off the end of the input while looking for the next delimiter.
840 // All the remaining text goes into the current output string.
841 dest
[i
].setTo(input
, nextOutputStringStart
, inputLen
-nextOutputStringStart
);
850 //--------------------------------------------------------------------------------
854 //--------------------------------------------------------------------------------
855 int32_t RegexMatcher::start(UErrorCode
&status
) const {
856 return start(0, status
);
862 int32_t RegexMatcher::start(int group
, UErrorCode
&status
) const {
863 if (U_FAILURE(status
)) {
866 if (U_FAILURE(fDeferredStatus
)) {
867 status
= fDeferredStatus
;
870 if (fMatch
== FALSE
) {
871 status
= U_REGEX_INVALID_STATE
;
874 if (group
< 0 || group
> fPattern
->fGroupMap
->size()) {
875 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
882 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(group
-1);
883 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
884 U_ASSERT(groupOffset
>= 0);
885 s
= fFrame
->fExtra
[groupOffset
];
892 //================================================================================
894 // Code following this point in this file is the internal
895 // Match Engine Implementation.
897 //================================================================================
900 //--------------------------------------------------------------------------------
903 // Discard any previous contents of the state save stack, and initialize a
904 // new stack frame to all -1. The -1s are needed for capture group limits,
905 // where they indicate that a group has not yet matched anything.
906 //--------------------------------------------------------------------------------
907 REStackFrame
*RegexMatcher::resetStack() {
908 // Discard any previous contents of the state save stack, and initialize a
909 // new stack frame to all -1. The -1s are needed for capture group limits, where
910 // they indicate that a group has not yet matched anything.
911 fStack
->removeAllElements();
913 int32_t *iFrame
= fStack
->reserveBlock(fPattern
->fFrameSize
, fDeferredStatus
);
915 for (i
=0; i
<fPattern
->fFrameSize
; i
++) {
918 return (REStackFrame
*)iFrame
;
923 //--------------------------------------------------------------------------------
926 // in perl, "xab..cd..", \b is true at positions 0,3,5,7
928 // If the current char is a combining mark,
930 // Else Scan backwards to the first non-combining char.
931 // We are at a boundary if the this char and the original chars are
932 // opposite in membership in \w set
934 // parameters: pos - the current position in the input buffer
936 //--------------------------------------------------------------------------------
937 UBool
RegexMatcher::isWordBoundary(int32_t pos
) {
938 UBool isBoundary
= FALSE
;
939 UBool cIsWord
= FALSE
;
941 // Determine whether char c at current position is a member of the word set of chars.
942 // If we're off the end of the string, behave as though we're not at a word char.
943 if (pos
< fInput
->length()) {
944 UChar32 c
= fInput
->char32At(pos
);
945 int8_t ctype
= u_charType(c
);
946 if (ctype
==U_NON_SPACING_MARK
|| ctype
==U_ENCLOSING_MARK
) {
947 // Current char is a combining one. Not a boundary.
950 cIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(c
);
953 // Back up until we come to a non-combining char, determine whether
954 // that char is a word char.
955 UBool prevCIsWord
= FALSE
;
956 int32_t prevPos
= pos
;
961 prevPos
= fInput
->moveIndex32(prevPos
, -1);
962 UChar32 prevChar
= fInput
->char32At(prevPos
);
963 int8_t prevCType
= u_charType(prevChar
);
964 if (!(prevCType
==U_NON_SPACING_MARK
|| prevCType
==U_ENCLOSING_MARK
)) {
965 prevCIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(prevChar
);
969 isBoundary
= cIsWord
^ prevCIsWord
;
973 //--------------------------------------------------------------------------------
977 // Test for a word boundary using RBBI word break.
979 // parameters: pos - the current position in the input buffer
981 //--------------------------------------------------------------------------------
982 UBool
RegexMatcher::isUWordBoundary(int32_t pos
) {
983 UBool returnVal
= FALSE
;
984 #if UCONFIG_NO_BREAK_ITERATION==0
985 UErrorCode status
= U_ZERO_ERROR
;
987 // If we haven't yet created a break iterator for this matcher, do it now.
988 if (fWordBreakItr
== NULL
) {
990 (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getEnglish(), status
);
991 if (U_FAILURE(status
)) {
992 // TODO: reliable error reporting for BI failures.
995 fWordBreakItr
->setText(*fInput
);
998 returnVal
= fWordBreakItr
->isBoundary(pos
);
1003 //--------------------------------------------------------------------------------
1006 // Make a new stack frame, initialized as a copy of the current stack frame.
1007 // Set the pattern index in the original stack frame from the operand value
1008 // in the opcode. Execution of the engine continues with the state in
1009 // the newly created stack frame
1011 // Note that reserveBlock() may grow the stack, resulting in the
1012 // whole thing being relocated in memory.
1014 //--------------------------------------------------------------------------------
1015 inline REStackFrame
*RegexMatcher::StateSave(REStackFrame
*fp
, int32_t savePatIdx
, int32_t frameSize
, UErrorCode
&status
) {
1016 // push storage for a new frame.
1017 int32_t *newFP
= fStack
->reserveBlock(frameSize
, status
);
1018 fp
= (REStackFrame
*)(newFP
- frameSize
); // in case of realloc of stack.
1020 // New stack frame = copy of old top frame.
1021 int32_t *source
= (int32_t *)fp
;
1022 int32_t *dest
= newFP
;
1024 *dest
++ = *source
++;
1025 if (source
== newFP
) {
1030 fp
->fPatIdx
= savePatIdx
;
1031 return (REStackFrame
*)newFP
;
1035 //--------------------------------------------------------------------------------
1037 // MatchAt This is the actual matching engine.
1039 //--------------------------------------------------------------------------------
1040 void RegexMatcher::MatchAt(int32_t startIdx
, UErrorCode
&status
) {
1041 UBool isMatch
= FALSE
; // True if the we have a match.
1043 int32_t op
; // Operation from the compiled pattern, split into
1044 int32_t opType
; // the opcode
1045 int32_t opValue
; // and the operand value.
1047 #ifdef REGEX_RUN_DEBUG
1050 printf("MatchAt(startIdx=%d)\n", startIdx
);
1051 printf("Original Pattern: ");
1053 for (i
=0; i
<fPattern
->fPattern
.length(); i
++) {
1054 printf("%c", fPattern
->fPattern
.charAt(i
));
1057 printf("Input String: ");
1058 for (i
=0; i
<fInput
->length(); i
++) {
1059 UChar c
= fInput
->charAt(i
);
1060 if (c
<32 || c
>256) {
1070 if (U_FAILURE(status
)) {
1074 // Cache frequently referenced items from the compiled pattern
1075 // in local variables.
1077 int32_t *pat
= fPattern
->fCompiledPat
->getBuffer();
1079 const UChar
*litText
= fPattern
->fLiteralText
.getBuffer();
1080 UVector
*sets
= fPattern
->fSets
;
1081 int32_t inputLen
= fInput
->length();
1082 const UChar
*inputBuf
= fInput
->getBuffer();
1084 REStackFrame
*fp
= resetStack();
1085 int32_t frameSize
= fPattern
->fFrameSize
;
1088 fp
->fInputIdx
= startIdx
;
1090 // Zero out the pattern's static data
1092 for (i
= 0; i
<fPattern
->fDataSize
; i
++) {
1097 // Main loop for interpreting the compiled pattern.
1098 // One iteration of the loop per pattern operation performed.
1102 if (_heapchk() != _HEAPOK
) {
1103 fprintf(stderr
, "Heap Trouble\n");
1106 op
= pat
[fp
->fPatIdx
];
1107 opType
= URX_TYPE(op
);
1108 opValue
= URX_VAL(op
);
1109 #ifdef REGEX_RUN_DEBUG
1111 printf("inputIdx=%d inputChar=%c sp=%3d ", fp
->fInputIdx
,
1112 fInput
->char32At(fp
->fInputIdx
), (int32_t *)fp
-fStack
->getBuffer());
1113 fPattern
->dumpOp(fp
->fPatIdx
);
1126 // Force a backtrack. In some circumstances, the pattern compiler
1127 // will notice that the pattern can't possibly match anything, and will
1128 // emit one of these at that point.
1129 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1134 if (fp
->fInputIdx
< inputLen
) {
1136 U16_NEXT(inputBuf
, fp
->fInputIdx
, inputLen
, c
);
1141 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1147 // Test input against a literal string.
1148 // Strings require two slots in the compiled pattern, one for the
1149 // offset to the string text, and one for the length.
1150 int32_t stringStartIdx
= opValue
;
1153 op
= pat
[fp
->fPatIdx
]; // Fetch the second operand
1155 opType
= URX_TYPE(op
);
1156 stringLen
= URX_VAL(op
);
1157 U_ASSERT(opType
== URX_STRING_LEN
);
1158 U_ASSERT(stringLen
>= 2);
1160 if (fp
->fInputIdx
+ stringLen
> inputLen
) {
1161 // No match. String is longer than the remaining input text.
1162 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1166 const UChar
* pInp
= inputBuf
+ fp
->fInputIdx
;
1167 const UChar
* pPat
= litText
+stringStartIdx
;
1168 const UChar
* pEnd
= pInp
+ stringLen
;
1170 if (*pInp
== *pPat
) {
1174 // Successful Match.
1175 fp
->fInputIdx
+= stringLen
;
1180 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1191 case URX_STATE_SAVE
:
1192 fp
= StateSave(fp
, opValue
, frameSize
, status
);
1197 // The match loop will exit via this path on a successful match,
1198 // when we reach the end of the pattern.
1202 // Start and End Capture stack frame variables are layout out like this:
1203 // fp->fExtra[opValue] - The start of a completed capture group
1204 // opValue+1 - The end of a completed capture group
1205 // opValue+2 - the start of a capture group whose end
1206 // has not yet been reached (and might not ever be).
1207 case URX_START_CAPTURE
:
1208 U_ASSERT(opValue
>= 0 && opValue
< frameSize
-3);
1209 fp
->fExtra
[opValue
+2] = fp
->fInputIdx
;
1213 case URX_END_CAPTURE
:
1214 U_ASSERT(opValue
>= 0 && opValue
< frameSize
-3);
1215 U_ASSERT(fp
->fExtra
[opValue
+2] >= 0); // Start pos for this group must be set.
1216 fp
->fExtra
[opValue
] = fp
->fExtra
[opValue
+2]; // Tentative start becomes real.
1217 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // End position
1218 U_ASSERT(fp
->fExtra
[opValue
] <= fp
->fExtra
[opValue
+1]);
1222 case URX_DOLLAR
: // $, test for End of line
1223 // or for position before new line at end of input
1224 if (fp
->fInputIdx
< inputLen
-2) {
1225 // We are no where near the end of input. Fail.
1226 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1229 if (fp
->fInputIdx
>= inputLen
) {
1230 // We really are at the end of input. Success.
1233 // If we are positioned just before a new-line that is located at the
1234 // end of input, succeed.
1235 if (fp
->fInputIdx
== inputLen
-1) {
1236 UChar32 c
= fInput
->char32At(fp
->fInputIdx
);
1237 if (c
== 0x0a || c
==0x0d || c
==0x0c || c
==0x85 ||c
==0x2028 || c
==0x2029) {
1238 // If not in the middle of a CR/LF sequence
1239 if ( !(c
==0x0a && fp
->fInputIdx
>0 && inputBuf
[fp
->fInputIdx
-1]==0x0d)) {
1241 // At new-line at end of input. Success
1246 if (fp
->fInputIdx
== inputLen
-2) {
1247 if (fInput
->char32At(fp
->fInputIdx
) == 0x0d && fInput
->char32At(fp
->fInputIdx
+1) == 0x0a) {
1248 break; // At CR/LF at end of input. Success
1252 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1257 case URX_DOLLAR_M
: // $, test for End of line in multi-line mode
1259 if (fp
->fInputIdx
>= inputLen
) {
1260 // We really are at the end of input. Success.
1263 // If we are positioned just before a new-line, succeed.
1264 // It makes no difference where the new-line is within the input.
1265 UChar32 c
= inputBuf
[fp
->fInputIdx
];
1266 if (c
== 0x0a || c
==0x0d || c
==0x0c || c
==0x85 ||c
==0x2028 || c
==0x2029) {
1267 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
1268 if ( !(c
==0x0a && fp
->fInputIdx
>0 && inputBuf
[fp
->fInputIdx
-1]==0x0d)) {
1269 break; // At new-line at end of input. Success
1273 // not at a new line. Fail.
1274 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1279 case URX_CARET
: // ^, test for start of line
1280 if (fp
->fInputIdx
!= 0) {
1281 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1286 case URX_CARET_M
: // ^, test for start of line in mulit-line mode
1288 if (fp
->fInputIdx
== 0) {
1289 // We are at the start input. Success.
1292 // Check whether character just before the current pos is a new-line
1293 // unless we are at the end of input
1294 UChar c
= inputBuf
[fp
->fInputIdx
- 1];
1295 if ((fp
->fInputIdx
< inputLen
) &&
1296 (c
== 0x0a || c
==0x0d || c
==0x0c || c
==0x85 ||c
==0x2028 || c
==0x2029)) {
1297 // It's a new-line. ^ is true. Success.
1300 // Not at the start of a line. Fail.
1301 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1306 case URX_BACKSLASH_B
: // Test for word boundaries
1308 UBool success
= isWordBoundary(fp
->fInputIdx
);
1309 success
^= (opValue
!= 0); // flip sense for \B
1311 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1317 case URX_BACKSLASH_BU
: // Test for word boundaries, Unicode-style
1319 UBool success
= isUWordBoundary(fp
->fInputIdx
);
1320 success
^= (opValue
!= 0); // flip sense for \B
1322 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1328 case URX_BACKSLASH_D
: // Test for decimal digit
1330 if (fp
->fInputIdx
>= inputLen
) {
1331 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1335 UChar32 c
= fInput
->char32At(fp
->fInputIdx
);
1336 int8_t ctype
= u_charType(c
);
1337 UBool success
= (ctype
== U_DECIMAL_DIGIT_NUMBER
);
1338 success
^= (opValue
!= 0); // flip sense for \D
1340 fp
->fInputIdx
= fInput
->moveIndex32(fp
->fInputIdx
, 1);
1342 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1350 case URX_BACKSLASH_G
: // Test for position at end of previous match
1351 if (!((fMatch
&& fp
->fInputIdx
==fMatchEnd
) || fMatch
==FALSE
&& fp
->fInputIdx
==0)) {
1352 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1357 case URX_BACKSLASH_X
:
1358 // Match a Grapheme, as defined by Unicode TR 29.
1359 // Differs slightly from Perl, which consumes combining marks independently
1363 // Fail if at end of input
1364 if (fp
->fInputIdx
>= inputLen
) {
1365 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1369 // Examine (and consume) the current char.
1370 // Dispatch into a little state machine, based on the char.
1372 U16_NEXT(inputBuf
, fp
->fInputIdx
, inputLen
, c
);
1373 UnicodeSet
**sets
= fPattern
->fStaticSets
;
1374 if (sets
[URX_GC_NORMAL
]->contains(c
)) goto GC_Extend
;
1375 if (sets
[URX_GC_CONTROL
]->contains(c
)) goto GC_Control
;
1376 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
1377 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
1378 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
1379 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
1380 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
1386 if (fp
->fInputIdx
>= inputLen
) goto GC_Done
;
1387 U16_NEXT(inputBuf
, fp
->fInputIdx
, inputLen
, c
);
1388 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
1389 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
1390 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
1391 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
1392 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
1396 if (fp
->fInputIdx
>= inputLen
) goto GC_Done
;
1397 U16_NEXT(inputBuf
, fp
->fInputIdx
, inputLen
, c
);
1398 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
1399 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
1400 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
1404 if (fp
->fInputIdx
>= inputLen
) goto GC_Done
;
1405 U16_NEXT(inputBuf
, fp
->fInputIdx
, inputLen
, c
);
1406 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
1407 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
1411 // Combining characters are consumed here
1413 if (fp
->fInputIdx
>= inputLen
) {
1416 U16_GET(inputBuf
, 0, fp
->fInputIdx
, inputLen
, c
);
1417 if (sets
[URX_GC_EXTEND
]->contains(c
) == FALSE
) {
1420 U16_FWD_1(inputBuf
, fp
->fInputIdx
, inputLen
);
1425 // Most control chars stand alone (don't combine with combining chars),
1426 // except for that CR/LF sequence is a single grapheme cluster.
1427 if (c
== 0x0d && fp
->fInputIdx
< inputLen
&& inputBuf
[fp
->fInputIdx
] == 0x0a) {
1438 case URX_BACKSLASH_Z
: // Test for end of line
1439 if (fp
->fInputIdx
< inputLen
) {
1440 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1446 case URX_STATIC_SETREF
:
1448 // Test input character against one of the predefined sets
1449 // (Word Characters, for example)
1450 // The high bit of the op value is a flag for the match polarity.
1451 // 0: success if input char is in set.
1452 // 1: success if input char is not in set.
1453 if (fp
->fInputIdx
>= inputLen
) {
1454 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1458 UBool success
= ((opValue
& URX_NEG_SET
) == URX_NEG_SET
);
1459 opValue
&= ~URX_NEG_SET
;
1460 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
1462 U16_NEXT(inputBuf
, fp
->fInputIdx
, inputLen
, c
);
1464 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
1465 if (s8
->contains(c
)) {
1469 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
1470 if (s
->contains(c
)) {
1475 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1481 case URX_STAT_SETREF_N
:
1483 // Test input character for NOT being a member of one of
1484 // the predefined sets (Word Characters, for example)
1485 if (fp
->fInputIdx
>= inputLen
) {
1486 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1490 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
1492 U16_NEXT(inputBuf
, fp
->fInputIdx
, inputLen
, c
);
1494 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
1495 if (s8
->contains(c
) == FALSE
) {
1499 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
1500 if (s
->contains(c
) == FALSE
) {
1505 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1511 if (fp
->fInputIdx
< inputLen
) {
1512 // There is input left. Pick up one char and test it for set membership.
1514 U16_NEXT(inputBuf
, fp
->fInputIdx
, inputLen
, c
);
1515 U_ASSERT(opValue
> 0 && opValue
< sets
->size());
1517 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
1518 if (s8
->contains(c
)) {
1523 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
1524 if (s
->contains(c
)) {
1525 // The character is in the set. A Match.
1530 // Either at end of input, or the character wasn't in the set.
1531 // Either way, we need to back track out.
1532 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1538 // . matches anything, but stops at end-of-line.
1539 if (fp
->fInputIdx
>= inputLen
) {
1540 // At end of input. Match failed. Backtrack out.
1541 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1544 // There is input left. Advance over one char, unless we've hit end-of-line
1546 U16_NEXT(inputBuf
, fp
->fInputIdx
, inputLen
, c
);
1547 if (((c
& 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
1548 (c
== 0x0a || c
==0x0d || c
==0x0c || c
==0x85 ||c
==0x2028 || c
==0x2029)) {
1549 // End of line in normal mode. . does not match.
1550 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1557 case URX_DOTANY_ALL
:
1559 // ., in dot-matches-all (including new lines) mode
1560 if (fp
->fInputIdx
>= inputLen
) {
1561 // At end of input. Match failed. Backtrack out.
1562 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1565 // There is input left. Advance over one char, except if we are
1566 // at a cr/lf, advance over both of them.
1568 U16_NEXT(inputBuf
, fp
->fInputIdx
, inputLen
, c
);
1570 // In the case of a CR/LF, we need to advance over both.
1571 UChar nextc
= inputBuf
[fp
->fInputIdx
];
1572 if (nextc
== 0x0a) {
1580 // Match all up to and end-of-line or end-of-input.
1582 // Fail if input already exhausted.
1583 if (fp
->fInputIdx
>= inputLen
) {
1584 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1588 // There is input left. Fail if we are at the end of a line.
1590 U16_NEXT(inputBuf
, fp
->fInputIdx
, inputLen
, c
);
1591 if (((c
& 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
1592 (c
== 0x0a || c
==0x0d || c
==0x0c || c
==0x85 ||c
==0x2028 || c
==0x2029)) {
1593 // End of line in normal mode. . does not match.
1594 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1598 // There was input left. Consume it until we hit the end of a line,
1599 // or until it's exhausted.
1600 while (fp
->fInputIdx
< inputLen
) {
1601 U16_NEXT(inputBuf
, fp
->fInputIdx
, inputLen
, c
);
1602 if (((c
& 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
1603 (c
== 0x0a || c
==0x0d || c
==0x0c || c
==0x85 ||c
==0x2028 || c
==0x2029)) {
1604 U16_BACK_1(inputBuf
, 0, fp
->fInputIdx
)
1605 // Scan has reached a line-end. We are done.
1612 case URX_DOTANY_ALL_PL
:
1614 // Match up to end of input. Fail if already at end of input.
1615 if (fp
->fInputIdx
>= inputLen
) {
1616 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1618 fp
->fInputIdx
= inputLen
;
1625 fp
->fPatIdx
= opValue
;
1633 U_ASSERT(opValue
< fPattern
->fCompiledPat
->size());
1634 fp
= StateSave(fp
, fp
->fPatIdx
, frameSize
, status
); // State save to loc following current
1635 fp
->fPatIdx
= opValue
; // Then JMP.
1639 // This opcode is used with (x)+, when x can match a zero length string.
1640 // Same as JMP_SAV, except conditional on the match having made forward progress.
1641 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
1642 // data address of the input position at the start of the loop.
1644 U_ASSERT(opValue
> 0 && opValue
< fPattern
->fCompiledPat
->size());
1645 int32_t stoOp
= pat
[opValue
-1];
1646 U_ASSERT(URX_TYPE(stoOp
) == URX_STO_INP_LOC
);
1647 int32_t frameLoc
= URX_VAL(stoOp
);
1648 U_ASSERT(frameLoc
>= 0 && frameLoc
< frameSize
);
1649 int32_t prevInputIdx
= fp
->fExtra
[frameLoc
];
1650 U_ASSERT(prevInputIdx
<= fp
->fInputIdx
);
1651 if (prevInputIdx
< fp
->fInputIdx
) {
1652 // The match did make progress. Repeat the loop.
1653 fp
= StateSave(fp
, fp
->fPatIdx
, frameSize
, status
); // State save to loc following current
1654 fp
->fPatIdx
= opValue
;
1655 fp
->fExtra
[frameLoc
] = fp
->fInputIdx
;
1657 // If the input position did not advance, we do nothing here,
1658 // execution will fall out of the loop.
1664 U_ASSERT(opValue
>= 0 && opValue
< frameSize
-2);
1665 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
1667 // Pick up the three extra operands that CTR_INIT has, and
1668 // skip the pattern location counter past
1669 int32_t instrOperandLoc
= fp
->fPatIdx
;
1671 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
1672 int32_t minCount
= pat
[instrOperandLoc
+1];
1673 int32_t maxCount
= pat
[instrOperandLoc
+2];
1674 U_ASSERT(minCount
>=0);
1675 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
1676 U_ASSERT(loopLoc
>fp
->fPatIdx
);
1678 if (minCount
== 0) {
1679 fp
= StateSave(fp
, loopLoc
+1, frameSize
, status
);
1681 if (maxCount
== 0) {
1682 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1689 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
1690 int32_t initOp
= pat
[opValue
];
1691 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT
);
1692 int32_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
1693 int32_t minCount
= pat
[opValue
+2];
1694 int32_t maxCount
= pat
[opValue
+3];
1695 // Increment the counter. Note: we're not worrying about counter
1696 // overflow, since the data comes from UnicodeStrings, which
1697 // stores its length in an int32_t.
1699 U_ASSERT(*pCounter
> 0);
1700 if ((uint32_t)*pCounter
>= (uint32_t)maxCount
) {
1701 U_ASSERT(*pCounter
== maxCount
|| maxCount
== -1);
1704 if (*pCounter
>= minCount
) {
1705 fp
= StateSave(fp
, fp
->fPatIdx
, frameSize
, status
);
1707 fp
->fPatIdx
= opValue
+ 4; // Loop back.
1711 case URX_CTR_INIT_NG
:
1713 U_ASSERT(opValue
>= 0 && opValue
< frameSize
-2);
1714 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
1716 // Pick up the three extra operands that CTR_INIT has, and
1717 // skip the pattern location counter past
1718 int32_t instrOperandLoc
= fp
->fPatIdx
;
1720 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
1721 int32_t minCount
= pat
[instrOperandLoc
+1];
1722 int32_t maxCount
= pat
[instrOperandLoc
+2];
1723 U_ASSERT(minCount
>=0);
1724 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
1725 U_ASSERT(loopLoc
>fp
->fPatIdx
);
1727 if (minCount
== 0) {
1728 if (maxCount
!= 0) {
1729 fp
= StateSave(fp
, fp
->fPatIdx
, frameSize
, status
);
1731 fp
->fPatIdx
= loopLoc
+1; // Continue with stuff after repeated block
1736 case URX_CTR_LOOP_NG
:
1738 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
1739 int32_t initOp
= pat
[opValue
];
1740 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT_NG
);
1741 int32_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
1742 int32_t minCount
= pat
[opValue
+2];
1743 int32_t maxCount
= pat
[opValue
+3];
1744 // Increment the counter. Note: we're not worrying about counter
1745 // overflow, since the data comes from UnicodeStrings, which
1746 // stores its length in an int32_t.
1748 U_ASSERT(*pCounter
> 0);
1750 if ((uint32_t)*pCounter
>= (uint32_t)maxCount
) {
1751 // The loop has matched the maximum permitted number of times.
1752 // Break out of here with no action. Matching will
1753 // continue with the following pattern.
1754 U_ASSERT(*pCounter
== maxCount
|| maxCount
== -1);
1758 if (*pCounter
< minCount
) {
1759 // We haven't met the minimum number of matches yet.
1760 // Loop back for another one.
1761 fp
->fPatIdx
= opValue
+ 4; // Loop back.
1763 // We do have the minimum number of matches.
1764 // Fall into the following pattern, but first do
1765 // a state save to the top of the loop, so that a failure
1766 // in the following pattern will try another iteration of the loop.
1767 fp
= StateSave(fp
, opValue
+ 4, frameSize
, status
);
1772 // TODO: Possessive flavor of loop ops, or take them out if no longer needed.
1775 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
1776 fData
[opValue
] = fStack
->size();
1781 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
1782 int32_t newStackSize
= fData
[opValue
];
1783 U_ASSERT(newStackSize
<= fStack
->size());
1784 int32_t *newFP
= fStack
->getBuffer() + newStackSize
- frameSize
;
1785 if (newFP
== (int32_t *)fp
) {
1789 for (i
=0; i
<frameSize
; i
++) {
1790 newFP
[i
] = ((int32_t *)fp
)[i
];
1792 fp
= (REStackFrame
*)newFP
;
1793 fStack
->setSize(newStackSize
);
1800 U_ASSERT(opValue
< frameSize
);
1801 int32_t groupStartIdx
= fp
->fExtra
[opValue
];
1802 int32_t groupEndIdx
= fp
->fExtra
[opValue
+1];
1803 U_ASSERT(groupStartIdx
<= groupEndIdx
);
1804 int32_t len
= groupEndIdx
-groupStartIdx
;
1805 if (groupStartIdx
< 0) {
1806 // This capture group has not participated in the match thus far,
1807 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
); // FAIL, no match.
1811 // The capture group match was of an empty string.
1812 // Verified by testing: Perl matches succeed in this case, so
1817 UBool haveMatch
= FALSE
;
1818 if (fp
->fInputIdx
+ len
<= inputLen
) {
1819 if (opType
== URX_BACKREF
) {
1820 if (u_strncmp(inputBuf
+groupStartIdx
, inputBuf
+fp
->fInputIdx
, len
) == 0) {
1824 if (u_strncasecmp(inputBuf
+groupStartIdx
, inputBuf
+fp
->fInputIdx
,
1825 len
, U_FOLD_CASE_DEFAULT
) == 0) {
1831 fp
->fInputIdx
+= len
; // Match. Advance current input position.
1833 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
); // FAIL, no match.
1838 case URX_STO_INP_LOC
:
1840 U_ASSERT(opValue
>= 0 && opValue
< frameSize
);
1841 fp
->fExtra
[opValue
] = fp
->fInputIdx
;
1847 int32_t instrOperandLoc
= fp
->fPatIdx
;
1849 int32_t dataLoc
= URX_VAL(pat
[instrOperandLoc
]);
1850 U_ASSERT(dataLoc
>= 0 && dataLoc
< frameSize
);
1851 int32_t savedInputIdx
= fp
->fExtra
[dataLoc
];
1852 U_ASSERT(savedInputIdx
<= fp
->fInputIdx
);
1853 if (savedInputIdx
< fp
->fInputIdx
) {
1854 fp
->fPatIdx
= opValue
; // JMP
1856 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
); // FAIL, no progress in loop.
1863 // Entering a lookahead block.
1864 // Save Stack Ptr, Input Pos.
1865 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
1866 fData
[opValue
] = fStack
->size();
1867 fData
[opValue
+1] = fp
->fInputIdx
;
1873 // Leaving a look-ahead block.
1874 // restore Stack Ptr, Input Pos to positions they had on entry to block.
1875 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
1876 int32_t stackSize
= fStack
->size();
1877 int32_t newStackSize
= fData
[opValue
];
1878 U_ASSERT(stackSize
>= newStackSize
);
1879 if (stackSize
> newStackSize
) {
1880 int32_t *newFP
= fStack
->getBuffer() + newStackSize
- frameSize
;
1882 for (i
=0; i
<frameSize
; i
++) {
1883 newFP
[i
] = ((int32_t *)fp
)[i
];
1885 fp
= (REStackFrame
*)newFP
;
1886 fStack
->setSize(newStackSize
);
1888 fp
->fInputIdx
= fData
[opValue
+1];
1893 if (fp
->fInputIdx
< inputLen
) {
1895 U16_NEXT(inputBuf
, fp
->fInputIdx
, inputLen
, c
);
1896 if (u_foldCase(c
, U_FOLD_CASE_DEFAULT
) == opValue
) {
1900 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1905 // Test input against a literal string.
1906 // Strings require two slots in the compiled pattern, one for the
1907 // offset to the string text, and one for the length.
1908 int32_t stringStartIdx
, stringLen
;
1909 stringStartIdx
= opValue
;
1911 op
= pat
[fp
->fPatIdx
];
1913 opType
= URX_TYPE(op
);
1914 opValue
= URX_VAL(op
);
1915 U_ASSERT(opType
== URX_STRING_LEN
);
1916 stringLen
= opValue
;
1918 int32_t stringEndIndex
= fp
->fInputIdx
+ stringLen
;
1919 if (stringEndIndex
<= inputLen
) {
1920 if (u_strncasecmp(inputBuf
+fp
->fInputIdx
, litText
+stringStartIdx
,
1921 stringLen
, U_FOLD_CASE_DEFAULT
) == 0) {
1922 // Success. Advance the current input position.
1923 fp
->fInputIdx
= stringEndIndex
;
1927 // No match. Back up matching to a saved state
1928 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1934 // Entering a look-behind block.
1935 // Save Stack Ptr, Input Pos.
1936 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
1937 fData
[opValue
] = fStack
->size();
1938 fData
[opValue
+1] = fp
->fInputIdx
;
1939 // Init the variable containing the start index for attempted matches.
1940 fData
[opValue
+2] = -1;
1941 // Save input string length, then reset to pin any matches to end at
1942 // the current position.
1943 fData
[opValue
+3] = inputLen
;
1944 inputLen
= fp
->fInputIdx
;
1951 // Positive Look-Behind, at top of loop checking for matches of LB expression
1952 // at all possible input starting positions.
1954 // Fetch the min and max possible match lengths. They are the operands
1955 // of this op in the pattern.
1956 int32_t minML
= pat
[fp
->fPatIdx
++];
1957 int32_t maxML
= pat
[fp
->fPatIdx
++];
1958 U_ASSERT(minML
<= maxML
);
1959 U_ASSERT(minML
>= 0);
1961 // Fetch (from data) the last input index where a match was attempted.
1962 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
1963 int32_t *lbStartIdx
= &fData
[opValue
+2];
1964 if (*lbStartIdx
< 0) {
1965 // First time through loop.
1966 *lbStartIdx
= fp
->fInputIdx
- minML
;
1968 // 2nd through nth time through the loop.
1969 // Back up start position for match by one.
1970 if (*lbStartIdx
== 0) {
1971 (*lbStartIdx
)--; // Because U16_BACK is unsafe starting at 0.
1973 U16_BACK_1(inputBuf
, 0, *lbStartIdx
);
1977 if (*lbStartIdx
< 0 || *lbStartIdx
< fp
->fInputIdx
- maxML
) {
1978 // We have tried all potential match starting points without
1979 // getting a match. Backtrack out, and out of the
1980 // Look Behind altogether.
1981 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
1982 int32_t restoreInputLen
= fData
[opValue
+3];
1983 U_ASSERT(restoreInputLen
>= inputLen
);
1984 U_ASSERT(restoreInputLen
<= fInput
->length());
1985 inputLen
= restoreInputLen
;
1989 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
1990 // (successful match will fall off the end of the loop.)
1991 fp
= StateSave(fp
, fp
->fPatIdx
-3, frameSize
, status
);
1992 fp
->fInputIdx
= *lbStartIdx
;
1997 // End of a look-behind block, after a successful match.
1999 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
2000 if (fp
->fInputIdx
!= inputLen
) {
2001 // The look-behind expression matched, but the match did not
2002 // extend all the way to the point that we are looking behind from.
2003 // FAIL out of here, which will take us back to the LB_CONT, which
2004 // will retry the match starting at another position or fail
2005 // the look-behind altogether, whichever is appropriate.
2006 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
2010 // Look-behind match is good. Restore the orignal input string length,
2011 // which had been truncated to pin the end of the lookbehind match to the
2012 // position being looked-behind.
2013 int32_t originalInputLen
= fData
[opValue
+3];
2014 U_ASSERT(originalInputLen
>= inputLen
);
2015 U_ASSERT(originalInputLen
<= fInput
->length());
2016 inputLen
= originalInputLen
;
2023 // Negative Look-Behind, at top of loop checking for matches of LB expression
2024 // at all possible input starting positions.
2026 // Fetch the extra parameters of this op.
2027 int32_t minML
= pat
[fp
->fPatIdx
++];
2028 int32_t maxML
= pat
[fp
->fPatIdx
++];
2029 int32_t continueLoc
= pat
[fp
->fPatIdx
++];
2030 continueLoc
= URX_VAL(continueLoc
);
2031 U_ASSERT(minML
<= maxML
);
2032 U_ASSERT(minML
>= 0);
2033 U_ASSERT(continueLoc
> fp
->fPatIdx
);
2035 // Fetch (from data) the last input index where a match was attempted.
2036 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
2037 int32_t *lbStartIdx
= &fData
[opValue
+2];
2038 if (*lbStartIdx
< 0) {
2039 // First time through loop.
2040 *lbStartIdx
= fp
->fInputIdx
- minML
;
2042 // 2nd through nth time through the loop.
2043 // Back up start position for match by one.
2044 if (*lbStartIdx
== 0) {
2045 (*lbStartIdx
)--; // Because U16_BACK is unsafe starting at 0.
2047 U16_BACK_1(inputBuf
, 0, *lbStartIdx
);
2051 if (*lbStartIdx
< 0 || *lbStartIdx
< fp
->fInputIdx
- maxML
) {
2052 // We have tried all potential match starting points without
2053 // getting a match, which means that the negative lookbehind as
2054 // a whole has succeeded. Jump forward to the continue location
2055 int32_t restoreInputLen
= fData
[opValue
+3];
2056 U_ASSERT(restoreInputLen
>= inputLen
);
2057 U_ASSERT(restoreInputLen
<= fInput
->length());
2058 inputLen
= restoreInputLen
;
2059 fp
->fPatIdx
= continueLoc
;
2063 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
2064 // (successful match will cause a FAIL out of the loop altogether.)
2065 fp
= StateSave(fp
, fp
->fPatIdx
-4, frameSize
, status
);
2066 fp
->fInputIdx
= *lbStartIdx
;
2071 // End of a negative look-behind block, after a successful match.
2073 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
2074 if (fp
->fInputIdx
!= inputLen
) {
2075 // The look-behind expression matched, but the match did not
2076 // extend all the way to the point that we are looking behind from.
2077 // FAIL out of here, which will take us back to the LB_CONT, which
2078 // will retry the match starting at another position or succeed
2079 // the look-behind altogether, whichever is appropriate.
2080 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
2084 // Look-behind expression matched, which means look-behind test as
2087 // Restore the orignal input string length, which had been truncated
2088 // inorder to pin the end of the lookbehind match
2089 // to the position being looked-behind.
2090 int32_t originalInputLen
= fData
[opValue
+3];
2091 U_ASSERT(originalInputLen
>= inputLen
);
2092 U_ASSERT(originalInputLen
<= fInput
->length());
2093 inputLen
= originalInputLen
;
2095 // Restore original stack position, discarding any state saved
2096 // by the successful pattern match.
2097 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
2098 int32_t newStackSize
= fData
[opValue
];
2099 U_ASSERT(fStack
->size() > newStackSize
);
2100 fStack
->setSize(newStackSize
);
2102 // FAIL, which will take control back to someplace
2103 // prior to entering the look-behind test.
2104 fp
= (REStackFrame
*)fStack
->popFrame(frameSize
);
2110 // Loop Initialization for the optimized implementation of
2111 // [some character set]*
2112 // This op scans through all matching input.
2113 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
2115 U_ASSERT(opValue
> 0 && opValue
< sets
->size());
2116 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
2117 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
2119 // Loop through input, until either the input is exhausted or
2120 // we reach a character that is not a member of the set.
2121 int32_t ix
= fp
->fInputIdx
;
2123 if (ix
>= inputLen
) {
2127 U16_NEXT(inputBuf
, ix
, inputLen
, c
);
2129 if (s8
->contains(c
) == FALSE
) {
2130 U16_BACK_1(inputBuf
, 0, ix
);
2134 if (s
->contains(c
) == FALSE
) {
2135 U16_BACK_1(inputBuf
, 0, ix
);
2141 // If there were no matching characters, skip over the loop altogether.
2142 // The loop doesn't run at all, a * op always succeeds.
2143 if (ix
== fp
->fInputIdx
) {
2144 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
2148 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
2149 // must follow. It's operand is the stack location
2150 // that holds the starting input index for the match of this [set]*
2151 int32_t loopcOp
= pat
[fp
->fPatIdx
];
2152 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
2153 int32_t stackLoc
= URX_VAL(loopcOp
);
2154 U_ASSERT(stackLoc
>= 0 && stackLoc
< frameSize
);
2155 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
2158 // Save State to the URX_LOOP_C op that follows this one,
2159 // so that match failures in the following code will return to there.
2160 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
2161 fp
= StateSave(fp
, fp
->fPatIdx
, frameSize
, status
);
2167 case URX_LOOP_DOT_I
:
2168 // Loop Initialization for the optimized implementation of .*
2169 // This op scans through all remaining input.
2170 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
2172 // Loop through input until the input is exhausted (we reach an end-of-line)
2173 // In multi-line mode, we can just go straight to the end of the input.
2179 // NOT multi-line mode. Line endings do not match '.'
2180 // Scan forward until a line ending or end of input.
2183 if (ix
>= inputLen
) {
2188 U16_NEXT(inputBuf
, ix
, inputLen
, c
); // c = inputBuf[ix++]
2189 if (((c
& 0x7f) <= 0x29) &&
2190 (c
== 0x0a || c
==0x0d || c
==0x0c || c
==0x85 ||c
==0x2028 || c
==0x2029)) {
2191 // char is a line ending. Put the input pos back to the
2192 // line ending char, and exit the scanning loop.
2193 U16_BACK_1(inputBuf
, 0, ix
);
2199 // If there were no matching characters, skip over the loop altogether.
2200 // The loop doesn't run at all, a * op always succeeds.
2201 if (ix
== fp
->fInputIdx
) {
2202 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
2206 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
2207 // must follow. It's operand is the stack location
2208 // that holds the starting input index for the match of this [set]*
2209 int32_t loopcOp
= pat
[fp
->fPatIdx
];
2210 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
2211 int32_t stackLoc
= URX_VAL(loopcOp
);
2212 U_ASSERT(stackLoc
>= 0 && stackLoc
< frameSize
);
2213 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
2216 // Save State to the URX_LOOP_C op that follows this one,
2217 // so that match failures in the following code will return to there.
2218 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
2219 fp
= StateSave(fp
, fp
->fPatIdx
, frameSize
, status
);
2227 U_ASSERT(opValue
>=0 && opValue
<frameSize
);
2228 int32_t terminalIdx
= fp
->fExtra
[opValue
];
2229 U_ASSERT(terminalIdx
<= fp
->fInputIdx
);
2230 if (terminalIdx
== fp
->fInputIdx
) {
2231 // We've backed up the input idx to the point that the loop started.
2232 // The loop is done. Leave here without saving state.
2233 // Subsequent failures won't come back here.
2236 // Set up for the next iteration of the loop, with input index
2237 // backed up by one from the last time through,
2238 // and a state save to this instruction in case the following code fails again.
2239 // (We're going backwards because this loop emulates stack unwinding, not
2240 // the initial scan forward.)
2241 U_ASSERT(fp
->fInputIdx
> 0);
2242 U16_BACK_1(inputBuf
, 0, fp
->fInputIdx
);
2243 if (inputBuf
[fp
->fInputIdx
] == 0x0a &&
2244 fp
->fInputIdx
> terminalIdx
&&
2245 inputBuf
[fp
->fInputIdx
-1] == 0x0d) {
2246 int32_t prevOp
= pat
[fp
->fPatIdx
-2];
2247 if (URX_TYPE(prevOp
) == URX_LOOP_DOT_I
) {
2248 // .*, stepping back over CRLF pair.
2254 fp
= StateSave(fp
, fp
->fPatIdx
-1, frameSize
, status
);
2261 // Trouble. The compiled pattern contains an entry with an
2262 // unrecognized type tag.
2266 if (U_FAILURE(status
)) {
2274 fLastMatchEnd
= fMatchEnd
;
2275 fMatchStart
= startIdx
;
2276 fMatchEnd
= fp
->fInputIdx
;
2278 REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart
, fMatchEnd
));
2284 REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
2288 fFrame
= fp
; // The active stack frame when the engine stopped.
2289 // Contains the capture group results that we need to
2297 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher
)
2301 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS