2 **************************************************************************
3 * Copyright (C) 2002-2012 International Business Machines Corporation *
4 * and others. All rights reserved. *
5 **************************************************************************
10 // Contains the implementation of class RegexMatcher,
11 // which is one of the main API classes for the ICU regular expression package.
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
17 #include "unicode/regex.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ustring.h"
21 #include "unicode/rbbi.h"
22 #include "unicode/utf.h"
23 #include "unicode/utf16.h"
34 // #include <malloc.h> // Needed for heapcheck testing
37 // Find progress callback
38 // ----------------------
39 // Macro to inline test & call to ReportFindProgress(). Eliminates unnecessary function call.
// Expands to a boolean expression that is true only when a find-progress
// callback is installed (fFindProgressCallbackFn != NULL) and
// ReportFindProgress(pos, status) returned FALSE. The NULL test comes first,
// so ReportFindProgress() is not called when no callback is set.
// NOTE(review): the expansion has no enclosing parentheses; it is only safe
// when used as a complete condition, as in the find() loops below.
41 #define REGEXFINDPROGRESS_INTERRUPT(pos, status) \
42 (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE)
47 // When a failure would go back to a LOOP_C instruction,
48 // strings, characters, and setrefs scan backwards for a valid start
49 // character themselves, pop the stack, and save state, emulating the
50 // LOOP_C's effect but assured that the next character of input is a
51 // possible matching character.
53 // Good idea in theory; unfortunately it only helps out a few specific
54 // cases and slows the engine down a little in the rest.
58 // Default limit for the size of the back track stack, to avoid system
59 // failures caused by heap exhaustion. Units are in 32 bit words, not bytes.
60 // This value puts ICU's limits higher than most other regexp implementations,
61 // which use recursion rather than the heap, and take more storage per
// This constant is used as the initial fStackLimit in init() and is passed
// to setStackLimit() in init2().
64 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY
= 8000000;
66 // Time limit counter constant.
67 // Time limits for expression evaluation are in terms of quanta of work by
68 // the engine, each of which is 10,000 state saves.
69 // This constant sets the number of state saves per tick.
70 static const int32_t TIMER_INITIAL_VALUE
= 10000;
72 //-----------------------------------------------------------------------------
74 // Constructor and Destructor
76 //-----------------------------------------------------------------------------
// Constructs a matcher for an already-compiled pattern `pat`.
// This overload has no UErrorCode parameter, so all failures are recorded in
// fDeferredStatus: it is cleared, handed to init(), and finally handed to
// init2() together with the shared empty text
// (RegexStaticSets::gStaticSets->fEmptyText) as the initial input argument.
77 RegexMatcher::RegexMatcher(const RegexPattern
*pat
) {
78 fDeferredStatus
= U_ZERO_ERROR
;
79 init(fDeferredStatus
);
80 if (U_FAILURE(fDeferredStatus
)) {
// NOTE(review): the condition guarding this U_ILLEGAL_ARGUMENT_ERROR
// assignment is not visible here; presumably it rejects a NULL `pat` - confirm.
84 fDeferredStatus
= U_ILLEGAL_ARGUMENT_ERROR
;
88 init2(RegexStaticSets::gStaticSets
->fEmptyText
, fDeferredStatus
);
// Constructs a matcher from a pattern string and an input string.
//   regexp  pattern source, compiled with RegexPattern::compile()
//   input   the text to be matched
//   flags   pattern flags forwarded to RegexPattern::compile()
//   status  in/out error code; checked on entry and updated by the calls below
// The compiled pattern is stored in fPatternOwned (the destructor deletes
// fPatternOwned) and aliased by fPattern.
93 RegexMatcher::RegexMatcher(const UnicodeString
®exp
, const UnicodeString
&input
,
94 uint32_t flags
, UErrorCode
&status
) {
96 if (U_FAILURE(status
)) {
100 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
101 fPattern
= fPatternOwned
;
// Wrap `input` in a stack-local, read-only UText just long enough to hand it
// to init2(); the local wrapper is closed immediately afterwards.
// NOTE(review): this implies init2() keeps its own copy/clone of the text;
// that code is not visible in init2() here - confirm.
103 UText inputText
= UTEXT_INITIALIZER
;
104 utext_openConstUnicodeString(&inputText
, &input
, &status
);
105 init2(&inputText
, status
);
106 utext_close(&inputText
);
// init() sets this flag to FALSE; only this constructor sets it to TRUE.
// NOTE(review): presumably marks that the caller-owned UnicodeString backing
// the input could change behind the matcher - confirm against its readers.
108 fInputUniStrMaybeMutable
= TRUE
;
// Constructs a matcher from a pattern and an input that are both supplied as
// UText.
//   regexp  pattern source, compiled with RegexPattern::compile()
//   input   the text to be matched, passed straight to init2()
//   flags   pattern flags forwarded to RegexPattern::compile()
//   status  in/out error code; tested on entry and again after compilation
// The compiled pattern is stored in fPatternOwned and aliased by fPattern.
112 RegexMatcher::RegexMatcher(UText
*regexp
, UText
*input
,
113 uint32_t flags
, UErrorCode
&status
) {
115 if (U_FAILURE(status
)) {
119 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
// Compilation failure is checked before fPattern is set and init2() is called.
120 if (U_FAILURE(status
)) {
124 fPattern
= fPatternOwned
;
125 init2(input
, status
);
// Constructs a matcher from a pattern string only; no input is supplied.
//   regexp  pattern source, compiled with RegexPattern::compile()
//   flags   pattern flags forwarded to RegexPattern::compile()
//   status  in/out error code; tested on entry and again after compilation
// init2() receives the shared empty text
// (RegexStaticSets::gStaticSets->fEmptyText) as the initial input argument.
129 RegexMatcher::RegexMatcher(const UnicodeString
®exp
,
130 uint32_t flags
, UErrorCode
&status
) {
132 if (U_FAILURE(status
)) {
136 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
137 if (U_FAILURE(status
)) {
// The matcher owns the pattern it compiled; fPattern aliases fPatternOwned.
140 fPattern
= fPatternOwned
;
141 init2(RegexStaticSets::gStaticSets
->fEmptyText
, status
);
// Constructs a matcher from a UText pattern only; no input is supplied.
//   regexp  pattern source, compiled with RegexPattern::compile()
//   flags   pattern flags forwarded to RegexPattern::compile()
//   status  in/out error code; tested on entry and again after compilation
// init2() receives the shared empty text
// (RegexStaticSets::gStaticSets->fEmptyText) as the initial input argument.
144 RegexMatcher::RegexMatcher(UText
*regexp
,
145 uint32_t flags
, UErrorCode
&status
) {
147 if (U_FAILURE(status
)) {
151 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
152 if (U_FAILURE(status
)) {
// The matcher owns the pattern it compiled; fPattern aliases fPatternOwned.
156 fPattern
= fPatternOwned
;
157 init2(RegexStaticSets::gStaticSets
->fEmptyText
, status
);
// Destructor: releases what the matcher owns - the compiled pattern in
// fPatternOwned, the fInputText and fAltInputText UTexts, and (when break
// iteration is configured in) the word break iterator.
163 RegexMatcher::~RegexMatcher() {
// fData differs from the inline fSmallData buffer only when init2()
// allocated it with uprv_malloc() for a pattern with a large fDataSize.
// NOTE(review): the release call inside this branch is not visible here.
165 if (fData
!= fSmallData
) {
// fPatternOwned is NULL (set by init()) unless a constructor compiled the
// pattern itself.
170 delete fPatternOwned
;
171 fPatternOwned
= NULL
;
179 utext_close(fInputText
);
182 utext_close(fAltInputText
);
185 #if UCONFIG_NO_BREAK_ITERATION==0
186 delete fWordBreakItr
;
191 // init() common initialization for use by all constructors.
192 // Initialize all fields, get the object into a consistent state.
193 // This must be done even when the initial status shows an error,
194 // so that the object is initialized sufficiently well for the destructor
// Puts the member fields into a known default state: owned-pattern, callback,
// word-break-iterator and alternate-input pointers become NULL, bounds flags
// and the stack limit get their defaults, and the incoming `status` is copied
// into fDeferredStatus.
//   status  error code from the caller; read here and stored in fDeferredStatus
197 void RegexMatcher::init(UErrorCode
&status
) {
199 fPatternOwned
= NULL
;
// Default region behavior: non-transparent, anchoring bounds.
209 fTransparentBounds
= FALSE
;
210 fAnchoringBounds
= TRUE
;
223 fStackLimit
= DEFAULT_BACKTRACK_STACK_CAPACITY
;
225 fCallbackContext
= NULL
;
226 fFindProgressCallbackFn
= NULL
;
227 fFindProgressCallbackContext
= NULL
;
229 fDeferredStatus
= status
;
231 fWordBreakItr
= NULL
;
235 fAltInputText
= NULL
;
238 fInputUniStrMaybeMutable
= FALSE
;
// A failure in `status` is copied into fDeferredStatus again here.
240 if (U_FAILURE(status
)) {
241 fDeferredStatus
= status
;
246 // init2() Common initialization for use by RegexMatcher constructors, part 2.
247 // This handles the common setup to be done after the Pattern is available.
// Second-stage setup that needs fPattern: sizes the fData array, creates the
// backtrack stack, and applies the default stack limit.
//   input   initial input text supplied by the constructor
//   status  in/out error code; every failure is also copied to fDeferredStatus
249 void RegexMatcher::init2(UText
*input
, UErrorCode
&status
) {
250 if (U_FAILURE(status
)) {
251 fDeferredStatus
= status
;
// Heap-allocate fData only when the pattern needs more int64_t slots than the
// inline fSmallData array provides.
255 if (fPattern
->fDataSize
> (int32_t)(sizeof(fSmallData
)/sizeof(fSmallData
[0]))) {
256 fData
= (int64_t *)uprv_malloc(fPattern
->fDataSize
* sizeof(int64_t));
// NOTE(review): the NULL test guarding this error assignment is not visible here.
258 status
= fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
263 fStack
= new UVector64(status
);
264 if (fStack
== NULL
) {
265 status
= fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
270 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY
, status
);
271 if (U_FAILURE(status
)) {
272 fDeferredStatus
= status
;
// Code units for '\' (U+005C) and '$' (U+0024). appendReplacement() compares
// each replacement character against these to detect escapes and group
// references.
278 static const UChar BACKSLASH
= 0x5c;
279 static const UChar DOLLARSIGN
= 0x24;
280 //--------------------------------------------------------------------------------
284 //--------------------------------------------------------------------------------
// appendReplacement, UnicodeString mode.
// Wraps `replacement` (read-only) and `dest` (writable) in stack-local UTexts
// and forwards to the UText overload of appendReplacement().
//   dest         string that receives the appended output
//   replacement  replacement text
//   status       in/out error code; each step runs only if the previous one succeeded
// Returns a RegexMatcher reference (the return statement is not visible here).
285 RegexMatcher
&RegexMatcher::appendReplacement(UnicodeString
&dest
,
286 const UnicodeString
&replacement
,
287 UErrorCode
&status
) {
288 UText replacementText
= UTEXT_INITIALIZER
;
290 utext_openConstUnicodeString(&replacementText
, &replacement
, &status
);
291 if (U_SUCCESS(status
)) {
292 UText resultText
= UTEXT_INITIALIZER
;
293 utext_openUnicodeString(&resultText
, &dest
, &status
);
295 if (U_SUCCESS(status
)) {
296 appendReplacement(&resultText
, &replacementText
, status
);
297 utext_close(&resultText
);
299 utext_close(&replacementText
);
306 // appendReplacement, UText mode
// appendReplacement, UText mode.
// Two phases, both appending at the end of `dest`:
//   1. copy the input between fAppendPosition and fMatchStart, then move
//      fAppendPosition to fMatchEnd;
//   2. scan the replacement text one code point at a time, expanding
//      backslash escapes (including \u / \U sequences) and $n capture-group
//      references, and copying every other character through unchanged.
// destLen tracks the append position: it starts as utext_nativeLength(dest)
// and each utext_replace() / appendGroup() return value is added to it.
// Errors: status becomes fDeferredStatus if that holds a failure, and
// U_REGEX_INVALID_STATE when fMatch is FALSE.
308 RegexMatcher
&RegexMatcher::appendReplacement(UText
*dest
,
310 UErrorCode
&status
) {
311 if (U_FAILURE(status
)) {
314 if (U_FAILURE(fDeferredStatus
)) {
315 status
= fDeferredStatus
;
318 if (fMatch
== FALSE
) {
319 status
= U_REGEX_INVALID_STATE
;
323 // Copy input string from the end of previous match to start of current match
324 int64_t destLen
= utext_nativeLength(dest
);
325 if (fMatchStart
> fAppendPosition
) {
// Fast path: the whole input is in the UText chunk buffer, so the span can be
// appended directly from chunkContents.
326 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
327 destLen
+= utext_replace(dest
, destLen
, destLen
, fInputText
->chunkContents
+fAppendPosition
,
328 (int32_t)(fMatchStart
-fAppendPosition
), &status
);
// Otherwise work out the UTF-16 length of the span: directly from the native
// indexes for a UTF-16 UText, else via a zero-capacity utext_extract() call
// whose status goes to the throwaway lengthStatus.
331 if (UTEXT_USES_U16(fInputText
)) {
332 len16
= (int32_t)(fMatchStart
-fAppendPosition
);
334 UErrorCode lengthStatus
= U_ZERO_ERROR
;
335 len16
= utext_extract(fInputText
, fAppendPosition
, fMatchStart
, NULL
, 0, &lengthStatus
);
// Temporary buffer with one extra UChar (len16+1) for the extracted span.
337 UChar
*inputChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(len16
+1));
338 if (inputChars
== NULL
) {
339 status
= U_MEMORY_ALLOCATION_ERROR
;
342 utext_extract(fInputText
, fAppendPosition
, fMatchStart
, inputChars
, len16
+1, &status
);
343 destLen
+= utext_replace(dest
, destLen
, destLen
, inputChars
, len16
, &status
);
344 uprv_free(inputChars
);
347 fAppendPosition
= fMatchEnd
;
350 // scan the replacement text, looking for substitutions ($n) and \escapes.
351 // TODO: optimize this loop by efficiently scanning for '$' or '\',
352 // move entire ranges not containing substitutions.
353 UTEXT_SETNATIVEINDEX(replacement
, 0);
354 UChar32 c
= UTEXT_NEXT32(replacement
);
355 while (c
!= U_SENTINEL
) {
356 if (c
== BACKSLASH
) {
357 // Backslash Escape. Copy the following char out without further checks.
358 // Note: Surrogate pairs don't need any special handling
359 // The second half won't be a '$' or a '\', and
360 // will move to the dest normally on the next
362 c
= UTEXT_CURRENT32(replacement
);
363 if (c
== U_SENTINEL
) {
367 if (c
==0x55/*U*/ || c
==0x75/*u*/) {
368 // We have a \udddd or \Udddddddd escape sequence.
370 struct URegexUTextUnescapeCharContext context
= U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement
);
371 UChar32 escapedChar
= u_unescapeAt(uregex_utext_unescape_charAt
, &offset
, INT32_MAX
, &context
);
// 0xFFFFFFFF is treated as u_unescapeAt()'s failure value. A decoded BMP code
// point is written as one UChar, anything else as a lead/trail surrogate pair.
372 if (escapedChar
!= (UChar32
)0xFFFFFFFF) {
373 if (U_IS_BMP(escapedChar
)) {
374 UChar c16
= (UChar
)escapedChar
;
375 destLen
+= utext_replace(dest
, destLen
, destLen
, &c16
, 1, &status
);
378 surrogate
[0] = U16_LEAD(escapedChar
);
379 surrogate
[1] = U16_TRAIL(escapedChar
);
380 if (U_SUCCESS(status
)) {
381 destLen
+= utext_replace(dest
, destLen
, destLen
, surrogate
, 2, &status
);
384 // TODO: Report errors for mal-formed \u escapes?
385 // As this is, the original sequence is output, which may be OK.
// Re-align the replacement iterator with `offset`, the position u_unescapeAt()
// stopped at, relative to the last offset the callback context read.
386 if (context
.lastOffset
== offset
) {
387 (void)UTEXT_PREVIOUS32(replacement
);
388 } else if (context
.lastOffset
!= offset
-1) {
389 utext_moveIndex32(replacement
, offset
- context
.lastOffset
- 1);
393 (void)UTEXT_NEXT32(replacement
);
394 // Plain backslash escape. Just put out the escaped character.
396 UChar c16
= (UChar
)c
;
397 destLen
+= utext_replace(dest
, destLen
, destLen
, &c16
, 1, &status
);
400 surrogate
[0] = U16_LEAD(c
);
401 surrogate
[1] = U16_TRAIL(c
);
402 if (U_SUCCESS(status
)) {
403 destLen
+= utext_replace(dest
, destLen
, destLen
, surrogate
, 2, &status
);
407 } else if (c
!= DOLLARSIGN
) {
408 // Normal char, not a $. Copy it out without further checks.
410 UChar c16
= (UChar
)c
;
411 destLen
+= utext_replace(dest
, destLen
, destLen
, &c16
, 1, &status
);
414 surrogate
[0] = U16_LEAD(c
);
415 surrogate
[1] = U16_TRAIL(c
);
416 if (U_SUCCESS(status
)) {
417 destLen
+= utext_replace(dest
, destLen
, destLen
, surrogate
, 2, &status
);
421 // We've got a $. Pick up a capture group number if one follows.
422 // Consume at most the number of digits necessary for the largest capture
423 // number that is valid for this pattern.
425 int32_t numDigits
= 0;
426 int32_t groupNum
= 0;
// Digit loop: peek at the next code point, stop at end of text or a non-digit,
// otherwise consume it and fold its value into groupNum (base 10).
429 digitC
= UTEXT_CURRENT32(replacement
);
430 if (digitC
== U_SENTINEL
) {
433 if (u_isdigit(digitC
) == FALSE
) {
436 (void)UTEXT_NEXT32(replacement
);
437 groupNum
=groupNum
*10 + u_charDigitValue(digitC
);
439 if (numDigits
>= fPattern
->fMaxCaptureDigits
) {
445 if (numDigits
== 0) {
446 // The $ didn't introduce a group number at all.
447 // Treat it as just part of the substitution text.
448 UChar c16
= DOLLARSIGN
;
449 destLen
+= utext_replace(dest
, destLen
, destLen
, &c16
, 1, &status
);
451 // Finally, append the capture group data to the destination.
452 destLen
+= appendGroup(groupNum
, dest
, status
);
453 if (U_FAILURE(status
)) {
454 // Can fail if group number is out of range.
460 if (U_FAILURE(status
)) {
463 c
= UTEXT_NEXT32(replacement
);
472 //--------------------------------------------------------------------------------
474 // appendTail Intended to be used in conjunction with appendReplacement()
475 // To the destination string, append everything following
476 // the last match position from the input string.
478 // Note: Match ranges do not affect appendTail or appendReplacement
480 //--------------------------------------------------------------------------------
// appendTail, UnicodeString mode.
// Wraps `dest` in a stack-local writable UText and forwards to the UText
// overload of appendTail().
//   dest  string that receives the appended tail
// This overload takes no UErrorCode; it uses a local `status`, so errors from
// the UText overload are not visible to the caller through a parameter.
481 UnicodeString
&RegexMatcher::appendTail(UnicodeString
&dest
) {
482 UErrorCode status
= U_ZERO_ERROR
;
483 UText resultText
= UTEXT_INITIALIZER
;
484 utext_openUnicodeString(&resultText
, &dest
, &status
);
486 if (U_SUCCESS(status
)) {
487 appendTail(&resultText
, status
);
488 utext_close(&resultText
);
495 // appendTail, UText mode
// appendTail, UText mode.
// Appends the input from fAppendPosition to fInputLength to the end of `dest`.
//   dest    destination UText; must not be NULL (see comment below)
//   status  in/out error code; set to fDeferredStatus when that holds a failure
// Returns a UText pointer (the return statement is not visible here).
497 UText
*RegexMatcher::appendTail(UText
*dest
, UErrorCode
&status
) {
498 UBool bailOut
= FALSE
;
499 if (U_FAILURE(status
)) {
502 if (U_FAILURE(fDeferredStatus
)) {
503 status
= fDeferredStatus
;
508 // dest must not be NULL
// Zero-length replace at the end of dest: inserts nothing.
// NOTE(review): the condition selecting this call is not visible; it
// presumably belongs to the bail-out path - confirm.
510 utext_replace(dest
, utext_nativeLength(dest
), utext_nativeLength(dest
), NULL
, 0, &status
);
515 if (fInputLength
> fAppendPosition
) {
// Fast path: the whole input is in the chunk buffer, so append straight from
// chunkContents.
516 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
517 int64_t destLen
= utext_nativeLength(dest
);
518 utext_replace(dest
, destLen
, destLen
, fInputText
->chunkContents
+fAppendPosition
,
519 (int32_t)(fInputLength
-fAppendPosition
), &status
);
522 if (UTEXT_USES_U16(fInputText
)) {
523 len16
= (int32_t)(fInputLength
-fAppendPosition
);
// Length-only extract (NULL buffer, capacity 0); status is cleared afterwards
// because this call reports a buffer overflow.
525 len16
= utext_extract(fInputText
, fAppendPosition
, fInputLength
, NULL
, 0, &status
);
526 status
= U_ZERO_ERROR
; // buffer overflow
// The temporary buffer holds exactly len16 UChars, with no room for a terminator.
529 UChar
*inputChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(len16
));
530 if (inputChars
== NULL
) {
// Allocation failure is recorded in fDeferredStatus here, not in `status`.
531 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
533 utext_extract(fInputText
, fAppendPosition
, fInputLength
, inputChars
, len16
, &status
); // unterminated
534 int64_t destLen
= utext_nativeLength(dest
);
535 utext_replace(dest
, destLen
, destLen
, inputChars
, len16
, &status
);
536 uprv_free(inputChars
);
545 //--------------------------------------------------------------------------------
549 //--------------------------------------------------------------------------------
// end(err): 32-bit overload taking only an error code.
// NOTE(review): the body is not visible here; presumably it reports the end of
// the whole match (group 0), like end64(err) below - confirm.
550 int32_t RegexMatcher::end(UErrorCode
&err
) const {
// end64(err): end position of the whole match as a 64-bit value.
// Forwards to end64(group, err) with group 0.
554 int64_t RegexMatcher::end64(UErrorCode
&err
) const {
555 return end64(0, err
);
// end64(group, err): end position of capture group `group` as a 64-bit value.
//   group  capture group number; must be in 0..fPattern->fGroupMap->size()
//   err    in/out error code:
//            U_REGEX_INVALID_STATE      when fMatch is FALSE
//            U_INDEX_OUTOFBOUNDS_ERROR  when `group` is out of range
558 int64_t RegexMatcher::end64(int32_t group
, UErrorCode
&err
) const {
559 if (U_FAILURE(err
)) {
562 if (fMatch
== FALSE
) {
563 err
= U_REGEX_INVALID_STATE
;
566 if (group
< 0 || group
> fPattern
->fGroupMap
->size()) {
567 err
= U_INDEX_OUTOFBOUNDS_ERROR
;
574 // Get the position within the stack frame of the variables for
575 // this capture group.
// fGroupMap is indexed by group-1 (group 0 has no entry, which is consistent
// with the `group > size()` bound above); the end value is the slot at
// groupOffset + 1 in the frame's fExtra array.
576 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(group
-1);
577 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
578 U_ASSERT(groupOffset
>= 0);
579 e
= fFrame
->fExtra
[groupOffset
+ 1];
// end(group, err): 32-bit variant of end64(group, err).
// The 64-bit result is narrowed with a plain (int32_t) cast; no range check is
// performed here.
585 int32_t RegexMatcher::end(int32_t group
, UErrorCode
&err
) const {
586 return (int32_t)end64(group
, err
);
590 //--------------------------------------------------------------------------------
594 //--------------------------------------------------------------------------------
// find(): searches for the next match, starting from the end of the previous
// match (fMatchEnd).
// When the whole input is in the UText chunk buffer the work is delegated to
// findUsingChunk(). Otherwise the start position is advanced through the
// UText and MatchAt() is tried at candidate positions chosen by
// fPattern->fStartType. Engine errors are left in fDeferredStatus, and the
// find-progress callback is polled through REGEXFINDPROGRESS_INTERRUPT
// between attempts.
// NOTE(review): the `case` labels, loop heads and return statements of the
// switch are not visible here; the comments inside each section describe them.
595 UBool
RegexMatcher::find() {
596 // Start at the position of the last match end. (Will be zero if the
597 // matcher has been reset.)
599 if (U_FAILURE(fDeferredStatus
)) {
603 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
604 return findUsingChunk();
607 int64_t startPos
= fMatchEnd
;
609 startPos
= fActiveStart
;
613 // Save the position of any previous successful match.
614 fLastMatchEnd
= fMatchEnd
;
616 if (fMatchStart
== fMatchEnd
) {
617 // Previous match had zero length. Move start position up one position
618 // to avoid sending find() into a loop on zero-length matches.
619 if (startPos
>= fActiveLimit
) {
// Advance by one code point using the UText iterator.
624 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
625 (void)UTEXT_NEXT32(fInputText
);
626 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
629 if (fLastMatchEnd
>= 0) {
630 // A previous find() failed to match. Don't try again.
631 // (without this test, a pattern with a zero-length match
632 // could match again at the end of an input string.)
639 // Compute the position in the input string beyond which a match can not begin, because
640 // the minimum length match would extend past the end of the input.
641 // Note: some patterns that cannot match anything will have fMinMatchLen==Max Int.
642 // Be aware of possible overflows if making changes here.
643 int64_t testStartLimit
;
644 if (UTEXT_USES_U16(fInputText
)) {
645 testStartLimit
= fActiveLimit
- fPattern
->fMinMatchLen
;
646 if (startPos
> testStartLimit
) {
652 // For now, let the matcher discover that it can't match on its own
653 // We don't know how long the match len is in native characters
654 testStartLimit
= fActiveLimit
;
658 U_ASSERT(startPos
>= 0);
660 switch (fPattern
->fStartType
) {
662 // No optimization was found.
663 // Try a match at each input position.
665 MatchAt(startPos
, FALSE
, fDeferredStatus
);
666 if (U_FAILURE(fDeferredStatus
)) {
672 if (startPos
>= testStartLimit
) {
676 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
677 (void)UTEXT_NEXT32(fInputText
);
678 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
679 // Note that it's perfectly OK for a pattern to have a zero-length
680 // match at the end of a string, so we must make sure that the loop
681 // runs with startPos == testStartLimit the last time through.
682 if (REGEXFINDPROGRESS_INTERRUPT(startPos
, fDeferredStatus
))
688 // Matches are only possible at the start of the input string
689 // (pattern begins with ^ or \A)
690 if (startPos
> fActiveStart
) {
694 MatchAt(startPos
, FALSE
, fDeferredStatus
);
695 if (U_FAILURE(fDeferredStatus
)) {
703 // Match may start on any char from a pre-computed set.
704 U_ASSERT(fPattern
->fMinMatchLen
> 0);
706 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
708 c
= UTEXT_NEXT32(fInputText
);
709 pos
= UTEXT_GETNATIVEINDEX(fInputText
);
710 // c will be -1 (U_SENTINEL) at end of text, in which case we
711 // skip this next block (so we don't have a negative array index)
712 // and handle end of text in the following block.
// Code points below 256 are tested against fInitialChars8, all others
// against fInitialChars.
713 if (c
>= 0 && ((c
<256 && fPattern
->fInitialChars8
->contains(c
)) ||
714 (c
>=256 && fPattern
->fInitialChars
->contains(c
)))) {
715 MatchAt(startPos
, FALSE
, fDeferredStatus
);
716 if (U_FAILURE(fDeferredStatus
)) {
// Restore the iterator to just past the tested character; MatchAt() may have
// moved it.
722 UTEXT_SETNATIVEINDEX(fInputText
, pos
);
724 if (startPos
>= testStartLimit
) {
730 if (REGEXFINDPROGRESS_INTERRUPT(startPos
, fDeferredStatus
))
739 // Match starts on exactly one char.
740 U_ASSERT(fPattern
->fMinMatchLen
> 0);
741 UChar32 theChar
= fPattern
->fInitialChar
;
743 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
745 c
= UTEXT_NEXT32(fInputText
);
746 pos
= UTEXT_GETNATIVEINDEX(fInputText
);
748 MatchAt(startPos
, FALSE
, fDeferredStatus
);
749 if (U_FAILURE(fDeferredStatus
)) {
755 UTEXT_SETNATIVEINDEX(fInputText
, pos
);
757 if (startPos
>= testStartLimit
) {
763 if (REGEXFINDPROGRESS_INTERRUPT(startPos
, fDeferredStatus
))
// Line-start section: a match is tried at fAnchorStart and then after each
// line terminator. Which characters count as terminators depends on the
// UREGEX_UNIX_LINES flag tested below.
772 if (startPos
== fAnchorStart
) {
773 MatchAt(startPos
, FALSE
, fDeferredStatus
);
774 if (U_FAILURE(fDeferredStatus
)) {
780 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
781 c
= UTEXT_NEXT32(fInputText
);
782 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
// Otherwise load the character just before startPos into c, leaving the
// iterator at startPos.
784 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
785 c
= UTEXT_PREVIOUS32(fInputText
);
786 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
789 if (fPattern
->fFlags
& UREGEX_UNIX_LINES
) {
792 MatchAt(startPos
, FALSE
, fDeferredStatus
);
793 if (U_FAILURE(fDeferredStatus
)) {
799 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
801 if (startPos
>= testStartLimit
) {
806 c
= UTEXT_NEXT32(fInputText
);
807 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
808 // Note that it's perfectly OK for a pattern to have a zero-length
809 // match at the end of a string, so we must make sure that the loop
810 // runs with startPos == testStartLimit the last time through.
811 if (REGEXFINDPROGRESS_INTERRUPT(startPos
, fDeferredStatus
))
// Non-UNIX_LINES terminators: 0x0A..0x0D, 0x85, 0x2028 and 0x2029.
816 if (((c
& 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
817 ((c
<=0x0d && c
>=0x0a) || c
==0x85 ||c
==0x2028 || c
==0x2029 )) {
// A CR immediately followed by LF is one terminator: step over the LF too.
818 if (c
== 0x0d && startPos
< fActiveLimit
&& UTEXT_CURRENT32(fInputText
) == 0x0a) {
819 (void)UTEXT_NEXT32(fInputText
);
820 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
822 MatchAt(startPos
, FALSE
, fDeferredStatus
);
823 if (U_FAILURE(fDeferredStatus
)) {
829 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
831 if (startPos
>= testStartLimit
) {
836 c
= UTEXT_NEXT32(fInputText
);
837 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
838 // Note that it's perfectly OK for a pattern to have a zero-length
839 // match at the end of a string, so we must make sure that the loop
840 // runs with startPos == testStartLimit the last time through.
841 if (REGEXFINDPROGRESS_INTERRUPT(startPos
, fDeferredStatus
))
// find(start, status): resets the matcher, then positions the next search at
// native index `start` by storing it in fMatchEnd.
//   start   native index at which to begin; must lie within
//           [fActiveStart, fActiveLimit] after the reset
//   status  in/out error code: fDeferredStatus when that holds a failure,
//           U_INDEX_OUTOFBOUNDS_ERROR when `start` is out of range
// NOTE(review): the final call that performs the search is not visible here.
857 UBool
RegexMatcher::find(int64_t start
, UErrorCode
&status
) {
858 if (U_FAILURE(status
)) {
861 if (U_FAILURE(fDeferredStatus
)) {
862 status
= fDeferredStatus
;
865 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
866 // This will reset the region to be the full input length.
// NOTE(review): the condition guarding this first out-of-bounds assignment is
// not visible here.
868 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
872 int64_t nativeStart
= start
;
873 if (nativeStart
< fActiveStart
|| nativeStart
> fActiveLimit
) {
874 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
// find() begins at fMatchEnd, so storing the start index here sets where the
// next search begins.
877 fMatchEnd
= nativeStart
;
882 //--------------------------------------------------------------------------------
884 // findUsingChunk() -- like find(), but with the advance knowledge that the
885 // entire string is available in the UText's chunk buffer.
887 //--------------------------------------------------------------------------------
// findUsingChunk(): same search strategy as find(), specialized for input
// that sits entirely in the UText chunk buffer. Positions are int32_t indexes
// into inputBuf (fInputText->chunkContents), advanced with the U16_FWD_1 /
// U16_NEXT macros instead of the UText iterator, and match attempts go
// through MatchChunkAt().
// NOTE(review): the `case` labels, loop heads and return statements of the
// switch are not visible here.
888 UBool
RegexMatcher::findUsingChunk() {
889 // Start at the position of the last match end. (Will be zero if the
890 // matcher has been reset.)
893 int32_t startPos
= (int32_t)fMatchEnd
;
895 startPos
= (int32_t)fActiveStart
;
898 const UChar
*inputBuf
= fInputText
->chunkContents
;
901 // Save the position of any previous successful match.
902 fLastMatchEnd
= fMatchEnd
;
904 if (fMatchStart
== fMatchEnd
) {
905 // Previous match had zero length. Move start position up one position
906 // to avoid sending find() into a loop on zero-length matches.
907 if (startPos
>= fActiveLimit
) {
912 U16_FWD_1(inputBuf
, startPos
, fInputLength
);
915 if (fLastMatchEnd
>= 0) {
916 // A previous find() failed to match. Don't try again.
917 // (without this test, a pattern with a zero-length match
918 // could match again at the end of an input string.)
925 // Compute the position in the input string beyond which a match can not begin, because
926 // the minimum length match would extend past the end of the input.
927 // Note: some patterns that cannot match anything will have fMinMatchLen==Max Int.
928 // Be aware of possible overflows if making changes here.
929 int32_t testLen
= (int32_t)(fActiveLimit
- fPattern
->fMinMatchLen
);
930 if (startPos
> testLen
) {
937 U_ASSERT(startPos
>= 0);
939 switch (fPattern
->fStartType
) {
941 // No optimization was found.
942 // Try a match at each input position.
944 MatchChunkAt(startPos
, FALSE
, fDeferredStatus
);
945 if (U_FAILURE(fDeferredStatus
)) {
951 if (startPos
>= testLen
) {
955 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
956 // Note that it's perfectly OK for a pattern to have a zero-length
957 // match at the end of a string, so we must make sure that the loop
958 // runs with startPos == testLen the last time through.
959 if (REGEXFINDPROGRESS_INTERRUPT(startPos
, fDeferredStatus
))
965 // Matches are only possible at the start of the input string
966 // (pattern begins with ^ or \A)
967 if (startPos
> fActiveStart
) {
971 MatchChunkAt(startPos
, FALSE
, fDeferredStatus
);
972 if (U_FAILURE(fDeferredStatus
)) {
980 // Match may start on any char from a pre-computed set.
981 U_ASSERT(fPattern
->fMinMatchLen
> 0);
// `pos` keeps the index of the character being tested; U16_NEXT advances
// startPos past it, so the match attempt uses `pos`.
983 int32_t pos
= startPos
;
984 U16_NEXT(inputBuf
, startPos
, fActiveLimit
, c
); // like c = inputBuf[startPos++];
985 if ((c
<256 && fPattern
->fInitialChars8
->contains(c
)) ||
986 (c
>=256 && fPattern
->fInitialChars
->contains(c
))) {
987 MatchChunkAt(pos
, FALSE
, fDeferredStatus
);
988 if (U_FAILURE(fDeferredStatus
)) {
995 if (pos
>= testLen
) {
1000 if (REGEXFINDPROGRESS_INTERRUPT(startPos
, fDeferredStatus
))
1009 // Match starts on exactly one char.
1010 U_ASSERT(fPattern
->fMinMatchLen
> 0);
1011 UChar32 theChar
= fPattern
->fInitialChar
;
1013 int32_t pos
= startPos
;
1014 U16_NEXT(inputBuf
, startPos
, fActiveLimit
, c
); // like c = inputBuf[startPos++];
1016 MatchChunkAt(pos
, FALSE
, fDeferredStatus
);
1017 if (U_FAILURE(fDeferredStatus
)) {
1024 if (pos
>= testLen
) {
1029 if (REGEXFINDPROGRESS_INTERRUPT(startPos
, fDeferredStatus
))
// Line-start section: a match is tried at fAnchorStart and then after each
// line terminator; `c` is read from inputBuf[startPos-1].
1038 if (startPos
== fAnchorStart
) {
1039 MatchChunkAt(startPos
, FALSE
, fDeferredStatus
);
1040 if (U_FAILURE(fDeferredStatus
)) {
1046 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
1049 if (fPattern
->fFlags
& UREGEX_UNIX_LINES
) {
1051 c
= inputBuf
[startPos
-1];
1053 MatchChunkAt(startPos
, FALSE
, fDeferredStatus
);
1054 if (U_FAILURE(fDeferredStatus
)) {
1061 if (startPos
>= testLen
) {
1066 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
1067 // Note that it's perfectly OK for a pattern to have a zero-length
1068 // match at the end of a string, so we must make sure that the loop
1069 // runs with startPos == testLen the last time through.
1070 if (REGEXFINDPROGRESS_INTERRUPT(startPos
, fDeferredStatus
))
1075 c
= inputBuf
[startPos
-1];
// Non-UNIX_LINES terminators: 0x0A..0x0D, 0x85, 0x2028 and 0x2029.
1076 if (((c
& 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
1077 ((c
<=0x0d && c
>=0x0a) || c
==0x85 ||c
==0x2028 || c
==0x2029 )) {
// CR followed by LF is detected here so the pair can be handled as one
// terminator.
1078 if (c
== 0x0d && startPos
< fActiveLimit
&& inputBuf
[startPos
] == 0x0a) {
1081 MatchChunkAt(startPos
, FALSE
, fDeferredStatus
);
1082 if (U_FAILURE(fDeferredStatus
)) {
1089 if (startPos
>= testLen
) {
1094 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
1095 // Note that it's perfectly OK for a pattern to have a zero-length
1096 // match at the end of a string, so we must make sure that the loop
1097 // runs with startPos == testLen the last time through.
1098 if (REGEXFINDPROGRESS_INTERRUPT(startPos
, fDeferredStatus
))
1114 //--------------------------------------------------------------------------------
1118 //--------------------------------------------------------------------------------
// group(status): text of the whole match, returned as a UnicodeString.
// Forwards to group(groupNum, status) with group 0.
1119 UnicodeString
RegexMatcher::group(UErrorCode
&status
) const {
1120 return group(0, status
);
1123 // Return immutable shallow clone
// group(dest, group_len, status): shallow-clone form for the whole match.
// Forwards to group(groupNum, dest, group_len, status) with group 0.
1124 UText
*RegexMatcher::group(UText
*dest
, int64_t &group_len
, UErrorCode
&status
) const {
1125 return group(0, dest
, group_len
, status
);
1128 // Return immutable shallow clone
// group(groupNum, dest, group_len, status): returns a shallow, read-only clone
// of the input UText (utext_clone with deep=FALSE, readOnly=TRUE) whose native
// index is set to the start `s` of the requested group.
//   groupNum   capture group number, 0..fPattern->fGroupMap->size()
//   dest       optional UText to reuse for the clone
//   group_len  output reference (the assignment to it is not visible here)
//   status     in/out error code: fDeferredStatus, U_REGEX_INVALID_STATE when
//              fMatch is FALSE, U_INDEX_OUTOFBOUNDS_ERROR for a bad groupNum
1129 UText
*RegexMatcher::group(int32_t groupNum
, UText
*dest
, int64_t &group_len
, UErrorCode
&status
) const {
1131 UBool bailOut
= FALSE
;
1132 if (U_FAILURE(status
)) {
1135 if (U_FAILURE(fDeferredStatus
)) {
1136 status
= fDeferredStatus
;
1139 if (fMatch
== FALSE
) {
1140 status
= U_REGEX_INVALID_STATE
;
1143 if (groupNum
< 0 || groupNum
> fPattern
->fGroupMap
->size()) {
1144 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
// Error exit: return the caller's dest unchanged, or a new empty UText when
// none was supplied.
1149 return (dest
) ? dest
: utext_openUChars(NULL
, NULL
, 0, &status
);
1153 if (groupNum
== 0) {
// Groups 1..n: fGroupMap is indexed by groupNum-1; start and end are adjacent
// slots in the frame's fExtra array.
1157 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(groupNum
-1);
1158 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
1159 U_ASSERT(groupOffset
>= 0);
1160 s
= fFrame
->fExtra
[groupOffset
];
1161 e
= fFrame
->fExtra
[groupOffset
+1];
1165 // A capture group wasn't part of the match
1166 return utext_clone(dest
, fInputText
, FALSE
, TRUE
, &status
);
1171 dest
= utext_clone(dest
, fInputText
, FALSE
, TRUE
, &status
);
1173 UTEXT_SETNATIVEINDEX(dest
, s
);
// group(groupNum, status): text of a capture group as a UnicodeString.
// Wraps a local `result` string in a writable UText and fills it through
// group(groupNum, dest, status).
//   groupNum  capture group number
//   status    in/out error code; tested on entry
1177 UnicodeString
RegexMatcher::group(int32_t groupNum
, UErrorCode
&status
) const {
1178 UnicodeString result
;
1179 if (U_FAILURE(status
)) {
1182 UText resultText
= UTEXT_INITIALIZER
;
1183 utext_openUnicodeString(&resultText
, &result
, &status
);
// The UText writes through to `result`; the wrapper is closed afterwards.
1184 group(groupNum
, &resultText
, status
);
1185 utext_close(&resultText
);
1190 // Return deep (mutable) clone
1191 // Technology Preview (as an API), but note that the UnicodeString API is implemented
1192 // using this function.
// group(groupNum, dest, status): produces a deep (mutable) copy of a capture
// group's text.
// Two output modes are visible below: utext_replace() over the whole range
// 0..utext_nativeLength(dest), replacing dest's contents, or a fresh UText
// from utext_openUChars() / a deep utext_clone().
// NOTE(review): the `if (dest)` tests choosing between the modes are not
// visible here - confirm.
//   groupNum  capture group number, 0..fPattern->fGroupMap->size()
//   dest      destination UText, or presumably NULL to request a new one
//   status    in/out error code: fDeferredStatus, U_REGEX_INVALID_STATE when
//             fMatch is FALSE, U_INDEX_OUTOFBOUNDS_ERROR for a bad groupNum
1193 UText
*RegexMatcher::group(int32_t groupNum
, UText
*dest
, UErrorCode
&status
) const {
1194 UBool bailOut
= FALSE
;
1195 if (U_FAILURE(status
)) {
1198 if (U_FAILURE(fDeferredStatus
)) {
1199 status
= fDeferredStatus
;
1203 if (fMatch
== FALSE
) {
1204 status
= U_REGEX_INVALID_STATE
;
1207 if (groupNum
< 0 || groupNum
> fPattern
->fGroupMap
->size()) {
1208 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
// Error exit: empty the destination, or return a new empty UText.
1214 utext_replace(dest
, 0, utext_nativeLength(dest
), NULL
, 0, &status
);
1217 return utext_openUChars(NULL
, NULL
, 0, &status
);
1222 if (groupNum
== 0) {
// Groups 1..n: fGroupMap is indexed by groupNum-1; start and end are adjacent
// slots in the frame's fExtra array.
1226 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(groupNum
-1);
1227 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
1228 U_ASSERT(groupOffset
>= 0);
1229 s
= fFrame
->fExtra
[groupOffset
];
1230 e
= fFrame
->fExtra
[groupOffset
+1];
1234 // A capture group wasn't part of the match
1236 utext_replace(dest
, 0, utext_nativeLength(dest
), NULL
, 0, &status
);
1239 return utext_openUChars(NULL
, NULL
, 0, &status
);
// Fast path: the whole input is in the chunk buffer, so the group text can be
// read directly from chunkContents + s.
1244 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1245 U_ASSERT(e
<= fInputLength
);
1247 utext_replace(dest
, 0, utext_nativeLength(dest
), fInputText
->chunkContents
+s
, (int32_t)(e
-s
), &status
);
// Otherwise wrap the chunk span in a temporary UText and deep-clone it
// (deep=TRUE, readOnly=FALSE).
1249 UText groupText
= UTEXT_INITIALIZER
;
1250 utext_openUChars(&groupText
, fInputText
->chunkContents
+s
, e
-s
, &status
);
1251 dest
= utext_clone(NULL
, &groupText
, TRUE
, FALSE
, &status
);
1252 utext_close(&groupText
);
// Slow path: compute the UTF-16 length (directly for a UTF-16 UText, else via
// a zero-capacity utext_extract() with a throwaway status), then extract into
// a temporary buffer.
1256 if (UTEXT_USES_U16(fInputText
)) {
1257 len16
= (int32_t)(e
-s
);
1259 UErrorCode lengthStatus
= U_ZERO_ERROR
;
1260 len16
= utext_extract(fInputText
, s
, e
, NULL
, 0, &lengthStatus
);
1262 UChar
*groupChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(len16
+1));
1263 if (groupChars
== NULL
) {
1264 status
= U_MEMORY_ALLOCATION_ERROR
;
1267 utext_extract(fInputText
, s
, e
, groupChars
, len16
+1, &status
);
1270 utext_replace(dest
, 0, utext_nativeLength(dest
), groupChars
, len16
, &status
);
1272 UText groupText
= UTEXT_INITIALIZER
;
1273 utext_openUChars(&groupText
, groupChars
, len16
, &status
);
1274 dest
= utext_clone(NULL
, &groupText
, TRUE
, FALSE
, &status
);
1275 utext_close(&groupText
);
// The temporary buffer is freed only after dest has been filled from it.
1278 uprv_free(groupChars
);
1283 //--------------------------------------------------------------------------------
1285 // appendGroup() -- currently internal only, appends a group to a UText rather
1286 // than replacing its contents
1288 //--------------------------------------------------------------------------------
// appendGroup(groupNum, dest, status): appends the text of a capture group at
// the end of `dest` (position destLen = utext_nativeLength(dest)) instead of
// replacing dest's contents.
//   groupNum  capture group number, 0..fPattern->fGroupMap->size()
//   dest      destination UText, appended to in place
//   status    in/out error code: fDeferredStatus, U_REGEX_INVALID_STATE when
//             fMatch is FALSE, U_INDEX_OUTOFBOUNDS_ERROR for a bad groupNum
// Returns an int64_t. The early exits return the result of a zero-length
// utext_replace(); the normal path stores utext_replace()'s result in deltaLen.
// appendReplacement() adds this return value to its running destLen.
1290 int64_t RegexMatcher::appendGroup(int32_t groupNum
, UText
*dest
, UErrorCode
&status
) const {
1291 if (U_FAILURE(status
)) {
1294 if (U_FAILURE(fDeferredStatus
)) {
1295 status
= fDeferredStatus
;
1298 int64_t destLen
= utext_nativeLength(dest
);
1300 if (fMatch
== FALSE
) {
1301 status
= U_REGEX_INVALID_STATE
;
1302 return utext_replace(dest
, destLen
, destLen
, NULL
, 0, &status
);
1304 if (groupNum
< 0 || groupNum
> fPattern
->fGroupMap
->size()) {
1305 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1306 return utext_replace(dest
, destLen
, destLen
, NULL
, 0, &status
);
1310 if (groupNum
== 0) {
// Groups 1..n: fGroupMap is indexed by groupNum-1; start and end are adjacent
// slots in the frame's fExtra array.
1314 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(groupNum
-1);
1315 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
1316 U_ASSERT(groupOffset
>= 0);
1317 s
= fFrame
->fExtra
[groupOffset
];
1318 e
= fFrame
->fExtra
[groupOffset
+1];
1322 // A capture group wasn't part of the match
1323 return utext_replace(dest
, destLen
, destLen
, NULL
, 0, &status
);
// Fast path: append directly from the chunk buffer.
1328 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1329 U_ASSERT(e
<= fInputLength
);
1330 deltaLen
= utext_replace(dest
, destLen
, destLen
, fInputText
->chunkContents
+s
, (int32_t)(e
-s
), &status
);
// Slow path: compute the UTF-16 length, extract into a temporary buffer,
// append from it, and free the buffer.
1333 if (UTEXT_USES_U16(fInputText
)) {
1334 len16
= (int32_t)(e
-s
);
1336 UErrorCode lengthStatus
= U_ZERO_ERROR
;
1337 len16
= utext_extract(fInputText
, s
, e
, NULL
, 0, &lengthStatus
);
1339 UChar
*groupChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(len16
+1));
1340 if (groupChars
== NULL
) {
1341 status
= U_MEMORY_ALLOCATION_ERROR
;
1344 utext_extract(fInputText
, s
, e
, groupChars
, len16
+1, &status
);
1346 deltaLen
= utext_replace(dest
, destLen
, destLen
, groupChars
, len16
, &status
);
1347 uprv_free(groupChars
);
1354 //--------------------------------------------------------------------------------
1358 //--------------------------------------------------------------------------------
1359 int32_t RegexMatcher::groupCount() const {
1360 return fPattern
->fGroupMap
->size();
1365 //--------------------------------------------------------------------------------
1367 // hasAnchoringBounds()
1369 //--------------------------------------------------------------------------------
1370 UBool
RegexMatcher::hasAnchoringBounds() const {
1371 return fAnchoringBounds
;
1375 //--------------------------------------------------------------------------------
1377 // hasTransparentBounds()
1379 //--------------------------------------------------------------------------------
1380 UBool
RegexMatcher::hasTransparentBounds() const {
1381 return fTransparentBounds
;
1386 //--------------------------------------------------------------------------------
1390 //--------------------------------------------------------------------------------
1391 UBool
RegexMatcher::hitEnd() const {
1396 //--------------------------------------------------------------------------------
1400 //--------------------------------------------------------------------------------
1401 const UnicodeString
&RegexMatcher::input() const {
1403 UErrorCode status
= U_ZERO_ERROR
;
1405 if (UTEXT_USES_U16(fInputText
)) {
1406 len16
= (int32_t)fInputLength
;
1408 len16
= utext_extract(fInputText
, 0, fInputLength
, NULL
, 0, &status
);
1409 status
= U_ZERO_ERROR
; // overflow, length status
1411 UnicodeString
*result
= new UnicodeString(len16
, 0, 0);
1413 UChar
*inputChars
= result
->getBuffer(len16
);
1414 utext_extract(fInputText
, 0, fInputLength
, inputChars
, len16
, &status
); // unterminated warning
1415 result
->releaseBuffer(len16
);
1417 (*(const UnicodeString
**)&fInput
) = result
; // pointer assignment, rather than operator=
1423 //--------------------------------------------------------------------------------
1427 //--------------------------------------------------------------------------------
1428 UText
*RegexMatcher::inputText() const {
1433 //--------------------------------------------------------------------------------
1435 // getInput() -- like inputText(), but makes a clone or copies into another UText
1437 //--------------------------------------------------------------------------------
1438 UText
*RegexMatcher::getInput (UText
*dest
, UErrorCode
&status
) const {
1439 UBool bailOut
= FALSE
;
1440 if (U_FAILURE(status
)) {
1443 if (U_FAILURE(fDeferredStatus
)) {
1444 status
= fDeferredStatus
;
1450 utext_replace(dest
, 0, utext_nativeLength(dest
), NULL
, 0, &status
);
1453 return utext_clone(NULL
, fInputText
, FALSE
, TRUE
, &status
);
1458 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1459 utext_replace(dest
, 0, utext_nativeLength(dest
), fInputText
->chunkContents
, (int32_t)fInputLength
, &status
);
1462 if (UTEXT_USES_U16(fInputText
)) {
1463 input16Len
= (int32_t)fInputLength
;
1465 UErrorCode lengthStatus
= U_ZERO_ERROR
;
1466 input16Len
= utext_extract(fInputText
, 0, fInputLength
, NULL
, 0, &lengthStatus
); // buffer overflow error
1468 UChar
*inputChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(input16Len
));
1469 if (inputChars
== NULL
) {
1473 status
= U_ZERO_ERROR
;
1474 utext_extract(fInputText
, 0, fInputLength
, inputChars
, input16Len
, &status
); // not terminated warning
1475 status
= U_ZERO_ERROR
;
1476 utext_replace(dest
, 0, utext_nativeLength(dest
), inputChars
, input16Len
, &status
);
1478 uprv_free(inputChars
);
1482 return utext_clone(NULL
, fInputText
, FALSE
, TRUE
, &status
);
1487 static UBool
compat_SyncMutableUTextContents(UText
*ut
);
1488 static UBool
compat_SyncMutableUTextContents(UText
*ut
) {
1489 UBool retVal
= FALSE
;
1491 // In the following test, we're really only interested in whether the UText should switch
1492 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
1493 // will still point to the correct data.
1494 if (utext_nativeLength(ut
) != ut
->nativeIndexingLimit
) {
1495 UnicodeString
*us
=(UnicodeString
*)ut
->context
;
1497 // Update to the latest length.
1498 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1499 int32_t newLength
= us
->length();
1501 // Update the chunk description.
1502 // The buffer may have switched between stack- and heap-based.
1503 ut
->chunkContents
= us
->getBuffer();
1504 ut
->chunkLength
= newLength
;
1505 ut
->chunkNativeLimit
= newLength
;
1506 ut
->nativeIndexingLimit
= newLength
;
1513 //--------------------------------------------------------------------------------
1517 //--------------------------------------------------------------------------------
1518 UBool
RegexMatcher::lookingAt(UErrorCode
&status
) {
1519 if (U_FAILURE(status
)) {
1522 if (U_FAILURE(fDeferredStatus
)) {
1523 status
= fDeferredStatus
;
1527 if (fInputUniStrMaybeMutable
) {
1528 if (compat_SyncMutableUTextContents(fInputText
)) {
1529 fInputLength
= utext_nativeLength(fInputText
);
1534 resetPreserveRegion();
1536 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1537 MatchChunkAt((int32_t)fActiveStart
, FALSE
, status
);
1539 MatchAt(fActiveStart
, FALSE
, status
);
1545 UBool
RegexMatcher::lookingAt(int64_t start
, UErrorCode
&status
) {
1546 if (U_FAILURE(status
)) {
1549 if (U_FAILURE(fDeferredStatus
)) {
1550 status
= fDeferredStatus
;
1556 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1560 if (fInputUniStrMaybeMutable
) {
1561 if (compat_SyncMutableUTextContents(fInputText
)) {
1562 fInputLength
= utext_nativeLength(fInputText
);
1567 int64_t nativeStart
;
1568 nativeStart
= start
;
1569 if (nativeStart
< fActiveStart
|| nativeStart
> fActiveLimit
) {
1570 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1574 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1575 MatchChunkAt((int32_t)nativeStart
, FALSE
, status
);
1577 MatchAt(nativeStart
, FALSE
, status
);
1584 //--------------------------------------------------------------------------------
1588 //--------------------------------------------------------------------------------
1589 UBool
RegexMatcher::matches(UErrorCode
&status
) {
1590 if (U_FAILURE(status
)) {
1593 if (U_FAILURE(fDeferredStatus
)) {
1594 status
= fDeferredStatus
;
1598 if (fInputUniStrMaybeMutable
) {
1599 if (compat_SyncMutableUTextContents(fInputText
)) {
1600 fInputLength
= utext_nativeLength(fInputText
);
1605 resetPreserveRegion();
1608 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1609 MatchChunkAt((int32_t)fActiveStart
, TRUE
, status
);
1611 MatchAt(fActiveStart
, TRUE
, status
);
1617 UBool
RegexMatcher::matches(int64_t start
, UErrorCode
&status
) {
1618 if (U_FAILURE(status
)) {
1621 if (U_FAILURE(fDeferredStatus
)) {
1622 status
= fDeferredStatus
;
1628 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1632 if (fInputUniStrMaybeMutable
) {
1633 if (compat_SyncMutableUTextContents(fInputText
)) {
1634 fInputLength
= utext_nativeLength(fInputText
);
1639 int64_t nativeStart
;
1640 nativeStart
= start
;
1641 if (nativeStart
< fActiveStart
|| nativeStart
> fActiveLimit
) {
1642 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1646 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1647 MatchChunkAt((int32_t)nativeStart
, TRUE
, status
);
1649 MatchAt(nativeStart
, TRUE
, status
);
1656 //--------------------------------------------------------------------------------
1660 //--------------------------------------------------------------------------------
1661 const RegexPattern
&RegexMatcher::pattern() const {
1667 //--------------------------------------------------------------------------------
1671 //--------------------------------------------------------------------------------
1672 RegexMatcher
&RegexMatcher::region(int64_t regionStart
, int64_t regionLimit
, int64_t startIndex
, UErrorCode
&status
) {
1673 if (U_FAILURE(status
)) {
1677 if (regionStart
>regionLimit
|| regionStart
<0 || regionLimit
<0) {
1678 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1681 int64_t nativeStart
= regionStart
;
1682 int64_t nativeLimit
= regionLimit
;
1683 if (nativeStart
> fInputLength
|| nativeLimit
> fInputLength
) {
1684 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1687 if (startIndex
== -1)
1690 resetPreserveRegion();
1692 fRegionStart
= nativeStart
;
1693 fRegionLimit
= nativeLimit
;
1694 fActiveStart
= nativeStart
;
1695 fActiveLimit
= nativeLimit
;
1697 if (startIndex
!= -1) {
1698 if (startIndex
< fActiveStart
|| startIndex
> fActiveLimit
) {
1699 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1701 fMatchEnd
= startIndex
;
1704 if (!fTransparentBounds
) {
1705 fLookStart
= nativeStart
;
1706 fLookLimit
= nativeLimit
;
1708 if (fAnchoringBounds
) {
1709 fAnchorStart
= nativeStart
;
1710 fAnchorLimit
= nativeLimit
;
1715 RegexMatcher
&RegexMatcher::region(int64_t start
, int64_t limit
, UErrorCode
&status
) {
1716 return region(start
, limit
, -1, status
);
1719 //--------------------------------------------------------------------------------
1723 //--------------------------------------------------------------------------------
1724 int32_t RegexMatcher::regionEnd() const {
1725 return (int32_t)fRegionLimit
;
1728 int64_t RegexMatcher::regionEnd64() const {
1729 return fRegionLimit
;
1732 //--------------------------------------------------------------------------------
1736 //--------------------------------------------------------------------------------
1737 int32_t RegexMatcher::regionStart() const {
1738 return (int32_t)fRegionStart
;
1741 int64_t RegexMatcher::regionStart64() const {
1742 return fRegionStart
;
1746 //--------------------------------------------------------------------------------
1750 //--------------------------------------------------------------------------------
1751 UnicodeString
RegexMatcher::replaceAll(const UnicodeString
&replacement
, UErrorCode
&status
) {
1752 UText replacementText
= UTEXT_INITIALIZER
;
1753 UText resultText
= UTEXT_INITIALIZER
;
1754 UnicodeString resultString
;
1755 if (U_FAILURE(status
)) {
1756 return resultString
;
1759 utext_openConstUnicodeString(&replacementText
, &replacement
, &status
);
1760 utext_openUnicodeString(&resultText
, &resultString
, &status
);
1762 replaceAll(&replacementText
, &resultText
, status
);
1764 utext_close(&resultText
);
1765 utext_close(&replacementText
);
1767 return resultString
;
1772 // replaceAll, UText mode
1774 UText
*RegexMatcher::replaceAll(UText
*replacement
, UText
*dest
, UErrorCode
&status
) {
1775 if (U_FAILURE(status
)) {
1778 if (U_FAILURE(fDeferredStatus
)) {
1779 status
= fDeferredStatus
;
1784 UnicodeString emptyString
;
1785 UText empty
= UTEXT_INITIALIZER
;
1787 utext_openUnicodeString(&empty
, &emptyString
, &status
);
1788 dest
= utext_clone(NULL
, &empty
, TRUE
, FALSE
, &status
);
1789 utext_close(&empty
);
1792 if (U_SUCCESS(status
)) {
1795 appendReplacement(dest
, replacement
, status
);
1796 if (U_FAILURE(status
)) {
1800 appendTail(dest
, status
);
1807 //--------------------------------------------------------------------------------
1811 //--------------------------------------------------------------------------------
1812 UnicodeString
RegexMatcher::replaceFirst(const UnicodeString
&replacement
, UErrorCode
&status
) {
1813 UText replacementText
= UTEXT_INITIALIZER
;
1814 UText resultText
= UTEXT_INITIALIZER
;
1815 UnicodeString resultString
;
1817 utext_openConstUnicodeString(&replacementText
, &replacement
, &status
);
1818 utext_openUnicodeString(&resultText
, &resultString
, &status
);
1820 replaceFirst(&replacementText
, &resultText
, status
);
1822 utext_close(&resultText
);
1823 utext_close(&replacementText
);
1825 return resultString
;
1829 // replaceFirst, UText mode
1831 UText
*RegexMatcher::replaceFirst(UText
*replacement
, UText
*dest
, UErrorCode
&status
) {
1832 if (U_FAILURE(status
)) {
1835 if (U_FAILURE(fDeferredStatus
)) {
1836 status
= fDeferredStatus
;
1842 return getInput(dest
, status
);
1846 UnicodeString emptyString
;
1847 UText empty
= UTEXT_INITIALIZER
;
1849 utext_openUnicodeString(&empty
, &emptyString
, &status
);
1850 dest
= utext_clone(NULL
, &empty
, TRUE
, FALSE
, &status
);
1851 utext_close(&empty
);
1854 appendReplacement(dest
, replacement
, status
);
1855 appendTail(dest
, status
);
1861 //--------------------------------------------------------------------------------
1865 //--------------------------------------------------------------------------------
1866 UBool
RegexMatcher::requireEnd() const {
1871 //--------------------------------------------------------------------------------
1875 //--------------------------------------------------------------------------------
1876 RegexMatcher
&RegexMatcher::reset() {
1878 fRegionLimit
= fInputLength
;
1880 fActiveLimit
= fInputLength
;
1882 fAnchorLimit
= fInputLength
;
1884 fLookLimit
= fInputLength
;
1885 resetPreserveRegion();
1891 void RegexMatcher::resetPreserveRegion() {
1895 fAppendPosition
= 0;
1898 fRequireEnd
= FALSE
;
1900 fTickCounter
= TIMER_INITIAL_VALUE
;
1901 //resetStack(); // more expensive than it looks...
1905 RegexMatcher
&RegexMatcher::reset(const UnicodeString
&input
) {
1906 fInputText
= utext_openConstUnicodeString(fInputText
, &input
, &fDeferredStatus
);
1907 if (fPattern
->fNeedsAltInput
) {
1908 fAltInputText
= utext_clone(fAltInputText
, fInputText
, FALSE
, TRUE
, &fDeferredStatus
);
1910 fInputLength
= utext_nativeLength(fInputText
);
1916 // Do the following for any UnicodeString.
1917 // This is for compatibility for those clients who modify the input string "live" during regex operations.
1918 fInputUniStrMaybeMutable
= TRUE
;
1920 if (fWordBreakItr
!= NULL
) {
1921 #if UCONFIG_NO_BREAK_ITERATION==0
1922 UErrorCode status
= U_ZERO_ERROR
;
1923 fWordBreakItr
->setText(fInputText
, status
);
1930 RegexMatcher
&RegexMatcher::reset(UText
*input
) {
1931 if (fInputText
!= input
) {
1932 fInputText
= utext_clone(fInputText
, input
, FALSE
, TRUE
, &fDeferredStatus
);
1933 if (fPattern
->fNeedsAltInput
) fAltInputText
= utext_clone(fAltInputText
, fInputText
, FALSE
, TRUE
, &fDeferredStatus
);
1934 fInputLength
= utext_nativeLength(fInputText
);
1939 if (fWordBreakItr
!= NULL
) {
1940 #if UCONFIG_NO_BREAK_ITERATION==0
1941 UErrorCode status
= U_ZERO_ERROR
;
1942 fWordBreakItr
->setText(input
, status
);
1947 fInputUniStrMaybeMutable
= FALSE
;
1952 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1953 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1957 RegexMatcher
&RegexMatcher::reset(int64_t position
, UErrorCode
&status
) {
1958 if (U_FAILURE(status
)) {
1961 reset(); // Reset also resets the region to be the entire string.
1963 if (position
< 0 || position
> fActiveLimit
) {
1964 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1967 fMatchEnd
= position
;
1972 //--------------------------------------------------------------------------------
1976 //--------------------------------------------------------------------------------
1977 RegexMatcher
&RegexMatcher::refreshInputText(UText
*input
, UErrorCode
&status
) {
1978 if (U_FAILURE(status
)) {
1981 if (input
== NULL
) {
1982 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1985 if (utext_nativeLength(fInputText
) != utext_nativeLength(input
)) {
1986 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1989 int64_t pos
= utext_getNativeIndex(fInputText
);
1990 // Shallow read-only clone of the new UText into the existing input UText
1991 fInputText
= utext_clone(fInputText
, input
, FALSE
, TRUE
, &status
);
1992 if (U_FAILURE(status
)) {
1995 utext_setNativeIndex(fInputText
, pos
);
1997 if (fAltInputText
!= NULL
) {
1998 pos
= utext_getNativeIndex(fAltInputText
);
1999 fAltInputText
= utext_clone(fAltInputText
, input
, FALSE
, TRUE
, &status
);
2000 if (U_FAILURE(status
)) {
2003 utext_setNativeIndex(fAltInputText
, pos
);
2010 //--------------------------------------------------------------------------------
2014 //--------------------------------------------------------------------------------
2015 void RegexMatcher::setTrace(UBool state
) {
2016 fTraceDebug
= state
;
2021 //---------------------------------------------------------------------
2025 //---------------------------------------------------------------------
2026 int32_t RegexMatcher::split(const UnicodeString
&input
,
2027 UnicodeString dest
[],
2028 int32_t destCapacity
,
2031 UText inputText
= UTEXT_INITIALIZER
;
2032 utext_openConstUnicodeString(&inputText
, &input
, &status
);
2033 if (U_FAILURE(status
)) {
2037 UText
**destText
= (UText
**)uprv_malloc(sizeof(UText
*)*destCapacity
);
2038 if (destText
== NULL
) {
2039 status
= U_MEMORY_ALLOCATION_ERROR
;
2043 for (i
= 0; i
< destCapacity
; i
++) {
2044 destText
[i
] = utext_openUnicodeString(NULL
, &dest
[i
], &status
);
2047 int32_t fieldCount
= split(&inputText
, destText
, destCapacity
, status
);
2049 for (i
= 0; i
< destCapacity
; i
++) {
2050 utext_close(destText
[i
]);
2053 uprv_free(destText
);
2054 utext_close(&inputText
);
2059 // split, UText mode
2061 int32_t RegexMatcher::split(UText
*input
,
2063 int32_t destCapacity
,
2067 // Check arguements for validity
2069 if (U_FAILURE(status
)) {
2073 if (destCapacity
< 1) {
2074 status
= U_ILLEGAL_ARGUMENT_ERROR
;
2079 // Reset for the input text
2082 int64_t nextOutputStringStart
= 0;
2083 if (fActiveLimit
== 0) {
2088 // Loop through the input text, searching for the delimiter pattern
2091 int32_t numCaptureGroups
= fPattern
->fGroupMap
->size();
2093 if (i
>=destCapacity
-1) {
2094 // There is one or zero output string left.
2095 // Fill the last output string with whatever is left from the input, then exit the loop.
2096 // ( i will be == destCapacity if we filled the output array while processing
2097 // capture groups of the delimiter expression, in which case we will discard the
2098 // last capture group saved in favor of the unprocessed remainder of the
2101 if (fActiveLimit
> nextOutputStringStart
) {
2102 if (UTEXT_FULL_TEXT_IN_CHUNK(input
, fInputLength
)) {
2104 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]),
2105 input
->chunkContents
+nextOutputStringStart
,
2106 (int32_t)(fActiveLimit
-nextOutputStringStart
), &status
);
2108 UText remainingText
= UTEXT_INITIALIZER
;
2109 utext_openUChars(&remainingText
, input
->chunkContents
+nextOutputStringStart
,
2110 fActiveLimit
-nextOutputStringStart
, &status
);
2111 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2112 utext_close(&remainingText
);
2115 UErrorCode lengthStatus
= U_ZERO_ERROR
;
2116 int32_t remaining16Length
=
2117 utext_extract(input
, nextOutputStringStart
, fActiveLimit
, NULL
, 0, &lengthStatus
);
2118 UChar
*remainingChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(remaining16Length
+1));
2119 if (remainingChars
== NULL
) {
2120 status
= U_MEMORY_ALLOCATION_ERROR
;
2124 utext_extract(input
, nextOutputStringStart
, fActiveLimit
, remainingChars
, remaining16Length
+1, &status
);
2126 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), remainingChars
, remaining16Length
, &status
);
2128 UText remainingText
= UTEXT_INITIALIZER
;
2129 utext_openUChars(&remainingText
, remainingChars
, remaining16Length
, &status
);
2130 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2131 utext_close(&remainingText
);
2134 uprv_free(remainingChars
);
2140 // We found another delimiter. Move everything from where we started looking
2141 // up until the start of the delimiter into the next output string.
2142 if (UTEXT_FULL_TEXT_IN_CHUNK(input
, fInputLength
)) {
2144 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]),
2145 input
->chunkContents
+nextOutputStringStart
,
2146 (int32_t)(fMatchStart
-nextOutputStringStart
), &status
);
2148 UText remainingText
= UTEXT_INITIALIZER
;
2149 utext_openUChars(&remainingText
, input
->chunkContents
+nextOutputStringStart
,
2150 fMatchStart
-nextOutputStringStart
, &status
);
2151 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2152 utext_close(&remainingText
);
2155 UErrorCode lengthStatus
= U_ZERO_ERROR
;
2156 int32_t remaining16Length
= utext_extract(input
, nextOutputStringStart
, fMatchStart
, NULL
, 0, &lengthStatus
);
2157 UChar
*remainingChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(remaining16Length
+1));
2158 if (remainingChars
== NULL
) {
2159 status
= U_MEMORY_ALLOCATION_ERROR
;
2162 utext_extract(input
, nextOutputStringStart
, fMatchStart
, remainingChars
, remaining16Length
+1, &status
);
2164 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), remainingChars
, remaining16Length
, &status
);
2166 UText remainingText
= UTEXT_INITIALIZER
;
2167 utext_openUChars(&remainingText
, remainingChars
, remaining16Length
, &status
);
2168 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2169 utext_close(&remainingText
);
2172 uprv_free(remainingChars
);
2174 nextOutputStringStart
= fMatchEnd
;
2176 // If the delimiter pattern has capturing parentheses, the captured
2177 // text goes out into the next n destination strings.
2179 for (groupNum
=1; groupNum
<=numCaptureGroups
; groupNum
++) {
2180 if (i
>= destCapacity
-2) {
2181 // Never fill the last available output string with capture group text.
2182 // It will filled with the last field, the remainder of the
2183 // unsplit input text.
2187 dest
[i
] = group(groupNum
, dest
[i
], status
);
2190 if (nextOutputStringStart
== fActiveLimit
) {
2191 // The delimiter was at the end of the string. We're done, but first
2192 // we output one last empty string, for the empty field following
2193 // the delimiter at the end of input.
2194 if (i
+1 < destCapacity
) {
2196 if (dest
[i
] == NULL
) {
2197 dest
[i
] = utext_openUChars(NULL
, NULL
, 0, &status
);
2199 static UChar emptyString
[] = {(UChar
)0};
2200 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), emptyString
, 0, &status
);
2209 // We ran off the end of the input while looking for the next delimiter.
2210 // All the remaining text goes into the current output string.
2211 if (UTEXT_FULL_TEXT_IN_CHUNK(input
, fInputLength
)) {
2213 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]),
2214 input
->chunkContents
+nextOutputStringStart
,
2215 (int32_t)(fActiveLimit
-nextOutputStringStart
), &status
);
2217 UText remainingText
= UTEXT_INITIALIZER
;
2218 utext_openUChars(&remainingText
, input
->chunkContents
+nextOutputStringStart
,
2219 fActiveLimit
-nextOutputStringStart
, &status
);
2220 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2221 utext_close(&remainingText
);
2224 UErrorCode lengthStatus
= U_ZERO_ERROR
;
2225 int32_t remaining16Length
= utext_extract(input
, nextOutputStringStart
, fActiveLimit
, NULL
, 0, &lengthStatus
);
2226 UChar
*remainingChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(remaining16Length
+1));
2227 if (remainingChars
== NULL
) {
2228 status
= U_MEMORY_ALLOCATION_ERROR
;
2232 utext_extract(input
, nextOutputStringStart
, fActiveLimit
, remainingChars
, remaining16Length
+1, &status
);
2234 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), remainingChars
, remaining16Length
, &status
);
2236 UText remainingText
= UTEXT_INITIALIZER
;
2237 utext_openUChars(&remainingText
, remainingChars
, remaining16Length
, &status
);
2238 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2239 utext_close(&remainingText
);
2242 uprv_free(remainingChars
);
2246 if (U_FAILURE(status
)) {
2249 } // end of for loop
2254 //--------------------------------------------------------------------------------
2258 //--------------------------------------------------------------------------------
2259 int32_t RegexMatcher::start(UErrorCode
&status
) const {
2260 return start(0, status
);
2263 int64_t RegexMatcher::start64(UErrorCode
&status
) const {
2264 return start64(0, status
);
2267 //--------------------------------------------------------------------------------
2269 // start(int32_t group, UErrorCode &status)
2271 //--------------------------------------------------------------------------------
2273 int64_t RegexMatcher::start64(int32_t group
, UErrorCode
&status
) const {
2274 if (U_FAILURE(status
)) {
2277 if (U_FAILURE(fDeferredStatus
)) {
2278 status
= fDeferredStatus
;
2281 if (fMatch
== FALSE
) {
2282 status
= U_REGEX_INVALID_STATE
;
2285 if (group
< 0 || group
> fPattern
->fGroupMap
->size()) {
2286 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
2293 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(group
-1);
2294 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
2295 U_ASSERT(groupOffset
>= 0);
2296 s
= fFrame
->fExtra
[groupOffset
];
2303 int32_t RegexMatcher::start(int32_t group
, UErrorCode
&status
) const {
2304 return (int32_t)start64(group
, status
);
2307 //--------------------------------------------------------------------------------
2309 // useAnchoringBounds
2311 //--------------------------------------------------------------------------------
2312 RegexMatcher
&RegexMatcher::useAnchoringBounds(UBool b
) {
2313 fAnchoringBounds
= b
;
2314 fAnchorStart
= (fAnchoringBounds
? fRegionStart
: 0);
2315 fAnchorLimit
= (fAnchoringBounds
? fRegionLimit
: fInputLength
);
2320 //--------------------------------------------------------------------------------
2322 // useTransparentBounds
2324 //--------------------------------------------------------------------------------
2325 RegexMatcher
&RegexMatcher::useTransparentBounds(UBool b
) {
2326 fTransparentBounds
= b
;
2327 fLookStart
= (fTransparentBounds
? 0 : fRegionStart
);
2328 fLookLimit
= (fTransparentBounds
? fInputLength
: fRegionLimit
);
2332 //--------------------------------------------------------------------------------
2336 //--------------------------------------------------------------------------------
2337 void RegexMatcher::setTimeLimit(int32_t limit
, UErrorCode
&status
) {
2338 if (U_FAILURE(status
)) {
2341 if (U_FAILURE(fDeferredStatus
)) {
2342 status
= fDeferredStatus
;
2346 status
= U_ILLEGAL_ARGUMENT_ERROR
;
2353 //--------------------------------------------------------------------------------
2357 //--------------------------------------------------------------------------------
2358 int32_t RegexMatcher::getTimeLimit() const {
2363 //--------------------------------------------------------------------------------
2367 //--------------------------------------------------------------------------------
2368 void RegexMatcher::setStackLimit(int32_t limit
, UErrorCode
&status
) {
2369 if (U_FAILURE(status
)) {
2372 if (U_FAILURE(fDeferredStatus
)) {
2373 status
= fDeferredStatus
;
2377 status
= U_ILLEGAL_ARGUMENT_ERROR
;
2381 // Reset the matcher. This is needed here in case there is a current match
2382 // whose final stack frame (containing the match results, pointed to by fFrame)
2383 // would be lost by resizing to a smaller stack size.
2387 // Unlimited stack expansion
2388 fStack
->setMaxCapacity(0);
2390 // Change the units of the limit from bytes to ints, and bump the size up
2391 // to be big enough to hold at least one stack frame for the pattern,
2392 // if it isn't there already.
2393 int32_t adjustedLimit
= limit
/ sizeof(int32_t);
2394 if (adjustedLimit
< fPattern
->fFrameSize
) {
2395 adjustedLimit
= fPattern
->fFrameSize
;
2397 fStack
->setMaxCapacity(adjustedLimit
);
2399 fStackLimit
= limit
;
2403 //--------------------------------------------------------------------------------
2407 //--------------------------------------------------------------------------------
2408 int32_t RegexMatcher::getStackLimit() const {
2413 //--------------------------------------------------------------------------------
2417 //--------------------------------------------------------------------------------
2418 void RegexMatcher::setMatchCallback(URegexMatchCallback
*callback
,
2419 const void *context
,
2420 UErrorCode
&status
) {
2421 if (U_FAILURE(status
)) {
2424 fCallbackFn
= callback
;
2425 fCallbackContext
= context
;
2429 //--------------------------------------------------------------------------------
2433 //--------------------------------------------------------------------------------
2434 void RegexMatcher::getMatchCallback(URegexMatchCallback
*&callback
,
2435 const void *&context
,
2436 UErrorCode
&status
) {
2437 if (U_FAILURE(status
)) {
2440 callback
= fCallbackFn
;
2441 context
= fCallbackContext
;
2445 //--------------------------------------------------------------------------------
2449 //--------------------------------------------------------------------------------
2450 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback
*callback
,
2451 const void *context
,
2452 UErrorCode
&status
) {
2453 if (U_FAILURE(status
)) {
2456 fFindProgressCallbackFn
= callback
;
2457 fFindProgressCallbackContext
= context
;
2461 //--------------------------------------------------------------------------------
2465 //--------------------------------------------------------------------------------
2466 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback
*&callback
,
2467 const void *&context
,
2468 UErrorCode
&status
) {
2469 if (U_FAILURE(status
)) {
2472 callback
= fFindProgressCallbackFn
;
2473 context
= fFindProgressCallbackContext
;
2477 //================================================================================
2479 // Code following this point in this file is the internal
2480 // Match Engine Implementation.
2482 //================================================================================
2485 //--------------------------------------------------------------------------------
2488 // Discard any previous contents of the state save stack, and initialize a
2489 // new stack frame to all -1. The -1s are needed for capture group limits,
2490 // where they indicate that a group has not yet matched anything.
2491 //--------------------------------------------------------------------------------
2492 REStackFrame
*RegexMatcher::resetStack() {
2493 // Discard any previous contents of the state save stack, and initialize a
2494 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2495 // where they indicate that a group has not yet matched anything.
2496 fStack
->removeAllElements();
2498 REStackFrame
*iFrame
= (REStackFrame
*)fStack
->reserveBlock(fPattern
->fFrameSize
, fDeferredStatus
);
2500 for (i
=0; i
<fPattern
->fFrameSize
-RESTACKFRAME_HDRCOUNT
; i
++) {
2501 iFrame
->fExtra
[i
] = -1;
2508 //--------------------------------------------------------------------------------
2511 // in perl, "xab..cd..", \b is true at positions 0,3,5,7
2513 // If the current char is a combining mark,
2515 // Else Scan backwards to the first non-combining char.
2516 // We are at a boundary if the this char and the original chars are
2517 // opposite in membership in \w set
2519 // parameters: pos - the current position in the input buffer
2521 // TODO: double-check edge cases at region boundaries.
2523 //--------------------------------------------------------------------------------
// Tests whether input position `pos` is a \b word boundary by comparing the
// word-set membership (URX_ISWORD_SET) of the character at `pos` with that of
// a preceding character that is neither a grapheme extender nor a format char.
//   pos      native index into fInputText at which to test.
//   returns  presumably isBoundary (cIsWord XOR prevCIsWord) -- the return
//            statements are not visible in this extract; TODO confirm.
// NOTE(review): the embedded source line numbers skip (e.g. 2529-2530, 2537-2538,
//               2545, 2547-2548), so some statements and braces of this function
//               are missing from this view.
2524 UBool
RegexMatcher::isWordBoundary(int64_t pos
) {
2525 UBool isBoundary
= FALSE
;
2526 UBool cIsWord
= FALSE
;
// Positions at or beyond the look limit are handled separately; the body of this
// branch is not visible here. cIsWord keeps its FALSE initializer unless assigned below.
2528 if (pos
>= fLookLimit
) {
2531 // Determine whether char c at current position is a member of the word set of chars.
2532 // If we're off the end of the string, behave as though we're not at a word char.
// Move the UText iterator to pos and read the code point located there.
2533 UTEXT_SETNATIVEINDEX(fInputText
, pos
);
2534 UChar32 c
= UTEXT_CURRENT32(fInputText
);
// Grapheme-extend and format characters are the ones treated as "combining" here.
2535 if (u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
) || u_charType(c
) == U_FORMAT_CHAR
) {
2536 // Current char is a combining one. Not a boundary.
// Record whether c belongs to the pattern's static word-character set.
2539 cIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(c
);
2542 // Back up until we come to a non-combining char, determine whether
2543 // that char is a word char.
2544 UBool prevCIsWord
= FALSE
;
// The backward scan is bounded by fLookStart, using the iterator's current index.
2546 if (UTEXT_GETNATIVEINDEX(fInputText
) <= fLookStart
) {
// Step the iterator back by one code point.
2549 UChar32 prevChar
= UTEXT_PREVIOUS32(fInputText
);
2550 if (!(u_hasBinaryProperty(prevChar
, UCHAR_GRAPHEME_EXTEND
)
2551 || u_charType(prevChar
) == U_FORMAT_CHAR
)) {
// Found a char that is neither grapheme-extend nor format: record its word-set membership.
2552 prevCIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(prevChar
);
// A boundary exists exactly when one side is a word char and the other is not.
2556 isBoundary
= cIsWord
^ prevCIsWord
;
// Word-boundary (\b) test that reads UTF-16 code units directly from the UText
// chunk buffer (fInputText->chunkContents) using the U16_GET / U16_PREV macros,
// instead of going through the UText iterator macros.
//   pos      index into the chunk buffer at which to test (int32_t).
//   returns  presumably isBoundary (cIsWord XOR prevCIsWord) -- the return
//            statements are not visible in this extract; TODO confirm.
// NOTE(review): the embedded source line numbers skip (e.g. 2567-2568, 2571,
//               2575-2576, 2583, 2585-2587), so the declarations of `c` and
//               `prevChar` and several statements/braces are missing from this view.
2560 UBool
RegexMatcher::isChunkWordBoundary(int32_t pos
) {
2561 UBool isBoundary
= FALSE
;
2562 UBool cIsWord
= FALSE
;
// Direct pointer to the UTF-16 contents of the current text chunk.
2564 const UChar
*inputBuf
= fInputText
->chunkContents
;
// Positions at or beyond the look limit are handled separately; the body of this
// branch is not visible here. cIsWord keeps its FALSE initializer unless assigned below.
2566 if (pos
>= fLookLimit
) {
2569 // Determine whether char c at current position is a member of the word set of chars.
2570 // If we're off the end of the string, behave as though we're not at a word char.
// Read the code point at pos, with fLookStart / fLookLimit as the buffer bounds.
2572 U16_GET(inputBuf
, fLookStart
, pos
, fLookLimit
, c
);
// Grapheme-extend and format characters are the ones treated as "combining" here.
2573 if (u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
) || u_charType(c
) == U_FORMAT_CHAR
) {
2574 // Current char is a combining one. Not a boundary.
// Record whether c belongs to the pattern's static word-character set.
2577 cIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(c
);
2580 // Back up until we come to a non-combining char, determine whether
2581 // that char is a word char.
2582 UBool prevCIsWord
= FALSE
;
// The backward scan is bounded by fLookStart.
2584 if (pos
<= fLookStart
) {
// U16_PREV moves pos back over one code point and stores that code point in prevChar.
2588 U16_PREV(inputBuf
, fLookStart
, pos
, prevChar
);
2589 if (!(u_hasBinaryProperty(prevChar
, UCHAR_GRAPHEME_EXTEND
)
2590 || u_charType(prevChar
) == U_FORMAT_CHAR
)) {
// Found a char that is neither grapheme-extend nor format: record its word-set membership.
2591 prevCIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(prevChar
);
// A boundary exists exactly when one side is a word char and the other is not.
2595 isBoundary
= cIsWord
^ prevCIsWord
;
2599 //--------------------------------------------------------------------------------
2603 // Test for a word boundary using RBBI word break.
2605 // parameters: pos - the current position in the input buffer
2607 //--------------------------------------------------------------------------------
// Word-boundary test that delegates to a word BreakIterator (fWordBreakItr)
// rather than to the \w set comparison.
//   pos      native index into fInputText at which to test.
//   returns  presumably returnVal -- the return statement is not visible in
//            this extract; TODO confirm.
// The iterator is created on first use with the English locale; creation and
// setText errors are written to fDeferredStatus.
// NOTE(review): the embedded source line numbers skip (e.g. 2614, 2617-2618, 2623,
//               2627, 2634+), so the assignment target of the createWordInstance
//               call, the failure branch body, and the closing #endif are missing
//               from this view.
2608 UBool
RegexMatcher::isUWordBoundary(int64_t pos
) {
2609 UBool returnVal
= FALSE
;
// Everything below is compiled only when break iteration is configured in.
2610 #if UCONFIG_NO_BREAK_ITERATION==0
2612 // If we haven't yet created a break iterator for this matcher, do it now.
2613 if (fWordBreakItr
== NULL
) {
// The factory result is cast down to RuleBasedBreakIterator*.
2615 (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus
);
2616 if (U_FAILURE(fDeferredStatus
)) {
// Point the new iterator at the matcher's input text.
2619 fWordBreakItr
->setText(fInputText
, fDeferredStatus
);
// At or past the look limit the position is reported as a boundary.
2622 if (pos
>= fLookLimit
) {
2624 returnVal
= TRUE
; // With Unicode word rules, only positions within the interior of "real"
2625 // words are not boundaries. All non-word chars stand by themselves,
2626 // with word boundaries on both sides.
// For text whose native indexes are not UTF-16 offsets, pos is replaced by the value
// utext_extract() returns for the range [0, pos) with a NULL, zero-capacity destination
// (presumably the UTF-16 length of that range -- confirm against the utext_extract docs).
// The local status is not inspected in the visible code.
2628 if (!UTEXT_USES_U16(fInputText
)) {
2629 // !!!: Would like a better way to do this!
2630 UErrorCode status
= U_ZERO_ERROR
;
2631 pos
= utext_extract(fInputText
, 0, pos
, NULL
, 0, &status
);
// Ask the break iterator; note pos is narrowed to int32_t for this call.
2633 returnVal
= fWordBreakItr
->isBoundary((int32_t)pos
);
2639 //--------------------------------------------------------------------------------
2641 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state
2642 // saves. Increment the "time" counter, and call the
2643 // user callback function if there is one installed.
2645 // If the match operation needs to be aborted, either for a time-out
2646 // or because the user callback asked for it, just set an error status.
2647 // The engine will pick that up and stop in its outer loop.
2649 //--------------------------------------------------------------------------------
// Resets the tick counter, gives the user match callback (if installed) a chance
// to stop the match, and enforces the configured time limit.
//   status   set to U_REGEX_STOPPED_BY_CALLER when the callback returns FALSE,
//            or to U_REGEX_TIME_OUT when fTimeLimit is positive and fTime has
//            reached it. The time-limit check is written after the callback
//            check, so if both fire U_REGEX_TIME_OUT is the value left in status
//            (assuming no early return in the lines missing from this view).
// NOTE(review): embedded source line 2652 is missing from this extract
//               (presumably where fTime is advanced -- TODO confirm).
2650 void RegexMatcher::IncrementTime(UErrorCode
&status
) {
// Re-arm the countdown that triggers the next call to this function.
2651 fTickCounter
= TIMER_INITIAL_VALUE
;
// The callback receives the user context and the current fTime value.
2653 if (fCallbackFn
!= NULL
) {
2654 if ((*fCallbackFn
)(fCallbackContext
, fTime
) == FALSE
) {
2655 status
= U_REGEX_STOPPED_BY_CALLER
;
// A time limit of zero (or less) disables the time-out check.
2659 if (fTimeLimit
> 0 && fTime
>= fTimeLimit
) {
2660 status
= U_REGEX_TIME_OUT
;
2664 //--------------------------------------------------------------------------------
2666 // ReportFindProgress This function is called once for each advance in the target
2667 // string from the find() function, and calls the user progress callback
2668 // function if there is one installed.
2672 // If the match operation needs to be aborted because the user
2673 // callback asked for it, just set an error status.
2674 // The engine will pick that up and stop in its outer loop.
2676 //--------------------------------------------------------------------------------
// Invokes the user's find-progress callback, if one is installed, passing it the
// callback context and the current match index.
//   matchIndex  position in the input being reported to the callback.
//   status      when the callback returns FALSE, status is assigned U_ZERO_ERROR;
//               the U_REGEX_STOPPED_BY_CALLER code is deliberately commented out
//               below, so no error is raised for a caller-requested stop.
//   returns     UBool; the return statements are not visible in this extract.
//               The REGEXFINDPROGRESS_INTERRUPT macro at the top of the file
//               treats a FALSE return as "interrupt the find".
2677 UBool
RegexMatcher::ReportFindProgress(int64_t matchIndex
, UErrorCode
&status
) {
2678 if (fFindProgressCallbackFn
!= NULL
) {
2679 if ((*fFindProgressCallbackFn
)(fFindProgressCallbackContext
, matchIndex
) == FALSE
) {
2680 status
= U_ZERO_ERROR
/*U_REGEX_STOPPED_BY_CALLER*/;
2687 //--------------------------------------------------------------------------------
2690 // Make a new stack frame, initialized as a copy of the current stack frame.
2691 // Set the pattern index in the original stack frame from the operand value
2692 // in the opcode. Execution of the engine continues with the state in
2693 // the newly created stack frame
2695 // Note that reserveBlock() may grow the stack, resulting in the
2696 // whole thing being relocated in memory.
2699 // fp The top frame pointer when called. At return, a new
2700 // frame will be present
2701 // savePatIdx An index into the compiled pattern. Goes into the original
2702 // (not new) frame. If execution ever back-tracks out of the
2703 // new frame, this will be where we continue from in the pattern.
2705 // The new frame pointer.
2707 //--------------------------------------------------------------------------------
// Pushes a new backtrack-stack frame that starts as a copy of the current top
// frame, then stores savePatIdx into the ORIGINAL frame so that back-tracking
// resumes there.
//   fp          current top frame on entry.
//   savePatIdx  pattern index written to the original frame's fPatIdx.
//   status      set to U_REGEX_STACK_OVERFLOW if the stack cannot supply a new
//               block; may also be modified by IncrementTime().
//   returns     pointer to the newly reserved frame (on the failure path the
//               comments below say the previous frame is returned; that return
//               statement is not visible in this extract).
// NOTE(review): the embedded source line numbers skip (e.g. 2720-2721, 2727,
//               2730-2734), so the failure-path return, the copy-loop header/exit
//               and the tick-counter decrement are missing from this view.
2708 inline REStackFrame
*RegexMatcher::StateSave(REStackFrame
*fp
, int64_t savePatIdx
, UErrorCode
&status
) {
2709 // push storage for a new frame.
2710 int64_t *newFP
= fStack
->reserveBlock(fFrameSize
, status
);
2711 if (newFP
== NULL
) {
2712 // Failure on attempted stack expansion.
2713 // Stack function set some other error code, change it to a more
2714 // specific one for regular expressions.
2715 status
= U_REGEX_STACK_OVERFLOW
;
2716 // We need to return a writable stack frame, so just return the
2717 // previous frame. The match operation will stop quickly
2718 // because of the error status, after which the frame will never
2719 // be looked at again.
// Recompute the old top frame's address from newFP rather than trusting the
// incoming fp: the frame just below the new block is the previous top.
2722 fp
= (REStackFrame
*)(newFP
- fFrameSize
); // in case of realloc of stack.
2724 // New stack frame = copy of old top frame.
2725 int64_t *source
= (int64_t *)fp
;
2726 int64_t *dest
= newFP
;
// Word-by-word copy; the copy is complete once source reaches the start of the new frame.
2728 *dest
++ = *source
++;
2729 if (source
== newFP
) {
// When the tick counter has run out, do the time-limit / callback processing.
2735 if (fTickCounter
<= 0) {
2736 IncrementTime(status
); // Re-initializes fTickCounter
// The saved pattern index goes into the original (now second-from-top) frame.
2738 fp
->fPatIdx
= savePatIdx
;
2739 return (REStackFrame
*)newFP
;
2743 //--------------------------------------------------------------------------------
2745 // MatchAt This is the actual matching engine.
2747 // startIdx: begin matching at this index.
2748 // toEnd: if true, match must extend to end of the input region
2750 //--------------------------------------------------------------------------------
2751 void RegexMatcher::MatchAt(int64_t startIdx
, UBool toEnd
, UErrorCode
&status
) {
2752 UBool isMatch
= FALSE
; // True if the we have a match.
2754 int64_t backSearchIndex
= U_INT64_MAX
; // used after greedy single-character matches for searching backwards
2756 int32_t op
; // Operation from the compiled pattern, split into
2757 int32_t opType
; // the opcode
2758 int32_t opValue
; // and the operand value.
2760 #ifdef REGEX_RUN_DEBUG
2763 printf("MatchAt(startIdx=%ld)\n", startIdx
);
2764 printf("Original Pattern: ");
2765 UChar32 c
= utext_next32From(fPattern
->fPattern
, 0);
2766 while (c
!= U_SENTINEL
) {
2767 if (c
<32 || c
>256) {
2770 REGEX_DUMP_DEBUG_PRINTF(("%c", c
));
2772 c
= UTEXT_NEXT32(fPattern
->fPattern
);
2775 printf("Input String: ");
2776 c
= utext_next32From(fInputText
, 0);
2777 while (c
!= U_SENTINEL
) {
2778 if (c
<32 || c
>256) {
2783 c
= UTEXT_NEXT32(fInputText
);
2790 if (U_FAILURE(status
)) {
2794 // Cache frequently referenced items from the compiled pattern
2796 int64_t *pat
= fPattern
->fCompiledPat
->getBuffer();
2798 const UChar
*litText
= fPattern
->fLiteralText
.getBuffer();
2799 UVector
*sets
= fPattern
->fSets
;
2801 fFrameSize
= fPattern
->fFrameSize
;
2802 REStackFrame
*fp
= resetStack();
2805 fp
->fInputIdx
= startIdx
;
2807 // Zero out the pattern's static data
2809 for (i
= 0; i
<fPattern
->fDataSize
; i
++) {
2814 // Main loop for interpreting the compiled pattern.
2815 // One iteration of the loop per pattern operation performed.
2819 if (_heapchk() != _HEAPOK
) {
2820 fprintf(stderr
, "Heap Trouble\n");
2824 op
= (int32_t)pat
[fp
->fPatIdx
];
2825 opType
= URX_TYPE(op
);
2826 opValue
= URX_VAL(op
);
2827 #ifdef REGEX_RUN_DEBUG
2829 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2830 printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp
->fInputIdx
,
2831 UTEXT_CURRENT32(fInputText
), (int64_t *)fp
-fStack
->getBuffer(), fActiveLimit
);
2832 fPattern
->dumpOp(fp
->fPatIdx
);
2845 // Force a backtrack. In some circumstances, the pattern compiler
2846 // will notice that the pattern can't possibly match anything, and will
2847 // emit one of these at that point.
2848 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2853 if (fp
->fInputIdx
< fActiveLimit
) {
2854 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2855 UChar32 c
= UTEXT_NEXT32(fInputText
);
2857 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
2863 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2869 // Test input against a literal string.
2870 // Strings require two slots in the compiled pattern, one for the
2871 // offset to the string text, and one for the length.
2873 int32_t stringStartIdx
= opValue
;
2874 op
= (int32_t)pat
[fp
->fPatIdx
]; // Fetch the second operand
2876 opType
= URX_TYPE(op
);
2877 int32_t stringLen
= URX_VAL(op
);
2878 U_ASSERT(opType
== URX_STRING_LEN
);
2879 U_ASSERT(stringLen
>= 2);
2881 const UChar
*patternString
= litText
+stringStartIdx
;
2882 int32_t patternStringIndex
= 0;
2883 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2885 UChar32 patternChar
;
2886 UBool success
= TRUE
;
2887 while (patternStringIndex
< stringLen
) {
2888 if (UTEXT_GETNATIVEINDEX(fInputText
) >= fActiveLimit
) {
2893 inputChar
= UTEXT_NEXT32(fInputText
);
2894 U16_NEXT(patternString
, patternStringIndex
, stringLen
, patternChar
);
2895 if (patternChar
!= inputChar
) {
2902 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
2904 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2910 case URX_STATE_SAVE
:
2911 fp
= StateSave(fp
, opValue
, status
);
2916 // The match loop will exit via this path on a successful match,
2917 // when we reach the end of the pattern.
2918 if (toEnd
&& fp
->fInputIdx
!= fActiveLimit
) {
2919 // The pattern matched, but not to the end of input. Try some more.
2920 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2926 // Start and End Capture stack frame variables are laid out like this:
2927 // fp->fExtra[opValue] - The start of a completed capture group
2928 // opValue+1 - The end of a completed capture group
2929 // opValue+2 - the start of a capture group whose end
2930 // has not yet been reached (and might not ever be).
2931 case URX_START_CAPTURE
:
2932 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
2933 fp
->fExtra
[opValue
+2] = fp
->fInputIdx
;
2937 case URX_END_CAPTURE
:
2938 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
2939 U_ASSERT(fp
->fExtra
[opValue
+2] >= 0); // Start pos for this group must be set.
2940 fp
->fExtra
[opValue
] = fp
->fExtra
[opValue
+2]; // Tentative start becomes real.
2941 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // End position
2942 U_ASSERT(fp
->fExtra
[opValue
] <= fp
->fExtra
[opValue
+1]);
2946 case URX_DOLLAR
: // $, test for End of line
2947 // or for position before new line at end of input
2949 if (fp
->fInputIdx
>= fAnchorLimit
) {
2950 // We really are at the end of input. Success.
2956 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2958 // If we are positioned just before a new-line that is located at the
2959 // end of input, succeed.
2960 UChar32 c
= UTEXT_NEXT32(fInputText
);
2961 if (UTEXT_GETNATIVEINDEX(fInputText
) >= fAnchorLimit
) {
2962 if ((c
>=0x0a && c
<=0x0d) || c
==0x85 || c
==0x2028 || c
==0x2029) {
2963 // If not in the middle of a CR/LF sequence
2964 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& ((void)UTEXT_PREVIOUS32(fInputText
), UTEXT_PREVIOUS32(fInputText
))==0x0d)) {
2965 // At new-line at end of input. Success
2973 UChar32 nextC
= UTEXT_NEXT32(fInputText
);
2974 if (c
== 0x0d && nextC
== 0x0a && UTEXT_GETNATIVEINDEX(fInputText
) >= fAnchorLimit
) {
2977 break; // At CR/LF at end of input. Success
2981 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2986 case URX_DOLLAR_D
: // $, test for End of Line, in UNIX_LINES mode.
2987 if (fp
->fInputIdx
>= fAnchorLimit
) {
2988 // Off the end of input. Success.
2993 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2994 UChar32 c
= UTEXT_NEXT32(fInputText
);
2995 // Either at the last character of input, or off the end.
2996 if (c
== 0x0a && UTEXT_GETNATIVEINDEX(fInputText
) == fAnchorLimit
) {
3003 // Not at end of input. Back-track out.
3004 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3008 case URX_DOLLAR_M
: // $, test for End of line in multi-line mode
3010 if (fp
->fInputIdx
>= fAnchorLimit
) {
3011 // We really are at the end of input. Success.
3016 // If we are positioned just before a new-line, succeed.
3017 // It makes no difference where the new-line is within the input.
3018 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3019 UChar32 c
= UTEXT_CURRENT32(fInputText
);
3020 if ((c
>=0x0a && c
<=0x0d) || c
==0x85 ||c
==0x2028 || c
==0x2029) {
3021 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
3022 // In multi-line mode, hitting a new-line just before the end of input does not
3023 // set the hitEnd or requireEnd flags
3024 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& UTEXT_PREVIOUS32(fInputText
)==0x0d)) {
3028 // not at a new line. Fail.
3029 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3034 case URX_DOLLAR_MD
: // $, test for End of line in multi-line and UNIX_LINES mode
3036 if (fp
->fInputIdx
>= fAnchorLimit
) {
3037 // We really are at the end of input. Success.
3039 fRequireEnd
= TRUE
; // Java set requireEnd in this case, even though
3040 break; // adding a new-line would not lose the match.
3042 // If we are not positioned just before a new-line, the test fails; backtrack out.
3043 // It makes no difference where the new-line is within the input.
3044 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3045 if (UTEXT_CURRENT32(fInputText
) != 0x0a) {
3046 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3052 case URX_CARET
: // ^, test for start of line
3053 if (fp
->fInputIdx
!= fAnchorStart
) {
3054 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3059 case URX_CARET_M
: // ^, test for start of line in mulit-line mode
3061 if (fp
->fInputIdx
== fAnchorStart
) {
3062 // We are at the start of input. Success.
3065 // Check whether character just before the current pos is a new-line
3066 // unless we are at the end of input
3067 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3068 UChar32 c
= UTEXT_PREVIOUS32(fInputText
);
3069 if ((fp
->fInputIdx
< fAnchorLimit
) &&
3070 ((c
<=0x0d && c
>=0x0a) || c
==0x85 ||c
==0x2028 || c
==0x2029)) {
3071 // It's a new-line. ^ is true. Success.
3072 // TODO: what should be done with positions between a CR and LF?
3075 // Not at the start of a line. Fail.
3076 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3081 case URX_CARET_M_UNIX
: // ^, test for start of line in mulit-line + Unix-line mode
3083 U_ASSERT(fp
->fInputIdx
>= fAnchorStart
);
3084 if (fp
->fInputIdx
<= fAnchorStart
) {
3085 // We are at the start of input. Success.
3088 // Check whether character just before the current pos is a new-line
3089 U_ASSERT(fp
->fInputIdx
<= fAnchorLimit
);
3090 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3091 UChar32 c
= UTEXT_PREVIOUS32(fInputText
);
3093 // Not at the start of a line. Back-track out.
3094 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3099 case URX_BACKSLASH_B
: // Test for word boundaries
3101 UBool success
= isWordBoundary(fp
->fInputIdx
);
3102 success
^= (opValue
!= 0); // flip sense for \B
3104 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3110 case URX_BACKSLASH_BU
: // Test for word boundaries, Unicode-style
3112 UBool success
= isUWordBoundary(fp
->fInputIdx
);
3113 success
^= (opValue
!= 0); // flip sense for \B
3115 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3121 case URX_BACKSLASH_D
: // Test for decimal digit
3123 if (fp
->fInputIdx
>= fActiveLimit
) {
3125 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3129 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3131 UChar32 c
= UTEXT_NEXT32(fInputText
);
3132 int8_t ctype
= u_charType(c
); // TODO: make a unicode set for this. Will be faster.
3133 UBool success
= (ctype
== U_DECIMAL_DIGIT_NUMBER
);
3134 success
^= (opValue
!= 0); // flip sense for \D
3136 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3138 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3144 case URX_BACKSLASH_G
: // Test for position at end of previous match
3145 if (!((fMatch
&& fp
->fInputIdx
==fMatchEnd
) || (fMatch
==FALSE
&& fp
->fInputIdx
==fActiveStart
))) {
3146 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3151 case URX_BACKSLASH_X
:
3152 // Match a Grapheme, as defined by Unicode TR 29.
3153 // Differs slightly from Perl, which consumes combining marks independently
3157 // Fail if at end of input
3158 if (fp
->fInputIdx
>= fActiveLimit
) {
3160 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3164 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3166 // Examine (and consume) the current char.
3167 // Dispatch into a little state machine, based on the char.
3169 c
= UTEXT_NEXT32(fInputText
);
3170 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3171 UnicodeSet
**sets
= fPattern
->fStaticSets
;
3172 if (sets
[URX_GC_NORMAL
]->contains(c
)) goto GC_Extend
;
3173 if (sets
[URX_GC_CONTROL
]->contains(c
)) goto GC_Control
;
3174 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
3175 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
3176 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
3177 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
3178 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
3184 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
3185 c
= UTEXT_NEXT32(fInputText
);
3186 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3187 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
3188 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
3189 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
3190 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
3191 (void)UTEXT_PREVIOUS32(fInputText
);
3192 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3196 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
3197 c
= UTEXT_NEXT32(fInputText
);
3198 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3199 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
3200 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
3201 (void)UTEXT_PREVIOUS32(fInputText
);
3202 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3206 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
3207 c
= UTEXT_NEXT32(fInputText
);
3208 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3209 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
3210 (void)UTEXT_PREVIOUS32(fInputText
);
3211 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3215 // Combining characters are consumed here
3217 if (fp
->fInputIdx
>= fActiveLimit
) {
3220 c
= UTEXT_CURRENT32(fInputText
);
3221 if (sets
[URX_GC_EXTEND
]->contains(c
) == FALSE
) {
3224 (void)UTEXT_NEXT32(fInputText
);
3225 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3230 // Most control chars stand alone (don't combine with combining chars),
3231 // except for that CR/LF sequence is a single grapheme cluster.
3232 if (c
== 0x0d && fp
->fInputIdx
< fActiveLimit
&& UTEXT_CURRENT32(fInputText
) == 0x0a) {
3233 c
= UTEXT_NEXT32(fInputText
);
3234 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3238 if (fp
->fInputIdx
>= fActiveLimit
) {
3247 case URX_BACKSLASH_Z
: // Test for end of Input
3248 if (fp
->fInputIdx
< fAnchorLimit
) {
3249 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3258 case URX_STATIC_SETREF
:
3260 // Test input character against one of the predefined sets
3261 // (Word Characters, for example)
3262 // The high bit of the op value is a flag for the match polarity.
3263 // 0: success if input char is in set.
3264 // 1: success if input char is not in set.
3265 if (fp
->fInputIdx
>= fActiveLimit
) {
3267 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3271 UBool success
= ((opValue
& URX_NEG_SET
) == URX_NEG_SET
);
3272 opValue
&= ~URX_NEG_SET
;
3273 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
3275 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3276 UChar32 c
= UTEXT_NEXT32(fInputText
);
3278 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
3279 if (s8
->contains(c
)) {
3283 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
3284 if (s
->contains(c
)) {
3289 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3291 // the character wasn't in the set.
3292 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3298 case URX_STAT_SETREF_N
:
3300 // Test input character for NOT being a member of one of
3301 // the predefined sets (Word Characters, for example)
3302 if (fp
->fInputIdx
>= fActiveLimit
) {
3304 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3308 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
3310 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3312 UChar32 c
= UTEXT_NEXT32(fInputText
);
3314 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
3315 if (s8
->contains(c
) == FALSE
) {
3316 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3320 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
3321 if (s
->contains(c
) == FALSE
) {
3322 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3326 // the character wasn't in the set.
3327 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3333 if (fp
->fInputIdx
>= fActiveLimit
) {
3335 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3338 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3340 // There is input left. Pick up one char and test it for set membership.
3341 UChar32 c
= UTEXT_NEXT32(fInputText
);
3342 U_ASSERT(opValue
> 0 && opValue
< sets
->size());
3344 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
3345 if (s8
->contains(c
)) {
3346 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3350 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
3351 if (s
->contains(c
)) {
3352 // The character is in the set. A Match.
3353 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3358 // the character wasn't in the set.
3359 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3366 // . matches anything, but stops at end-of-line.
3367 if (fp
->fInputIdx
>= fActiveLimit
) {
3368 // At end of input. Match failed. Backtrack out.
3370 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3374 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3376 // There is input left. Advance over one char, unless we've hit end-of-line
3377 UChar32 c
= UTEXT_NEXT32(fInputText
);
3378 if (((c
& 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
3379 ((c
<=0x0d && c
>=0x0a) || c
==0x85 ||c
==0x2028 || c
==0x2029)) {
3380 // End of line in normal mode. . does not match.
3381 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3384 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3389 case URX_DOTANY_ALL
:
3391 // ., in dot-matches-all (including new lines) mode
3392 if (fp
->fInputIdx
>= fActiveLimit
) {
3393 // At end of input. Match failed. Backtrack out.
3395 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3399 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3401 // There is input left. Advance over one char, except if we are
3402 // at a cr/lf, advance over both of them.
3404 c
= UTEXT_NEXT32(fInputText
);
3405 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3406 if (c
==0x0d && fp
->fInputIdx
< fActiveLimit
) {
3407 // In the case of a CR/LF, we need to advance over both.
3408 UChar32 nextc
= UTEXT_CURRENT32(fInputText
);
3409 if (nextc
== 0x0a) {
3410 (void)UTEXT_NEXT32(fInputText
);
3411 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3418 case URX_DOTANY_UNIX
:
3420 // '.' operator, matches all, but stops at end-of-line.
3421 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
3422 if (fp
->fInputIdx
>= fActiveLimit
) {
3423 // At end of input. Match failed. Backtrack out.
3425 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3429 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3431 // There is input left. Advance over one char, unless we've hit end-of-line
3432 UChar32 c
= UTEXT_NEXT32(fInputText
);
3434 // End of line in normal mode. '.' does not match the \n
3435 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3437 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3444 fp
->fPatIdx
= opValue
;
3452 U_ASSERT(opValue
< fPattern
->fCompiledPat
->size());
3453 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
3454 fp
->fPatIdx
= opValue
; // Then JMP.
3458 // This opcode is used with (x)+, when x can match a zero length string.
3459 // Same as JMP_SAV, except conditional on the match having made forward progress.
3460 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3461 // data address of the input position at the start of the loop.
3463 U_ASSERT(opValue
> 0 && opValue
< fPattern
->fCompiledPat
->size());
3464 int32_t stoOp
= (int32_t)pat
[opValue
-1];
3465 U_ASSERT(URX_TYPE(stoOp
) == URX_STO_INP_LOC
);
3466 int32_t frameLoc
= URX_VAL(stoOp
);
3467 U_ASSERT(frameLoc
>= 0 && frameLoc
< fFrameSize
);
3468 int64_t prevInputIdx
= fp
->fExtra
[frameLoc
];
3469 U_ASSERT(prevInputIdx
<= fp
->fInputIdx
);
3470 if (prevInputIdx
< fp
->fInputIdx
) {
3471 // The match did make progress. Repeat the loop.
3472 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
3473 fp
->fPatIdx
= opValue
;
3474 fp
->fExtra
[frameLoc
] = fp
->fInputIdx
;
3476 // If the input position did not advance, we do nothing here,
3477 // execution will fall out of the loop.
3483 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
3484 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
3486 // Pick up the three extra operands that CTR_INIT has, and
3487 // skip the pattern location counter past
3488 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
3490 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
3491 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
3492 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
3493 U_ASSERT(minCount
>=0);
3494 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
3495 U_ASSERT(loopLoc
>fp
->fPatIdx
);
3497 if (minCount
== 0) {
3498 fp
= StateSave(fp
, loopLoc
+1, status
);
3500 if (maxCount
== 0) {
3501 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3508 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
3509 int32_t initOp
= (int32_t)pat
[opValue
];
3510 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT
);
3511 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
3512 int32_t minCount
= (int32_t)pat
[opValue
+2];
3513 int32_t maxCount
= (int32_t)pat
[opValue
+3];
3514 // Increment the counter. Note: we DIDN'T worry about counter
3515 // overflow, since the data comes from UnicodeStrings, which
3516 // stores its length in an int32_t. Do we have to think about
3517 // this now that we're using UText? Probably not, since the length
3518 // in UChar32s is still an int32_t.
3520 U_ASSERT(*pCounter
> 0);
3521 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
) {
3522 U_ASSERT(*pCounter
== maxCount
|| maxCount
== -1);
3525 if (*pCounter
>= minCount
) {
3526 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
3528 fp
->fPatIdx
= opValue
+ 4; // Loop back.
3532 case URX_CTR_INIT_NG
:
3534 // Initialize a non-greedy loop
3535 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
3536 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
3538 // Pick up the three extra operands that CTR_INIT has, and
3539 // skip the pattern location counter past
3540 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
3542 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
3543 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
3544 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
3545 U_ASSERT(minCount
>=0);
3546 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
3547 U_ASSERT(loopLoc
>fp
->fPatIdx
);
3549 if (minCount
== 0) {
3550 if (maxCount
!= 0) {
3551 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
3553 fp
->fPatIdx
= loopLoc
+1; // Continue with stuff after repeated block
3558 case URX_CTR_LOOP_NG
:
3560 // Non-greedy {min, max} loops
3561 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
3562 int32_t initOp
= (int32_t)pat
[opValue
];
3563 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT_NG
);
3564 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
3565 int32_t minCount
= (int32_t)pat
[opValue
+2];
3566 int32_t maxCount
= (int32_t)pat
[opValue
+3];
3567 // Increment the counter. Note: we DIDN'T worry about counter
3568 // overflow, since the data comes from UnicodeStrings, which
3569 // stores its length in an int32_t. Do we have to think about
3570 // this now that we're using UText? Probably not, since the length
3571 // in UChar32s is still an int32_t.
3573 U_ASSERT(*pCounter
> 0);
3575 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
) {
3576 // The loop has matched the maximum permitted number of times.
3577 // Break out of here with no action. Matching will
3578 // continue with the following pattern.
3579 U_ASSERT(*pCounter
== maxCount
|| maxCount
== -1);
3583 if (*pCounter
< minCount
) {
3584 // We haven't met the minimum number of matches yet.
3585 // Loop back for another one.
3586 fp
->fPatIdx
= opValue
+ 4; // Loop back.
3588 // We do have the minimum number of matches.
3589 // Fall into the following pattern, but first do
3590 // a state save to the top of the loop, so that a failure
3591 // in the following pattern will try another iteration of the loop.
3592 fp
= StateSave(fp
, opValue
+ 4, status
);
3598 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
3599 fData
[opValue
] = fStack
->size();
3604 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
3605 int32_t newStackSize
= (int32_t)fData
[opValue
];
3606 U_ASSERT(newStackSize
<= fStack
->size());
3607 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
3608 if (newFP
== (int64_t *)fp
) {
3612 for (i
=0; i
<fFrameSize
; i
++) {
3613 newFP
[i
] = ((int64_t *)fp
)[i
];
3615 fp
= (REStackFrame
*)newFP
;
3616 fStack
->setSize(newStackSize
);
3622 U_ASSERT(opValue
< fFrameSize
);
3623 int64_t groupStartIdx
= fp
->fExtra
[opValue
];
3624 int64_t groupEndIdx
= fp
->fExtra
[opValue
+1];
3625 U_ASSERT(groupStartIdx
<= groupEndIdx
);
3626 if (groupStartIdx
< 0) {
3627 // This capture group has not participated in the match thus far,
3628 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
3631 UTEXT_SETNATIVEINDEX(fAltInputText
, groupStartIdx
);
3632 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3634 // Note: if the capture group match was of an empty string the backref
3635 // match succeeds. Verified by testing: Perl matches succeed
3636 // in this case, so we do too.
3638 UBool success
= TRUE
;
3640 if (utext_getNativeIndex(fAltInputText
) >= groupEndIdx
) {
3644 if (utext_getNativeIndex(fInputText
) >= fActiveLimit
) {
3649 UChar32 captureGroupChar
= utext_next32(fAltInputText
);
3650 UChar32 inputChar
= utext_next32(fInputText
);
3651 if (inputChar
!= captureGroupChar
) {
3658 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3660 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3669 U_ASSERT(opValue
< fFrameSize
);
3670 int64_t groupStartIdx
= fp
->fExtra
[opValue
];
3671 int64_t groupEndIdx
= fp
->fExtra
[opValue
+1];
3672 U_ASSERT(groupStartIdx
<= groupEndIdx
);
3673 if (groupStartIdx
< 0) {
3674 // This capture group has not participated in the match thus far,
3675 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
3678 utext_setNativeIndex(fAltInputText
, groupStartIdx
);
3679 utext_setNativeIndex(fInputText
, fp
->fInputIdx
);
3680 CaseFoldingUTextIterator
captureGroupItr(*fAltInputText
);
3681 CaseFoldingUTextIterator
inputItr(*fInputText
);
3683 // Note: if the capture group match was of an empty string the backref
3684 // match succeeds. Verified by testing: Perl matches succeed
3685 // in this case, so we do too.
3687 UBool success
= TRUE
;
3689 if (!captureGroupItr
.inExpansion() && utext_getNativeIndex(fAltInputText
) >= groupEndIdx
) {
3693 if (!inputItr
.inExpansion() && utext_getNativeIndex(fInputText
) >= fActiveLimit
) {
3698 UChar32 captureGroupChar
= captureGroupItr
.next();
3699 UChar32 inputChar
= inputItr
.next();
3700 if (inputChar
!= captureGroupChar
) {
3706 if (success
&& inputItr
.inExpansion()) {
3707 // We otained a match by consuming part of a string obtained from
3708 // case-folding a single code point of the input text.
3709 // This does not count as an overall match.
3714 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3716 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3722 case URX_STO_INP_LOC
:
3724 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
);
3725 fp
->fExtra
[opValue
] = fp
->fInputIdx
;
3731 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
3733 int32_t dataLoc
= URX_VAL(pat
[instrOperandLoc
]);
3734 U_ASSERT(dataLoc
>= 0 && dataLoc
< fFrameSize
);
3735 int64_t savedInputIdx
= fp
->fExtra
[dataLoc
];
3736 U_ASSERT(savedInputIdx
<= fp
->fInputIdx
);
3737 if (savedInputIdx
< fp
->fInputIdx
) {
3738 fp
->fPatIdx
= opValue
; // JMP
3740 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no progress in loop.
3747 // Entering a lookahead block.
3748 // Save Stack Ptr, Input Pos.
3749 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3750 fData
[opValue
] = fStack
->size();
3751 fData
[opValue
+1] = fp
->fInputIdx
;
3752 fActiveStart
= fLookStart
; // Set the match region change for
3753 fActiveLimit
= fLookLimit
; // transparent bounds.
3759 // Leaving a look-ahead block.
3760 // restore Stack Ptr, Input Pos to positions they had on entry to block.
3761 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3762 int32_t stackSize
= fStack
->size();
3763 int32_t newStackSize
=(int32_t)fData
[opValue
];
3764 U_ASSERT(stackSize
>= newStackSize
);
3765 if (stackSize
> newStackSize
) {
3766 // Copy the current top frame back to the new (cut back) top frame.
3767 // This makes the capture groups from within the look-ahead
3768 // expression available.
3769 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
3771 for (i
=0; i
<fFrameSize
; i
++) {
3772 newFP
[i
] = ((int64_t *)fp
)[i
];
3774 fp
= (REStackFrame
*)newFP
;
3775 fStack
->setSize(newStackSize
);
3777 fp
->fInputIdx
= fData
[opValue
+1];
3779 // Restore the active region bounds in the input string; they may have
3780 // been changed because of transparent bounds on a Region.
3781 fActiveStart
= fRegionStart
;
3782 fActiveLimit
= fRegionLimit
;
3787 // Case insensitive one char. The char from the pattern is already case folded.
3788 // Input text is not, but case folding the input can not reduce two or more code
3790 if (fp
->fInputIdx
< fActiveLimit
) {
3791 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3793 UChar32 c
= UTEXT_NEXT32(fInputText
);
3794 if (u_foldCase(c
, U_FOLD_CASE_DEFAULT
) == opValue
) {
3795 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3802 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3807 // Case-insensitive test input against a literal string.
3808 // Strings require two slots in the compiled pattern, one for the
3809 // offset to the string text, and one for the length.
3810 // The compiled string has already been case folded.
3812 const UChar
*patternString
= litText
+ opValue
;
3813 int32_t patternStringIdx
= 0;
3815 op
= (int32_t)pat
[fp
->fPatIdx
];
3817 opType
= URX_TYPE(op
);
3818 opValue
= URX_VAL(op
);
3819 U_ASSERT(opType
== URX_STRING_LEN
);
3820 int32_t patternStringLen
= opValue
; // Length of the string from the pattern.
3825 UBool success
= TRUE
;
3827 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3828 CaseFoldingUTextIterator
inputIterator(*fInputText
);
3829 while (patternStringIdx
< patternStringLen
) {
3830 if (!inputIterator
.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText
) >= fActiveLimit
) {
3835 U16_NEXT(patternString
, patternStringIdx
, patternStringLen
, cPattern
);
3836 cText
= inputIterator
.next();
3837 if (cText
!= cPattern
) {
3842 if (inputIterator
.inExpansion()) {
3847 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3849 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3857 // Entering a look-behind block.
3858 // Save Stack Ptr, Input Pos.
3859 // TODO: implement transparent bounds. Ticket #6067
3860 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3861 fData
[opValue
] = fStack
->size();
3862 fData
[opValue
+1] = fp
->fInputIdx
;
3863 // Init the variable containing the start index for attempted matches.
3864 fData
[opValue
+2] = -1;
3865 // Save input string length, then reset to pin any matches to end at
3866 // the current position.
3867 fData
[opValue
+3] = fActiveLimit
;
3868 fActiveLimit
= fp
->fInputIdx
;
3875 // Positive Look-Behind, at top of loop checking for matches of LB expression
3876 // at all possible input starting positions.
3878 // Fetch the min and max possible match lengths. They are the operands
3879 // of this op in the pattern.
3880 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
3881 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
3882 U_ASSERT(minML
<= maxML
);
3883 U_ASSERT(minML
>= 0);
3885 // Fetch (from data) the last input index where a match was attempted.
3886 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3887 int64_t *lbStartIdx
= &fData
[opValue
+2];
3888 if (*lbStartIdx
< 0) {
3889 // First time through loop.
3890 *lbStartIdx
= fp
->fInputIdx
- minML
;
3892 // 2nd through nth time through the loop.
3893 // Back up start position for match by one.
3894 if (*lbStartIdx
== 0) {
3897 UTEXT_SETNATIVEINDEX(fInputText
, *lbStartIdx
);
3898 (void)UTEXT_PREVIOUS32(fInputText
);
3899 *lbStartIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3903 if (*lbStartIdx
< 0 || *lbStartIdx
< fp
->fInputIdx
- maxML
) {
3904 // We have tried all potential match starting points without
3905 // getting a match. Backtrack out, and out of the
3906 // Look Behind altogether.
3907 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3908 int64_t restoreInputLen
= fData
[opValue
+3];
3909 U_ASSERT(restoreInputLen
>= fActiveLimit
);
3910 U_ASSERT(restoreInputLen
<= fInputLength
);
3911 fActiveLimit
= restoreInputLen
;
3915 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3916 // (successful match will fall off the end of the loop.)
3917 fp
= StateSave(fp
, fp
->fPatIdx
-3, status
);
3918 fp
->fInputIdx
= *lbStartIdx
;
3923 // End of a look-behind block, after a successful match.
3925 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3926 if (fp
->fInputIdx
!= fActiveLimit
) {
3927 // The look-behind expression matched, but the match did not
3928 // extend all the way to the point that we are looking behind from.
3929 // FAIL out of here, which will take us back to the LB_CONT, which
3930 // will retry the match starting at another position or fail
3931 // the look-behind altogether, whichever is appropriate.
3932 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3936 // Look-behind match is good. Restore the orignal input string length,
3937 // which had been truncated to pin the end of the lookbehind match to the
3938 // position being looked-behind.
3939 int64_t originalInputLen
= fData
[opValue
+3];
3940 U_ASSERT(originalInputLen
>= fActiveLimit
);
3941 U_ASSERT(originalInputLen
<= fInputLength
);
3942 fActiveLimit
= originalInputLen
;
3949 // Negative Look-Behind, at top of loop checking for matches of LB expression
3950 // at all possible input starting positions.
3952 // Fetch the extra parameters of this op.
3953 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
3954 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
3955 int32_t continueLoc
= (int32_t)pat
[fp
->fPatIdx
++];
3956 continueLoc
= URX_VAL(continueLoc
);
3957 U_ASSERT(minML
<= maxML
);
3958 U_ASSERT(minML
>= 0);
3959 U_ASSERT(continueLoc
> fp
->fPatIdx
);
3961 // Fetch (from data) the last input index where a match was attempted.
3962 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3963 int64_t *lbStartIdx
= &fData
[opValue
+2];
3964 if (*lbStartIdx
< 0) {
3965 // First time through loop.
3966 *lbStartIdx
= fp
->fInputIdx
- minML
;
3968 // 2nd through nth time through the loop.
3969 // Back up start position for match by one.
3970 if (*lbStartIdx
== 0) {
3973 UTEXT_SETNATIVEINDEX(fInputText
, *lbStartIdx
);
3974 (void)UTEXT_PREVIOUS32(fInputText
);
3975 *lbStartIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3979 if (*lbStartIdx
< 0 || *lbStartIdx
< fp
->fInputIdx
- maxML
) {
3980 // We have tried all potential match starting points without
3981 // getting a match, which means that the negative lookbehind as
3982 // a whole has succeeded. Jump forward to the continue location
3983 int64_t restoreInputLen
= fData
[opValue
+3];
3984 U_ASSERT(restoreInputLen
>= fActiveLimit
);
3985 U_ASSERT(restoreInputLen
<= fInputLength
);
3986 fActiveLimit
= restoreInputLen
;
3987 fp
->fPatIdx
= continueLoc
;
3991 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3992 // (successful match will cause a FAIL out of the loop altogether.)
3993 fp
= StateSave(fp
, fp
->fPatIdx
-4, status
);
3994 fp
->fInputIdx
= *lbStartIdx
;
3999 // End of a negative look-behind block, after a successful match.
4001 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
4002 if (fp
->fInputIdx
!= fActiveLimit
) {
4003 // The look-behind expression matched, but the match did not
4004 // extend all the way to the point that we are looking behind from.
4005 // FAIL out of here, which will take us back to the LB_CONT, which
4006 // will retry the match starting at another position or succeed
4007 // the look-behind altogether, whichever is appropriate.
4008 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4012 // Look-behind expression matched, which means look-behind test as
4015 // Restore the orignal input string length, which had been truncated
4016 // inorder to pin the end of the lookbehind match
4017 // to the position being looked-behind.
4018 int64_t originalInputLen
= fData
[opValue
+3];
4019 U_ASSERT(originalInputLen
>= fActiveLimit
);
4020 U_ASSERT(originalInputLen
<= fInputLength
);
4021 fActiveLimit
= originalInputLen
;
4023 // Restore original stack position, discarding any state saved
4024 // by the successful pattern match.
4025 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
4026 int32_t newStackSize
= (int32_t)fData
[opValue
];
4027 U_ASSERT(fStack
->size() > newStackSize
);
4028 fStack
->setSize(newStackSize
);
4030 // FAIL, which will take control back to someplace
4031 // prior to entering the look-behind test.
4032 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4038 // Loop Initialization for the optimized implementation of
4039 // [some character set]*
4040 // This op scans through all matching input.
4041 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4043 U_ASSERT(opValue
> 0 && opValue
< sets
->size());
4044 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
4045 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
4047 // Loop through input, until either the input is exhausted or
4048 // we reach a character that is not a member of the set.
4049 int64_t ix
= fp
->fInputIdx
;
4050 UTEXT_SETNATIVEINDEX(fInputText
, ix
);
4052 if (ix
>= fActiveLimit
) {
4056 UChar32 c
= UTEXT_NEXT32(fInputText
);
4058 if (s8
->contains(c
) == FALSE
) {
4062 if (s
->contains(c
) == FALSE
) {
4066 ix
= UTEXT_GETNATIVEINDEX(fInputText
);
4069 // If there were no matching characters, skip over the loop altogether.
4070 // The loop doesn't run at all, a * op always succeeds.
4071 if (ix
== fp
->fInputIdx
) {
4072 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
4076 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4077 // must follow. It's operand is the stack location
4078 // that holds the starting input index for the match of this [set]*
4079 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
4080 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
4081 int32_t stackLoc
= URX_VAL(loopcOp
);
4082 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
4083 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
4086 // Save State to the URX_LOOP_C op that follows this one,
4087 // so that match failures in the following code will return to there.
4088 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4089 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
4095 case URX_LOOP_DOT_I
:
4096 // Loop Initialization for the optimized implementation of .*
4097 // This op scans through all remaining input.
4098 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4100 // Loop through input until the input is exhausted (we reach an end-of-line)
4101 // In DOTALL mode, we can just go straight to the end of the input.
4103 if ((opValue
& 1) == 1) {
4104 // Dot-matches-All mode. Jump straight to the end of the string.
4108 // NOT DOT ALL mode. Line endings do not match '.'
4109 // Scan forward until a line ending or end of input.
4111 UTEXT_SETNATIVEINDEX(fInputText
, ix
);
4113 if (ix
>= fActiveLimit
) {
4117 UChar32 c
= UTEXT_NEXT32(fInputText
);
4118 if ((c
& 0x7f) <= 0x29) { // Fast filter of non-new-line-s
4119 if ((c
== 0x0a) || // 0x0a is newline in both modes.
4120 (((opValue
& 2) == 0) && // IF not UNIX_LINES mode
4121 (c
<=0x0d && c
>=0x0a)) || c
==0x85 ||c
==0x2028 || c
==0x2029) {
4122 // char is a line ending. Exit the scanning loop.
4126 ix
= UTEXT_GETNATIVEINDEX(fInputText
);
4130 // If there were no matching characters, skip over the loop altogether.
4131 // The loop doesn't run at all, a * op always succeeds.
4132 if (ix
== fp
->fInputIdx
) {
4133 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
4137 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4138 // must follow. It's operand is the stack location
4139 // that holds the starting input index for the match of this .*
4140 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
4141 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
4142 int32_t stackLoc
= URX_VAL(loopcOp
);
4143 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
4144 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
4147 // Save State to the URX_LOOP_C op that follows this one,
4148 // so that match failures in the following code will return to there.
4149 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4150 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
4158 U_ASSERT(opValue
>=0 && opValue
<fFrameSize
);
4159 backSearchIndex
= fp
->fExtra
[opValue
];
4160 U_ASSERT(backSearchIndex
<= fp
->fInputIdx
);
4161 if (backSearchIndex
== fp
->fInputIdx
) {
4162 // We've backed up the input idx to the point that the loop started.
4163 // The loop is done. Leave here without saving state.
4164 // Subsequent failures won't come back here.
4167 // Set up for the next iteration of the loop, with input index
4168 // backed up by one from the last time through,
4169 // and a state save to this instruction in case the following code fails again.
4170 // (We're going backwards because this loop emulates stack unwinding, not
4171 // the initial scan forward.)
4172 U_ASSERT(fp
->fInputIdx
> 0);
4173 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
4174 UChar32 prevC
= UTEXT_PREVIOUS32(fInputText
);
4175 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
4177 UChar32 twoPrevC
= UTEXT_PREVIOUS32(fInputText
);
4178 if (prevC
== 0x0a &&
4179 fp
->fInputIdx
> backSearchIndex
&&
4181 int32_t prevOp
= (int32_t)pat
[fp
->fPatIdx
-2];
4182 if (URX_TYPE(prevOp
) == URX_LOOP_DOT_I
) {
4183 // .*, stepping back over CRLF pair.
4184 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
4189 fp
= StateSave(fp
, fp
->fPatIdx
-1, status
);
4196 // Trouble. The compiled pattern contains an entry with an
4197 // unrecognized type tag.
4201 if (U_FAILURE(status
)) {
4210 fLastMatchEnd
= fMatchEnd
;
4211 fMatchStart
= startIdx
;
4212 fMatchEnd
= fp
->fInputIdx
;
4214 REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart
, fMatchEnd
));
4220 REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
4224 fFrame
= fp
; // The active stack frame when the engine stopped.
4225 // Contains the capture group results that we need to
4231 //--------------------------------------------------------------------------------
4233 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
4234 // assumption that the entire string is available in the UText's
4235 // chunk buffer. For now, that means we can use int32_t indexes,
4236 // except for anything that needs to be saved (like group starts and ends).
4239 // startIdx: begin matching at this index.
4240 // toEnd: if true, match must extend to end of the input region
4242 //--------------------------------------------------------------------------------
4243 void RegexMatcher::MatchChunkAt(int32_t startIdx
, UBool toEnd
, UErrorCode
&status
) {
4244 UBool isMatch
= FALSE
; // True if the we have a match.
4246 int32_t backSearchIndex
= INT32_MAX
; // used after greedy single-character matches for searching backwards
4248 int32_t op
; // Operation from the compiled pattern, split into
4249 int32_t opType
; // the opcode
4250 int32_t opValue
; // and the operand value.
4252 #ifdef REGEX_RUN_DEBUG
4255 printf("MatchAt(startIdx=%ld)\n", startIdx
);
4256 printf("Original Pattern: ");
4257 UChar32 c
= utext_next32From(fPattern
->fPattern
, 0);
4258 while (c
!= U_SENTINEL
) {
4259 if (c
<32 || c
>256) {
4262 REGEX_DUMP_DEBUG_PRINTF(("%c", c
));
4264 c
= UTEXT_NEXT32(fPattern
->fPattern
);
4267 printf("Input String: ");
4268 c
= utext_next32From(fInputText
, 0);
4269 while (c
!= U_SENTINEL
) {
4270 if (c
<32 || c
>256) {
4275 c
= UTEXT_NEXT32(fInputText
);
4282 if (U_FAILURE(status
)) {
4286 // Cache frequently referenced items from the compiled pattern
4288 int64_t *pat
= fPattern
->fCompiledPat
->getBuffer();
4290 const UChar
*litText
= fPattern
->fLiteralText
.getBuffer();
4291 UVector
*sets
= fPattern
->fSets
;
4293 const UChar
*inputBuf
= fInputText
->chunkContents
;
4295 fFrameSize
= fPattern
->fFrameSize
;
4296 REStackFrame
*fp
= resetStack();
4299 fp
->fInputIdx
= startIdx
;
4301 // Zero out the pattern's static data
4303 for (i
= 0; i
<fPattern
->fDataSize
; i
++) {
4308 // Main loop for interpreting the compiled pattern.
4309 // One iteration of the loop per pattern operation performed.
4313 if (_heapchk() != _HEAPOK
) {
4314 fprintf(stderr
, "Heap Trouble\n");
4318 op
= (int32_t)pat
[fp
->fPatIdx
];
4319 opType
= URX_TYPE(op
);
4320 opValue
= URX_VAL(op
);
4321 #ifdef REGEX_RUN_DEBUG
4323 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
4324 printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp
->fInputIdx
,
4325 UTEXT_CURRENT32(fInputText
), (int64_t *)fp
-fStack
->getBuffer(), fActiveLimit
);
4326 fPattern
->dumpOp(fp
->fPatIdx
);
4339 // Force a backtrack. In some circumstances, the pattern compiler
4340 // will notice that the pattern can't possibly match anything, and will
4341 // emit one of these at that point.
4342 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4347 if (fp
->fInputIdx
< fActiveLimit
) {
4349 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4356 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4362 // Test input against a literal string.
4363 // Strings require two slots in the compiled pattern, one for the
4364 // offset to the string text, and one for the length.
4365 int32_t stringStartIdx
= opValue
;
4368 op
= (int32_t)pat
[fp
->fPatIdx
]; // Fetch the second operand
4370 opType
= URX_TYPE(op
);
4371 stringLen
= URX_VAL(op
);
4372 U_ASSERT(opType
== URX_STRING_LEN
);
4373 U_ASSERT(stringLen
>= 2);
4375 const UChar
* pInp
= inputBuf
+ fp
->fInputIdx
;
4376 const UChar
* pInpLimit
= inputBuf
+ fActiveLimit
;
4377 const UChar
* pPat
= litText
+stringStartIdx
;
4378 const UChar
* pEnd
= pInp
+ stringLen
;
4379 UBool success
= TRUE
;
4380 while (pInp
< pEnd
) {
4381 if (pInp
>= pInpLimit
) {
4386 if (*pInp
++ != *pPat
++) {
4393 fp
->fInputIdx
+= stringLen
;
4395 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4401 case URX_STATE_SAVE
:
4402 fp
= StateSave(fp
, opValue
, status
);
4407 // The match loop will exit via this path on a successful match,
4408 // when we reach the end of the pattern.
4409 if (toEnd
&& fp
->fInputIdx
!= fActiveLimit
) {
4410 // The pattern matched, but not to the end of input. Try some more.
4411 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4417 // Start and End Capture stack frame variables are laid out out like this:
4418 // fp->fExtra[opValue] - The start of a completed capture group
4419 // opValue+1 - The end of a completed capture group
4420 // opValue+2 - the start of a capture group whose end
4421 // has not yet been reached (and might not ever be).
4422 case URX_START_CAPTURE
:
4423 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
4424 fp
->fExtra
[opValue
+2] = fp
->fInputIdx
;
4428 case URX_END_CAPTURE
:
4429 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
4430 U_ASSERT(fp
->fExtra
[opValue
+2] >= 0); // Start pos for this group must be set.
4431 fp
->fExtra
[opValue
] = fp
->fExtra
[opValue
+2]; // Tentative start becomes real.
4432 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // End position
4433 U_ASSERT(fp
->fExtra
[opValue
] <= fp
->fExtra
[opValue
+1]);
4437 case URX_DOLLAR
: // $, test for End of line
4438 // or for position before new line at end of input
4439 if (fp
->fInputIdx
< fAnchorLimit
-2) {
4440 // We are no where near the end of input. Fail.
4441 // This is the common case. Keep it first.
4442 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4445 if (fp
->fInputIdx
>= fAnchorLimit
) {
4446 // We really are at the end of input. Success.
4452 // If we are positioned just before a new-line that is located at the
4453 // end of input, succeed.
4454 if (fp
->fInputIdx
== fAnchorLimit
-1) {
4456 U16_GET(inputBuf
, fAnchorStart
, fp
->fInputIdx
, fAnchorLimit
, c
);
4458 if ((c
>=0x0a && c
<=0x0d) || c
==0x85 || c
==0x2028 || c
==0x2029) {
4459 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& inputBuf
[fp
->fInputIdx
-1]==0x0d)) {
4460 // At new-line at end of input. Success
4466 } else if (fp
->fInputIdx
== fAnchorLimit
-2 &&
4467 inputBuf
[fp
->fInputIdx
]==0x0d && inputBuf
[fp
->fInputIdx
+1]==0x0a) {
4470 break; // At CR/LF at end of input. Success
4473 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4478 case URX_DOLLAR_D
: // $, test for End of Line, in UNIX_LINES mode.
4479 if (fp
->fInputIdx
>= fAnchorLimit
-1) {
4480 // Either at the last character of input, or off the end.
4481 if (fp
->fInputIdx
== fAnchorLimit
-1) {
4482 // At last char of input. Success if it's a new line.
4483 if (inputBuf
[fp
->fInputIdx
] == 0x0a) {
4489 // Off the end of input. Success.
4496 // Not at end of input. Back-track out.
4497 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4501 case URX_DOLLAR_M
: // $, test for End of line in multi-line mode
4503 if (fp
->fInputIdx
>= fAnchorLimit
) {
4504 // We really are at the end of input. Success.
4509 // If we are positioned just before a new-line, succeed.
4510 // It makes no difference where the new-line is within the input.
4511 UChar32 c
= inputBuf
[fp
->fInputIdx
];
4512 if ((c
>=0x0a && c
<=0x0d) || c
==0x85 ||c
==0x2028 || c
==0x2029) {
4513 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4514 // In multi-line mode, hitting a new-line just before the end of input does not
4515 // set the hitEnd or requireEnd flags
4516 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& inputBuf
[fp
->fInputIdx
-1]==0x0d)) {
4520 // not at a new line. Fail.
4521 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4526 case URX_DOLLAR_MD
: // $, test for End of line in multi-line and UNIX_LINES mode
4528 if (fp
->fInputIdx
>= fAnchorLimit
) {
4529 // We really are at the end of input. Success.
4531 fRequireEnd
= TRUE
; // Java set requireEnd in this case, even though
4532 break; // adding a new-line would not lose the match.
4534 // If we are not positioned just before a new-line, the test fails; backtrack out.
4535 // It makes no difference where the new-line is within the input.
4536 if (inputBuf
[fp
->fInputIdx
] != 0x0a) {
4537 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4543 case URX_CARET
: // ^, test for start of line
4544 if (fp
->fInputIdx
!= fAnchorStart
) {
4545 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4550 case URX_CARET_M
: // ^, test for start of line in mulit-line mode
4552 if (fp
->fInputIdx
== fAnchorStart
) {
4553 // We are at the start input. Success.
4556 // Check whether character just before the current pos is a new-line
4557 // unless we are at the end of input
4558 UChar c
= inputBuf
[fp
->fInputIdx
- 1];
4559 if ((fp
->fInputIdx
< fAnchorLimit
) &&
4560 ((c
<=0x0d && c
>=0x0a) || c
==0x85 ||c
==0x2028 || c
==0x2029)) {
4561 // It's a new-line. ^ is true. Success.
4562 // TODO: what should be done with positions between a CR and LF?
4565 // Not at the start of a line. Fail.
4566 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4571 case URX_CARET_M_UNIX
: // ^, test for start of line in mulit-line + Unix-line mode
4573 U_ASSERT(fp
->fInputIdx
>= fAnchorStart
);
4574 if (fp
->fInputIdx
<= fAnchorStart
) {
4575 // We are at the start input. Success.
4578 // Check whether character just before the current pos is a new-line
4579 U_ASSERT(fp
->fInputIdx
<= fAnchorLimit
);
4580 UChar c
= inputBuf
[fp
->fInputIdx
- 1];
4582 // Not at the start of a line. Back-track out.
4583 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4588 case URX_BACKSLASH_B
: // Test for word boundaries
4590 UBool success
= isChunkWordBoundary((int32_t)fp
->fInputIdx
);
4591 success
^= (opValue
!= 0); // flip sense for \B
4593 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4599 case URX_BACKSLASH_BU
: // Test for word boundaries, Unicode-style
4601 UBool success
= isUWordBoundary(fp
->fInputIdx
);
4602 success
^= (opValue
!= 0); // flip sense for \B
4604 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4610 case URX_BACKSLASH_D
: // Test for decimal digit
4612 if (fp
->fInputIdx
>= fActiveLimit
) {
4614 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4619 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4620 int8_t ctype
= u_charType(c
); // TODO: make a unicode set for this. Will be faster.
4621 UBool success
= (ctype
== U_DECIMAL_DIGIT_NUMBER
);
4622 success
^= (opValue
!= 0); // flip sense for \D
4624 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4630 case URX_BACKSLASH_G
: // Test for position at end of previous match
4631 if (!((fMatch
&& fp
->fInputIdx
==fMatchEnd
) || (fMatch
==FALSE
&& fp
->fInputIdx
==fActiveStart
))) {
4632 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4637 case URX_BACKSLASH_X
:
4638 // Match a Grapheme, as defined by Unicode TR 29.
4639 // Differs slightly from Perl, which consumes combining marks independently
4643 // Fail if at end of input
4644 if (fp
->fInputIdx
>= fActiveLimit
) {
4646 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4650 // Examine (and consume) the current char.
4651 // Dispatch into a little state machine, based on the char.
4653 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4654 UnicodeSet
**sets
= fPattern
->fStaticSets
;
4655 if (sets
[URX_GC_NORMAL
]->contains(c
)) goto GC_Extend
;
4656 if (sets
[URX_GC_CONTROL
]->contains(c
)) goto GC_Control
;
4657 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
4658 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
4659 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
4660 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
4661 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
4667 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
4668 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4669 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
4670 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
4671 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
4672 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
4673 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
4677 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
4678 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4679 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
4680 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
4681 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
4685 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
4686 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4687 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
4688 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
4692 // Combining characters are consumed here
4694 if (fp
->fInputIdx
>= fActiveLimit
) {
4697 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4698 if (sets
[URX_GC_EXTEND
]->contains(c
) == FALSE
) {
4699 U16_BACK_1(inputBuf
, 0, fp
->fInputIdx
);
4706 // Most control chars stand alone (don't combine with combining chars),
4707 // except for that CR/LF sequence is a single grapheme cluster.
4708 if (c
== 0x0d && fp
->fInputIdx
< fActiveLimit
&& inputBuf
[fp
->fInputIdx
] == 0x0a) {
4713 if (fp
->fInputIdx
>= fActiveLimit
) {
4722 case URX_BACKSLASH_Z
: // Test for end of Input
4723 if (fp
->fInputIdx
< fAnchorLimit
) {
4724 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4733 case URX_STATIC_SETREF
:
4735 // Test input character against one of the predefined sets
4736 // (Word Characters, for example)
4737 // The high bit of the op value is a flag for the match polarity.
4738 // 0: success if input char is in set.
4739 // 1: success if input char is not in set.
4740 if (fp
->fInputIdx
>= fActiveLimit
) {
4742 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4746 UBool success
= ((opValue
& URX_NEG_SET
) == URX_NEG_SET
);
4747 opValue
&= ~URX_NEG_SET
;
4748 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
4751 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4753 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
4754 if (s8
->contains(c
)) {
4758 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
4759 if (s
->contains(c
)) {
4764 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4770 case URX_STAT_SETREF_N
:
4772 // Test input character for NOT being a member of one of
4773 // the predefined sets (Word Characters, for example)
4774 if (fp
->fInputIdx
>= fActiveLimit
) {
4776 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4780 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
4783 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4785 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
4786 if (s8
->contains(c
) == FALSE
) {
4790 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
4791 if (s
->contains(c
) == FALSE
) {
4795 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4802 if (fp
->fInputIdx
>= fActiveLimit
) {
4804 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4808 U_ASSERT(opValue
> 0 && opValue
< sets
->size());
4810 // There is input left. Pick up one char and test it for set membership.
4812 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4814 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
4815 if (s8
->contains(c
)) {
4816 // The character is in the set. A Match.
4820 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
4821 if (s
->contains(c
)) {
4822 // The character is in the set. A Match.
4827 // the character wasn't in the set.
4828 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4835 // . matches anything, but stops at end-of-line.
4836 if (fp
->fInputIdx
>= fActiveLimit
) {
4837 // At end of input. Match failed. Backtrack out.
4839 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4843 // There is input left. Advance over one char, unless we've hit end-of-line
4845 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4846 if (((c
& 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
4847 ((c
<=0x0d && c
>=0x0a) || c
==0x85 ||c
==0x2028 || c
==0x2029)) {
4848 // End of line in normal mode. . does not match.
4849 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4856 case URX_DOTANY_ALL
:
4858 // . in dot-matches-all (including new lines) mode
4859 if (fp
->fInputIdx
>= fActiveLimit
) {
4860 // At end of input. Match failed. Backtrack out.
4862 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4866 // There is input left. Advance over one char, except if we are
4867 // at a cr/lf, advance over both of them.
4869 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4870 if (c
==0x0d && fp
->fInputIdx
< fActiveLimit
) {
4871 // In the case of a CR/LF, we need to advance over both.
4872 if (inputBuf
[fp
->fInputIdx
] == 0x0a) {
4873 U16_FWD_1(inputBuf
, fp
->fInputIdx
, fActiveLimit
);
4880 case URX_DOTANY_UNIX
:
4882 // '.' operator, matches all, but stops at end-of-line.
4883 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
4884 if (fp
->fInputIdx
>= fActiveLimit
) {
4885 // At end of input. Match failed. Backtrack out.
4887 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4891 // There is input left. Advance over one char, unless we've hit end-of-line
4893 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4895 // End of line in normal mode. '.' does not match the \n
4896 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4903 fp
->fPatIdx
= opValue
;
4911 U_ASSERT(opValue
< fPattern
->fCompiledPat
->size());
4912 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
4913 fp
->fPatIdx
= opValue
; // Then JMP.
4917 // This opcode is used with (x)+, when x can match a zero length string.
4918 // Same as JMP_SAV, except conditional on the match having made forward progress.
4919 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
4920 // data address of the input position at the start of the loop.
4922 U_ASSERT(opValue
> 0 && opValue
< fPattern
->fCompiledPat
->size());
4923 int32_t stoOp
= (int32_t)pat
[opValue
-1];
4924 U_ASSERT(URX_TYPE(stoOp
) == URX_STO_INP_LOC
);
4925 int32_t frameLoc
= URX_VAL(stoOp
);
4926 U_ASSERT(frameLoc
>= 0 && frameLoc
< fFrameSize
);
4927 int32_t prevInputIdx
= (int32_t)fp
->fExtra
[frameLoc
];
4928 U_ASSERT(prevInputIdx
<= fp
->fInputIdx
);
4929 if (prevInputIdx
< fp
->fInputIdx
) {
4930 // The match did make progress. Repeat the loop.
4931 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
4932 fp
->fPatIdx
= opValue
;
4933 fp
->fExtra
[frameLoc
] = fp
->fInputIdx
;
4935 // If the input position did not advance, we do nothing here,
4936 // execution will fall out of the loop.
4942 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
4943 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
4945 // Pick up the three extra operands that CTR_INIT has, and
4946 // skip the pattern location counter past
4947 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
4949 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
4950 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
4951 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
4952 U_ASSERT(minCount
>=0);
4953 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
4954 U_ASSERT(loopLoc
>fp
->fPatIdx
);
4956 if (minCount
== 0) {
4957 fp
= StateSave(fp
, loopLoc
+1, status
);
4959 if (maxCount
== 0) {
4960 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4967 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
4968 int32_t initOp
= (int32_t)pat
[opValue
];
4969 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT
);
4970 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
4971 int32_t minCount
= (int32_t)pat
[opValue
+2];
4972 int32_t maxCount
= (int32_t)pat
[opValue
+3];
4973 // Increment the counter. Note: we DIDN'T worry about counter
4974 // overflow, since the data comes from UnicodeStrings, which
4975 // stores its length in an int32_t. Do we have to think about
4976 // this now that we're using UText? Probably not, since the length
4977 // in UChar32s is still an int32_t.
4979 U_ASSERT(*pCounter
> 0);
4980 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
) {
4981 U_ASSERT(*pCounter
== maxCount
|| maxCount
== -1);
4984 if (*pCounter
>= minCount
) {
4985 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
4987 fp
->fPatIdx
= opValue
+ 4; // Loop back.
4991 case URX_CTR_INIT_NG
:
4993 // Initialize a non-greedy loop
4994 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
4995 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
4997 // Pick up the three extra operands that CTR_INIT has, and
4998 // skip the pattern location counter past
4999 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
5001 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
5002 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
5003 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
5004 U_ASSERT(minCount
>=0);
5005 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
5006 U_ASSERT(loopLoc
>fp
->fPatIdx
);
5008 if (minCount
== 0) {
5009 if (maxCount
!= 0) {
5010 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5012 fp
->fPatIdx
= loopLoc
+1; // Continue with stuff after repeated block
5017 case URX_CTR_LOOP_NG
:
5019 // Non-greedy {min, max} loops
5020 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
5021 int32_t initOp
= (int32_t)pat
[opValue
];
5022 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT_NG
);
5023 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
5024 int32_t minCount
= (int32_t)pat
[opValue
+2];
5025 int32_t maxCount
= (int32_t)pat
[opValue
+3];
5026 // Increment the counter. Note: we DIDN'T worry about counter
5027 // overflow, since the data comes from UnicodeStrings, which
5028 // stores its length in an int32_t. Do we have to think about
5029 // this now that we're using UText? Probably not, since the length
5030 // in UChar32s is still an int32_t.
5032 U_ASSERT(*pCounter
> 0);
5034 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
) {
5035 // The loop has matched the maximum permitted number of times.
5036 // Break out of here with no action. Matching will
5037 // continue with the following pattern.
5038 U_ASSERT(*pCounter
== maxCount
|| maxCount
== -1);
5042 if (*pCounter
< minCount
) {
5043 // We haven't met the minimum number of matches yet.
5044 // Loop back for another one.
5045 fp
->fPatIdx
= opValue
+ 4; // Loop back.
5047 // We do have the minimum number of matches.
5048 // Fall into the following pattern, but first do
5049 // a state save to the top of the loop, so that a failure
5050 // in the following pattern will try another iteration of the loop.
5051 fp
= StateSave(fp
, opValue
+ 4, status
);
5057 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
5058 fData
[opValue
] = fStack
->size();
5063 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
5064 int32_t newStackSize
= (int32_t)fData
[opValue
];
5065 U_ASSERT(newStackSize
<= fStack
->size());
5066 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
5067 if (newFP
== (int64_t *)fp
) {
5071 for (i
=0; i
<fFrameSize
; i
++) {
5072 newFP
[i
] = ((int64_t *)fp
)[i
];
5074 fp
= (REStackFrame
*)newFP
;
5075 fStack
->setSize(newStackSize
);
5081 U_ASSERT(opValue
< fFrameSize
);
5082 int64_t groupStartIdx
= fp
->fExtra
[opValue
];
5083 int64_t groupEndIdx
= fp
->fExtra
[opValue
+1];
5084 U_ASSERT(groupStartIdx
<= groupEndIdx
);
5085 int64_t inputIndex
= fp
->fInputIdx
;
5086 if (groupStartIdx
< 0) {
5087 // This capture group has not participated in the match thus far,
5088 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
5091 UBool success
= TRUE
;
5092 for (int64_t groupIndex
= groupStartIdx
; groupIndex
< groupEndIdx
; ++groupIndex
,++inputIndex
) {
5093 if (inputIndex
>= fActiveLimit
) {
5098 if (inputBuf
[groupIndex
] != inputBuf
[inputIndex
]) {
5104 fp
->fInputIdx
= inputIndex
;
5106 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5113 U_ASSERT(opValue
< fFrameSize
);
5114 int64_t groupStartIdx
= fp
->fExtra
[opValue
];
5115 int64_t groupEndIdx
= fp
->fExtra
[opValue
+1];
5116 U_ASSERT(groupStartIdx
<= groupEndIdx
);
5117 if (groupStartIdx
< 0) {
5118 // This capture group has not participated in the match thus far,
5119 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
5122 CaseFoldingUCharIterator
captureGroupItr(inputBuf
, groupStartIdx
, groupEndIdx
);
5123 CaseFoldingUCharIterator
inputItr(inputBuf
, fp
->fInputIdx
, fActiveLimit
);
5125 // Note: if the capture group match was of an empty string the backref
5126 // match succeeds. Verified by testing: Perl matches succeed
5127 // in this case, so we do too.
5129 UBool success
= TRUE
;
5131 UChar32 captureGroupChar
= captureGroupItr
.next();
5132 if (captureGroupChar
== U_SENTINEL
) {
5136 UChar32 inputChar
= inputItr
.next();
5137 if (inputChar
== U_SENTINEL
) {
5142 if (inputChar
!= captureGroupChar
) {
5148 if (success
&& inputItr
.inExpansion()) {
5149 // We obtained a match by consuming part of a string obtained from
5150 // case-folding a single code point of the input text.
5151 // This does not count as an overall match.
5156 fp
->fInputIdx
= inputItr
.getIndex();
5158 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5163 case URX_STO_INP_LOC
:
5165 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
);
5166 fp
->fExtra
[opValue
] = fp
->fInputIdx
;
5172 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
5174 int32_t dataLoc
= URX_VAL(pat
[instrOperandLoc
]);
5175 U_ASSERT(dataLoc
>= 0 && dataLoc
< fFrameSize
);
5176 int32_t savedInputIdx
= (int32_t)fp
->fExtra
[dataLoc
];
5177 U_ASSERT(savedInputIdx
<= fp
->fInputIdx
);
5178 if (savedInputIdx
< fp
->fInputIdx
) {
5179 fp
->fPatIdx
= opValue
; // JMP
5181 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no progress in loop.
5188 // Entering a lookahead block.
5189 // Save Stack Ptr, Input Pos.
5190 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5191 fData
[opValue
] = fStack
->size();
5192 fData
[opValue
+1] = fp
->fInputIdx
;
5193 fActiveStart
= fLookStart
; // Set the match region change for
5194 fActiveLimit
= fLookLimit
; // transparent bounds.
5200 // Leaving a look-ahead block.
5201 // restore Stack Ptr, Input Pos to positions they had on entry to block.
5202 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5203 int32_t stackSize
= fStack
->size();
5204 int32_t newStackSize
= (int32_t)fData
[opValue
];
5205 U_ASSERT(stackSize
>= newStackSize
);
5206 if (stackSize
> newStackSize
) {
5207 // Copy the current top frame back to the new (cut back) top frame.
5208 // This makes the capture groups from within the look-ahead
5209 // expression available.
5210 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
5212 for (i
=0; i
<fFrameSize
; i
++) {
5213 newFP
[i
] = ((int64_t *)fp
)[i
];
5215 fp
= (REStackFrame
*)newFP
;
5216 fStack
->setSize(newStackSize
);
5218 fp
->fInputIdx
= fData
[opValue
+1];
5220 // Restore the active region bounds in the input string; they may have
5221 // been changed because of transparent bounds on a Region.
5222 fActiveStart
= fRegionStart
;
5223 fActiveLimit
= fRegionLimit
;
5228 if (fp
->fInputIdx
< fActiveLimit
) {
5230 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
5231 if (u_foldCase(c
, U_FOLD_CASE_DEFAULT
) == opValue
) {
5237 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5241 // Case-insensitive test input against a literal string.
5242 // Strings require two slots in the compiled pattern, one for the
5243 // offset to the string text, and one for the length.
5244 // The compiled string has already been case folded.
5246 const UChar
*patternString
= litText
+ opValue
;
5248 op
= (int32_t)pat
[fp
->fPatIdx
];
5250 opType
= URX_TYPE(op
);
5251 opValue
= URX_VAL(op
);
5252 U_ASSERT(opType
== URX_STRING_LEN
);
5253 int32_t patternStringLen
= opValue
; // Length of the string from the pattern.
5257 UBool success
= TRUE
;
5258 int32_t patternStringIdx
= 0;
5259 CaseFoldingUCharIterator
inputIterator(inputBuf
, fp
->fInputIdx
, fActiveLimit
);
5260 while (patternStringIdx
< patternStringLen
) {
5261 U16_NEXT(patternString
, patternStringIdx
, patternStringLen
, cPattern
);
5262 cText
= inputIterator
.next();
5263 if (cText
!= cPattern
) {
5265 if (cText
== U_SENTINEL
) {
5271 if (inputIterator
.inExpansion()) {
5276 fp
->fInputIdx
= inputIterator
.getIndex();
5278 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5285 // Entering a look-behind block.
5286 // Save Stack Ptr, Input Pos.
5287 // TODO: implement transparent bounds. Ticket #6067
5288 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5289 fData
[opValue
] = fStack
->size();
5290 fData
[opValue
+1] = fp
->fInputIdx
;
5291 // Init the variable containing the start index for attempted matches.
5292 fData
[opValue
+2] = -1;
5293 // Save input string length, then reset to pin any matches to end at
5294 // the current position.
5295 fData
[opValue
+3] = fActiveLimit
;
5296 fActiveLimit
= fp
->fInputIdx
;
5303 // Positive Look-Behind, at top of loop checking for matches of LB expression
5304 // at all possible input starting positions.
5306 // Fetch the min and max possible match lengths. They are the operands
5307 // of this op in the pattern.
5308 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
5309 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
5310 U_ASSERT(minML
<= maxML
);
5311 U_ASSERT(minML
>= 0);
5313 // Fetch (from data) the last input index where a match was attempted.
5314 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5315 int64_t *lbStartIdx
= &fData
[opValue
+2];
5316 if (*lbStartIdx
< 0) {
5317 // First time through loop.
5318 *lbStartIdx
= fp
->fInputIdx
- minML
;
5320 // 2nd through nth time through the loop.
5321 // Back up start position for match by one.
5322 if (*lbStartIdx
== 0) {
5325 U16_BACK_1(inputBuf
, 0, *lbStartIdx
);
5329 if (*lbStartIdx
< 0 || *lbStartIdx
< fp
->fInputIdx
- maxML
) {
5330 // We have tried all potential match starting points without
5331 // getting a match. Backtrack out, and out of the
5332 // Look Behind altogether.
5333 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5334 int64_t restoreInputLen
= fData
[opValue
+3];
5335 U_ASSERT(restoreInputLen
>= fActiveLimit
);
5336 U_ASSERT(restoreInputLen
<= fInputLength
);
5337 fActiveLimit
= restoreInputLen
;
5341 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5342 // (successful match will fall off the end of the loop.)
5343 fp
= StateSave(fp
, fp
->fPatIdx
-3, status
);
5344 fp
->fInputIdx
= *lbStartIdx
;
5349 // End of a look-behind block, after a successful match.
5351 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5352 if (fp
->fInputIdx
!= fActiveLimit
) {
5353 // The look-behind expression matched, but the match did not
5354 // extend all the way to the point that we are looking behind from.
5355 // FAIL out of here, which will take us back to the LB_CONT, which
5356 // will retry the match starting at another position or fail
5357 // the look-behind altogether, whichever is appropriate.
5358 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5362 // Look-behind match is good. Restore the original input string length,
5363 // which had been truncated to pin the end of the lookbehind match to the
5364 // position being looked-behind.
5365 int64_t originalInputLen
= fData
[opValue
+3];
5366 U_ASSERT(originalInputLen
>= fActiveLimit
);
5367 U_ASSERT(originalInputLen
<= fInputLength
);
5368 fActiveLimit
= originalInputLen
;
5375 // Negative Look-Behind, at top of loop checking for matches of LB expression
5376 // at all possible input starting positions.
5378 // Fetch the extra parameters of this op.
5379 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
5380 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
5381 int32_t continueLoc
= (int32_t)pat
[fp
->fPatIdx
++];
5382 continueLoc
= URX_VAL(continueLoc
);
5383 U_ASSERT(minML
<= maxML
);
5384 U_ASSERT(minML
>= 0);
5385 U_ASSERT(continueLoc
> fp
->fPatIdx
);
5387 // Fetch (from data) the last input index where a match was attempted.
5388 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5389 int64_t *lbStartIdx
= &fData
[opValue
+2];
5390 if (*lbStartIdx
< 0) {
5391 // First time through loop.
5392 *lbStartIdx
= fp
->fInputIdx
- minML
;
5394 // 2nd through nth time through the loop.
5395 // Back up start position for match by one.
5396 if (*lbStartIdx
== 0) {
5397 (*lbStartIdx
)--; // Because U16_BACK is unsafe starting at 0.
5399 U16_BACK_1(inputBuf
, 0, *lbStartIdx
);
5403 if (*lbStartIdx
< 0 || *lbStartIdx
< fp
->fInputIdx
- maxML
) {
5404 // We have tried all potential match starting points without
5405 // getting a match, which means that the negative lookbehind as
5406 // a whole has succeeded. Jump forward to the continue location
5407 int64_t restoreInputLen
= fData
[opValue
+3];
5408 U_ASSERT(restoreInputLen
>= fActiveLimit
);
5409 U_ASSERT(restoreInputLen
<= fInputLength
);
5410 fActiveLimit
= restoreInputLen
;
5411 fp
->fPatIdx
= continueLoc
;
5415 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5416 // (successful match will cause a FAIL out of the loop altogether.)
5417 fp
= StateSave(fp
, fp
->fPatIdx
-4, status
);
5418 fp
->fInputIdx
= *lbStartIdx
;
5423 // End of a negative look-behind block, after a successful match.
5425 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5426 if (fp
->fInputIdx
!= fActiveLimit
) {
5427 // The look-behind expression matched, but the match did not
5428 // extend all the way to the point that we are looking behind from.
5429 // FAIL out of here, which will take us back to the LB_CONT, which
5430 // will retry the match starting at another position or succeed
5431 // the look-behind altogether, whichever is appropriate.
5432 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5436 // Look-behind expression matched, which means look-behind test as
5439 // Restore the original input string length, which had been truncated
5440 // in order to pin the end of the lookbehind match
5441 // to the position being looked-behind.
5442 int64_t originalInputLen
= fData
[opValue
+3];
5443 U_ASSERT(originalInputLen
>= fActiveLimit
);
5444 U_ASSERT(originalInputLen
<= fInputLength
);
5445 fActiveLimit
= originalInputLen
;
5447 // Restore original stack position, discarding any state saved
5448 // by the successful pattern match.
5449 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5450 int32_t newStackSize
= (int32_t)fData
[opValue
];
5451 U_ASSERT(fStack
->size() > newStackSize
);
5452 fStack
->setSize(newStackSize
);
5454 // FAIL, which will take control back to someplace
5455 // prior to entering the look-behind test.
5456 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5462 // Loop Initialization for the optimized implementation of
5463 // [some character set]*
5464 // This op scans through all matching input.
5465 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5467 U_ASSERT(opValue
> 0 && opValue
< sets
->size());
5468 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
5469 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
5471 // Loop through input, until either the input is exhausted or
5472 // we reach a character that is not a member of the set.
5473 int32_t ix
= (int32_t)fp
->fInputIdx
;
5475 if (ix
>= fActiveLimit
) {
5480 U16_NEXT(inputBuf
, ix
, fActiveLimit
, c
);
5482 if (s8
->contains(c
) == FALSE
) {
5483 U16_BACK_1(inputBuf
, 0, ix
);
5487 if (s
->contains(c
) == FALSE
) {
5488 U16_BACK_1(inputBuf
, 0, ix
);
5494 // If there were no matching characters, skip over the loop altogether.
5495 // The loop doesn't run at all, a * op always succeeds.
5496 if (ix
== fp
->fInputIdx
) {
5497 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
5501 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5502 // must follow. Its operand is the stack location
5503 // that holds the starting input index for the match of this [set]*
5504 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
5505 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
5506 int32_t stackLoc
= URX_VAL(loopcOp
);
5507 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
5508 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
5511 // Save State to the URX_LOOP_C op that follows this one,
5512 // so that match failures in the following code will return to there.
5513 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5514 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5520 case URX_LOOP_DOT_I
:
5521 // Loop Initialization for the optimized implementation of .*
5522 // This op scans through all remaining input.
5523 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5525 // Loop through input until the input is exhausted (we reach an end-of-line)
5526 // In DOTALL mode, we can just go straight to the end of the input.
5528 if ((opValue
& 1) == 1) {
5529 // Dot-matches-All mode. Jump straight to the end of the string.
5530 ix
= (int32_t)fActiveLimit
;
5533 // NOT DOT ALL mode. Line endings do not match '.'
5534 // Scan forward until a line ending or end of input.
5535 ix
= (int32_t)fp
->fInputIdx
;
5537 if (ix
>= fActiveLimit
) {
5542 U16_NEXT(inputBuf
, ix
, fActiveLimit
, c
); // c = inputBuf[ix++]
5543 if ((c
& 0x7f) <= 0x29) { // Fast filter of non-new-line-s
5544 if ((c
== 0x0a) || // 0x0a is newline in both modes.
5545 (((opValue
& 2) == 0) && // IF not UNIX_LINES mode
5546 ((c
<=0x0d && c
>=0x0a) || c
==0x85 || c
==0x2028 || c
==0x2029))) {
5547 // char is a line ending. Put the input pos back to the
5548 // line ending char, and exit the scanning loop.
5549 U16_BACK_1(inputBuf
, 0, ix
);
5556 // If there were no matching characters, skip over the loop altogether.
5557 // The loop doesn't run at all, a * op always succeeds.
5558 if (ix
== fp
->fInputIdx
) {
5559 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
5563 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5564 // must follow. Its operand is the stack location
5565 // that holds the starting input index for the match of this .*
5566 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
5567 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
5568 int32_t stackLoc
= URX_VAL(loopcOp
);
5569 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
5570 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
5573 // Save State to the URX_LOOP_C op that follows this one,
5574 // so that match failures in the following code will return to there.
5575 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5576 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5584 U_ASSERT(opValue
>=0 && opValue
<fFrameSize
);
5585 backSearchIndex
= (int32_t)fp
->fExtra
[opValue
];
5586 U_ASSERT(backSearchIndex
<= fp
->fInputIdx
);
5587 if (backSearchIndex
== fp
->fInputIdx
) {
5588 // We've backed up the input idx to the point that the loop started.
5589 // The loop is done. Leave here without saving state.
5590 // Subsequent failures won't come back here.
5593 // Set up for the next iteration of the loop, with input index
5594 // backed up by one from the last time through,
5595 // and a state save to this instruction in case the following code fails again.
5596 // (We're going backwards because this loop emulates stack unwinding, not
5597 // the initial scan forward.)
5598 U_ASSERT(fp
->fInputIdx
> 0);
5600 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, prevC
); // !!!: should this 0 be one of f*Limit?
5602 if (prevC
== 0x0a &&
5603 fp
->fInputIdx
> backSearchIndex
&&
5604 inputBuf
[fp
->fInputIdx
-1] == 0x0d) {
5605 int32_t prevOp
= (int32_t)pat
[fp
->fPatIdx
-2];
5606 if (URX_TYPE(prevOp
) == URX_LOOP_DOT_I
) {
5607 // .*, stepping back over CRLF pair.
5608 U16_BACK_1(inputBuf
, 0, fp
->fInputIdx
);
5613 fp
= StateSave(fp
, fp
->fPatIdx
-1, status
);
5620 // Trouble. The compiled pattern contains an entry with an
5621 // unrecognized type tag.
5625 if (U_FAILURE(status
)) {
5634 fLastMatchEnd
= fMatchEnd
;
5635 fMatchStart
= startIdx
;
5636 fMatchEnd
= fp
->fInputIdx
;
5638 REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart
, fMatchEnd
));
5644 REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
5648 fFrame
= fp
; // The active stack frame when the engine stopped.
5649 // Contains the capture group results that we need to
// Expands the UOBJECT_DEFINE_RTTI_IMPLEMENTATION macro for class RegexMatcher.
// NOTE(review): presumably this supplies the ICU class-ID (RTTI) member
// definitions for RegexMatcher -- confirm against the macro's definition
// in unicode/uobject.h.
5656 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher
)
5660 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS