1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **************************************************************************
5 * Copyright (C) 2002-2016 International Business Machines Corporation
6 * and others. All rights reserved.
7 **************************************************************************
12 // Contains the implementation of class RegexMatcher,
13 // which is one of the main API classes for the ICU regular expression package.
16 #include "unicode/utypes.h"
17 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
19 #include "unicode/regex.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/ustring.h"
23 #include "unicode/rbbi.h"
24 #include "unicode/utf.h"
25 #include "unicode/utf16.h"
37 // #include <malloc.h> // Needed for heapcheck testing
42 // Default limit for the size of the back track stack, to avoid system
43 // failures caused by heap exhaustion. Units are in 32 bit words, not bytes.
44 // This value puts ICU's limits higher than most other regexp implementations,
45 // which use recursion rather than the heap, and take more storage per
48 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY
= 8000000;
50 // Time limit counter constant.
51 // Time limits for expression evaluation are in terms of quanta of work by
52 // the engine, each of which is 10,000 state saves.
53 // This constant determines the number of state saves per tick.
54 static const int32_t TIMER_INITIAL_VALUE
= 10000;
57 // Test for any of the Unicode line terminating characters.
// Returns whether c is one of the line terminators U+000A..U+000D, U+0085,
// U+2028 or U+2029 (the exact comparison is the final return statement below).
58 static inline UBool
isLineTerminator(UChar32 c
) {
// Cheap pre-filter: non-zero when c has any bit set outside the bitwise OR of
// the seven terminator values, in which case c cannot equal any of them.
// NOTE(review): the body of this branch is not visible in this excerpt;
// presumably it returns early with "not a terminator" -- confirm.
59 if (c
& ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
// Exact membership test: the contiguous range 0x0a..0x0d, then the three
// individual code points.
62 return (c
<=0x0d && c
>=0x0a) || c
==0x85 || c
==0x2028 || c
==0x2029;
65 //-----------------------------------------------------------------------------
67 // Constructor and Destructor
69 //-----------------------------------------------------------------------------
70 RegexMatcher::RegexMatcher(const RegexPattern
*pat
) {
71 fDeferredStatus
= U_ZERO_ERROR
;
72 init(fDeferredStatus
);
73 if (U_FAILURE(fDeferredStatus
)) {
77 fDeferredStatus
= U_ILLEGAL_ARGUMENT_ERROR
;
81 init2(RegexStaticSets::gStaticSets
->fEmptyText
, fDeferredStatus
);
86 RegexMatcher::RegexMatcher(const UnicodeString
®exp
, const UnicodeString
&input
,
87 uint32_t flags
, UErrorCode
&status
) {
89 if (U_FAILURE(status
)) {
93 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
94 fPattern
= fPatternOwned
;
96 UText inputText
= UTEXT_INITIALIZER
;
97 utext_openConstUnicodeString(&inputText
, &input
, &status
);
98 init2(&inputText
, status
);
99 utext_close(&inputText
);
101 fInputUniStrMaybeMutable
= TRUE
;
105 RegexMatcher::RegexMatcher(UText
*regexp
, UText
*input
,
106 uint32_t flags
, UErrorCode
&status
) {
108 if (U_FAILURE(status
)) {
112 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
113 if (U_FAILURE(status
)) {
117 fPattern
= fPatternOwned
;
118 init2(input
, status
);
122 RegexMatcher::RegexMatcher(const UnicodeString
®exp
,
123 uint32_t flags
, UErrorCode
&status
) {
125 if (U_FAILURE(status
)) {
129 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
130 if (U_FAILURE(status
)) {
133 fPattern
= fPatternOwned
;
134 init2(RegexStaticSets::gStaticSets
->fEmptyText
, status
);
137 RegexMatcher::RegexMatcher(UText
*regexp
,
138 uint32_t flags
, UErrorCode
&status
) {
140 if (U_FAILURE(status
)) {
144 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
145 if (U_FAILURE(status
)) {
149 fPattern
= fPatternOwned
;
150 init2(RegexStaticSets::gStaticSets
->fEmptyText
, status
);
156 RegexMatcher::~RegexMatcher() {
158 if (fData
!= fSmallData
) {
163 delete fPatternOwned
;
164 fPatternOwned
= NULL
;
172 utext_close(fInputText
);
175 utext_close(fAltInputText
);
178 #if UCONFIG_NO_BREAK_ITERATION==0
179 delete fWordBreakItr
;
184 // init() common initialization for use by all constructors.
185 // Initialize all fields, get the object into a consistent state.
186 // This must be done even when the initial status shows an error,
187 // so that the object is initialized sufficiently well for the destructor
190 void RegexMatcher::init(UErrorCode
&status
) {
192 fPatternOwned
= NULL
;
202 fTransparentBounds
= FALSE
;
203 fAnchoringBounds
= TRUE
;
216 fStackLimit
= DEFAULT_BACKTRACK_STACK_CAPACITY
;
218 fCallbackContext
= NULL
;
219 fFindProgressCallbackFn
= NULL
;
220 fFindProgressCallbackContext
= NULL
;
222 fDeferredStatus
= status
;
224 fWordBreakItr
= NULL
;
228 fAltInputText
= NULL
;
231 fInputUniStrMaybeMutable
= FALSE
;
235 // init2() Common initialization for use by RegexMatcher constructors, part 2.
236 // This handles the common setup to be done after the Pattern is available.
238 void RegexMatcher::init2(UText
*input
, UErrorCode
&status
) {
239 if (U_FAILURE(status
)) {
240 fDeferredStatus
= status
;
244 if (fPattern
->fDataSize
> UPRV_LENGTHOF(fSmallData
)) {
245 fData
= (int64_t *)uprv_malloc(fPattern
->fDataSize
* sizeof(int64_t));
247 status
= fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
252 fStack
= new UVector64(status
);
253 if (fStack
== NULL
) {
254 status
= fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
259 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY
, status
);
260 if (U_FAILURE(status
)) {
261 fDeferredStatus
= status
;
// Code units for the characters that appendReplacement() treats specially
// while scanning replacement text.
// 0x5c: backslash, tested as the start of an escape.
267 static const UChar BACKSLASH
= 0x5c;
// 0x24: dollar sign, tested as the start of a capture group reference.
268 static const UChar DOLLARSIGN
= 0x24;
// 0x7b: '{' (a left curly brace, despite the name); opens ${name}.
269 static const UChar LEFTBRACKET
= 0x7b;
// 0x7d: '}' (a right curly brace, despite the name); closes ${name}.
270 static const UChar RIGHTBRACKET
= 0x7d;
272 //--------------------------------------------------------------------------------
276 //--------------------------------------------------------------------------------
277 RegexMatcher
&RegexMatcher::appendReplacement(UnicodeString
&dest
,
278 const UnicodeString
&replacement
,
279 UErrorCode
&status
) {
280 UText replacementText
= UTEXT_INITIALIZER
;
282 utext_openConstUnicodeString(&replacementText
, &replacement
, &status
);
283 if (U_SUCCESS(status
)) {
284 UText resultText
= UTEXT_INITIALIZER
;
285 utext_openUnicodeString(&resultText
, &dest
, &status
);
287 if (U_SUCCESS(status
)) {
288 appendReplacement(&resultText
, &replacementText
, status
);
289 utext_close(&resultText
);
291 utext_close(&replacementText
);
298 // appendReplacement, UText mode
// appendReplacement (UText mode).
//
// Appends to dest, in order:
//   1. the input text lying between fAppendPosition and fMatchStart, and
//   2. the replacement text, with backslash escapes, \u / \U escapes and
//      $n / ${name} capture group references processed.
// fAppendPosition is then set to fMatchEnd.
//
// Errors reported through status in the visible code:
//   - fDeferredStatus, when it already holds a failure;
//   - U_REGEX_INVALID_STATE when fMatch is FALSE;
//   - U_MEMORY_ALLOCATION_ERROR when the temporary extraction buffer cannot be allocated;
//   - U_REGEX_INVALID_CAPTURE_GROUP_NAME for a malformed ${name} or a '$' not
//     followed by a group name or number;
//   - U_INDEX_OUTOFBOUNDS_ERROR when the first digit after '$' already exceeds
//     the number of capture groups.
300 RegexMatcher
&RegexMatcher::appendReplacement(UText
*dest
,
302 UErrorCode
&status
) {
303 if (U_FAILURE(status
)) {
306 if (U_FAILURE(fDeferredStatus
)) {
307 status
= fDeferredStatus
;
310 if (fMatch
== FALSE
) {
311 status
= U_REGEX_INVALID_STATE
;
315 // Copy input string from the end of previous match to start of current match
316 int64_t destLen
= utext_nativeLength(dest
);
317 if (fMatchStart
> fAppendPosition
) {
// Whole input is in the UText chunk: append straight from the chunk buffer.
318 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
319 destLen
+= utext_replace(dest
, destLen
, destLen
, fInputText
->chunkContents
+fAppendPosition
,
320 (int32_t)(fMatchStart
-fAppendPosition
), &status
);
// Otherwise determine the UTF-16 length of the span: directly from the native
// indexes when the text is UTF-16, else by a zero-capacity utext_extract call.
323 if (UTEXT_USES_U16(fInputText
)) {
324 len16
= (int32_t)(fMatchStart
-fAppendPosition
);
326 UErrorCode lengthStatus
= U_ZERO_ERROR
;
327 len16
= utext_extract(fInputText
, fAppendPosition
, fMatchStart
, NULL
, 0, &lengthStatus
);
// Temporary buffer of len16+1 code units, filled by utext_extract and then
// appended to dest.
329 UChar
*inputChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(len16
+1));
330 if (inputChars
== NULL
) {
331 status
= U_MEMORY_ALLOCATION_ERROR
;
334 utext_extract(fInputText
, fAppendPosition
, fMatchStart
, inputChars
, len16
+1, &status
);
335 destLen
+= utext_replace(dest
, destLen
, destLen
, inputChars
, len16
, &status
);
336 uprv_free(inputChars
);
339 fAppendPosition
= fMatchEnd
;
342 // scan the replacement text, looking for substitutions ($n) and \escapes.
343 // TODO: optimize this loop by efficiently scanning for '$' or '\',
344 // move entire ranges not containing substitutions.
345 UTEXT_SETNATIVEINDEX(replacement
, 0);
// The loop stops at the end of the replacement text or as soon as status
// holds a failure.
346 for (UChar32 c
= UTEXT_NEXT32(replacement
); U_SUCCESS(status
) && c
!= U_SENTINEL
; c
= UTEXT_NEXT32(replacement
)) {
347 if (c
== BACKSLASH
) {
348 // Backslash Escape. Copy the following char out without further checks.
349 // Note: Surrogate pairs don't need any special handling
350 // The second half won't be a '$' or a '\', and
351 // will move to the dest normally on the next
353 c
= UTEXT_CURRENT32(replacement
);
354 if (c
== U_SENTINEL
) {
358 if (c
==0x55/*U*/ || c
==0x75/*u*/) {
359 // We have a \udddd or \Udddddddd escape sequence.
361 struct URegexUTextUnescapeCharContext context
= U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement
);
362 UChar32 escapedChar
= u_unescapeAt(uregex_utext_unescape_charAt
, &offset
, INT32_MAX
, &context
);
// (UChar32)0xFFFFFFFF is treated here as "the escape could not be decoded".
363 if (escapedChar
!= (UChar32
)0xFFFFFFFF) {
// BMP code points are appended as one code unit, others as a lead/trail
// surrogate pair.
364 if (U_IS_BMP(escapedChar
)) {
365 UChar c16
= (UChar
)escapedChar
;
366 destLen
+= utext_replace(dest
, destLen
, destLen
, &c16
, 1, &status
);
369 surrogate
[0] = U16_LEAD(escapedChar
);
370 surrogate
[1] = U16_TRAIL(escapedChar
);
371 if (U_SUCCESS(status
)) {
372 destLen
+= utext_replace(dest
, destLen
, destLen
, surrogate
, 2, &status
);
375 // TODO: Report errors for mal-formed \u escapes?
376 // As this is, the original sequence is output, which may be OK.
// Re-synchronize the replacement iterator with the offset reached by
// u_unescapeAt, relative to context.lastOffset.
377 if (context
.lastOffset
== offset
) {
378 (void)UTEXT_PREVIOUS32(replacement
);
379 } else if (context
.lastOffset
!= offset
-1) {
380 utext_moveIndex32(replacement
, offset
- context
.lastOffset
- 1);
384 (void)UTEXT_NEXT32(replacement
);
385 // Plain backslash escape. Just put out the escaped character.
387 UChar c16
= (UChar
)c
;
388 destLen
+= utext_replace(dest
, destLen
, destLen
, &c16
, 1, &status
);
391 surrogate
[0] = U16_LEAD(c
);
392 surrogate
[1] = U16_TRAIL(c
);
393 if (U_SUCCESS(status
)) {
394 destLen
+= utext_replace(dest
, destLen
, destLen
, surrogate
, 2, &status
);
398 } else if (c
!= DOLLARSIGN
) {
399 // Normal char, not a $. Copy it out without further checks.
401 UChar c16
= (UChar
)c
;
402 destLen
+= utext_replace(dest
, destLen
, destLen
, &c16
, 1, &status
);
405 surrogate
[0] = U16_LEAD(c
);
406 surrogate
[1] = U16_TRAIL(c
);
407 if (U_SUCCESS(status
)) {
408 destLen
+= utext_replace(dest
, destLen
, destLen
, surrogate
, 2, &status
);
412 // We've got a $. Pick up a capture group name or number if one follows.
413 // Consume digits so long as the resulting group number <= the number of
414 // capture groups in the pattern.
416 int32_t groupNum
= 0;
417 int32_t numDigits
= 0;
418 UChar32 nextChar
= utext_current32(replacement
);
419 if (nextChar
== LEFTBRACKET
) {
420 // Scan for a Named Capture Group, ${name}.
421 UnicodeString groupName
;
422 utext_next32(replacement
);
423 while(U_SUCCESS(status
) && nextChar
!= RIGHTBRACKET
) {
424 nextChar
= utext_next32(replacement
);
425 if (nextChar
== U_SENTINEL
) {
426 status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
427 } else if ((nextChar
>= 0x41 && nextChar
<= 0x5a) || // A..Z
428 (nextChar
>= 0x61 && nextChar
<= 0x7a) || // a..z
429 (nextChar
>= 0x31 && nextChar
<= 0x39)) { // 0..9
// NOTE(review): the digit range tested above is 0x31..0x39, i.e. '1'..'9';
// '0' (0x30) is not accepted even though the trailing comment says 0..9.
// A group name containing '0' therefore falls through to the
// invalid-name branch. Confirm whether 0x30 was intended.
430 groupName
.append(nextChar
);
431 } else if (nextChar
== RIGHTBRACKET
) {
// Look the collected name up in the pattern's named-capture map; 0 is used
// when the pattern has no such map.
432 groupNum
= fPattern
->fNamedCaptureMap
? uhash_geti(fPattern
->fNamedCaptureMap
, &groupName
) : 0;
434 status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
437 // Character was something other than a name char or a closing '}'
438 status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
442 } else if (u_isdigit(nextChar
)) {
443 // $n Scan for a capture group number
444 int32_t numCaptureGroups
= fPattern
->fGroupMap
->size();
446 nextChar
= UTEXT_CURRENT32(replacement
);
447 if (nextChar
== U_SENTINEL
) {
450 if (u_isdigit(nextChar
) == FALSE
) {
453 int32_t nextDigitVal
= u_charDigitValue(nextChar
);
454 if (groupNum
*10 + nextDigitVal
> numCaptureGroups
) {
455 // Don't consume the next digit if it makes the capture group number too big.
456 if (numDigits
== 0) {
457 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
461 (void)UTEXT_NEXT32(replacement
);
462 groupNum
=groupNum
*10 + nextDigitVal
;
466 // $ not followed by capture group name or number.
467 status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
// Append the text of the referenced group and advance destLen by the value
// appendGroup returns.
470 if (U_SUCCESS(status
)) {
471 destLen
+= appendGroup(groupNum
, dest
, status
);
473 } // End of $ capture group handling
474 } // End of per-character loop through the replacement string.
481 //--------------------------------------------------------------------------------
483 // appendTail Intended to be used in conjunction with appendReplacement()
484 // To the destination string, append everything following
485 // the last match position from the input string.
487 // Note: Match ranges do not affect appendTail or appendReplacement
489 //--------------------------------------------------------------------------------
490 UnicodeString
&RegexMatcher::appendTail(UnicodeString
&dest
) {
491 UErrorCode status
= U_ZERO_ERROR
;
492 UText resultText
= UTEXT_INITIALIZER
;
493 utext_openUnicodeString(&resultText
, &dest
, &status
);
495 if (U_SUCCESS(status
)) {
496 appendTail(&resultText
, status
);
497 utext_close(&resultText
);
504 // appendTail, UText mode
// appendTail (UText mode).
//
// When fInputLength is greater than fAppendPosition, appends the input text
// from fAppendPosition up to fInputLength at the end of dest (the insertion
// point is utext_nativeLength(dest)).
// A failure already held in fDeferredStatus is copied to status.
506 UText
*RegexMatcher::appendTail(UText
*dest
, UErrorCode
&status
) {
507 if (U_FAILURE(status
)) {
510 if (U_FAILURE(fDeferredStatus
)) {
511 status
= fDeferredStatus
;
515 if (fInputLength
> fAppendPosition
) {
// Whole input is in the UText chunk: append straight from the chunk buffer.
516 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
517 int64_t destLen
= utext_nativeLength(dest
);
518 utext_replace(dest
, destLen
, destLen
, fInputText
->chunkContents
+fAppendPosition
,
519 (int32_t)(fInputLength
-fAppendPosition
), &status
);
// Otherwise determine the UTF-16 length of the tail: directly from the native
// indexes when the text is UTF-16, else by a zero-capacity utext_extract call
// whose status is reset afterwards.
522 if (UTEXT_USES_U16(fInputText
)) {
523 len16
= (int32_t)(fInputLength
-fAppendPosition
);
525 len16
= utext_extract(fInputText
, fAppendPosition
, fInputLength
, NULL
, 0, &status
);
526 status
= U_ZERO_ERROR
; // buffer overflow
// Temporary buffer of exactly len16 code units (no room for a terminator).
529 UChar
*inputChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(len16
));
530 if (inputChars
== NULL
) {
// NOTE(review): the allocation failure is recorded in fDeferredStatus, not in
// the caller's status; appendReplacement() sets status for the same failure.
// As far as the visible code shows, the caller of this call is not told about
// the failure -- confirm whether status should be set here as well.
531 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
533 utext_extract(fInputText
, fAppendPosition
, fInputLength
, inputChars
, len16
, &status
); // unterminated
534 int64_t destLen
= utext_nativeLength(dest
);
535 utext_replace(dest
, destLen
, destLen
, inputChars
, len16
, &status
);
536 uprv_free(inputChars
);
545 //--------------------------------------------------------------------------------
549 //--------------------------------------------------------------------------------
550 int32_t RegexMatcher::end(UErrorCode
&err
) const {
// end64(err): forwards to end64(group, err) with group 0.
554 int64_t RegexMatcher::end64(UErrorCode
&err
) const {
555 return end64(0, err
);
558 int64_t RegexMatcher::end64(int32_t group
, UErrorCode
&err
) const {
559 if (U_FAILURE(err
)) {
562 if (fMatch
== FALSE
) {
563 err
= U_REGEX_INVALID_STATE
;
566 if (group
< 0 || group
> fPattern
->fGroupMap
->size()) {
567 err
= U_INDEX_OUTOFBOUNDS_ERROR
;
574 // Get the position within the stack frame of the variables for
575 // this capture group.
576 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(group
-1);
577 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
578 U_ASSERT(groupOffset
>= 0);
579 e
= fFrame
->fExtra
[groupOffset
+ 1];
// end(group, err): 32-bit form of end64(group, err).
// The 64-bit result is narrowed with a plain cast; no range check is made here.
585 int32_t RegexMatcher::end(int32_t group
, UErrorCode
&err
) const {
586 return (int32_t)end64(group
, err
);
589 //--------------------------------------------------------------------------------
591 // findProgressInterrupt This function is called once for each advance in the target
592 // string from the find() function, and calls the user progress callback
593 // function if there is one installed.
595 // Return: TRUE if the find operation is to be terminated.
596 // FALSE if the find operation is to continue running.
598 //--------------------------------------------------------------------------------
// Calls the find-progress callback, if one is installed, with the stored
// callback context and the position pos.
599 UBool
RegexMatcher::findProgressInterrupt(int64_t pos
, UErrorCode
&status
) {
// A callback is installed and it returned a false value for this position:
// record in status that the operation was stopped by the caller.
600 if (fFindProgressCallbackFn
&& !(*fFindProgressCallbackFn
)(fFindProgressCallbackContext
, pos
)) {
601 status
= U_REGEX_STOPPED_BY_CALLER
;
607 //--------------------------------------------------------------------------------
611 //--------------------------------------------------------------------------------
612 UBool
RegexMatcher::find() {
613 if (U_FAILURE(fDeferredStatus
)) {
616 UErrorCode status
= U_ZERO_ERROR
;
617 UBool result
= find(status
);
621 //--------------------------------------------------------------------------------
625 //--------------------------------------------------------------------------------
626 UBool
RegexMatcher::find(UErrorCode
&status
) {
627 // Start at the position of the last match end. (Will be zero if the
628 // matcher has been reset.)
630 if (U_FAILURE(status
)) {
633 if (U_FAILURE(fDeferredStatus
)) {
634 status
= fDeferredStatus
;
638 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
639 return findUsingChunk(status
);
642 int64_t startPos
= fMatchEnd
;
644 startPos
= fActiveStart
;
648 // Save the position of any previous successful match.
649 fLastMatchEnd
= fMatchEnd
;
651 if (fMatchStart
== fMatchEnd
) {
652 // Previous match had zero length. Move start position up one position
653 // to avoid sending find() into a loop on zero-length matches.
654 if (startPos
>= fActiveLimit
) {
659 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
660 (void)UTEXT_NEXT32(fInputText
);
661 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
664 if (fLastMatchEnd
>= 0) {
665 // A previous find() failed to match. Don't try again.
666 // (without this test, a pattern with a zero-length match
667 // could match again at the end of an input string.)
674 // Compute the position in the input string beyond which a match can not begin, because
675 // the minimum length match would extend past the end of the input.
676 // Note: some patterns that cannot match anything will have fMinMatchLen==Max Int.
677 // Be aware of possible overflows if making changes here.
678 int64_t testStartLimit
;
679 if (UTEXT_USES_U16(fInputText
)) {
680 testStartLimit
= fActiveLimit
- fPattern
->fMinMatchLen
;
681 if (startPos
> testStartLimit
) {
687 // We don't know exactly how long the minimum match length is in native characters.
688 // Treat anything > 0 as 1.
689 testStartLimit
= fActiveLimit
- (fPattern
->fMinMatchLen
> 0 ? 1 : 0);
693 U_ASSERT(startPos
>= 0);
695 switch (fPattern
->fStartType
) {
697 // No optimization was found.
698 // Try a match at each input position.
700 MatchAt(startPos
, FALSE
, status
);
701 if (U_FAILURE(status
)) {
707 if (startPos
>= testStartLimit
) {
711 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
712 (void)UTEXT_NEXT32(fInputText
);
713 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
714 // Note that it's perfectly OK for a pattern to have a zero-length
715 // match at the end of a string, so we must make sure that the loop
716 // runs with startPos == testStartLimit the last time through.
717 if (findProgressInterrupt(startPos
, status
))
723 // Matches are only possible at the start of the input string
724 // (pattern begins with ^ or \A)
725 if (startPos
> fActiveStart
) {
729 MatchAt(startPos
, FALSE
, status
);
730 if (U_FAILURE(status
)) {
738 // Match may start on any char from a pre-computed set.
739 U_ASSERT(fPattern
->fMinMatchLen
> 0);
740 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
742 int64_t pos
= startPos
;
743 c
= UTEXT_NEXT32(fInputText
);
744 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
745 // c will be -1 (U_SENTINEL) at end of text, in which case we
746 // skip this next block (so we don't have a negative array index)
747 // and handle end of text in the following block.
748 if (c
>= 0 && ((c
<256 && fPattern
->fInitialChars8
->contains(c
)) ||
749 (c
>=256 && fPattern
->fInitialChars
->contains(c
)))) {
750 MatchAt(pos
, FALSE
, status
);
751 if (U_FAILURE(status
)) {
757 UTEXT_SETNATIVEINDEX(fInputText
, pos
);
759 if (startPos
> testStartLimit
) {
764 if (findProgressInterrupt(startPos
, status
))
773 // Match starts on exactly one char.
774 U_ASSERT(fPattern
->fMinMatchLen
> 0);
775 UChar32 theChar
= fPattern
->fInitialChar
;
776 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
778 int64_t pos
= startPos
;
779 c
= UTEXT_NEXT32(fInputText
);
780 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
782 MatchAt(pos
, FALSE
, status
);
783 if (U_FAILURE(status
)) {
789 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
791 if (startPos
> testStartLimit
) {
796 if (findProgressInterrupt(startPos
, status
))
805 if (startPos
== fAnchorStart
) {
806 MatchAt(startPos
, FALSE
, status
);
807 if (U_FAILURE(status
)) {
813 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
814 ch
= UTEXT_NEXT32(fInputText
);
815 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
817 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
818 ch
= UTEXT_PREVIOUS32(fInputText
);
819 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
822 if (fPattern
->fFlags
& UREGEX_UNIX_LINES
) {
825 MatchAt(startPos
, FALSE
, status
);
826 if (U_FAILURE(status
)) {
832 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
834 if (startPos
>= testStartLimit
) {
839 ch
= UTEXT_NEXT32(fInputText
);
840 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
841 // Note that it's perfectly OK for a pattern to have a zero-length
842 // match at the end of a string, so we must make sure that the loop
843 // runs with startPos == testStartLimit the last time through.
844 if (findProgressInterrupt(startPos
, status
))
849 if (isLineTerminator(ch
)) {
850 if (ch
== 0x0d && startPos
< fActiveLimit
&& UTEXT_CURRENT32(fInputText
) == 0x0a) {
851 (void)UTEXT_NEXT32(fInputText
);
852 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
854 MatchAt(startPos
, FALSE
, status
);
855 if (U_FAILURE(status
)) {
861 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
863 if (startPos
>= testStartLimit
) {
868 ch
= UTEXT_NEXT32(fInputText
);
869 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
870 // Note that it's perfectly OK for a pattern to have a zero-length
871 // match at the end of a string, so we must make sure that the loop
872 // runs with startPos == testStartLimit the last time through.
873 if (findProgressInterrupt(startPos
, status
))
888 UBool
RegexMatcher::find(int64_t start
, UErrorCode
&status
) {
889 if (U_FAILURE(status
)) {
892 if (U_FAILURE(fDeferredStatus
)) {
893 status
= fDeferredStatus
;
896 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
897 // This will reset the region to be the full input length.
899 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
903 int64_t nativeStart
= start
;
904 if (nativeStart
< fActiveStart
|| nativeStart
> fActiveLimit
) {
905 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
908 fMatchEnd
= nativeStart
;
913 //--------------------------------------------------------------------------------
915 // findUsingChunk() -- like find(), but with the advance knowledge that the
916 // entire string is available in the UText's chunk buffer.
918 //--------------------------------------------------------------------------------
919 UBool
RegexMatcher::findUsingChunk(UErrorCode
&status
) {
920 // Start at the position of the last match end. (Will be zero if the
921 // matcher has been reset.)
924 int32_t startPos
= (int32_t)fMatchEnd
;
926 startPos
= (int32_t)fActiveStart
;
929 const UChar
*inputBuf
= fInputText
->chunkContents
;
932 // Save the position of any previous successful match.
933 fLastMatchEnd
= fMatchEnd
;
935 if (fMatchStart
== fMatchEnd
) {
936 // Previous match had zero length. Move start position up one position
937 // to avoid sending find() into a loop on zero-length matches.
938 if (startPos
>= fActiveLimit
) {
943 U16_FWD_1(inputBuf
, startPos
, fInputLength
);
946 if (fLastMatchEnd
>= 0) {
947 // A previous find() failed to match. Don't try again.
948 // (without this test, a pattern with a zero-length match
949 // could match again at the end of an input string.)
956 // Compute the position in the input string beyond which a match can not begin, because
957 // the minimum length match would extend past the end of the input.
958 // Note: some patterns that cannot match anything will have fMinMatchLen==Max Int.
959 // Be aware of possible overflows if making changes here.
960 // Note: a match can begin at inputBuf + testLen; it is an inclusive limit.
961 int32_t testLen
= (int32_t)(fActiveLimit
- fPattern
->fMinMatchLen
);
962 if (startPos
> testLen
) {
969 U_ASSERT(startPos
>= 0);
971 switch (fPattern
->fStartType
) {
973 // No optimization was found.
974 // Try a match at each input position.
976 MatchChunkAt(startPos
, FALSE
, status
);
977 if (U_FAILURE(status
)) {
983 if (startPos
>= testLen
) {
987 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
988 // Note that it's perfectly OK for a pattern to have a zero-length
989 // match at the end of a string, so we must make sure that the loop
990 // runs with startPos == testLen the last time through.
991 if (findProgressInterrupt(startPos
, status
))
997 // Matches are only possible at the start of the input string
998 // (pattern begins with ^ or \A)
999 if (startPos
> fActiveStart
) {
1003 MatchChunkAt(startPos
, FALSE
, status
);
1004 if (U_FAILURE(status
)) {
1012 // Match may start on any char from a pre-computed set.
1013 U_ASSERT(fPattern
->fMinMatchLen
> 0);
1015 int32_t pos
= startPos
;
1016 U16_NEXT(inputBuf
, startPos
, fActiveLimit
, c
); // like c = inputBuf[startPos++];
1017 if ((c
<256 && fPattern
->fInitialChars8
->contains(c
)) ||
1018 (c
>=256 && fPattern
->fInitialChars
->contains(c
))) {
1019 MatchChunkAt(pos
, FALSE
, status
);
1020 if (U_FAILURE(status
)) {
1027 if (startPos
> testLen
) {
1032 if (findProgressInterrupt(startPos
, status
))
1041 // Match starts on exactly one char.
1042 U_ASSERT(fPattern
->fMinMatchLen
> 0);
1043 UChar32 theChar
= fPattern
->fInitialChar
;
1045 int32_t pos
= startPos
;
1046 U16_NEXT(inputBuf
, startPos
, fActiveLimit
, c
); // like c = inputBuf[startPos++];
1048 MatchChunkAt(pos
, FALSE
, status
);
1049 if (U_FAILURE(status
)) {
1056 if (startPos
> testLen
) {
1061 if (findProgressInterrupt(startPos
, status
))
1070 if (startPos
== fAnchorStart
) {
1071 MatchChunkAt(startPos
, FALSE
, status
);
1072 if (U_FAILURE(status
)) {
1078 // In bug 31063104 which has a zero-length text buffer we get here with
1079 // inputBuf=NULL, startPos=fActiveLimit=0 (and fMatch F) which violates the
1080 // requirement for U16_FWD_1 (utf16.h) that startPos < fActiveLimit. Having
1081 // inputBuf=NULL (chunkContents NULL) is probably due to an error in the
1082 // CFStringUText functions. Nevertheless, to be defensive, add test below.
1083 if (startPos
>= testLen
) {
1087 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
1090 if (fPattern
->fFlags
& UREGEX_UNIX_LINES
) {
1092 ch
= inputBuf
[startPos
-1];
1094 MatchChunkAt(startPos
, FALSE
, status
);
1095 if (U_FAILURE(status
)) {
1102 if (startPos
>= testLen
) {
1107 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
1108 // Note that it's perfectly OK for a pattern to have a zero-length
1109 // match at the end of a string, so we must make sure that the loop
1110 // runs with startPos == testLen the last time through.
1111 if (findProgressInterrupt(startPos
, status
))
1116 ch
= inputBuf
[startPos
-1];
1117 if (isLineTerminator(ch
)) {
1118 if (ch
== 0x0d && startPos
< fActiveLimit
&& inputBuf
[startPos
] == 0x0a) {
1121 MatchChunkAt(startPos
, FALSE
, status
);
1122 if (U_FAILURE(status
)) {
1129 if (startPos
>= testLen
) {
1134 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
1135 // Note that it's perfectly OK for a pattern to have a zero-length
1136 // match at the end of a string, so we must make sure that the loop
1137 // runs with startPos == testLen the last time through.
1138 if (findProgressInterrupt(startPos
, status
))
1153 //--------------------------------------------------------------------------------
1157 //--------------------------------------------------------------------------------
// group(status): forwards to group(groupNum, status) with group number 0.
1158 UnicodeString
RegexMatcher::group(UErrorCode
&status
) const {
1159 return group(0, status
);
1162 // Return immutable shallow clone
// group(dest, group_len, status): forwards to the groupNum overload with
// group number 0, passing dest, group_len and status through unchanged.
1163 UText
*RegexMatcher::group(UText
*dest
, int64_t &group_len
, UErrorCode
&status
) const {
1164 return group(0, dest
, group_len
, status
);
1167 // Return immutable shallow clone
1168 UText
*RegexMatcher::group(int32_t groupNum
, UText
*dest
, int64_t &group_len
, UErrorCode
&status
) const {
1170 if (U_FAILURE(status
)) {
1173 if (U_FAILURE(fDeferredStatus
)) {
1174 status
= fDeferredStatus
;
1175 } else if (fMatch
== FALSE
) {
1176 status
= U_REGEX_INVALID_STATE
;
1177 } else if (groupNum
< 0 || groupNum
> fPattern
->fGroupMap
->size()) {
1178 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1181 if (U_FAILURE(status
)) {
1186 if (groupNum
== 0) {
1190 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(groupNum
-1);
1191 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
1192 U_ASSERT(groupOffset
>= 0);
1193 s
= fFrame
->fExtra
[groupOffset
];
1194 e
= fFrame
->fExtra
[groupOffset
+1];
1198 // A capture group wasn't part of the match
1199 return utext_clone(dest
, fInputText
, FALSE
, TRUE
, &status
);
1204 dest
= utext_clone(dest
, fInputText
, FALSE
, TRUE
, &status
);
1206 UTEXT_SETNATIVEINDEX(dest
, s
);
1210 UnicodeString
RegexMatcher::group(int32_t groupNum
, UErrorCode
&status
) const {
1211 UnicodeString result
;
1212 int64_t groupStart
= start64(groupNum
, status
);
1213 int64_t groupEnd
= end64(groupNum
, status
);
1214 if (U_FAILURE(status
) || groupStart
== -1 || groupStart
== groupEnd
) {
1218 // Get the group length using a utext_extract preflight.
1219 // UText is actually pretty efficient at this when underlying encoding is UTF-16.
1220 int32_t length
= utext_extract(fInputText
, groupStart
, groupEnd
, NULL
, 0, &status
);
1221 if (status
!= U_BUFFER_OVERFLOW_ERROR
) {
1225 status
= U_ZERO_ERROR
;
1226 UChar
*buf
= result
.getBuffer(length
);
1228 status
= U_MEMORY_ALLOCATION_ERROR
;
1230 int32_t extractLength
= utext_extract(fInputText
, groupStart
, groupEnd
, buf
, length
, &status
);
1231 result
.releaseBuffer(extractLength
);
1232 U_ASSERT(length
== extractLength
);
1238 //--------------------------------------------------------------------------------
1240 // appendGroup() -- currently internal only, appends a group to a UText rather
1241 // than replacing its contents
1243 //--------------------------------------------------------------------------------
1245 int64_t RegexMatcher::appendGroup(int32_t groupNum
, UText
*dest
, UErrorCode
&status
) const {
1246 if (U_FAILURE(status
)) {
1249 if (U_FAILURE(fDeferredStatus
)) {
1250 status
= fDeferredStatus
;
1253 int64_t destLen
= utext_nativeLength(dest
);
1255 if (fMatch
== FALSE
) {
1256 status
= U_REGEX_INVALID_STATE
;
1257 return utext_replace(dest
, destLen
, destLen
, NULL
, 0, &status
);
1259 if (groupNum
< 0 || groupNum
> fPattern
->fGroupMap
->size()) {
1260 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1261 return utext_replace(dest
, destLen
, destLen
, NULL
, 0, &status
);
1265 if (groupNum
== 0) {
1269 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(groupNum
-1);
1270 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
1271 U_ASSERT(groupOffset
>= 0);
1272 s
= fFrame
->fExtra
[groupOffset
];
1273 e
= fFrame
->fExtra
[groupOffset
+1];
1277 // A capture group wasn't part of the match
1278 return utext_replace(dest
, destLen
, destLen
, NULL
, 0, &status
);
1283 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1284 U_ASSERT(e
<= fInputLength
);
1285 deltaLen
= utext_replace(dest
, destLen
, destLen
, fInputText
->chunkContents
+s
, (int32_t)(e
-s
), &status
);
1288 if (UTEXT_USES_U16(fInputText
)) {
1289 len16
= (int32_t)(e
-s
);
1291 UErrorCode lengthStatus
= U_ZERO_ERROR
;
1292 len16
= utext_extract(fInputText
, s
, e
, NULL
, 0, &lengthStatus
);
1294 UChar
*groupChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(len16
+1));
1295 if (groupChars
== NULL
) {
1296 status
= U_MEMORY_ALLOCATION_ERROR
;
1299 utext_extract(fInputText
, s
, e
, groupChars
, len16
+1, &status
);
1301 deltaLen
= utext_replace(dest
, destLen
, destLen
, groupChars
, len16
, &status
);
1302 uprv_free(groupChars
);
1309 //--------------------------------------------------------------------------------
1313 //--------------------------------------------------------------------------------
// groupCount(): returns the size of the pattern's group map (fGroupMap).
1314 int32_t RegexMatcher::groupCount() const {
1315 return fPattern
->fGroupMap
->size();
1318 //--------------------------------------------------------------------------------
1320 // hasAnchoringBounds()
1322 //--------------------------------------------------------------------------------
1323 UBool
RegexMatcher::hasAnchoringBounds() const {
1324 return fAnchoringBounds
;
1328 //--------------------------------------------------------------------------------
1330 // hasTransparentBounds()
1332 //--------------------------------------------------------------------------------
1333 UBool
RegexMatcher::hasTransparentBounds() const {
1334 return fTransparentBounds
;
1339 //--------------------------------------------------------------------------------
1343 //--------------------------------------------------------------------------------
1344 UBool
RegexMatcher::hitEnd() const {
1349 //--------------------------------------------------------------------------------
1353 //--------------------------------------------------------------------------------
1354 const UnicodeString
&RegexMatcher::input() const {
1356 UErrorCode status
= U_ZERO_ERROR
;
1358 if (UTEXT_USES_U16(fInputText
)) {
1359 len16
= (int32_t)fInputLength
;
1361 len16
= utext_extract(fInputText
, 0, fInputLength
, NULL
, 0, &status
);
1362 status
= U_ZERO_ERROR
; // overflow, length status
1364 UnicodeString
*result
= new UnicodeString(len16
, 0, 0);
1366 UChar
*inputChars
= result
->getBuffer(len16
);
1367 utext_extract(fInputText
, 0, fInputLength
, inputChars
, len16
, &status
); // unterminated warning
1368 result
->releaseBuffer(len16
);
1370 (*(const UnicodeString
**)&fInput
) = result
; // pointer assignment, rather than operator=
1376 //--------------------------------------------------------------------------------
1380 //--------------------------------------------------------------------------------
1381 UText
*RegexMatcher::inputText() const {
1386 //--------------------------------------------------------------------------------
1388 // getInput() -- like inputText(), but makes a clone or copies into another UText
1390 //--------------------------------------------------------------------------------
1391 UText
*RegexMatcher::getInput (UText
*dest
, UErrorCode
&status
) const {
1392 if (U_FAILURE(status
)) {
1395 if (U_FAILURE(fDeferredStatus
)) {
1396 status
= fDeferredStatus
;
1401 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1402 utext_replace(dest
, 0, utext_nativeLength(dest
), fInputText
->chunkContents
, (int32_t)fInputLength
, &status
);
1405 if (UTEXT_USES_U16(fInputText
)) {
1406 input16Len
= (int32_t)fInputLength
;
1408 UErrorCode lengthStatus
= U_ZERO_ERROR
;
1409 input16Len
= utext_extract(fInputText
, 0, fInputLength
, NULL
, 0, &lengthStatus
); // buffer overflow error
1411 UChar
*inputChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(input16Len
));
1412 if (inputChars
== NULL
) {
1416 status
= U_ZERO_ERROR
;
1417 utext_extract(fInputText
, 0, fInputLength
, inputChars
, input16Len
, &status
); // not terminated warning
1418 status
= U_ZERO_ERROR
;
1419 utext_replace(dest
, 0, utext_nativeLength(dest
), inputChars
, input16Len
, &status
);
1421 uprv_free(inputChars
);
1425 return utext_clone(NULL
, fInputText
, FALSE
, TRUE
, &status
);
1430 static UBool
compat_SyncMutableUTextContents(UText
*ut
);
1431 static UBool
compat_SyncMutableUTextContents(UText
*ut
) {
1432 UBool retVal
= FALSE
;
1434 // In the following test, we're really only interested in whether the UText should switch
1435 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
1436 // will still point to the correct data.
1437 if (utext_nativeLength(ut
) != ut
->nativeIndexingLimit
) {
1438 UnicodeString
*us
=(UnicodeString
*)ut
->context
;
1440 // Update to the latest length.
1441 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1442 int32_t newLength
= us
->length();
1444 // Update the chunk description.
1445 // The buffer may have switched between stack- and heap-based.
1446 ut
->chunkContents
= us
->getBuffer();
1447 ut
->chunkLength
= newLength
;
1448 ut
->chunkNativeLimit
= newLength
;
1449 ut
->nativeIndexingLimit
= newLength
;
1456 //--------------------------------------------------------------------------------
1460 //--------------------------------------------------------------------------------
1461 UBool
RegexMatcher::lookingAt(UErrorCode
&status
) {
1462 if (U_FAILURE(status
)) {
1465 if (U_FAILURE(fDeferredStatus
)) {
1466 status
= fDeferredStatus
;
1470 if (fInputUniStrMaybeMutable
) {
1471 if (compat_SyncMutableUTextContents(fInputText
)) {
1472 fInputLength
= utext_nativeLength(fInputText
);
1477 resetPreserveRegion();
1479 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1480 MatchChunkAt((int32_t)fActiveStart
, FALSE
, status
);
1482 MatchAt(fActiveStart
, FALSE
, status
);
1488 UBool
RegexMatcher::lookingAt(int64_t start
, UErrorCode
&status
) {
1489 if (U_FAILURE(status
)) {
1492 if (U_FAILURE(fDeferredStatus
)) {
1493 status
= fDeferredStatus
;
1499 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1503 if (fInputUniStrMaybeMutable
) {
1504 if (compat_SyncMutableUTextContents(fInputText
)) {
1505 fInputLength
= utext_nativeLength(fInputText
);
1510 int64_t nativeStart
;
1511 nativeStart
= start
;
1512 if (nativeStart
< fActiveStart
|| nativeStart
> fActiveLimit
) {
1513 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1517 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1518 MatchChunkAt((int32_t)nativeStart
, FALSE
, status
);
1520 MatchAt(nativeStart
, FALSE
, status
);
1527 //--------------------------------------------------------------------------------
1531 //--------------------------------------------------------------------------------
1532 UBool
RegexMatcher::matches(UErrorCode
&status
) {
1533 if (U_FAILURE(status
)) {
1536 if (U_FAILURE(fDeferredStatus
)) {
1537 status
= fDeferredStatus
;
1541 if (fInputUniStrMaybeMutable
) {
1542 if (compat_SyncMutableUTextContents(fInputText
)) {
1543 fInputLength
= utext_nativeLength(fInputText
);
1548 resetPreserveRegion();
1551 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1552 MatchChunkAt((int32_t)fActiveStart
, TRUE
, status
);
1554 MatchAt(fActiveStart
, TRUE
, status
);
1560 UBool
RegexMatcher::matches(int64_t start
, UErrorCode
&status
) {
1561 if (U_FAILURE(status
)) {
1564 if (U_FAILURE(fDeferredStatus
)) {
1565 status
= fDeferredStatus
;
1571 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1575 if (fInputUniStrMaybeMutable
) {
1576 if (compat_SyncMutableUTextContents(fInputText
)) {
1577 fInputLength
= utext_nativeLength(fInputText
);
1582 int64_t nativeStart
;
1583 nativeStart
= start
;
1584 if (nativeStart
< fActiveStart
|| nativeStart
> fActiveLimit
) {
1585 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1589 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1590 MatchChunkAt((int32_t)nativeStart
, TRUE
, status
);
1592 MatchAt(nativeStart
, TRUE
, status
);
1599 //--------------------------------------------------------------------------------
1603 //--------------------------------------------------------------------------------
1604 const RegexPattern
&RegexMatcher::pattern() const {
1610 //--------------------------------------------------------------------------------
1614 //--------------------------------------------------------------------------------
1615 RegexMatcher
&RegexMatcher::region(int64_t regionStart
, int64_t regionLimit
, int64_t startIndex
, UErrorCode
&status
) {
1616 if (U_FAILURE(status
)) {
1620 if (regionStart
>regionLimit
|| regionStart
<0 || regionLimit
<0) {
1621 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1624 int64_t nativeStart
= regionStart
;
1625 int64_t nativeLimit
= regionLimit
;
1626 if (nativeStart
> fInputLength
|| nativeLimit
> fInputLength
) {
1627 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1630 if (startIndex
== -1)
1633 resetPreserveRegion();
1635 fRegionStart
= nativeStart
;
1636 fRegionLimit
= nativeLimit
;
1637 fActiveStart
= nativeStart
;
1638 fActiveLimit
= nativeLimit
;
1640 if (startIndex
!= -1) {
1641 if (startIndex
< fActiveStart
|| startIndex
> fActiveLimit
) {
1642 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1644 fMatchEnd
= startIndex
;
1647 if (!fTransparentBounds
) {
1648 fLookStart
= nativeStart
;
1649 fLookLimit
= nativeLimit
;
1651 if (fAnchoringBounds
) {
1652 fAnchorStart
= nativeStart
;
1653 fAnchorLimit
= nativeLimit
;
1658 RegexMatcher
&RegexMatcher::region(int64_t start
, int64_t limit
, UErrorCode
&status
) {
1659 return region(start
, limit
, -1, status
);
1662 //--------------------------------------------------------------------------------
1666 //--------------------------------------------------------------------------------
1667 int32_t RegexMatcher::regionEnd() const {
1668 return (int32_t)fRegionLimit
;
1671 int64_t RegexMatcher::regionEnd64() const {
1672 return fRegionLimit
;
1675 //--------------------------------------------------------------------------------
1679 //--------------------------------------------------------------------------------
1680 int32_t RegexMatcher::regionStart() const {
1681 return (int32_t)fRegionStart
;
1684 int64_t RegexMatcher::regionStart64() const {
1685 return fRegionStart
;
1689 //--------------------------------------------------------------------------------
1693 //--------------------------------------------------------------------------------
1694 UnicodeString
RegexMatcher::replaceAll(const UnicodeString
&replacement
, UErrorCode
&status
) {
1695 UText replacementText
= UTEXT_INITIALIZER
;
1696 UText resultText
= UTEXT_INITIALIZER
;
1697 UnicodeString resultString
;
1698 if (U_FAILURE(status
)) {
1699 return resultString
;
1702 utext_openConstUnicodeString(&replacementText
, &replacement
, &status
);
1703 utext_openUnicodeString(&resultText
, &resultString
, &status
);
1705 replaceAll(&replacementText
, &resultText
, status
);
1707 utext_close(&resultText
);
1708 utext_close(&replacementText
);
1710 return resultString
;
1715 // replaceAll, UText mode
1717 UText
*RegexMatcher::replaceAll(UText
*replacement
, UText
*dest
, UErrorCode
&status
) {
1718 if (U_FAILURE(status
)) {
1721 if (U_FAILURE(fDeferredStatus
)) {
1722 status
= fDeferredStatus
;
1727 UnicodeString emptyString
;
1728 UText empty
= UTEXT_INITIALIZER
;
1730 utext_openUnicodeString(&empty
, &emptyString
, &status
);
1731 dest
= utext_clone(NULL
, &empty
, TRUE
, FALSE
, &status
);
1732 utext_close(&empty
);
1735 if (U_SUCCESS(status
)) {
1738 appendReplacement(dest
, replacement
, status
);
1739 if (U_FAILURE(status
)) {
1743 appendTail(dest
, status
);
1750 //--------------------------------------------------------------------------------
1754 //--------------------------------------------------------------------------------
1755 UnicodeString
RegexMatcher::replaceFirst(const UnicodeString
&replacement
, UErrorCode
&status
) {
1756 UText replacementText
= UTEXT_INITIALIZER
;
1757 UText resultText
= UTEXT_INITIALIZER
;
1758 UnicodeString resultString
;
1760 utext_openConstUnicodeString(&replacementText
, &replacement
, &status
);
1761 utext_openUnicodeString(&resultText
, &resultString
, &status
);
1763 replaceFirst(&replacementText
, &resultText
, status
);
1765 utext_close(&resultText
);
1766 utext_close(&replacementText
);
1768 return resultString
;
1772 // replaceFirst, UText mode
1774 UText
*RegexMatcher::replaceFirst(UText
*replacement
, UText
*dest
, UErrorCode
&status
) {
1775 if (U_FAILURE(status
)) {
1778 if (U_FAILURE(fDeferredStatus
)) {
1779 status
= fDeferredStatus
;
1785 return getInput(dest
, status
);
1789 UnicodeString emptyString
;
1790 UText empty
= UTEXT_INITIALIZER
;
1792 utext_openUnicodeString(&empty
, &emptyString
, &status
);
1793 dest
= utext_clone(NULL
, &empty
, TRUE
, FALSE
, &status
);
1794 utext_close(&empty
);
1797 appendReplacement(dest
, replacement
, status
);
1798 appendTail(dest
, status
);
1804 //--------------------------------------------------------------------------------
1808 //--------------------------------------------------------------------------------
1809 UBool
RegexMatcher::requireEnd() const {
1814 //--------------------------------------------------------------------------------
1818 //--------------------------------------------------------------------------------
1819 RegexMatcher
&RegexMatcher::reset() {
1821 fRegionLimit
= fInputLength
;
1823 fActiveLimit
= fInputLength
;
1825 fAnchorLimit
= fInputLength
;
1827 fLookLimit
= fInputLength
;
1828 resetPreserveRegion();
1834 void RegexMatcher::resetPreserveRegion() {
1838 fAppendPosition
= 0;
1841 fRequireEnd
= FALSE
;
1843 fTickCounter
= TIMER_INITIAL_VALUE
;
1844 //resetStack(); // more expensive than it looks...
1848 RegexMatcher
&RegexMatcher::reset(const UnicodeString
&input
) {
1849 fInputText
= utext_openConstUnicodeString(fInputText
, &input
, &fDeferredStatus
);
1850 if (fPattern
->fNeedsAltInput
) {
1851 fAltInputText
= utext_clone(fAltInputText
, fInputText
, FALSE
, TRUE
, &fDeferredStatus
);
1853 if (U_FAILURE(fDeferredStatus
)) {
1856 fInputLength
= utext_nativeLength(fInputText
);
1862 // Do the following for any UnicodeString.
1863 // This is for compatibility for those clients who modify the input string "live" during regex operations.
1864 fInputUniStrMaybeMutable
= TRUE
;
1866 if (fWordBreakItr
!= NULL
) {
1867 #if UCONFIG_NO_BREAK_ITERATION==0
1868 UErrorCode status
= U_ZERO_ERROR
;
1869 fWordBreakItr
->setText(fInputText
, status
);
1876 RegexMatcher
&RegexMatcher::reset(UText
*input
) {
1877 if (fInputText
!= input
) {
1878 fInputText
= utext_clone(fInputText
, input
, FALSE
, TRUE
, &fDeferredStatus
);
1879 if (fPattern
->fNeedsAltInput
) fAltInputText
= utext_clone(fAltInputText
, fInputText
, FALSE
, TRUE
, &fDeferredStatus
);
1880 if (U_FAILURE(fDeferredStatus
)) {
1883 fInputLength
= utext_nativeLength(fInputText
);
1888 if (fWordBreakItr
!= NULL
) {
1889 #if UCONFIG_NO_BREAK_ITERATION==0
1890 UErrorCode status
= U_ZERO_ERROR
;
1891 fWordBreakItr
->setText(input
, status
);
1896 fInputUniStrMaybeMutable
= FALSE
;
1901 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1902 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1906 RegexMatcher
&RegexMatcher::reset(int64_t position
, UErrorCode
&status
) {
1907 if (U_FAILURE(status
)) {
1910 reset(); // Reset also resets the region to be the entire string.
1912 if (position
< 0 || position
> fActiveLimit
) {
1913 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1916 fMatchEnd
= position
;
1921 //--------------------------------------------------------------------------------
1925 //--------------------------------------------------------------------------------
1926 RegexMatcher
&RegexMatcher::refreshInputText(UText
*input
, UErrorCode
&status
) {
1927 if (U_FAILURE(status
)) {
1930 if (input
== NULL
) {
1931 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1934 if (utext_nativeLength(fInputText
) != utext_nativeLength(input
)) {
1935 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1938 int64_t pos
= utext_getNativeIndex(fInputText
);
1939 // Shallow read-only clone of the new UText into the existing input UText
1940 fInputText
= utext_clone(fInputText
, input
, FALSE
, TRUE
, &status
);
1941 if (U_FAILURE(status
)) {
1944 utext_setNativeIndex(fInputText
, pos
);
1946 if (fAltInputText
!= NULL
) {
1947 pos
= utext_getNativeIndex(fAltInputText
);
1948 fAltInputText
= utext_clone(fAltInputText
, input
, FALSE
, TRUE
, &status
);
1949 if (U_FAILURE(status
)) {
1952 utext_setNativeIndex(fAltInputText
, pos
);
1959 //--------------------------------------------------------------------------------
1963 //--------------------------------------------------------------------------------
1964 void RegexMatcher::setTrace(UBool state
) {
1965 fTraceDebug
= state
;
1971 * UText, replace entire contents of the destination UText with a substring of the source UText.
1973 * @param src The source UText
1974 * @param dest The destination UText. Must be writable.
1975 * May be NULL, in which case a new UText will be allocated.
1976 * @param start Start index of source substring.
1977 * @param limit Limit index of source substring.
1978 * @param status An error code.
1980 static UText
*utext_extract_replace(UText
*src
, UText
*dest
, int64_t start
, int64_t limit
, UErrorCode
*status
) {
1981 if (U_FAILURE(*status
)) {
1984 if (start
== limit
) {
1986 utext_replace(dest
, 0, utext_nativeLength(dest
), NULL
, 0, status
);
1989 return utext_openUChars(NULL
, NULL
, 0, status
);
1992 int32_t length
= utext_extract(src
, start
, limit
, NULL
, 0, status
);
1993 if (*status
!= U_BUFFER_OVERFLOW_ERROR
&& U_FAILURE(*status
)) {
1996 *status
= U_ZERO_ERROR
;
1997 MaybeStackArray
<UChar
, 40> buffer
;
1998 if (length
>= buffer
.getCapacity()) {
1999 UChar
*newBuf
= buffer
.resize(length
+1); // Leave space for terminating Nul.
2000 if (newBuf
== NULL
) {
2001 *status
= U_MEMORY_ALLOCATION_ERROR
;
2004 utext_extract(src
, start
, limit
, buffer
.getAlias(), length
+1, status
);
2006 utext_replace(dest
, 0, utext_nativeLength(dest
), buffer
.getAlias(), length
, status
);
2010 // Caller did not provide a preexisting UText.
2011 // Open a new one, and have it adopt the text buffer storage.
2012 if (U_FAILURE(*status
)) {
2015 int32_t ownedLength
= 0;
2016 UChar
*ownedBuf
= buffer
.orphanOrClone(length
+1, ownedLength
);
2017 if (ownedBuf
== NULL
) {
2018 *status
= U_MEMORY_ALLOCATION_ERROR
;
2021 UText
*result
= utext_openUChars(NULL
, ownedBuf
, length
, status
);
2022 if (U_FAILURE(*status
)) {
2023 uprv_free(ownedBuf
);
2026 result
->providerProperties
|= (1 << UTEXT_PROVIDER_OWNS_TEXT
);
2031 //---------------------------------------------------------------------
2035 //---------------------------------------------------------------------
2036 int32_t RegexMatcher::split(const UnicodeString
&input
,
2037 UnicodeString dest
[],
2038 int32_t destCapacity
,
2041 UText inputText
= UTEXT_INITIALIZER
;
2042 utext_openConstUnicodeString(&inputText
, &input
, &status
);
2043 if (U_FAILURE(status
)) {
2047 UText
**destText
= (UText
**)uprv_malloc(sizeof(UText
*)*destCapacity
);
2048 if (destText
== NULL
) {
2049 status
= U_MEMORY_ALLOCATION_ERROR
;
2053 for (i
= 0; i
< destCapacity
; i
++) {
2054 destText
[i
] = utext_openUnicodeString(NULL
, &dest
[i
], &status
);
2057 int32_t fieldCount
= split(&inputText
, destText
, destCapacity
, status
);
2059 for (i
= 0; i
< destCapacity
; i
++) {
2060 utext_close(destText
[i
]);
2063 uprv_free(destText
);
2064 utext_close(&inputText
);
2069 // split, UText mode
2071 int32_t RegexMatcher::split(UText
*input
,
2073 int32_t destCapacity
,
2077 // Check arguments for validity
2079 if (U_FAILURE(status
)) {
2083 if (destCapacity
< 1) {
2084 status
= U_ILLEGAL_ARGUMENT_ERROR
;
2089 // Reset for the input text
2092 int64_t nextOutputStringStart
= 0;
2093 if (fActiveLimit
== 0) {
2098 // Loop through the input text, searching for the delimiter pattern
2101 int32_t numCaptureGroups
= fPattern
->fGroupMap
->size();
2103 if (i
>=destCapacity
-1) {
2104 // There is one or zero output string left.
2105 // Fill the last output string with whatever is left from the input, then exit the loop.
2106 // ( i will be == destCapacity if we filled the output array while processing
2107 // capture groups of the delimiter expression, in which case we will discard the
2108 // last capture group saved in favor of the unprocessed remainder of the
2111 if (fActiveLimit
> nextOutputStringStart
) {
2112 if (UTEXT_FULL_TEXT_IN_CHUNK(input
, fInputLength
)) {
2114 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]),
2115 input
->chunkContents
+nextOutputStringStart
,
2116 (int32_t)(fActiveLimit
-nextOutputStringStart
), &status
);
2118 UText remainingText
= UTEXT_INITIALIZER
;
2119 utext_openUChars(&remainingText
, input
->chunkContents
+nextOutputStringStart
,
2120 fActiveLimit
-nextOutputStringStart
, &status
);
2121 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2122 utext_close(&remainingText
);
2125 UErrorCode lengthStatus
= U_ZERO_ERROR
;
2126 int32_t remaining16Length
=
2127 utext_extract(input
, nextOutputStringStart
, fActiveLimit
, NULL
, 0, &lengthStatus
);
2128 UChar
*remainingChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(remaining16Length
+1));
2129 if (remainingChars
== NULL
) {
2130 status
= U_MEMORY_ALLOCATION_ERROR
;
2134 utext_extract(input
, nextOutputStringStart
, fActiveLimit
, remainingChars
, remaining16Length
+1, &status
);
2136 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), remainingChars
, remaining16Length
, &status
);
2138 UText remainingText
= UTEXT_INITIALIZER
;
2139 utext_openUChars(&remainingText
, remainingChars
, remaining16Length
, &status
);
2140 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2141 utext_close(&remainingText
);
2144 uprv_free(remainingChars
);
2150 // We found another delimiter. Move everything from where we started looking
2151 // up until the start of the delimiter into the next output string.
2152 if (UTEXT_FULL_TEXT_IN_CHUNK(input
, fInputLength
)) {
2154 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]),
2155 input
->chunkContents
+nextOutputStringStart
,
2156 (int32_t)(fMatchStart
-nextOutputStringStart
), &status
);
2158 UText remainingText
= UTEXT_INITIALIZER
;
2159 utext_openUChars(&remainingText
, input
->chunkContents
+nextOutputStringStart
,
2160 fMatchStart
-nextOutputStringStart
, &status
);
2161 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2162 utext_close(&remainingText
);
2165 UErrorCode lengthStatus
= U_ZERO_ERROR
;
2166 int32_t remaining16Length
= utext_extract(input
, nextOutputStringStart
, fMatchStart
, NULL
, 0, &lengthStatus
);
2167 UChar
*remainingChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(remaining16Length
+1));
2168 if (remainingChars
== NULL
) {
2169 status
= U_MEMORY_ALLOCATION_ERROR
;
2172 utext_extract(input
, nextOutputStringStart
, fMatchStart
, remainingChars
, remaining16Length
+1, &status
);
2174 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), remainingChars
, remaining16Length
, &status
);
2176 UText remainingText
= UTEXT_INITIALIZER
;
2177 utext_openUChars(&remainingText
, remainingChars
, remaining16Length
, &status
);
2178 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2179 utext_close(&remainingText
);
2182 uprv_free(remainingChars
);
2184 nextOutputStringStart
= fMatchEnd
;
2186 // If the delimiter pattern has capturing parentheses, the captured
2187 // text goes out into the next n destination strings.
2189 for (groupNum
=1; groupNum
<=numCaptureGroups
; groupNum
++) {
2190 if (i
>= destCapacity
-2) {
2191 // Never fill the last available output string with capture group text.
2192 // It will be filled with the last field, the remainder of the
2193 // unsplit input text.
2197 dest
[i
] = utext_extract_replace(fInputText
, dest
[i
],
2198 start64(groupNum
, status
), end64(groupNum
, status
), &status
);
2201 if (nextOutputStringStart
== fActiveLimit
) {
2202 // The delimiter was at the end of the string. We're done, but first
2203 // we output one last empty string, for the empty field following
2204 // the delimiter at the end of input.
2205 if (i
+1 < destCapacity
) {
2207 if (dest
[i
] == NULL
) {
2208 dest
[i
] = utext_openUChars(NULL
, NULL
, 0, &status
);
2210 static const UChar emptyString
[] = {(UChar
)0};
2211 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), emptyString
, 0, &status
);
2220 // We ran off the end of the input while looking for the next delimiter.
2221 // All the remaining text goes into the current output string.
2222 if (UTEXT_FULL_TEXT_IN_CHUNK(input
, fInputLength
)) {
2224 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]),
2225 input
->chunkContents
+nextOutputStringStart
,
2226 (int32_t)(fActiveLimit
-nextOutputStringStart
), &status
);
2228 UText remainingText
= UTEXT_INITIALIZER
;
2229 utext_openUChars(&remainingText
, input
->chunkContents
+nextOutputStringStart
,
2230 fActiveLimit
-nextOutputStringStart
, &status
);
2231 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2232 utext_close(&remainingText
);
2235 UErrorCode lengthStatus
= U_ZERO_ERROR
;
2236 int32_t remaining16Length
= utext_extract(input
, nextOutputStringStart
, fActiveLimit
, NULL
, 0, &lengthStatus
);
2237 UChar
*remainingChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(remaining16Length
+1));
2238 if (remainingChars
== NULL
) {
2239 status
= U_MEMORY_ALLOCATION_ERROR
;
2243 utext_extract(input
, nextOutputStringStart
, fActiveLimit
, remainingChars
, remaining16Length
+1, &status
);
2245 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), remainingChars
, remaining16Length
, &status
);
2247 UText remainingText
= UTEXT_INITIALIZER
;
2248 utext_openUChars(&remainingText
, remainingChars
, remaining16Length
, &status
);
2249 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2250 utext_close(&remainingText
);
2253 uprv_free(remainingChars
);
2257 if (U_FAILURE(status
)) {
2260 } // end of for loop
2265 //--------------------------------------------------------------------------------
2269 //--------------------------------------------------------------------------------
2270 int32_t RegexMatcher::start(UErrorCode
&status
) const {
2271 return start(0, status
);
2274 int64_t RegexMatcher::start64(UErrorCode
&status
) const {
2275 return start64(0, status
);
2278 //--------------------------------------------------------------------------------
2280 // start(int32_t group, UErrorCode &status)
2282 //--------------------------------------------------------------------------------
2284 int64_t RegexMatcher::start64(int32_t group
, UErrorCode
&status
) const {
2285 if (U_FAILURE(status
)) {
2288 if (U_FAILURE(fDeferredStatus
)) {
2289 status
= fDeferredStatus
;
2292 if (fMatch
== FALSE
) {
2293 status
= U_REGEX_INVALID_STATE
;
2296 if (group
< 0 || group
> fPattern
->fGroupMap
->size()) {
2297 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
2304 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(group
-1);
2305 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
2306 U_ASSERT(groupOffset
>= 0);
2307 s
= fFrame
->fExtra
[groupOffset
];
2314 int32_t RegexMatcher::start(int32_t group
, UErrorCode
&status
) const {
2315 return (int32_t)start64(group
, status
);
2318 //--------------------------------------------------------------------------------
2320 // useAnchoringBounds
2322 //--------------------------------------------------------------------------------
2323 RegexMatcher
&RegexMatcher::useAnchoringBounds(UBool b
) {
2324 fAnchoringBounds
= b
;
2325 fAnchorStart
= (fAnchoringBounds
? fRegionStart
: 0);
2326 fAnchorLimit
= (fAnchoringBounds
? fRegionLimit
: fInputLength
);
2331 //--------------------------------------------------------------------------------
2333 // useTransparentBounds
2335 //--------------------------------------------------------------------------------
2336 RegexMatcher
&RegexMatcher::useTransparentBounds(UBool b
) {
2337 fTransparentBounds
= b
;
2338 fLookStart
= (fTransparentBounds
? 0 : fRegionStart
);
2339 fLookLimit
= (fTransparentBounds
? fInputLength
: fRegionLimit
);
2343 //--------------------------------------------------------------------------------
2347 //--------------------------------------------------------------------------------
2348 void RegexMatcher::setTimeLimit(int32_t limit
, UErrorCode
&status
) {
2349 if (U_FAILURE(status
)) {
2352 if (U_FAILURE(fDeferredStatus
)) {
2353 status
= fDeferredStatus
;
2357 status
= U_ILLEGAL_ARGUMENT_ERROR
;
2364 //--------------------------------------------------------------------------------
2368 //--------------------------------------------------------------------------------
2369 int32_t RegexMatcher::getTimeLimit() const {
2374 //--------------------------------------------------------------------------------
2378 //--------------------------------------------------------------------------------
2379 void RegexMatcher::setStackLimit(int32_t limit
, UErrorCode
&status
) {
2380 if (U_FAILURE(status
)) {
2383 if (U_FAILURE(fDeferredStatus
)) {
2384 status
= fDeferredStatus
;
2388 status
= U_ILLEGAL_ARGUMENT_ERROR
;
2392 // Reset the matcher. This is needed here in case there is a current match
2393 // whose final stack frame (containing the match results, pointed to by fFrame)
2394 // would be lost by resizing to a smaller stack size.
2398 // Unlimited stack expansion
2399 fStack
->setMaxCapacity(0);
2401 // Change the units of the limit from bytes to ints, and bump the size up
2402 // to be big enough to hold at least one stack frame for the pattern,
2403 // if it isn't there already.
2404 int32_t adjustedLimit
= limit
/ sizeof(int32_t);
2405 if (adjustedLimit
< fPattern
->fFrameSize
) {
2406 adjustedLimit
= fPattern
->fFrameSize
;
2408 fStack
->setMaxCapacity(adjustedLimit
);
2410 fStackLimit
= limit
;
2414 //--------------------------------------------------------------------------------
2418 //--------------------------------------------------------------------------------
2419 int32_t RegexMatcher::getStackLimit() const {
2424 //--------------------------------------------------------------------------------
2428 //--------------------------------------------------------------------------------
2429 void RegexMatcher::setMatchCallback(URegexMatchCallback
*callback
,
2430 const void *context
,
2431 UErrorCode
&status
) {
2432 if (U_FAILURE(status
)) {
2435 fCallbackFn
= callback
;
2436 fCallbackContext
= context
;
2440 //--------------------------------------------------------------------------------
2444 //--------------------------------------------------------------------------------
2445 void RegexMatcher::getMatchCallback(URegexMatchCallback
*&callback
,
2446 const void *&context
,
2447 UErrorCode
&status
) {
2448 if (U_FAILURE(status
)) {
2451 callback
= fCallbackFn
;
2452 context
= fCallbackContext
;
2456 //--------------------------------------------------------------------------------
2460 //--------------------------------------------------------------------------------
2461 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback
*callback
,
2462 const void *context
,
2463 UErrorCode
&status
) {
2464 if (U_FAILURE(status
)) {
2467 fFindProgressCallbackFn
= callback
;
2468 fFindProgressCallbackContext
= context
;
2472 //--------------------------------------------------------------------------------
2476 //--------------------------------------------------------------------------------
2477 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback
*&callback
,
2478 const void *&context
,
2479 UErrorCode
&status
) {
2480 if (U_FAILURE(status
)) {
2483 callback
= fFindProgressCallbackFn
;
2484 context
= fFindProgressCallbackContext
;
2488 //================================================================================
2490 // Code following this point in this file is the internal
2491 // Match Engine Implementation.
2493 //================================================================================
2496 //--------------------------------------------------------------------------------
2499 // Discard any previous contents of the state save stack, and initialize a
2500 // new stack frame to all -1. The -1s are needed for capture group limits,
2501 // where they indicate that a group has not yet matched anything.
2502 //--------------------------------------------------------------------------------
2503 REStackFrame
*RegexMatcher::resetStack() {
2504 // Discard any previous contents of the state save stack, and initialize a
2505 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2506 // where they indicate that a group has not yet matched anything.
2507 fStack
->removeAllElements();
2509 REStackFrame
*iFrame
= (REStackFrame
*)fStack
->reserveBlock(fPattern
->fFrameSize
, fDeferredStatus
);
2510 if(U_FAILURE(fDeferredStatus
)) {
2515 for (i
=0; i
<fPattern
->fFrameSize
-RESTACKFRAME_HDRCOUNT
; i
++) {
2516 iFrame
->fExtra
[i
] = -1;
2523 //--------------------------------------------------------------------------------
2526 // in perl, "xab..cd..", \b is true at positions 0,3,5,7
2528 // If the current char is a combining mark,
2530 // Else Scan backwards to the first non-combining char.
2531 // We are at a boundary if the this char and the original chars are
2532 // opposite in membership in \w set
2534 // parameters: pos - the current position in the input buffer
2536 // TODO: double-check edge cases at region boundaries.
2538 //--------------------------------------------------------------------------------
2539 UBool
RegexMatcher::isWordBoundary(int64_t pos
) {
2540 UBool isBoundary
= FALSE
;
2541 UBool cIsWord
= FALSE
;
2543 if (pos
>= fLookLimit
) {
2546 // Determine whether char c at current position is a member of the word set of chars.
2547 // If we're off the end of the string, behave as though we're not at a word char.
2548 UTEXT_SETNATIVEINDEX(fInputText
, pos
);
2549 UChar32 c
= UTEXT_CURRENT32(fInputText
);
2550 if (u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
) || u_charType(c
) == U_FORMAT_CHAR
) {
2551 // Current char is a combining one. Not a boundary.
2554 cIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(c
);
2557 // Back up until we come to a non-combining char, determine whether
2558 // that char is a word char.
2559 UBool prevCIsWord
= FALSE
;
2561 if (UTEXT_GETNATIVEINDEX(fInputText
) <= fLookStart
) {
2564 UChar32 prevChar
= UTEXT_PREVIOUS32(fInputText
);
2565 if (!(u_hasBinaryProperty(prevChar
, UCHAR_GRAPHEME_EXTEND
)
2566 || u_charType(prevChar
) == U_FORMAT_CHAR
)) {
2567 prevCIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(prevChar
);
2571 isBoundary
= cIsWord
^ prevCIsWord
;
2575 UBool
RegexMatcher::isChunkWordBoundary(int32_t pos
) {
2576 UBool isBoundary
= FALSE
;
2577 UBool cIsWord
= FALSE
;
2579 const UChar
*inputBuf
= fInputText
->chunkContents
;
2581 if (pos
>= fLookLimit
) {
2584 // Determine whether char c at current position is a member of the word set of chars.
2585 // If we're off the end of the string, behave as though we're not at a word char.
2587 U16_GET(inputBuf
, fLookStart
, pos
, fLookLimit
, c
);
2588 if (u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
) || u_charType(c
) == U_FORMAT_CHAR
) {
2589 // Current char is a combining one. Not a boundary.
2592 cIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(c
);
2595 // Back up until we come to a non-combining char, determine whether
2596 // that char is a word char.
2597 UBool prevCIsWord
= FALSE
;
2599 if (pos
<= fLookStart
) {
2603 U16_PREV(inputBuf
, fLookStart
, pos
, prevChar
);
2604 if (!(u_hasBinaryProperty(prevChar
, UCHAR_GRAPHEME_EXTEND
)
2605 || u_charType(prevChar
) == U_FORMAT_CHAR
)) {
2606 prevCIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(prevChar
);
2610 isBoundary
= cIsWord
^ prevCIsWord
;
2614 //--------------------------------------------------------------------------------
2618 // Test for a word boundary using RBBI word break.
2620 // parameters: pos - the current position in the input buffer
2622 //--------------------------------------------------------------------------------
2623 UBool
RegexMatcher::isUWordBoundary(int64_t pos
) {
2624 UBool returnVal
= FALSE
;
2625 #if UCONFIG_NO_BREAK_ITERATION==0
2627 // If we haven't yet created a break iterator for this matcher, do it now.
2628 if (fWordBreakItr
== NULL
) {
2630 (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus
);
2631 if (U_FAILURE(fDeferredStatus
)) {
2634 fWordBreakItr
->setText(fInputText
, fDeferredStatus
);
2637 if (pos
>= fLookLimit
) {
2639 returnVal
= TRUE
; // With Unicode word rules, only positions within the interior of "real"
2640 // words are not boundaries. All non-word chars stand by themselves,
2641 // with word boundaries on both sides.
2643 if (!UTEXT_USES_U16(fInputText
)) {
2644 // !!!: Would like a better way to do this!
2645 UErrorCode status
= U_ZERO_ERROR
;
2646 pos
= utext_extract(fInputText
, 0, pos
, NULL
, 0, &status
);
2648 returnVal
= fWordBreakItr
->isBoundary((int32_t)pos
);
2654 //--------------------------------------------------------------------------------
2656 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state
2657 // saves. Increment the "time" counter, and call the
2658 // user callback function if there is one installed.
2660 // If the match operation needs to be aborted, either for a time-out
2661 // or because the user callback asked for it, just set an error status.
2662 // The engine will pick that up and stop in its outer loop.
2664 //--------------------------------------------------------------------------------
2665 void RegexMatcher::IncrementTime(UErrorCode
&status
) {
2666 fTickCounter
= TIMER_INITIAL_VALUE
;
2668 if (fCallbackFn
!= NULL
) {
2669 if ((*fCallbackFn
)(fCallbackContext
, fTime
) == FALSE
) {
2670 status
= U_REGEX_STOPPED_BY_CALLER
;
2674 if (fTimeLimit
> 0 && fTime
>= fTimeLimit
) {
2675 status
= U_REGEX_TIME_OUT
;
2679 //--------------------------------------------------------------------------------
2682 // Make a new stack frame, initialized as a copy of the current stack frame.
2683 // Set the pattern index in the original stack frame from the operand value
2684 // in the opcode. Execution of the engine continues with the state in
2685 // the newly created stack frame
2687 // Note that reserveBlock() may grow the stack, resulting in the
2688 // whole thing being relocated in memory.
2691 // fp The top frame pointer when called. At return, a new
2692 // fame will be present
2693 // savePatIdx An index into the compiled pattern. Goes into the original
2694 // (not new) frame. If execution ever back-tracks out of the
2695 // new frame, this will be where we continue from in the pattern.
2697 // The new frame pointer.
2699 //--------------------------------------------------------------------------------
2700 inline REStackFrame
*RegexMatcher::StateSave(REStackFrame
*fp
, int64_t savePatIdx
, UErrorCode
&status
) {
2701 if (U_FAILURE(status
)) {
2704 // push storage for a new frame.
2705 int64_t *newFP
= fStack
->reserveBlock(fFrameSize
, status
);
2706 if (U_FAILURE(status
)) {
2707 // Failure on attempted stack expansion.
2708 // Stack function set some other error code, change it to a more
2709 // specific one for regular expressions.
2710 status
= U_REGEX_STACK_OVERFLOW
;
2711 // We need to return a writable stack frame, so just return the
2712 // previous frame. The match operation will stop quickly
2713 // because of the error status, after which the frame will never
2714 // be looked at again.
2717 fp
= (REStackFrame
*)(newFP
- fFrameSize
); // in case of realloc of stack.
2719 // New stack frame = copy of old top frame.
2720 int64_t *source
= (int64_t *)fp
;
2721 int64_t *dest
= newFP
;
2723 *dest
++ = *source
++;
2724 if (source
== newFP
) {
2730 if (fTickCounter
<= 0) {
2731 IncrementTime(status
); // Re-initializes fTickCounter
2733 fp
->fPatIdx
= savePatIdx
;
2734 return (REStackFrame
*)newFP
;
#if defined(REGEX_DEBUG)
namespace {
// Debug-only helper: build a UnicodeString from the contents of a UText by
// iterating its code points from native index 0 until U_SENTINEL.
// Note that this moves the iteration position of ut.
UnicodeString StringFromUText(UText *ut) {
    UnicodeString result;
    for (UChar32 c = utext_next32From(ut, 0); c != U_SENTINEL; c = UTEXT_NEXT32(ut)) {
        result.append(c);
    }
    return result;
}
}
#endif // REGEX_DEBUG
2750 //--------------------------------------------------------------------------------
2752 // MatchAt This is the actual matching engine.
2754 // startIdx: begin matching a this index.
2755 // toEnd: if true, match must extend to end of the input region
2757 //--------------------------------------------------------------------------------
2758 void RegexMatcher::MatchAt(int64_t startIdx
, UBool toEnd
, UErrorCode
&status
) {
2759 UBool isMatch
= FALSE
; // True if the we have a match.
2761 int64_t backSearchIndex
= U_INT64_MAX
; // used after greedy single-character matches for searching backwards
2763 int32_t op
; // Operation from the compiled pattern, split into
2764 int32_t opType
; // the opcode
2765 int32_t opValue
; // and the operand value.
2767 #ifdef REGEX_RUN_DEBUG
2769 printf("MatchAt(startIdx=%ld)\n", startIdx
);
2770 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern
->fPattern
))());
2771 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText
))());
2775 if (U_FAILURE(status
)) {
2779 // Cache frequently referenced items from the compiled pattern
2781 int64_t *pat
= fPattern
->fCompiledPat
->getBuffer();
2783 const UChar
*litText
= fPattern
->fLiteralText
.getBuffer();
2784 UVector
*fSets
= fPattern
->fSets
;
2786 fFrameSize
= fPattern
->fFrameSize
;
2787 REStackFrame
*fp
= resetStack();
2788 if (U_FAILURE(fDeferredStatus
)) {
2789 status
= fDeferredStatus
;
2794 fp
->fInputIdx
= startIdx
;
2796 // Zero out the pattern's static data
2798 for (i
= 0; i
<fPattern
->fDataSize
; i
++) {
2803 // Main loop for interpreting the compiled pattern.
2804 // One iteration of the loop per pattern operation performed.
2807 op
= (int32_t)pat
[fp
->fPatIdx
];
2808 opType
= URX_TYPE(op
);
2809 opValue
= URX_VAL(op
);
2810 #ifdef REGEX_RUN_DEBUG
2812 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2813 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp
->fInputIdx
,
2814 UTEXT_CURRENT32(fInputText
), (int64_t *)fp
-fStack
->getBuffer(), fActiveLimit
);
2815 fPattern
->dumpOp(fp
->fPatIdx
);
2828 // Force a backtrack. In some circumstances, the pattern compiler
2829 // will notice that the pattern can't possibly match anything, and will
2830 // emit one of these at that point.
2831 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2836 if (fp
->fInputIdx
< fActiveLimit
) {
2837 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2838 UChar32 c
= UTEXT_NEXT32(fInputText
);
2840 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
2846 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2852 // Test input against a literal string.
2853 // Strings require two slots in the compiled pattern, one for the
2854 // offset to the string text, and one for the length.
2856 int32_t stringStartIdx
= opValue
;
2857 op
= (int32_t)pat
[fp
->fPatIdx
]; // Fetch the second operand
2859 opType
= URX_TYPE(op
);
2860 int32_t stringLen
= URX_VAL(op
);
2861 U_ASSERT(opType
== URX_STRING_LEN
);
2862 U_ASSERT(stringLen
>= 2);
2864 const UChar
*patternString
= litText
+stringStartIdx
;
2865 int32_t patternStringIndex
= 0;
2866 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2868 UChar32 patternChar
;
2869 UBool success
= TRUE
;
2870 while (patternStringIndex
< stringLen
) {
2871 if (UTEXT_GETNATIVEINDEX(fInputText
) >= fActiveLimit
) {
2876 inputChar
= UTEXT_NEXT32(fInputText
);
2877 U16_NEXT(patternString
, patternStringIndex
, stringLen
, patternChar
);
2878 if (patternChar
!= inputChar
) {
2885 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
2887 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2893 case URX_STATE_SAVE
:
2894 fp
= StateSave(fp
, opValue
, status
);
2899 // The match loop will exit via this path on a successful match,
2900 // when we reach the end of the pattern.
2901 if (toEnd
&& fp
->fInputIdx
!= fActiveLimit
) {
2902 // The pattern matched, but not to the end of input. Try some more.
2903 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2909 // Start and End Capture stack frame variables are laid out out like this:
2910 // fp->fExtra[opValue] - The start of a completed capture group
2911 // opValue+1 - The end of a completed capture group
2912 // opValue+2 - the start of a capture group whose end
2913 // has not yet been reached (and might not ever be).
2914 case URX_START_CAPTURE
:
2915 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
2916 fp
->fExtra
[opValue
+2] = fp
->fInputIdx
;
2920 case URX_END_CAPTURE
:
2921 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
2922 U_ASSERT(fp
->fExtra
[opValue
+2] >= 0); // Start pos for this group must be set.
2923 fp
->fExtra
[opValue
] = fp
->fExtra
[opValue
+2]; // Tentative start becomes real.
2924 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // End position
2925 U_ASSERT(fp
->fExtra
[opValue
] <= fp
->fExtra
[opValue
+1]);
2929 case URX_DOLLAR
: // $, test for End of line
2930 // or for position before new line at end of input
2932 if (fp
->fInputIdx
>= fAnchorLimit
) {
2933 // We really are at the end of input. Success.
2939 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2941 // If we are positioned just before a new-line that is located at the
2942 // end of input, succeed.
2943 UChar32 c
= UTEXT_NEXT32(fInputText
);
2944 if (UTEXT_GETNATIVEINDEX(fInputText
) >= fAnchorLimit
) {
2945 if (isLineTerminator(c
)) {
2946 // If not in the middle of a CR/LF sequence
2947 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& ((void)UTEXT_PREVIOUS32(fInputText
), UTEXT_PREVIOUS32(fInputText
))==0x0d)) {
2948 // At new-line at end of input. Success
2956 UChar32 nextC
= UTEXT_NEXT32(fInputText
);
2957 if (c
== 0x0d && nextC
== 0x0a && UTEXT_GETNATIVEINDEX(fInputText
) >= fAnchorLimit
) {
2960 break; // At CR/LF at end of input. Success
2964 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2969 case URX_DOLLAR_D
: // $, test for End of Line, in UNIX_LINES mode.
2970 if (fp
->fInputIdx
>= fAnchorLimit
) {
2971 // Off the end of input. Success.
2976 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2977 UChar32 c
= UTEXT_NEXT32(fInputText
);
2978 // Either at the last character of input, or off the end.
2979 if (c
== 0x0a && UTEXT_GETNATIVEINDEX(fInputText
) == fAnchorLimit
) {
2986 // Not at end of input. Back-track out.
2987 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2991 case URX_DOLLAR_M
: // $, test for End of line in multi-line mode
2993 if (fp
->fInputIdx
>= fAnchorLimit
) {
2994 // We really are at the end of input. Success.
2999 // If we are positioned just before a new-line, succeed.
3000 // It makes no difference where the new-line is within the input.
3001 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3002 UChar32 c
= UTEXT_CURRENT32(fInputText
);
3003 if (isLineTerminator(c
)) {
3004 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
3005 // In multi-line mode, hitting a new-line just before the end of input does not
3006 // set the hitEnd or requireEnd flags
3007 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& UTEXT_PREVIOUS32(fInputText
)==0x0d)) {
3011 // not at a new line. Fail.
3012 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3017 case URX_DOLLAR_MD
: // $, test for End of line in multi-line and UNIX_LINES mode
3019 if (fp
->fInputIdx
>= fAnchorLimit
) {
3020 // We really are at the end of input. Success.
3022 fRequireEnd
= TRUE
; // Java set requireEnd in this case, even though
3023 break; // adding a new-line would not lose the match.
3025 // If we are not positioned just before a new-line, the test fails; backtrack out.
3026 // It makes no difference where the new-line is within the input.
3027 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3028 if (UTEXT_CURRENT32(fInputText
) != 0x0a) {
3029 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3035 case URX_CARET
: // ^, test for start of line
3036 if (fp
->fInputIdx
!= fAnchorStart
) {
3037 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3042 case URX_CARET_M
: // ^, test for start of line in mulit-line mode
3044 if (fp
->fInputIdx
== fAnchorStart
) {
3045 // We are at the start input. Success.
3048 // Check whether character just before the current pos is a new-line
3049 // unless we are at the end of input
3050 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3051 UChar32 c
= UTEXT_PREVIOUS32(fInputText
);
3052 if ((fp
->fInputIdx
< fAnchorLimit
) && isLineTerminator(c
)) {
3053 // It's a new-line. ^ is true. Success.
3054 // TODO: what should be done with positions between a CR and LF?
3057 // Not at the start of a line. Fail.
3058 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3063 case URX_CARET_M_UNIX
: // ^, test for start of line in mulit-line + Unix-line mode
3065 U_ASSERT(fp
->fInputIdx
>= fAnchorStart
);
3066 if (fp
->fInputIdx
<= fAnchorStart
) {
3067 // We are at the start input. Success.
3070 // Check whether character just before the current pos is a new-line
3071 U_ASSERT(fp
->fInputIdx
<= fAnchorLimit
);
3072 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3073 UChar32 c
= UTEXT_PREVIOUS32(fInputText
);
3075 // Not at the start of a line. Back-track out.
3076 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3081 case URX_BACKSLASH_B
: // Test for word boundaries
3083 UBool success
= isWordBoundary(fp
->fInputIdx
);
3084 success
^= (UBool
)(opValue
!= 0); // flip sense for \B
3086 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3092 case URX_BACKSLASH_BU
: // Test for word boundaries, Unicode-style
3094 UBool success
= isUWordBoundary(fp
->fInputIdx
);
3095 success
^= (UBool
)(opValue
!= 0); // flip sense for \B
3097 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3103 case URX_BACKSLASH_D
: // Test for decimal digit
3105 if (fp
->fInputIdx
>= fActiveLimit
) {
3107 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3111 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3113 UChar32 c
= UTEXT_NEXT32(fInputText
);
3114 int8_t ctype
= u_charType(c
); // TODO: make a unicode set for this. Will be faster.
3115 UBool success
= (ctype
== U_DECIMAL_DIGIT_NUMBER
);
3116 success
^= (UBool
)(opValue
!= 0); // flip sense for \D
3118 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3120 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3126 case URX_BACKSLASH_G
: // Test for position at end of previous match
3127 if (!((fMatch
&& fp
->fInputIdx
==fMatchEnd
) || (fMatch
==FALSE
&& fp
->fInputIdx
==fActiveStart
))) {
3128 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3133 case URX_BACKSLASH_H
: // Test for \h, horizontal white space.
3135 if (fp
->fInputIdx
>= fActiveLimit
) {
3137 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3140 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3141 UChar32 c
= UTEXT_NEXT32(fInputText
);
3142 int8_t ctype
= u_charType(c
);
3143 UBool success
= (ctype
== U_SPACE_SEPARATOR
|| c
== 9); // SPACE_SEPARATOR || TAB
3144 success
^= (UBool
)(opValue
!= 0); // flip sense for \H
3146 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3148 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3154 case URX_BACKSLASH_R
: // Test for \R, any line break sequence.
3156 if (fp
->fInputIdx
>= fActiveLimit
) {
3158 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3161 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3162 UChar32 c
= UTEXT_NEXT32(fInputText
);
3163 if (isLineTerminator(c
)) {
3164 if (c
== 0x0d && utext_current32(fInputText
) == 0x0a) {
3165 utext_next32(fInputText
);
3167 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3169 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3175 case URX_BACKSLASH_V
: // \v, any single line ending character.
3177 if (fp
->fInputIdx
>= fActiveLimit
) {
3179 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3182 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3183 UChar32 c
= UTEXT_NEXT32(fInputText
);
3184 UBool success
= isLineTerminator(c
);
3185 success
^= (UBool
)(opValue
!= 0); // flip sense for \V
3187 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3189 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3195 case URX_BACKSLASH_X
:
3196 // Match a Grapheme, as defined by Unicode TR 29.
3197 // Differs slightly from Perl, which consumes combining marks independently
3201 // Fail if at end of input
3202 if (fp
->fInputIdx
>= fActiveLimit
) {
3204 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3208 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3210 // Examine (and consume) the current char.
3211 // Dispatch into a little state machine, based on the char.
3213 c
= UTEXT_NEXT32(fInputText
);
3214 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3215 UnicodeSet
**sets
= fPattern
->fStaticSets
;
3216 if (sets
[URX_GC_NORMAL
]->contains(c
)) goto GC_Extend
;
3217 if (sets
[URX_GC_CONTROL
]->contains(c
)) goto GC_Control
;
3218 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
3219 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
3220 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
3221 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
3222 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
3228 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
3229 c
= UTEXT_NEXT32(fInputText
);
3230 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3231 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
3232 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
3233 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
3234 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
3235 (void)UTEXT_PREVIOUS32(fInputText
);
3236 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3240 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
3241 c
= UTEXT_NEXT32(fInputText
);
3242 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3243 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
3244 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
3245 (void)UTEXT_PREVIOUS32(fInputText
);
3246 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3250 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
3251 c
= UTEXT_NEXT32(fInputText
);
3252 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3253 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
3254 (void)UTEXT_PREVIOUS32(fInputText
);
3255 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3259 // Combining characters are consumed here
3261 if (fp
->fInputIdx
>= fActiveLimit
) {
3264 c
= UTEXT_CURRENT32(fInputText
);
3265 if (sets
[URX_GC_EXTEND
]->contains(c
) == FALSE
) {
3268 (void)UTEXT_NEXT32(fInputText
);
3269 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3274 // Most control chars stand alone (don't combine with combining chars),
3275 // except for that CR/LF sequence is a single grapheme cluster.
3276 if (c
== 0x0d && fp
->fInputIdx
< fActiveLimit
&& UTEXT_CURRENT32(fInputText
) == 0x0a) {
3277 c
= UTEXT_NEXT32(fInputText
);
3278 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3282 if (fp
->fInputIdx
>= fActiveLimit
) {
3291 case URX_BACKSLASH_Z
: // Test for end of Input
3292 if (fp
->fInputIdx
< fAnchorLimit
) {
3293 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3302 case URX_STATIC_SETREF
:
3304 // Test input character against one of the predefined sets
3305 // (Word Characters, for example)
3306 // The high bit of the op value is a flag for the match polarity.
3307 // 0: success if input char is in set.
3308 // 1: success if input char is not in set.
3309 if (fp
->fInputIdx
>= fActiveLimit
) {
3311 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3315 UBool success
= ((opValue
& URX_NEG_SET
) == URX_NEG_SET
);
3316 opValue
&= ~URX_NEG_SET
;
3317 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
3319 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3320 UChar32 c
= UTEXT_NEXT32(fInputText
);
3322 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
3323 if (s8
->contains(c
)) {
3327 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
3328 if (s
->contains(c
)) {
3333 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3335 // the character wasn't in the set.
3336 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3342 case URX_STAT_SETREF_N
:
3344 // Test input character for NOT being a member of one of
3345 // the predefined sets (Word Characters, for example)
3346 if (fp
->fInputIdx
>= fActiveLimit
) {
3348 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3352 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
3354 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3356 UChar32 c
= UTEXT_NEXT32(fInputText
);
3358 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
3359 if (s8
->contains(c
) == FALSE
) {
3360 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3364 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
3365 if (s
->contains(c
) == FALSE
) {
3366 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3370 // the character wasn't in the set.
3371 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3377 if (fp
->fInputIdx
>= fActiveLimit
) {
3379 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3382 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3384 // There is input left. Pick up one char and test it for set membership.
3385 UChar32 c
= UTEXT_NEXT32(fInputText
);
3386 U_ASSERT(opValue
> 0 && opValue
< fSets
->size());
3388 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
3389 if (s8
->contains(c
)) {
3390 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3394 UnicodeSet
*s
= (UnicodeSet
*)fSets
->elementAt(opValue
);
3395 if (s
->contains(c
)) {
3396 // The character is in the set. A Match.
3397 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3402 // the character wasn't in the set.
3403 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3410 // . matches anything, but stops at end-of-line.
3411 if (fp
->fInputIdx
>= fActiveLimit
) {
3412 // At end of input. Match failed. Backtrack out.
3414 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3418 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3420 // There is input left. Advance over one char, unless we've hit end-of-line
3421 UChar32 c
= UTEXT_NEXT32(fInputText
);
3422 if (isLineTerminator(c
)) {
3423 // End of line in normal mode. . does not match.
3424 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3427 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3432 case URX_DOTANY_ALL
:
3434 // ., in dot-matches-all (including new lines) mode
3435 if (fp
->fInputIdx
>= fActiveLimit
) {
3436 // At end of input. Match failed. Backtrack out.
3438 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3442 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3444 // There is input left. Advance over one char, except if we are
3445 // at a cr/lf, advance over both of them.
3447 c
= UTEXT_NEXT32(fInputText
);
3448 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3449 if (c
==0x0d && fp
->fInputIdx
< fActiveLimit
) {
3450 // In the case of a CR/LF, we need to advance over both.
3451 UChar32 nextc
= UTEXT_CURRENT32(fInputText
);
3452 if (nextc
== 0x0a) {
3453 (void)UTEXT_NEXT32(fInputText
);
3454 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3461 case URX_DOTANY_UNIX
:
3463 // '.' operator, matches all, but stops at end-of-line.
3464 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
3465 if (fp
->fInputIdx
>= fActiveLimit
) {
3466 // At end of input. Match failed. Backtrack out.
3468 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3472 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3474 // There is input left. Advance over one char, unless we've hit end-of-line
3475 UChar32 c
= UTEXT_NEXT32(fInputText
);
3477 // End of line in normal mode. '.' does not match the \n
3478 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3480 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3487 fp
->fPatIdx
= opValue
;
3495 U_ASSERT(opValue
< fPattern
->fCompiledPat
->size());
3496 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
3497 fp
->fPatIdx
= opValue
; // Then JMP.
3501 // This opcode is used with (x)+, when x can match a zero length string.
3502 // Same as JMP_SAV, except conditional on the match having made forward progress.
3503 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3504 // data address of the input position at the start of the loop.
3506 U_ASSERT(opValue
> 0 && opValue
< fPattern
->fCompiledPat
->size());
3507 int32_t stoOp
= (int32_t)pat
[opValue
-1];
3508 U_ASSERT(URX_TYPE(stoOp
) == URX_STO_INP_LOC
);
3509 int32_t frameLoc
= URX_VAL(stoOp
);
3510 U_ASSERT(frameLoc
>= 0 && frameLoc
< fFrameSize
);
3511 int64_t prevInputIdx
= fp
->fExtra
[frameLoc
];
3512 U_ASSERT(prevInputIdx
<= fp
->fInputIdx
);
3513 if (prevInputIdx
< fp
->fInputIdx
) {
3514 // The match did make progress. Repeat the loop.
3515 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
3516 fp
->fPatIdx
= opValue
;
3517 fp
->fExtra
[frameLoc
] = fp
->fInputIdx
;
3519 // If the input position did not advance, we do nothing here,
3520 // execution will fall out of the loop.
3526 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
3527 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
3529 // Pick up the three extra operands that CTR_INIT has, and
3530 // skip the pattern location counter past
3531 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
3533 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
3534 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
3535 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
3536 U_ASSERT(minCount
>=0);
3537 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
3538 U_ASSERT(loopLoc
>=fp
->fPatIdx
);
3540 if (minCount
== 0) {
3541 fp
= StateSave(fp
, loopLoc
+1, status
);
3543 if (maxCount
== -1) {
3544 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // For loop breaking.
3545 } else if (maxCount
== 0) {
3546 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3553 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
3554 int32_t initOp
= (int32_t)pat
[opValue
];
3555 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT
);
3556 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
3557 int32_t minCount
= (int32_t)pat
[opValue
+2];
3558 int32_t maxCount
= (int32_t)pat
[opValue
+3];
3560 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
&& maxCount
!= -1) {
3561 U_ASSERT(*pCounter
== maxCount
);
3564 if (*pCounter
>= minCount
) {
3565 if (maxCount
== -1) {
3566 // Loop has no hard upper bound.
3567 // Check that it is progressing through the input, break if it is not.
3568 int64_t *pLastInputIdx
= &fp
->fExtra
[URX_VAL(initOp
) + 1];
3569 if (fp
->fInputIdx
== *pLastInputIdx
) {
3572 *pLastInputIdx
= fp
->fInputIdx
;
3575 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
3577 // Increment time-out counter. (StateSave() does it if count >= minCount)
3579 if (fTickCounter
<= 0) {
3580 IncrementTime(status
); // Re-initializes fTickCounter
3584 fp
->fPatIdx
= opValue
+ 4; // Loop back.
3588 case URX_CTR_INIT_NG
:
3590 // Initialize a non-greedy loop
3591 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
3592 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
3594 // Pick up the three extra operands that CTR_INIT_NG has, and
3595 // skip the pattern location counter past
3596 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
3598 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
3599 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
3600 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
3601 U_ASSERT(minCount
>=0);
3602 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
3603 U_ASSERT(loopLoc
>fp
->fPatIdx
);
3604 if (maxCount
== -1) {
3605 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // Save initial input index for loop breaking.
3608 if (minCount
== 0) {
3609 if (maxCount
!= 0) {
3610 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
3612 fp
->fPatIdx
= loopLoc
+1; // Continue with stuff after repeated block
3617 case URX_CTR_LOOP_NG
:
3619 // Non-greedy {min, max} loops
3620 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
3621 int32_t initOp
= (int32_t)pat
[opValue
];
3622 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT_NG
);
3623 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
3624 int32_t minCount
= (int32_t)pat
[opValue
+2];
3625 int32_t maxCount
= (int32_t)pat
[opValue
+3];
3628 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
&& maxCount
!= -1) {
3629 // The loop has matched the maximum permitted number of times.
3630 // Break out of here with no action. Matching will
3631 // continue with the following pattern.
3632 U_ASSERT(*pCounter
== maxCount
);
3636 if (*pCounter
< minCount
) {
3637 // We haven't met the minimum number of matches yet.
3638 // Loop back for another one.
3639 fp
->fPatIdx
= opValue
+ 4; // Loop back.
3640 // Increment time-out counter. (StateSave() does it if count >= minCount)
3642 if (fTickCounter
<= 0) {
3643 IncrementTime(status
); // Re-initializes fTickCounter
3646 // We do have the minimum number of matches.
3648 // If there is no upper bound on the loop iterations, check that the input index
3649 // is progressing, and stop the loop if it is not.
3650 if (maxCount
== -1) {
3651 int64_t *pLastInputIdx
= &fp
->fExtra
[URX_VAL(initOp
) + 1];
3652 if (fp
->fInputIdx
== *pLastInputIdx
) {
3655 *pLastInputIdx
= fp
->fInputIdx
;
3658 // Loop Continuation: we will fall into the pattern following the loop
3659 // (non-greedy, don't execute loop body first), but first do
3660 // a state save to the top of the loop, so that a match failure
3661 // in the following pattern will try another iteration of the loop.
3662 fp
= StateSave(fp
, opValue
+ 4, status
);
3668 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
3669 fData
[opValue
] = fStack
->size();
3674 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
3675 int32_t newStackSize
= (int32_t)fData
[opValue
];
3676 U_ASSERT(newStackSize
<= fStack
->size());
3677 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
3678 if (newFP
== (int64_t *)fp
) {
3682 for (j
=0; j
<fFrameSize
; j
++) {
3683 newFP
[j
] = ((int64_t *)fp
)[j
];
3685 fp
= (REStackFrame
*)newFP
;
3686 fStack
->setSize(newStackSize
);
3692 U_ASSERT(opValue
< fFrameSize
);
3693 int64_t groupStartIdx
= fp
->fExtra
[opValue
];
3694 int64_t groupEndIdx
= fp
->fExtra
[opValue
+1];
3695 U_ASSERT(groupStartIdx
<= groupEndIdx
);
3696 if (groupStartIdx
< 0) {
3697 // This capture group has not participated in the match thus far,
3698 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
3701 UTEXT_SETNATIVEINDEX(fAltInputText
, groupStartIdx
);
3702 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3704 // Note: if the capture group match was of an empty string the backref
3705 // match succeeds. Verified by testing: Perl matches succeed
3706 // in this case, so we do too.
3708 UBool success
= TRUE
;
3710 if (utext_getNativeIndex(fAltInputText
) >= groupEndIdx
) {
3714 if (utext_getNativeIndex(fInputText
) >= fActiveLimit
) {
3719 UChar32 captureGroupChar
= utext_next32(fAltInputText
);
3720 UChar32 inputChar
= utext_next32(fInputText
);
3721 if (inputChar
!= captureGroupChar
) {
3728 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3730 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3739 U_ASSERT(opValue
< fFrameSize
);
3740 int64_t groupStartIdx
= fp
->fExtra
[opValue
];
3741 int64_t groupEndIdx
= fp
->fExtra
[opValue
+1];
3742 U_ASSERT(groupStartIdx
<= groupEndIdx
);
3743 if (groupStartIdx
< 0) {
3744 // This capture group has not participated in the match thus far,
3745 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
3748 utext_setNativeIndex(fAltInputText
, groupStartIdx
);
3749 utext_setNativeIndex(fInputText
, fp
->fInputIdx
);
3750 CaseFoldingUTextIterator
captureGroupItr(*fAltInputText
);
3751 CaseFoldingUTextIterator
inputItr(*fInputText
);
3753 // Note: if the capture group match was of an empty string the backref
3754 // match succeeds. Verified by testing: Perl matches succeed
3755 // in this case, so we do too.
3757 UBool success
= TRUE
;
3759 if (!captureGroupItr
.inExpansion() && utext_getNativeIndex(fAltInputText
) >= groupEndIdx
) {
3763 if (!inputItr
.inExpansion() && utext_getNativeIndex(fInputText
) >= fActiveLimit
) {
3768 UChar32 captureGroupChar
= captureGroupItr
.next();
3769 UChar32 inputChar
= inputItr
.next();
3770 if (inputChar
!= captureGroupChar
) {
3776 if (success
&& inputItr
.inExpansion()) {
3777 // We otained a match by consuming part of a string obtained from
3778 // case-folding a single code point of the input text.
3779 // This does not count as an overall match.
3784 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3786 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3792 case URX_STO_INP_LOC
:
3794 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
);
3795 fp
->fExtra
[opValue
] = fp
->fInputIdx
;
3801 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
3803 int32_t dataLoc
= URX_VAL(pat
[instrOperandLoc
]);
3804 U_ASSERT(dataLoc
>= 0 && dataLoc
< fFrameSize
);
3805 int64_t savedInputIdx
= fp
->fExtra
[dataLoc
];
3806 U_ASSERT(savedInputIdx
<= fp
->fInputIdx
);
3807 if (savedInputIdx
< fp
->fInputIdx
) {
3808 fp
->fPatIdx
= opValue
; // JMP
3810 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no progress in loop.
3817 // Entering a look around block.
3818 // Save Stack Ptr, Input Pos.
3819 U_ASSERT(opValue
>=0 && opValue
+3<fPattern
->fDataSize
);
3820 fData
[opValue
] = fStack
->size();
3821 fData
[opValue
+1] = fp
->fInputIdx
;
3822 fData
[opValue
+2] = fActiveStart
;
3823 fData
[opValue
+3] = fActiveLimit
;
3824 fActiveStart
= fLookStart
; // Set the match region change for
3825 fActiveLimit
= fLookLimit
; // transparent bounds.
3831 // Leaving a look-ahead block.
3832 // restore Stack Ptr, Input Pos to positions they had on entry to block.
3833 U_ASSERT(opValue
>=0 && opValue
+3<fPattern
->fDataSize
);
3834 int32_t stackSize
= fStack
->size();
3835 int32_t newStackSize
=(int32_t)fData
[opValue
];
3836 U_ASSERT(stackSize
>= newStackSize
);
3837 if (stackSize
> newStackSize
) {
3838 // Copy the current top frame back to the new (cut back) top frame.
3839 // This makes the capture groups from within the look-ahead
3840 // expression available.
3841 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
3843 for (j
=0; j
<fFrameSize
; j
++) {
3844 newFP
[j
] = ((int64_t *)fp
)[j
];
3846 fp
= (REStackFrame
*)newFP
;
3847 fStack
->setSize(newStackSize
);
3849 fp
->fInputIdx
= fData
[opValue
+1];
3851 // Restore the active region bounds in the input string; they may have
3852 // been changed because of transparent bounds on a Region.
3853 fActiveStart
= fData
[opValue
+2];
3854 fActiveLimit
= fData
[opValue
+3];
3855 U_ASSERT(fActiveStart
>= 0);
3856 U_ASSERT(fActiveLimit
<= fInputLength
);
3861 // Case insensitive one char. The char from the pattern is already case folded.
3862 // Input text is not, but case folding the input can not reduce two or more code
3864 if (fp
->fInputIdx
< fActiveLimit
) {
3865 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3867 UChar32 c
= UTEXT_NEXT32(fInputText
);
3868 if (u_foldCase(c
, U_FOLD_CASE_DEFAULT
) == opValue
) {
3869 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3876 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3881 // Case-insensitive test input against a literal string.
3882 // Strings require two slots in the compiled pattern, one for the
3883 // offset to the string text, and one for the length.
3884 // The compiled string has already been case folded.
3886 const UChar
*patternString
= litText
+ opValue
;
3887 int32_t patternStringIdx
= 0;
3889 op
= (int32_t)pat
[fp
->fPatIdx
];
3891 opType
= URX_TYPE(op
);
3892 opValue
= URX_VAL(op
);
3893 U_ASSERT(opType
== URX_STRING_LEN
);
3894 int32_t patternStringLen
= opValue
; // Length of the string from the pattern.
3899 UBool success
= TRUE
;
3901 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3902 CaseFoldingUTextIterator
inputIterator(*fInputText
);
3903 while (patternStringIdx
< patternStringLen
) {
3904 if (!inputIterator
.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText
) >= fActiveLimit
) {
3909 U16_NEXT(patternString
, patternStringIdx
, patternStringLen
, cPattern
);
3910 cText
= inputIterator
.next();
3911 if (cText
!= cPattern
) {
3916 if (inputIterator
.inExpansion()) {
3921 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3923 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3931 // Entering a look-behind block.
3932 // Save Stack Ptr, Input Pos and active input region.
3933 // TODO: implement transparent bounds. Ticket #6067
3934 U_ASSERT(opValue
>=0 && opValue
+4<fPattern
->fDataSize
);
3935 fData
[opValue
] = fStack
->size();
3936 fData
[opValue
+1] = fp
->fInputIdx
;
3937 // Save input string length, then reset to pin any matches to end at
3938 // the current position.
3939 fData
[opValue
+2] = fActiveStart
;
3940 fData
[opValue
+3] = fActiveLimit
;
3941 fActiveStart
= fRegionStart
;
3942 fActiveLimit
= fp
->fInputIdx
;
3943 // Init the variable containing the start index for attempted matches.
3944 fData
[opValue
+4] = -1;
3951 // Positive Look-Behind, at top of loop checking for matches of LB expression
3952 // at all possible input starting positions.
3954 // Fetch the min and max possible match lengths. They are the operands
3955 // of this op in the pattern.
3956 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
3957 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
3958 if (!UTEXT_USES_U16(fInputText
)) {
3959 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3960 // The max length need not be exact; it just needs to be >= actual maximum.
3963 U_ASSERT(minML
<= maxML
);
3964 U_ASSERT(minML
>= 0);
3966 // Fetch (from data) the last input index where a match was attempted.
3967 U_ASSERT(opValue
>=0 && opValue
+4<fPattern
->fDataSize
);
3968 int64_t &lbStartIdx
= fData
[opValue
+4];
3969 if (lbStartIdx
< 0) {
3970 // First time through loop.
3971 lbStartIdx
= fp
->fInputIdx
- minML
;
3972 if (lbStartIdx
> 0) {
3973 // move index to a code point boudary, if it's not on one already.
3974 UTEXT_SETNATIVEINDEX(fInputText
, lbStartIdx
);
3975 lbStartIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3978 // 2nd through nth time through the loop.
3979 // Back up start position for match by one.
3980 if (lbStartIdx
== 0) {
3983 UTEXT_SETNATIVEINDEX(fInputText
, lbStartIdx
);
3984 (void)UTEXT_PREVIOUS32(fInputText
);
3985 lbStartIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3989 if (lbStartIdx
< 0 || lbStartIdx
< fp
->fInputIdx
- maxML
) {
3990 // We have tried all potential match starting points without
3991 // getting a match. Backtrack out, and out of the
3992 // Look Behind altogether.
3993 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3994 fActiveStart
= fData
[opValue
+2];
3995 fActiveLimit
= fData
[opValue
+3];
3996 U_ASSERT(fActiveStart
>= 0);
3997 U_ASSERT(fActiveLimit
<= fInputLength
);
4001 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4002 // (successful match will fall off the end of the loop.)
4003 fp
= StateSave(fp
, fp
->fPatIdx
-3, status
);
4004 fp
->fInputIdx
= lbStartIdx
;
4009 // End of a look-behind block, after a successful match.
4011 U_ASSERT(opValue
>=0 && opValue
+4<fPattern
->fDataSize
);
4012 if (fp
->fInputIdx
!= fActiveLimit
) {
4013 // The look-behind expression matched, but the match did not
4014 // extend all the way to the point that we are looking behind from.
4015 // FAIL out of here, which will take us back to the LB_CONT, which
4016 // will retry the match starting at another position or fail
4017 // the look-behind altogether, whichever is appropriate.
4018 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4022 // Look-behind match is good. Restore the orignal input string region,
4023 // which had been truncated to pin the end of the lookbehind match to the
4024 // position being looked-behind.
4025 fActiveStart
= fData
[opValue
+2];
4026 fActiveLimit
= fData
[opValue
+3];
4027 U_ASSERT(fActiveStart
>= 0);
4028 U_ASSERT(fActiveLimit
<= fInputLength
);
4035 // Negative Look-Behind, at top of loop checking for matches of LB expression
4036 // at all possible input starting positions.
4038 // Fetch the extra parameters of this op.
4039 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
4040 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
4041 if (!UTEXT_USES_U16(fInputText
)) {
4042 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
4043 // The max length need not be exact; it just needs to be >= actual maximum.
4046 int32_t continueLoc
= (int32_t)pat
[fp
->fPatIdx
++];
4047 continueLoc
= URX_VAL(continueLoc
);
4048 U_ASSERT(minML
<= maxML
);
4049 U_ASSERT(minML
>= 0);
4050 U_ASSERT(continueLoc
> fp
->fPatIdx
);
4052 // Fetch (from data) the last input index where a match was attempted.
4053 U_ASSERT(opValue
>=0 && opValue
+4<fPattern
->fDataSize
);
4054 int64_t &lbStartIdx
= fData
[opValue
+4];
4055 if (lbStartIdx
< 0) {
4056 // First time through loop.
4057 lbStartIdx
= fp
->fInputIdx
- minML
;
4058 if (lbStartIdx
> 0) {
4059 // move index to a code point boudary, if it's not on one already.
4060 UTEXT_SETNATIVEINDEX(fInputText
, lbStartIdx
);
4061 lbStartIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
4064 // 2nd through nth time through the loop.
4065 // Back up start position for match by one.
4066 if (lbStartIdx
== 0) {
4069 UTEXT_SETNATIVEINDEX(fInputText
, lbStartIdx
);
4070 (void)UTEXT_PREVIOUS32(fInputText
);
4071 lbStartIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
4075 if (lbStartIdx
< 0 || lbStartIdx
< fp
->fInputIdx
- maxML
) {
4076 // We have tried all potential match starting points without
4077 // getting a match, which means that the negative lookbehind as
4078 // a whole has succeeded. Jump forward to the continue location
4079 fActiveStart
= fData
[opValue
+2];
4080 fActiveLimit
= fData
[opValue
+3];
4081 U_ASSERT(fActiveStart
>= 0);
4082 U_ASSERT(fActiveLimit
<= fInputLength
);
4083 fp
->fPatIdx
= continueLoc
;
4087 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4088 // (successful match will cause a FAIL out of the loop altogether.)
4089 fp
= StateSave(fp
, fp
->fPatIdx
-4, status
);
4090 fp
->fInputIdx
= lbStartIdx
;
4095 // End of a negative look-behind block, after a successful match.
4097 U_ASSERT(opValue
>=0 && opValue
+4<fPattern
->fDataSize
);
4098 if (fp
->fInputIdx
!= fActiveLimit
) {
4099 // The look-behind expression matched, but the match did not
4100 // extend all the way to the point that we are looking behind from.
4101 // FAIL out of here, which will take us back to the LB_CONT, which
4102 // will retry the match starting at another position or succeed
4103 // the look-behind altogether, whichever is appropriate.
4104 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4108 // Look-behind expression matched, which means look-behind test as
4111 // Restore the orignal input string length, which had been truncated
4112 // inorder to pin the end of the lookbehind match
4113 // to the position being looked-behind.
4114 fActiveStart
= fData
[opValue
+2];
4115 fActiveLimit
= fData
[opValue
+3];
4116 U_ASSERT(fActiveStart
>= 0);
4117 U_ASSERT(fActiveLimit
<= fInputLength
);
4119 // Restore original stack position, discarding any state saved
4120 // by the successful pattern match.
4121 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
4122 int32_t newStackSize
= (int32_t)fData
[opValue
];
4123 U_ASSERT(fStack
->size() > newStackSize
);
4124 fStack
->setSize(newStackSize
);
4126 // FAIL, which will take control back to someplace
4127 // prior to entering the look-behind test.
4128 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4134 // Loop Initialization for the optimized implementation of
4135 // [some character set]*
4136 // This op scans through all matching input.
4137 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4139 U_ASSERT(opValue
> 0 && opValue
< fSets
->size());
4140 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
4141 UnicodeSet
*s
= (UnicodeSet
*)fSets
->elementAt(opValue
);
4143 // Loop through input, until either the input is exhausted or
4144 // we reach a character that is not a member of the set.
4145 int64_t ix
= fp
->fInputIdx
;
4146 UTEXT_SETNATIVEINDEX(fInputText
, ix
);
4148 if (ix
>= fActiveLimit
) {
4152 UChar32 c
= UTEXT_NEXT32(fInputText
);
4154 if (s8
->contains(c
) == FALSE
) {
4158 if (s
->contains(c
) == FALSE
) {
4162 ix
= UTEXT_GETNATIVEINDEX(fInputText
);
4165 // If there were no matching characters, skip over the loop altogether.
4166 // The loop doesn't run at all, a * op always succeeds.
4167 if (ix
== fp
->fInputIdx
) {
4168 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
4172 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4173 // must follow. It's operand is the stack location
4174 // that holds the starting input index for the match of this [set]*
4175 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
4176 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
4177 int32_t stackLoc
= URX_VAL(loopcOp
);
4178 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
4179 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
4182 // Save State to the URX_LOOP_C op that follows this one,
4183 // so that match failures in the following code will return to there.
4184 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4185 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
4191 case URX_LOOP_DOT_I
:
4192 // Loop Initialization for the optimized implementation of .*
4193 // This op scans through all remaining input.
4194 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4196 // Loop through input until the input is exhausted (we reach an end-of-line)
4197 // In DOTALL mode, we can just go straight to the end of the input.
4199 if ((opValue
& 1) == 1) {
4200 // Dot-matches-All mode. Jump straight to the end of the string.
4204 // NOT DOT ALL mode. Line endings do not match '.'
4205 // Scan forward until a line ending or end of input.
4207 UTEXT_SETNATIVEINDEX(fInputText
, ix
);
4209 if (ix
>= fActiveLimit
) {
4213 UChar32 c
= UTEXT_NEXT32(fInputText
);
4214 if ((c
& 0x7f) <= 0x29) { // Fast filter of non-new-line-s
4215 if ((c
== 0x0a) || // 0x0a is newline in both modes.
4216 (((opValue
& 2) == 0) && // IF not UNIX_LINES mode
4217 isLineTerminator(c
))) {
4218 // char is a line ending. Exit the scanning loop.
4222 ix
= UTEXT_GETNATIVEINDEX(fInputText
);
4226 // If there were no matching characters, skip over the loop altogether.
4227 // The loop doesn't run at all, a * op always succeeds.
4228 if (ix
== fp
->fInputIdx
) {
4229 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
4233 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4234 // must follow. It's operand is the stack location
4235 // that holds the starting input index for the match of this .*
4236 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
4237 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
4238 int32_t stackLoc
= URX_VAL(loopcOp
);
4239 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
4240 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
4243 // Save State to the URX_LOOP_C op that follows this one,
4244 // so that match failures in the following code will return to there.
4245 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4246 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
4254 U_ASSERT(opValue
>=0 && opValue
<fFrameSize
);
4255 backSearchIndex
= fp
->fExtra
[opValue
];
4256 U_ASSERT(backSearchIndex
<= fp
->fInputIdx
);
4257 if (backSearchIndex
== fp
->fInputIdx
) {
4258 // We've backed up the input idx to the point that the loop started.
4259 // The loop is done. Leave here without saving state.
4260 // Subsequent failures won't come back here.
4263 // Set up for the next iteration of the loop, with input index
4264 // backed up by one from the last time through,
4265 // and a state save to this instruction in case the following code fails again.
4266 // (We're going backwards because this loop emulates stack unwinding, not
4267 // the initial scan forward.)
4268 U_ASSERT(fp
->fInputIdx
> 0);
4269 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
4270 UChar32 prevC
= UTEXT_PREVIOUS32(fInputText
);
4271 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
4273 UChar32 twoPrevC
= UTEXT_PREVIOUS32(fInputText
);
4274 if (prevC
== 0x0a &&
4275 fp
->fInputIdx
> backSearchIndex
&&
4277 int32_t prevOp
= (int32_t)pat
[fp
->fPatIdx
-2];
4278 if (URX_TYPE(prevOp
) == URX_LOOP_DOT_I
) {
4279 // .*, stepping back over CRLF pair.
4280 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
4285 fp
= StateSave(fp
, fp
->fPatIdx
-1, status
);
4292 // Trouble. The compiled pattern contains an entry with an
4293 // unrecognized type tag.
4297 if (U_FAILURE(status
)) {
4306 fLastMatchEnd
= fMatchEnd
;
4307 fMatchStart
= startIdx
;
4308 fMatchEnd
= fp
->fInputIdx
;
4311 #ifdef REGEX_RUN_DEBUG
4314 printf("Match. start=%ld end=%ld\n\n", fMatchStart
, fMatchEnd
);
4316 printf("No match\n\n");
4321 fFrame
= fp
; // The active stack frame when the engine stopped.
4322 // Contains the capture group results that we need to
4328 //--------------------------------------------------------------------------------
4330 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
4331 // assumption that the entire string is available in the UText's
4332 // chunk buffer. For now, that means we can use int32_t indexes,
4333 // except for anything that needs to be saved (like group starts
4336 // startIdx: begin matching at this index.
4337 // toEnd: if true, match must extend to end of the input region
4339 //--------------------------------------------------------------------------------
4340 void RegexMatcher::MatchChunkAt(int32_t startIdx
, UBool toEnd
, UErrorCode
&status
) {
4341 UBool isMatch
= FALSE
; // True if the we have a match.
4343 int32_t backSearchIndex
= INT32_MAX
; // used after greedy single-character matches for searching backwards
4345 int32_t op
; // Operation from the compiled pattern, split into
4346 int32_t opType
; // the opcode
4347 int32_t opValue
; // and the operand value.
4349 #ifdef REGEX_RUN_DEBUG
4351 printf("MatchAt(startIdx=%d)\n", startIdx
);
4352 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern
->fPattern
))());
4353 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText
))());
4357 if (U_FAILURE(status
)) {
4361 // Cache frequently referenced items from the compiled pattern
4363 int64_t *pat
= fPattern
->fCompiledPat
->getBuffer();
4365 const UChar
*litText
= fPattern
->fLiteralText
.getBuffer();
4366 UVector
*fSets
= fPattern
->fSets
;
4368 const UChar
*inputBuf
= fInputText
->chunkContents
;
4370 fFrameSize
= fPattern
->fFrameSize
;
4371 REStackFrame
*fp
= resetStack();
4372 if (U_FAILURE(fDeferredStatus
)) {
4373 status
= fDeferredStatus
;
4378 fp
->fInputIdx
= startIdx
;
4380 // Zero out the pattern's static data
4382 for (i
= 0; i
<fPattern
->fDataSize
; i
++) {
4387 // Main loop for interpreting the compiled pattern.
4388 // One iteration of the loop per pattern operation performed.
4391 op
= (int32_t)pat
[fp
->fPatIdx
];
4392 opType
= URX_TYPE(op
);
4393 opValue
= URX_VAL(op
);
4394 #ifdef REGEX_RUN_DEBUG
4396 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
4397 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp
->fInputIdx
,
4398 UTEXT_CURRENT32(fInputText
), (int64_t *)fp
-fStack
->getBuffer(), fActiveLimit
);
4399 fPattern
->dumpOp(fp
->fPatIdx
);
4412 // Force a backtrack. In some circumstances, the pattern compiler
4413 // will notice that the pattern can't possibly match anything, and will
4414 // emit one of these at that point.
4415 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4420 if (fp
->fInputIdx
< fActiveLimit
) {
4422 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4429 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4435 // Test input against a literal string.
4436 // Strings require two slots in the compiled pattern, one for the
4437 // offset to the string text, and one for the length.
4438 int32_t stringStartIdx
= opValue
;
4441 op
= (int32_t)pat
[fp
->fPatIdx
]; // Fetch the second operand
4443 opType
= URX_TYPE(op
);
4444 stringLen
= URX_VAL(op
);
4445 U_ASSERT(opType
== URX_STRING_LEN
);
4446 U_ASSERT(stringLen
>= 2);
4448 const UChar
* pInp
= inputBuf
+ fp
->fInputIdx
;
4449 const UChar
* pInpLimit
= inputBuf
+ fActiveLimit
;
4450 const UChar
* pPat
= litText
+stringStartIdx
;
4451 const UChar
* pEnd
= pInp
+ stringLen
;
4452 UBool success
= TRUE
;
4453 while (pInp
< pEnd
) {
4454 if (pInp
>= pInpLimit
) {
4459 if (*pInp
++ != *pPat
++) {
4466 fp
->fInputIdx
+= stringLen
;
4468 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4474 case URX_STATE_SAVE
:
4475 fp
= StateSave(fp
, opValue
, status
);
4480 // The match loop will exit via this path on a successful match,
4481 // when we reach the end of the pattern.
4482 if (toEnd
&& fp
->fInputIdx
!= fActiveLimit
) {
4483 // The pattern matched, but not to the end of input. Try some more.
4484 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4490 // Start and End Capture stack frame variables are laid out out like this:
4491 // fp->fExtra[opValue] - The start of a completed capture group
4492 // opValue+1 - The end of a completed capture group
4493 // opValue+2 - the start of a capture group whose end
4494 // has not yet been reached (and might not ever be).
4495 case URX_START_CAPTURE
:
4496 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
4497 fp
->fExtra
[opValue
+2] = fp
->fInputIdx
;
4501 case URX_END_CAPTURE
:
4502 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
4503 U_ASSERT(fp
->fExtra
[opValue
+2] >= 0); // Start pos for this group must be set.
4504 fp
->fExtra
[opValue
] = fp
->fExtra
[opValue
+2]; // Tentative start becomes real.
4505 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // End position
4506 U_ASSERT(fp
->fExtra
[opValue
] <= fp
->fExtra
[opValue
+1]);
4510 case URX_DOLLAR
: // $, test for End of line
4511 // or for position before new line at end of input
4512 if (fp
->fInputIdx
< fAnchorLimit
-2) {
4513 // We are no where near the end of input. Fail.
4514 // This is the common case. Keep it first.
4515 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4518 if (fp
->fInputIdx
>= fAnchorLimit
) {
4519 // We really are at the end of input. Success.
4525 // If we are positioned just before a new-line that is located at the
4526 // end of input, succeed.
4527 if (fp
->fInputIdx
== fAnchorLimit
-1) {
4529 U16_GET(inputBuf
, fAnchorStart
, fp
->fInputIdx
, fAnchorLimit
, c
);
4531 if (isLineTerminator(c
)) {
4532 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& inputBuf
[fp
->fInputIdx
-1]==0x0d)) {
4533 // At new-line at end of input. Success
4539 } else if (fp
->fInputIdx
== fAnchorLimit
-2 &&
4540 inputBuf
[fp
->fInputIdx
]==0x0d && inputBuf
[fp
->fInputIdx
+1]==0x0a) {
4543 break; // At CR/LF at end of input. Success
4546 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4551 case URX_DOLLAR_D
: // $, test for End of Line, in UNIX_LINES mode.
4552 if (fp
->fInputIdx
>= fAnchorLimit
-1) {
4553 // Either at the last character of input, or off the end.
4554 if (fp
->fInputIdx
== fAnchorLimit
-1) {
4555 // At last char of input. Success if it's a new line.
4556 if (inputBuf
[fp
->fInputIdx
] == 0x0a) {
4562 // Off the end of input. Success.
4569 // Not at end of input. Back-track out.
4570 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4574 case URX_DOLLAR_M
: // $, test for End of line in multi-line mode
4576 if (fp
->fInputIdx
>= fAnchorLimit
) {
4577 // We really are at the end of input. Success.
4582 // If we are positioned just before a new-line, succeed.
4583 // It makes no difference where the new-line is within the input.
4584 UChar32 c
= inputBuf
[fp
->fInputIdx
];
4585 if (isLineTerminator(c
)) {
4586 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4587 // In multi-line mode, hitting a new-line just before the end of input does not
4588 // set the hitEnd or requireEnd flags
4589 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& inputBuf
[fp
->fInputIdx
-1]==0x0d)) {
4593 // not at a new line. Fail.
4594 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4599 case URX_DOLLAR_MD
: // $, test for End of line in multi-line and UNIX_LINES mode
4601 if (fp
->fInputIdx
>= fAnchorLimit
) {
4602 // We really are at the end of input. Success.
4604 fRequireEnd
= TRUE
; // Java set requireEnd in this case, even though
4605 break; // adding a new-line would not lose the match.
4607 // If we are not positioned just before a new-line, the test fails; backtrack out.
4608 // It makes no difference where the new-line is within the input.
4609 if (inputBuf
[fp
->fInputIdx
] != 0x0a) {
4610 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4616 case URX_CARET
: // ^, test for start of line
4617 if (fp
->fInputIdx
!= fAnchorStart
) {
4618 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4623 case URX_CARET_M
: // ^, test for start of line in mulit-line mode
4625 if (fp
->fInputIdx
== fAnchorStart
) {
4626 // We are at the start input. Success.
4629 // Check whether character just before the current pos is a new-line
4630 // unless we are at the end of input
4631 UChar c
= inputBuf
[fp
->fInputIdx
- 1];
4632 if ((fp
->fInputIdx
< fAnchorLimit
) &&
4633 isLineTerminator(c
)) {
4634 // It's a new-line. ^ is true. Success.
4635 // TODO: what should be done with positions between a CR and LF?
4638 // Not at the start of a line. Fail.
4639 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4644 case URX_CARET_M_UNIX
: // ^, test for start of line in mulit-line + Unix-line mode
4646 U_ASSERT(fp
->fInputIdx
>= fAnchorStart
);
4647 if (fp
->fInputIdx
<= fAnchorStart
) {
4648 // We are at the start input. Success.
4651 // Check whether character just before the current pos is a new-line
4652 U_ASSERT(fp
->fInputIdx
<= fAnchorLimit
);
4653 UChar c
= inputBuf
[fp
->fInputIdx
- 1];
4655 // Not at the start of a line. Back-track out.
4656 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4661 case URX_BACKSLASH_B
: // Test for word boundaries
4663 UBool success
= isChunkWordBoundary((int32_t)fp
->fInputIdx
);
4664 success
^= (UBool
)(opValue
!= 0); // flip sense for \B
4666 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4672 case URX_BACKSLASH_BU
: // Test for word boundaries, Unicode-style
4674 UBool success
= isUWordBoundary(fp
->fInputIdx
);
4675 success
^= (UBool
)(opValue
!= 0); // flip sense for \B
4677 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4683 case URX_BACKSLASH_D
: // Test for decimal digit
4685 if (fp
->fInputIdx
>= fActiveLimit
) {
4687 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4692 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4693 int8_t ctype
= u_charType(c
); // TODO: make a unicode set for this. Will be faster.
4694 UBool success
= (ctype
== U_DECIMAL_DIGIT_NUMBER
);
4695 success
^= (UBool
)(opValue
!= 0); // flip sense for \D
4697 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4703 case URX_BACKSLASH_G
: // Test for position at end of previous match
4704 if (!((fMatch
&& fp
->fInputIdx
==fMatchEnd
) || (fMatch
==FALSE
&& fp
->fInputIdx
==fActiveStart
))) {
4705 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4710 case URX_BACKSLASH_H
: // Test for \h, horizontal white space.
4712 if (fp
->fInputIdx
>= fActiveLimit
) {
4714 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4718 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4719 int8_t ctype
= u_charType(c
);
4720 UBool success
= (ctype
== U_SPACE_SEPARATOR
|| c
== 9); // SPACE_SEPARATOR || TAB
4721 success
^= (UBool
)(opValue
!= 0); // flip sense for \H
4723 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4729 case URX_BACKSLASH_R
: // Test for \R, any line break sequence.
4731 if (fp
->fInputIdx
>= fActiveLimit
) {
4733 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4737 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4738 if (isLineTerminator(c
)) {
4739 if (c
== 0x0d && fp
->fInputIdx
< fActiveLimit
) {
4740 // Check for CR/LF sequence. Consume both together when found.
4742 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c2
);
4744 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c2
);
4748 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4754 case URX_BACKSLASH_V
: // Any single code point line ending.
4756 if (fp
->fInputIdx
>= fActiveLimit
) {
4758 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4762 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4763 UBool success
= isLineTerminator(c
);
4764 success
^= (UBool
)(opValue
!= 0); // flip sense for \V
4766 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4773 case URX_BACKSLASH_X
:
4774 // Match a Grapheme, as defined by Unicode TR 29.
4775 // Differs slightly from Perl, which consumes combining marks independently
4779 // Fail if at end of input
4780 if (fp
->fInputIdx
>= fActiveLimit
) {
4782 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4786 // Examine (and consume) the current char.
4787 // Dispatch into a little state machine, based on the char.
4789 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4790 UnicodeSet
**sets
= fPattern
->fStaticSets
;
4791 if (sets
[URX_GC_NORMAL
]->contains(c
)) goto GC_Extend
;
4792 if (sets
[URX_GC_CONTROL
]->contains(c
)) goto GC_Control
;
4793 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
4794 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
4795 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
4796 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
4797 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
4803 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
4804 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4805 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
4806 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
4807 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
4808 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
4809 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
4813 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
4814 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4815 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
4816 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
4817 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
4821 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
4822 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4823 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
4824 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
4828 // Combining characters are consumed here
4830 if (fp
->fInputIdx
>= fActiveLimit
) {
4833 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4834 if (sets
[URX_GC_EXTEND
]->contains(c
) == FALSE
) {
4835 U16_BACK_1(inputBuf
, 0, fp
->fInputIdx
);
4842 // Most control chars stand alone (they don't combine with combining chars),
4843 // except that a CR/LF sequence is a single grapheme cluster.
4844 if (c
== 0x0d && fp
->fInputIdx
< fActiveLimit
&& inputBuf
[fp
->fInputIdx
] == 0x0a) {
4849 if (fp
->fInputIdx
>= fActiveLimit
) {
4858 case URX_BACKSLASH_Z
: // Test for end of Input
4859 if (fp
->fInputIdx
< fAnchorLimit
) {
4860 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4869 case URX_STATIC_SETREF
:
4871 // Test input character against one of the predefined sets
4872 // (Word Characters, for example)
4873 // The high bit of the op value is a flag for the match polarity.
4874 // 0: success if input char is in set.
4875 // 1: success if input char is not in set.
4876 if (fp
->fInputIdx
>= fActiveLimit
) {
4878 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4882 UBool success
= ((opValue
& URX_NEG_SET
) == URX_NEG_SET
);
4883 opValue
&= ~URX_NEG_SET
;
4884 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
4887 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4889 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
4890 if (s8
->contains(c
)) {
4894 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
4895 if (s
->contains(c
)) {
4900 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4906 case URX_STAT_SETREF_N
:
4908 // Test input character for NOT being a member of one of
4909 // the predefined sets (Word Characters, for example)
4910 if (fp
->fInputIdx
>= fActiveLimit
) {
4912 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4916 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
4919 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4921 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
4922 if (s8
->contains(c
) == FALSE
) {
4926 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
4927 if (s
->contains(c
) == FALSE
) {
4931 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4938 if (fp
->fInputIdx
>= fActiveLimit
) {
4940 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4944 U_ASSERT(opValue
> 0 && opValue
< fSets
->size());
4946 // There is input left. Pick up one char and test it for set membership.
4948 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4950 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
4951 if (s8
->contains(c
)) {
4952 // The character is in the set. A Match.
4956 UnicodeSet
*s
= (UnicodeSet
*)fSets
->elementAt(opValue
);
4957 if (s
->contains(c
)) {
4958 // The character is in the set. A Match.
4963 // the character wasn't in the set.
4964 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4971 // . matches anything, but stops at end-of-line.
4972 if (fp
->fInputIdx
>= fActiveLimit
) {
4973 // At end of input. Match failed. Backtrack out.
4975 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4979 // There is input left. Advance over one char, unless we've hit end-of-line
4981 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4982 if (isLineTerminator(c
)) {
4983 // End of line in normal mode. . does not match.
4984 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4991 case URX_DOTANY_ALL
:
4993 // . in dot-matches-all (including new lines) mode
4994 if (fp
->fInputIdx
>= fActiveLimit
) {
4995 // At end of input. Match failed. Backtrack out.
4997 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5001 // There is input left. Advance over one char, except if we are
5002 // at a cr/lf, advance over both of them.
5004 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
5005 if (c
==0x0d && fp
->fInputIdx
< fActiveLimit
) {
5006 // In the case of a CR/LF, we need to advance over both.
5007 if (inputBuf
[fp
->fInputIdx
] == 0x0a) {
5008 U16_FWD_1(inputBuf
, fp
->fInputIdx
, fActiveLimit
);
5015 case URX_DOTANY_UNIX
:
5017 // '.' operator, matches all, but stops at end-of-line.
5018 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
5019 if (fp
->fInputIdx
>= fActiveLimit
) {
5020 // At end of input. Match failed. Backtrack out.
5022 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5026 // There is input left. Advance over one char, unless we've hit end-of-line
5028 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
5030 // End of line in normal mode. '.' does not match the \n
5031 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5038 fp
->fPatIdx
= opValue
;
5046 U_ASSERT(opValue
< fPattern
->fCompiledPat
->size());
5047 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
5048 fp
->fPatIdx
= opValue
; // Then JMP.
5052 // This opcode is used with (x)+, when x can match a zero length string.
5053 // Same as JMP_SAV, except conditional on the match having made forward progress.
5054 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
5055 // data address of the input position at the start of the loop.
5057 U_ASSERT(opValue
> 0 && opValue
< fPattern
->fCompiledPat
->size());
5058 int32_t stoOp
= (int32_t)pat
[opValue
-1];
5059 U_ASSERT(URX_TYPE(stoOp
) == URX_STO_INP_LOC
);
5060 int32_t frameLoc
= URX_VAL(stoOp
);
5061 U_ASSERT(frameLoc
>= 0 && frameLoc
< fFrameSize
);
5062 int32_t prevInputIdx
= (int32_t)fp
->fExtra
[frameLoc
];
5063 U_ASSERT(prevInputIdx
<= fp
->fInputIdx
);
5064 if (prevInputIdx
< fp
->fInputIdx
) {
5065 // The match did make progress. Repeat the loop.
5066 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
5067 fp
->fPatIdx
= opValue
;
5068 fp
->fExtra
[frameLoc
] = fp
->fInputIdx
;
5070 // If the input position did not advance, we do nothing here,
5071 // execution will fall out of the loop.
5077 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
5078 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
5080 // Pick up the three extra operands that CTR_INIT has, and
5081 // skip the pattern location counter past
5082 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
5084 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
5085 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
5086 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
5087 U_ASSERT(minCount
>=0);
5088 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
5089 U_ASSERT(loopLoc
>=fp
->fPatIdx
);
5091 if (minCount
== 0) {
5092 fp
= StateSave(fp
, loopLoc
+1, status
);
5094 if (maxCount
== -1) {
5095 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // For loop breaking.
5096 } else if (maxCount
== 0) {
5097 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5104 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
5105 int32_t initOp
= (int32_t)pat
[opValue
];
5106 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT
);
5107 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
5108 int32_t minCount
= (int32_t)pat
[opValue
+2];
5109 int32_t maxCount
= (int32_t)pat
[opValue
+3];
5111 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
&& maxCount
!= -1) {
5112 U_ASSERT(*pCounter
== maxCount
);
5115 if (*pCounter
>= minCount
) {
5116 if (maxCount
== -1) {
5117 // Loop has no hard upper bound.
5118 // Check that it is progressing through the input, break if it is not.
5119 int64_t *pLastInputIdx
= &fp
->fExtra
[URX_VAL(initOp
) + 1];
5120 if (fp
->fInputIdx
== *pLastInputIdx
) {
5123 *pLastInputIdx
= fp
->fInputIdx
;
5126 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5128 // Increment time-out counter. (StateSave() does it if count >= minCount)
5130 if (fTickCounter
<= 0) {
5131 IncrementTime(status
); // Re-initializes fTickCounter
5134 fp
->fPatIdx
= opValue
+ 4; // Loop back.
5138 case URX_CTR_INIT_NG
:
5140 // Initialize a non-greedy loop
5141 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
5142 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
5144 // Pick up the three extra operands that CTR_INIT_NG has, and
5145 // skip the pattern location counter past
5146 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
5148 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
5149 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
5150 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
5151 U_ASSERT(minCount
>=0);
5152 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
5153 U_ASSERT(loopLoc
>fp
->fPatIdx
);
5154 if (maxCount
== -1) {
5155 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // Save initial input index for loop breaking.
5158 if (minCount
== 0) {
5159 if (maxCount
!= 0) {
5160 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5162 fp
->fPatIdx
= loopLoc
+1; // Continue with stuff after repeated block
5167 case URX_CTR_LOOP_NG
:
5169 // Non-greedy {min, max} loops
5170 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
5171 int32_t initOp
= (int32_t)pat
[opValue
];
5172 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT_NG
);
5173 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
5174 int32_t minCount
= (int32_t)pat
[opValue
+2];
5175 int32_t maxCount
= (int32_t)pat
[opValue
+3];
5178 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
&& maxCount
!= -1) {
5179 // The loop has matched the maximum permitted number of times.
5180 // Break out of here with no action. Matching will
5181 // continue with the following pattern.
5182 U_ASSERT(*pCounter
== maxCount
);
5186 if (*pCounter
< minCount
) {
5187 // We haven't met the minimum number of matches yet.
5188 // Loop back for another one.
5189 fp
->fPatIdx
= opValue
+ 4; // Loop back.
5191 if (fTickCounter
<= 0) {
5192 IncrementTime(status
); // Re-initializes fTickCounter
5195 // We do have the minimum number of matches.
5197 // If there is no upper bound on the loop iterations, check that the input index
5198 // is progressing, and stop the loop if it is not.
5199 if (maxCount
== -1) {
5200 int64_t *pLastInputIdx
= &fp
->fExtra
[URX_VAL(initOp
) + 1];
5201 if (fp
->fInputIdx
== *pLastInputIdx
) {
5204 *pLastInputIdx
= fp
->fInputIdx
;
5207 // Loop Continuation: we will fall into the pattern following the loop
5208 // (non-greedy, don't execute loop body first), but first do
5209 // a state save to the top of the loop, so that a match failure
5210 // in the following pattern will try another iteration of the loop.
5211 fp
= StateSave(fp
, opValue
+ 4, status
);
5217 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
5218 fData
[opValue
] = fStack
->size();
5223 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
5224 int32_t newStackSize
= (int32_t)fData
[opValue
];
5225 U_ASSERT(newStackSize
<= fStack
->size());
5226 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
5227 if (newFP
== (int64_t *)fp
) {
5231 for (j
=0; j
<fFrameSize
; j
++) {
5232 newFP
[j
] = ((int64_t *)fp
)[j
];
5234 fp
= (REStackFrame
*)newFP
;
5235 fStack
->setSize(newStackSize
);
5241 U_ASSERT(opValue
< fFrameSize
);
5242 int64_t groupStartIdx
= fp
->fExtra
[opValue
];
5243 int64_t groupEndIdx
= fp
->fExtra
[opValue
+1];
5244 U_ASSERT(groupStartIdx
<= groupEndIdx
);
5245 int64_t inputIndex
= fp
->fInputIdx
;
5246 if (groupStartIdx
< 0) {
5247 // This capture group has not participated in the match thus far,
5248 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
5251 UBool success
= TRUE
;
5252 for (int64_t groupIndex
= groupStartIdx
; groupIndex
< groupEndIdx
; ++groupIndex
,++inputIndex
) {
5253 if (inputIndex
>= fActiveLimit
) {
5258 if (inputBuf
[groupIndex
] != inputBuf
[inputIndex
]) {
5263 if (success
&& groupStartIdx
< groupEndIdx
&& U16_IS_LEAD(inputBuf
[groupEndIdx
-1]) &&
5264 inputIndex
< fActiveLimit
&& U16_IS_TRAIL(inputBuf
[inputIndex
])) {
5265 // Capture group ended with an unpaired lead surrogate.
5266 // A back reference is not permitted to match only the lead of a surrogate pair.
5270 fp
->fInputIdx
= inputIndex
;
5272 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5279 U_ASSERT(opValue
< fFrameSize
);
5280 int64_t groupStartIdx
= fp
->fExtra
[opValue
];
5281 int64_t groupEndIdx
= fp
->fExtra
[opValue
+1];
5282 U_ASSERT(groupStartIdx
<= groupEndIdx
);
5283 if (groupStartIdx
< 0) {
5284 // This capture group has not participated in the match thus far,
5285 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
5288 CaseFoldingUCharIterator
captureGroupItr(inputBuf
, groupStartIdx
, groupEndIdx
);
5289 CaseFoldingUCharIterator
inputItr(inputBuf
, fp
->fInputIdx
, fActiveLimit
);
5291 // Note: if the capture group match was of an empty string the backref
5292 // match succeeds. Verified by testing: Perl matches succeed
5293 // in this case, so we do too.
5295 UBool success
= TRUE
;
5297 UChar32 captureGroupChar
= captureGroupItr
.next();
5298 if (captureGroupChar
== U_SENTINEL
) {
5302 UChar32 inputChar
= inputItr
.next();
5303 if (inputChar
== U_SENTINEL
) {
5308 if (inputChar
!= captureGroupChar
) {
5314 if (success
&& inputItr
.inExpansion()) {
5315 // We obtained a match by consuming part of a string obtained from
5316 // case-folding a single code point of the input text.
5317 // This does not count as an overall match.
5322 fp
->fInputIdx
= inputItr
.getIndex();
5324 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5329 case URX_STO_INP_LOC
:
5331 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
);
5332 fp
->fExtra
[opValue
] = fp
->fInputIdx
;
5338 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
5340 int32_t dataLoc
= URX_VAL(pat
[instrOperandLoc
]);
5341 U_ASSERT(dataLoc
>= 0 && dataLoc
< fFrameSize
);
5342 int32_t savedInputIdx
= (int32_t)fp
->fExtra
[dataLoc
];
5343 U_ASSERT(savedInputIdx
<= fp
->fInputIdx
);
5344 if (savedInputIdx
< fp
->fInputIdx
) {
5345 fp
->fPatIdx
= opValue
; // JMP
5347 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no progress in loop.
5354 // Entering a look around block.
5355 // Save Stack Ptr, Input Pos.
5356 U_ASSERT(opValue
>=0 && opValue
+3<fPattern
->fDataSize
);
5357 fData
[opValue
] = fStack
->size();
5358 fData
[opValue
+1] = fp
->fInputIdx
;
5359 fData
[opValue
+2] = fActiveStart
;
5360 fData
[opValue
+3] = fActiveLimit
;
5361 fActiveStart
= fLookStart
; // Set the match region change for
5362 fActiveLimit
= fLookLimit
; // transparent bounds.
5368 // Leaving a look around block.
5369 // restore Stack Ptr, Input Pos to positions they had on entry to block.
5370 U_ASSERT(opValue
>=0 && opValue
+3<fPattern
->fDataSize
);
5371 int32_t stackSize
= fStack
->size();
5372 int32_t newStackSize
= (int32_t)fData
[opValue
];
5373 U_ASSERT(stackSize
>= newStackSize
);
5374 if (stackSize
> newStackSize
) {
5375 // Copy the current top frame back to the new (cut back) top frame.
5376 // This makes the capture groups from within the look-ahead
5377 // expression available.
5378 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
5380 for (j
=0; j
<fFrameSize
; j
++) {
5381 newFP
[j
] = ((int64_t *)fp
)[j
];
5383 fp
= (REStackFrame
*)newFP
;
5384 fStack
->setSize(newStackSize
);
5386 fp
->fInputIdx
= fData
[opValue
+1];
5388 // Restore the active region bounds in the input string; they may have
5389 // been changed because of transparent bounds on a Region.
5390 fActiveStart
= fData
[opValue
+2];
5391 fActiveLimit
= fData
[opValue
+3];
5392 U_ASSERT(fActiveStart
>= 0);
5393 U_ASSERT(fActiveLimit
<= fInputLength
);
5398 if (fp
->fInputIdx
< fActiveLimit
) {
5400 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
5401 if (u_foldCase(c
, U_FOLD_CASE_DEFAULT
) == opValue
) {
5407 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5411 // Case-insensitive test input against a literal string.
5412 // Strings require two slots in the compiled pattern, one for the
5413 // offset to the string text, and one for the length.
5414 // The compiled string has already been case folded.
5416 const UChar
*patternString
= litText
+ opValue
;
5418 op
= (int32_t)pat
[fp
->fPatIdx
];
5420 opType
= URX_TYPE(op
);
5421 opValue
= URX_VAL(op
);
5422 U_ASSERT(opType
== URX_STRING_LEN
);
5423 int32_t patternStringLen
= opValue
; // Length of the string from the pattern.
5427 UBool success
= TRUE
;
5428 int32_t patternStringIdx
= 0;
5429 CaseFoldingUCharIterator
inputIterator(inputBuf
, fp
->fInputIdx
, fActiveLimit
);
5430 while (patternStringIdx
< patternStringLen
) {
5431 U16_NEXT(patternString
, patternStringIdx
, patternStringLen
, cPattern
);
5432 cText
= inputIterator
.next();
5433 if (cText
!= cPattern
) {
5435 if (cText
== U_SENTINEL
) {
5441 if (inputIterator
.inExpansion()) {
5446 fp
->fInputIdx
= inputIterator
.getIndex();
5448 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5455 // Entering a look-behind block.
5456 // Save Stack Ptr, Input Pos and active input region.
5457 // TODO: implement transparent bounds. Ticket #6067
5458 U_ASSERT(opValue
>=0 && opValue
+4<fPattern
->fDataSize
);
5459 fData
[opValue
] = fStack
->size();
5460 fData
[opValue
+1] = fp
->fInputIdx
;
5461 // Save input string length, then reset to pin any matches to end at
5462 // the current position.
5463 fData
[opValue
+2] = fActiveStart
;
5464 fData
[opValue
+3] = fActiveLimit
;
5465 fActiveStart
= fRegionStart
;
5466 fActiveLimit
= fp
->fInputIdx
;
5467 // Init the variable containing the start index for attempted matches.
5468 fData
[opValue
+4] = -1;
5475 // Positive Look-Behind, at top of loop checking for matches of LB expression
5476 // at all possible input starting positions.
5478 // Fetch the min and max possible match lengths. They are the operands
5479 // of this op in the pattern.
5480 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
5481 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
5482 U_ASSERT(minML
<= maxML
);
5483 U_ASSERT(minML
>= 0);
5485 // Fetch (from data) the last input index where a match was attempted.
5486 U_ASSERT(opValue
>=0 && opValue
+4<fPattern
->fDataSize
);
5487 int64_t &lbStartIdx
= fData
[opValue
+4];
5488 if (lbStartIdx
< 0) {
5489 // First time through loop.
5490 lbStartIdx
= fp
->fInputIdx
- minML
;
5491 if (lbStartIdx
> 0 && lbStartIdx
< fInputLength
) {
5492 U16_SET_CP_START(inputBuf
, 0, lbStartIdx
);
5495 // 2nd through nth time through the loop.
5496 // Back up start position for match by one.
5497 if (lbStartIdx
== 0) {
5500 U16_BACK_1(inputBuf
, 0, lbStartIdx
);
5504 if (lbStartIdx
< 0 || lbStartIdx
< fp
->fInputIdx
- maxML
) {
5505 // We have tried all potential match starting points without
5506 // getting a match. Backtrack out, and out of the
5507 // Look Behind altogether.
5508 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5509 fActiveStart
= fData
[opValue
+2];
5510 fActiveLimit
= fData
[opValue
+3];
5511 U_ASSERT(fActiveStart
>= 0);
5512 U_ASSERT(fActiveLimit
<= fInputLength
);
5516 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5517 // (successful match will fall off the end of the loop.)
5518 fp
= StateSave(fp
, fp
->fPatIdx
-3, status
);
5519 fp
->fInputIdx
= lbStartIdx
;
5524 // End of a look-behind block, after a successful match.
5526 U_ASSERT(opValue
>=0 && opValue
+4<fPattern
->fDataSize
);
5527 if (fp
->fInputIdx
!= fActiveLimit
) {
5528 // The look-behind expression matched, but the match did not
5529 // extend all the way to the point that we are looking behind from.
5530 // FAIL out of here, which will take us back to the LB_CONT, which
5531 // will retry the match starting at another position or fail
5532 // the look-behind altogether, whichever is appropriate.
5533 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5537 // Look-behind match is good. Restore the original input string region,
5538 // which had been truncated to pin the end of the lookbehind match to the
5539 // position being looked-behind.
5540 fActiveStart
= fData
[opValue
+2];
5541 fActiveLimit
= fData
[opValue
+3];
5542 U_ASSERT(fActiveStart
>= 0);
5543 U_ASSERT(fActiveLimit
<= fInputLength
);
5550 // Negative Look-Behind, at top of loop checking for matches of LB expression
5551 // at all possible input starting positions.
5553 // Fetch the extra parameters of this op.
5554 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
5555 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
5556 int32_t continueLoc
= (int32_t)pat
[fp
->fPatIdx
++];
5557 continueLoc
= URX_VAL(continueLoc
);
5558 U_ASSERT(minML
<= maxML
);
5559 U_ASSERT(minML
>= 0);
5560 U_ASSERT(continueLoc
> fp
->fPatIdx
);
5562 // Fetch (from data) the last input index where a match was attempted.
5563 U_ASSERT(opValue
>=0 && opValue
+4<fPattern
->fDataSize
);
5564 int64_t &lbStartIdx
= fData
[opValue
+4];
5565 if (lbStartIdx
< 0) {
5566 // First time through loop.
5567 lbStartIdx
= fp
->fInputIdx
- minML
;
5568 if (lbStartIdx
> 0 && lbStartIdx
< fInputLength
) {
5569 U16_SET_CP_START(inputBuf
, 0, lbStartIdx
);
5572 // 2nd through nth time through the loop.
5573 // Back up start position for match by one.
5574 if (lbStartIdx
== 0) {
5575 lbStartIdx
--; // Because U16_BACK is unsafe starting at 0.
5577 U16_BACK_1(inputBuf
, 0, lbStartIdx
);
5581 if (lbStartIdx
< 0 || lbStartIdx
< fp
->fInputIdx
- maxML
) {
5582 // We have tried all potential match starting points without
5583 // getting a match, which means that the negative lookbehind as
5584 // a whole has succeeded. Jump forward to the continue location
5585 fActiveStart
= fData
[opValue
+2];
5586 fActiveLimit
= fData
[opValue
+3];
5587 U_ASSERT(fActiveStart
>= 0);
5588 U_ASSERT(fActiveLimit
<= fInputLength
);
5589 fp
->fPatIdx
= continueLoc
;
5593 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5594 // (successful match will cause a FAIL out of the loop altogether.)
5595 fp
= StateSave(fp
, fp
->fPatIdx
-4, status
);
5596 fp
->fInputIdx
= lbStartIdx
;
5601 // End of a negative look-behind block, after a successful match.
5603 U_ASSERT(opValue
>=0 && opValue
+4<fPattern
->fDataSize
);
5604 if (fp
->fInputIdx
!= fActiveLimit
) {
5605 // The look-behind expression matched, but the match did not
5606 // extend all the way to the point that we are looking behind from.
5607 // FAIL out of here, which will take us back to the LB_CONT, which
5608 // will retry the match starting at another position or succeed
5609 // the look-behind altogether, whichever is appropriate.
5610 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5614 // Look-behind expression matched, which means look-behind test as
5617 // Restore the original input string length, which had been truncated
5618 // in order to pin the end of the lookbehind match
5619 // to the position being looked-behind.
5620 fActiveStart
= fData
[opValue
+2];
5621 fActiveLimit
= fData
[opValue
+3];
5622 U_ASSERT(fActiveStart
>= 0);
5623 U_ASSERT(fActiveLimit
<= fInputLength
);
5625 // Restore original stack position, discarding any state saved
5626 // by the successful pattern match.
5627 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5628 int32_t newStackSize
= (int32_t)fData
[opValue
];
5629 U_ASSERT(fStack
->size() > newStackSize
);
5630 fStack
->setSize(newStackSize
);
5632 // FAIL, which will take control back to someplace
5633 // prior to entering the look-behind test.
5634 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5640 // Loop Initialization for the optimized implementation of
5641 // [some character set]*
5642 // This op scans through all matching input.
5643 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5645 U_ASSERT(opValue
> 0 && opValue
< fSets
->size());
5646 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
5647 UnicodeSet
*s
= (UnicodeSet
*)fSets
->elementAt(opValue
);
5649 // Loop through input, until either the input is exhausted or
5650 // we reach a character that is not a member of the set.
5651 int32_t ix
= (int32_t)fp
->fInputIdx
;
5653 if (ix
>= fActiveLimit
) {
5658 U16_NEXT(inputBuf
, ix
, fActiveLimit
, c
);
5660 if (s8
->contains(c
) == FALSE
) {
5661 U16_BACK_1(inputBuf
, 0, ix
);
5665 if (s
->contains(c
) == FALSE
) {
5666 U16_BACK_1(inputBuf
, 0, ix
);
5672 // If there were no matching characters, skip over the loop altogether.
5673 // The loop doesn't run at all, a * op always succeeds.
5674 if (ix
== fp
->fInputIdx
) {
5675 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
5679 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5680 // must follow. Its operand is the stack location
5681 // that holds the starting input index for the match of this [set]*
5682 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
5683 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
5684 int32_t stackLoc
= URX_VAL(loopcOp
);
5685 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
5686 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
5689 // Save State to the URX_LOOP_C op that follows this one,
5690 // so that match failures in the following code will return to there.
5691 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5692 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5698 case URX_LOOP_DOT_I
:
5699 // Loop Initialization for the optimized implementation of .*
5700 // This op scans through all remaining input.
5701 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5703 // Loop through input until the input is exhausted (we reach an end-of-line)
5704 // In DOTALL mode, we can just go straight to the end of the input.
5706 if ((opValue
& 1) == 1) {
5707 // Dot-matches-All mode. Jump straight to the end of the string.
5708 ix
= (int32_t)fActiveLimit
;
5711 // NOT DOT ALL mode. Line endings do not match '.'
5712 // Scan forward until a line ending or end of input.
5713 ix
= (int32_t)fp
->fInputIdx
;
5715 if (ix
>= fActiveLimit
) {
5720 U16_NEXT(inputBuf
, ix
, fActiveLimit
, c
); // c = inputBuf[ix++]
5721 if ((c
& 0x7f) <= 0x29) { // Fast filter of non-new-line-s
5722 if ((c
== 0x0a) || // 0x0a is newline in both modes.
5723 (((opValue
& 2) == 0) && // IF not UNIX_LINES mode
5724 isLineTerminator(c
))) {
5725 // char is a line ending. Put the input pos back to the
5726 // line ending char, and exit the scanning loop.
5727 U16_BACK_1(inputBuf
, 0, ix
);
5734 // If there were no matching characters, skip over the loop altogether.
5735 // The loop doesn't run at all, a * op always succeeds.
5736 if (ix
== fp
->fInputIdx
) {
5737 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
5741 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5742 // must follow. Its operand is the stack location
5743 // that holds the starting input index for the match of this .*
5744 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
5745 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
5746 int32_t stackLoc
= URX_VAL(loopcOp
);
5747 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
5748 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
5751 // Save State to the URX_LOOP_C op that follows this one,
5752 // so that match failures in the following code will return to there.
5753 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5754 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5762 U_ASSERT(opValue
>=0 && opValue
<fFrameSize
);
5763 backSearchIndex
= (int32_t)fp
->fExtra
[opValue
];
5764 U_ASSERT(backSearchIndex
<= fp
->fInputIdx
);
5765 if (backSearchIndex
== fp
->fInputIdx
) {
5766 // We've backed up the input idx to the point that the loop started.
5767 // The loop is done. Leave here without saving state.
5768 // Subsequent failures won't come back here.
5771 // Set up for the next iteration of the loop, with input index
5772 // backed up by one from the last time through,
5773 // and a state save to this instruction in case the following code fails again.
5774 // (We're going backwards because this loop emulates stack unwinding, not
5775 // the initial scan forward.)
5776 U_ASSERT(fp
->fInputIdx
> 0);
5778 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, prevC
); // !!!: should this 0 be one of f*Limit?
5780 if (prevC
== 0x0a &&
5781 fp
->fInputIdx
> backSearchIndex
&&
5782 inputBuf
[fp
->fInputIdx
-1] == 0x0d) {
5783 int32_t prevOp
= (int32_t)pat
[fp
->fPatIdx
-2];
5784 if (URX_TYPE(prevOp
) == URX_LOOP_DOT_I
) {
5785 // .*, stepping back over CRLF pair.
5786 U16_BACK_1(inputBuf
, 0, fp
->fInputIdx
);
5791 fp
= StateSave(fp
, fp
->fPatIdx
-1, status
);
5798 // Trouble. The compiled pattern contains an entry with an
5799 // unrecognized type tag.
5803 if (U_FAILURE(status
)) {
5812 fLastMatchEnd
= fMatchEnd
;
5813 fMatchStart
= startIdx
;
5814 fMatchEnd
= fp
->fInputIdx
;
5817 #ifdef REGEX_RUN_DEBUG
5820 printf("Match. start=%ld end=%ld\n\n", fMatchStart
, fMatchEnd
);
5822 printf("No match\n\n");
5827 fFrame
= fp
; // The active stack frame when the engine stopped.
5828 // Contains the capture group results that we need to
// Emit the run-time type identification support for class RegexMatcher.
// NOTE(review): the macro is defined outside this file; presumably it expands
// to the getStaticClassID()/getDynamicClassID() definitions for the named
// class -- confirm against the macro's definition in unicode/uobject.h.
5835 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher
)
5839 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS