1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **************************************************************************
5 * Copyright (C) 2002-2016 International Business Machines Corporation
6 * and others. All rights reserved.
7 **************************************************************************
12 // Contains the implementation of class RegexMatcher,
13 // which is one of the main API classes for the ICU regular expression package.
16 #include "unicode/utypes.h"
17 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
19 #include "unicode/regex.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/ustring.h"
23 #include "unicode/rbbi.h"
24 #include "unicode/utf.h"
25 #include "unicode/utf16.h"
37 // #include <malloc.h> // Needed for heapcheck testing
42 // Default limit for the size of the back track stack, to avoid system
//   failures caused by heap exhaustion.  Units are in 32 bit words, not bytes.
44 // This value puts ICU's limits higher than most other regexp implementations,
45 // which use recursion rather than the heap, and take more storage per
// Default backtrack stack limit, in 32 bit words (not bytes).
static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
50 // Time limit counter constant.
51 // Time limits for expression evaluation are in terms of quanta of work by
52 // the engine, each of which is 10,000 state saves.
53 // This constant determines that state saves per tick number.
// Number of state saves that make up one time-limit tick.
static const int32_t TIMER_INITIAL_VALUE = 10000;
57 // Test for any of the Unicode line terminating characters.
58 static inline UBool
isLineTerminator(UChar32 c
) {
59 if (c
& ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
62 return (c
<=0x0d && c
>=0x0a) || c
==0x85 || c
==0x2028 || c
==0x2029;
65 //-----------------------------------------------------------------------------
67 // Constructor and Destructor
69 //-----------------------------------------------------------------------------
70 RegexMatcher::RegexMatcher(const RegexPattern
*pat
) {
71 fDeferredStatus
= U_ZERO_ERROR
;
72 init(fDeferredStatus
);
73 if (U_FAILURE(fDeferredStatus
)) {
77 fDeferredStatus
= U_ILLEGAL_ARGUMENT_ERROR
;
81 init2(RegexStaticSets::gStaticSets
->fEmptyText
, fDeferredStatus
);
86 RegexMatcher::RegexMatcher(const UnicodeString
®exp
, const UnicodeString
&input
,
87 uint32_t flags
, UErrorCode
&status
) {
89 if (U_FAILURE(status
)) {
93 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
94 fPattern
= fPatternOwned
;
96 UText inputText
= UTEXT_INITIALIZER
;
97 utext_openConstUnicodeString(&inputText
, &input
, &status
);
98 init2(&inputText
, status
);
99 utext_close(&inputText
);
101 fInputUniStrMaybeMutable
= TRUE
;
105 RegexMatcher::RegexMatcher(UText
*regexp
, UText
*input
,
106 uint32_t flags
, UErrorCode
&status
) {
108 if (U_FAILURE(status
)) {
112 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
113 if (U_FAILURE(status
)) {
117 fPattern
= fPatternOwned
;
118 init2(input
, status
);
122 RegexMatcher::RegexMatcher(const UnicodeString
®exp
,
123 uint32_t flags
, UErrorCode
&status
) {
125 if (U_FAILURE(status
)) {
129 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
130 if (U_FAILURE(status
)) {
133 fPattern
= fPatternOwned
;
134 init2(RegexStaticSets::gStaticSets
->fEmptyText
, status
);
137 RegexMatcher::RegexMatcher(UText
*regexp
,
138 uint32_t flags
, UErrorCode
&status
) {
140 if (U_FAILURE(status
)) {
144 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
145 if (U_FAILURE(status
)) {
149 fPattern
= fPatternOwned
;
150 init2(RegexStaticSets::gStaticSets
->fEmptyText
, status
);
156 RegexMatcher::~RegexMatcher() {
158 if (fData
!= fSmallData
) {
163 delete fPatternOwned
;
164 fPatternOwned
= NULL
;
172 utext_close(fInputText
);
175 utext_close(fAltInputText
);
178 #if UCONFIG_NO_BREAK_ITERATION==0
179 delete fWordBreakItr
;
184 // init() common initialization for use by all constructors.
185 // Initialize all fields, get the object into a consistent state.
186 // This must be done even when the initial status shows an error,
187 // so that the object is initialized sufficiently well for the destructor
190 void RegexMatcher::init(UErrorCode
&status
) {
192 fPatternOwned
= NULL
;
202 fTransparentBounds
= FALSE
;
203 fAnchoringBounds
= TRUE
;
216 fStackLimit
= DEFAULT_BACKTRACK_STACK_CAPACITY
;
218 fCallbackContext
= NULL
;
219 fFindProgressCallbackFn
= NULL
;
220 fFindProgressCallbackContext
= NULL
;
222 fDeferredStatus
= status
;
224 fWordBreakItr
= NULL
;
228 fAltInputText
= NULL
;
231 fInputUniStrMaybeMutable
= FALSE
;
235 // init2() Common initialization for use by RegexMatcher constructors, part 2.
236 // This handles the common setup to be done after the Pattern is available.
238 void RegexMatcher::init2(UText
*input
, UErrorCode
&status
) {
239 if (U_FAILURE(status
)) {
240 fDeferredStatus
= status
;
244 if (fPattern
->fDataSize
> UPRV_LENGTHOF(fSmallData
)) {
245 fData
= (int64_t *)uprv_malloc(fPattern
->fDataSize
* sizeof(int64_t));
247 status
= fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
252 fStack
= new UVector64(status
);
253 if (fStack
== NULL
) {
254 status
= fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
259 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY
, status
);
260 if (U_FAILURE(status
)) {
261 fDeferredStatus
= status
;
267 static const UChar BACKSLASH
= 0x5c;
268 static const UChar DOLLARSIGN
= 0x24;
269 static const UChar LEFTBRACKET
= 0x7b;
270 static const UChar RIGHTBRACKET
= 0x7d;
272 //--------------------------------------------------------------------------------
276 //--------------------------------------------------------------------------------
277 RegexMatcher
&RegexMatcher::appendReplacement(UnicodeString
&dest
,
278 const UnicodeString
&replacement
,
279 UErrorCode
&status
) {
280 UText replacementText
= UTEXT_INITIALIZER
;
282 utext_openConstUnicodeString(&replacementText
, &replacement
, &status
);
283 if (U_SUCCESS(status
)) {
284 UText resultText
= UTEXT_INITIALIZER
;
285 utext_openUnicodeString(&resultText
, &dest
, &status
);
287 if (U_SUCCESS(status
)) {
288 appendReplacement(&resultText
, &replacementText
, status
);
289 utext_close(&resultText
);
291 utext_close(&replacementText
);
298 // appendReplacement, UText mode
300 RegexMatcher
&RegexMatcher::appendReplacement(UText
*dest
,
302 UErrorCode
&status
) {
303 if (U_FAILURE(status
)) {
306 if (U_FAILURE(fDeferredStatus
)) {
307 status
= fDeferredStatus
;
310 if (fMatch
== FALSE
) {
311 status
= U_REGEX_INVALID_STATE
;
315 // Copy input string from the end of previous match to start of current match
316 int64_t destLen
= utext_nativeLength(dest
);
317 if (fMatchStart
> fAppendPosition
) {
318 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
319 destLen
+= utext_replace(dest
, destLen
, destLen
, fInputText
->chunkContents
+fAppendPosition
,
320 (int32_t)(fMatchStart
-fAppendPosition
), &status
);
323 if (UTEXT_USES_U16(fInputText
)) {
324 len16
= (int32_t)(fMatchStart
-fAppendPosition
);
326 UErrorCode lengthStatus
= U_ZERO_ERROR
;
327 len16
= utext_extract(fInputText
, fAppendPosition
, fMatchStart
, NULL
, 0, &lengthStatus
);
329 UChar
*inputChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(len16
+1));
330 if (inputChars
== NULL
) {
331 status
= U_MEMORY_ALLOCATION_ERROR
;
334 utext_extract(fInputText
, fAppendPosition
, fMatchStart
, inputChars
, len16
+1, &status
);
335 destLen
+= utext_replace(dest
, destLen
, destLen
, inputChars
, len16
, &status
);
336 uprv_free(inputChars
);
339 fAppendPosition
= fMatchEnd
;
342 // scan the replacement text, looking for substitutions ($n) and \escapes.
343 // TODO: optimize this loop by efficiently scanning for '$' or '\',
344 // move entire ranges not containing substitutions.
345 UTEXT_SETNATIVEINDEX(replacement
, 0);
346 for (UChar32 c
= UTEXT_NEXT32(replacement
); U_SUCCESS(status
) && c
!= U_SENTINEL
; c
= UTEXT_NEXT32(replacement
)) {
347 if (c
== BACKSLASH
) {
348 // Backslash Escape. Copy the following char out without further checks.
349 // Note: Surrogate pairs don't need any special handling
350 // The second half wont be a '$' or a '\', and
351 // will move to the dest normally on the next
353 c
= UTEXT_CURRENT32(replacement
);
354 if (c
== U_SENTINEL
) {
358 if (c
==0x55/*U*/ || c
==0x75/*u*/) {
359 // We have a \udddd or \Udddddddd escape sequence.
361 struct URegexUTextUnescapeCharContext context
= U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement
);
362 UChar32 escapedChar
= u_unescapeAt(uregex_utext_unescape_charAt
, &offset
, INT32_MAX
, &context
);
363 if (escapedChar
!= (UChar32
)0xFFFFFFFF) {
364 if (U_IS_BMP(escapedChar
)) {
365 UChar c16
= (UChar
)escapedChar
;
366 destLen
+= utext_replace(dest
, destLen
, destLen
, &c16
, 1, &status
);
369 surrogate
[0] = U16_LEAD(escapedChar
);
370 surrogate
[1] = U16_TRAIL(escapedChar
);
371 if (U_SUCCESS(status
)) {
372 destLen
+= utext_replace(dest
, destLen
, destLen
, surrogate
, 2, &status
);
375 // TODO: Report errors for mal-formed \u escapes?
376 // As this is, the original sequence is output, which may be OK.
377 if (context
.lastOffset
== offset
) {
378 (void)UTEXT_PREVIOUS32(replacement
);
379 } else if (context
.lastOffset
!= offset
-1) {
380 utext_moveIndex32(replacement
, offset
- context
.lastOffset
- 1);
384 (void)UTEXT_NEXT32(replacement
);
385 // Plain backslash escape. Just put out the escaped character.
387 UChar c16
= (UChar
)c
;
388 destLen
+= utext_replace(dest
, destLen
, destLen
, &c16
, 1, &status
);
391 surrogate
[0] = U16_LEAD(c
);
392 surrogate
[1] = U16_TRAIL(c
);
393 if (U_SUCCESS(status
)) {
394 destLen
+= utext_replace(dest
, destLen
, destLen
, surrogate
, 2, &status
);
398 } else if (c
!= DOLLARSIGN
) {
399 // Normal char, not a $. Copy it out without further checks.
401 UChar c16
= (UChar
)c
;
402 destLen
+= utext_replace(dest
, destLen
, destLen
, &c16
, 1, &status
);
405 surrogate
[0] = U16_LEAD(c
);
406 surrogate
[1] = U16_TRAIL(c
);
407 if (U_SUCCESS(status
)) {
408 destLen
+= utext_replace(dest
, destLen
, destLen
, surrogate
, 2, &status
);
412 // We've got a $. Pick up a capture group name or number if one follows.
413 // Consume digits so long as the resulting group number <= the number of
414 // number of capture groups in the pattern.
416 int32_t groupNum
= 0;
417 int32_t numDigits
= 0;
418 UChar32 nextChar
= utext_current32(replacement
);
419 if (nextChar
== LEFTBRACKET
) {
420 // Scan for a Named Capture Group, ${name}.
421 UnicodeString groupName
;
422 utext_next32(replacement
);
423 while(U_SUCCESS(status
) && nextChar
!= RIGHTBRACKET
) {
424 nextChar
= utext_next32(replacement
);
425 if (nextChar
== U_SENTINEL
) {
426 status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
427 } else if ((nextChar
>= 0x41 && nextChar
<= 0x5a) || // A..Z
428 (nextChar
>= 0x61 && nextChar
<= 0x7a) || // a..z
429 (nextChar
>= 0x31 && nextChar
<= 0x39)) { // 0..9
430 groupName
.append(nextChar
);
431 } else if (nextChar
== RIGHTBRACKET
) {
432 groupNum
= uhash_geti(fPattern
->fNamedCaptureMap
, &groupName
);
434 status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
437 // Character was something other than a name char or a closing '}'
438 status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
442 } else if (u_isdigit(nextChar
)) {
443 // $n Scan for a capture group number
444 int32_t numCaptureGroups
= fPattern
->fGroupMap
->size();
446 nextChar
= UTEXT_CURRENT32(replacement
);
447 if (nextChar
== U_SENTINEL
) {
450 if (u_isdigit(nextChar
) == FALSE
) {
453 int32_t nextDigitVal
= u_charDigitValue(nextChar
);
454 if (groupNum
*10 + nextDigitVal
> numCaptureGroups
) {
455 // Don't consume the next digit if it makes the capture group number too big.
456 if (numDigits
== 0) {
457 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
461 (void)UTEXT_NEXT32(replacement
);
462 groupNum
=groupNum
*10 + nextDigitVal
;
466 // $ not followed by capture group name or number.
467 status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
470 if (U_SUCCESS(status
)) {
471 destLen
+= appendGroup(groupNum
, dest
, status
);
473 } // End of $ capture group handling
474 } // End of per-character loop through the replacement string.
481 //--------------------------------------------------------------------------------
483 // appendTail Intended to be used in conjunction with appendReplacement()
484 // To the destination string, append everything following
485 // the last match position from the input string.
487 // Note: Match ranges do not affect appendTail or appendReplacement
489 //--------------------------------------------------------------------------------
490 UnicodeString
&RegexMatcher::appendTail(UnicodeString
&dest
) {
491 UErrorCode status
= U_ZERO_ERROR
;
492 UText resultText
= UTEXT_INITIALIZER
;
493 utext_openUnicodeString(&resultText
, &dest
, &status
);
495 if (U_SUCCESS(status
)) {
496 appendTail(&resultText
, status
);
497 utext_close(&resultText
);
504 // appendTail, UText mode
506 UText
*RegexMatcher::appendTail(UText
*dest
, UErrorCode
&status
) {
507 if (U_FAILURE(status
)) {
510 if (U_FAILURE(fDeferredStatus
)) {
511 status
= fDeferredStatus
;
515 if (fInputLength
> fAppendPosition
) {
516 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
517 int64_t destLen
= utext_nativeLength(dest
);
518 utext_replace(dest
, destLen
, destLen
, fInputText
->chunkContents
+fAppendPosition
,
519 (int32_t)(fInputLength
-fAppendPosition
), &status
);
522 if (UTEXT_USES_U16(fInputText
)) {
523 len16
= (int32_t)(fInputLength
-fAppendPosition
);
525 len16
= utext_extract(fInputText
, fAppendPosition
, fInputLength
, NULL
, 0, &status
);
526 status
= U_ZERO_ERROR
; // buffer overflow
529 UChar
*inputChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(len16
));
530 if (inputChars
== NULL
) {
531 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
533 utext_extract(fInputText
, fAppendPosition
, fInputLength
, inputChars
, len16
, &status
); // unterminated
534 int64_t destLen
= utext_nativeLength(dest
);
535 utext_replace(dest
, destLen
, destLen
, inputChars
, len16
, &status
);
536 uprv_free(inputChars
);
545 //--------------------------------------------------------------------------------
549 //--------------------------------------------------------------------------------
550 int32_t RegexMatcher::end(UErrorCode
&err
) const {
554 int64_t RegexMatcher::end64(UErrorCode
&err
) const {
555 return end64(0, err
);
558 int64_t RegexMatcher::end64(int32_t group
, UErrorCode
&err
) const {
559 if (U_FAILURE(err
)) {
562 if (fMatch
== FALSE
) {
563 err
= U_REGEX_INVALID_STATE
;
566 if (group
< 0 || group
> fPattern
->fGroupMap
->size()) {
567 err
= U_INDEX_OUTOFBOUNDS_ERROR
;
574 // Get the position within the stack frame of the variables for
575 // this capture group.
576 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(group
-1);
577 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
578 U_ASSERT(groupOffset
>= 0);
579 e
= fFrame
->fExtra
[groupOffset
+ 1];
585 int32_t RegexMatcher::end(int32_t group
, UErrorCode
&err
) const {
586 return (int32_t)end64(group
, err
);
589 //--------------------------------------------------------------------------------
591 // findProgressInterrupt This function is called once for each advance in the target
592 // string from the find() function, and calls the user progress callback
593 // function if there is one installed.
595 // Return: TRUE if the find operation is to be terminated.
596 // FALSE if the find operation is to continue running.
598 //--------------------------------------------------------------------------------
599 UBool
RegexMatcher::findProgressInterrupt(int64_t pos
, UErrorCode
&status
) {
600 if (fFindProgressCallbackFn
&& !(*fFindProgressCallbackFn
)(fFindProgressCallbackContext
, pos
)) {
601 status
= U_REGEX_STOPPED_BY_CALLER
;
607 //--------------------------------------------------------------------------------
611 //--------------------------------------------------------------------------------
612 UBool
RegexMatcher::find() {
613 if (U_FAILURE(fDeferredStatus
)) {
616 UErrorCode status
= U_ZERO_ERROR
;
617 UBool result
= find(status
);
621 //--------------------------------------------------------------------------------
625 //--------------------------------------------------------------------------------
626 UBool
RegexMatcher::find(UErrorCode
&status
) {
627 // Start at the position of the last match end. (Will be zero if the
628 // matcher has been reset.)
630 if (U_FAILURE(status
)) {
633 if (U_FAILURE(fDeferredStatus
)) {
634 status
= fDeferredStatus
;
638 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
639 return findUsingChunk(status
);
642 int64_t startPos
= fMatchEnd
;
644 startPos
= fActiveStart
;
648 // Save the position of any previous successful match.
649 fLastMatchEnd
= fMatchEnd
;
651 if (fMatchStart
== fMatchEnd
) {
652 // Previous match had zero length. Move start position up one position
653 // to avoid sending find() into a loop on zero-length matches.
654 if (startPos
>= fActiveLimit
) {
659 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
660 (void)UTEXT_NEXT32(fInputText
);
661 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
664 if (fLastMatchEnd
>= 0) {
665 // A previous find() failed to match. Don't try again.
666 // (without this test, a pattern with a zero-length match
667 // could match again at the end of an input string.)
674 // Compute the position in the input string beyond which a match can not begin, because
675 // the minimum length match would extend past the end of the input.
676 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
677 // Be aware of possible overflows if making changes here.
678 int64_t testStartLimit
;
679 if (UTEXT_USES_U16(fInputText
)) {
680 testStartLimit
= fActiveLimit
- fPattern
->fMinMatchLen
;
681 if (startPos
> testStartLimit
) {
687 // We don't know exactly how long the minimum match length is in native characters.
688 // Treat anything > 0 as 1.
689 testStartLimit
= fActiveLimit
- (fPattern
->fMinMatchLen
> 0 ? 1 : 0);
693 U_ASSERT(startPos
>= 0);
695 switch (fPattern
->fStartType
) {
697 // No optimization was found.
698 // Try a match at each input position.
700 MatchAt(startPos
, FALSE
, status
);
701 if (U_FAILURE(status
)) {
707 if (startPos
>= testStartLimit
) {
711 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
712 (void)UTEXT_NEXT32(fInputText
);
713 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
714 // Note that it's perfectly OK for a pattern to have a zero-length
715 // match at the end of a string, so we must make sure that the loop
716 // runs with startPos == testStartLimit the last time through.
717 if (findProgressInterrupt(startPos
, status
))
723 // Matches are only possible at the start of the input string
724 // (pattern begins with ^ or \A)
725 if (startPos
> fActiveStart
) {
729 MatchAt(startPos
, FALSE
, status
);
730 if (U_FAILURE(status
)) {
738 // Match may start on any char from a pre-computed set.
739 U_ASSERT(fPattern
->fMinMatchLen
> 0);
740 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
742 int64_t pos
= startPos
;
743 c
= UTEXT_NEXT32(fInputText
);
744 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
745 // c will be -1 (U_SENTINEL) at end of text, in which case we
746 // skip this next block (so we don't have a negative array index)
747 // and handle end of text in the following block.
748 if (c
>= 0 && ((c
<256 && fPattern
->fInitialChars8
->contains(c
)) ||
749 (c
>=256 && fPattern
->fInitialChars
->contains(c
)))) {
750 MatchAt(pos
, FALSE
, status
);
751 if (U_FAILURE(status
)) {
757 UTEXT_SETNATIVEINDEX(fInputText
, pos
);
759 if (startPos
> testStartLimit
) {
764 if (findProgressInterrupt(startPos
, status
))
773 // Match starts on exactly one char.
774 U_ASSERT(fPattern
->fMinMatchLen
> 0);
775 UChar32 theChar
= fPattern
->fInitialChar
;
776 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
778 int64_t pos
= startPos
;
779 c
= UTEXT_NEXT32(fInputText
);
780 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
782 MatchAt(pos
, FALSE
, status
);
783 if (U_FAILURE(status
)) {
789 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
791 if (startPos
> testStartLimit
) {
796 if (findProgressInterrupt(startPos
, status
))
805 if (startPos
== fAnchorStart
) {
806 MatchAt(startPos
, FALSE
, status
);
807 if (U_FAILURE(status
)) {
813 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
814 ch
= UTEXT_NEXT32(fInputText
);
815 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
817 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
818 ch
= UTEXT_PREVIOUS32(fInputText
);
819 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
822 if (fPattern
->fFlags
& UREGEX_UNIX_LINES
) {
825 MatchAt(startPos
, FALSE
, status
);
826 if (U_FAILURE(status
)) {
832 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
834 if (startPos
>= testStartLimit
) {
839 ch
= UTEXT_NEXT32(fInputText
);
840 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
841 // Note that it's perfectly OK for a pattern to have a zero-length
842 // match at the end of a string, so we must make sure that the loop
843 // runs with startPos == testStartLimit the last time through.
844 if (findProgressInterrupt(startPos
, status
))
849 if (isLineTerminator(ch
)) {
850 if (ch
== 0x0d && startPos
< fActiveLimit
&& UTEXT_CURRENT32(fInputText
) == 0x0a) {
851 (void)UTEXT_NEXT32(fInputText
);
852 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
854 MatchAt(startPos
, FALSE
, status
);
855 if (U_FAILURE(status
)) {
861 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
863 if (startPos
>= testStartLimit
) {
868 ch
= UTEXT_NEXT32(fInputText
);
869 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
870 // Note that it's perfectly OK for a pattern to have a zero-length
871 // match at the end of a string, so we must make sure that the loop
872 // runs with startPos == testStartLimit the last time through.
873 if (findProgressInterrupt(startPos
, status
))
888 UBool
RegexMatcher::find(int64_t start
, UErrorCode
&status
) {
889 if (U_FAILURE(status
)) {
892 if (U_FAILURE(fDeferredStatus
)) {
893 status
= fDeferredStatus
;
896 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
897 // This will reset the region to be the full input length.
899 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
903 int64_t nativeStart
= start
;
904 if (nativeStart
< fActiveStart
|| nativeStart
> fActiveLimit
) {
905 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
908 fMatchEnd
= nativeStart
;
913 //--------------------------------------------------------------------------------
915 // findUsingChunk() -- like find(), but with the advance knowledge that the
916 // entire string is available in the UText's chunk buffer.
918 //--------------------------------------------------------------------------------
919 UBool
RegexMatcher::findUsingChunk(UErrorCode
&status
) {
920 // Start at the position of the last match end. (Will be zero if the
921 // matcher has been reset.
924 int32_t startPos
= (int32_t)fMatchEnd
;
926 startPos
= (int32_t)fActiveStart
;
929 const UChar
*inputBuf
= fInputText
->chunkContents
;
932 // Save the position of any previous successful match.
933 fLastMatchEnd
= fMatchEnd
;
935 if (fMatchStart
== fMatchEnd
) {
936 // Previous match had zero length. Move start position up one position
937 // to avoid sending find() into a loop on zero-length matches.
938 if (startPos
>= fActiveLimit
) {
943 U16_FWD_1(inputBuf
, startPos
, fInputLength
);
946 if (fLastMatchEnd
>= 0) {
947 // A previous find() failed to match. Don't try again.
948 // (without this test, a pattern with a zero-length match
949 // could match again at the end of an input string.)
956 // Compute the position in the input string beyond which a match can not begin, because
957 // the minimum length match would extend past the end of the input.
958 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
959 // Be aware of possible overflows if making changes here.
960 // Note: a match can begin at inputBuf + testLen; it is an inclusive limit.
961 int32_t testLen
= (int32_t)(fActiveLimit
- fPattern
->fMinMatchLen
);
962 if (startPos
> testLen
) {
969 U_ASSERT(startPos
>= 0);
971 switch (fPattern
->fStartType
) {
973 // No optimization was found.
974 // Try a match at each input position.
976 MatchChunkAt(startPos
, FALSE
, status
);
977 if (U_FAILURE(status
)) {
983 if (startPos
>= testLen
) {
987 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
988 // Note that it's perfectly OK for a pattern to have a zero-length
989 // match at the end of a string, so we must make sure that the loop
990 // runs with startPos == testLen the last time through.
991 if (findProgressInterrupt(startPos
, status
))
997 // Matches are only possible at the start of the input string
998 // (pattern begins with ^ or \A)
999 if (startPos
> fActiveStart
) {
1003 MatchChunkAt(startPos
, FALSE
, status
);
1004 if (U_FAILURE(status
)) {
1012 // Match may start on any char from a pre-computed set.
1013 U_ASSERT(fPattern
->fMinMatchLen
> 0);
1015 int32_t pos
= startPos
;
1016 U16_NEXT(inputBuf
, startPos
, fActiveLimit
, c
); // like c = inputBuf[startPos++];
1017 if ((c
<256 && fPattern
->fInitialChars8
->contains(c
)) ||
1018 (c
>=256 && fPattern
->fInitialChars
->contains(c
))) {
1019 MatchChunkAt(pos
, FALSE
, status
);
1020 if (U_FAILURE(status
)) {
1027 if (startPos
> testLen
) {
1032 if (findProgressInterrupt(startPos
, status
))
1041 // Match starts on exactly one char.
1042 U_ASSERT(fPattern
->fMinMatchLen
> 0);
1043 UChar32 theChar
= fPattern
->fInitialChar
;
1045 int32_t pos
= startPos
;
1046 U16_NEXT(inputBuf
, startPos
, fActiveLimit
, c
); // like c = inputBuf[startPos++];
1048 MatchChunkAt(pos
, FALSE
, status
);
1049 if (U_FAILURE(status
)) {
1056 if (startPos
> testLen
) {
1061 if (findProgressInterrupt(startPos
, status
))
1070 if (startPos
== fAnchorStart
) {
1071 MatchChunkAt(startPos
, FALSE
, status
);
1072 if (U_FAILURE(status
)) {
1078 // In bug 31063104 which has a zero-length text buffer we get here with
1079 // inputBuf=NULL, startPos=fActiveLimit=0 (and fMatch F) which violates the
1080 // requirement for U16_FWD_1 (utf16.h) that startPos < fActiveLimit. Having
1081 // inputBuf=NULL (chunkContexts NULL) is probably due to an error in the
1082 // CFStringUText functions. Nevertheless, to be defensive, add test below.
1083 if (startPos
>= testLen
) {
1087 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
1090 if (fPattern
->fFlags
& UREGEX_UNIX_LINES
) {
1092 ch
= inputBuf
[startPos
-1];
1094 MatchChunkAt(startPos
, FALSE
, status
);
1095 if (U_FAILURE(status
)) {
1102 if (startPos
>= testLen
) {
1107 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
1108 // Note that it's perfectly OK for a pattern to have a zero-length
1109 // match at the end of a string, so we must make sure that the loop
1110 // runs with startPos == testLen the last time through.
1111 if (findProgressInterrupt(startPos
, status
))
1116 ch
= inputBuf
[startPos
-1];
1117 if (isLineTerminator(ch
)) {
1118 if (ch
== 0x0d && startPos
< fActiveLimit
&& inputBuf
[startPos
] == 0x0a) {
1121 MatchChunkAt(startPos
, FALSE
, status
);
1122 if (U_FAILURE(status
)) {
1129 if (startPos
>= testLen
) {
1134 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
1135 // Note that it's perfectly OK for a pattern to have a zero-length
1136 // match at the end of a string, so we must make sure that the loop
1137 // runs with startPos == testLen the last time through.
1138 if (findProgressInterrupt(startPos
, status
))
1153 //--------------------------------------------------------------------------------
1157 //--------------------------------------------------------------------------------
1158 UnicodeString
RegexMatcher::group(UErrorCode
&status
) const {
1159 return group(0, status
);
1162 // Return immutable shallow clone
1163 UText
*RegexMatcher::group(UText
*dest
, int64_t &group_len
, UErrorCode
&status
) const {
1164 return group(0, dest
, group_len
, status
);
1167 // Return immutable shallow clone
1168 UText
*RegexMatcher::group(int32_t groupNum
, UText
*dest
, int64_t &group_len
, UErrorCode
&status
) const {
1170 if (U_FAILURE(status
)) {
1173 if (U_FAILURE(fDeferredStatus
)) {
1174 status
= fDeferredStatus
;
1175 } else if (fMatch
== FALSE
) {
1176 status
= U_REGEX_INVALID_STATE
;
1177 } else if (groupNum
< 0 || groupNum
> fPattern
->fGroupMap
->size()) {
1178 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1181 if (U_FAILURE(status
)) {
1186 if (groupNum
== 0) {
1190 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(groupNum
-1);
1191 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
1192 U_ASSERT(groupOffset
>= 0);
1193 s
= fFrame
->fExtra
[groupOffset
];
1194 e
= fFrame
->fExtra
[groupOffset
+1];
1198 // A capture group wasn't part of the match
1199 return utext_clone(dest
, fInputText
, FALSE
, TRUE
, &status
);
1204 dest
= utext_clone(dest
, fInputText
, FALSE
, TRUE
, &status
);
1206 UTEXT_SETNATIVEINDEX(dest
, s
);
1210 UnicodeString
RegexMatcher::group(int32_t groupNum
, UErrorCode
&status
) const {
1211 UnicodeString result
;
1212 int64_t groupStart
= start64(groupNum
, status
);
1213 int64_t groupEnd
= end64(groupNum
, status
);
1214 if (U_FAILURE(status
) || groupStart
== -1 || groupStart
== groupEnd
) {
1218 // Get the group length using a utext_extract preflight.
1219 // UText is actually pretty efficient at this when underlying encoding is UTF-16.
1220 int32_t length
= utext_extract(fInputText
, groupStart
, groupEnd
, NULL
, 0, &status
);
1221 if (status
!= U_BUFFER_OVERFLOW_ERROR
) {
1225 status
= U_ZERO_ERROR
;
1226 UChar
*buf
= result
.getBuffer(length
);
1228 status
= U_MEMORY_ALLOCATION_ERROR
;
1230 int32_t extractLength
= utext_extract(fInputText
, groupStart
, groupEnd
, buf
, length
, &status
);
1231 result
.releaseBuffer(extractLength
);
1232 U_ASSERT(length
== extractLength
);
1238 //--------------------------------------------------------------------------------
1240 // appendGroup() -- currently internal only, appends a group to a UText rather
1241 // than replacing its contents
1243 //--------------------------------------------------------------------------------
1245 int64_t RegexMatcher::appendGroup(int32_t groupNum
, UText
*dest
, UErrorCode
&status
) const {
1246 if (U_FAILURE(status
)) {
1249 if (U_FAILURE(fDeferredStatus
)) {
1250 status
= fDeferredStatus
;
1253 int64_t destLen
= utext_nativeLength(dest
);
1255 if (fMatch
== FALSE
) {
1256 status
= U_REGEX_INVALID_STATE
;
1257 return utext_replace(dest
, destLen
, destLen
, NULL
, 0, &status
);
1259 if (groupNum
< 0 || groupNum
> fPattern
->fGroupMap
->size()) {
1260 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1261 return utext_replace(dest
, destLen
, destLen
, NULL
, 0, &status
);
1265 if (groupNum
== 0) {
1269 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(groupNum
-1);
1270 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
1271 U_ASSERT(groupOffset
>= 0);
1272 s
= fFrame
->fExtra
[groupOffset
];
1273 e
= fFrame
->fExtra
[groupOffset
+1];
1277 // A capture group wasn't part of the match
1278 return utext_replace(dest
, destLen
, destLen
, NULL
, 0, &status
);
1283 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1284 U_ASSERT(e
<= fInputLength
);
1285 deltaLen
= utext_replace(dest
, destLen
, destLen
, fInputText
->chunkContents
+s
, (int32_t)(e
-s
), &status
);
1288 if (UTEXT_USES_U16(fInputText
)) {
1289 len16
= (int32_t)(e
-s
);
1291 UErrorCode lengthStatus
= U_ZERO_ERROR
;
1292 len16
= utext_extract(fInputText
, s
, e
, NULL
, 0, &lengthStatus
);
1294 UChar
*groupChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(len16
+1));
1295 if (groupChars
== NULL
) {
1296 status
= U_MEMORY_ALLOCATION_ERROR
;
1299 utext_extract(fInputText
, s
, e
, groupChars
, len16
+1, &status
);
1301 deltaLen
= utext_replace(dest
, destLen
, destLen
, groupChars
, len16
, &status
);
1302 uprv_free(groupChars
);
1309 //--------------------------------------------------------------------------------
1313 //--------------------------------------------------------------------------------
// groupCount(): returns the number of entries in the pattern's group map
// (fPattern->fGroupMap->size()).
1314 int32_t RegexMatcher::groupCount() const {
1315 return fPattern
->fGroupMap
->size();
1318 //--------------------------------------------------------------------------------
1320 // hasAnchoringBounds()
1322 //--------------------------------------------------------------------------------
// Returns the current value of the fAnchoringBounds flag (set by useAnchoringBounds()).
1323 UBool
RegexMatcher::hasAnchoringBounds() const {
1324 return fAnchoringBounds
;
1328 //--------------------------------------------------------------------------------
1330 // hasTransparentBounds()
1332 //--------------------------------------------------------------------------------
// Returns the current value of the fTransparentBounds flag (set by useTransparentBounds()).
1333 UBool
RegexMatcher::hasTransparentBounds() const {
1334 return fTransparentBounds
;
1339 //--------------------------------------------------------------------------------
1343 //--------------------------------------------------------------------------------
1344 UBool
RegexMatcher::hitEnd() const {
1349 //--------------------------------------------------------------------------------
1353 //--------------------------------------------------------------------------------
// input(): returns the input text as a UnicodeString reference.
// The visible code builds a heap-allocated UnicodeString copy of fInputText and
// stores it in fInput; because this method is const, the store goes through a cast.
// NOTE(review): no null check is visible on the `new UnicodeString` result or on
// the pointer returned by getBuffer() before they are used - confirm.
1354 const UnicodeString
&RegexMatcher::input() const {
1356 UErrorCode status
= U_ZERO_ERROR
;
// UTF-16 length of the input: the native length for UTF-16 providers, otherwise
// obtained by a preflighting utext_extract() with a NULL buffer.
1358 if (UTEXT_USES_U16(fInputText
)) {
1359 len16
= (int32_t)fInputLength
;
1361 len16
= utext_extract(fInputText
, 0, fInputLength
, NULL
, 0, &status
);
1362 status
= U_ZERO_ERROR
; // overflow, length status
1364 UnicodeString
*result
= new UnicodeString(len16
, 0, 0);
// Extract directly into the string's own buffer, then commit the length.
1366 UChar
*inputChars
= result
->getBuffer(len16
);
1367 utext_extract(fInputText
, 0, fInputLength
, inputChars
, len16
, &status
); // unterminated warning
1368 result
->releaseBuffer(len16
);
1370 (*(const UnicodeString
**)&fInput
) = result
; // pointer assignment, rather than operator=
1376 //--------------------------------------------------------------------------------
1380 //--------------------------------------------------------------------------------
1381 UText
*RegexMatcher::inputText() const {
1386 //--------------------------------------------------------------------------------
1388 // getInput() -- like inputText(), but makes a clone or copies into another UText
1390 //--------------------------------------------------------------------------------
// getInput(): deliver the matcher's input text through a UText.
//   dest   - destination UText; on the copy path its entire contents
//            (0 .. utext_nativeLength(dest)) are replaced with the input text.
//   status - receives fDeferredStatus if that is a failure.
// The final statement returns a shallow, read-only clone of fInputText
// (utext_clone with deep=FALSE, readOnly=TRUE).
// NOTE(review): presumably the clone path is taken when dest is NULL - confirm.
1391 UText
*RegexMatcher::getInput (UText
*dest
, UErrorCode
&status
) const {
1392 if (U_FAILURE(status
)) {
1395 if (U_FAILURE(fDeferredStatus
)) {
1396 status
= fDeferredStatus
;
// Fast path: the whole input is available in the chunk buffer.
1401 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1402 utext_replace(dest
, 0, utext_nativeLength(dest
), fInputText
->chunkContents
, (int32_t)fInputLength
, &status
);
// Slow path: compute the UTF-16 length, extract into a temporary heap buffer,
// and replace dest's contents from that buffer.
1405 if (UTEXT_USES_U16(fInputText
)) {
1406 input16Len
= (int32_t)fInputLength
;
1408 UErrorCode lengthStatus
= U_ZERO_ERROR
;
1409 input16Len
= utext_extract(fInputText
, 0, fInputLength
, NULL
, 0, &lengthStatus
); // buffer overflow error
1411 UChar
*inputChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(input16Len
));
1412 if (inputChars
== NULL
) {
// status is cleared before and after the extract; the buffer has no room for a
// terminator, so the extract is expected to report a non-termination warning.
1416 status
= U_ZERO_ERROR
;
1417 utext_extract(fInputText
, 0, fInputLength
, inputChars
, input16Len
, &status
); // not terminated warning
1418 status
= U_ZERO_ERROR
;
1419 utext_replace(dest
, 0, utext_nativeLength(dest
), inputChars
, input16Len
, &status
);
1421 uprv_free(inputChars
);
1425 return utext_clone(NULL
, fInputText
, FALSE
, TRUE
, &status
);
// compat_SyncMutableUTextContents(): re-synchronize a UText's cached chunk fields
// with the UnicodeString stored in ut->context, for callers that modify the input
// string while it is in use by a matcher.
// When utext_nativeLength(ut) differs from ut->nativeIndexingLimit, the chunk
// pointer and the three length/limit fields are refreshed from the string.
// Callers in this file refresh fInputLength when this function returns true.
1430 static UBool
compat_SyncMutableUTextContents(UText
*ut
);
1431 static UBool
compat_SyncMutableUTextContents(UText
*ut
) {
1432 UBool retVal
= FALSE
;
1434 // In the following test, we're really only interested in whether the UText should switch
1435 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
1436 // will still point to the correct data.
1437 if (utext_nativeLength(ut
) != ut
->nativeIndexingLimit
) {
1438 UnicodeString
*us
=(UnicodeString
*)ut
->context
;
1440 // Update to the latest length.
1441 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1442 int32_t newLength
= us
->length();
1444 // Update the chunk description.
1445 // The buffer may have switched between stack- and heap-based.
1446 ut
->chunkContents
= us
->getBuffer();
1447 ut
->chunkLength
= newLength
;
1448 ut
->chunkNativeLimit
= newLength
;
1449 ut
->nativeIndexingLimit
= newLength
;
1456 //--------------------------------------------------------------------------------
1460 //--------------------------------------------------------------------------------
// lookingAt(status): attempt a match starting at fActiveStart.
//   status - receives fDeferredStatus if that is a failure.
// If the input may be a caller-mutated UnicodeString, the UText is re-synced first
// and fInputLength refreshed. The match itself is run by MatchChunkAt() when the
// whole text is in the chunk, otherwise by MatchAt(); both are passed FALSE as the
// second argument (matches() passes TRUE).
// NOTE(review): presumably FALSE means the match need not reach the end of the
// region - confirm against MatchAt().
1461 UBool
RegexMatcher::lookingAt(UErrorCode
&status
) {
1462 if (U_FAILURE(status
)) {
1465 if (U_FAILURE(fDeferredStatus
)) {
1466 status
= fDeferredStatus
;
1470 if (fInputUniStrMaybeMutable
) {
1471 if (compat_SyncMutableUTextContents(fInputText
)) {
1472 fInputLength
= utext_nativeLength(fInputText
);
1477 resetPreserveRegion();
1479 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1480 MatchChunkAt((int32_t)fActiveStart
, FALSE
, status
);
1482 MatchAt(fActiveStart
, FALSE
, status
);
// lookingAt(start, status): attempt a match starting at native index `start`.
//   start  - start position; must lie within [fActiveStart, fActiveLimit],
//            otherwise status is set to U_INDEX_OUTOFBOUNDS_ERROR.
//   status - receives fDeferredStatus if that is a failure.
// Re-syncs a possibly caller-mutated UnicodeString input first, then dispatches to
// MatchChunkAt() or MatchAt() with FALSE as the second argument.
1488 UBool
RegexMatcher::lookingAt(int64_t start
, UErrorCode
&status
) {
1489 if (U_FAILURE(status
)) {
1492 if (U_FAILURE(fDeferredStatus
)) {
1493 status
= fDeferredStatus
;
1499 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1503 if (fInputUniStrMaybeMutable
) {
1504 if (compat_SyncMutableUTextContents(fInputText
)) {
1505 fInputLength
= utext_nativeLength(fInputText
);
1510 int64_t nativeStart
;
1511 nativeStart
= start
;
1512 if (nativeStart
< fActiveStart
|| nativeStart
> fActiveLimit
) {
1513 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1517 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1518 MatchChunkAt((int32_t)nativeStart
, FALSE
, status
);
1520 MatchAt(nativeStart
, FALSE
, status
);
1527 //--------------------------------------------------------------------------------
1531 //--------------------------------------------------------------------------------
// matches(status): attempt a match starting at fActiveStart.
//   status - receives fDeferredStatus if that is a failure.
// Same structure as lookingAt(status), except that MatchChunkAt()/MatchAt() are
// passed TRUE as the second argument.
// NOTE(review): presumably TRUE requires the match to extend to the end of the
// region - confirm against MatchAt().
1532 UBool
RegexMatcher::matches(UErrorCode
&status
) {
1533 if (U_FAILURE(status
)) {
1536 if (U_FAILURE(fDeferredStatus
)) {
1537 status
= fDeferredStatus
;
1541 if (fInputUniStrMaybeMutable
) {
1542 if (compat_SyncMutableUTextContents(fInputText
)) {
1543 fInputLength
= utext_nativeLength(fInputText
);
1548 resetPreserveRegion();
1551 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1552 MatchChunkAt((int32_t)fActiveStart
, TRUE
, status
);
1554 MatchAt(fActiveStart
, TRUE
, status
);
// matches(start, status): attempt a match starting at native index `start`.
//   start  - start position; must lie within [fActiveStart, fActiveLimit],
//            otherwise status is set to U_INDEX_OUTOFBOUNDS_ERROR.
//   status - receives fDeferredStatus if that is a failure.
// Same structure as lookingAt(start, status), except that MatchChunkAt()/MatchAt()
// are passed TRUE as the second argument.
1560 UBool
RegexMatcher::matches(int64_t start
, UErrorCode
&status
) {
1561 if (U_FAILURE(status
)) {
1564 if (U_FAILURE(fDeferredStatus
)) {
1565 status
= fDeferredStatus
;
1571 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1575 if (fInputUniStrMaybeMutable
) {
1576 if (compat_SyncMutableUTextContents(fInputText
)) {
1577 fInputLength
= utext_nativeLength(fInputText
);
1582 int64_t nativeStart
;
1583 nativeStart
= start
;
1584 if (nativeStart
< fActiveStart
|| nativeStart
> fActiveLimit
) {
1585 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1589 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1590 MatchChunkAt((int32_t)nativeStart
, TRUE
, status
);
1592 MatchAt(nativeStart
, TRUE
, status
);
1599 //--------------------------------------------------------------------------------
1603 //--------------------------------------------------------------------------------
1604 const RegexPattern
&RegexMatcher::pattern() const {
1610 //--------------------------------------------------------------------------------
1614 //--------------------------------------------------------------------------------
// region(regionStart, regionLimit, startIndex, status): set the matching region.
//   regionStart, regionLimit - native indexes; U_ILLEGAL_ARGUMENT_ERROR if either is
//                              negative, start > limit, or either exceeds fInputLength.
//   startIndex - -1, or a position within the new region at which fMatchEnd is
//                placed; out-of-region values give U_INDEX_OUTOFBOUNDS_ERROR.
// Updates the region and active bounds, and also the look bounds (when bounds are
// not transparent) and the anchor bounds (when anchoring bounds are enabled).
1615 RegexMatcher
&RegexMatcher::region(int64_t regionStart
, int64_t regionLimit
, int64_t startIndex
, UErrorCode
&status
) {
1616 if (U_FAILURE(status
)) {
1620 if (regionStart
>regionLimit
|| regionStart
<0 || regionLimit
<0) {
1621 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1624 int64_t nativeStart
= regionStart
;
1625 int64_t nativeLimit
= regionLimit
;
1626 if (nativeStart
> fInputLength
|| nativeLimit
> fInputLength
) {
1627 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1630 if (startIndex
== -1)
1633 resetPreserveRegion();
1635 fRegionStart
= nativeStart
;
1636 fRegionLimit
= nativeLimit
;
1637 fActiveStart
= nativeStart
;
1638 fActiveLimit
= nativeLimit
;
// An explicit start index is validated against the new active range and stored
// in fMatchEnd.
1640 if (startIndex
!= -1) {
1641 if (startIndex
< fActiveStart
|| startIndex
> fActiveLimit
) {
1642 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1644 fMatchEnd
= startIndex
;
1647 if (!fTransparentBounds
) {
1648 fLookStart
= nativeStart
;
1649 fLookLimit
= nativeLimit
;
1651 if (fAnchoringBounds
) {
1652 fAnchorStart
= nativeStart
;
1653 fAnchorLimit
= nativeLimit
;
// region(start, limit, status): convenience overload; forwards to the four-argument
// region() with startIndex = -1.
1658 RegexMatcher
&RegexMatcher::region(int64_t start
, int64_t limit
, UErrorCode
&status
) {
1659 return region(start
, limit
, -1, status
);
1662 //--------------------------------------------------------------------------------
1666 //--------------------------------------------------------------------------------
// regionEnd(): returns fRegionLimit narrowed to int32_t (see regionEnd64() for the
// full 64-bit value).
1667 int32_t RegexMatcher::regionEnd() const {
1668 return (int32_t)fRegionLimit
;
// regionEnd64(): returns fRegionLimit as a 64-bit native index.
1671 int64_t RegexMatcher::regionEnd64() const {
1672 return fRegionLimit
;
1675 //--------------------------------------------------------------------------------
1679 //--------------------------------------------------------------------------------
// regionStart(): returns fRegionStart narrowed to int32_t (see regionStart64() for
// the full 64-bit value).
1680 int32_t RegexMatcher::regionStart() const {
1681 return (int32_t)fRegionStart
;
// regionStart64(): returns fRegionStart as a 64-bit native index.
1684 int64_t RegexMatcher::regionStart64() const {
1685 return fRegionStart
;
1689 //--------------------------------------------------------------------------------
1693 //--------------------------------------------------------------------------------
// replaceAll(replacement, status): UnicodeString front end for the UText replaceAll().
//   replacement - replacement text, wrapped in a read-only stack UText.
//   status      - if already a failure, an empty string is returned immediately.
// Returns resultString, which the UText implementation fills in through a writable
// stack UText opened over it. Both stack UTexts are closed before returning.
1694 UnicodeString
RegexMatcher::replaceAll(const UnicodeString
&replacement
, UErrorCode
&status
) {
1695 UText replacementText
= UTEXT_INITIALIZER
;
1696 UText resultText
= UTEXT_INITIALIZER
;
1697 UnicodeString resultString
;
1698 if (U_FAILURE(status
)) {
1699 return resultString
;
1702 utext_openConstUnicodeString(&replacementText
, &replacement
, &status
);
1703 utext_openUnicodeString(&resultText
, &resultString
, &status
);
1705 replaceAll(&replacementText
, &resultText
, status
);
1707 utext_close(&resultText
);
1708 utext_close(&replacementText
);
1710 return resultString
;
1715 // replaceAll, UText mode
// replaceAll(replacement, dest, status), UText mode.
//   replacement - replacement text passed to appendReplacement().
//   dest        - destination UText. The visible code can create one by deep-cloning
//                 (deep=TRUE, readOnly=FALSE) a UText opened over an empty UnicodeString.
//                 NOTE(review): presumably only when dest is NULL - confirm.
//   status      - receives fDeferredStatus if that is a failure.
// When status is successful, output is produced with appendReplacement() followed
// by appendTail().
1717 UText
*RegexMatcher::replaceAll(UText
*replacement
, UText
*dest
, UErrorCode
&status
) {
1718 if (U_FAILURE(status
)) {
1721 if (U_FAILURE(fDeferredStatus
)) {
1722 status
= fDeferredStatus
;
1727 UnicodeString emptyString
;
1728 UText empty
= UTEXT_INITIALIZER
;
1730 utext_openUnicodeString(&empty
, &emptyString
, &status
);
1731 dest
= utext_clone(NULL
, &empty
, TRUE
, FALSE
, &status
);
1732 utext_close(&empty
);
1735 if (U_SUCCESS(status
)) {
1738 appendReplacement(dest
, replacement
, status
);
1739 if (U_FAILURE(status
)) {
1743 appendTail(dest
, status
);
1750 //--------------------------------------------------------------------------------
1754 //--------------------------------------------------------------------------------
// replaceFirst(replacement, status): UnicodeString front end for the UText
// replaceFirst().
//   replacement - replacement text, wrapped in a read-only stack UText.
//   status      - passed to the utext_open* calls and to the UText implementation.
// Returns resultString, filled in through a writable stack UText opened over it.
// Both stack UTexts are closed before returning.
1755 UnicodeString
RegexMatcher::replaceFirst(const UnicodeString
&replacement
, UErrorCode
&status
) {
1756 UText replacementText
= UTEXT_INITIALIZER
;
1757 UText resultText
= UTEXT_INITIALIZER
;
1758 UnicodeString resultString
;
1760 utext_openConstUnicodeString(&replacementText
, &replacement
, &status
);
1761 utext_openUnicodeString(&resultText
, &resultString
, &status
);
1763 replaceFirst(&replacementText
, &resultText
, status
);
1765 utext_close(&resultText
);
1766 utext_close(&replacementText
);
1768 return resultString
;
1772 // replaceFirst, UText mode
// replaceFirst(replacement, dest, status), UText mode.
//   replacement - replacement text passed to appendReplacement().
//   dest        - destination UText. The visible code can create one by deep-cloning
//                 (deep=TRUE, readOnly=FALSE) a UText opened over an empty UnicodeString.
//                 NOTE(review): presumably only when dest is NULL - confirm.
//   status      - receives fDeferredStatus if that is a failure.
// One path returns getInput(dest, status), i.e. the input text itself.
// NOTE(review): presumably taken when no match is found - confirm.
// Otherwise output is produced with one appendReplacement() followed by appendTail().
1774 UText
*RegexMatcher::replaceFirst(UText
*replacement
, UText
*dest
, UErrorCode
&status
) {
1775 if (U_FAILURE(status
)) {
1778 if (U_FAILURE(fDeferredStatus
)) {
1779 status
= fDeferredStatus
;
1785 return getInput(dest
, status
);
1789 UnicodeString emptyString
;
1790 UText empty
= UTEXT_INITIALIZER
;
1792 utext_openUnicodeString(&empty
, &emptyString
, &status
);
1793 dest
= utext_clone(NULL
, &empty
, TRUE
, FALSE
, &status
);
1794 utext_close(&empty
);
1797 appendReplacement(dest
, replacement
, status
);
1798 appendTail(dest
, status
);
1804 //--------------------------------------------------------------------------------
1808 //--------------------------------------------------------------------------------
1809 UBool
RegexMatcher::requireEnd() const {
1814 //--------------------------------------------------------------------------------
1818 //--------------------------------------------------------------------------------
// reset(): set the region, active, anchor and look limits to fInputLength, then
// clear the per-match state via resetPreserveRegion().
1819 RegexMatcher
&RegexMatcher::reset() {
1821 fRegionLimit
= fInputLength
;
1823 fActiveLimit
= fInputLength
;
1825 fAnchorLimit
= fInputLength
;
1827 fLookLimit
= fInputLength
;
1828 resetPreserveRegion();
// resetPreserveRegion(): clear per-match state without assigning any of the
// region / active / anchor / look bounds. Visible effects: fAppendPosition = 0,
// fRequireEnd = FALSE, and the tick counter is re-armed to TIMER_INITIAL_VALUE.
// The state-save stack is deliberately not reset here (see the disabled call below).
1834 void RegexMatcher::resetPreserveRegion() {
1838 fAppendPosition
= 0;
1841 fRequireEnd
= FALSE
;
1843 fTickCounter
= TIMER_INITIAL_VALUE
;
1844 //resetStack(); // more expensive than it looks...
// reset(input): switch the matcher to a new UnicodeString input.
//   input - wrapped (not copied) by reopening fInputText over it as a read-only
//           UText; errors are recorded in fDeferredStatus.
// If the pattern needs an alternate input UText, fAltInputText is refreshed as a
// shallow read-only clone of fInputText. fInputLength is recomputed,
// fInputUniStrMaybeMutable is set so later operations re-sync with the string, and
// an existing word break iterator is pointed at the new text.
1848 RegexMatcher
&RegexMatcher::reset(const UnicodeString
&input
) {
1849 fInputText
= utext_openConstUnicodeString(fInputText
, &input
, &fDeferredStatus
);
1850 if (fPattern
->fNeedsAltInput
) {
1851 fAltInputText
= utext_clone(fAltInputText
, fInputText
, FALSE
, TRUE
, &fDeferredStatus
);
1853 if (U_FAILURE(fDeferredStatus
)) {
1856 fInputLength
= utext_nativeLength(fInputText
);
1862 // Do the following for any UnicodeString.
1863 // This is for compatibility for those clients who modify the input string "live" during regex operations.
1864 fInputUniStrMaybeMutable
= TRUE
;
1866 if (fWordBreakItr
!= NULL
) {
1867 #if UCONFIG_NO_BREAK_ITERATION==0
// A local status is used here, so a setText() failure does not reach fDeferredStatus.
1868 UErrorCode status
= U_ZERO_ERROR
;
1869 fWordBreakItr
->setText(fInputText
, status
);
// reset(input): switch the matcher to a new UText input.
//   input - when it is a different object from fInputText, fInputText becomes a
//           shallow read-only clone of it (and fAltInputText a clone of that, if
//           the pattern needs one); errors are recorded in fDeferredStatus.
// fInputLength is recomputed, an existing word break iterator is pointed at the new
// text, and fInputUniStrMaybeMutable is cleared.
1876 RegexMatcher
&RegexMatcher::reset(UText
*input
) {
1877 if (fInputText
!= input
) {
1878 fInputText
= utext_clone(fInputText
, input
, FALSE
, TRUE
, &fDeferredStatus
);
1879 if (fPattern
->fNeedsAltInput
) fAltInputText
= utext_clone(fAltInputText
, fInputText
, FALSE
, TRUE
, &fDeferredStatus
);
1880 if (U_FAILURE(fDeferredStatus
)) {
1883 fInputLength
= utext_nativeLength(fInputText
);
1888 if (fWordBreakItr
!= NULL
) {
1889 #if UCONFIG_NO_BREAK_ITERATION==0
// A local status is used here, so a setText() failure does not reach fDeferredStatus.
1890 UErrorCode status
= U_ZERO_ERROR
;
1891 fWordBreakItr
->setText(input
, status
);
1896 fInputUniStrMaybeMutable
= FALSE
;
1901 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1902 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
// reset(position, status): reset the matcher, then position it at `position`.
//   position - native index stored in fMatchEnd; must be in [0, fActiveLimit]
//              (after the reset), otherwise U_INDEX_OUTOFBOUNDS_ERROR is set.
1906 RegexMatcher
&RegexMatcher::reset(int64_t position
, UErrorCode
&status
) {
1907 if (U_FAILURE(status
)) {
1910 reset(); // Reset also resets the region to be the entire string.
1912 if (position
< 0 || position
> fActiveLimit
) {
1913 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1916 fMatchEnd
= position
;
1921 //--------------------------------------------------------------------------------
1925 //--------------------------------------------------------------------------------
// refreshInputText(input, status): re-point the matcher's input UText(s) at `input`.
//   input  - replacement UText; must be non-NULL and have the same native length as
//            the current fInputText, otherwise U_ILLEGAL_ARGUMENT_ERROR is set.
//   status - also receives any failure from utext_clone().
// The native index of fInputText (and of fAltInputText, when present) is saved
// before the clone and restored afterwards, so iteration positions are preserved.
1926 RegexMatcher
&RegexMatcher::refreshInputText(UText
*input
, UErrorCode
&status
) {
1927 if (U_FAILURE(status
)) {
1930 if (input
== NULL
) {
1931 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1934 if (utext_nativeLength(fInputText
) != utext_nativeLength(input
)) {
1935 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1938 int64_t pos
= utext_getNativeIndex(fInputText
);
1939 // Shallow read-only clone of the new UText into the existing input UText
1940 fInputText
= utext_clone(fInputText
, input
, FALSE
, TRUE
, &status
);
1941 if (U_FAILURE(status
)) {
1944 utext_setNativeIndex(fInputText
, pos
);
1946 if (fAltInputText
!= NULL
) {
1947 pos
= utext_getNativeIndex(fAltInputText
);
1948 fAltInputText
= utext_clone(fAltInputText
, input
, FALSE
, TRUE
, &status
);
1949 if (U_FAILURE(status
)) {
1952 utext_setNativeIndex(fAltInputText
, pos
);
1959 //--------------------------------------------------------------------------------
1963 //--------------------------------------------------------------------------------
// setTrace(state): stores `state` in the fTraceDebug flag.
1964 void RegexMatcher::setTrace(UBool state
) {
1965 fTraceDebug
= state
;
1971 * UText, replace entire contents of the destination UText with a substring of the source UText.
1973 * @param src The source UText
1974 * @param dest The destination UText. Must be writable.
1975 * May be NULL, in which case a new UText will be allocated.
1976 * @param start Start index of source substring.
1977 * @param limit Limit index of source substring.
1978 * @param status An error code.
// utext_extract_replace(): replace the entire contents of `dest` with the substring
// [start, limit) of `src`, or produce a new UText holding that substring.
//   src    - source UText.
//   dest   - destination UText whose full range (0 .. native length) is replaced.
//   status - in/out error code; U_MEMORY_ALLOCATION_ERROR on buffer failure.
// An empty range is handled specially: either dest is emptied, or a new empty UText
// is opened with utext_openUChars(NULL, NULL, 0).
// When a new UText is created for a non-empty range, it is opened over a heap
// buffer and marked UTEXT_PROVIDER_OWNS_TEXT so the buffer belongs to the UText.
1980 static UText
*utext_extract_replace(UText
*src
, UText
*dest
, int64_t start
, int64_t limit
, UErrorCode
*status
) {
1981 if (U_FAILURE(*status
)) {
1984 if (start
== limit
) {
1986 utext_replace(dest
, 0, utext_nativeLength(dest
), NULL
, 0, status
);
1989 return utext_openUChars(NULL
, NULL
, 0, status
);
// Preflight: extract with a NULL buffer to learn the UTF-16 length. A buffer
// overflow status is the expected outcome here and is cleared below.
1992 int32_t length
= utext_extract(src
, start
, limit
, NULL
, 0, status
);
1993 if (*status
!= U_BUFFER_OVERFLOW_ERROR
&& U_FAILURE(*status
)) {
1996 *status
= U_ZERO_ERROR
;
// Buffer with 40 UChars of inline capacity; resized only when the text plus a
// terminator would not fit.
1997 MaybeStackArray
<UChar
, 40> buffer
;
1998 if (length
>= buffer
.getCapacity()) {
1999 UChar
*newBuf
= buffer
.resize(length
+1); // Leave space for terminating Nul.
2000 if (newBuf
== NULL
) {
2001 *status
= U_MEMORY_ALLOCATION_ERROR
;
2004 utext_extract(src
, start
, limit
, buffer
.getAlias(), length
+1, status
);
2006 utext_replace(dest
, 0, utext_nativeLength(dest
), buffer
.getAlias(), length
, status
);
2010 // Caller did not provide a preexisting UText.
2011 // Open a new one, and have it adopt the text buffer storage.
2012 if (U_FAILURE(*status
)) {
2015 int32_t ownedLength
= 0;
2016 UChar
*ownedBuf
= buffer
.orphanOrClone(length
+1, ownedLength
);
2017 if (ownedBuf
== NULL
) {
2018 *status
= U_MEMORY_ALLOCATION_ERROR
;
2021 UText
*result
= utext_openUChars(NULL
, ownedBuf
, length
, status
);
// If the open fails, the buffer was not adopted and must be freed here.
2022 if (U_FAILURE(*status
)) {
2023 uprv_free(ownedBuf
);
2026 result
->providerProperties
|= (1 << UTEXT_PROVIDER_OWNS_TEXT
);
2031 //---------------------------------------------------------------------
2035 //---------------------------------------------------------------------
// split(input, dest[], destCapacity, status): UnicodeString front end for the UText
// split().
//   input        - text to split, wrapped in a read-only stack UText.
//   dest         - array of UnicodeStrings that receive the fields; each one is
//                  wrapped in a writable UText for the duration of the call.
//   destCapacity - number of elements in dest.
//   status       - U_MEMORY_ALLOCATION_ERROR if the temporary UText* array cannot
//                  be allocated.
// All temporary UTexts and the array are released after the UText split() returns.
// NOTE(review): destCapacity is used in the allocation size before any validation
// is visible in this function - confirm how values < 1 are handled.
2036 int32_t RegexMatcher::split(const UnicodeString
&input
,
2037 UnicodeString dest
[],
2038 int32_t destCapacity
,
2041 UText inputText
= UTEXT_INITIALIZER
;
2042 utext_openConstUnicodeString(&inputText
, &input
, &status
);
2043 if (U_FAILURE(status
)) {
2047 UText
**destText
= (UText
**)uprv_malloc(sizeof(UText
*)*destCapacity
);
2048 if (destText
== NULL
) {
2049 status
= U_MEMORY_ALLOCATION_ERROR
;
2053 for (i
= 0; i
< destCapacity
; i
++) {
2054 destText
[i
] = utext_openUnicodeString(NULL
, &dest
[i
], &status
);
2057 int32_t fieldCount
= split(&inputText
, destText
, destCapacity
, status
);
2059 for (i
= 0; i
< destCapacity
; i
++) {
2060 utext_close(destText
[i
]);
2063 uprv_free(destText
);
2064 utext_close(&inputText
);
2069 // split, UText mode
// split(input, dest[], destCapacity, status), UText mode: split `input` into fields
// around matches of this matcher's pattern.
//   input        - text to split.
//   dest         - array of destination UTexts, indexed by field number i. An
//                  existing element has its whole contents replaced; the visible
//                  code can also create a new UText for an element by deep-cloning
//                  a temporary UText.
//                  NOTE(review): presumably when the element is NULL - confirm.
//   destCapacity - number of elements in dest; values < 1 give
//                  U_ILLEGAL_ARGUMENT_ERROR.
// Text between delimiters goes to successive elements; capture groups of the
// delimiter pattern are emitted as additional fields; the last available element
// receives all remaining input.
// The same three-way copy pattern recurs for each piece of text: direct replace
// from the chunk buffer, clone of a temporary UText over the chunk buffer, or
// extraction through a heap buffer when the text is not fully in the chunk.
2071 int32_t RegexMatcher::split(UText
*input
,
2073 int32_t destCapacity
,
2077 // Check arguments for validity
2079 if (U_FAILURE(status
)) {
2083 if (destCapacity
< 1) {
2084 status
= U_ILLEGAL_ARGUMENT_ERROR
;
2089 // Reset for the input text
2092 int64_t nextOutputStringStart
= 0;
2093 if (fActiveLimit
== 0) {
2098 // Loop through the input text, searching for the delimiter pattern
2101 int32_t numCaptureGroups
= fPattern
->fGroupMap
->size();
2103 if (i
>=destCapacity
-1) {
2104 // There is one or zero output string left.
2105 // Fill the last output string with whatever is left from the input, then exit the loop.
2106 // ( i will be == destCapacity if we filled the output array while processing
2107 // capture groups of the delimiter expression, in which case we will discard the
2108 // last capture group saved in favor of the unprocessed remainder of the
2111 if (fActiveLimit
> nextOutputStringStart
) {
2112 if (UTEXT_FULL_TEXT_IN_CHUNK(input
, fInputLength
)) {
2114 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]),
2115 input
->chunkContents
+nextOutputStringStart
,
2116 (int32_t)(fActiveLimit
-nextOutputStringStart
), &status
);
2118 UText remainingText
= UTEXT_INITIALIZER
;
2119 utext_openUChars(&remainingText
, input
->chunkContents
+nextOutputStringStart
,
2120 fActiveLimit
-nextOutputStringStart
, &status
);
2121 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2122 utext_close(&remainingText
);
2125 UErrorCode lengthStatus
= U_ZERO_ERROR
;
2126 int32_t remaining16Length
=
2127 utext_extract(input
, nextOutputStringStart
, fActiveLimit
, NULL
, 0, &lengthStatus
);
2128 UChar
*remainingChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(remaining16Length
+1));
2129 if (remainingChars
== NULL
) {
2130 status
= U_MEMORY_ALLOCATION_ERROR
;
2134 utext_extract(input
, nextOutputStringStart
, fActiveLimit
, remainingChars
, remaining16Length
+1, &status
);
2136 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), remainingChars
, remaining16Length
, &status
);
2138 UText remainingText
= UTEXT_INITIALIZER
;
2139 utext_openUChars(&remainingText
, remainingChars
, remaining16Length
, &status
);
2140 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2141 utext_close(&remainingText
);
2144 uprv_free(remainingChars
);
2150 // We found another delimiter. Move everything from where we started looking
2151 // up until the start of the delimiter into the next output string.
2152 if (UTEXT_FULL_TEXT_IN_CHUNK(input
, fInputLength
)) {
2154 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]),
2155 input
->chunkContents
+nextOutputStringStart
,
2156 (int32_t)(fMatchStart
-nextOutputStringStart
), &status
);
2158 UText remainingText
= UTEXT_INITIALIZER
;
2159 utext_openUChars(&remainingText
, input
->chunkContents
+nextOutputStringStart
,
2160 fMatchStart
-nextOutputStringStart
, &status
);
2161 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2162 utext_close(&remainingText
);
2165 UErrorCode lengthStatus
= U_ZERO_ERROR
;
2166 int32_t remaining16Length
= utext_extract(input
, nextOutputStringStart
, fMatchStart
, NULL
, 0, &lengthStatus
);
2167 UChar
*remainingChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(remaining16Length
+1));
2168 if (remainingChars
== NULL
) {
2169 status
= U_MEMORY_ALLOCATION_ERROR
;
2172 utext_extract(input
, nextOutputStringStart
, fMatchStart
, remainingChars
, remaining16Length
+1, &status
);
2174 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), remainingChars
, remaining16Length
, &status
);
2176 UText remainingText
= UTEXT_INITIALIZER
;
2177 utext_openUChars(&remainingText
, remainingChars
, remaining16Length
, &status
);
2178 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2179 utext_close(&remainingText
);
2182 uprv_free(remainingChars
);
// The next field starts immediately after the delimiter just matched.
2184 nextOutputStringStart
= fMatchEnd
;
2186 // If the delimiter pattern has capturing parentheses, the captured
2187 // text goes out into the next n destination strings.
2189 for (groupNum
=1; groupNum
<=numCaptureGroups
; groupNum
++) {
2190 if (i
>= destCapacity
-2) {
2191 // Never fill the last available output string with capture group text.
2192 // It will be filled with the last field, the remainder of the
2193 // unsplit input text.
2197 dest
[i
] = utext_extract_replace(fInputText
, dest
[i
],
2198 start64(groupNum
, status
), end64(groupNum
, status
), &status
);
2201 if (nextOutputStringStart
== fActiveLimit
) {
2202 // The delimiter was at the end of the string. We're done, but first
2203 // we output one last empty string, for the empty field following
2204 // the delimiter at the end of input.
2205 if (i
+1 < destCapacity
) {
2207 if (dest
[i
] == NULL
) {
2208 dest
[i
] = utext_openUChars(NULL
, NULL
, 0, &status
);
2210 static const UChar emptyString
[] = {(UChar
)0};
2211 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), emptyString
, 0, &status
);
2220 // We ran off the end of the input while looking for the next delimiter.
2221 // All the remaining text goes into the current output string.
2222 if (UTEXT_FULL_TEXT_IN_CHUNK(input
, fInputLength
)) {
2224 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]),
2225 input
->chunkContents
+nextOutputStringStart
,
2226 (int32_t)(fActiveLimit
-nextOutputStringStart
), &status
);
2228 UText remainingText
= UTEXT_INITIALIZER
;
2229 utext_openUChars(&remainingText
, input
->chunkContents
+nextOutputStringStart
,
2230 fActiveLimit
-nextOutputStringStart
, &status
);
2231 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2232 utext_close(&remainingText
);
2235 UErrorCode lengthStatus
= U_ZERO_ERROR
;
2236 int32_t remaining16Length
= utext_extract(input
, nextOutputStringStart
, fActiveLimit
, NULL
, 0, &lengthStatus
);
2237 UChar
*remainingChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(remaining16Length
+1));
2238 if (remainingChars
== NULL
) {
2239 status
= U_MEMORY_ALLOCATION_ERROR
;
2243 utext_extract(input
, nextOutputStringStart
, fActiveLimit
, remainingChars
, remaining16Length
+1, &status
);
2245 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), remainingChars
, remaining16Length
, &status
);
2247 UText remainingText
= UTEXT_INITIALIZER
;
2248 utext_openUChars(&remainingText
, remainingChars
, remaining16Length
, &status
);
2249 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2250 utext_close(&remainingText
);
2253 uprv_free(remainingChars
);
2257 if (U_FAILURE(status
)) {
2260 } // end of for loop
2265 //--------------------------------------------------------------------------------
2269 //--------------------------------------------------------------------------------
// start(status): convenience overload; forwards to start(0, status), i.e. group 0.
2270 int32_t RegexMatcher::start(UErrorCode
&status
) const {
2271 return start(0, status
);
// start64(status): convenience overload; forwards to start64(0, status), i.e. group 0.
2274 int64_t RegexMatcher::start64(UErrorCode
&status
) const {
2275 return start64(0, status
);
2278 //--------------------------------------------------------------------------------
2280 // start(int32_t group, UErrorCode &status)
2282 //--------------------------------------------------------------------------------
// start64(group, status): start position of a capture group in the current match.
//   group  - group index; values < 0 or > fGroupMap->size() give
//            U_INDEX_OUTOFBOUNDS_ERROR.
//   status - receives fDeferredStatus if that is a failure, or
//            U_REGEX_INVALID_STATE when there is no current match (fMatch == FALSE).
// For a capture group, the group map (indexed by group-1) gives the offset of the
// group's start position within the current frame's fExtra array.
2284 int64_t RegexMatcher::start64(int32_t group
, UErrorCode
&status
) const {
2285 if (U_FAILURE(status
)) {
2288 if (U_FAILURE(fDeferredStatus
)) {
2289 status
= fDeferredStatus
;
2292 if (fMatch
== FALSE
) {
2293 status
= U_REGEX_INVALID_STATE
;
2296 if (group
< 0 || group
> fPattern
->fGroupMap
->size()) {
2297 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
2304 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(group
-1);
2305 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
2306 U_ASSERT(groupOffset
>= 0);
2307 s
= fFrame
->fExtra
[groupOffset
];
// start(group, status): 32-bit wrapper; returns start64(group, status) narrowed to
// int32_t.
2314 int32_t RegexMatcher::start(int32_t group
, UErrorCode
&status
) const {
2315 return (int32_t)start64(group
, status
);
2318 //--------------------------------------------------------------------------------
2320 // useAnchoringBounds
2322 //--------------------------------------------------------------------------------
// useAnchoringBounds(b): store the flag and recompute the anchor bounds.
// With anchoring bounds on, the anchors are the region bounds; otherwise they are
// 0 and fInputLength.
2323 RegexMatcher
&RegexMatcher::useAnchoringBounds(UBool b
) {
2324 fAnchoringBounds
= b
;
2325 fAnchorStart
= (fAnchoringBounds
? fRegionStart
: 0);
2326 fAnchorLimit
= (fAnchoringBounds
? fRegionLimit
: fInputLength
);
2331 //--------------------------------------------------------------------------------
2333 // useTransparentBounds
2335 //--------------------------------------------------------------------------------
// useTransparentBounds(b): store the flag and recompute the look bounds.
// With transparent bounds on, the look bounds are 0 and fInputLength; otherwise
// they are the region bounds.
2336 RegexMatcher
&RegexMatcher::useTransparentBounds(UBool b
) {
2337 fTransparentBounds
= b
;
2338 fLookStart
= (fTransparentBounds
? 0 : fRegionStart
);
2339 fLookLimit
= (fTransparentBounds
? fInputLength
: fRegionLimit
);
2343 //--------------------------------------------------------------------------------
2347 //--------------------------------------------------------------------------------
// setTimeLimit(limit, status): set the matcher's time limit.
//   status - receives fDeferredStatus if that is a failure; can also be set to
//            U_ILLEGAL_ARGUMENT_ERROR.
// NOTE(review): presumably the illegal-argument case is a negative limit - confirm.
2348 void RegexMatcher::setTimeLimit(int32_t limit
, UErrorCode
&status
) {
2349 if (U_FAILURE(status
)) {
2352 if (U_FAILURE(fDeferredStatus
)) {
2353 status
= fDeferredStatus
;
2357 status
= U_ILLEGAL_ARGUMENT_ERROR
;
2364 //--------------------------------------------------------------------------------
2368 //--------------------------------------------------------------------------------
2369 int32_t RegexMatcher::getTimeLimit() const {
2374 //--------------------------------------------------------------------------------
2378 //--------------------------------------------------------------------------------
// setStackLimit(limit, status): set the maximum size of the backtracking stack.
//   limit  - limit in bytes (converted below to 32-bit units for the stack);
//            stored unchanged in fStackLimit.
//   status - receives fDeferredStatus if that is a failure; can also be set to
//            U_ILLEGAL_ARGUMENT_ERROR.
// A capacity of 0 passed to fStack->setMaxCapacity() means unlimited expansion.
// A non-zero limit is never allowed to fall below one stack frame for the pattern.
2379 void RegexMatcher::setStackLimit(int32_t limit
, UErrorCode
&status
) {
2380 if (U_FAILURE(status
)) {
2383 if (U_FAILURE(fDeferredStatus
)) {
2384 status
= fDeferredStatus
;
2388 status
= U_ILLEGAL_ARGUMENT_ERROR
;
2392 // Reset the matcher. This is needed here in case there is a current match
2393 // whose final stack frame (containing the match results, pointed to by fFrame)
2394 // would be lost by resizing to a smaller stack size.
2398 // Unlimited stack expansion
2399 fStack
->setMaxCapacity(0);
2401 // Change the units of the limit from bytes to ints, and bump the size up
2402 // to be big enough to hold at least one stack frame for the pattern,
2403 // if it isn't there already.
2404 int32_t adjustedLimit
= limit
/ sizeof(int32_t);
2405 if (adjustedLimit
< fPattern
->fFrameSize
) {
2406 adjustedLimit
= fPattern
->fFrameSize
;
2408 fStack
->setMaxCapacity(adjustedLimit
);
2410 fStackLimit
= limit
;
2414 //--------------------------------------------------------------------------------
2418 //--------------------------------------------------------------------------------
2419 int32_t RegexMatcher::getStackLimit() const {
2424 //--------------------------------------------------------------------------------
2428 //--------------------------------------------------------------------------------
// setMatchCallback(callback, context, status): store the match callback function
// and its opaque context pointer in fCallbackFn / fCallbackContext.
2429 void RegexMatcher::setMatchCallback(URegexMatchCallback
*callback
,
2430 const void *context
,
2431 UErrorCode
&status
) {
2432 if (U_FAILURE(status
)) {
2435 fCallbackFn
= callback
;
2436 fCallbackContext
= context
;
2440 //--------------------------------------------------------------------------------
2444 //--------------------------------------------------------------------------------
// getMatchCallback(callback, context, status): return, through the reference
// parameters, the match callback function and context currently stored in
// fCallbackFn / fCallbackContext.
2445 void RegexMatcher::getMatchCallback(URegexMatchCallback
*&callback
,
2446 const void *&context
,
2447 UErrorCode
&status
) {
2448 if (U_FAILURE(status
)) {
2451 callback
= fCallbackFn
;
2452 context
= fCallbackContext
;
2456 //--------------------------------------------------------------------------------
2460 //--------------------------------------------------------------------------------
// setFindProgressCallback(callback, context, status): store the find-progress
// callback function and its opaque context pointer in fFindProgressCallbackFn /
// fFindProgressCallbackContext.
2461 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback
*callback
,
2462 const void *context
,
2463 UErrorCode
&status
) {
2464 if (U_FAILURE(status
)) {
2467 fFindProgressCallbackFn
= callback
;
2468 fFindProgressCallbackContext
= context
;
2472 //--------------------------------------------------------------------------------
2476 //--------------------------------------------------------------------------------
// getFindProgressCallback(callback, context, status): return, through the reference
// parameters, the find-progress callback function and context currently stored in
// fFindProgressCallbackFn / fFindProgressCallbackContext.
2477 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback
*&callback
,
2478 const void *&context
,
2479 UErrorCode
&status
) {
2480 if (U_FAILURE(status
)) {
2483 callback
= fFindProgressCallbackFn
;
2484 context
= fFindProgressCallbackContext
;
2488 //================================================================================
2490 // Code following this point in this file is the internal
2491 // Match Engine Implementation.
2493 //================================================================================
2496 //--------------------------------------------------------------------------------
2499 // Discard any previous contents of the state save stack, and initialize a
2500 // new stack frame to all -1. The -1s are needed for capture group limits,
2501 // where they indicate that a group has not yet matched anything.
2502 //--------------------------------------------------------------------------------
// resetStack(): empty the state-save stack and reserve one fresh frame of
// fPattern->fFrameSize elements. Allocation errors are recorded in fDeferredStatus.
// Every fExtra slot of the new frame (frame size minus the header count) is set to -1.
2503 REStackFrame
*RegexMatcher::resetStack() {
2504 // Discard any previous contents of the state save stack, and initialize a
2505 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2506 // where they indicate that a group has not yet matched anything.
2507 fStack
->removeAllElements();
2509 REStackFrame
*iFrame
= (REStackFrame
*)fStack
->reserveBlock(fPattern
->fFrameSize
, fDeferredStatus
);
2510 if(U_FAILURE(fDeferredStatus
)) {
2515 for (i
=0; i
<fPattern
->fFrameSize
-RESTACKFRAME_HDRCOUNT
; i
++) {
2516 iFrame
->fExtra
[i
] = -1;
2523 //--------------------------------------------------------------------------------
2526 // in perl, "xab..cd..", \b is true at positions 0,3,5,7
2528 // If the current char is a combining mark,
2530 // Else Scan backwards to the first non-combining char.
2531 // We are at a boundary if this char and the original char are
2532 // opposite in membership in \w set
2534 // parameters: pos - the current position in the input buffer
2536 // TODO: double-check edge cases at region boundaries.
2538 //--------------------------------------------------------------------------------
// Word-boundary test (\b) at native index 'pos' of fInputText, using UText
// access macros. The result computed in the visible lines is
// cIsWord XOR prevCIsWord, where cIsWord is URX_ISWORD_SET membership of the
// char at 'pos' and prevCIsWord is that of a preceding char which is neither
// GRAPHEME_EXTEND nor a format char.
// NOTE(review): several statements (the branch taken when pos >= fLookLimit,
// the backward-scan loop header, its exits and the return) are not visible
// in this excerpt; the notes below describe only what the visible lines show.
2539 UBool
RegexMatcher::isWordBoundary(int64_t pos
) {
2540 UBool isBoundary
= FALSE
;
2541 UBool cIsWord
= FALSE
;
// At or beyond the look-ahead limit there is no current char to classify;
// cIsWord keeps its FALSE initializer unless assigned below.
2543 if (pos
>= fLookLimit
) {
2546 // Determine whether char c at current position is a member of the word set of chars.
2547 // If we're off the end of the string, behave as though we're not at a word char.
2548 UTEXT_SETNATIVEINDEX(fInputText
, pos
);
2549 UChar32 c
= UTEXT_CURRENT32(fInputText
);
// GRAPHEME_EXTEND chars and format chars are treated as "combining" here.
2550 if (u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
) || u_charType(c
) == U_FORMAT_CHAR
) {
2551 // Current char is a combining one. Not a boundary.
2554 cIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(c
);
2557 // Back up until we come to a non-combining char, determine whether
2558 // that char is a word char.
2559 UBool prevCIsWord
= FALSE
;
// Stop condition for the backward scan: the iterator has reached fLookStart,
// so there is no earlier char and prevCIsWord stays FALSE.
2561 if (UTEXT_GETNATIVEINDEX(fInputText
) <= fLookStart
) {
2564 UChar32 prevChar
= UTEXT_PREVIOUS32(fInputText
);
// Only a char that is neither GRAPHEME_EXTEND nor a format char is
// classified against the word set.
2565 if (!(u_hasBinaryProperty(prevChar
, UCHAR_GRAPHEME_EXTEND
)
2566 || u_charType(prevChar
) == U_FORMAT_CHAR
)) {
2567 prevCIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(prevChar
);
// Boundary exactly when the two sides differ in word-set membership.
2571 isBoundary
= cIsWord
^ prevCIsWord
;
// Variant of the \b word-boundary test that reads UTF-16 code units directly
// from fInputText->chunkContents with the U16_GET / U16_PREV macros instead
// of going through the UText access macros. 'pos' is a 32-bit index used
// directly as an offset into that buffer.
// The result computed in the visible lines is cIsWord XOR prevCIsWord.
// NOTE(review): the declarations of 'c' and 'prevChar', the branch taken when
// pos >= fLookLimit, the backward-scan loop header, its exits and the return
// are not visible in this excerpt.
2575 UBool
RegexMatcher::isChunkWordBoundary(int32_t pos
) {
2576 UBool isBoundary
= FALSE
;
2577 UBool cIsWord
= FALSE
;
2579 const UChar
*inputBuf
= fInputText
->chunkContents
;
// At or beyond the look-ahead limit there is no current char to classify;
// cIsWord keeps its FALSE initializer unless assigned below.
2581 if (pos
>= fLookLimit
) {
2584 // Determine whether char c at current position is a member of the word set of chars.
2585 // If we're off the end of the string, behave as though we're not at a word char.
// Fetch the code point at 'pos' without moving 'pos'; fLookStart and
// fLookLimit are passed as the buffer bounds for surrogate-pair handling.
2587 U16_GET(inputBuf
, fLookStart
, pos
, fLookLimit
, c
);
// GRAPHEME_EXTEND chars and format chars are treated as "combining" here.
2588 if (u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
) || u_charType(c
) == U_FORMAT_CHAR
) {
2589 // Current char is a combining one. Not a boundary.
2592 cIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(c
);
2595 // Back up until we come to a non-combining char, determine whether
2596 // that char is a word char.
2597 UBool prevCIsWord
= FALSE
;
// Stop condition for the backward scan: 'pos' has reached fLookStart,
// so there is no earlier char and prevCIsWord stays FALSE.
2599 if (pos
<= fLookStart
) {
// U16_PREV moves 'pos' back over one code point and stores it in prevChar.
2603 U16_PREV(inputBuf
, fLookStart
, pos
, prevChar
);
2604 if (!(u_hasBinaryProperty(prevChar
, UCHAR_GRAPHEME_EXTEND
)
2605 || u_charType(prevChar
) == U_FORMAT_CHAR
)) {
2606 prevCIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(prevChar
);
// Boundary exactly when the two sides differ in word-set membership.
2610 isBoundary
= cIsWord
^ prevCIsWord
;
2614 //--------------------------------------------------------------------------------
2618 // Test for a word boundary using RBBI word break.
2620 // parameters: pos - the current position in the input buffer
2622 //--------------------------------------------------------------------------------
// Word-boundary test that delegates to a word BreakIterator (fWordBreakItr)
// instead of the \w-set comparison used by isWordBoundary().
//   pos - position in fInputText to test.
// returnVal starts as FALSE; the break-iterator logic is compiled only when
// UCONFIG_NO_BREAK_ITERATION==0.
// NOTE(review): the matching #endif, the return statement and several branch
// bodies are not visible in this excerpt.
2623 UBool
RegexMatcher::isUWordBoundary(int64_t pos
) {
2624 UBool returnVal
= FALSE
;
2625 #if UCONFIG_NO_BREAK_ITERATION==0
2627 // If we haven't yet created a break iterator for this matcher, do it now.
2628 if (fWordBreakItr
== NULL
) {
// The iterator is created for the English locale and cast to
// RuleBasedBreakIterator; creation errors are recorded in fDeferredStatus.
// NOTE(review): the assignment target of this expression is on a line that
// is not visible here - presumably fWordBreakItr.
2630 (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus
);
2631 if (U_FAILURE(fDeferredStatus
)) {
// Point the iterator at the matcher's input text.
2634 fWordBreakItr
->setText(fInputText
, fDeferredStatus
);
2637 if (pos
>= fLookLimit
) {
2639 returnVal
= TRUE
; // With Unicode word rules, only positions within the interior of "real"
2640 // words are not boundaries. All non-word chars stand by themselves,
2641 // with word boundaries on both sides.
// For text whose native indexes are not UTF-16 offsets, 'pos' is replaced by
// the value utext_extract() returns for the range [0, pos) when given a NULL
// destination of capacity 0. The local 'status' is not examined afterwards
// in the visible lines.
// NOTE(review): presumably this is a preflight call that yields the UTF-16
// length of the prefix - confirm against the utext_extract documentation.
2643 if (!UTEXT_USES_U16(fInputText
)) {
2644 // !!!: Would like a better way to do this!
2645 UErrorCode status
= U_ZERO_ERROR
;
2646 pos
= utext_extract(fInputText
, 0, pos
, NULL
, 0, &status
);
// 'pos' is narrowed from int64_t to int32_t for the isBoundary() call.
2648 returnVal
= fWordBreakItr
->isBoundary((int32_t)pos
);
2654 //--------------------------------------------------------------------------------
2656 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state
2657 // saves. Increment the "time" counter, and call the
2658 // user callback function if there is one installed.
2660 // If the match operation needs to be aborted, either for a time-out
2661 // or because the user callback asked for it, just set an error status.
2662 // The engine will pick that up and stop in its outer loop.
2664 //--------------------------------------------------------------------------------
// Timer tick handler: re-arms the tick counter, consults the user callback
// (if one is installed) and enforces the time limit.
//   status - set to U_REGEX_STOPPED_BY_CALLER when the callback returns FALSE,
//            or to U_REGEX_TIME_OUT when the limit test below succeeds.
// NOTE(review): the statement that advances fTime is not visible in this
// excerpt; the function's header comment says the "time" counter is
// incremented here - confirm.
2665 void RegexMatcher::IncrementTime(UErrorCode
&status
) {
// Restart the countdown to the next tick.
2666 fTickCounter
= TIMER_INITIAL_VALUE
;
// The callback receives its registered context and the current fTime value;
// a FALSE return is a request to abort the match.
2668 if (fCallbackFn
!= NULL
) {
2669 if ((*fCallbackFn
)(fCallbackContext
, fTime
) == FALSE
) {
2670 status
= U_REGEX_STOPPED_BY_CALLER
;
// The limit is enforced only when fTimeLimit is positive.
2674 if (fTimeLimit
> 0 && fTime
>= fTimeLimit
) {
2675 status
= U_REGEX_TIME_OUT
;
2679 //--------------------------------------------------------------------------------
2682 // Make a new stack frame, initialized as a copy of the current stack frame.
2683 // Set the pattern index in the original stack frame from the operand value
2684 // in the opcode. Execution of the engine continues with the state in
2685 // the newly created stack frame
2687 // Note that reserveBlock() may grow the stack, resulting in the
2688 // whole thing being relocated in memory.
2691 // fp The top frame pointer when called. At return, a new
2692 // frame will be present
2693 // savePatIdx An index into the compiled pattern. Goes into the original
2694 // (not new) frame. If execution ever back-tracks out of the
2695 // new frame, this will be where we continue from in the pattern.
2697 // The new frame pointer.
2699 //--------------------------------------------------------------------------------
// Pushes a new backtrack frame that is a copy of the current top frame.
//   fp         - the current top frame.
//   savePatIdx - pattern index written into the ORIGINAL frame's fPatIdx
//                after the copy has been made.
//   status     - incoming/outgoing error code; a failed stack reservation is
//                rewritten to U_REGEX_STACK_OVERFLOW.
// Returns the new frame (newFP) on the visible success path.
// NOTE(review): the early-return statements, the copy-loop header and any
// tick-counter decrement are not visible in this excerpt.
2700 inline REStackFrame
*RegexMatcher::StateSave(REStackFrame
*fp
, int64_t savePatIdx
, UErrorCode
&status
) {
2701 if (U_FAILURE(status
)) {
2704 // push storage for a new frame.
2705 int64_t *newFP
= fStack
->reserveBlock(fFrameSize
, status
);
2706 if (U_FAILURE(status
)) {
2707 // Failure on attempted stack expansion.
2708 // Stack function set some other error code, change it to a more
2709 // specific one for regular expressions.
2710 status
= U_REGEX_STACK_OVERFLOW
;
2711 // We need to return a writable stack frame, so just return the
2712 // previous frame. The match operation will stop quickly
2713 // because of the error status, after which the frame will never
2714 // be looked at again.
// Recompute fp from newFP rather than trusting the incoming pointer: the old
// top frame is the fFrameSize words immediately below the new block.
2717 fp
= (REStackFrame
*)(newFP
- fFrameSize
); // in case of realloc of stack.
2719 // New stack frame = copy of old top frame.
2720 int64_t *source
= (int64_t *)fp
;
2721 int64_t *dest
= newFP
;
// Word-by-word copy; the test below fires once 'source' has advanced to
// newFP, i.e. after the whole old frame has been copied.
2723 *dest
++ = *source
++;
2724 if (source
== newFP
) {
// A non-positive tick counter triggers the time-limit / callback check.
2730 if (fTickCounter
<= 0) {
2731 IncrementTime(status
); // Re-initializes fTickCounter
// The saved pattern index goes into the old frame (fp), not the new one.
2733 fp
->fPatIdx
= savePatIdx
;
2734 return (REStackFrame
*)newFP
;
2737 #if defined(REGEX_DEBUG)
// Debug helper (sits between the REGEX_DEBUG preprocessor lines) that walks
// a UText by code point, starting at native index 0 and stopping at
// U_SENTINEL, and returns a UnicodeString per the signature.
// NOTE(review): the loop body and return statement are not visible in this
// excerpt; presumably each code point is appended to 'result', which is then
// returned - confirm.
2739 UnicodeString
StringFromUText(UText
*ut
) {
2740 UnicodeString result
;
2741 for (UChar32 c
= utext_next32From(ut
, 0); c
!= U_SENTINEL
; c
= UTEXT_NEXT32(ut
)) {
2747 #endif // REGEX_DEBUG
2750 //--------------------------------------------------------------------------------
2752 // MatchAt This is the actual matching engine.
2754 // startIdx: begin matching at this index.
2755 // toEnd: if true, match must extend to end of the input region
2757 //--------------------------------------------------------------------------------
2758 void RegexMatcher::MatchAt(int64_t startIdx
, UBool toEnd
, UErrorCode
&status
) {
2759 UBool isMatch
= FALSE
; // True if the we have a match.
2761 int64_t backSearchIndex
= U_INT64_MAX
; // used after greedy single-character matches for searching backwards
2763 int32_t op
; // Operation from the compiled pattern, split into
2764 int32_t opType
; // the opcode
2765 int32_t opValue
; // and the operand value.
2767 #ifdef REGEX_RUN_DEBUG
2769 printf("MatchAt(startIdx=%ld)\n", startIdx
);
2770 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern
->fPattern
))());
2771 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText
))());
2775 if (U_FAILURE(status
)) {
2779 // Cache frequently referenced items from the compiled pattern
2781 int64_t *pat
= fPattern
->fCompiledPat
->getBuffer();
2783 const UChar
*litText
= fPattern
->fLiteralText
.getBuffer();
2784 UVector
*fSets
= fPattern
->fSets
;
2786 fFrameSize
= fPattern
->fFrameSize
;
2787 REStackFrame
*fp
= resetStack();
2788 if (U_FAILURE(fDeferredStatus
)) {
2789 status
= fDeferredStatus
;
2794 fp
->fInputIdx
= startIdx
;
2796 // Zero out the pattern's static data
2798 for (i
= 0; i
<fPattern
->fDataSize
; i
++) {
2803 // Main loop for interpreting the compiled pattern.
2804 // One iteration of the loop per pattern operation performed.
2807 op
= (int32_t)pat
[fp
->fPatIdx
];
2808 opType
= URX_TYPE(op
);
2809 opValue
= URX_VAL(op
);
2810 #ifdef REGEX_RUN_DEBUG
2812 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2813 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp
->fInputIdx
,
2814 UTEXT_CURRENT32(fInputText
), (int64_t *)fp
-fStack
->getBuffer(), fActiveLimit
);
2815 fPattern
->dumpOp(fp
->fPatIdx
);
2828 // Force a backtrack. In some circumstances, the pattern compiler
2829 // will notice that the pattern can't possibly match anything, and will
2830 // emit one of these at that point.
2831 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2836 if (fp
->fInputIdx
< fActiveLimit
) {
2837 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2838 UChar32 c
= UTEXT_NEXT32(fInputText
);
2840 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
2846 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2852 // Test input against a literal string.
2853 // Strings require two slots in the compiled pattern, one for the
2854 // offset to the string text, and one for the length.
2856 int32_t stringStartIdx
= opValue
;
2857 op
= (int32_t)pat
[fp
->fPatIdx
]; // Fetch the second operand
2859 opType
= URX_TYPE(op
);
2860 int32_t stringLen
= URX_VAL(op
);
2861 U_ASSERT(opType
== URX_STRING_LEN
);
2862 U_ASSERT(stringLen
>= 2);
2864 const UChar
*patternString
= litText
+stringStartIdx
;
2865 int32_t patternStringIndex
= 0;
2866 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2868 UChar32 patternChar
;
2869 UBool success
= TRUE
;
2870 while (patternStringIndex
< stringLen
) {
2871 if (UTEXT_GETNATIVEINDEX(fInputText
) >= fActiveLimit
) {
2876 inputChar
= UTEXT_NEXT32(fInputText
);
2877 U16_NEXT(patternString
, patternStringIndex
, stringLen
, patternChar
);
2878 if (patternChar
!= inputChar
) {
2885 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
2887 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2893 case URX_STATE_SAVE
:
2894 fp
= StateSave(fp
, opValue
, status
);
2899 // The match loop will exit via this path on a successful match,
2900 // when we reach the end of the pattern.
2901 if (toEnd
&& fp
->fInputIdx
!= fActiveLimit
) {
2902 // The pattern matched, but not to the end of input. Try some more.
2903 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2909 // Start and End Capture stack frame variables are laid out like this:
2910 // fp->fExtra[opValue] - The start of a completed capture group
2911 // opValue+1 - The end of a completed capture group
2912 // opValue+2 - the start of a capture group whose end
2913 // has not yet been reached (and might not ever be).
2914 case URX_START_CAPTURE
:
2915 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
2916 fp
->fExtra
[opValue
+2] = fp
->fInputIdx
;
2920 case URX_END_CAPTURE
:
2921 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
2922 U_ASSERT(fp
->fExtra
[opValue
+2] >= 0); // Start pos for this group must be set.
2923 fp
->fExtra
[opValue
] = fp
->fExtra
[opValue
+2]; // Tentative start becomes real.
2924 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // End position
2925 U_ASSERT(fp
->fExtra
[opValue
] <= fp
->fExtra
[opValue
+1]);
2929 case URX_DOLLAR
: // $, test for End of line
2930 // or for position before new line at end of input
2932 if (fp
->fInputIdx
>= fAnchorLimit
) {
2933 // We really are at the end of input. Success.
2939 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2941 // If we are positioned just before a new-line that is located at the
2942 // end of input, succeed.
2943 UChar32 c
= UTEXT_NEXT32(fInputText
);
2944 if (UTEXT_GETNATIVEINDEX(fInputText
) >= fAnchorLimit
) {
2945 if (isLineTerminator(c
)) {
2946 // If not in the middle of a CR/LF sequence
2947 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& ((void)UTEXT_PREVIOUS32(fInputText
), UTEXT_PREVIOUS32(fInputText
))==0x0d)) {
2948 // At new-line at end of input. Success
2956 UChar32 nextC
= UTEXT_NEXT32(fInputText
);
2957 if (c
== 0x0d && nextC
== 0x0a && UTEXT_GETNATIVEINDEX(fInputText
) >= fAnchorLimit
) {
2960 break; // At CR/LF at end of input. Success
2964 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2969 case URX_DOLLAR_D
: // $, test for End of Line, in UNIX_LINES mode.
2970 if (fp
->fInputIdx
>= fAnchorLimit
) {
2971 // Off the end of input. Success.
2976 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2977 UChar32 c
= UTEXT_NEXT32(fInputText
);
2978 // Either at the last character of input, or off the end.
2979 if (c
== 0x0a && UTEXT_GETNATIVEINDEX(fInputText
) == fAnchorLimit
) {
2986 // Not at end of input. Back-track out.
2987 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2991 case URX_DOLLAR_M
: // $, test for End of line in multi-line mode
2993 if (fp
->fInputIdx
>= fAnchorLimit
) {
2994 // We really are at the end of input. Success.
2999 // If we are positioned just before a new-line, succeed.
3000 // It makes no difference where the new-line is within the input.
3001 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3002 UChar32 c
= UTEXT_CURRENT32(fInputText
);
3003 if (isLineTerminator(c
)) {
3004 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
3005 // In multi-line mode, hitting a new-line just before the end of input does not
3006 // set the hitEnd or requireEnd flags
3007 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& UTEXT_PREVIOUS32(fInputText
)==0x0d)) {
3011 // not at a new line. Fail.
3012 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3017 case URX_DOLLAR_MD
: // $, test for End of line in multi-line and UNIX_LINES mode
3019 if (fp
->fInputIdx
>= fAnchorLimit
) {
3020 // We really are at the end of input. Success.
3022 fRequireEnd
= TRUE
; // Java set requireEnd in this case, even though
3023 break; // adding a new-line would not lose the match.
3025 // If we are not positioned just before a new-line, the test fails; backtrack out.
3026 // It makes no difference where the new-line is within the input.
3027 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3028 if (UTEXT_CURRENT32(fInputText
) != 0x0a) {
3029 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3035 case URX_CARET
: // ^, test for start of line
3036 if (fp
->fInputIdx
!= fAnchorStart
) {
3037 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3042 case URX_CARET_M
: // ^, test for start of line in mulit-line mode
3044 if (fp
->fInputIdx
== fAnchorStart
) {
3045 // We are at the start of input. Success.
3048 // Check whether character just before the current pos is a new-line
3049 // unless we are at the end of input
3050 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3051 UChar32 c
= UTEXT_PREVIOUS32(fInputText
);
3052 if ((fp
->fInputIdx
< fAnchorLimit
) && isLineTerminator(c
)) {
3053 // It's a new-line. ^ is true. Success.
3054 // TODO: what should be done with positions between a CR and LF?
3057 // Not at the start of a line. Fail.
3058 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3063 case URX_CARET_M_UNIX
: // ^, test for start of line in mulit-line + Unix-line mode
3065 U_ASSERT(fp
->fInputIdx
>= fAnchorStart
);
3066 if (fp
->fInputIdx
<= fAnchorStart
) {
3067 // We are at the start of input. Success.
3070 // Check whether character just before the current pos is a new-line
3071 U_ASSERT(fp
->fInputIdx
<= fAnchorLimit
);
3072 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3073 UChar32 c
= UTEXT_PREVIOUS32(fInputText
);
3075 // Not at the start of a line. Back-track out.
3076 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3081 case URX_BACKSLASH_B
: // Test for word boundaries
3083 UBool success
= isWordBoundary(fp
->fInputIdx
);
3084 success
^= (UBool
)(opValue
!= 0); // flip sense for \B
3086 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3092 case URX_BACKSLASH_BU
: // Test for word boundaries, Unicode-style
3094 UBool success
= isUWordBoundary(fp
->fInputIdx
);
3095 success
^= (UBool
)(opValue
!= 0); // flip sense for \B
3097 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3103 case URX_BACKSLASH_D
: // Test for decimal digit
3105 if (fp
->fInputIdx
>= fActiveLimit
) {
3107 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3111 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3113 UChar32 c
= UTEXT_NEXT32(fInputText
);
3114 int8_t ctype
= u_charType(c
); // TODO: make a unicode set for this. Will be faster.
3115 UBool success
= (ctype
== U_DECIMAL_DIGIT_NUMBER
);
3116 success
^= (UBool
)(opValue
!= 0); // flip sense for \D
3118 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3120 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3126 case URX_BACKSLASH_G
: // Test for position at end of previous match
3127 if (!((fMatch
&& fp
->fInputIdx
==fMatchEnd
) || (fMatch
==FALSE
&& fp
->fInputIdx
==fActiveStart
))) {
3128 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3133 case URX_BACKSLASH_H
: // Test for \h, horizontal white space.
3135 if (fp
->fInputIdx
>= fActiveLimit
) {
3137 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3140 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3141 UChar32 c
= UTEXT_NEXT32(fInputText
);
3142 int8_t ctype
= u_charType(c
);
3143 UBool success
= (ctype
== U_SPACE_SEPARATOR
|| c
== 9); // SPACE_SEPARATOR || TAB
3144 success
^= (UBool
)(opValue
!= 0); // flip sense for \H
3146 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3148 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3154 case URX_BACKSLASH_R
: // Test for \R, any line break sequence.
3156 if (fp
->fInputIdx
>= fActiveLimit
) {
3158 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3161 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3162 UChar32 c
= UTEXT_NEXT32(fInputText
);
3163 if (isLineTerminator(c
)) {
3164 if (c
== 0x0d && utext_current32(fInputText
) == 0x0a) {
3165 utext_next32(fInputText
);
3167 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3169 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3175 case URX_BACKSLASH_V
: // \v, any single line ending character.
3177 if (fp
->fInputIdx
>= fActiveLimit
) {
3179 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3182 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3183 UChar32 c
= UTEXT_NEXT32(fInputText
);
3184 UBool success
= isLineTerminator(c
);
3185 success
^= (UBool
)(opValue
!= 0); // flip sense for \V
3187 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3189 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3195 case URX_BACKSLASH_X
:
3196 // Match a Grapheme, as defined by Unicode TR 29.
3197 // Differs slightly from Perl, which consumes combining marks independently
3201 // Fail if at end of input
3202 if (fp
->fInputIdx
>= fActiveLimit
) {
3204 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3208 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3210 // Examine (and consume) the current char.
3211 // Dispatch into a little state machine, based on the char.
3213 c
= UTEXT_NEXT32(fInputText
);
3214 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3215 UnicodeSet
**sets
= fPattern
->fStaticSets
;
3216 if (sets
[URX_GC_NORMAL
]->contains(c
)) goto GC_Extend
;
3217 if (sets
[URX_GC_CONTROL
]->contains(c
)) goto GC_Control
;
3218 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
3219 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
3220 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
3221 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
3222 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
3228 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
3229 c
= UTEXT_NEXT32(fInputText
);
3230 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3231 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
3232 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
3233 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
3234 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
3235 (void)UTEXT_PREVIOUS32(fInputText
);
3236 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3240 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
3241 c
= UTEXT_NEXT32(fInputText
);
3242 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3243 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
3244 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
3245 (void)UTEXT_PREVIOUS32(fInputText
);
3246 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3250 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
3251 c
= UTEXT_NEXT32(fInputText
);
3252 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3253 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
3254 (void)UTEXT_PREVIOUS32(fInputText
);
3255 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3259 // Combining characters are consumed here
3261 if (fp
->fInputIdx
>= fActiveLimit
) {
3264 c
= UTEXT_CURRENT32(fInputText
);
3265 if (sets
[URX_GC_EXTEND
]->contains(c
) == FALSE
) {
3268 (void)UTEXT_NEXT32(fInputText
);
3269 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3274 // Most control chars stand alone (don't combine with combining chars),
3275 // except for that CR/LF sequence is a single grapheme cluster.
3276 if (c
== 0x0d && fp
->fInputIdx
< fActiveLimit
&& UTEXT_CURRENT32(fInputText
) == 0x0a) {
3277 c
= UTEXT_NEXT32(fInputText
);
3278 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3282 if (fp
->fInputIdx
>= fActiveLimit
) {
3291 case URX_BACKSLASH_Z
: // Test for end of Input
3292 if (fp
->fInputIdx
< fAnchorLimit
) {
3293 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3302 case URX_STATIC_SETREF
:
3304 // Test input character against one of the predefined sets
3305 // (Word Characters, for example)
3306 // The high bit of the op value is a flag for the match polarity.
3307 // 0: success if input char is in set.
3308 // 1: success if input char is not in set.
3309 if (fp
->fInputIdx
>= fActiveLimit
) {
3311 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3315 UBool success
= ((opValue
& URX_NEG_SET
) == URX_NEG_SET
);
3316 opValue
&= ~URX_NEG_SET
;
3317 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
3319 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3320 UChar32 c
= UTEXT_NEXT32(fInputText
);
3322 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
3323 if (s8
->contains(c
)) {
3327 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
3328 if (s
->contains(c
)) {
3333 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3335 // the character wasn't in the set.
3336 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3342 case URX_STAT_SETREF_N
:
3344 // Test input character for NOT being a member of one of
3345 // the predefined sets (Word Characters, for example)
3346 if (fp
->fInputIdx
>= fActiveLimit
) {
3348 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3352 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
3354 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3356 UChar32 c
= UTEXT_NEXT32(fInputText
);
3358 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
3359 if (s8
->contains(c
) == FALSE
) {
3360 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3364 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
3365 if (s
->contains(c
) == FALSE
) {
3366 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3370 // the character wasn't in the set.
3371 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3377 if (fp
->fInputIdx
>= fActiveLimit
) {
3379 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3382 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3384 // There is input left. Pick up one char and test it for set membership.
3385 UChar32 c
= UTEXT_NEXT32(fInputText
);
3386 U_ASSERT(opValue
> 0 && opValue
< fSets
->size());
3388 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
3389 if (s8
->contains(c
)) {
3390 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3394 UnicodeSet
*s
= (UnicodeSet
*)fSets
->elementAt(opValue
);
3395 if (s
->contains(c
)) {
3396 // The character is in the set. A Match.
3397 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3402 // the character wasn't in the set.
3403 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3410 // . matches anything, but stops at end-of-line.
3411 if (fp
->fInputIdx
>= fActiveLimit
) {
3412 // At end of input. Match failed. Backtrack out.
3414 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3418 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3420 // There is input left. Advance over one char, unless we've hit end-of-line
3421 UChar32 c
= UTEXT_NEXT32(fInputText
);
3422 if (isLineTerminator(c
)) {
3423 // End of line in normal mode. . does not match.
3424 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3427 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3432 case URX_DOTANY_ALL
:
3434 // ., in dot-matches-all (including new lines) mode
3435 if (fp
->fInputIdx
>= fActiveLimit
) {
3436 // At end of input. Match failed. Backtrack out.
3438 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3442 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3444 // There is input left. Advance over one char, except if we are
3445 // at a cr/lf, advance over both of them.
3447 c
= UTEXT_NEXT32(fInputText
);
3448 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3449 if (c
==0x0d && fp
->fInputIdx
< fActiveLimit
) {
3450 // In the case of a CR/LF, we need to advance over both.
3451 UChar32 nextc
= UTEXT_CURRENT32(fInputText
);
3452 if (nextc
== 0x0a) {
3453 (void)UTEXT_NEXT32(fInputText
);
3454 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3461 case URX_DOTANY_UNIX
:
3463 // '.' operator, matches all, but stops at end-of-line.
3464 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
3465 if (fp
->fInputIdx
>= fActiveLimit
) {
3466 // At end of input. Match failed. Backtrack out.
3468 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3472 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3474 // There is input left. Advance over one char, unless we've hit end-of-line
3475 UChar32 c
= UTEXT_NEXT32(fInputText
);
3477 // End of line in normal mode. '.' does not match the \n
3478 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3480 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3487 fp
->fPatIdx
= opValue
;
3495 U_ASSERT(opValue
< fPattern
->fCompiledPat
->size());
3496 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
3497 fp
->fPatIdx
= opValue
; // Then JMP.
3501 // This opcode is used with (x)+, when x can match a zero length string.
3502 // Same as JMP_SAV, except conditional on the match having made forward progress.
3503 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3504 // data address of the input position at the start of the loop.
3506 U_ASSERT(opValue
> 0 && opValue
< fPattern
->fCompiledPat
->size());
3507 int32_t stoOp
= (int32_t)pat
[opValue
-1];
3508 U_ASSERT(URX_TYPE(stoOp
) == URX_STO_INP_LOC
);
3509 int32_t frameLoc
= URX_VAL(stoOp
);
3510 U_ASSERT(frameLoc
>= 0 && frameLoc
< fFrameSize
);
3511 int64_t prevInputIdx
= fp
->fExtra
[frameLoc
];
3512 U_ASSERT(prevInputIdx
<= fp
->fInputIdx
);
3513 if (prevInputIdx
< fp
->fInputIdx
) {
3514 // The match did make progress. Repeat the loop.
3515 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
3516 fp
->fPatIdx
= opValue
;
3517 fp
->fExtra
[frameLoc
] = fp
->fInputIdx
;
3519 // If the input position did not advance, we do nothing here,
3520 // execution will fall out of the loop.
3526 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
3527 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
3529 // Pick up the three extra operands that CTR_INIT has, and
3530 // skip the pattern location counter past
3531 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
3533 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
3534 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
3535 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
3536 U_ASSERT(minCount
>=0);
3537 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
3538 U_ASSERT(loopLoc
>=fp
->fPatIdx
);
3540 if (minCount
== 0) {
3541 fp
= StateSave(fp
, loopLoc
+1, status
);
3543 if (maxCount
== -1) {
3544 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // For loop breaking.
3545 } else if (maxCount
== 0) {
3546 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3553 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
3554 int32_t initOp
= (int32_t)pat
[opValue
];
3555 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT
);
3556 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
3557 int32_t minCount
= (int32_t)pat
[opValue
+2];
3558 int32_t maxCount
= (int32_t)pat
[opValue
+3];
3560 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
&& maxCount
!= -1) {
3561 U_ASSERT(*pCounter
== maxCount
);
3564 if (*pCounter
>= minCount
) {
3565 if (maxCount
== -1) {
3566 // Loop has no hard upper bound.
3567 // Check that it is progressing through the input, break if it is not.
3568 int64_t *pLastInputIdx
= &fp
->fExtra
[URX_VAL(initOp
) + 1];
3569 if (fp
->fInputIdx
== *pLastInputIdx
) {
3572 *pLastInputIdx
= fp
->fInputIdx
;
3575 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
3577 // Increment time-out counter. (StateSave() does it if count >= minCount)
3579 if (fTickCounter
<= 0) {
3580 IncrementTime(status
); // Re-initializes fTickCounter
3584 fp
->fPatIdx
= opValue
+ 4; // Loop back.
3588 case URX_CTR_INIT_NG
:
3590 // Initialize a non-greedy loop
3591 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
3592 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
3594 // Pick up the three extra operands that CTR_INIT_NG has, and
3595 // skip the pattern location counter past
3596 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
3598 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
3599 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
3600 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
3601 U_ASSERT(minCount
>=0);
3602 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
3603 U_ASSERT(loopLoc
>fp
->fPatIdx
);
3604 if (maxCount
== -1) {
3605 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // Save initial input index for loop breaking.
3608 if (minCount
== 0) {
3609 if (maxCount
!= 0) {
3610 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
3612 fp
->fPatIdx
= loopLoc
+1; // Continue with stuff after repeated block
3617 case URX_CTR_LOOP_NG
:
3619 // Non-greedy {min, max} loops
3620 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
3621 int32_t initOp
= (int32_t)pat
[opValue
];
3622 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT_NG
);
3623 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
3624 int32_t minCount
= (int32_t)pat
[opValue
+2];
3625 int32_t maxCount
= (int32_t)pat
[opValue
+3];
3628 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
&& maxCount
!= -1) {
3629 // The loop has matched the maximum permitted number of times.
3630 // Break out of here with no action. Matching will
3631 // continue with the following pattern.
3632 U_ASSERT(*pCounter
== maxCount
);
3636 if (*pCounter
< minCount
) {
3637 // We haven't met the minimum number of matches yet.
3638 // Loop back for another one.
3639 fp
->fPatIdx
= opValue
+ 4; // Loop back.
3640 // Increment time-out counter. (StateSave() does it if count >= minCount)
3642 if (fTickCounter
<= 0) {
3643 IncrementTime(status
); // Re-initializes fTickCounter
3646 // We do have the minimum number of matches.
3648 // If there is no upper bound on the loop iterations, check that the input index
3649 // is progressing, and stop the loop if it is not.
3650 if (maxCount
== -1) {
3651 int64_t *pLastInputIdx
= &fp
->fExtra
[URX_VAL(initOp
) + 1];
3652 if (fp
->fInputIdx
== *pLastInputIdx
) {
3655 *pLastInputIdx
= fp
->fInputIdx
;
3658 // Loop Continuation: we will fall into the pattern following the loop
3659 // (non-greedy, don't execute loop body first), but first do
3660 // a state save to the top of the loop, so that a match failure
3661 // in the following pattern will try another iteration of the loop.
3662 fp
= StateSave(fp
, opValue
+ 4, status
);
3668 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
3669 fData
[opValue
] = fStack
->size();
3674 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
3675 int32_t newStackSize
= (int32_t)fData
[opValue
];
3676 U_ASSERT(newStackSize
<= fStack
->size());
3677 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
3678 if (newFP
== (int64_t *)fp
) {
3682 for (j
=0; j
<fFrameSize
; j
++) {
3683 newFP
[j
] = ((int64_t *)fp
)[j
];
3685 fp
= (REStackFrame
*)newFP
;
3686 fStack
->setSize(newStackSize
);
3692 U_ASSERT(opValue
< fFrameSize
);
3693 int64_t groupStartIdx
= fp
->fExtra
[opValue
];
3694 int64_t groupEndIdx
= fp
->fExtra
[opValue
+1];
3695 U_ASSERT(groupStartIdx
<= groupEndIdx
);
3696 if (groupStartIdx
< 0) {
3697 // This capture group has not participated in the match thus far,
3698 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
3701 UTEXT_SETNATIVEINDEX(fAltInputText
, groupStartIdx
);
3702 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3704 // Note: if the capture group match was of an empty string the backref
3705 // match succeeds. Verified by testing: Perl matches succeed
3706 // in this case, so we do too.
3708 UBool success
= TRUE
;
3710 if (utext_getNativeIndex(fAltInputText
) >= groupEndIdx
) {
3714 if (utext_getNativeIndex(fInputText
) >= fActiveLimit
) {
3719 UChar32 captureGroupChar
= utext_next32(fAltInputText
);
3720 UChar32 inputChar
= utext_next32(fInputText
);
3721 if (inputChar
!= captureGroupChar
) {
3728 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3730 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3739 U_ASSERT(opValue
< fFrameSize
);
3740 int64_t groupStartIdx
= fp
->fExtra
[opValue
];
3741 int64_t groupEndIdx
= fp
->fExtra
[opValue
+1];
3742 U_ASSERT(groupStartIdx
<= groupEndIdx
);
3743 if (groupStartIdx
< 0) {
3744 // This capture group has not participated in the match thus far,
3745 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
3748 utext_setNativeIndex(fAltInputText
, groupStartIdx
);
3749 utext_setNativeIndex(fInputText
, fp
->fInputIdx
);
3750 CaseFoldingUTextIterator
captureGroupItr(*fAltInputText
);
3751 CaseFoldingUTextIterator
inputItr(*fInputText
);
3753 // Note: if the capture group match was of an empty string the backref
3754 // match succeeds. Verified by testing: Perl matches succeed
3755 // in this case, so we do too.
3757 UBool success
= TRUE
;
3759 if (!captureGroupItr
.inExpansion() && utext_getNativeIndex(fAltInputText
) >= groupEndIdx
) {
3763 if (!inputItr
.inExpansion() && utext_getNativeIndex(fInputText
) >= fActiveLimit
) {
3768 UChar32 captureGroupChar
= captureGroupItr
.next();
3769 UChar32 inputChar
= inputItr
.next();
3770 if (inputChar
!= captureGroupChar
) {
3776 if (success
&& inputItr
.inExpansion()) {
3777 // We obtained a match by consuming part of a string obtained from
3778 // case-folding a single code point of the input text.
3779 // This does not count as an overall match.
3784 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3786 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3792 case URX_STO_INP_LOC
:
3794 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
);
3795 fp
->fExtra
[opValue
] = fp
->fInputIdx
;
3801 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
3803 int32_t dataLoc
= URX_VAL(pat
[instrOperandLoc
]);
3804 U_ASSERT(dataLoc
>= 0 && dataLoc
< fFrameSize
);
3805 int64_t savedInputIdx
= fp
->fExtra
[dataLoc
];
3806 U_ASSERT(savedInputIdx
<= fp
->fInputIdx
);
3807 if (savedInputIdx
< fp
->fInputIdx
) {
3808 fp
->fPatIdx
= opValue
; // JMP
3810 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no progress in loop.
3817 // Entering a lookahead block.
3818 // Save Stack Ptr, Input Pos.
3819 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3820 fData
[opValue
] = fStack
->size();
3821 fData
[opValue
+1] = fp
->fInputIdx
;
3822 fActiveStart
= fLookStart
; // Set the match region change for
3823 fActiveLimit
= fLookLimit
; // transparent bounds.
3829 // Leaving a look-ahead block.
3830 // restore Stack Ptr, Input Pos to positions they had on entry to block.
3831 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3832 int32_t stackSize
= fStack
->size();
3833 int32_t newStackSize
=(int32_t)fData
[opValue
];
3834 U_ASSERT(stackSize
>= newStackSize
);
3835 if (stackSize
> newStackSize
) {
3836 // Copy the current top frame back to the new (cut back) top frame.
3837 // This makes the capture groups from within the look-ahead
3838 // expression available.
3839 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
3841 for (j
=0; j
<fFrameSize
; j
++) {
3842 newFP
[j
] = ((int64_t *)fp
)[j
];
3844 fp
= (REStackFrame
*)newFP
;
3845 fStack
->setSize(newStackSize
);
3847 fp
->fInputIdx
= fData
[opValue
+1];
3849 // Restore the active region bounds in the input string; they may have
3850 // been changed because of transparent bounds on a Region.
3851 fActiveStart
= fRegionStart
;
3852 fActiveLimit
= fRegionLimit
;
3857 // Case insensitive one char. The char from the pattern is already case folded.
3858 // Input text is not, but case folding the input can not reduce two or more code
3860 if (fp
->fInputIdx
< fActiveLimit
) {
3861 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3863 UChar32 c
= UTEXT_NEXT32(fInputText
);
3864 if (u_foldCase(c
, U_FOLD_CASE_DEFAULT
) == opValue
) {
3865 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3872 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3877 // Case-insensitive test input against a literal string.
3878 // Strings require two slots in the compiled pattern, one for the
3879 // offset to the string text, and one for the length.
3880 // The compiled string has already been case folded.
3882 const UChar
*patternString
= litText
+ opValue
;
3883 int32_t patternStringIdx
= 0;
3885 op
= (int32_t)pat
[fp
->fPatIdx
];
3887 opType
= URX_TYPE(op
);
3888 opValue
= URX_VAL(op
);
3889 U_ASSERT(opType
== URX_STRING_LEN
);
3890 int32_t patternStringLen
= opValue
; // Length of the string from the pattern.
3895 UBool success
= TRUE
;
3897 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3898 CaseFoldingUTextIterator
inputIterator(*fInputText
);
3899 while (patternStringIdx
< patternStringLen
) {
3900 if (!inputIterator
.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText
) >= fActiveLimit
) {
3905 U16_NEXT(patternString
, patternStringIdx
, patternStringLen
, cPattern
);
3906 cText
= inputIterator
.next();
3907 if (cText
!= cPattern
) {
3912 if (inputIterator
.inExpansion()) {
3917 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3919 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3927 // Entering a look-behind block.
3928 // Save Stack Ptr, Input Pos.
3929 // TODO: implement transparent bounds. Ticket #6067
3930 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3931 fData
[opValue
] = fStack
->size();
3932 fData
[opValue
+1] = fp
->fInputIdx
;
3933 // Init the variable containing the start index for attempted matches.
3934 fData
[opValue
+2] = -1;
3935 // Save input string length, then reset to pin any matches to end at
3936 // the current position.
3937 fData
[opValue
+3] = fActiveLimit
;
3938 fActiveLimit
= fp
->fInputIdx
;
3945 // Positive Look-Behind, at top of loop checking for matches of LB expression
3946 // at all possible input starting positions.
3948 // Fetch the min and max possible match lengths. They are the operands
3949 // of this op in the pattern.
3950 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
3951 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
3952 if (!UTEXT_USES_U16(fInputText
)) {
3953 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3954 // The max length need not be exact; it just needs to be >= actual maximum.
3957 U_ASSERT(minML
<= maxML
);
3958 U_ASSERT(minML
>= 0);
3960 // Fetch (from data) the last input index where a match was attempted.
3961 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3962 int64_t &lbStartIdx
= fData
[opValue
+2];
3963 if (lbStartIdx
< 0) {
3964 // First time through loop.
3965 lbStartIdx
= fp
->fInputIdx
- minML
;
3966 if (lbStartIdx
> 0) {
3967 // move index to a code point boundary, if it's not on one already.
3968 UTEXT_SETNATIVEINDEX(fInputText
, lbStartIdx
);
3969 lbStartIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3972 // 2nd through nth time through the loop.
3973 // Back up start position for match by one.
3974 if (lbStartIdx
== 0) {
3977 UTEXT_SETNATIVEINDEX(fInputText
, lbStartIdx
);
3978 (void)UTEXT_PREVIOUS32(fInputText
);
3979 lbStartIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3983 if (lbStartIdx
< 0 || lbStartIdx
< fp
->fInputIdx
- maxML
) {
3984 // We have tried all potential match starting points without
3985 // getting a match. Backtrack out, and out of the
3986 // Look Behind altogether.
3987 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3988 int64_t restoreInputLen
= fData
[opValue
+3];
3989 U_ASSERT(restoreInputLen
>= fActiveLimit
);
3990 U_ASSERT(restoreInputLen
<= fInputLength
);
3991 fActiveLimit
= restoreInputLen
;
3995 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3996 // (successful match will fall off the end of the loop.)
3997 fp
= StateSave(fp
, fp
->fPatIdx
-3, status
);
3998 fp
->fInputIdx
= lbStartIdx
;
4003 // End of a look-behind block, after a successful match.
4005 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
4006 if (fp
->fInputIdx
!= fActiveLimit
) {
4007 // The look-behind expression matched, but the match did not
4008 // extend all the way to the point that we are looking behind from.
4009 // FAIL out of here, which will take us back to the LB_CONT, which
4010 // will retry the match starting at another position or fail
4011 // the look-behind altogether, whichever is appropriate.
4012 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4016 // Look-behind match is good. Restore the original input string length,
4017 // which had been truncated to pin the end of the lookbehind match to the
4018 // position being looked-behind.
4019 int64_t originalInputLen
= fData
[opValue
+3];
4020 U_ASSERT(originalInputLen
>= fActiveLimit
);
4021 U_ASSERT(originalInputLen
<= fInputLength
);
4022 fActiveLimit
= originalInputLen
;
4029 // Negative Look-Behind, at top of loop checking for matches of LB expression
4030 // at all possible input starting positions.
4032 // Fetch the extra parameters of this op.
4033 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
4034 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
4035 if (!UTEXT_USES_U16(fInputText
)) {
4036 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
4037 // The max length need not be exact; it just needs to be >= actual maximum.
4040 int32_t continueLoc
= (int32_t)pat
[fp
->fPatIdx
++];
4041 continueLoc
= URX_VAL(continueLoc
);
4042 U_ASSERT(minML
<= maxML
);
4043 U_ASSERT(minML
>= 0);
4044 U_ASSERT(continueLoc
> fp
->fPatIdx
);
4046 // Fetch (from data) the last input index where a match was attempted.
4047 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
4048 int64_t &lbStartIdx
= fData
[opValue
+2];
4049 if (lbStartIdx
< 0) {
4050 // First time through loop.
4051 lbStartIdx
= fp
->fInputIdx
- minML
;
4052 if (lbStartIdx
> 0) {
4053 // move index to a code point boundary, if it's not on one already.
4054 UTEXT_SETNATIVEINDEX(fInputText
, lbStartIdx
);
4055 lbStartIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
4058 // 2nd through nth time through the loop.
4059 // Back up start position for match by one.
4060 if (lbStartIdx
== 0) {
4063 UTEXT_SETNATIVEINDEX(fInputText
, lbStartIdx
);
4064 (void)UTEXT_PREVIOUS32(fInputText
);
4065 lbStartIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
4069 if (lbStartIdx
< 0 || lbStartIdx
< fp
->fInputIdx
- maxML
) {
4070 // We have tried all potential match starting points without
4071 // getting a match, which means that the negative lookbehind as
4072 // a whole has succeeded. Jump forward to the continue location
4073 int64_t restoreInputLen
= fData
[opValue
+3];
4074 U_ASSERT(restoreInputLen
>= fActiveLimit
);
4075 U_ASSERT(restoreInputLen
<= fInputLength
);
4076 fActiveLimit
= restoreInputLen
;
4077 fp
->fPatIdx
= continueLoc
;
4081 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4082 // (successful match will cause a FAIL out of the loop altogether.)
4083 fp
= StateSave(fp
, fp
->fPatIdx
-4, status
);
4084 fp
->fInputIdx
= lbStartIdx
;
4089 // End of a negative look-behind block, after a successful match.
4091 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
4092 if (fp
->fInputIdx
!= fActiveLimit
) {
4093 // The look-behind expression matched, but the match did not
4094 // extend all the way to the point that we are looking behind from.
4095 // FAIL out of here, which will take us back to the LB_CONT, which
4096 // will retry the match starting at another position or succeed
4097 // the look-behind altogether, whichever is appropriate.
4098 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4102 // Look-behind expression matched, which means look-behind test as
4105 // Restore the original input string length, which had been truncated
4106 // in order to pin the end of the lookbehind match
4107 // to the position being looked-behind.
4108 int64_t originalInputLen
= fData
[opValue
+3];
4109 U_ASSERT(originalInputLen
>= fActiveLimit
);
4110 U_ASSERT(originalInputLen
<= fInputLength
);
4111 fActiveLimit
= originalInputLen
;
4113 // Restore original stack position, discarding any state saved
4114 // by the successful pattern match.
4115 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
4116 int32_t newStackSize
= (int32_t)fData
[opValue
];
4117 U_ASSERT(fStack
->size() > newStackSize
);
4118 fStack
->setSize(newStackSize
);
4120 // FAIL, which will take control back to someplace
4121 // prior to entering the look-behind test.
4122 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4128 // Loop Initialization for the optimized implementation of
4129 // [some character set]*
4130 // This op scans through all matching input.
4131 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4133 U_ASSERT(opValue
> 0 && opValue
< fSets
->size());
4134 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
4135 UnicodeSet
*s
= (UnicodeSet
*)fSets
->elementAt(opValue
);
4137 // Loop through input, until either the input is exhausted or
4138 // we reach a character that is not a member of the set.
4139 int64_t ix
= fp
->fInputIdx
;
4140 UTEXT_SETNATIVEINDEX(fInputText
, ix
);
4142 if (ix
>= fActiveLimit
) {
4146 UChar32 c
= UTEXT_NEXT32(fInputText
);
4148 if (s8
->contains(c
) == FALSE
) {
4152 if (s
->contains(c
) == FALSE
) {
4156 ix
= UTEXT_GETNATIVEINDEX(fInputText
);
4159 // If there were no matching characters, skip over the loop altogether.
4160 // The loop doesn't run at all, a * op always succeeds.
4161 if (ix
== fp
->fInputIdx
) {
4162 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
4166 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4167 // must follow. Its operand is the stack location
4168 // that holds the starting input index for the match of this [set]*
4169 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
4170 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
4171 int32_t stackLoc
= URX_VAL(loopcOp
);
4172 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
4173 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
4176 // Save State to the URX_LOOP_C op that follows this one,
4177 // so that match failures in the following code will return to there.
4178 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4179 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
4185 case URX_LOOP_DOT_I
:
4186 // Loop Initialization for the optimized implementation of .*
4187 // This op scans through all remaining input.
4188 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4190 // Loop through input until the input is exhausted (we reach an end-of-line)
4191 // In DOTALL mode, we can just go straight to the end of the input.
4193 if ((opValue
& 1) == 1) {
4194 // Dot-matches-All mode. Jump straight to the end of the string.
4198 // NOT DOT ALL mode. Line endings do not match '.'
4199 // Scan forward until a line ending or end of input.
4201 UTEXT_SETNATIVEINDEX(fInputText
, ix
);
4203 if (ix
>= fActiveLimit
) {
4207 UChar32 c
= UTEXT_NEXT32(fInputText
);
4208 if ((c
& 0x7f) <= 0x29) { // Fast filter of non-new-line-s
4209 if ((c
== 0x0a) || // 0x0a is newline in both modes.
4210 (((opValue
& 2) == 0) && // IF not UNIX_LINES mode
4211 isLineTerminator(c
))) {
4212 // char is a line ending. Exit the scanning loop.
4216 ix
= UTEXT_GETNATIVEINDEX(fInputText
);
4220 // If there were no matching characters, skip over the loop altogether.
4221 // The loop doesn't run at all, a * op always succeeds.
4222 if (ix
== fp
->fInputIdx
) {
4223 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
4227 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4228 // must follow. Its operand is the stack location
4229 // that holds the starting input index for the match of this .*
4230 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
4231 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
4232 int32_t stackLoc
= URX_VAL(loopcOp
);
4233 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
4234 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
4237 // Save State to the URX_LOOP_C op that follows this one,
4238 // so that match failures in the following code will return to there.
4239 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4240 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
4248 U_ASSERT(opValue
>=0 && opValue
<fFrameSize
);
4249 backSearchIndex
= fp
->fExtra
[opValue
];
4250 U_ASSERT(backSearchIndex
<= fp
->fInputIdx
);
4251 if (backSearchIndex
== fp
->fInputIdx
) {
4252 // We've backed up the input idx to the point that the loop started.
4253 // The loop is done. Leave here without saving state.
4254 // Subsequent failures won't come back here.
4257 // Set up for the next iteration of the loop, with input index
4258 // backed up by one from the last time through,
4259 // and a state save to this instruction in case the following code fails again.
4260 // (We're going backwards because this loop emulates stack unwinding, not
4261 // the initial scan forward.)
4262 U_ASSERT(fp
->fInputIdx
> 0);
4263 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
4264 UChar32 prevC
= UTEXT_PREVIOUS32(fInputText
);
4265 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
4267 UChar32 twoPrevC
= UTEXT_PREVIOUS32(fInputText
);
4268 if (prevC
== 0x0a &&
4269 fp
->fInputIdx
> backSearchIndex
&&
4271 int32_t prevOp
= (int32_t)pat
[fp
->fPatIdx
-2];
4272 if (URX_TYPE(prevOp
) == URX_LOOP_DOT_I
) {
4273 // .*, stepping back over CRLF pair.
4274 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
4279 fp
= StateSave(fp
, fp
->fPatIdx
-1, status
);
4286 // Trouble. The compiled pattern contains an entry with an
4287 // unrecognized type tag.
4291 if (U_FAILURE(status
)) {
4300 fLastMatchEnd
= fMatchEnd
;
4301 fMatchStart
= startIdx
;
4302 fMatchEnd
= fp
->fInputIdx
;
4305 #ifdef REGEX_RUN_DEBUG
4308 printf("Match. start=%ld end=%ld\n\n", fMatchStart
, fMatchEnd
);
4310 printf("No match\n\n");
4315 fFrame
= fp
; // The active stack frame when the engine stopped.
4316 // Contains the capture group results that we need to
4322 //--------------------------------------------------------------------------------
4324 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
4325 // assumption that the entire string is available in the UText's
4326 // chunk buffer. For now, that means we can use int32_t indexes,
4327 // except for anything that needs to be saved (like group starts
4330 // startIdx: begin matching at this index.
4331 // toEnd: if true, match must extend to end of the input region
4333 //--------------------------------------------------------------------------------
4334 void RegexMatcher::MatchChunkAt(int32_t startIdx
, UBool toEnd
, UErrorCode
&status
) {
4335 UBool isMatch
= FALSE
; // True if the we have a match.
4337 int32_t backSearchIndex
= INT32_MAX
; // used after greedy single-character matches for searching backwards
4339 int32_t op
; // Operation from the compiled pattern, split into
4340 int32_t opType
; // the opcode
4341 int32_t opValue
; // and the operand value.
4343 #ifdef REGEX_RUN_DEBUG
4345 printf("MatchAt(startIdx=%d)\n", startIdx
);
4346 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern
->fPattern
))());
4347 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText
))());
4351 if (U_FAILURE(status
)) {
4355 // Cache frequently referenced items from the compiled pattern
4357 int64_t *pat
= fPattern
->fCompiledPat
->getBuffer();
4359 const UChar
*litText
= fPattern
->fLiteralText
.getBuffer();
4360 UVector
*fSets
= fPattern
->fSets
;
4362 const UChar
*inputBuf
= fInputText
->chunkContents
;
4364 fFrameSize
= fPattern
->fFrameSize
;
4365 REStackFrame
*fp
= resetStack();
4366 if (U_FAILURE(fDeferredStatus
)) {
4367 status
= fDeferredStatus
;
4372 fp
->fInputIdx
= startIdx
;
4374 // Zero out the pattern's static data
4376 for (i
= 0; i
<fPattern
->fDataSize
; i
++) {
4381 // Main loop for interpreting the compiled pattern.
4382 // One iteration of the loop per pattern operation performed.
4385 op
= (int32_t)pat
[fp
->fPatIdx
];
4386 opType
= URX_TYPE(op
);
4387 opValue
= URX_VAL(op
);
4388 #ifdef REGEX_RUN_DEBUG
4390 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
4391 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp
->fInputIdx
,
4392 UTEXT_CURRENT32(fInputText
), (int64_t *)fp
-fStack
->getBuffer(), fActiveLimit
);
4393 fPattern
->dumpOp(fp
->fPatIdx
);
4406 // Force a backtrack. In some circumstances, the pattern compiler
4407 // will notice that the pattern can't possibly match anything, and will
4408 // emit one of these at that point.
4409 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4414 if (fp
->fInputIdx
< fActiveLimit
) {
4416 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4423 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4429 // Test input against a literal string.
4430 // Strings require two slots in the compiled pattern, one for the
4431 // offset to the string text, and one for the length.
4432 int32_t stringStartIdx
= opValue
;
4435 op
= (int32_t)pat
[fp
->fPatIdx
]; // Fetch the second operand
4437 opType
= URX_TYPE(op
);
4438 stringLen
= URX_VAL(op
);
4439 U_ASSERT(opType
== URX_STRING_LEN
);
4440 U_ASSERT(stringLen
>= 2);
4442 const UChar
* pInp
= inputBuf
+ fp
->fInputIdx
;
4443 const UChar
* pInpLimit
= inputBuf
+ fActiveLimit
;
4444 const UChar
* pPat
= litText
+stringStartIdx
;
4445 const UChar
* pEnd
= pInp
+ stringLen
;
4446 UBool success
= TRUE
;
4447 while (pInp
< pEnd
) {
4448 if (pInp
>= pInpLimit
) {
4453 if (*pInp
++ != *pPat
++) {
4460 fp
->fInputIdx
+= stringLen
;
4462 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4468 case URX_STATE_SAVE
:
4469 fp
= StateSave(fp
, opValue
, status
);
4474 // The match loop will exit via this path on a successful match,
4475 // when we reach the end of the pattern.
4476 if (toEnd
&& fp
->fInputIdx
!= fActiveLimit
) {
4477 // The pattern matched, but not to the end of input. Try some more.
4478 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4484 // Start and End Capture stack frame variables are laid out like this:
4485 // fp->fExtra[opValue] - The start of a completed capture group
4486 // opValue+1 - The end of a completed capture group
4487 // opValue+2 - the start of a capture group whose end
4488 // has not yet been reached (and might not ever be).
4489 case URX_START_CAPTURE
:
4490 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
4491 fp
->fExtra
[opValue
+2] = fp
->fInputIdx
;
4495 case URX_END_CAPTURE
:
4496 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
4497 U_ASSERT(fp
->fExtra
[opValue
+2] >= 0); // Start pos for this group must be set.
4498 fp
->fExtra
[opValue
] = fp
->fExtra
[opValue
+2]; // Tentative start becomes real.
4499 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // End position
4500 U_ASSERT(fp
->fExtra
[opValue
] <= fp
->fExtra
[opValue
+1]);
4504 case URX_DOLLAR
: // $, test for End of line
4505 // or for position before new line at end of input
4506 if (fp
->fInputIdx
< fAnchorLimit
-2) {
4507 // We are nowhere near the end of input. Fail.
4508 // This is the common case. Keep it first.
4509 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4512 if (fp
->fInputIdx
>= fAnchorLimit
) {
4513 // We really are at the end of input. Success.
4519 // If we are positioned just before a new-line that is located at the
4520 // end of input, succeed.
4521 if (fp
->fInputIdx
== fAnchorLimit
-1) {
4523 U16_GET(inputBuf
, fAnchorStart
, fp
->fInputIdx
, fAnchorLimit
, c
);
4525 if (isLineTerminator(c
)) {
4526 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& inputBuf
[fp
->fInputIdx
-1]==0x0d)) {
4527 // At new-line at end of input. Success
4533 } else if (fp
->fInputIdx
== fAnchorLimit
-2 &&
4534 inputBuf
[fp
->fInputIdx
]==0x0d && inputBuf
[fp
->fInputIdx
+1]==0x0a) {
4537 break; // At CR/LF at end of input. Success
4540 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4545 case URX_DOLLAR_D
: // $, test for End of Line, in UNIX_LINES mode.
4546 if (fp
->fInputIdx
>= fAnchorLimit
-1) {
4547 // Either at the last character of input, or off the end.
4548 if (fp
->fInputIdx
== fAnchorLimit
-1) {
4549 // At last char of input. Success if it's a new line.
4550 if (inputBuf
[fp
->fInputIdx
] == 0x0a) {
4556 // Off the end of input. Success.
4563 // Not at end of input. Back-track out.
4564 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4568 case URX_DOLLAR_M
: // $, test for End of line in multi-line mode
4570 if (fp
->fInputIdx
>= fAnchorLimit
) {
4571 // We really are at the end of input. Success.
4576 // If we are positioned just before a new-line, succeed.
4577 // It makes no difference where the new-line is within the input.
4578 UChar32 c
= inputBuf
[fp
->fInputIdx
];
4579 if (isLineTerminator(c
)) {
4580 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4581 // In multi-line mode, hitting a new-line just before the end of input does not
4582 // set the hitEnd or requireEnd flags
4583 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& inputBuf
[fp
->fInputIdx
-1]==0x0d)) {
4587 // not at a new line. Fail.
4588 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4593 case URX_DOLLAR_MD
: // $, test for End of line in multi-line and UNIX_LINES mode
4595 if (fp
->fInputIdx
>= fAnchorLimit
) {
4596 // We really are at the end of input. Success.
4598 fRequireEnd
= TRUE
; // Java set requireEnd in this case, even though
4599 break; // adding a new-line would not lose the match.
4601 // If we are not positioned just before a new-line, the test fails; backtrack out.
4602 // It makes no difference where the new-line is within the input.
4603 if (inputBuf
[fp
->fInputIdx
] != 0x0a) {
4604 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4610 case URX_CARET
: // ^, test for start of line
4611 if (fp
->fInputIdx
!= fAnchorStart
) {
4612 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4617 case URX_CARET_M
: // ^, test for start of line in mulit-line mode
4619 if (fp
->fInputIdx
== fAnchorStart
) {
4620 // We are at the start of input. Success.
4623 // Check whether character just before the current pos is a new-line
4624 // unless we are at the end of input
4625 UChar c
= inputBuf
[fp
->fInputIdx
- 1];
4626 if ((fp
->fInputIdx
< fAnchorLimit
) &&
4627 isLineTerminator(c
)) {
4628 // It's a new-line. ^ is true. Success.
4629 // TODO: what should be done with positions between a CR and LF?
4632 // Not at the start of a line. Fail.
4633 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4638 case URX_CARET_M_UNIX
: // ^, test for start of line in mulit-line + Unix-line mode
4640 U_ASSERT(fp
->fInputIdx
>= fAnchorStart
);
4641 if (fp
->fInputIdx
<= fAnchorStart
) {
4642 // We are at the start of input. Success.
4645 // Check whether character just before the current pos is a new-line
4646 U_ASSERT(fp
->fInputIdx
<= fAnchorLimit
);
4647 UChar c
= inputBuf
[fp
->fInputIdx
- 1];
4649 // Not at the start of a line. Back-track out.
4650 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4655 case URX_BACKSLASH_B
: // Test for word boundaries
4657 UBool success
= isChunkWordBoundary((int32_t)fp
->fInputIdx
);
4658 success
^= (UBool
)(opValue
!= 0); // flip sense for \B
4660 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4666 case URX_BACKSLASH_BU
: // Test for word boundaries, Unicode-style
4668 UBool success
= isUWordBoundary(fp
->fInputIdx
);
4669 success
^= (UBool
)(opValue
!= 0); // flip sense for \B
4671 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4677 case URX_BACKSLASH_D
: // Test for decimal digit
4679 if (fp
->fInputIdx
>= fActiveLimit
) {
4681 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4686 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4687 int8_t ctype
= u_charType(c
); // TODO: make a unicode set for this. Will be faster.
4688 UBool success
= (ctype
== U_DECIMAL_DIGIT_NUMBER
);
4689 success
^= (UBool
)(opValue
!= 0); // flip sense for \D
4691 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4697 case URX_BACKSLASH_G
: // Test for position at end of previous match
4698 if (!((fMatch
&& fp
->fInputIdx
==fMatchEnd
) || (fMatch
==FALSE
&& fp
->fInputIdx
==fActiveStart
))) {
4699 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4704 case URX_BACKSLASH_H
: // Test for \h, horizontal white space.
4706 if (fp
->fInputIdx
>= fActiveLimit
) {
4708 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4712 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4713 int8_t ctype
= u_charType(c
);
4714 UBool success
= (ctype
== U_SPACE_SEPARATOR
|| c
== 9); // SPACE_SEPARATOR || TAB
4715 success
^= (UBool
)(opValue
!= 0); // flip sense for \H
4717 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4723 case URX_BACKSLASH_R
: // Test for \R, any line break sequence.
4725 if (fp
->fInputIdx
>= fActiveLimit
) {
4727 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4731 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4732 if (isLineTerminator(c
)) {
4733 if (c
== 0x0d && fp
->fInputIdx
< fActiveLimit
) {
4734 // Check for CR/LF sequence. Consume both together when found.
4736 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c2
);
4738 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c2
);
4742 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4748 case URX_BACKSLASH_V
: // Any single code point line ending.
4750 if (fp
->fInputIdx
>= fActiveLimit
) {
4752 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4756 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4757 UBool success
= isLineTerminator(c
);
4758 success
^= (UBool
)(opValue
!= 0); // flip sense for \V
4760 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4767 case URX_BACKSLASH_X
:
4768 // Match a Grapheme, as defined by Unicode TR 29.
4769 // Differs slightly from Perl, which consumes combining marks independently
4773 // Fail if at end of input
4774 if (fp
->fInputIdx
>= fActiveLimit
) {
4776 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4780 // Examine (and consume) the current char.
4781 // Dispatch into a little state machine, based on the char.
4783 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4784 UnicodeSet
**sets
= fPattern
->fStaticSets
;
4785 if (sets
[URX_GC_NORMAL
]->contains(c
)) goto GC_Extend
;
4786 if (sets
[URX_GC_CONTROL
]->contains(c
)) goto GC_Control
;
4787 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
4788 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
4789 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
4790 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
4791 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
4797 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
4798 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4799 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
4800 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
4801 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
4802 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
4803 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
4807 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
4808 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4809 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
4810 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
4811 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
4815 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
4816 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4817 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
4818 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
4822 // Combining characters are consumed here
4824 if (fp
->fInputIdx
>= fActiveLimit
) {
4827 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4828 if (sets
[URX_GC_EXTEND
]->contains(c
) == FALSE
) {
4829 U16_BACK_1(inputBuf
, 0, fp
->fInputIdx
);
4836 // Most control chars stand alone (don't combine with combining chars),
4837 // except that a CR/LF sequence is a single grapheme cluster.
4838 if (c
== 0x0d && fp
->fInputIdx
< fActiveLimit
&& inputBuf
[fp
->fInputIdx
] == 0x0a) {
4843 if (fp
->fInputIdx
>= fActiveLimit
) {
4852 case URX_BACKSLASH_Z
: // Test for end of Input
4853 if (fp
->fInputIdx
< fAnchorLimit
) {
4854 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4863 case URX_STATIC_SETREF
:
4865 // Test input character against one of the predefined sets
4866 // (Word Characters, for example)
4867 // The high bit of the op value is a flag for the match polarity.
4868 // 0: success if input char is in set.
4869 // 1: success if input char is not in set.
4870 if (fp
->fInputIdx
>= fActiveLimit
) {
4872 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4876 UBool success
= ((opValue
& URX_NEG_SET
) == URX_NEG_SET
);
4877 opValue
&= ~URX_NEG_SET
;
4878 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
4881 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4883 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
4884 if (s8
->contains(c
)) {
4888 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
4889 if (s
->contains(c
)) {
4894 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4900 case URX_STAT_SETREF_N
:
4902 // Test input character for NOT being a member of one of
4903 // the predefined sets (Word Characters, for example)
4904 if (fp
->fInputIdx
>= fActiveLimit
) {
4906 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4910 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
4913 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4915 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
4916 if (s8
->contains(c
) == FALSE
) {
4920 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
4921 if (s
->contains(c
) == FALSE
) {
4925 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4932 if (fp
->fInputIdx
>= fActiveLimit
) {
4934 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4938 U_ASSERT(opValue
> 0 && opValue
< fSets
->size());
4940 // There is input left. Pick up one char and test it for set membership.
4942 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4944 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
4945 if (s8
->contains(c
)) {
4946 // The character is in the set. A Match.
4950 UnicodeSet
*s
= (UnicodeSet
*)fSets
->elementAt(opValue
);
4951 if (s
->contains(c
)) {
4952 // The character is in the set. A Match.
4957 // the character wasn't in the set.
4958 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4965 // . matches anything, but stops at end-of-line.
4966 if (fp
->fInputIdx
>= fActiveLimit
) {
4967 // At end of input. Match failed. Backtrack out.
4969 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4973 // There is input left. Advance over one char, unless we've hit end-of-line
4975 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4976 if (isLineTerminator(c
)) {
4977 // End of line in normal mode. . does not match.
4978 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4985 case URX_DOTANY_ALL
:
4987 // . in dot-matches-all (including new lines) mode
4988 if (fp
->fInputIdx
>= fActiveLimit
) {
4989 // At end of input. Match failed. Backtrack out.
4991 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4995 // There is input left. Advance over one char, except if we are
4996 // at a cr/lf, advance over both of them.
4998 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4999 if (c
==0x0d && fp
->fInputIdx
< fActiveLimit
) {
5000 // In the case of a CR/LF, we need to advance over both.
5001 if (inputBuf
[fp
->fInputIdx
] == 0x0a) {
5002 U16_FWD_1(inputBuf
, fp
->fInputIdx
, fActiveLimit
);
5009 case URX_DOTANY_UNIX
:
5011 // '.' operator, matches all, but stops at end-of-line.
5012 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
5013 if (fp
->fInputIdx
>= fActiveLimit
) {
5014 // At end of input. Match failed. Backtrack out.
5016 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5020 // There is input left. Advance over one char, unless we've hit end-of-line
5022 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
5024 // End of line in normal mode. '.' does not match the \n
5025 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5032 fp
->fPatIdx
= opValue
;
5040 U_ASSERT(opValue
< fPattern
->fCompiledPat
->size());
5041 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
5042 fp
->fPatIdx
= opValue
; // Then JMP.
5046 // This opcode is used with (x)+, when x can match a zero length string.
5047 // Same as JMP_SAV, except conditional on the match having made forward progress.
5048 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
5049 // data address of the input position at the start of the loop.
5051 U_ASSERT(opValue
> 0 && opValue
< fPattern
->fCompiledPat
->size());
5052 int32_t stoOp
= (int32_t)pat
[opValue
-1];
5053 U_ASSERT(URX_TYPE(stoOp
) == URX_STO_INP_LOC
);
5054 int32_t frameLoc
= URX_VAL(stoOp
);
5055 U_ASSERT(frameLoc
>= 0 && frameLoc
< fFrameSize
);
5056 int32_t prevInputIdx
= (int32_t)fp
->fExtra
[frameLoc
];
5057 U_ASSERT(prevInputIdx
<= fp
->fInputIdx
);
5058 if (prevInputIdx
< fp
->fInputIdx
) {
5059 // The match did make progress. Repeat the loop.
5060 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
5061 fp
->fPatIdx
= opValue
;
5062 fp
->fExtra
[frameLoc
] = fp
->fInputIdx
;
5064 // If the input position did not advance, we do nothing here,
5065 // execution will fall out of the loop.
5071 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
5072 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
5074 // Pick up the three extra operands that CTR_INIT has, and
5075 // skip the pattern location counter past them.
5076 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
5078 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
5079 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
5080 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
5081 U_ASSERT(minCount
>=0);
5082 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
5083 U_ASSERT(loopLoc
>=fp
->fPatIdx
);
5085 if (minCount
== 0) {
5086 fp
= StateSave(fp
, loopLoc
+1, status
);
5088 if (maxCount
== -1) {
5089 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // For loop breaking.
5090 } else if (maxCount
== 0) {
5091 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5098 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
5099 int32_t initOp
= (int32_t)pat
[opValue
];
5100 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT
);
5101 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
5102 int32_t minCount
= (int32_t)pat
[opValue
+2];
5103 int32_t maxCount
= (int32_t)pat
[opValue
+3];
5105 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
&& maxCount
!= -1) {
5106 U_ASSERT(*pCounter
== maxCount
);
5109 if (*pCounter
>= minCount
) {
5110 if (maxCount
== -1) {
5111 // Loop has no hard upper bound.
5112 // Check that it is progressing through the input, break if it is not.
5113 int64_t *pLastInputIdx
= &fp
->fExtra
[URX_VAL(initOp
) + 1];
5114 if (fp
->fInputIdx
== *pLastInputIdx
) {
5117 *pLastInputIdx
= fp
->fInputIdx
;
5120 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5122 // Increment time-out counter. (StateSave() does it if count >= minCount)
5124 if (fTickCounter
<= 0) {
5125 IncrementTime(status
); // Re-initializes fTickCounter
5128 fp
->fPatIdx
= opValue
+ 4; // Loop back.
5132 case URX_CTR_INIT_NG
:
5134 // Initialize a non-greedy loop
5135 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
5136 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
5138 // Pick up the three extra operands that CTR_INIT_NG has, and
5139 // skip the pattern location counter past them.
5140 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
5142 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
5143 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
5144 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
5145 U_ASSERT(minCount
>=0);
5146 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
5147 U_ASSERT(loopLoc
>fp
->fPatIdx
);
5148 if (maxCount
== -1) {
5149 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // Save initial input index for loop breaking.
5152 if (minCount
== 0) {
5153 if (maxCount
!= 0) {
5154 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5156 fp
->fPatIdx
= loopLoc
+1; // Continue with stuff after repeated block
5161 case URX_CTR_LOOP_NG
:
5163 // Non-greedy {min, max} loops
5164 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
5165 int32_t initOp
= (int32_t)pat
[opValue
];
5166 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT_NG
);
5167 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
5168 int32_t minCount
= (int32_t)pat
[opValue
+2];
5169 int32_t maxCount
= (int32_t)pat
[opValue
+3];
5172 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
&& maxCount
!= -1) {
5173 // The loop has matched the maximum permitted number of times.
5174 // Break out of here with no action. Matching will
5175 // continue with the following pattern.
5176 U_ASSERT(*pCounter
== maxCount
);
5180 if (*pCounter
< minCount
) {
5181 // We haven't met the minimum number of matches yet.
5182 // Loop back for another one.
5183 fp
->fPatIdx
= opValue
+ 4; // Loop back.
5185 if (fTickCounter
<= 0) {
5186 IncrementTime(status
); // Re-initializes fTickCounter
5189 // We do have the minimum number of matches.
5191 // If there is no upper bound on the loop iterations, check that the input index
5192 // is progressing, and stop the loop if it is not.
5193 if (maxCount
== -1) {
5194 int64_t *pLastInputIdx
= &fp
->fExtra
[URX_VAL(initOp
) + 1];
5195 if (fp
->fInputIdx
== *pLastInputIdx
) {
5198 *pLastInputIdx
= fp
->fInputIdx
;
5201 // Loop Continuation: we will fall into the pattern following the loop
5202 // (non-greedy, don't execute loop body first), but first do
5203 // a state save to the top of the loop, so that a match failure
5204 // in the following pattern will try another iteration of the loop.
5205 fp
= StateSave(fp
, opValue
+ 4, status
);
5211 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
5212 fData
[opValue
] = fStack
->size();
5217 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
5218 int32_t newStackSize
= (int32_t)fData
[opValue
];
5219 U_ASSERT(newStackSize
<= fStack
->size());
5220 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
5221 if (newFP
== (int64_t *)fp
) {
5225 for (j
=0; j
<fFrameSize
; j
++) {
5226 newFP
[j
] = ((int64_t *)fp
)[j
];
5228 fp
= (REStackFrame
*)newFP
;
5229 fStack
->setSize(newStackSize
);
5235 U_ASSERT(opValue
< fFrameSize
);
5236 int64_t groupStartIdx
= fp
->fExtra
[opValue
];
5237 int64_t groupEndIdx
= fp
->fExtra
[opValue
+1];
5238 U_ASSERT(groupStartIdx
<= groupEndIdx
);
5239 int64_t inputIndex
= fp
->fInputIdx
;
5240 if (groupStartIdx
< 0) {
5241 // This capture group has not participated in the match thus far,
5242 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
5245 UBool success
= TRUE
;
5246 for (int64_t groupIndex
= groupStartIdx
; groupIndex
< groupEndIdx
; ++groupIndex
,++inputIndex
) {
5247 if (inputIndex
>= fActiveLimit
) {
5252 if (inputBuf
[groupIndex
] != inputBuf
[inputIndex
]) {
5257 if (success
&& groupStartIdx
< groupEndIdx
&& U16_IS_LEAD(inputBuf
[groupEndIdx
-1]) &&
5258 inputIndex
< fActiveLimit
&& U16_IS_TRAIL(inputBuf
[inputIndex
])) {
5259 // Capture group ended with an unpaired lead surrogate.
5260 // Back reference is not permitted to match lead only of a surrogate pair.
5264 fp
->fInputIdx
= inputIndex
;
5266 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5273 U_ASSERT(opValue
< fFrameSize
);
5274 int64_t groupStartIdx
= fp
->fExtra
[opValue
];
5275 int64_t groupEndIdx
= fp
->fExtra
[opValue
+1];
5276 U_ASSERT(groupStartIdx
<= groupEndIdx
);
5277 if (groupStartIdx
< 0) {
5278 // This capture group has not participated in the match thus far,
5279 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
5282 CaseFoldingUCharIterator
captureGroupItr(inputBuf
, groupStartIdx
, groupEndIdx
);
5283 CaseFoldingUCharIterator
inputItr(inputBuf
, fp
->fInputIdx
, fActiveLimit
);
5285 // Note: if the capture group match was of an empty string the backref
5286 // match succeeds. Verified by testing: Perl matches succeed
5287 // in this case, so we do too.
5289 UBool success
= TRUE
;
5291 UChar32 captureGroupChar
= captureGroupItr
.next();
5292 if (captureGroupChar
== U_SENTINEL
) {
5296 UChar32 inputChar
= inputItr
.next();
5297 if (inputChar
== U_SENTINEL
) {
5302 if (inputChar
!= captureGroupChar
) {
5308 if (success
&& inputItr
.inExpansion()) {
5309 // We obtained a match by consuming part of a string obtained from
5310 // case-folding a single code point of the input text.
5311 // This does not count as an overall match.
5316 fp
->fInputIdx
= inputItr
.getIndex();
5318 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5323 case URX_STO_INP_LOC
:
5325 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
);
5326 fp
->fExtra
[opValue
] = fp
->fInputIdx
;
5332 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
5334 int32_t dataLoc
= URX_VAL(pat
[instrOperandLoc
]);
5335 U_ASSERT(dataLoc
>= 0 && dataLoc
< fFrameSize
);
5336 int32_t savedInputIdx
= (int32_t)fp
->fExtra
[dataLoc
];
5337 U_ASSERT(savedInputIdx
<= fp
->fInputIdx
);
5338 if (savedInputIdx
< fp
->fInputIdx
) {
5339 fp
->fPatIdx
= opValue
; // JMP
5341 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no progress in loop.
5348 // Entering a lookahead block.
5349 // Save Stack Ptr, Input Pos.
5350 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5351 fData
[opValue
] = fStack
->size();
5352 fData
[opValue
+1] = fp
->fInputIdx
;
5353 fActiveStart
= fLookStart
; // Set the match region change for
5354 fActiveLimit
= fLookLimit
; // transparent bounds.
5360 // Leaving a look-ahead block.
5361 // restore Stack Ptr, Input Pos to positions they had on entry to block.
5362 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5363 int32_t stackSize
= fStack
->size();
5364 int32_t newStackSize
= (int32_t)fData
[opValue
];
5365 U_ASSERT(stackSize
>= newStackSize
);
5366 if (stackSize
> newStackSize
) {
5367 // Copy the current top frame back to the new (cut back) top frame.
5368 // This makes the capture groups from within the look-ahead
5369 // expression available.
5370 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
5372 for (j
=0; j
<fFrameSize
; j
++) {
5373 newFP
[j
] = ((int64_t *)fp
)[j
];
5375 fp
= (REStackFrame
*)newFP
;
5376 fStack
->setSize(newStackSize
);
5378 fp
->fInputIdx
= fData
[opValue
+1];
5380 // Restore the active region bounds in the input string; they may have
5381 // been changed because of transparent bounds on a Region.
5382 fActiveStart
= fRegionStart
;
5383 fActiveLimit
= fRegionLimit
;
5388 if (fp
->fInputIdx
< fActiveLimit
) {
5390 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
5391 if (u_foldCase(c
, U_FOLD_CASE_DEFAULT
) == opValue
) {
5397 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5401 // Case-insensitive test input against a literal string.
5402 // Strings require two slots in the compiled pattern, one for the
5403 // offset to the string text, and one for the length.
5404 // The compiled string has already been case folded.
5406 const UChar
*patternString
= litText
+ opValue
;
5408 op
= (int32_t)pat
[fp
->fPatIdx
];
5410 opType
= URX_TYPE(op
);
5411 opValue
= URX_VAL(op
);
5412 U_ASSERT(opType
== URX_STRING_LEN
);
5413 int32_t patternStringLen
= opValue
; // Length of the string from the pattern.
5417 UBool success
= TRUE
;
5418 int32_t patternStringIdx
= 0;
5419 CaseFoldingUCharIterator
inputIterator(inputBuf
, fp
->fInputIdx
, fActiveLimit
);
5420 while (patternStringIdx
< patternStringLen
) {
5421 U16_NEXT(patternString
, patternStringIdx
, patternStringLen
, cPattern
);
5422 cText
= inputIterator
.next();
5423 if (cText
!= cPattern
) {
5425 if (cText
== U_SENTINEL
) {
5431 if (inputIterator
.inExpansion()) {
5436 fp
->fInputIdx
= inputIterator
.getIndex();
5438 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5445 // Entering a look-behind block.
5446 // Save Stack Ptr, Input Pos.
5447 // TODO: implement transparent bounds. Ticket #6067
5448 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5449 fData
[opValue
] = fStack
->size();
5450 fData
[opValue
+1] = fp
->fInputIdx
;
5451 // Init the variable containing the start index for attempted matches.
5452 fData
[opValue
+2] = -1;
5453 // Save input string length, then reset to pin any matches to end at
5454 // the current position.
5455 fData
[opValue
+3] = fActiveLimit
;
5456 fActiveLimit
= fp
->fInputIdx
;
5463 // Positive Look-Behind, at top of loop checking for matches of LB expression
5464 // at all possible input starting positions.
5466 // Fetch the min and max possible match lengths. They are the operands
5467 // of this op in the pattern.
5468 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
5469 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
5470 U_ASSERT(minML
<= maxML
);
5471 U_ASSERT(minML
>= 0);
5473 // Fetch (from data) the last input index where a match was attempted.
5474 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5475 int64_t &lbStartIdx
= fData
[opValue
+2];
5476 if (lbStartIdx
< 0) {
5477 // First time through loop.
5478 lbStartIdx
= fp
->fInputIdx
- minML
;
5479 if (lbStartIdx
> 0 && lbStartIdx
< fInputLength
) {
5480 U16_SET_CP_START(inputBuf
, 0, lbStartIdx
);
5483 // 2nd through nth time through the loop.
5484 // Back up start position for match by one.
5485 if (lbStartIdx
== 0) {
5488 U16_BACK_1(inputBuf
, 0, lbStartIdx
);
5492 if (lbStartIdx
< 0 || lbStartIdx
< fp
->fInputIdx
- maxML
) {
5493 // We have tried all potential match starting points without
5494 // getting a match. Backtrack out, and out of the
5495 // Look Behind altogether.
5496 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5497 int64_t restoreInputLen
= fData
[opValue
+3];
5498 U_ASSERT(restoreInputLen
>= fActiveLimit
);
5499 U_ASSERT(restoreInputLen
<= fInputLength
);
5500 fActiveLimit
= restoreInputLen
;
5504 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5505 // (successful match will fall off the end of the loop.)
5506 fp
= StateSave(fp
, fp
->fPatIdx
-3, status
);
5507 fp
->fInputIdx
= lbStartIdx
;
5512 // End of a look-behind block, after a successful match.
5514 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5515 if (fp
->fInputIdx
!= fActiveLimit
) {
5516 // The look-behind expression matched, but the match did not
5517 // extend all the way to the point that we are looking behind from.
5518 // FAIL out of here, which will take us back to the LB_CONT, which
5519 // will retry the match starting at another position or fail
5520 // the look-behind altogether, whichever is appropriate.
5521 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5525 // Look-behind match is good. Restore the original input string length,
5526 // which had been truncated to pin the end of the lookbehind match to the
5527 // position being looked-behind.
5528 int64_t originalInputLen
= fData
[opValue
+3];
5529 U_ASSERT(originalInputLen
>= fActiveLimit
);
5530 U_ASSERT(originalInputLen
<= fInputLength
);
5531 fActiveLimit
= originalInputLen
;
5538 // Negative Look-Behind, at top of loop checking for matches of LB expression
5539 // at all possible input starting positions.
5541 // Fetch the extra parameters of this op.
5542 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
5543 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
5544 int32_t continueLoc
= (int32_t)pat
[fp
->fPatIdx
++];
5545 continueLoc
= URX_VAL(continueLoc
);
5546 U_ASSERT(minML
<= maxML
);
5547 U_ASSERT(minML
>= 0);
5548 U_ASSERT(continueLoc
> fp
->fPatIdx
);
5550 // Fetch (from data) the last input index where a match was attempted.
5551 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5552 int64_t &lbStartIdx
= fData
[opValue
+2];
5553 if (lbStartIdx
< 0) {
5554 // First time through loop.
5555 lbStartIdx
= fp
->fInputIdx
- minML
;
5556 if (lbStartIdx
> 0 && lbStartIdx
< fInputLength
) {
5557 U16_SET_CP_START(inputBuf
, 0, lbStartIdx
);
5560 // 2nd through nth time through the loop.
5561 // Back up start position for match by one.
5562 if (lbStartIdx
== 0) {
5563 lbStartIdx
--; // Because U16_BACK is unsafe starting at 0.
5565 U16_BACK_1(inputBuf
, 0, lbStartIdx
);
5569 if (lbStartIdx
< 0 || lbStartIdx
< fp
->fInputIdx
- maxML
) {
5570 // We have tried all potential match starting points without
5571 // getting a match, which means that the negative lookbehind as
5572 // a whole has succeeded. Jump forward to the continue location
5573 int64_t restoreInputLen
= fData
[opValue
+3];
5574 U_ASSERT(restoreInputLen
>= fActiveLimit
);
5575 U_ASSERT(restoreInputLen
<= fInputLength
);
5576 fActiveLimit
= restoreInputLen
;
5577 fp
->fPatIdx
= continueLoc
;
5581 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5582 // (successful match will cause a FAIL out of the loop altogether.)
5583 fp
= StateSave(fp
, fp
->fPatIdx
-4, status
);
5584 fp
->fInputIdx
= lbStartIdx
;
5589 // End of a negative look-behind block, after a successful match.
5591 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5592 if (fp
->fInputIdx
!= fActiveLimit
) {
5593 // The look-behind expression matched, but the match did not
5594 // extend all the way to the point that we are looking behind from.
5595 // FAIL out of here, which will take us back to the LB_CONT, which
5596 // will retry the match starting at another position or succeed
5597 // the look-behind altogether, whichever is appropriate.
5598 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5602 // Look-behind expression matched, which means look-behind test as
5605 // Restore the original input string length, which had been truncated
5606 // in order to pin the end of the lookbehind match
5607 // to the position being looked-behind.
5608 int64_t originalInputLen
= fData
[opValue
+3];
5609 U_ASSERT(originalInputLen
>= fActiveLimit
);
5610 U_ASSERT(originalInputLen
<= fInputLength
);
5611 fActiveLimit
= originalInputLen
;
5613 // Restore original stack position, discarding any state saved
5614 // by the successful pattern match.
5615 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5616 int32_t newStackSize
= (int32_t)fData
[opValue
];
5617 U_ASSERT(fStack
->size() > newStackSize
);
5618 fStack
->setSize(newStackSize
);
5620 // FAIL, which will take control back to someplace
5621 // prior to entering the look-behind test.
5622 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5628 // Loop Initialization for the optimized implementation of
5629 // [some character set]*
5630 // This op scans through all matching input.
5631 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5633 U_ASSERT(opValue
> 0 && opValue
< fSets
->size());
5634 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
5635 UnicodeSet
*s
= (UnicodeSet
*)fSets
->elementAt(opValue
);
5637 // Loop through input, until either the input is exhausted or
5638 // we reach a character that is not a member of the set.
5639 int32_t ix
= (int32_t)fp
->fInputIdx
;
5641 if (ix
>= fActiveLimit
) {
5646 U16_NEXT(inputBuf
, ix
, fActiveLimit
, c
);
5648 if (s8
->contains(c
) == FALSE
) {
5649 U16_BACK_1(inputBuf
, 0, ix
);
5653 if (s
->contains(c
) == FALSE
) {
5654 U16_BACK_1(inputBuf
, 0, ix
);
5660 // If there were no matching characters, skip over the loop altogether.
5661 // The loop doesn't run at all, a * op always succeeds.
5662 if (ix
== fp
->fInputIdx
) {
5663 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
5667 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5668 // must follow. Its operand is the stack location
5669 // that holds the starting input index for the match of this [set]*
5670 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
5671 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
5672 int32_t stackLoc
= URX_VAL(loopcOp
);
5673 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
5674 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
5677 // Save State to the URX_LOOP_C op that follows this one,
5678 // so that match failures in the following code will return to there.
5679 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5680 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5686 case URX_LOOP_DOT_I
:
5687 // Loop Initialization for the optimized implementation of .*
5688 // This op scans through all remaining input.
5689 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5691 // Loop through input until the input is exhausted (we reach an end-of-line)
5692 // In DOTALL mode, we can just go straight to the end of the input.
5694 if ((opValue
& 1) == 1) {
5695 // Dot-matches-All mode. Jump straight to the end of the string.
5696 ix
= (int32_t)fActiveLimit
;
5699 // NOT DOT ALL mode. Line endings do not match '.'
5700 // Scan forward until a line ending or end of input.
5701 ix
= (int32_t)fp
->fInputIdx
;
5703 if (ix
>= fActiveLimit
) {
5708 U16_NEXT(inputBuf
, ix
, fActiveLimit
, c
); // c = inputBuf[ix++]
5709 if ((c
& 0x7f) <= 0x29) { // Fast filter of non-new-line-s
5710 if ((c
== 0x0a) || // 0x0a is newline in both modes.
5711 (((opValue
& 2) == 0) && // IF not UNIX_LINES mode
5712 isLineTerminator(c
))) {
5713 // char is a line ending. Put the input pos back to the
5714 // line ending char, and exit the scanning loop.
5715 U16_BACK_1(inputBuf
, 0, ix
);
5722 // If there were no matching characters, skip over the loop altogether.
5723 // The loop doesn't run at all, a * op always succeeds.
5724 if (ix
== fp
->fInputIdx
) {
5725 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
5729 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5730 // must follow. Its operand is the stack location
5731 // that holds the starting input index for the match of this .*
5732 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
5733 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
5734 int32_t stackLoc
= URX_VAL(loopcOp
);
5735 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
5736 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
5739 // Save State to the URX_LOOP_C op that follows this one,
5740 // so that match failures in the following code will return to there.
5741 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5742 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5750 U_ASSERT(opValue
>=0 && opValue
<fFrameSize
);
5751 backSearchIndex
= (int32_t)fp
->fExtra
[opValue
];
5752 U_ASSERT(backSearchIndex
<= fp
->fInputIdx
);
5753 if (backSearchIndex
== fp
->fInputIdx
) {
5754 // We've backed up the input idx to the point that the loop started.
5755 // The loop is done. Leave here without saving state.
5756 // Subsequent failures won't come back here.
5759 // Set up for the next iteration of the loop, with input index
5760 // backed up by one from the last time through,
5761 // and a state save to this instruction in case the following code fails again.
5762 // (We're going backwards because this loop emulates stack unwinding, not
5763 // the initial scan forward.)
5764 U_ASSERT(fp
->fInputIdx
> 0);
5766 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, prevC
); // !!!: should this 0 be one of f*Limit?
5768 if (prevC
== 0x0a &&
5769 fp
->fInputIdx
> backSearchIndex
&&
5770 inputBuf
[fp
->fInputIdx
-1] == 0x0d) {
5771 int32_t prevOp
= (int32_t)pat
[fp
->fPatIdx
-2];
5772 if (URX_TYPE(prevOp
) == URX_LOOP_DOT_I
) {
5773 // .*, stepping back over CRLF pair.
5774 U16_BACK_1(inputBuf
, 0, fp
->fInputIdx
);
5779 fp
= StateSave(fp
, fp
->fPatIdx
-1, status
);
5786 // Trouble. The compiled pattern contains an entry with an
5787 // unrecognized type tag.
5791 if (U_FAILURE(status
)) {
5800 fLastMatchEnd
= fMatchEnd
;
5801 fMatchStart
= startIdx
;
5802 fMatchEnd
= fp
->fInputIdx
;
5805 #ifdef REGEX_RUN_DEBUG
5808 printf("Match. start=%ld end=%ld\n\n", fMatchStart
, fMatchEnd
);
5810 printf("No match\n\n");
5815 fFrame
= fp
; // The active stack frame when the engine stopped.
5816 // Contains the capture group results that we need to
// RTTI boilerplate for RegexMatcher, generated by macro expansion.
// NOTE(review): presumably this defines the class-ID accessors for the class;
// confirm against the macro's definition, which is not visible in this file.
5823 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher
)
5827 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS