1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **************************************************************************
5 * Copyright (C) 2002-2016 International Business Machines Corporation
6 * and others. All rights reserved.
7 **************************************************************************
12 // Contains the implementation of class RegexMatcher,
13 // which is one of the main API classes for the ICU regular expression package.
16 #include "unicode/utypes.h"
17 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
19 #include "unicode/regex.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/ustring.h"
23 #include "unicode/rbbi.h"
24 #include "unicode/utf.h"
25 #include "unicode/utf16.h"
37 // #include <malloc.h> // Needed for heapcheck testing
42 // Default limit for the size of the back track stack, to avoid system
43 // failures caused by heap exhaustion. Units are in 32 bit words, not bytes.
44 // This value puts ICU's limits higher than most other regexp implementations,
45 // which use recursion rather than the heap, and take more storage per
48 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY
= 8000000;
50 // Time limit counter constant.
51 // Time limits for expression evaluation are in terms of quanta of work by
52 // the engine, each of which is 10,000 state saves.
53 // This constant determines the number of state saves per tick.
54 static const int32_t TIMER_INITIAL_VALUE
= 10000;
57 // Test for any of the Unicode line terminating characters.
58 static inline UBool
isLineTerminator(UChar32 c
) {
59 if (c
& ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
62 return (c
<=0x0d && c
>=0x0a) || c
==0x85 || c
==0x2028 || c
==0x2029;
65 //-----------------------------------------------------------------------------
67 // Constructor and Destructor
69 //-----------------------------------------------------------------------------
70 RegexMatcher::RegexMatcher(const RegexPattern
*pat
) {
71 fDeferredStatus
= U_ZERO_ERROR
;
72 init(fDeferredStatus
);
73 if (U_FAILURE(fDeferredStatus
)) {
77 fDeferredStatus
= U_ILLEGAL_ARGUMENT_ERROR
;
81 init2(RegexStaticSets::gStaticSets
->fEmptyText
, fDeferredStatus
);
86 RegexMatcher::RegexMatcher(const UnicodeString
®exp
, const UnicodeString
&input
,
87 uint32_t flags
, UErrorCode
&status
) {
89 if (U_FAILURE(status
)) {
93 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
94 fPattern
= fPatternOwned
;
96 UText inputText
= UTEXT_INITIALIZER
;
97 utext_openConstUnicodeString(&inputText
, &input
, &status
);
98 init2(&inputText
, status
);
99 utext_close(&inputText
);
101 fInputUniStrMaybeMutable
= TRUE
;
105 RegexMatcher::RegexMatcher(UText
*regexp
, UText
*input
,
106 uint32_t flags
, UErrorCode
&status
) {
108 if (U_FAILURE(status
)) {
112 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
113 if (U_FAILURE(status
)) {
117 fPattern
= fPatternOwned
;
118 init2(input
, status
);
122 RegexMatcher::RegexMatcher(const UnicodeString
®exp
,
123 uint32_t flags
, UErrorCode
&status
) {
125 if (U_FAILURE(status
)) {
129 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
130 if (U_FAILURE(status
)) {
133 fPattern
= fPatternOwned
;
134 init2(RegexStaticSets::gStaticSets
->fEmptyText
, status
);
137 RegexMatcher::RegexMatcher(UText
*regexp
,
138 uint32_t flags
, UErrorCode
&status
) {
140 if (U_FAILURE(status
)) {
144 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
145 if (U_FAILURE(status
)) {
149 fPattern
= fPatternOwned
;
150 init2(RegexStaticSets::gStaticSets
->fEmptyText
, status
);
156 RegexMatcher::~RegexMatcher() {
158 if (fData
!= fSmallData
) {
163 delete fPatternOwned
;
164 fPatternOwned
= NULL
;
172 utext_close(fInputText
);
175 utext_close(fAltInputText
);
178 #if UCONFIG_NO_BREAK_ITERATION==0
179 delete fWordBreakItr
;
184 // init() common initialization for use by all constructors.
185 // Initialize all fields, get the object into a consistent state.
186 // This must be done even when the initial status shows an error,
187 // so that the object is initialized sufficiently well for the destructor
190 void RegexMatcher::init(UErrorCode
&status
) {
192 fPatternOwned
= NULL
;
202 fTransparentBounds
= FALSE
;
203 fAnchoringBounds
= TRUE
;
216 fStackLimit
= DEFAULT_BACKTRACK_STACK_CAPACITY
;
218 fCallbackContext
= NULL
;
219 fFindProgressCallbackFn
= NULL
;
220 fFindProgressCallbackContext
= NULL
;
222 fDeferredStatus
= status
;
224 fWordBreakItr
= NULL
;
228 fAltInputText
= NULL
;
231 fInputUniStrMaybeMutable
= FALSE
;
235 // init2() Common initialization for use by RegexMatcher constructors, part 2.
236 // This handles the common setup to be done after the Pattern is available.
238 void RegexMatcher::init2(UText
*input
, UErrorCode
&status
) {
239 if (U_FAILURE(status
)) {
240 fDeferredStatus
= status
;
244 if (fPattern
->fDataSize
> UPRV_LENGTHOF(fSmallData
)) {
245 fData
= (int64_t *)uprv_malloc(fPattern
->fDataSize
* sizeof(int64_t));
247 status
= fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
252 fStack
= new UVector64(status
);
253 if (fStack
== NULL
) {
254 status
= fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
259 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY
, status
);
260 if (U_FAILURE(status
)) {
261 fDeferredStatus
= status
;
267 static const UChar BACKSLASH
= 0x5c;
268 static const UChar DOLLARSIGN
= 0x24;
269 static const UChar LEFTBRACKET
= 0x7b;
270 static const UChar RIGHTBRACKET
= 0x7d;
272 //--------------------------------------------------------------------------------
276 //--------------------------------------------------------------------------------
277 RegexMatcher
&RegexMatcher::appendReplacement(UnicodeString
&dest
,
278 const UnicodeString
&replacement
,
279 UErrorCode
&status
) {
280 UText replacementText
= UTEXT_INITIALIZER
;
282 utext_openConstUnicodeString(&replacementText
, &replacement
, &status
);
283 if (U_SUCCESS(status
)) {
284 UText resultText
= UTEXT_INITIALIZER
;
285 utext_openUnicodeString(&resultText
, &dest
, &status
);
287 if (U_SUCCESS(status
)) {
288 appendReplacement(&resultText
, &replacementText
, status
);
289 utext_close(&resultText
);
291 utext_close(&replacementText
);
298 // appendReplacement, UText mode
300 RegexMatcher
&RegexMatcher::appendReplacement(UText
*dest
,
302 UErrorCode
&status
) {
303 if (U_FAILURE(status
)) {
306 if (U_FAILURE(fDeferredStatus
)) {
307 status
= fDeferredStatus
;
310 if (fMatch
== FALSE
) {
311 status
= U_REGEX_INVALID_STATE
;
315 // Copy input string from the end of previous match to start of current match
316 int64_t destLen
= utext_nativeLength(dest
);
317 if (fMatchStart
> fAppendPosition
) {
318 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
319 destLen
+= utext_replace(dest
, destLen
, destLen
, fInputText
->chunkContents
+fAppendPosition
,
320 (int32_t)(fMatchStart
-fAppendPosition
), &status
);
323 if (UTEXT_USES_U16(fInputText
)) {
324 len16
= (int32_t)(fMatchStart
-fAppendPosition
);
326 UErrorCode lengthStatus
= U_ZERO_ERROR
;
327 len16
= utext_extract(fInputText
, fAppendPosition
, fMatchStart
, NULL
, 0, &lengthStatus
);
329 UChar
*inputChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(len16
+1));
330 if (inputChars
== NULL
) {
331 status
= U_MEMORY_ALLOCATION_ERROR
;
334 utext_extract(fInputText
, fAppendPosition
, fMatchStart
, inputChars
, len16
+1, &status
);
335 destLen
+= utext_replace(dest
, destLen
, destLen
, inputChars
, len16
, &status
);
336 uprv_free(inputChars
);
339 fAppendPosition
= fMatchEnd
;
342 // scan the replacement text, looking for substitutions ($n) and \escapes.
343 // TODO: optimize this loop by efficiently scanning for '$' or '\',
344 // move entire ranges not containing substitutions.
345 UTEXT_SETNATIVEINDEX(replacement
, 0);
346 for (UChar32 c
= UTEXT_NEXT32(replacement
); U_SUCCESS(status
) && c
!= U_SENTINEL
; c
= UTEXT_NEXT32(replacement
)) {
347 if (c
== BACKSLASH
) {
348 // Backslash Escape. Copy the following char out without further checks.
349 // Note: Surrogate pairs don't need any special handling
350 // The second half won't be a '$' or a '\', and
351 // will move to the dest normally on the next
353 c
= UTEXT_CURRENT32(replacement
);
354 if (c
== U_SENTINEL
) {
358 if (c
==0x55/*U*/ || c
==0x75/*u*/) {
359 // We have a \udddd or \Udddddddd escape sequence.
361 struct URegexUTextUnescapeCharContext context
= U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement
);
362 UChar32 escapedChar
= u_unescapeAt(uregex_utext_unescape_charAt
, &offset
, INT32_MAX
, &context
);
363 if (escapedChar
!= (UChar32
)0xFFFFFFFF) {
364 if (U_IS_BMP(escapedChar
)) {
365 UChar c16
= (UChar
)escapedChar
;
366 destLen
+= utext_replace(dest
, destLen
, destLen
, &c16
, 1, &status
);
369 surrogate
[0] = U16_LEAD(escapedChar
);
370 surrogate
[1] = U16_TRAIL(escapedChar
);
371 if (U_SUCCESS(status
)) {
372 destLen
+= utext_replace(dest
, destLen
, destLen
, surrogate
, 2, &status
);
375 // TODO: Report errors for mal-formed \u escapes?
376 // As this is, the original sequence is output, which may be OK.
377 if (context
.lastOffset
== offset
) {
378 (void)UTEXT_PREVIOUS32(replacement
);
379 } else if (context
.lastOffset
!= offset
-1) {
380 utext_moveIndex32(replacement
, offset
- context
.lastOffset
- 1);
384 (void)UTEXT_NEXT32(replacement
);
385 // Plain backslash escape. Just put out the escaped character.
387 UChar c16
= (UChar
)c
;
388 destLen
+= utext_replace(dest
, destLen
, destLen
, &c16
, 1, &status
);
391 surrogate
[0] = U16_LEAD(c
);
392 surrogate
[1] = U16_TRAIL(c
);
393 if (U_SUCCESS(status
)) {
394 destLen
+= utext_replace(dest
, destLen
, destLen
, surrogate
, 2, &status
);
398 } else if (c
!= DOLLARSIGN
) {
399 // Normal char, not a $. Copy it out without further checks.
401 UChar c16
= (UChar
)c
;
402 destLen
+= utext_replace(dest
, destLen
, destLen
, &c16
, 1, &status
);
405 surrogate
[0] = U16_LEAD(c
);
406 surrogate
[1] = U16_TRAIL(c
);
407 if (U_SUCCESS(status
)) {
408 destLen
+= utext_replace(dest
, destLen
, destLen
, surrogate
, 2, &status
);
412 // We've got a $. Pick up a capture group name or number if one follows.
413 // Consume digits so long as the resulting group number <= the number of
414 // capture groups in the pattern.
416 int32_t groupNum
= 0;
417 int32_t numDigits
= 0;
418 UChar32 nextChar
= utext_current32(replacement
);
419 if (nextChar
== LEFTBRACKET
) {
420 // Scan for a Named Capture Group, ${name}.
421 UnicodeString groupName
;
422 utext_next32(replacement
);
423 while(U_SUCCESS(status
) && nextChar
!= RIGHTBRACKET
) {
424 nextChar
= utext_next32(replacement
);
425 if (nextChar
== U_SENTINEL
) {
426 status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
427 } else if ((nextChar
>= 0x41 && nextChar
<= 0x5a) || // A..Z
428 (nextChar
>= 0x61 && nextChar
<= 0x7a) || // a..z
429 (nextChar
>= 0x31 && nextChar
<= 0x39)) { // 0..9
430 groupName
.append(nextChar
);
431 } else if (nextChar
== RIGHTBRACKET
) {
432 groupNum
= uhash_geti(fPattern
->fNamedCaptureMap
, &groupName
);
434 status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
437 // Character was something other than a name char or a closing '}'
438 status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
442 } else if (u_isdigit(nextChar
)) {
443 // $n Scan for a capture group number
444 int32_t numCaptureGroups
= fPattern
->fGroupMap
->size();
446 nextChar
= UTEXT_CURRENT32(replacement
);
447 if (nextChar
== U_SENTINEL
) {
450 if (u_isdigit(nextChar
) == FALSE
) {
453 int32_t nextDigitVal
= u_charDigitValue(nextChar
);
454 if (groupNum
*10 + nextDigitVal
> numCaptureGroups
) {
455 // Don't consume the next digit if it makes the capture group number too big.
456 if (numDigits
== 0) {
457 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
461 (void)UTEXT_NEXT32(replacement
);
462 groupNum
=groupNum
*10 + nextDigitVal
;
466 // $ not followed by capture group name or number.
467 status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
470 if (U_SUCCESS(status
)) {
471 destLen
+= appendGroup(groupNum
, dest
, status
);
473 } // End of $ capture group handling
474 } // End of per-character loop through the replacement string.
481 //--------------------------------------------------------------------------------
483 // appendTail Intended to be used in conjunction with appendReplacement()
484 // To the destination string, append everything following
485 // the last match position from the input string.
487 // Note: Match ranges do not affect appendTail or appendReplacement
489 //--------------------------------------------------------------------------------
490 UnicodeString
&RegexMatcher::appendTail(UnicodeString
&dest
) {
491 UErrorCode status
= U_ZERO_ERROR
;
492 UText resultText
= UTEXT_INITIALIZER
;
493 utext_openUnicodeString(&resultText
, &dest
, &status
);
495 if (U_SUCCESS(status
)) {
496 appendTail(&resultText
, status
);
497 utext_close(&resultText
);
504 // appendTail, UText mode
506 UText
*RegexMatcher::appendTail(UText
*dest
, UErrorCode
&status
) {
507 if (U_FAILURE(status
)) {
510 if (U_FAILURE(fDeferredStatus
)) {
511 status
= fDeferredStatus
;
515 if (fInputLength
> fAppendPosition
) {
516 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
517 int64_t destLen
= utext_nativeLength(dest
);
518 utext_replace(dest
, destLen
, destLen
, fInputText
->chunkContents
+fAppendPosition
,
519 (int32_t)(fInputLength
-fAppendPosition
), &status
);
522 if (UTEXT_USES_U16(fInputText
)) {
523 len16
= (int32_t)(fInputLength
-fAppendPosition
);
525 len16
= utext_extract(fInputText
, fAppendPosition
, fInputLength
, NULL
, 0, &status
);
526 status
= U_ZERO_ERROR
; // buffer overflow
529 UChar
*inputChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(len16
));
530 if (inputChars
== NULL
) {
531 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
533 utext_extract(fInputText
, fAppendPosition
, fInputLength
, inputChars
, len16
, &status
); // unterminated
534 int64_t destLen
= utext_nativeLength(dest
);
535 utext_replace(dest
, destLen
, destLen
, inputChars
, len16
, &status
);
536 uprv_free(inputChars
);
545 //--------------------------------------------------------------------------------
549 //--------------------------------------------------------------------------------
550 int32_t RegexMatcher::end(UErrorCode
&err
) const {
554 int64_t RegexMatcher::end64(UErrorCode
&err
) const {
555 return end64(0, err
);
558 int64_t RegexMatcher::end64(int32_t group
, UErrorCode
&err
) const {
559 if (U_FAILURE(err
)) {
562 if (fMatch
== FALSE
) {
563 err
= U_REGEX_INVALID_STATE
;
566 if (group
< 0 || group
> fPattern
->fGroupMap
->size()) {
567 err
= U_INDEX_OUTOFBOUNDS_ERROR
;
574 // Get the position within the stack frame of the variables for
575 // this capture group.
576 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(group
-1);
577 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
578 U_ASSERT(groupOffset
>= 0);
579 e
= fFrame
->fExtra
[groupOffset
+ 1];
585 int32_t RegexMatcher::end(int32_t group
, UErrorCode
&err
) const {
586 return (int32_t)end64(group
, err
);
589 //--------------------------------------------------------------------------------
591 // findProgressInterrupt This function is called once for each advance in the target
592 // string from the find() function, and calls the user progress callback
593 // function if there is one installed.
595 // Return: TRUE if the find operation is to be terminated.
596 // FALSE if the find operation is to continue running.
598 //--------------------------------------------------------------------------------
599 UBool
RegexMatcher::findProgressInterrupt(int64_t pos
, UErrorCode
&status
) {
600 if (fFindProgressCallbackFn
&& !(*fFindProgressCallbackFn
)(fFindProgressCallbackContext
, pos
)) {
601 status
= U_REGEX_STOPPED_BY_CALLER
;
607 //--------------------------------------------------------------------------------
611 //--------------------------------------------------------------------------------
612 UBool
RegexMatcher::find() {
613 if (U_FAILURE(fDeferredStatus
)) {
616 UErrorCode status
= U_ZERO_ERROR
;
617 UBool result
= find(status
);
621 //--------------------------------------------------------------------------------
625 //--------------------------------------------------------------------------------
626 UBool
RegexMatcher::find(UErrorCode
&status
) {
627 // Start at the position of the last match end. (Will be zero if the
628 // matcher has been reset.)
630 if (U_FAILURE(status
)) {
633 if (U_FAILURE(fDeferredStatus
)) {
634 status
= fDeferredStatus
;
638 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
639 return findUsingChunk(status
);
642 int64_t startPos
= fMatchEnd
;
644 startPos
= fActiveStart
;
648 // Save the position of any previous successful match.
649 fLastMatchEnd
= fMatchEnd
;
651 if (fMatchStart
== fMatchEnd
) {
652 // Previous match had zero length. Move start position up one position
653 // to avoid sending find() into a loop on zero-length matches.
654 if (startPos
>= fActiveLimit
) {
659 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
660 (void)UTEXT_NEXT32(fInputText
);
661 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
664 if (fLastMatchEnd
>= 0) {
665 // A previous find() failed to match. Don't try again.
666 // (without this test, a pattern with a zero-length match
667 // could match again at the end of an input string.)
674 // Compute the position in the input string beyond which a match can not begin, because
675 // the minimum length match would extend past the end of the input.
676 // Note: some patterns that cannot match anything will have fMinMatchLen==Max Int.
677 // Be aware of possible overflows if making changes here.
678 int64_t testStartLimit
;
679 if (UTEXT_USES_U16(fInputText
)) {
680 testStartLimit
= fActiveLimit
- fPattern
->fMinMatchLen
;
681 if (startPos
> testStartLimit
) {
687 // We don't know exactly how long the minimum match length is in native characters.
688 // Treat anything > 0 as 1.
689 testStartLimit
= fActiveLimit
- (fPattern
->fMinMatchLen
> 0 ? 1 : 0);
693 U_ASSERT(startPos
>= 0);
695 switch (fPattern
->fStartType
) {
697 // No optimization was found.
698 // Try a match at each input position.
700 MatchAt(startPos
, FALSE
, status
);
701 if (U_FAILURE(status
)) {
707 if (startPos
>= testStartLimit
) {
711 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
712 (void)UTEXT_NEXT32(fInputText
);
713 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
714 // Note that it's perfectly OK for a pattern to have a zero-length
715 // match at the end of a string, so we must make sure that the loop
716 // runs with startPos == testStartLimit the last time through.
717 if (findProgressInterrupt(startPos
, status
))
723 // Matches are only possible at the start of the input string
724 // (pattern begins with ^ or \A)
725 if (startPos
> fActiveStart
) {
729 MatchAt(startPos
, FALSE
, status
);
730 if (U_FAILURE(status
)) {
738 // Match may start on any char from a pre-computed set.
739 U_ASSERT(fPattern
->fMinMatchLen
> 0);
740 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
742 int64_t pos
= startPos
;
743 c
= UTEXT_NEXT32(fInputText
);
744 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
745 // c will be -1 (U_SENTINEL) at end of text, in which case we
746 // skip this next block (so we don't have a negative array index)
747 // and handle end of text in the following block.
748 if (c
>= 0 && ((c
<256 && fPattern
->fInitialChars8
->contains(c
)) ||
749 (c
>=256 && fPattern
->fInitialChars
->contains(c
)))) {
750 MatchAt(pos
, FALSE
, status
);
751 if (U_FAILURE(status
)) {
757 UTEXT_SETNATIVEINDEX(fInputText
, pos
);
759 if (startPos
> testStartLimit
) {
764 if (findProgressInterrupt(startPos
, status
))
773 // Match starts on exactly one char.
774 U_ASSERT(fPattern
->fMinMatchLen
> 0);
775 UChar32 theChar
= fPattern
->fInitialChar
;
776 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
778 int64_t pos
= startPos
;
779 c
= UTEXT_NEXT32(fInputText
);
780 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
782 MatchAt(pos
, FALSE
, status
);
783 if (U_FAILURE(status
)) {
789 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
791 if (startPos
> testStartLimit
) {
796 if (findProgressInterrupt(startPos
, status
))
805 if (startPos
== fAnchorStart
) {
806 MatchAt(startPos
, FALSE
, status
);
807 if (U_FAILURE(status
)) {
813 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
814 c
= UTEXT_NEXT32(fInputText
);
815 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
817 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
818 c
= UTEXT_PREVIOUS32(fInputText
);
819 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
822 if (fPattern
->fFlags
& UREGEX_UNIX_LINES
) {
825 MatchAt(startPos
, FALSE
, status
);
826 if (U_FAILURE(status
)) {
832 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
834 if (startPos
>= testStartLimit
) {
839 c
= UTEXT_NEXT32(fInputText
);
840 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
841 // Note that it's perfectly OK for a pattern to have a zero-length
842 // match at the end of a string, so we must make sure that the loop
843 // runs with startPos == testStartLimit the last time through.
844 if (findProgressInterrupt(startPos
, status
))
849 if (isLineTerminator(c
)) {
850 if (c
== 0x0d && startPos
< fActiveLimit
&& UTEXT_CURRENT32(fInputText
) == 0x0a) {
851 (void)UTEXT_NEXT32(fInputText
);
852 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
854 MatchAt(startPos
, FALSE
, status
);
855 if (U_FAILURE(status
)) {
861 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
863 if (startPos
>= testStartLimit
) {
868 c
= UTEXT_NEXT32(fInputText
);
869 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
870 // Note that it's perfectly OK for a pattern to have a zero-length
871 // match at the end of a string, so we must make sure that the loop
872 // runs with startPos == testStartLimit the last time through.
873 if (findProgressInterrupt(startPos
, status
))
889 UBool
RegexMatcher::find(int64_t start
, UErrorCode
&status
) {
890 if (U_FAILURE(status
)) {
893 if (U_FAILURE(fDeferredStatus
)) {
894 status
= fDeferredStatus
;
897 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
898 // This will reset the region to be the full input length.
900 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
904 int64_t nativeStart
= start
;
905 if (nativeStart
< fActiveStart
|| nativeStart
> fActiveLimit
) {
906 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
909 fMatchEnd
= nativeStart
;
914 //--------------------------------------------------------------------------------
916 // findUsingChunk() -- like find(), but with the advance knowledge that the
917 // entire string is available in the UText's chunk buffer.
919 //--------------------------------------------------------------------------------
920 UBool
RegexMatcher::findUsingChunk(UErrorCode
&status
) {
921 // Start at the position of the last match end. (Will be zero if the
922 // matcher has been reset.)
925 int32_t startPos
= (int32_t)fMatchEnd
;
927 startPos
= (int32_t)fActiveStart
;
930 const UChar
*inputBuf
= fInputText
->chunkContents
;
933 // Save the position of any previous successful match.
934 fLastMatchEnd
= fMatchEnd
;
936 if (fMatchStart
== fMatchEnd
) {
937 // Previous match had zero length. Move start position up one position
938 // to avoid sending find() into a loop on zero-length matches.
939 if (startPos
>= fActiveLimit
) {
944 U16_FWD_1(inputBuf
, startPos
, fInputLength
);
947 if (fLastMatchEnd
>= 0) {
948 // A previous find() failed to match. Don't try again.
949 // (without this test, a pattern with a zero-length match
950 // could match again at the end of an input string.)
957 // Compute the position in the input string beyond which a match can not begin, because
958 // the minimum length match would extend past the end of the input.
959 // Note: some patterns that cannot match anything will have fMinMatchLen==Max Int.
960 // Be aware of possible overflows if making changes here.
961 // Note: a match can begin at inputBuf + testLen; it is an inclusive limit.
962 int32_t testLen
= (int32_t)(fActiveLimit
- fPattern
->fMinMatchLen
);
963 if (startPos
> testLen
) {
970 U_ASSERT(startPos
>= 0);
972 switch (fPattern
->fStartType
) {
974 // No optimization was found.
975 // Try a match at each input position.
977 MatchChunkAt(startPos
, FALSE
, status
);
978 if (U_FAILURE(status
)) {
984 if (startPos
>= testLen
) {
988 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
989 // Note that it's perfectly OK for a pattern to have a zero-length
990 // match at the end of a string, so we must make sure that the loop
991 // runs with startPos == testLen the last time through.
992 if (findProgressInterrupt(startPos
, status
))
998 // Matches are only possible at the start of the input string
999 // (pattern begins with ^ or \A)
1000 if (startPos
> fActiveStart
) {
1004 MatchChunkAt(startPos
, FALSE
, status
);
1005 if (U_FAILURE(status
)) {
1013 // Match may start on any char from a pre-computed set.
1014 U_ASSERT(fPattern
->fMinMatchLen
> 0);
1016 int32_t pos
= startPos
;
1017 U16_NEXT(inputBuf
, startPos
, fActiveLimit
, c
); // like c = inputBuf[startPos++];
1018 if ((c
<256 && fPattern
->fInitialChars8
->contains(c
)) ||
1019 (c
>=256 && fPattern
->fInitialChars
->contains(c
))) {
1020 MatchChunkAt(pos
, FALSE
, status
);
1021 if (U_FAILURE(status
)) {
1028 if (startPos
> testLen
) {
1033 if (findProgressInterrupt(startPos
, status
))
1042 // Match starts on exactly one char.
1043 U_ASSERT(fPattern
->fMinMatchLen
> 0);
1044 UChar32 theChar
= fPattern
->fInitialChar
;
1046 int32_t pos
= startPos
;
1047 U16_NEXT(inputBuf
, startPos
, fActiveLimit
, c
); // like c = inputBuf[startPos++];
1049 MatchChunkAt(pos
, FALSE
, status
);
1050 if (U_FAILURE(status
)) {
1057 if (startPos
> testLen
) {
1062 if (findProgressInterrupt(startPos
, status
))
1071 if (startPos
== fAnchorStart
) {
1072 MatchChunkAt(startPos
, FALSE
, status
);
1073 if (U_FAILURE(status
)) {
1079 // In bug 31063104 which has a zero-length text buffer we get here with
1080 // inputBuf=NULL, startPos=fActiveLimit=0 (and fMatch F) which violates the
1081 // requirement for U16_FWD_1 (utf16.h) that startPos < fActiveLimit. Having
1082 // inputBuf=NULL (chunkContents NULL) is probably due to an error in the
1083 // CFStringUText functions. Nevertheless, to be defensive, add test below.
1084 if (startPos
>= testLen
) {
1088 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
1091 if (fPattern
->fFlags
& UREGEX_UNIX_LINES
) {
1093 c
= inputBuf
[startPos
-1];
1095 MatchChunkAt(startPos
, FALSE
, status
);
1096 if (U_FAILURE(status
)) {
1103 if (startPos
>= testLen
) {
1108 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
1109 // Note that it's perfectly OK for a pattern to have a zero-length
1110 // match at the end of a string, so we must make sure that the loop
1111 // runs with startPos == testLen the last time through.
1112 if (findProgressInterrupt(startPos
, status
))
1117 c
= inputBuf
[startPos
-1];
1118 if (isLineTerminator(c
)) {
1119 if (c
== 0x0d && startPos
< fActiveLimit
&& inputBuf
[startPos
] == 0x0a) {
1122 MatchChunkAt(startPos
, FALSE
, status
);
1123 if (U_FAILURE(status
)) {
1130 if (startPos
>= testLen
) {
1135 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
1136 // Note that it's perfectly OK for a pattern to have a zero-length
1137 // match at the end of a string, so we must make sure that the loop
1138 // runs with startPos == testLen the last time through.
1139 if (findProgressInterrupt(startPos
, status
))
1155 //--------------------------------------------------------------------------------
1159 //--------------------------------------------------------------------------------
1160 UnicodeString
RegexMatcher::group(UErrorCode
&status
) const {
1161 return group(0, status
);
1164 // Return immutable shallow clone
1165 UText
*RegexMatcher::group(UText
*dest
, int64_t &group_len
, UErrorCode
&status
) const {
1166 return group(0, dest
, group_len
, status
);
1169 // Return immutable shallow clone
1170 UText
*RegexMatcher::group(int32_t groupNum
, UText
*dest
, int64_t &group_len
, UErrorCode
&status
) const {
1172 if (U_FAILURE(status
)) {
1175 if (U_FAILURE(fDeferredStatus
)) {
1176 status
= fDeferredStatus
;
1177 } else if (fMatch
== FALSE
) {
1178 status
= U_REGEX_INVALID_STATE
;
1179 } else if (groupNum
< 0 || groupNum
> fPattern
->fGroupMap
->size()) {
1180 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1183 if (U_FAILURE(status
)) {
1188 if (groupNum
== 0) {
1192 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(groupNum
-1);
1193 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
1194 U_ASSERT(groupOffset
>= 0);
1195 s
= fFrame
->fExtra
[groupOffset
];
1196 e
= fFrame
->fExtra
[groupOffset
+1];
1200 // A capture group wasn't part of the match
1201 return utext_clone(dest
, fInputText
, FALSE
, TRUE
, &status
);
1206 dest
= utext_clone(dest
, fInputText
, FALSE
, TRUE
, &status
);
1208 UTEXT_SETNATIVEINDEX(dest
, s
);
1212 UnicodeString
RegexMatcher::group(int32_t groupNum
, UErrorCode
&status
) const {
1213 UnicodeString result
;
1214 int64_t groupStart
= start64(groupNum
, status
);
1215 int64_t groupEnd
= end64(groupNum
, status
);
1216 if (U_FAILURE(status
) || groupStart
== -1 || groupStart
== groupEnd
) {
1220 // Get the group length using a utext_extract preflight.
1221 // UText is actually pretty efficient at this when underlying encoding is UTF-16.
1222 int32_t length
= utext_extract(fInputText
, groupStart
, groupEnd
, NULL
, 0, &status
);
1223 if (status
!= U_BUFFER_OVERFLOW_ERROR
) {
1227 status
= U_ZERO_ERROR
;
1228 UChar
*buf
= result
.getBuffer(length
);
1230 status
= U_MEMORY_ALLOCATION_ERROR
;
1232 int32_t extractLength
= utext_extract(fInputText
, groupStart
, groupEnd
, buf
, length
, &status
);
1233 result
.releaseBuffer(extractLength
);
1234 U_ASSERT(length
== extractLength
);
1240 //--------------------------------------------------------------------------------
1242 // appendGroup() -- currently internal only, appends a group to a UText rather
1243 // than replacing its contents
1245 //--------------------------------------------------------------------------------
1247 int64_t RegexMatcher::appendGroup(int32_t groupNum
, UText
*dest
, UErrorCode
&status
) const {
1248 if (U_FAILURE(status
)) {
1251 if (U_FAILURE(fDeferredStatus
)) {
1252 status
= fDeferredStatus
;
1255 int64_t destLen
= utext_nativeLength(dest
);
1257 if (fMatch
== FALSE
) {
1258 status
= U_REGEX_INVALID_STATE
;
1259 return utext_replace(dest
, destLen
, destLen
, NULL
, 0, &status
);
1261 if (groupNum
< 0 || groupNum
> fPattern
->fGroupMap
->size()) {
1262 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1263 return utext_replace(dest
, destLen
, destLen
, NULL
, 0, &status
);
1267 if (groupNum
== 0) {
1271 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(groupNum
-1);
1272 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
1273 U_ASSERT(groupOffset
>= 0);
1274 s
= fFrame
->fExtra
[groupOffset
];
1275 e
= fFrame
->fExtra
[groupOffset
+1];
1279 // A capture group wasn't part of the match
1280 return utext_replace(dest
, destLen
, destLen
, NULL
, 0, &status
);
1285 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1286 U_ASSERT(e
<= fInputLength
);
1287 deltaLen
= utext_replace(dest
, destLen
, destLen
, fInputText
->chunkContents
+s
, (int32_t)(e
-s
), &status
);
1290 if (UTEXT_USES_U16(fInputText
)) {
1291 len16
= (int32_t)(e
-s
);
1293 UErrorCode lengthStatus
= U_ZERO_ERROR
;
1294 len16
= utext_extract(fInputText
, s
, e
, NULL
, 0, &lengthStatus
);
1296 UChar
*groupChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(len16
+1));
1297 if (groupChars
== NULL
) {
1298 status
= U_MEMORY_ALLOCATION_ERROR
;
1301 utext_extract(fInputText
, s
, e
, groupChars
, len16
+1, &status
);
1303 deltaLen
= utext_replace(dest
, destLen
, destLen
, groupChars
, len16
, &status
);
1304 uprv_free(groupChars
);
1311 //--------------------------------------------------------------------------------
1315 //--------------------------------------------------------------------------------
1316 int32_t RegexMatcher::groupCount() const {
1317 return fPattern
->fGroupMap
->size();
1320 //--------------------------------------------------------------------------------
1322 // hasAnchoringBounds()
1324 //--------------------------------------------------------------------------------
1325 UBool
RegexMatcher::hasAnchoringBounds() const {
1326 return fAnchoringBounds
;
1330 //--------------------------------------------------------------------------------
1332 // hasTransparentBounds()
1334 //--------------------------------------------------------------------------------
1335 UBool
RegexMatcher::hasTransparentBounds() const {
1336 return fTransparentBounds
;
1341 //--------------------------------------------------------------------------------
1345 //--------------------------------------------------------------------------------
1346 UBool
RegexMatcher::hitEnd() const {
1351 //--------------------------------------------------------------------------------
1355 //--------------------------------------------------------------------------------
1356 const UnicodeString
&RegexMatcher::input() const {
1358 UErrorCode status
= U_ZERO_ERROR
;
1360 if (UTEXT_USES_U16(fInputText
)) {
1361 len16
= (int32_t)fInputLength
;
1363 len16
= utext_extract(fInputText
, 0, fInputLength
, NULL
, 0, &status
);
1364 status
= U_ZERO_ERROR
; // overflow, length status
1366 UnicodeString
*result
= new UnicodeString(len16
, 0, 0);
1368 UChar
*inputChars
= result
->getBuffer(len16
);
1369 utext_extract(fInputText
, 0, fInputLength
, inputChars
, len16
, &status
); // unterminated warning
1370 result
->releaseBuffer(len16
);
1372 (*(const UnicodeString
**)&fInput
) = result
; // pointer assignment, rather than operator=
1378 //--------------------------------------------------------------------------------
1382 //--------------------------------------------------------------------------------
1383 UText
*RegexMatcher::inputText() const {
1388 //--------------------------------------------------------------------------------
1390 // getInput() -- like inputText(), but makes a clone or copies into another UText
1392 //--------------------------------------------------------------------------------
1393 UText
*RegexMatcher::getInput (UText
*dest
, UErrorCode
&status
) const {
1394 if (U_FAILURE(status
)) {
1397 if (U_FAILURE(fDeferredStatus
)) {
1398 status
= fDeferredStatus
;
1403 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1404 utext_replace(dest
, 0, utext_nativeLength(dest
), fInputText
->chunkContents
, (int32_t)fInputLength
, &status
);
1407 if (UTEXT_USES_U16(fInputText
)) {
1408 input16Len
= (int32_t)fInputLength
;
1410 UErrorCode lengthStatus
= U_ZERO_ERROR
;
1411 input16Len
= utext_extract(fInputText
, 0, fInputLength
, NULL
, 0, &lengthStatus
); // buffer overflow error
1413 UChar
*inputChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(input16Len
));
1414 if (inputChars
== NULL
) {
1418 status
= U_ZERO_ERROR
;
1419 utext_extract(fInputText
, 0, fInputLength
, inputChars
, input16Len
, &status
); // not terminated warning
1420 status
= U_ZERO_ERROR
;
1421 utext_replace(dest
, 0, utext_nativeLength(dest
), inputChars
, input16Len
, &status
);
1423 uprv_free(inputChars
);
1427 return utext_clone(NULL
, fInputText
, FALSE
, TRUE
, &status
);
1432 static UBool
compat_SyncMutableUTextContents(UText
*ut
);
1433 static UBool
compat_SyncMutableUTextContents(UText
*ut
) {
1434 UBool retVal
= FALSE
;
1436 // In the following test, we're really only interested in whether the UText should switch
1437 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
1438 // will still point to the correct data.
1439 if (utext_nativeLength(ut
) != ut
->nativeIndexingLimit
) {
1440 UnicodeString
*us
=(UnicodeString
*)ut
->context
;
1442 // Update to the latest length.
1443 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1444 int32_t newLength
= us
->length();
1446 // Update the chunk description.
1447 // The buffer may have switched between stack- and heap-based.
1448 ut
->chunkContents
= us
->getBuffer();
1449 ut
->chunkLength
= newLength
;
1450 ut
->chunkNativeLimit
= newLength
;
1451 ut
->nativeIndexingLimit
= newLength
;
1458 //--------------------------------------------------------------------------------
1462 //--------------------------------------------------------------------------------
1463 UBool
RegexMatcher::lookingAt(UErrorCode
&status
) {
1464 if (U_FAILURE(status
)) {
1467 if (U_FAILURE(fDeferredStatus
)) {
1468 status
= fDeferredStatus
;
1472 if (fInputUniStrMaybeMutable
) {
1473 if (compat_SyncMutableUTextContents(fInputText
)) {
1474 fInputLength
= utext_nativeLength(fInputText
);
1479 resetPreserveRegion();
1481 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1482 MatchChunkAt((int32_t)fActiveStart
, FALSE
, status
);
1484 MatchAt(fActiveStart
, FALSE
, status
);
1490 UBool
RegexMatcher::lookingAt(int64_t start
, UErrorCode
&status
) {
1491 if (U_FAILURE(status
)) {
1494 if (U_FAILURE(fDeferredStatus
)) {
1495 status
= fDeferredStatus
;
1501 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1505 if (fInputUniStrMaybeMutable
) {
1506 if (compat_SyncMutableUTextContents(fInputText
)) {
1507 fInputLength
= utext_nativeLength(fInputText
);
1512 int64_t nativeStart
;
1513 nativeStart
= start
;
1514 if (nativeStart
< fActiveStart
|| nativeStart
> fActiveLimit
) {
1515 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1519 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1520 MatchChunkAt((int32_t)nativeStart
, FALSE
, status
);
1522 MatchAt(nativeStart
, FALSE
, status
);
1529 //--------------------------------------------------------------------------------
1533 //--------------------------------------------------------------------------------
1534 UBool
RegexMatcher::matches(UErrorCode
&status
) {
1535 if (U_FAILURE(status
)) {
1538 if (U_FAILURE(fDeferredStatus
)) {
1539 status
= fDeferredStatus
;
1543 if (fInputUniStrMaybeMutable
) {
1544 if (compat_SyncMutableUTextContents(fInputText
)) {
1545 fInputLength
= utext_nativeLength(fInputText
);
1550 resetPreserveRegion();
1553 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1554 MatchChunkAt((int32_t)fActiveStart
, TRUE
, status
);
1556 MatchAt(fActiveStart
, TRUE
, status
);
1562 UBool
RegexMatcher::matches(int64_t start
, UErrorCode
&status
) {
1563 if (U_FAILURE(status
)) {
1566 if (U_FAILURE(fDeferredStatus
)) {
1567 status
= fDeferredStatus
;
1573 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1577 if (fInputUniStrMaybeMutable
) {
1578 if (compat_SyncMutableUTextContents(fInputText
)) {
1579 fInputLength
= utext_nativeLength(fInputText
);
1584 int64_t nativeStart
;
1585 nativeStart
= start
;
1586 if (nativeStart
< fActiveStart
|| nativeStart
> fActiveLimit
) {
1587 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1591 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1592 MatchChunkAt((int32_t)nativeStart
, TRUE
, status
);
1594 MatchAt(nativeStart
, TRUE
, status
);
1601 //--------------------------------------------------------------------------------
1605 //--------------------------------------------------------------------------------
1606 const RegexPattern
&RegexMatcher::pattern() const {
1612 //--------------------------------------------------------------------------------
1616 //--------------------------------------------------------------------------------
1617 RegexMatcher
&RegexMatcher::region(int64_t regionStart
, int64_t regionLimit
, int64_t startIndex
, UErrorCode
&status
) {
1618 if (U_FAILURE(status
)) {
1622 if (regionStart
>regionLimit
|| regionStart
<0 || regionLimit
<0) {
1623 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1626 int64_t nativeStart
= regionStart
;
1627 int64_t nativeLimit
= regionLimit
;
1628 if (nativeStart
> fInputLength
|| nativeLimit
> fInputLength
) {
1629 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1632 if (startIndex
== -1)
1635 resetPreserveRegion();
1637 fRegionStart
= nativeStart
;
1638 fRegionLimit
= nativeLimit
;
1639 fActiveStart
= nativeStart
;
1640 fActiveLimit
= nativeLimit
;
1642 if (startIndex
!= -1) {
1643 if (startIndex
< fActiveStart
|| startIndex
> fActiveLimit
) {
1644 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1646 fMatchEnd
= startIndex
;
1649 if (!fTransparentBounds
) {
1650 fLookStart
= nativeStart
;
1651 fLookLimit
= nativeLimit
;
1653 if (fAnchoringBounds
) {
1654 fAnchorStart
= nativeStart
;
1655 fAnchorLimit
= nativeLimit
;
1660 RegexMatcher
&RegexMatcher::region(int64_t start
, int64_t limit
, UErrorCode
&status
) {
1661 return region(start
, limit
, -1, status
);
1664 //--------------------------------------------------------------------------------
1668 //--------------------------------------------------------------------------------
1669 int32_t RegexMatcher::regionEnd() const {
1670 return (int32_t)fRegionLimit
;
1673 int64_t RegexMatcher::regionEnd64() const {
1674 return fRegionLimit
;
1677 //--------------------------------------------------------------------------------
1681 //--------------------------------------------------------------------------------
1682 int32_t RegexMatcher::regionStart() const {
1683 return (int32_t)fRegionStart
;
1686 int64_t RegexMatcher::regionStart64() const {
1687 return fRegionStart
;
1691 //--------------------------------------------------------------------------------
1695 //--------------------------------------------------------------------------------
1696 UnicodeString
RegexMatcher::replaceAll(const UnicodeString
&replacement
, UErrorCode
&status
) {
1697 UText replacementText
= UTEXT_INITIALIZER
;
1698 UText resultText
= UTEXT_INITIALIZER
;
1699 UnicodeString resultString
;
1700 if (U_FAILURE(status
)) {
1701 return resultString
;
1704 utext_openConstUnicodeString(&replacementText
, &replacement
, &status
);
1705 utext_openUnicodeString(&resultText
, &resultString
, &status
);
1707 replaceAll(&replacementText
, &resultText
, status
);
1709 utext_close(&resultText
);
1710 utext_close(&replacementText
);
1712 return resultString
;
1717 // replaceAll, UText mode
1719 UText
*RegexMatcher::replaceAll(UText
*replacement
, UText
*dest
, UErrorCode
&status
) {
1720 if (U_FAILURE(status
)) {
1723 if (U_FAILURE(fDeferredStatus
)) {
1724 status
= fDeferredStatus
;
1729 UnicodeString emptyString
;
1730 UText empty
= UTEXT_INITIALIZER
;
1732 utext_openUnicodeString(&empty
, &emptyString
, &status
);
1733 dest
= utext_clone(NULL
, &empty
, TRUE
, FALSE
, &status
);
1734 utext_close(&empty
);
1737 if (U_SUCCESS(status
)) {
1740 appendReplacement(dest
, replacement
, status
);
1741 if (U_FAILURE(status
)) {
1745 appendTail(dest
, status
);
1752 //--------------------------------------------------------------------------------
1756 //--------------------------------------------------------------------------------
1757 UnicodeString
RegexMatcher::replaceFirst(const UnicodeString
&replacement
, UErrorCode
&status
) {
1758 UText replacementText
= UTEXT_INITIALIZER
;
1759 UText resultText
= UTEXT_INITIALIZER
;
1760 UnicodeString resultString
;
1762 utext_openConstUnicodeString(&replacementText
, &replacement
, &status
);
1763 utext_openUnicodeString(&resultText
, &resultString
, &status
);
1765 replaceFirst(&replacementText
, &resultText
, status
);
1767 utext_close(&resultText
);
1768 utext_close(&replacementText
);
1770 return resultString
;
1774 // replaceFirst, UText mode
1776 UText
*RegexMatcher::replaceFirst(UText
*replacement
, UText
*dest
, UErrorCode
&status
) {
1777 if (U_FAILURE(status
)) {
1780 if (U_FAILURE(fDeferredStatus
)) {
1781 status
= fDeferredStatus
;
1787 return getInput(dest
, status
);
1791 UnicodeString emptyString
;
1792 UText empty
= UTEXT_INITIALIZER
;
1794 utext_openUnicodeString(&empty
, &emptyString
, &status
);
1795 dest
= utext_clone(NULL
, &empty
, TRUE
, FALSE
, &status
);
1796 utext_close(&empty
);
1799 appendReplacement(dest
, replacement
, status
);
1800 appendTail(dest
, status
);
1806 //--------------------------------------------------------------------------------
1810 //--------------------------------------------------------------------------------
1811 UBool
RegexMatcher::requireEnd() const {
1816 //--------------------------------------------------------------------------------
1820 //--------------------------------------------------------------------------------
1821 RegexMatcher
&RegexMatcher::reset() {
1823 fRegionLimit
= fInputLength
;
1825 fActiveLimit
= fInputLength
;
1827 fAnchorLimit
= fInputLength
;
1829 fLookLimit
= fInputLength
;
1830 resetPreserveRegion();
1836 void RegexMatcher::resetPreserveRegion() {
1840 fAppendPosition
= 0;
1843 fRequireEnd
= FALSE
;
1845 fTickCounter
= TIMER_INITIAL_VALUE
;
1846 //resetStack(); // more expensive than it looks...
1850 RegexMatcher
&RegexMatcher::reset(const UnicodeString
&input
) {
1851 fInputText
= utext_openConstUnicodeString(fInputText
, &input
, &fDeferredStatus
);
1852 if (fPattern
->fNeedsAltInput
) {
1853 fAltInputText
= utext_clone(fAltInputText
, fInputText
, FALSE
, TRUE
, &fDeferredStatus
);
1855 if (U_FAILURE(fDeferredStatus
)) {
1858 fInputLength
= utext_nativeLength(fInputText
);
1864 // Do the following for any UnicodeString.
1865 // This is for compatibility for those clients who modify the input string "live" during regex operations.
1866 fInputUniStrMaybeMutable
= TRUE
;
1868 if (fWordBreakItr
!= NULL
) {
1869 #if UCONFIG_NO_BREAK_ITERATION==0
1870 UErrorCode status
= U_ZERO_ERROR
;
1871 fWordBreakItr
->setText(fInputText
, status
);
1878 RegexMatcher
&RegexMatcher::reset(UText
*input
) {
1879 if (fInputText
!= input
) {
1880 fInputText
= utext_clone(fInputText
, input
, FALSE
, TRUE
, &fDeferredStatus
);
1881 if (fPattern
->fNeedsAltInput
) fAltInputText
= utext_clone(fAltInputText
, fInputText
, FALSE
, TRUE
, &fDeferredStatus
);
1882 if (U_FAILURE(fDeferredStatus
)) {
1885 fInputLength
= utext_nativeLength(fInputText
);
1890 if (fWordBreakItr
!= NULL
) {
1891 #if UCONFIG_NO_BREAK_ITERATION==0
1892 UErrorCode status
= U_ZERO_ERROR
;
1893 fWordBreakItr
->setText(input
, status
);
1898 fInputUniStrMaybeMutable
= FALSE
;
1903 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1904 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1908 RegexMatcher
&RegexMatcher::reset(int64_t position
, UErrorCode
&status
) {
1909 if (U_FAILURE(status
)) {
1912 reset(); // Reset also resets the region to be the entire string.
1914 if (position
< 0 || position
> fActiveLimit
) {
1915 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1918 fMatchEnd
= position
;
1923 //--------------------------------------------------------------------------------
1927 //--------------------------------------------------------------------------------
1928 RegexMatcher
&RegexMatcher::refreshInputText(UText
*input
, UErrorCode
&status
) {
1929 if (U_FAILURE(status
)) {
1932 if (input
== NULL
) {
1933 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1936 if (utext_nativeLength(fInputText
) != utext_nativeLength(input
)) {
1937 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1940 int64_t pos
= utext_getNativeIndex(fInputText
);
1941 // Shallow read-only clone of the new UText into the existing input UText
1942 fInputText
= utext_clone(fInputText
, input
, FALSE
, TRUE
, &status
);
1943 if (U_FAILURE(status
)) {
1946 utext_setNativeIndex(fInputText
, pos
);
1948 if (fAltInputText
!= NULL
) {
1949 pos
= utext_getNativeIndex(fAltInputText
);
1950 fAltInputText
= utext_clone(fAltInputText
, input
, FALSE
, TRUE
, &status
);
1951 if (U_FAILURE(status
)) {
1954 utext_setNativeIndex(fAltInputText
, pos
);
1961 //--------------------------------------------------------------------------------
1965 //--------------------------------------------------------------------------------
1966 void RegexMatcher::setTrace(UBool state
) {
1967 fTraceDebug
= state
;
1973 * UText, replace entire contents of the destination UText with a substring of the source UText.
1975 * @param src The source UText
1976 * @param dest The destination UText. Must be writable.
1977 * May be NULL, in which case a new UText will be allocated.
1978 * @param start Start index of source substring.
1979 * @param limit Limit index of source substring.
1980 * @param status An error code.
1982 static UText
*utext_extract_replace(UText
*src
, UText
*dest
, int64_t start
, int64_t limit
, UErrorCode
*status
) {
1983 if (U_FAILURE(*status
)) {
1986 if (start
== limit
) {
1988 utext_replace(dest
, 0, utext_nativeLength(dest
), NULL
, 0, status
);
1991 return utext_openUChars(NULL
, NULL
, 0, status
);
1994 int32_t length
= utext_extract(src
, start
, limit
, NULL
, 0, status
);
1995 if (*status
!= U_BUFFER_OVERFLOW_ERROR
&& U_FAILURE(*status
)) {
1998 *status
= U_ZERO_ERROR
;
1999 MaybeStackArray
<UChar
, 40> buffer
;
2000 if (length
>= buffer
.getCapacity()) {
2001 UChar
*newBuf
= buffer
.resize(length
+1); // Leave space for terminating Nul.
2002 if (newBuf
== NULL
) {
2003 *status
= U_MEMORY_ALLOCATION_ERROR
;
2006 utext_extract(src
, start
, limit
, buffer
.getAlias(), length
+1, status
);
2008 utext_replace(dest
, 0, utext_nativeLength(dest
), buffer
.getAlias(), length
, status
);
2012 // Caller did not provide a prexisting UText.
2013 // Open a new one, and have it adopt the text buffer storage.
2014 if (U_FAILURE(*status
)) {
2017 int32_t ownedLength
= 0;
2018 UChar
*ownedBuf
= buffer
.orphanOrClone(length
+1, ownedLength
);
2019 if (ownedBuf
== NULL
) {
2020 *status
= U_MEMORY_ALLOCATION_ERROR
;
2023 UText
*result
= utext_openUChars(NULL
, ownedBuf
, length
, status
);
2024 if (U_FAILURE(*status
)) {
2025 uprv_free(ownedBuf
);
2028 result
->providerProperties
|= (1 << UTEXT_PROVIDER_OWNS_TEXT
);
2033 //---------------------------------------------------------------------
2037 //---------------------------------------------------------------------
2038 int32_t RegexMatcher::split(const UnicodeString
&input
,
2039 UnicodeString dest
[],
2040 int32_t destCapacity
,
2043 UText inputText
= UTEXT_INITIALIZER
;
2044 utext_openConstUnicodeString(&inputText
, &input
, &status
);
2045 if (U_FAILURE(status
)) {
2049 UText
**destText
= (UText
**)uprv_malloc(sizeof(UText
*)*destCapacity
);
2050 if (destText
== NULL
) {
2051 status
= U_MEMORY_ALLOCATION_ERROR
;
2055 for (i
= 0; i
< destCapacity
; i
++) {
2056 destText
[i
] = utext_openUnicodeString(NULL
, &dest
[i
], &status
);
2059 int32_t fieldCount
= split(&inputText
, destText
, destCapacity
, status
);
2061 for (i
= 0; i
< destCapacity
; i
++) {
2062 utext_close(destText
[i
]);
2065 uprv_free(destText
);
2066 utext_close(&inputText
);
2071 // split, UText mode
2073 int32_t RegexMatcher::split(UText
*input
,
2075 int32_t destCapacity
,
2079 // Check arguements for validity
2081 if (U_FAILURE(status
)) {
2085 if (destCapacity
< 1) {
2086 status
= U_ILLEGAL_ARGUMENT_ERROR
;
2091 // Reset for the input text
2094 int64_t nextOutputStringStart
= 0;
2095 if (fActiveLimit
== 0) {
2100 // Loop through the input text, searching for the delimiter pattern
2103 int32_t numCaptureGroups
= fPattern
->fGroupMap
->size();
2105 if (i
>=destCapacity
-1) {
2106 // There is one or zero output string left.
2107 // Fill the last output string with whatever is left from the input, then exit the loop.
2108 // ( i will be == destCapacity if we filled the output array while processing
2109 // capture groups of the delimiter expression, in which case we will discard the
2110 // last capture group saved in favor of the unprocessed remainder of the
2113 if (fActiveLimit
> nextOutputStringStart
) {
2114 if (UTEXT_FULL_TEXT_IN_CHUNK(input
, fInputLength
)) {
2116 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]),
2117 input
->chunkContents
+nextOutputStringStart
,
2118 (int32_t)(fActiveLimit
-nextOutputStringStart
), &status
);
2120 UText remainingText
= UTEXT_INITIALIZER
;
2121 utext_openUChars(&remainingText
, input
->chunkContents
+nextOutputStringStart
,
2122 fActiveLimit
-nextOutputStringStart
, &status
);
2123 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2124 utext_close(&remainingText
);
2127 UErrorCode lengthStatus
= U_ZERO_ERROR
;
2128 int32_t remaining16Length
=
2129 utext_extract(input
, nextOutputStringStart
, fActiveLimit
, NULL
, 0, &lengthStatus
);
2130 UChar
*remainingChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(remaining16Length
+1));
2131 if (remainingChars
== NULL
) {
2132 status
= U_MEMORY_ALLOCATION_ERROR
;
2136 utext_extract(input
, nextOutputStringStart
, fActiveLimit
, remainingChars
, remaining16Length
+1, &status
);
2138 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), remainingChars
, remaining16Length
, &status
);
2140 UText remainingText
= UTEXT_INITIALIZER
;
2141 utext_openUChars(&remainingText
, remainingChars
, remaining16Length
, &status
);
2142 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2143 utext_close(&remainingText
);
2146 uprv_free(remainingChars
);
2152 // We found another delimiter. Move everything from where we started looking
2153 // up until the start of the delimiter into the next output string.
2154 if (UTEXT_FULL_TEXT_IN_CHUNK(input
, fInputLength
)) {
2156 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]),
2157 input
->chunkContents
+nextOutputStringStart
,
2158 (int32_t)(fMatchStart
-nextOutputStringStart
), &status
);
2160 UText remainingText
= UTEXT_INITIALIZER
;
2161 utext_openUChars(&remainingText
, input
->chunkContents
+nextOutputStringStart
,
2162 fMatchStart
-nextOutputStringStart
, &status
);
2163 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2164 utext_close(&remainingText
);
2167 UErrorCode lengthStatus
= U_ZERO_ERROR
;
2168 int32_t remaining16Length
= utext_extract(input
, nextOutputStringStart
, fMatchStart
, NULL
, 0, &lengthStatus
);
2169 UChar
*remainingChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(remaining16Length
+1));
2170 if (remainingChars
== NULL
) {
2171 status
= U_MEMORY_ALLOCATION_ERROR
;
2174 utext_extract(input
, nextOutputStringStart
, fMatchStart
, remainingChars
, remaining16Length
+1, &status
);
2176 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), remainingChars
, remaining16Length
, &status
);
2178 UText remainingText
= UTEXT_INITIALIZER
;
2179 utext_openUChars(&remainingText
, remainingChars
, remaining16Length
, &status
);
2180 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2181 utext_close(&remainingText
);
2184 uprv_free(remainingChars
);
2186 nextOutputStringStart
= fMatchEnd
;
2188 // If the delimiter pattern has capturing parentheses, the captured
2189 // text goes out into the next n destination strings.
2191 for (groupNum
=1; groupNum
<=numCaptureGroups
; groupNum
++) {
2192 if (i
>= destCapacity
-2) {
2193 // Never fill the last available output string with capture group text.
2194 // It will filled with the last field, the remainder of the
2195 // unsplit input text.
2199 dest
[i
] = utext_extract_replace(fInputText
, dest
[i
],
2200 start64(groupNum
, status
), end64(groupNum
, status
), &status
);
2203 if (nextOutputStringStart
== fActiveLimit
) {
2204 // The delimiter was at the end of the string. We're done, but first
2205 // we output one last empty string, for the empty field following
2206 // the delimiter at the end of input.
2207 if (i
+1 < destCapacity
) {
2209 if (dest
[i
] == NULL
) {
2210 dest
[i
] = utext_openUChars(NULL
, NULL
, 0, &status
);
2212 static const UChar emptyString
[] = {(UChar
)0};
2213 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), emptyString
, 0, &status
);
2222 // We ran off the end of the input while looking for the next delimiter.
2223 // All the remaining text goes into the current output string.
2224 if (UTEXT_FULL_TEXT_IN_CHUNK(input
, fInputLength
)) {
2226 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]),
2227 input
->chunkContents
+nextOutputStringStart
,
2228 (int32_t)(fActiveLimit
-nextOutputStringStart
), &status
);
2230 UText remainingText
= UTEXT_INITIALIZER
;
2231 utext_openUChars(&remainingText
, input
->chunkContents
+nextOutputStringStart
,
2232 fActiveLimit
-nextOutputStringStart
, &status
);
2233 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2234 utext_close(&remainingText
);
2237 UErrorCode lengthStatus
= U_ZERO_ERROR
;
2238 int32_t remaining16Length
= utext_extract(input
, nextOutputStringStart
, fActiveLimit
, NULL
, 0, &lengthStatus
);
2239 UChar
*remainingChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(remaining16Length
+1));
2240 if (remainingChars
== NULL
) {
2241 status
= U_MEMORY_ALLOCATION_ERROR
;
2245 utext_extract(input
, nextOutputStringStart
, fActiveLimit
, remainingChars
, remaining16Length
+1, &status
);
2247 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), remainingChars
, remaining16Length
, &status
);
2249 UText remainingText
= UTEXT_INITIALIZER
;
2250 utext_openUChars(&remainingText
, remainingChars
, remaining16Length
, &status
);
2251 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2252 utext_close(&remainingText
);
2255 uprv_free(remainingChars
);
2259 if (U_FAILURE(status
)) {
2262 } // end of for loop
2267 //--------------------------------------------------------------------------------
2271 //--------------------------------------------------------------------------------
2272 int32_t RegexMatcher::start(UErrorCode
&status
) const {
2273 return start(0, status
);
2276 int64_t RegexMatcher::start64(UErrorCode
&status
) const {
2277 return start64(0, status
);
2280 //--------------------------------------------------------------------------------
2282 // start(int32_t group, UErrorCode &status)
2284 //--------------------------------------------------------------------------------
2286 int64_t RegexMatcher::start64(int32_t group
, UErrorCode
&status
) const {
2287 if (U_FAILURE(status
)) {
2290 if (U_FAILURE(fDeferredStatus
)) {
2291 status
= fDeferredStatus
;
2294 if (fMatch
== FALSE
) {
2295 status
= U_REGEX_INVALID_STATE
;
2298 if (group
< 0 || group
> fPattern
->fGroupMap
->size()) {
2299 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
2306 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(group
-1);
2307 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
2308 U_ASSERT(groupOffset
>= 0);
2309 s
= fFrame
->fExtra
[groupOffset
];
2316 int32_t RegexMatcher::start(int32_t group
, UErrorCode
&status
) const {
2317 return (int32_t)start64(group
, status
);
2320 //--------------------------------------------------------------------------------
2322 // useAnchoringBounds
2324 //--------------------------------------------------------------------------------
2325 RegexMatcher
&RegexMatcher::useAnchoringBounds(UBool b
) {
2326 fAnchoringBounds
= b
;
2327 fAnchorStart
= (fAnchoringBounds
? fRegionStart
: 0);
2328 fAnchorLimit
= (fAnchoringBounds
? fRegionLimit
: fInputLength
);
2333 //--------------------------------------------------------------------------------
2335 // useTransparentBounds
2337 //--------------------------------------------------------------------------------
2338 RegexMatcher
&RegexMatcher::useTransparentBounds(UBool b
) {
2339 fTransparentBounds
= b
;
2340 fLookStart
= (fTransparentBounds
? 0 : fRegionStart
);
2341 fLookLimit
= (fTransparentBounds
? fInputLength
: fRegionLimit
);
2345 //--------------------------------------------------------------------------------
2349 //--------------------------------------------------------------------------------
2350 void RegexMatcher::setTimeLimit(int32_t limit
, UErrorCode
&status
) {
2351 if (U_FAILURE(status
)) {
2354 if (U_FAILURE(fDeferredStatus
)) {
2355 status
= fDeferredStatus
;
2359 status
= U_ILLEGAL_ARGUMENT_ERROR
;
2366 //--------------------------------------------------------------------------------
2370 //--------------------------------------------------------------------------------
2371 int32_t RegexMatcher::getTimeLimit() const {
2376 //--------------------------------------------------------------------------------
2380 //--------------------------------------------------------------------------------
2381 void RegexMatcher::setStackLimit(int32_t limit
, UErrorCode
&status
) {
2382 if (U_FAILURE(status
)) {
2385 if (U_FAILURE(fDeferredStatus
)) {
2386 status
= fDeferredStatus
;
2390 status
= U_ILLEGAL_ARGUMENT_ERROR
;
2394 // Reset the matcher. This is needed here in case there is a current match
2395 // whose final stack frame (containing the match results, pointed to by fFrame)
2396 // would be lost by resizing to a smaller stack size.
2400 // Unlimited stack expansion
2401 fStack
->setMaxCapacity(0);
2403 // Change the units of the limit from bytes to ints, and bump the size up
2404 // to be big enough to hold at least one stack frame for the pattern,
2405 // if it isn't there already.
2406 int32_t adjustedLimit
= limit
/ sizeof(int32_t);
2407 if (adjustedLimit
< fPattern
->fFrameSize
) {
2408 adjustedLimit
= fPattern
->fFrameSize
;
2410 fStack
->setMaxCapacity(adjustedLimit
);
2412 fStackLimit
= limit
;
2416 //--------------------------------------------------------------------------------
2420 //--------------------------------------------------------------------------------
2421 int32_t RegexMatcher::getStackLimit() const {
2426 //--------------------------------------------------------------------------------
2430 //--------------------------------------------------------------------------------
2431 void RegexMatcher::setMatchCallback(URegexMatchCallback
*callback
,
2432 const void *context
,
2433 UErrorCode
&status
) {
2434 if (U_FAILURE(status
)) {
2437 fCallbackFn
= callback
;
2438 fCallbackContext
= context
;
2442 //--------------------------------------------------------------------------------
2446 //--------------------------------------------------------------------------------
2447 void RegexMatcher::getMatchCallback(URegexMatchCallback
*&callback
,
2448 const void *&context
,
2449 UErrorCode
&status
) {
2450 if (U_FAILURE(status
)) {
2453 callback
= fCallbackFn
;
2454 context
= fCallbackContext
;
2458 //--------------------------------------------------------------------------------
2462 //--------------------------------------------------------------------------------
2463 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback
*callback
,
2464 const void *context
,
2465 UErrorCode
&status
) {
2466 if (U_FAILURE(status
)) {
2469 fFindProgressCallbackFn
= callback
;
2470 fFindProgressCallbackContext
= context
;
2474 //--------------------------------------------------------------------------------
2478 //--------------------------------------------------------------------------------
2479 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback
*&callback
,
2480 const void *&context
,
2481 UErrorCode
&status
) {
2482 if (U_FAILURE(status
)) {
2485 callback
= fFindProgressCallbackFn
;
2486 context
= fFindProgressCallbackContext
;
2490 //================================================================================
2492 // Code following this point in this file is the internal
2493 // Match Engine Implementation.
2495 //================================================================================
2498 //--------------------------------------------------------------------------------
2501 // Discard any previous contents of the state save stack, and initialize a
2502 // new stack frame to all -1. The -1s are needed for capture group limits,
2503 // where they indicate that a group has not yet matched anything.
2504 //--------------------------------------------------------------------------------
2505 REStackFrame
*RegexMatcher::resetStack() {
2506 // Discard any previous contents of the state save stack, and initialize a
2507 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2508 // where they indicate that a group has not yet matched anything.
2509 fStack
->removeAllElements();
2511 REStackFrame
*iFrame
= (REStackFrame
*)fStack
->reserveBlock(fPattern
->fFrameSize
, fDeferredStatus
);
2512 if(U_FAILURE(fDeferredStatus
)) {
2517 for (i
=0; i
<fPattern
->fFrameSize
-RESTACKFRAME_HDRCOUNT
; i
++) {
2518 iFrame
->fExtra
[i
] = -1;
2525 //--------------------------------------------------------------------------------
2528 // in perl, "xab..cd..", \b is true at positions 0,3,5,7
2530 // If the current char is a combining mark,
2532 // Else Scan backwards to the first non-combining char.
2533 // We are at a boundary if this char and the original char are
2534 // opposite in membership in \w set
2536 // parameters: pos - the current position in the input buffer
2538 // TODO: double-check edge cases at region boundaries.
2540 //--------------------------------------------------------------------------------
// Word-boundary test on the UText input, used by the URX_BACKSLASH_B opcode
// in MatchAt.
//   pos - native index into fInputText of the position being tested.
// The result is cIsWord XOR prevCIsWord:
//   cIsWord     - whether the character at pos is in the URX_ISWORD_SET
//                 static set (stays FALSE unless assigned below).
//   prevCIsWord - the same test for a preceding character that is neither
//                 GRAPHEME_EXTEND nor a format character.
2541 UBool
RegexMatcher::isWordBoundary(int64_t pos
) {
2542 UBool isBoundary
= FALSE
;
2543 UBool cIsWord
= FALSE
;
// NOTE(review): the body of this branch, and how it joins the code that
// follows (else / fall-through), is not visible in this excerpt.
2545 if (pos
>= fLookLimit
) {
2548 // Determine whether char c at current position is a member of the word set of chars.
2549 // If we're off the end of the string, behave as though we're not at a word char.
2550 UTEXT_SETNATIVEINDEX(fInputText
, pos
);
2551 UChar32 c
= UTEXT_CURRENT32(fInputText
);
2552 if (u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
) || u_charType(c
) == U_FORMAT_CHAR
) {
2553 // Current char is a combining one. Not a boundary.
2556 cIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(c
);
2559 // Back up until we come to a non-combining char, determine whether
2560 // that char is a word char.
2561 UBool prevCIsWord
= FALSE
;
// The backward scan reads the UText iterator's current index rather than pos
// itself, and never moves before fLookStart.
// NOTE(review): the loop construct and its exits are not visible in this
// excerpt; also confirm where the iterator is positioned when the
// pos >= fLookLimit branch above is taken.
2563 if (UTEXT_GETNATIVEINDEX(fInputText
) <= fLookStart
) {
2566 UChar32 prevChar
= UTEXT_PREVIOUS32(fInputText
);
2567 if (!(u_hasBinaryProperty(prevChar
, UCHAR_GRAPHEME_EXTEND
)
2568 || u_charType(prevChar
) == U_FORMAT_CHAR
)) {
2569 prevCIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(prevChar
);
// Boundary exactly when the two sides differ in word-set membership.
2573 isBoundary
= cIsWord
^ prevCIsWord
;
// Variant of isWordBoundary that reads UTF-16 code units directly from
// fInputText->chunkContents with the U16_* macros instead of going through
// the UText iteration macros.
//   pos - index into the chunk buffer (int32_t here, int64_t in isWordBoundary).
// The result is cIsWord XOR prevCIsWord, with the same meaning as in
// isWordBoundary.
2577 UBool
RegexMatcher::isChunkWordBoundary(int32_t pos
) {
2578 UBool isBoundary
= FALSE
;
2579 UBool cIsWord
= FALSE
;
2581 const UChar
*inputBuf
= fInputText
->chunkContents
;
// NOTE(review): the body of this branch, and how it joins the code that
// follows, is not visible in this excerpt.
2583 if (pos
>= fLookLimit
) {
2586 // Determine whether char c at current position is a member of the word set of chars.
2587 // If we're off the end of the string, behave as though we're not at a word char.
// U16_GET is bounded by fLookStart and fLookLimit. (The declaration of c is
// not visible in this excerpt.)
2589 U16_GET(inputBuf
, fLookStart
, pos
, fLookLimit
, c
);
2590 if (u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
) || u_charType(c
) == U_FORMAT_CHAR
) {
2591 // Current char is a combining one. Not a boundary.
2594 cIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(c
);
2597 // Back up until we come to a non-combining char, determine whether
2598 // that char is a word char.
2599 UBool prevCIsWord
= FALSE
;
// The scan stops once pos has reached fLookStart.
// NOTE(review): the loop construct and its exits are not visible here.
2601 if (pos
<= fLookStart
) {
// U16_PREV takes pos as its index argument; per the ICU utf16.h macro
// documentation it moves that index back over the code point it reads.
// (The declaration of prevChar is not visible in this excerpt.)
2605 U16_PREV(inputBuf
, fLookStart
, pos
, prevChar
);
2606 if (!(u_hasBinaryProperty(prevChar
, UCHAR_GRAPHEME_EXTEND
)
2607 || u_charType(prevChar
) == U_FORMAT_CHAR
)) {
2608 prevCIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(prevChar
);
// Boundary exactly when the two sides differ in word-set membership.
2612 isBoundary
= cIsWord
^ prevCIsWord
;
2616 //--------------------------------------------------------------------------------
2620 // Test for a word boundary using RBBI word break.
2622 // parameters: pos - the current position in the input buffer
2624 //--------------------------------------------------------------------------------
// Word-boundary test that delegates to a word BreakIterator; used by the
// URX_BACKSLASH_BU opcode in MatchAt.
//   pos - native index into fInputText of the position being tested.
// returnVal starts as FALSE, and all of the visible logic sits after the
// UCONFIG_NO_BREAK_ITERATION==0 preprocessor test.
// NOTE(review): the matching #endif and the return statement are not visible
// in this excerpt.
2625 UBool
RegexMatcher::isUWordBoundary(int64_t pos
) {
2626 UBool returnVal
= FALSE
;
2627 #if UCONFIG_NO_BREAK_ITERATION==0
2629 // If we haven't yet created a break iterator for this matcher, do it now.
2630 if (fWordBreakItr
== NULL
) {
// The iterator is created for the English locale, with errors going to
// fDeferredStatus.
// NOTE(review): the left-hand side of this assignment is on a line absent
// from this excerpt; presumably fWordBreakItr -- confirm.
2632 (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus
);
// NOTE(review): the body of this failure branch is not visible here.
2633 if (U_FAILURE(fDeferredStatus
)) {
2636 fWordBreakItr
->setText(fInputText
, fDeferredStatus
);
// NOTE(review): only the returnVal assignment of this branch is visible; any
// other statements in it, and the start of the alternative path, are absent
// from this excerpt.
2639 if (pos
>= fLookLimit
) {
2641 returnVal
= TRUE
; // With Unicode word rules, only positions within the interior of "real"
2642 // words are not boundaries. All non-word chars stand by themselves,
2643 // with word boundaries on both sides.
// For a UText that is not natively UTF-16, pos is replaced by the value
// utext_extract returns for the range [0, pos) with a NULL destination and
// zero capacity. Presumably this is the UTF-16 length of that range, i.e. the
// UTF-16 offset matching the native index -- confirm against the utext_extract
// documentation. The local status is not examined in the visible lines.
2645 if (!UTEXT_USES_U16(fInputText
)) {
2646 // !!!: Would like a better way to do this!
2647 UErrorCode status
= U_ZERO_ERROR
;
2648 pos
= utext_extract(fInputText
, 0, pos
, NULL
, 0, &status
);
// pos is narrowed to int32_t for the BreakIterator call.
2650 returnVal
= fWordBreakItr
->isBoundary((int32_t)pos
);
2656 //--------------------------------------------------------------------------------
2658 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state
2659 // saves. Increment the "time" counter, and call the
2660 // user callback function if there is one installed.
2662 // If the match operation needs to be aborted, either for a time-out
2663 // or because the user callback asked for it, just set an error status.
2664 // The engine will pick that up and stop in its outer loop.
2666 //--------------------------------------------------------------------------------
// Timer tick handler. It re-arms fTickCounter, gives the installed match
// callback (if any) a chance to stop the match, and enforces fTimeLimit.
//   status - set to U_REGEX_STOPPED_BY_CALLER when the callback returns FALSE,
//            or to U_REGEX_TIME_OUT when fTimeLimit > 0 and fTime >= fTimeLimit.
// No value is returned; abort requests are communicated only through status.
2667 void RegexMatcher::IncrementTime(UErrorCode
&status
) {
2668 fTickCounter
= TIMER_INITIAL_VALUE
;
// NOTE(review): the header comment says the "time" counter is incremented
// here, but no modification of fTime appears in the visible lines; the
// statement is presumably on a line absent from this excerpt.
2670 if (fCallbackFn
!= NULL
) {
// The callback is passed its context pointer and the current fTime value.
2671 if ((*fCallbackFn
)(fCallbackContext
, fTime
) == FALSE
) {
2672 status
= U_REGEX_STOPPED_BY_CALLER
;
// A time limit that is not greater than zero never triggers the time-out.
2676 if (fTimeLimit
> 0 && fTime
>= fTimeLimit
) {
2677 status
= U_REGEX_TIME_OUT
;
2681 //--------------------------------------------------------------------------------
2684 // Make a new stack frame, initialized as a copy of the current stack frame.
2685 // Set the pattern index in the original stack frame from the operand value
2686 // in the opcode. Execution of the engine continues with the state in
2687 // the newly created stack frame
2689 // Note that reserveBlock() may grow the stack, resulting in the
2690 // whole thing being relocated in memory.
2693 // fp The top frame pointer when called. At return, a new
2694 // frame will be present
2695 // savePatIdx An index into the compiled pattern. Goes into the original
2696 // (not new) frame. If execution ever back-tracks out of the
2697 // new frame, this will be where we continue from in the pattern.
2699 // The new frame pointer.
2701 //--------------------------------------------------------------------------------
// Push a new backtracking frame that starts as a copy of the current top
// frame.
//   fp         - the current top frame.
//   savePatIdx - pattern index written into the ORIGINAL frame's fPatIdx
//                (not into the new copy).
//   status     - checked on entry; set to U_REGEX_STACK_OVERFLOW if the stack
//                cannot supply another frame.
// Returns the new frame (newFP cast to REStackFrame *) on the normal path.
2702 inline REStackFrame
*RegexMatcher::StateSave(REStackFrame
*fp
, int64_t savePatIdx
, UErrorCode
&status
) {
// NOTE(review): the body of this failure branch is not visible in this excerpt.
2703 if (U_FAILURE(status
)) {
2706 // push storage for a new frame.
2707 int64_t *newFP
= fStack
->reserveBlock(fFrameSize
, status
);
2708 if (U_FAILURE(status
)) {
2709 // Failure on attempted stack expansion.
2710 // Stack function set some other error code, change it to a more
2711 // specific one for regular expressions.
2712 status
= U_REGEX_STACK_OVERFLOW
;
2713 // We need to return a writable stack frame, so just return the
2714 // previous frame. The match operation will stop quickly
2715 // because of the error status, after which the frame will never
2716 // be looked at again.
// The incoming fp is not trusted after reserveBlock: the old top frame is
// re-derived as the fFrameSize words immediately below the new block.
2719 fp
= (REStackFrame
*)(newFP
- fFrameSize
); // in case of realloc of stack.
2721 // New stack frame = copy of old top frame.
2722 int64_t *source
= (int64_t *)fp
;
2723 int64_t *dest
= newFP
;
// The copy runs one word at a time until source reaches newFP, i.e. until the
// whole old frame has been copied.
// NOTE(review): the enclosing loop construct is not visible in this excerpt.
2725 *dest
++ = *source
++;
2726 if (source
== newFP
) {
// IncrementTime() resets fTickCounter.
// NOTE(review): the statement that decrements fTickCounter is not visible in
// this excerpt.
2732 if (fTickCounter
<= 0) {
2733 IncrementTime(status
); // Re-initializes fTickCounter
// At this point fp refers to the original (now second-from-top) frame.
2735 fp
->fPatIdx
= savePatIdx
;
2736 return (REStackFrame
*)newFP
;
// Debug-only helper, compiled only when REGEX_DEBUG is defined. It walks the
// code points of a UText from index 0 and produces a UnicodeString; within
// this excerpt it is used by the REGEX_RUN_DEBUG printf calls in MatchAt.
2739 #if defined(REGEX_DEBUG)
2741 UnicodeString
StringFromUText(UText
*ut
) {
2742 UnicodeString result
;
// Iteration starts with utext_next32From(ut, 0), continues with UTEXT_NEXT32,
// and ends when U_SENTINEL is returned.
// NOTE(review): the loop body and the return statement are not visible in
// this excerpt; presumably each code point is appended to result -- confirm.
2743 for (UChar32 c
= utext_next32From(ut
, 0); c
!= U_SENTINEL
; c
= UTEXT_NEXT32(ut
)) {
2749 #endif // REGEX_DEBUG
2752 //--------------------------------------------------------------------------------
2754 // MatchAt This is the actual matching engine.
2756 // startIdx: begin matching at this index.
2757 // toEnd: if true, match must extend to end of the input region
2759 //--------------------------------------------------------------------------------
2760 void RegexMatcher::MatchAt(int64_t startIdx
, UBool toEnd
, UErrorCode
&status
) {
2761 UBool isMatch
= FALSE
; // True if the we have a match.
2763 int64_t backSearchIndex
= U_INT64_MAX
; // used after greedy single-character matches for searching backwards
2765 int32_t op
; // Operation from the compiled pattern, split into
2766 int32_t opType
; // the opcode
2767 int32_t opValue
; // and the operand value.
2769 #ifdef REGEX_RUN_DEBUG
2771 printf("MatchAt(startIdx=%ld)\n", startIdx
);
2772 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern
->fPattern
))());
2773 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText
))());
2777 if (U_FAILURE(status
)) {
2781 // Cache frequently referenced items from the compiled pattern
2783 int64_t *pat
= fPattern
->fCompiledPat
->getBuffer();
2785 const UChar
*litText
= fPattern
->fLiteralText
.getBuffer();
2786 UVector
*sets
= fPattern
->fSets
;
2788 fFrameSize
= fPattern
->fFrameSize
;
2789 REStackFrame
*fp
= resetStack();
2790 if (U_FAILURE(fDeferredStatus
)) {
2791 status
= fDeferredStatus
;
2796 fp
->fInputIdx
= startIdx
;
2798 // Zero out the pattern's static data
2800 for (i
= 0; i
<fPattern
->fDataSize
; i
++) {
2805 // Main loop for interpreting the compiled pattern.
2806 // One iteration of the loop per pattern operation performed.
2809 op
= (int32_t)pat
[fp
->fPatIdx
];
2810 opType
= URX_TYPE(op
);
2811 opValue
= URX_VAL(op
);
2812 #ifdef REGEX_RUN_DEBUG
2814 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2815 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp
->fInputIdx
,
2816 UTEXT_CURRENT32(fInputText
), (int64_t *)fp
-fStack
->getBuffer(), fActiveLimit
);
2817 fPattern
->dumpOp(fp
->fPatIdx
);
2830 // Force a backtrack. In some circumstances, the pattern compiler
2831 // will notice that the pattern can't possibly match anything, and will
2832 // emit one of these at that point.
2833 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2838 if (fp
->fInputIdx
< fActiveLimit
) {
2839 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2840 UChar32 c
= UTEXT_NEXT32(fInputText
);
2842 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
2848 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2854 // Test input against a literal string.
2855 // Strings require two slots in the compiled pattern, one for the
2856 // offset to the string text, and one for the length.
2858 int32_t stringStartIdx
= opValue
;
2859 op
= (int32_t)pat
[fp
->fPatIdx
]; // Fetch the second operand
2861 opType
= URX_TYPE(op
);
2862 int32_t stringLen
= URX_VAL(op
);
2863 U_ASSERT(opType
== URX_STRING_LEN
);
2864 U_ASSERT(stringLen
>= 2);
2866 const UChar
*patternString
= litText
+stringStartIdx
;
2867 int32_t patternStringIndex
= 0;
2868 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2870 UChar32 patternChar
;
2871 UBool success
= TRUE
;
2872 while (patternStringIndex
< stringLen
) {
2873 if (UTEXT_GETNATIVEINDEX(fInputText
) >= fActiveLimit
) {
2878 inputChar
= UTEXT_NEXT32(fInputText
);
2879 U16_NEXT(patternString
, patternStringIndex
, stringLen
, patternChar
);
2880 if (patternChar
!= inputChar
) {
2887 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
2889 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2895 case URX_STATE_SAVE
:
2896 fp
= StateSave(fp
, opValue
, status
);
2901 // The match loop will exit via this path on a successful match,
2902 // when we reach the end of the pattern.
2903 if (toEnd
&& fp
->fInputIdx
!= fActiveLimit
) {
2904 // The pattern matched, but not to the end of input. Try some more.
2905 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2911 // Start and End Capture stack frame variables are laid out out like this:
2912 // fp->fExtra[opValue] - The start of a completed capture group
2913 // opValue+1 - The end of a completed capture group
2914 // opValue+2 - the start of a capture group whose end
2915 // has not yet been reached (and might not ever be).
2916 case URX_START_CAPTURE
:
2917 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
2918 fp
->fExtra
[opValue
+2] = fp
->fInputIdx
;
2922 case URX_END_CAPTURE
:
2923 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
2924 U_ASSERT(fp
->fExtra
[opValue
+2] >= 0); // Start pos for this group must be set.
2925 fp
->fExtra
[opValue
] = fp
->fExtra
[opValue
+2]; // Tentative start becomes real.
2926 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // End position
2927 U_ASSERT(fp
->fExtra
[opValue
] <= fp
->fExtra
[opValue
+1]);
2931 case URX_DOLLAR
: // $, test for End of line
2932 // or for position before new line at end of input
2934 if (fp
->fInputIdx
>= fAnchorLimit
) {
2935 // We really are at the end of input. Success.
2941 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2943 // If we are positioned just before a new-line that is located at the
2944 // end of input, succeed.
2945 UChar32 c
= UTEXT_NEXT32(fInputText
);
2946 if (UTEXT_GETNATIVEINDEX(fInputText
) >= fAnchorLimit
) {
2947 if (isLineTerminator(c
)) {
2948 // If not in the middle of a CR/LF sequence
2949 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& ((void)UTEXT_PREVIOUS32(fInputText
), UTEXT_PREVIOUS32(fInputText
))==0x0d)) {
2950 // At new-line at end of input. Success
2958 UChar32 nextC
= UTEXT_NEXT32(fInputText
);
2959 if (c
== 0x0d && nextC
== 0x0a && UTEXT_GETNATIVEINDEX(fInputText
) >= fAnchorLimit
) {
2962 break; // At CR/LF at end of input. Success
2966 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2971 case URX_DOLLAR_D
: // $, test for End of Line, in UNIX_LINES mode.
2972 if (fp
->fInputIdx
>= fAnchorLimit
) {
2973 // Off the end of input. Success.
2978 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2979 UChar32 c
= UTEXT_NEXT32(fInputText
);
2980 // Either at the last character of input, or off the end.
2981 if (c
== 0x0a && UTEXT_GETNATIVEINDEX(fInputText
) == fAnchorLimit
) {
2988 // Not at end of input. Back-track out.
2989 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2993 case URX_DOLLAR_M
: // $, test for End of line in multi-line mode
2995 if (fp
->fInputIdx
>= fAnchorLimit
) {
2996 // We really are at the end of input. Success.
3001 // If we are positioned just before a new-line, succeed.
3002 // It makes no difference where the new-line is within the input.
3003 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3004 UChar32 c
= UTEXT_CURRENT32(fInputText
);
3005 if (isLineTerminator(c
)) {
3006 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
3007 // In multi-line mode, hitting a new-line just before the end of input does not
3008 // set the hitEnd or requireEnd flags
3009 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& UTEXT_PREVIOUS32(fInputText
)==0x0d)) {
3013 // not at a new line. Fail.
3014 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3019 case URX_DOLLAR_MD
: // $, test for End of line in multi-line and UNIX_LINES mode
3021 if (fp
->fInputIdx
>= fAnchorLimit
) {
3022 // We really are at the end of input. Success.
3024 fRequireEnd
= TRUE
; // Java set requireEnd in this case, even though
3025 break; // adding a new-line would not lose the match.
3027 // If we are not positioned just before a new-line, the test fails; backtrack out.
3028 // It makes no difference where the new-line is within the input.
3029 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3030 if (UTEXT_CURRENT32(fInputText
) != 0x0a) {
3031 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3037 case URX_CARET
: // ^, test for start of line
3038 if (fp
->fInputIdx
!= fAnchorStart
) {
3039 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3044 case URX_CARET_M
: // ^, test for start of line in mulit-line mode
3046 if (fp
->fInputIdx
== fAnchorStart
) {
3047 // We are at the start input. Success.
3050 // Check whether character just before the current pos is a new-line
3051 // unless we are at the end of input
3052 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3053 UChar32 c
= UTEXT_PREVIOUS32(fInputText
);
3054 if ((fp
->fInputIdx
< fAnchorLimit
) && isLineTerminator(c
)) {
3055 // It's a new-line. ^ is true. Success.
3056 // TODO: what should be done with positions between a CR and LF?
3059 // Not at the start of a line. Fail.
3060 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3065 case URX_CARET_M_UNIX
: // ^, test for start of line in mulit-line + Unix-line mode
3067 U_ASSERT(fp
->fInputIdx
>= fAnchorStart
);
3068 if (fp
->fInputIdx
<= fAnchorStart
) {
3069 // We are at the start input. Success.
3072 // Check whether character just before the current pos is a new-line
3073 U_ASSERT(fp
->fInputIdx
<= fAnchorLimit
);
3074 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3075 UChar32 c
= UTEXT_PREVIOUS32(fInputText
);
3077 // Not at the start of a line. Back-track out.
3078 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3083 case URX_BACKSLASH_B
: // Test for word boundaries
3085 UBool success
= isWordBoundary(fp
->fInputIdx
);
3086 success
^= (UBool
)(opValue
!= 0); // flip sense for \B
3088 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3094 case URX_BACKSLASH_BU
: // Test for word boundaries, Unicode-style
3096 UBool success
= isUWordBoundary(fp
->fInputIdx
);
3097 success
^= (UBool
)(opValue
!= 0); // flip sense for \B
3099 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3105 case URX_BACKSLASH_D
: // Test for decimal digit
3107 if (fp
->fInputIdx
>= fActiveLimit
) {
3109 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3113 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3115 UChar32 c
= UTEXT_NEXT32(fInputText
);
3116 int8_t ctype
= u_charType(c
); // TODO: make a unicode set for this. Will be faster.
3117 UBool success
= (ctype
== U_DECIMAL_DIGIT_NUMBER
);
3118 success
^= (UBool
)(opValue
!= 0); // flip sense for \D
3120 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3122 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3128 case URX_BACKSLASH_G
: // Test for position at end of previous match
3129 if (!((fMatch
&& fp
->fInputIdx
==fMatchEnd
) || (fMatch
==FALSE
&& fp
->fInputIdx
==fActiveStart
))) {
3130 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3135 case URX_BACKSLASH_H
: // Test for \h, horizontal white space.
3137 if (fp
->fInputIdx
>= fActiveLimit
) {
3139 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3142 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3143 UChar32 c
= UTEXT_NEXT32(fInputText
);
3144 int8_t ctype
= u_charType(c
);
3145 UBool success
= (ctype
== U_SPACE_SEPARATOR
|| c
== 9); // SPACE_SEPARATOR || TAB
3146 success
^= (UBool
)(opValue
!= 0); // flip sense for \H
3148 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3150 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3156 case URX_BACKSLASH_R
: // Test for \R, any line break sequence.
3158 if (fp
->fInputIdx
>= fActiveLimit
) {
3160 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3163 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3164 UChar32 c
= UTEXT_NEXT32(fInputText
);
3165 if (isLineTerminator(c
)) {
3166 if (c
== 0x0d && utext_current32(fInputText
) == 0x0a) {
3167 utext_next32(fInputText
);
3169 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3171 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3177 case URX_BACKSLASH_V
: // \v, any single line ending character.
3179 if (fp
->fInputIdx
>= fActiveLimit
) {
3181 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3184 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3185 UChar32 c
= UTEXT_NEXT32(fInputText
);
3186 UBool success
= isLineTerminator(c
);
3187 success
^= (UBool
)(opValue
!= 0); // flip sense for \V
3189 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3191 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3197 case URX_BACKSLASH_X
:
3198 // Match a Grapheme, as defined by Unicode TR 29.
3199 // Differs slightly from Perl, which consumes combining marks independently
3203 // Fail if at end of input
3204 if (fp
->fInputIdx
>= fActiveLimit
) {
3206 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3210 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3212 // Examine (and consume) the current char.
3213 // Dispatch into a little state machine, based on the char.
3215 c
= UTEXT_NEXT32(fInputText
);
3216 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3217 UnicodeSet
**sets
= fPattern
->fStaticSets
;
3218 if (sets
[URX_GC_NORMAL
]->contains(c
)) goto GC_Extend
;
3219 if (sets
[URX_GC_CONTROL
]->contains(c
)) goto GC_Control
;
3220 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
3221 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
3222 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
3223 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
3224 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
3230 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
3231 c
= UTEXT_NEXT32(fInputText
);
3232 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3233 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
3234 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
3235 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
3236 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
3237 (void)UTEXT_PREVIOUS32(fInputText
);
3238 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3242 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
3243 c
= UTEXT_NEXT32(fInputText
);
3244 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3245 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
3246 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
3247 (void)UTEXT_PREVIOUS32(fInputText
);
3248 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3252 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
3253 c
= UTEXT_NEXT32(fInputText
);
3254 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3255 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
3256 (void)UTEXT_PREVIOUS32(fInputText
);
3257 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3261 // Combining characters are consumed here
3263 if (fp
->fInputIdx
>= fActiveLimit
) {
3266 c
= UTEXT_CURRENT32(fInputText
);
3267 if (sets
[URX_GC_EXTEND
]->contains(c
) == FALSE
) {
3270 (void)UTEXT_NEXT32(fInputText
);
3271 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3276 // Most control chars stand alone (don't combine with combining chars),
3277 // except for that CR/LF sequence is a single grapheme cluster.
3278 if (c
== 0x0d && fp
->fInputIdx
< fActiveLimit
&& UTEXT_CURRENT32(fInputText
) == 0x0a) {
3279 c
= UTEXT_NEXT32(fInputText
);
3280 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3284 if (fp
->fInputIdx
>= fActiveLimit
) {
3293 case URX_BACKSLASH_Z
: // Test for end of Input
3294 if (fp
->fInputIdx
< fAnchorLimit
) {
3295 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3304 case URX_STATIC_SETREF
:
3306 // Test input character against one of the predefined sets
3307 // (Word Characters, for example)
3308 // The high bit of the op value is a flag for the match polarity.
3309 // 0: success if input char is in set.
3310 // 1: success if input char is not in set.
3311 if (fp
->fInputIdx
>= fActiveLimit
) {
3313 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3317 UBool success
= ((opValue
& URX_NEG_SET
) == URX_NEG_SET
);
3318 opValue
&= ~URX_NEG_SET
;
3319 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
3321 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3322 UChar32 c
= UTEXT_NEXT32(fInputText
);
3324 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
3325 if (s8
->contains(c
)) {
3329 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
3330 if (s
->contains(c
)) {
3335 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3337 // the character wasn't in the set.
3338 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3344 case URX_STAT_SETREF_N
:
3346 // Test input character for NOT being a member of one of
3347 // the predefined sets (Word Characters, for example)
3348 if (fp
->fInputIdx
>= fActiveLimit
) {
3350 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3354 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
3356 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3358 UChar32 c
= UTEXT_NEXT32(fInputText
);
3360 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
3361 if (s8
->contains(c
) == FALSE
) {
3362 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3366 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
3367 if (s
->contains(c
) == FALSE
) {
3368 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3372 // the character wasn't in the set.
3373 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3379 if (fp
->fInputIdx
>= fActiveLimit
) {
3381 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3384 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3386 // There is input left. Pick up one char and test it for set membership.
3387 UChar32 c
= UTEXT_NEXT32(fInputText
);
3388 U_ASSERT(opValue
> 0 && opValue
< sets
->size());
3390 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
3391 if (s8
->contains(c
)) {
3392 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3396 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
3397 if (s
->contains(c
)) {
3398 // The character is in the set. A Match.
3399 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3404 // the character wasn't in the set.
3405 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3412 // . matches anything, but stops at end-of-line.
3413 if (fp
->fInputIdx
>= fActiveLimit
) {
3414 // At end of input. Match failed. Backtrack out.
3416 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3420 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3422 // There is input left. Advance over one char, unless we've hit end-of-line
3423 UChar32 c
= UTEXT_NEXT32(fInputText
);
3424 if (isLineTerminator(c
)) {
3425 // End of line in normal mode. . does not match.
3426 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3429 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3434 case URX_DOTANY_ALL
:
3436 // ., in dot-matches-all (including new lines) mode
3437 if (fp
->fInputIdx
>= fActiveLimit
) {
3438 // At end of input. Match failed. Backtrack out.
3440 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3444 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3446 // There is input left. Advance over one char, except if we are
3447 // at a cr/lf, advance over both of them.
3449 c
= UTEXT_NEXT32(fInputText
);
3450 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3451 if (c
==0x0d && fp
->fInputIdx
< fActiveLimit
) {
3452 // In the case of a CR/LF, we need to advance over both.
3453 UChar32 nextc
= UTEXT_CURRENT32(fInputText
);
3454 if (nextc
== 0x0a) {
3455 (void)UTEXT_NEXT32(fInputText
);
3456 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3463 case URX_DOTANY_UNIX
:
3465 // '.' operator, matches all, but stops at end-of-line.
3466 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
3467 if (fp
->fInputIdx
>= fActiveLimit
) {
3468 // At end of input. Match failed. Backtrack out.
3470 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3474 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3476 // There is input left. Advance over one char, unless we've hit end-of-line
3477 UChar32 c
= UTEXT_NEXT32(fInputText
);
3479 // End of line in normal mode. '.' does not match the \n
3480 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3482 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3489 fp
->fPatIdx
= opValue
;
3497 U_ASSERT(opValue
< fPattern
->fCompiledPat
->size());
3498 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
3499 fp
->fPatIdx
= opValue
; // Then JMP.
3503 // This opcode is used with (x)+, when x can match a zero length string.
3504 // Same as JMP_SAV, except conditional on the match having made forward progress.
3505 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3506 // data address of the input position at the start of the loop.
3508 U_ASSERT(opValue
> 0 && opValue
< fPattern
->fCompiledPat
->size());
3509 int32_t stoOp
= (int32_t)pat
[opValue
-1];
3510 U_ASSERT(URX_TYPE(stoOp
) == URX_STO_INP_LOC
);
3511 int32_t frameLoc
= URX_VAL(stoOp
);
3512 U_ASSERT(frameLoc
>= 0 && frameLoc
< fFrameSize
);
3513 int64_t prevInputIdx
= fp
->fExtra
[frameLoc
];
3514 U_ASSERT(prevInputIdx
<= fp
->fInputIdx
);
3515 if (prevInputIdx
< fp
->fInputIdx
) {
3516 // The match did make progress. Repeat the loop.
3517 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
3518 fp
->fPatIdx
= opValue
;
3519 fp
->fExtra
[frameLoc
] = fp
->fInputIdx
;
3521 // If the input position did not advance, we do nothing here,
3522 // execution will fall out of the loop.
3528 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
3529 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
3531 // Pick up the three extra operands that CTR_INIT has, and
3532 // skip the pattern location counter past
3533 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
3535 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
3536 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
3537 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
3538 U_ASSERT(minCount
>=0);
3539 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
3540 U_ASSERT(loopLoc
>=fp
->fPatIdx
);
3542 if (minCount
== 0) {
3543 fp
= StateSave(fp
, loopLoc
+1, status
);
3545 if (maxCount
== -1) {
3546 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // For loop breaking.
3547 } else if (maxCount
== 0) {
3548 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3555 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
3556 int32_t initOp
= (int32_t)pat
[opValue
];
3557 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT
);
3558 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
3559 int32_t minCount
= (int32_t)pat
[opValue
+2];
3560 int32_t maxCount
= (int32_t)pat
[opValue
+3];
3562 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
&& maxCount
!= -1) {
3563 U_ASSERT(*pCounter
== maxCount
);
3566 if (*pCounter
>= minCount
) {
3567 if (maxCount
== -1) {
3568 // Loop has no hard upper bound.
3569 // Check that it is progressing through the input, break if it is not.
3570 int64_t *pLastInputIdx
= &fp
->fExtra
[URX_VAL(initOp
) + 1];
3571 if (fp
->fInputIdx
== *pLastInputIdx
) {
3574 *pLastInputIdx
= fp
->fInputIdx
;
3577 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
3579 // Increment time-out counter. (StateSave() does it if count >= minCount)
3581 if (fTickCounter
<= 0) {
3582 IncrementTime(status
); // Re-initializes fTickCounter
3586 fp
->fPatIdx
= opValue
+ 4; // Loop back.
3590 case URX_CTR_INIT_NG
:
3592 // Initialize a non-greedy loop
3593 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
3594 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
3596 // Pick up the three extra operands that CTR_INIT_NG has, and
3597 // skip the pattern location counter past
3598 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
3600 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
3601 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
3602 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
3603 U_ASSERT(minCount
>=0);
3604 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
3605 U_ASSERT(loopLoc
>fp
->fPatIdx
);
3606 if (maxCount
== -1) {
3607 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // Save initial input index for loop breaking.
3610 if (minCount
== 0) {
3611 if (maxCount
!= 0) {
3612 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
3614 fp
->fPatIdx
= loopLoc
+1; // Continue with stuff after repeated block
3619 case URX_CTR_LOOP_NG
:
3621 // Non-greedy {min, max} loops
3622 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
3623 int32_t initOp
= (int32_t)pat
[opValue
];
3624 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT_NG
);
3625 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
3626 int32_t minCount
= (int32_t)pat
[opValue
+2];
3627 int32_t maxCount
= (int32_t)pat
[opValue
+3];
3630 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
&& maxCount
!= -1) {
3631 // The loop has matched the maximum permitted number of times.
3632 // Break out of here with no action. Matching will
3633 // continue with the following pattern.
3634 U_ASSERT(*pCounter
== maxCount
);
3638 if (*pCounter
< minCount
) {
3639 // We haven't met the minimum number of matches yet.
3640 // Loop back for another one.
3641 fp
->fPatIdx
= opValue
+ 4; // Loop back.
3642 // Increment time-out counter. (StateSave() does it if count >= minCount)
3644 if (fTickCounter
<= 0) {
3645 IncrementTime(status
); // Re-initializes fTickCounter
3648 // We do have the minimum number of matches.
3650 // If there is no upper bound on the loop iterations, check that the input index
3651 // is progressing, and stop the loop if it is not.
3652 if (maxCount
== -1) {
3653 int64_t *pLastInputIdx
= &fp
->fExtra
[URX_VAL(initOp
) + 1];
3654 if (fp
->fInputIdx
== *pLastInputIdx
) {
3657 *pLastInputIdx
= fp
->fInputIdx
;
3660 // Loop Continuation: we will fall into the pattern following the loop
3661 // (non-greedy, don't execute loop body first), but first do
3662 // a state save to the top of the loop, so that a match failure
3663 // in the following pattern will try another iteration of the loop.
3664 fp
= StateSave(fp
, opValue
+ 4, status
);
3670 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
3671 fData
[opValue
] = fStack
->size();
3676 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
3677 int32_t newStackSize
= (int32_t)fData
[opValue
];
3678 U_ASSERT(newStackSize
<= fStack
->size());
3679 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
3680 if (newFP
== (int64_t *)fp
) {
3684 for (i
=0; i
<fFrameSize
; i
++) {
3685 newFP
[i
] = ((int64_t *)fp
)[i
];
3687 fp
= (REStackFrame
*)newFP
;
3688 fStack
->setSize(newStackSize
);
3694 U_ASSERT(opValue
< fFrameSize
);
3695 int64_t groupStartIdx
= fp
->fExtra
[opValue
];
3696 int64_t groupEndIdx
= fp
->fExtra
[opValue
+1];
3697 U_ASSERT(groupStartIdx
<= groupEndIdx
);
3698 if (groupStartIdx
< 0) {
3699 // This capture group has not participated in the match thus far,
3700 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
3703 UTEXT_SETNATIVEINDEX(fAltInputText
, groupStartIdx
);
3704 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3706 // Note: if the capture group match was of an empty string the backref
3707 // match succeeds. Verified by testing: Perl matches succeed
3708 // in this case, so we do too.
3710 UBool success
= TRUE
;
3712 if (utext_getNativeIndex(fAltInputText
) >= groupEndIdx
) {
3716 if (utext_getNativeIndex(fInputText
) >= fActiveLimit
) {
3721 UChar32 captureGroupChar
= utext_next32(fAltInputText
);
3722 UChar32 inputChar
= utext_next32(fInputText
);
3723 if (inputChar
!= captureGroupChar
) {
3730 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3732 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3741 U_ASSERT(opValue
< fFrameSize
);
3742 int64_t groupStartIdx
= fp
->fExtra
[opValue
];
3743 int64_t groupEndIdx
= fp
->fExtra
[opValue
+1];
3744 U_ASSERT(groupStartIdx
<= groupEndIdx
);
3745 if (groupStartIdx
< 0) {
3746 // This capture group has not participated in the match thus far,
3747 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
3750 utext_setNativeIndex(fAltInputText
, groupStartIdx
);
3751 utext_setNativeIndex(fInputText
, fp
->fInputIdx
);
3752 CaseFoldingUTextIterator
captureGroupItr(*fAltInputText
);
3753 CaseFoldingUTextIterator
inputItr(*fInputText
);
3755 // Note: if the capture group match was of an empty string the backref
3756 // match succeeds. Verified by testing: Perl matches succeed
3757 // in this case, so we do too.
3759 UBool success
= TRUE
;
3761 if (!captureGroupItr
.inExpansion() && utext_getNativeIndex(fAltInputText
) >= groupEndIdx
) {
3765 if (!inputItr
.inExpansion() && utext_getNativeIndex(fInputText
) >= fActiveLimit
) {
3770 UChar32 captureGroupChar
= captureGroupItr
.next();
3771 UChar32 inputChar
= inputItr
.next();
3772 if (inputChar
!= captureGroupChar
) {
3778 if (success
&& inputItr
.inExpansion()) {
3779 // We otained a match by consuming part of a string obtained from
3780 // case-folding a single code point of the input text.
3781 // This does not count as an overall match.
3786 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3788 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3794 case URX_STO_INP_LOC
:
3796 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
);
3797 fp
->fExtra
[opValue
] = fp
->fInputIdx
;
3803 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
3805 int32_t dataLoc
= URX_VAL(pat
[instrOperandLoc
]);
3806 U_ASSERT(dataLoc
>= 0 && dataLoc
< fFrameSize
);
3807 int64_t savedInputIdx
= fp
->fExtra
[dataLoc
];
3808 U_ASSERT(savedInputIdx
<= fp
->fInputIdx
);
3809 if (savedInputIdx
< fp
->fInputIdx
) {
3810 fp
->fPatIdx
= opValue
; // JMP
3812 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no progress in loop.
3819 // Entering a lookahead block.
3820 // Save Stack Ptr, Input Pos.
3821 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3822 fData
[opValue
] = fStack
->size();
3823 fData
[opValue
+1] = fp
->fInputIdx
;
3824 fActiveStart
= fLookStart
; // Set the match region change for
3825 fActiveLimit
= fLookLimit
; // transparent bounds.
3831 // Leaving a look-ahead block.
3832 // restore Stack Ptr, Input Pos to positions they had on entry to block.
3833 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3834 int32_t stackSize
= fStack
->size();
3835 int32_t newStackSize
=(int32_t)fData
[opValue
];
3836 U_ASSERT(stackSize
>= newStackSize
);
3837 if (stackSize
> newStackSize
) {
3838 // Copy the current top frame back to the new (cut back) top frame.
3839 // This makes the capture groups from within the look-ahead
3840 // expression available.
3841 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
3843 for (i
=0; i
<fFrameSize
; i
++) {
3844 newFP
[i
] = ((int64_t *)fp
)[i
];
3846 fp
= (REStackFrame
*)newFP
;
3847 fStack
->setSize(newStackSize
);
3849 fp
->fInputIdx
= fData
[opValue
+1];
3851 // Restore the active region bounds in the input string; they may have
3852 // been changed because of transparent bounds on a Region.
3853 fActiveStart
= fRegionStart
;
3854 fActiveLimit
= fRegionLimit
;
3859 // Case insensitive one char. The char from the pattern is already case folded.
3860 // Input text is not, but case folding the input can not reduce two or more code
3862 if (fp
->fInputIdx
< fActiveLimit
) {
3863 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3865 UChar32 c
= UTEXT_NEXT32(fInputText
);
3866 if (u_foldCase(c
, U_FOLD_CASE_DEFAULT
) == opValue
) {
3867 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3874 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3879 // Case-insensitive test input against a literal string.
3880 // Strings require two slots in the compiled pattern, one for the
3881 // offset to the string text, and one for the length.
3882 // The compiled string has already been case folded.
3884 const UChar
*patternString
= litText
+ opValue
;
3885 int32_t patternStringIdx
= 0;
3887 op
= (int32_t)pat
[fp
->fPatIdx
];
3889 opType
= URX_TYPE(op
);
3890 opValue
= URX_VAL(op
);
3891 U_ASSERT(opType
== URX_STRING_LEN
);
3892 int32_t patternStringLen
= opValue
; // Length of the string from the pattern.
3897 UBool success
= TRUE
;
3899 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3900 CaseFoldingUTextIterator
inputIterator(*fInputText
);
3901 while (patternStringIdx
< patternStringLen
) {
3902 if (!inputIterator
.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText
) >= fActiveLimit
) {
3907 U16_NEXT(patternString
, patternStringIdx
, patternStringLen
, cPattern
);
3908 cText
= inputIterator
.next();
3909 if (cText
!= cPattern
) {
3914 if (inputIterator
.inExpansion()) {
3919 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3921 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3929 // Entering a look-behind block.
3930 // Save Stack Ptr, Input Pos.
3931 // TODO: implement transparent bounds. Ticket #6067
3932 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3933 fData
[opValue
] = fStack
->size();
3934 fData
[opValue
+1] = fp
->fInputIdx
;
3935 // Init the variable containing the start index for attempted matches.
3936 fData
[opValue
+2] = -1;
3937 // Save input string length, then reset to pin any matches to end at
3938 // the current position.
3939 fData
[opValue
+3] = fActiveLimit
;
3940 fActiveLimit
= fp
->fInputIdx
;
3947 // Positive Look-Behind, at top of loop checking for matches of LB expression
3948 // at all possible input starting positions.
3950 // Fetch the min and max possible match lengths. They are the operands
3951 // of this op in the pattern.
3952 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
3953 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
3954 if (!UTEXT_USES_U16(fInputText
)) {
3955 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3956 // The max length need not be exact; it just needs to be >= actual maximum.
3959 U_ASSERT(minML
<= maxML
);
3960 U_ASSERT(minML
>= 0);
3962 // Fetch (from data) the last input index where a match was attempted.
3963 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3964 int64_t &lbStartIdx
= fData
[opValue
+2];
3965 if (lbStartIdx
< 0) {
3966 // First time through loop.
3967 lbStartIdx
= fp
->fInputIdx
- minML
;
3968 if (lbStartIdx
> 0) {
3969 // move index to a code point boudary, if it's not on one already.
3970 UTEXT_SETNATIVEINDEX(fInputText
, lbStartIdx
);
3971 lbStartIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3974 // 2nd through nth time through the loop.
3975 // Back up start position for match by one.
3976 if (lbStartIdx
== 0) {
3979 UTEXT_SETNATIVEINDEX(fInputText
, lbStartIdx
);
3980 (void)UTEXT_PREVIOUS32(fInputText
);
3981 lbStartIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3985 if (lbStartIdx
< 0 || lbStartIdx
< fp
->fInputIdx
- maxML
) {
3986 // We have tried all potential match starting points without
3987 // getting a match. Backtrack out, and out of the
3988 // Look Behind altogether.
3989 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3990 int64_t restoreInputLen
= fData
[opValue
+3];
3991 U_ASSERT(restoreInputLen
>= fActiveLimit
);
3992 U_ASSERT(restoreInputLen
<= fInputLength
);
3993 fActiveLimit
= restoreInputLen
;
3997 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3998 // (successful match will fall off the end of the loop.)
3999 fp
= StateSave(fp
, fp
->fPatIdx
-3, status
);
4000 fp
->fInputIdx
= lbStartIdx
;
4005 // End of a look-behind block, after a successful match.
4007 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
4008 if (fp
->fInputIdx
!= fActiveLimit
) {
4009 // The look-behind expression matched, but the match did not
4010 // extend all the way to the point that we are looking behind from.
4011 // FAIL out of here, which will take us back to the LB_CONT, which
4012 // will retry the match starting at another position or fail
4013 // the look-behind altogether, whichever is appropriate.
4014 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4018 // Look-behind match is good. Restore the orignal input string length,
4019 // which had been truncated to pin the end of the lookbehind match to the
4020 // position being looked-behind.
4021 int64_t originalInputLen
= fData
[opValue
+3];
4022 U_ASSERT(originalInputLen
>= fActiveLimit
);
4023 U_ASSERT(originalInputLen
<= fInputLength
);
4024 fActiveLimit
= originalInputLen
;
4031 // Negative Look-Behind, at top of loop checking for matches of LB expression
4032 // at all possible input starting positions.
4034 // Fetch the extra parameters of this op.
4035 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
4036 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
4037 if (!UTEXT_USES_U16(fInputText
)) {
4038 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
4039 // The max length need not be exact; it just needs to be >= actual maximum.
4042 int32_t continueLoc
= (int32_t)pat
[fp
->fPatIdx
++];
4043 continueLoc
= URX_VAL(continueLoc
);
4044 U_ASSERT(minML
<= maxML
);
4045 U_ASSERT(minML
>= 0);
4046 U_ASSERT(continueLoc
> fp
->fPatIdx
);
4048 // Fetch (from data) the last input index where a match was attempted.
4049 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
4050 int64_t &lbStartIdx
= fData
[opValue
+2];
4051 if (lbStartIdx
< 0) {
4052 // First time through loop.
4053 lbStartIdx
= fp
->fInputIdx
- minML
;
4054 if (lbStartIdx
> 0) {
4055 // move index to a code point boudary, if it's not on one already.
4056 UTEXT_SETNATIVEINDEX(fInputText
, lbStartIdx
);
4057 lbStartIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
4060 // 2nd through nth time through the loop.
4061 // Back up start position for match by one.
4062 if (lbStartIdx
== 0) {
4065 UTEXT_SETNATIVEINDEX(fInputText
, lbStartIdx
);
4066 (void)UTEXT_PREVIOUS32(fInputText
);
4067 lbStartIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
4071 if (lbStartIdx
< 0 || lbStartIdx
< fp
->fInputIdx
- maxML
) {
4072 // We have tried all potential match starting points without
4073 // getting a match, which means that the negative lookbehind as
4074 // a whole has succeeded. Jump forward to the continue location
4075 int64_t restoreInputLen
= fData
[opValue
+3];
4076 U_ASSERT(restoreInputLen
>= fActiveLimit
);
4077 U_ASSERT(restoreInputLen
<= fInputLength
);
4078 fActiveLimit
= restoreInputLen
;
4079 fp
->fPatIdx
= continueLoc
;
4083 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4084 // (successful match will cause a FAIL out of the loop altogether.)
4085 fp
= StateSave(fp
, fp
->fPatIdx
-4, status
);
4086 fp
->fInputIdx
= lbStartIdx
;
4091 // End of a negative look-behind block, after a successful match.
4093 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
4094 if (fp
->fInputIdx
!= fActiveLimit
) {
4095 // The look-behind expression matched, but the match did not
4096 // extend all the way to the point that we are looking behind from.
4097 // FAIL out of here, which will take us back to the LB_CONT, which
4098 // will retry the match starting at another position or succeed
4099 // the look-behind altogether, whichever is appropriate.
4100 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4104 // Look-behind expression matched, which means look-behind test as
4107 // Restore the orignal input string length, which had been truncated
4108 // inorder to pin the end of the lookbehind match
4109 // to the position being looked-behind.
4110 int64_t originalInputLen
= fData
[opValue
+3];
4111 U_ASSERT(originalInputLen
>= fActiveLimit
);
4112 U_ASSERT(originalInputLen
<= fInputLength
);
4113 fActiveLimit
= originalInputLen
;
4115 // Restore original stack position, discarding any state saved
4116 // by the successful pattern match.
4117 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
4118 int32_t newStackSize
= (int32_t)fData
[opValue
];
4119 U_ASSERT(fStack
->size() > newStackSize
);
4120 fStack
->setSize(newStackSize
);
4122 // FAIL, which will take control back to someplace
4123 // prior to entering the look-behind test.
4124 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4130 // Loop Initialization for the optimized implementation of
4131 // [some character set]*
4132 // This op scans through all matching input.
4133 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4135 U_ASSERT(opValue
> 0 && opValue
< sets
->size());
4136 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
4137 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
4139 // Loop through input, until either the input is exhausted or
4140 // we reach a character that is not a member of the set.
4141 int64_t ix
= fp
->fInputIdx
;
4142 UTEXT_SETNATIVEINDEX(fInputText
, ix
);
4144 if (ix
>= fActiveLimit
) {
4148 UChar32 c
= UTEXT_NEXT32(fInputText
);
4150 if (s8
->contains(c
) == FALSE
) {
4154 if (s
->contains(c
) == FALSE
) {
4158 ix
= UTEXT_GETNATIVEINDEX(fInputText
);
4161 // If there were no matching characters, skip over the loop altogether.
4162 // The loop doesn't run at all, a * op always succeeds.
4163 if (ix
== fp
->fInputIdx
) {
4164 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
4168 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4169 // must follow. It's operand is the stack location
4170 // that holds the starting input index for the match of this [set]*
4171 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
4172 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
4173 int32_t stackLoc
= URX_VAL(loopcOp
);
4174 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
4175 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
4178 // Save State to the URX_LOOP_C op that follows this one,
4179 // so that match failures in the following code will return to there.
4180 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4181 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
4187 case URX_LOOP_DOT_I
:
4188 // Loop Initialization for the optimized implementation of .*
4189 // This op scans through all remaining input.
4190 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4192 // Loop through input until the input is exhausted (we reach an end-of-line)
4193 // In DOTALL mode, we can just go straight to the end of the input.
4195 if ((opValue
& 1) == 1) {
4196 // Dot-matches-All mode. Jump straight to the end of the string.
4200 // NOT DOT ALL mode. Line endings do not match '.'
4201 // Scan forward until a line ending or end of input.
4203 UTEXT_SETNATIVEINDEX(fInputText
, ix
);
4205 if (ix
>= fActiveLimit
) {
4209 UChar32 c
= UTEXT_NEXT32(fInputText
);
4210 if ((c
& 0x7f) <= 0x29) { // Fast filter of non-new-line-s
4211 if ((c
== 0x0a) || // 0x0a is newline in both modes.
4212 (((opValue
& 2) == 0) && // IF not UNIX_LINES mode
4213 isLineTerminator(c
))) {
4214 // char is a line ending. Exit the scanning loop.
4218 ix
= UTEXT_GETNATIVEINDEX(fInputText
);
4222 // If there were no matching characters, skip over the loop altogether.
4223 // The loop doesn't run at all, a * op always succeeds.
4224 if (ix
== fp
->fInputIdx
) {
4225 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
4229 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4230 // must follow. It's operand is the stack location
4231 // that holds the starting input index for the match of this .*
4232 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
4233 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
4234 int32_t stackLoc
= URX_VAL(loopcOp
);
4235 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
4236 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
4239 // Save State to the URX_LOOP_C op that follows this one,
4240 // so that match failures in the following code will return to there.
4241 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4242 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
4250 U_ASSERT(opValue
>=0 && opValue
<fFrameSize
);
4251 backSearchIndex
= fp
->fExtra
[opValue
];
4252 U_ASSERT(backSearchIndex
<= fp
->fInputIdx
);
4253 if (backSearchIndex
== fp
->fInputIdx
) {
4254 // We've backed up the input idx to the point that the loop started.
4255 // The loop is done. Leave here without saving state.
4256 // Subsequent failures won't come back here.
4259 // Set up for the next iteration of the loop, with input index
4260 // backed up by one from the last time through,
4261 // and a state save to this instruction in case the following code fails again.
4262 // (We're going backwards because this loop emulates stack unwinding, not
4263 // the initial scan forward.)
4264 U_ASSERT(fp
->fInputIdx
> 0);
4265 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
4266 UChar32 prevC
= UTEXT_PREVIOUS32(fInputText
);
4267 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
4269 UChar32 twoPrevC
= UTEXT_PREVIOUS32(fInputText
);
4270 if (prevC
== 0x0a &&
4271 fp
->fInputIdx
> backSearchIndex
&&
4273 int32_t prevOp
= (int32_t)pat
[fp
->fPatIdx
-2];
4274 if (URX_TYPE(prevOp
) == URX_LOOP_DOT_I
) {
4275 // .*, stepping back over CRLF pair.
4276 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
4281 fp
= StateSave(fp
, fp
->fPatIdx
-1, status
);
4288 // Trouble. The compiled pattern contains an entry with an
4289 // unrecognized type tag.
4293 if (U_FAILURE(status
)) {
4302 fLastMatchEnd
= fMatchEnd
;
4303 fMatchStart
= startIdx
;
4304 fMatchEnd
= fp
->fInputIdx
;
4307 #ifdef REGEX_RUN_DEBUG
4310 printf("Match. start=%ld end=%ld\n\n", fMatchStart
, fMatchEnd
);
4312 printf("No match\n\n");
4317 fFrame
= fp
; // The active stack frame when the engine stopped.
4318 // Contains the capture group results that we need to
4324 //--------------------------------------------------------------------------------
4326 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
4327 // assumption that the entire string is available in the UText's
4328 // chunk buffer. For now, that means we can use int32_t indexes,
4329 // except for anything that needs to be saved (like group starts
4332 // startIdx: begin matching at this index.
4333 // toEnd: if true, match must extend to end of the input region
4335 //--------------------------------------------------------------------------------
4336 void RegexMatcher::MatchChunkAt(int32_t startIdx
, UBool toEnd
, UErrorCode
&status
) {
4337 UBool isMatch
= FALSE
; // True if the we have a match.
4339 int32_t backSearchIndex
= INT32_MAX
; // used after greedy single-character matches for searching backwards
4341 int32_t op
; // Operation from the compiled pattern, split into
4342 int32_t opType
; // the opcode
4343 int32_t opValue
; // and the operand value.
4345 #ifdef REGEX_RUN_DEBUG
4347 printf("MatchAt(startIdx=%d)\n", startIdx
);
4348 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern
->fPattern
))());
4349 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText
))());
4353 if (U_FAILURE(status
)) {
4357 // Cache frequently referenced items from the compiled pattern
4359 int64_t *pat
= fPattern
->fCompiledPat
->getBuffer();
4361 const UChar
*litText
= fPattern
->fLiteralText
.getBuffer();
4362 UVector
*sets
= fPattern
->fSets
;
4364 const UChar
*inputBuf
= fInputText
->chunkContents
;
4366 fFrameSize
= fPattern
->fFrameSize
;
4367 REStackFrame
*fp
= resetStack();
4368 if (U_FAILURE(fDeferredStatus
)) {
4369 status
= fDeferredStatus
;
4374 fp
->fInputIdx
= startIdx
;
4376 // Zero out the pattern's static data
4378 for (i
= 0; i
<fPattern
->fDataSize
; i
++) {
4383 // Main loop for interpreting the compiled pattern.
4384 // One iteration of the loop per pattern operation performed.
4387 op
= (int32_t)pat
[fp
->fPatIdx
];
4388 opType
= URX_TYPE(op
);
4389 opValue
= URX_VAL(op
);
4390 #ifdef REGEX_RUN_DEBUG
4392 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
4393 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp
->fInputIdx
,
4394 UTEXT_CURRENT32(fInputText
), (int64_t *)fp
-fStack
->getBuffer(), fActiveLimit
);
4395 fPattern
->dumpOp(fp
->fPatIdx
);
4408 // Force a backtrack. In some circumstances, the pattern compiler
4409 // will notice that the pattern can't possibly match anything, and will
4410 // emit one of these at that point.
4411 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4416 if (fp
->fInputIdx
< fActiveLimit
) {
4418 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4425 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4431 // Test input against a literal string.
4432 // Strings require two slots in the compiled pattern, one for the
4433 // offset to the string text, and one for the length.
4434 int32_t stringStartIdx
= opValue
;
4437 op
= (int32_t)pat
[fp
->fPatIdx
]; // Fetch the second operand
4439 opType
= URX_TYPE(op
);
4440 stringLen
= URX_VAL(op
);
4441 U_ASSERT(opType
== URX_STRING_LEN
);
4442 U_ASSERT(stringLen
>= 2);
4444 const UChar
* pInp
= inputBuf
+ fp
->fInputIdx
;
4445 const UChar
* pInpLimit
= inputBuf
+ fActiveLimit
;
4446 const UChar
* pPat
= litText
+stringStartIdx
;
4447 const UChar
* pEnd
= pInp
+ stringLen
;
4448 UBool success
= TRUE
;
4449 while (pInp
< pEnd
) {
4450 if (pInp
>= pInpLimit
) {
4455 if (*pInp
++ != *pPat
++) {
4462 fp
->fInputIdx
+= stringLen
;
4464 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4470 case URX_STATE_SAVE
:
4471 fp
= StateSave(fp
, opValue
, status
);
4476 // The match loop will exit via this path on a successful match,
4477 // when we reach the end of the pattern.
4478 if (toEnd
&& fp
->fInputIdx
!= fActiveLimit
) {
4479 // The pattern matched, but not to the end of input. Try some more.
4480 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4486 // Start and End Capture stack frame variables are laid out out like this:
4487 // fp->fExtra[opValue] - The start of a completed capture group
4488 // opValue+1 - The end of a completed capture group
4489 // opValue+2 - the start of a capture group whose end
4490 // has not yet been reached (and might not ever be).
4491 case URX_START_CAPTURE
:
4492 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
4493 fp
->fExtra
[opValue
+2] = fp
->fInputIdx
;
4497 case URX_END_CAPTURE
:
4498 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
4499 U_ASSERT(fp
->fExtra
[opValue
+2] >= 0); // Start pos for this group must be set.
4500 fp
->fExtra
[opValue
] = fp
->fExtra
[opValue
+2]; // Tentative start becomes real.
4501 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // End position
4502 U_ASSERT(fp
->fExtra
[opValue
] <= fp
->fExtra
[opValue
+1]);
4506 case URX_DOLLAR
: // $, test for End of line
4507 // or for position before new line at end of input
4508 if (fp
->fInputIdx
< fAnchorLimit
-2) {
4509 // We are no where near the end of input. Fail.
4510 // This is the common case. Keep it first.
4511 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4514 if (fp
->fInputIdx
>= fAnchorLimit
) {
4515 // We really are at the end of input. Success.
4521 // If we are positioned just before a new-line that is located at the
4522 // end of input, succeed.
4523 if (fp
->fInputIdx
== fAnchorLimit
-1) {
4525 U16_GET(inputBuf
, fAnchorStart
, fp
->fInputIdx
, fAnchorLimit
, c
);
4527 if (isLineTerminator(c
)) {
4528 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& inputBuf
[fp
->fInputIdx
-1]==0x0d)) {
4529 // At new-line at end of input. Success
4535 } else if (fp
->fInputIdx
== fAnchorLimit
-2 &&
4536 inputBuf
[fp
->fInputIdx
]==0x0d && inputBuf
[fp
->fInputIdx
+1]==0x0a) {
4539 break; // At CR/LF at end of input. Success
4542 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4547 case URX_DOLLAR_D
: // $, test for End of Line, in UNIX_LINES mode.
4548 if (fp
->fInputIdx
>= fAnchorLimit
-1) {
4549 // Either at the last character of input, or off the end.
4550 if (fp
->fInputIdx
== fAnchorLimit
-1) {
4551 // At last char of input. Success if it's a new line.
4552 if (inputBuf
[fp
->fInputIdx
] == 0x0a) {
4558 // Off the end of input. Success.
4565 // Not at end of input. Back-track out.
4566 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4570 case URX_DOLLAR_M
: // $, test for End of line in multi-line mode
4572 if (fp
->fInputIdx
>= fAnchorLimit
) {
4573 // We really are at the end of input. Success.
4578 // If we are positioned just before a new-line, succeed.
4579 // It makes no difference where the new-line is within the input.
4580 UChar32 c
= inputBuf
[fp
->fInputIdx
];
4581 if (isLineTerminator(c
)) {
4582 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4583 // In multi-line mode, hitting a new-line just before the end of input does not
4584 // set the hitEnd or requireEnd flags
4585 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& inputBuf
[fp
->fInputIdx
-1]==0x0d)) {
4589 // not at a new line. Fail.
4590 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4595 case URX_DOLLAR_MD
: // $, test for End of line in multi-line and UNIX_LINES mode
4597 if (fp
->fInputIdx
>= fAnchorLimit
) {
4598 // We really are at the end of input. Success.
4600 fRequireEnd
= TRUE
; // Java set requireEnd in this case, even though
4601 break; // adding a new-line would not lose the match.
4603 // If we are not positioned just before a new-line, the test fails; backtrack out.
4604 // It makes no difference where the new-line is within the input.
4605 if (inputBuf
[fp
->fInputIdx
] != 0x0a) {
4606 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4612 case URX_CARET
: // ^, test for start of line
4613 if (fp
->fInputIdx
!= fAnchorStart
) {
4614 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4619 case URX_CARET_M
: // ^, test for start of line in mulit-line mode
4621 if (fp
->fInputIdx
== fAnchorStart
) {
4622 // We are at the start input. Success.
4625 // Check whether character just before the current pos is a new-line
4626 // unless we are at the end of input
4627 UChar c
= inputBuf
[fp
->fInputIdx
- 1];
4628 if ((fp
->fInputIdx
< fAnchorLimit
) &&
4629 isLineTerminator(c
)) {
4630 // It's a new-line. ^ is true. Success.
4631 // TODO: what should be done with positions between a CR and LF?
4634 // Not at the start of a line. Fail.
4635 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4640 case URX_CARET_M_UNIX
: // ^, test for start of line in mulit-line + Unix-line mode
4642 U_ASSERT(fp
->fInputIdx
>= fAnchorStart
);
4643 if (fp
->fInputIdx
<= fAnchorStart
) {
4644 // We are at the start input. Success.
4647 // Check whether character just before the current pos is a new-line
4648 U_ASSERT(fp
->fInputIdx
<= fAnchorLimit
);
4649 UChar c
= inputBuf
[fp
->fInputIdx
- 1];
4651 // Not at the start of a line. Back-track out.
4652 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4657 case URX_BACKSLASH_B
: // Test for word boundaries
4659 UBool success
= isChunkWordBoundary((int32_t)fp
->fInputIdx
);
4660 success
^= (UBool
)(opValue
!= 0); // flip sense for \B
4662 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4668 case URX_BACKSLASH_BU
: // Test for word boundaries, Unicode-style
4670 UBool success
= isUWordBoundary(fp
->fInputIdx
);
4671 success
^= (UBool
)(opValue
!= 0); // flip sense for \B
4673 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4679 case URX_BACKSLASH_D
: // Test for decimal digit
4681 if (fp
->fInputIdx
>= fActiveLimit
) {
4683 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4688 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4689 int8_t ctype
= u_charType(c
); // TODO: make a unicode set for this. Will be faster.
4690 UBool success
= (ctype
== U_DECIMAL_DIGIT_NUMBER
);
4691 success
^= (UBool
)(opValue
!= 0); // flip sense for \D
4693 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4699 case URX_BACKSLASH_G
: // Test for position at end of previous match
4700 if (!((fMatch
&& fp
->fInputIdx
==fMatchEnd
) || (fMatch
==FALSE
&& fp
->fInputIdx
==fActiveStart
))) {
4701 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4706 case URX_BACKSLASH_H
: // Test for \h, horizontal white space.
4708 if (fp
->fInputIdx
>= fActiveLimit
) {
4710 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4714 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4715 int8_t ctype
= u_charType(c
);
4716 UBool success
= (ctype
== U_SPACE_SEPARATOR
|| c
== 9); // SPACE_SEPARATOR || TAB
4717 success
^= (UBool
)(opValue
!= 0); // flip sense for \H
4719 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4725 case URX_BACKSLASH_R
: // Test for \R, any line break sequence.
4727 if (fp
->fInputIdx
>= fActiveLimit
) {
4729 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4733 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4734 if (isLineTerminator(c
)) {
4735 if (c
== 0x0d && fp
->fInputIdx
< fActiveLimit
) {
4736 // Check for CR/LF sequence. Consume both together when found.
4738 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c2
);
4740 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c2
);
4744 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4750 case URX_BACKSLASH_V
: // Any single code point line ending.
4752 if (fp
->fInputIdx
>= fActiveLimit
) {
4754 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4758 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4759 UBool success
= isLineTerminator(c
);
4760 success
^= (UBool
)(opValue
!= 0); // flip sense for \V
4762 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4769 case URX_BACKSLASH_X
:
4770 // Match a Grapheme, as defined by Unicode TR 29.
4771 // Differs slightly from Perl, which consumes combining marks independently
4775 // Fail if at end of input
4776 if (fp
->fInputIdx
>= fActiveLimit
) {
4778 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4782 // Examine (and consume) the current char.
4783 // Dispatch into a little state machine, based on the char.
4785 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4786 UnicodeSet
**sets
= fPattern
->fStaticSets
;
4787 if (sets
[URX_GC_NORMAL
]->contains(c
)) goto GC_Extend
;
4788 if (sets
[URX_GC_CONTROL
]->contains(c
)) goto GC_Control
;
4789 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
4790 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
4791 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
4792 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
4793 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
4799 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
4800 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4801 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
4802 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
4803 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
4804 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
4805 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
4809 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
4810 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4811 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
4812 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
4813 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
4817 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
4818 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4819 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
4820 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
4824 // Combining characters are consumed here
4826 if (fp
->fInputIdx
>= fActiveLimit
) {
4829 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4830 if (sets
[URX_GC_EXTEND
]->contains(c
) == FALSE
) {
4831 U16_BACK_1(inputBuf
, 0, fp
->fInputIdx
);
4838 // Most control chars stand alone (don't combine with combining chars),
4839 // except that a CR/LF sequence is a single grapheme cluster.
4840 if (c
== 0x0d && fp
->fInputIdx
< fActiveLimit
&& inputBuf
[fp
->fInputIdx
] == 0x0a) {
4845 if (fp
->fInputIdx
>= fActiveLimit
) {
4854 case URX_BACKSLASH_Z
: // Test for end of Input
4855 if (fp
->fInputIdx
< fAnchorLimit
) {
4856 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4865 case URX_STATIC_SETREF
:
4867 // Test input character against one of the predefined sets
4868 // (Word Characters, for example)
4869 // The high bit of the op value is a flag for the match polarity.
4870 // 0: success if input char is in set.
4871 // 1: success if input char is not in set.
4872 if (fp
->fInputIdx
>= fActiveLimit
) {
4874 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4878 UBool success
= ((opValue
& URX_NEG_SET
) == URX_NEG_SET
);
4879 opValue
&= ~URX_NEG_SET
;
4880 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
4883 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4885 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
4886 if (s8
->contains(c
)) {
4890 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
4891 if (s
->contains(c
)) {
4896 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4902 case URX_STAT_SETREF_N
:
4904 // Test input character for NOT being a member of one of
4905 // the predefined sets (Word Characters, for example)
4906 if (fp
->fInputIdx
>= fActiveLimit
) {
4908 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4912 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
4915 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4917 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
4918 if (s8
->contains(c
) == FALSE
) {
4922 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
4923 if (s
->contains(c
) == FALSE
) {
4927 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4934 if (fp
->fInputIdx
>= fActiveLimit
) {
4936 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4940 U_ASSERT(opValue
> 0 && opValue
< sets
->size());
4942 // There is input left. Pick up one char and test it for set membership.
4944 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4946 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
4947 if (s8
->contains(c
)) {
4948 // The character is in the set. A Match.
4952 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
4953 if (s
->contains(c
)) {
4954 // The character is in the set. A Match.
4959 // the character wasn't in the set.
4960 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4967 // . matches anything, but stops at end-of-line.
4968 if (fp
->fInputIdx
>= fActiveLimit
) {
4969 // At end of input. Match failed. Backtrack out.
4971 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4975 // There is input left. Advance over one char, unless we've hit end-of-line
4977 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4978 if (isLineTerminator(c
)) {
4979 // End of line in normal mode. . does not match.
4980 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4987 case URX_DOTANY_ALL
:
4989 // . in dot-matches-all (including new lines) mode
4990 if (fp
->fInputIdx
>= fActiveLimit
) {
4991 // At end of input. Match failed. Backtrack out.
4993 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4997 // There is input left. Advance over one char, except if we are
4998 // at a cr/lf, advance over both of them.
5000 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
5001 if (c
==0x0d && fp
->fInputIdx
< fActiveLimit
) {
5002 // In the case of a CR/LF, we need to advance over both.
5003 if (inputBuf
[fp
->fInputIdx
] == 0x0a) {
5004 U16_FWD_1(inputBuf
, fp
->fInputIdx
, fActiveLimit
);
5011 case URX_DOTANY_UNIX
:
5013 // '.' operator, matches all, but stops at end-of-line.
5014 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
5015 if (fp
->fInputIdx
>= fActiveLimit
) {
5016 // At end of input. Match failed. Backtrack out.
5018 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5022 // There is input left. Advance over one char, unless we've hit end-of-line
5024 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
5026 // End of line in normal mode. '.' does not match the \n
5027 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5034 fp
->fPatIdx
= opValue
;
5042 U_ASSERT(opValue
< fPattern
->fCompiledPat
->size());
5043 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
5044 fp
->fPatIdx
= opValue
; // Then JMP.
5048 // This opcode is used with (x)+, when x can match a zero length string.
5049 // Same as JMP_SAV, except conditional on the match having made forward progress.
5050 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
5051 // data address of the input position at the start of the loop.
5053 U_ASSERT(opValue
> 0 && opValue
< fPattern
->fCompiledPat
->size());
5054 int32_t stoOp
= (int32_t)pat
[opValue
-1];
5055 U_ASSERT(URX_TYPE(stoOp
) == URX_STO_INP_LOC
);
5056 int32_t frameLoc
= URX_VAL(stoOp
);
5057 U_ASSERT(frameLoc
>= 0 && frameLoc
< fFrameSize
);
5058 int32_t prevInputIdx
= (int32_t)fp
->fExtra
[frameLoc
];
5059 U_ASSERT(prevInputIdx
<= fp
->fInputIdx
);
5060 if (prevInputIdx
< fp
->fInputIdx
) {
5061 // The match did make progress. Repeat the loop.
5062 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
5063 fp
->fPatIdx
= opValue
;
5064 fp
->fExtra
[frameLoc
] = fp
->fInputIdx
;
5066 // If the input position did not advance, we do nothing here,
5067 // execution will fall out of the loop.
5073 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
5074 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
5076 // Pick up the three extra operands that CTR_INIT has, and
5077 // skip the pattern location counter past
5078 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
5080 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
5081 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
5082 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
5083 U_ASSERT(minCount
>=0);
5084 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
5085 U_ASSERT(loopLoc
>=fp
->fPatIdx
);
5087 if (minCount
== 0) {
5088 fp
= StateSave(fp
, loopLoc
+1, status
);
5090 if (maxCount
== -1) {
5091 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // For loop breaking.
5092 } else if (maxCount
== 0) {
5093 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5100 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
5101 int32_t initOp
= (int32_t)pat
[opValue
];
5102 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT
);
5103 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
5104 int32_t minCount
= (int32_t)pat
[opValue
+2];
5105 int32_t maxCount
= (int32_t)pat
[opValue
+3];
5107 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
&& maxCount
!= -1) {
5108 U_ASSERT(*pCounter
== maxCount
);
5111 if (*pCounter
>= minCount
) {
5112 if (maxCount
== -1) {
5113 // Loop has no hard upper bound.
5114 // Check that it is progressing through the input, break if it is not.
5115 int64_t *pLastInputIdx
= &fp
->fExtra
[URX_VAL(initOp
) + 1];
5116 if (fp
->fInputIdx
== *pLastInputIdx
) {
5119 *pLastInputIdx
= fp
->fInputIdx
;
5122 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5124 // Increment time-out counter. (StateSave() does it if count >= minCount)
5126 if (fTickCounter
<= 0) {
5127 IncrementTime(status
); // Re-initializes fTickCounter
5130 fp
->fPatIdx
= opValue
+ 4; // Loop back.
5134 case URX_CTR_INIT_NG
:
5136 // Initialize a non-greedy loop
5137 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
5138 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
5140 // Pick up the three extra operands that CTR_INIT_NG has, and
5141 // skip the pattern location counter past
5142 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
5144 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
5145 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
5146 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
5147 U_ASSERT(minCount
>=0);
5148 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
5149 U_ASSERT(loopLoc
>fp
->fPatIdx
);
5150 if (maxCount
== -1) {
5151 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // Save initial input index for loop breaking.
5154 if (minCount
== 0) {
5155 if (maxCount
!= 0) {
5156 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5158 fp
->fPatIdx
= loopLoc
+1; // Continue with stuff after repeated block
5163 case URX_CTR_LOOP_NG
:
5165 // Non-greedy {min, max} loops
5166 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
5167 int32_t initOp
= (int32_t)pat
[opValue
];
5168 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT_NG
);
5169 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
5170 int32_t minCount
= (int32_t)pat
[opValue
+2];
5171 int32_t maxCount
= (int32_t)pat
[opValue
+3];
5174 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
&& maxCount
!= -1) {
5175 // The loop has matched the maximum permitted number of times.
5176 // Break out of here with no action. Matching will
5177 // continue with the following pattern.
5178 U_ASSERT(*pCounter
== maxCount
);
5182 if (*pCounter
< minCount
) {
5183 // We haven't met the minimum number of matches yet.
5184 // Loop back for another one.
5185 fp
->fPatIdx
= opValue
+ 4; // Loop back.
5187 if (fTickCounter
<= 0) {
5188 IncrementTime(status
); // Re-initializes fTickCounter
5191 // We do have the minimum number of matches.
5193 // If there is no upper bound on the loop iterations, check that the input index
5194 // is progressing, and stop the loop if it is not.
5195 if (maxCount
== -1) {
5196 int64_t *pLastInputIdx
= &fp
->fExtra
[URX_VAL(initOp
) + 1];
5197 if (fp
->fInputIdx
== *pLastInputIdx
) {
5200 *pLastInputIdx
= fp
->fInputIdx
;
5203 // Loop Continuation: we will fall into the pattern following the loop
5204 // (non-greedy, don't execute loop body first), but first do
5205 // a state save to the top of the loop, so that a match failure
5206 // in the following pattern will try another iteration of the loop.
5207 fp
= StateSave(fp
, opValue
+ 4, status
);
5213 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
5214 fData
[opValue
] = fStack
->size();
5219 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
5220 int32_t newStackSize
= (int32_t)fData
[opValue
];
5221 U_ASSERT(newStackSize
<= fStack
->size());
5222 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
5223 if (newFP
== (int64_t *)fp
) {
5227 for (i
=0; i
<fFrameSize
; i
++) {
5228 newFP
[i
] = ((int64_t *)fp
)[i
];
5230 fp
= (REStackFrame
*)newFP
;
5231 fStack
->setSize(newStackSize
);
5237 U_ASSERT(opValue
< fFrameSize
);
5238 int64_t groupStartIdx
= fp
->fExtra
[opValue
];
5239 int64_t groupEndIdx
= fp
->fExtra
[opValue
+1];
5240 U_ASSERT(groupStartIdx
<= groupEndIdx
);
5241 int64_t inputIndex
= fp
->fInputIdx
;
5242 if (groupStartIdx
< 0) {
5243 // This capture group has not participated in the match thus far,
5244 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
5247 UBool success
= TRUE
;
5248 for (int64_t groupIndex
= groupStartIdx
; groupIndex
< groupEndIdx
; ++groupIndex
,++inputIndex
) {
5249 if (inputIndex
>= fActiveLimit
) {
5254 if (inputBuf
[groupIndex
] != inputBuf
[inputIndex
]) {
5259 if (success
&& groupStartIdx
< groupEndIdx
&& U16_IS_LEAD(inputBuf
[groupEndIdx
-1]) &&
5260 inputIndex
< fActiveLimit
&& U16_IS_TRAIL(inputBuf
[inputIndex
])) {
5261 // Capture group ended with an unpaired lead surrogate.
5262 // Back reference is not permitted to match only the lead of a surrogate pair.
5266 fp
->fInputIdx
= inputIndex
;
5268 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5275 U_ASSERT(opValue
< fFrameSize
);
5276 int64_t groupStartIdx
= fp
->fExtra
[opValue
];
5277 int64_t groupEndIdx
= fp
->fExtra
[opValue
+1];
5278 U_ASSERT(groupStartIdx
<= groupEndIdx
);
5279 if (groupStartIdx
< 0) {
5280 // This capture group has not participated in the match thus far,
5281 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
5284 CaseFoldingUCharIterator
captureGroupItr(inputBuf
, groupStartIdx
, groupEndIdx
);
5285 CaseFoldingUCharIterator
inputItr(inputBuf
, fp
->fInputIdx
, fActiveLimit
);
5287 // Note: if the capture group match was of an empty string the backref
5288 // match succeeds. Verified by testing: Perl matches succeed
5289 // in this case, so we do too.
5291 UBool success
= TRUE
;
5293 UChar32 captureGroupChar
= captureGroupItr
.next();
5294 if (captureGroupChar
== U_SENTINEL
) {
5298 UChar32 inputChar
= inputItr
.next();
5299 if (inputChar
== U_SENTINEL
) {
5304 if (inputChar
!= captureGroupChar
) {
5310 if (success
&& inputItr
.inExpansion()) {
5311 // We obtained a match by consuming part of a string obtained from
5312 // case-folding a single code point of the input text.
5313 // This does not count as an overall match.
5318 fp
->fInputIdx
= inputItr
.getIndex();
5320 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5325 case URX_STO_INP_LOC
:
5327 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
);
5328 fp
->fExtra
[opValue
] = fp
->fInputIdx
;
5334 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
5336 int32_t dataLoc
= URX_VAL(pat
[instrOperandLoc
]);
5337 U_ASSERT(dataLoc
>= 0 && dataLoc
< fFrameSize
);
5338 int32_t savedInputIdx
= (int32_t)fp
->fExtra
[dataLoc
];
5339 U_ASSERT(savedInputIdx
<= fp
->fInputIdx
);
5340 if (savedInputIdx
< fp
->fInputIdx
) {
5341 fp
->fPatIdx
= opValue
; // JMP
5343 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no progress in loop.
5350 // Entering a lookahead block.
5351 // Save Stack Ptr, Input Pos.
5352 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5353 fData
[opValue
] = fStack
->size();
5354 fData
[opValue
+1] = fp
->fInputIdx
;
5355 fActiveStart
= fLookStart
; // Set the match region change for
5356 fActiveLimit
= fLookLimit
; // transparent bounds.
5362 // Leaving a look-ahead block.
5363 // restore Stack Ptr, Input Pos to positions they had on entry to block.
5364 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5365 int32_t stackSize
= fStack
->size();
5366 int32_t newStackSize
= (int32_t)fData
[opValue
];
5367 U_ASSERT(stackSize
>= newStackSize
);
5368 if (stackSize
> newStackSize
) {
5369 // Copy the current top frame back to the new (cut back) top frame.
5370 // This makes the capture groups from within the look-ahead
5371 // expression available.
5372 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
5374 for (i
=0; i
<fFrameSize
; i
++) {
5375 newFP
[i
] = ((int64_t *)fp
)[i
];
5377 fp
= (REStackFrame
*)newFP
;
5378 fStack
->setSize(newStackSize
);
5380 fp
->fInputIdx
= fData
[opValue
+1];
5382 // Restore the active region bounds in the input string; they may have
5383 // been changed because of transparent bounds on a Region.
5384 fActiveStart
= fRegionStart
;
5385 fActiveLimit
= fRegionLimit
;
5390 if (fp
->fInputIdx
< fActiveLimit
) {
5392 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
5393 if (u_foldCase(c
, U_FOLD_CASE_DEFAULT
) == opValue
) {
5399 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5403 // Case-insensitive test input against a literal string.
5404 // Strings require two slots in the compiled pattern, one for the
5405 // offset to the string text, and one for the length.
5406 // The compiled string has already been case folded.
5408 const UChar
*patternString
= litText
+ opValue
;
5410 op
= (int32_t)pat
[fp
->fPatIdx
];
5412 opType
= URX_TYPE(op
);
5413 opValue
= URX_VAL(op
);
5414 U_ASSERT(opType
== URX_STRING_LEN
);
5415 int32_t patternStringLen
= opValue
; // Length of the string from the pattern.
5419 UBool success
= TRUE
;
5420 int32_t patternStringIdx
= 0;
5421 CaseFoldingUCharIterator
inputIterator(inputBuf
, fp
->fInputIdx
, fActiveLimit
);
5422 while (patternStringIdx
< patternStringLen
) {
5423 U16_NEXT(patternString
, patternStringIdx
, patternStringLen
, cPattern
);
5424 cText
= inputIterator
.next();
5425 if (cText
!= cPattern
) {
5427 if (cText
== U_SENTINEL
) {
5433 if (inputIterator
.inExpansion()) {
5438 fp
->fInputIdx
= inputIterator
.getIndex();
5440 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5447 // Entering a look-behind block.
5448 // Save Stack Ptr, Input Pos.
5449 // TODO: implement transparent bounds. Ticket #6067
5450 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5451 fData
[opValue
] = fStack
->size();
5452 fData
[opValue
+1] = fp
->fInputIdx
;
5453 // Init the variable containing the start index for attempted matches.
5454 fData
[opValue
+2] = -1;
5455 // Save input string length, then reset to pin any matches to end at
5456 // the current position.
5457 fData
[opValue
+3] = fActiveLimit
;
5458 fActiveLimit
= fp
->fInputIdx
;
5465 // Positive Look-Behind, at top of loop checking for matches of LB expression
5466 // at all possible input starting positions.
5468 // Fetch the min and max possible match lengths. They are the operands
5469 // of this op in the pattern.
5470 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
5471 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
5472 U_ASSERT(minML
<= maxML
);
5473 U_ASSERT(minML
>= 0);
5475 // Fetch (from data) the last input index where a match was attempted.
5476 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5477 int64_t &lbStartIdx
= fData
[opValue
+2];
5478 if (lbStartIdx
< 0) {
5479 // First time through loop.
5480 lbStartIdx
= fp
->fInputIdx
- minML
;
5481 if (lbStartIdx
> 0 && lbStartIdx
< fInputLength
) {
5482 U16_SET_CP_START(inputBuf
, 0, lbStartIdx
);
5485 // 2nd through nth time through the loop.
5486 // Back up start position for match by one.
5487 if (lbStartIdx
== 0) {
5490 U16_BACK_1(inputBuf
, 0, lbStartIdx
);
5494 if (lbStartIdx
< 0 || lbStartIdx
< fp
->fInputIdx
- maxML
) {
5495 // We have tried all potential match starting points without
5496 // getting a match. Backtrack out, and out of the
5497 // Look Behind altogether.
5498 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5499 int64_t restoreInputLen
= fData
[opValue
+3];
5500 U_ASSERT(restoreInputLen
>= fActiveLimit
);
5501 U_ASSERT(restoreInputLen
<= fInputLength
);
5502 fActiveLimit
= restoreInputLen
;
5506 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5507 // (successful match will fall off the end of the loop.)
5508 fp
= StateSave(fp
, fp
->fPatIdx
-3, status
);
5509 fp
->fInputIdx
= lbStartIdx
;
5514 // End of a look-behind block, after a successful match.
5516 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5517 if (fp
->fInputIdx
!= fActiveLimit
) {
5518 // The look-behind expression matched, but the match did not
5519 // extend all the way to the point that we are looking behind from.
5520 // FAIL out of here, which will take us back to the LB_CONT, which
5521 // will retry the match starting at another position or fail
5522 // the look-behind altogether, whichever is appropriate.
5523 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5527 // Look-behind match is good. Restore the original input string length,
5528 // which had been truncated to pin the end of the lookbehind match to the
5529 // position being looked-behind.
5530 int64_t originalInputLen
= fData
[opValue
+3];
5531 U_ASSERT(originalInputLen
>= fActiveLimit
);
5532 U_ASSERT(originalInputLen
<= fInputLength
);
5533 fActiveLimit
= originalInputLen
;
5540 // Negative Look-Behind, at top of loop checking for matches of LB expression
5541 // at all possible input starting positions.
5543 // Fetch the extra parameters of this op.
5544 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
5545 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
5546 int32_t continueLoc
= (int32_t)pat
[fp
->fPatIdx
++];
5547 continueLoc
= URX_VAL(continueLoc
);
5548 U_ASSERT(minML
<= maxML
);
5549 U_ASSERT(minML
>= 0);
5550 U_ASSERT(continueLoc
> fp
->fPatIdx
);
5552 // Fetch (from data) the last input index where a match was attempted.
5553 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5554 int64_t &lbStartIdx
= fData
[opValue
+2];
5555 if (lbStartIdx
< 0) {
5556 // First time through loop.
5557 lbStartIdx
= fp
->fInputIdx
- minML
;
5558 if (lbStartIdx
> 0 && lbStartIdx
< fInputLength
) {
5559 U16_SET_CP_START(inputBuf
, 0, lbStartIdx
);
5562 // 2nd through nth time through the loop.
5563 // Back up start position for match by one.
5564 if (lbStartIdx
== 0) {
5565 lbStartIdx
--; // Because U16_BACK is unsafe starting at 0.
5567 U16_BACK_1(inputBuf
, 0, lbStartIdx
);
5571 if (lbStartIdx
< 0 || lbStartIdx
< fp
->fInputIdx
- maxML
) {
5572 // We have tried all potential match starting points without
5573 // getting a match, which means that the negative lookbehind as
5574 // a whole has succeeded. Jump forward to the continue location
5575 int64_t restoreInputLen
= fData
[opValue
+3];
5576 U_ASSERT(restoreInputLen
>= fActiveLimit
);
5577 U_ASSERT(restoreInputLen
<= fInputLength
);
5578 fActiveLimit
= restoreInputLen
;
5579 fp
->fPatIdx
= continueLoc
;
5583 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5584 // (successful match will cause a FAIL out of the loop altogether.)
5585 fp
= StateSave(fp
, fp
->fPatIdx
-4, status
);
5586 fp
->fInputIdx
= lbStartIdx
;
5591 // End of a negative look-behind block, after a successful match.
5593 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5594 if (fp
->fInputIdx
!= fActiveLimit
) {
5595 // The look-behind expression matched, but the match did not
5596 // extend all the way to the point that we are looking behind from.
5597 // FAIL out of here, which will take us back to the LB_CONT, which
5598 // will retry the match starting at another position or succeed
5599 // the look-behind altogether, whichever is appropriate.
5600 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5604 // Look-behind expression matched, which means look-behind test as
5607 // Restore the original input string length, which had been truncated
5608 // in order to pin the end of the lookbehind match
5609 // to the position being looked-behind.
5610 int64_t originalInputLen
= fData
[opValue
+3];
5611 U_ASSERT(originalInputLen
>= fActiveLimit
);
5612 U_ASSERT(originalInputLen
<= fInputLength
);
5613 fActiveLimit
= originalInputLen
;
5615 // Restore original stack position, discarding any state saved
5616 // by the successful pattern match.
5617 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5618 int32_t newStackSize
= (int32_t)fData
[opValue
];
5619 U_ASSERT(fStack
->size() > newStackSize
);
5620 fStack
->setSize(newStackSize
);
5622 // FAIL, which will take control back to someplace
5623 // prior to entering the look-behind test.
5624 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5630 // Loop Initialization for the optimized implementation of
5631 // [some character set]*
5632 // This op scans through all matching input.
5633 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5635 U_ASSERT(opValue
> 0 && opValue
< sets
->size());
5636 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
5637 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
5639 // Loop through input, until either the input is exhausted or
5640 // we reach a character that is not a member of the set.
5641 int32_t ix
= (int32_t)fp
->fInputIdx
;
5643 if (ix
>= fActiveLimit
) {
5648 U16_NEXT(inputBuf
, ix
, fActiveLimit
, c
);
5650 if (s8
->contains(c
) == FALSE
) {
5651 U16_BACK_1(inputBuf
, 0, ix
);
5655 if (s
->contains(c
) == FALSE
) {
5656 U16_BACK_1(inputBuf
, 0, ix
);
5662 // If there were no matching characters, skip over the loop altogether.
5663 // The loop doesn't run at all, a * op always succeeds.
5664 if (ix
== fp
->fInputIdx
) {
5665 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
5669 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5670 // must follow. Its operand is the stack location
5671 // that holds the starting input index for the match of this [set]*
5672 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
5673 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
5674 int32_t stackLoc
= URX_VAL(loopcOp
);
5675 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
5676 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
5679 // Save State to the URX_LOOP_C op that follows this one,
5680 // so that match failures in the following code will return to there.
5681 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5682 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5688 case URX_LOOP_DOT_I
:
5689 // Loop Initialization for the optimized implementation of .*
5690 // This op scans through all remaining input.
5691 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5693 // Loop through input until the input is exhausted (we reach an end-of-line)
5694 // In DOTALL mode, we can just go straight to the end of the input.
5696 if ((opValue
& 1) == 1) {
5697 // Dot-matches-All mode. Jump straight to the end of the string.
5698 ix
= (int32_t)fActiveLimit
;
5701 // NOT DOT ALL mode. Line endings do not match '.'
5702 // Scan forward until a line ending or end of input.
5703 ix
= (int32_t)fp
->fInputIdx
;
5705 if (ix
>= fActiveLimit
) {
5710 U16_NEXT(inputBuf
, ix
, fActiveLimit
, c
); // c = inputBuf[ix++]
5711 if ((c
& 0x7f) <= 0x29) { // Fast filter of non-new-line-s
5712 if ((c
== 0x0a) || // 0x0a is newline in both modes.
5713 (((opValue
& 2) == 0) && // IF not UNIX_LINES mode
5714 isLineTerminator(c
))) {
5715 // char is a line ending. Put the input pos back to the
5716 // line ending char, and exit the scanning loop.
5717 U16_BACK_1(inputBuf
, 0, ix
);
5724 // If there were no matching characters, skip over the loop altogether.
5725 // The loop doesn't run at all, a * op always succeeds.
5726 if (ix
== fp
->fInputIdx
) {
5727 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
5731 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5732 // must follow. Its operand is the stack location
5733 // that holds the starting input index for the match of this .*
5734 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
5735 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
5736 int32_t stackLoc
= URX_VAL(loopcOp
);
5737 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
5738 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
5741 // Save State to the URX_LOOP_C op that follows this one,
5742 // so that match failures in the following code will return to there.
5743 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5744 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5752 U_ASSERT(opValue
>=0 && opValue
<fFrameSize
);
5753 backSearchIndex
= (int32_t)fp
->fExtra
[opValue
];
5754 U_ASSERT(backSearchIndex
<= fp
->fInputIdx
);
5755 if (backSearchIndex
== fp
->fInputIdx
) {
5756 // We've backed up the input idx to the point that the loop started.
5757 // The loop is done. Leave here without saving state.
5758 // Subsequent failures won't come back here.
5761 // Set up for the next iteration of the loop, with input index
5762 // backed up by one from the last time through,
5763 // and a state save to this instruction in case the following code fails again.
5764 // (We're going backwards because this loop emulates stack unwinding, not
5765 // the initial scan forward.)
5766 U_ASSERT(fp
->fInputIdx
> 0);
5768 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, prevC
); // !!!: should this 0 be one of f*Limit?
5770 if (prevC
== 0x0a &&
5771 fp
->fInputIdx
> backSearchIndex
&&
5772 inputBuf
[fp
->fInputIdx
-1] == 0x0d) {
5773 int32_t prevOp
= (int32_t)pat
[fp
->fPatIdx
-2];
5774 if (URX_TYPE(prevOp
) == URX_LOOP_DOT_I
) {
5775 // .*, stepping back over CRLF pair.
5776 U16_BACK_1(inputBuf
, 0, fp
->fInputIdx
);
5781 fp
= StateSave(fp
, fp
->fPatIdx
-1, status
);
5788 // Trouble. The compiled pattern contains an entry with an
5789 // unrecognized type tag.
5793 if (U_FAILURE(status
)) {
5802 fLastMatchEnd
= fMatchEnd
;
5803 fMatchStart
= startIdx
;
5804 fMatchEnd
= fp
->fInputIdx
;
5807 #ifdef REGEX_RUN_DEBUG
5810 printf("Match. start=%ld end=%ld\n\n", fMatchStart
, fMatchEnd
);
5812 printf("No match\n\n");
5817 fFrame
= fp
; // The active stack frame when the engine stopped.
5818 // Contains the capture group results that we need to
// Invokes the UOBJECT_DEFINE_RTTI_IMPLEMENTATION macro for RegexMatcher.
// NOTE(review): presumably this supplies the class's ICU run-time type
// identification (class ID) members; the macro is defined outside this file,
// so confirm against its definition before relying on that.
5825 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher
)
5829 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS