2 **************************************************************************
3 * Copyright (C) 2002-2016 International Business Machines Corporation
4 * and others. All rights reserved.
5 **************************************************************************
10 // Contains the implementation of class RegexMatcher,
11 // which is one of the main API classes for the ICU regular expression package.
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
17 #include "unicode/regex.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ustring.h"
21 #include "unicode/rbbi.h"
22 #include "unicode/utf.h"
23 #include "unicode/utf16.h"
35 // #include <malloc.h> // Needed for heapcheck testing
40 // Default limit for the size of the back track stack, to avoid system
41 // failures caused by heap exhaustion. Units are in 32 bit words, not bytes.
42 // This value puts ICU's limits higher than most other regexp implementations,
43 // which use recursion rather than the heap, and take more storage per
46 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY
= 8000000;
48 // Time limit counter constant.
49 // Time limits for expression evaluation are in terms of quanta of work by
50 // the engine, each of which is 10,000 state saves.
51 // This constant determines the number of state saves per tick.
52 static const int32_t TIMER_INITIAL_VALUE
= 10000;
55 // Test for any of the Unicode line terminating characters.
56 static inline UBool
isLineTerminator(UChar32 c
) {
57 if (c
& ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
60 return (c
<=0x0d && c
>=0x0a) || c
==0x85 || c
==0x2028 || c
==0x2029;
63 //-----------------------------------------------------------------------------
65 // Constructor and Destructor
67 //-----------------------------------------------------------------------------
68 RegexMatcher::RegexMatcher(const RegexPattern
*pat
) {
69 fDeferredStatus
= U_ZERO_ERROR
;
70 init(fDeferredStatus
);
71 if (U_FAILURE(fDeferredStatus
)) {
75 fDeferredStatus
= U_ILLEGAL_ARGUMENT_ERROR
;
79 init2(RegexStaticSets::gStaticSets
->fEmptyText
, fDeferredStatus
);
84 RegexMatcher::RegexMatcher(const UnicodeString
®exp
, const UnicodeString
&input
,
85 uint32_t flags
, UErrorCode
&status
) {
87 if (U_FAILURE(status
)) {
91 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
92 fPattern
= fPatternOwned
;
94 UText inputText
= UTEXT_INITIALIZER
;
95 utext_openConstUnicodeString(&inputText
, &input
, &status
);
96 init2(&inputText
, status
);
97 utext_close(&inputText
);
99 fInputUniStrMaybeMutable
= TRUE
;
103 RegexMatcher::RegexMatcher(UText
*regexp
, UText
*input
,
104 uint32_t flags
, UErrorCode
&status
) {
106 if (U_FAILURE(status
)) {
110 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
111 if (U_FAILURE(status
)) {
115 fPattern
= fPatternOwned
;
116 init2(input
, status
);
120 RegexMatcher::RegexMatcher(const UnicodeString
®exp
,
121 uint32_t flags
, UErrorCode
&status
) {
123 if (U_FAILURE(status
)) {
127 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
128 if (U_FAILURE(status
)) {
131 fPattern
= fPatternOwned
;
132 init2(RegexStaticSets::gStaticSets
->fEmptyText
, status
);
135 RegexMatcher::RegexMatcher(UText
*regexp
,
136 uint32_t flags
, UErrorCode
&status
) {
138 if (U_FAILURE(status
)) {
142 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
143 if (U_FAILURE(status
)) {
147 fPattern
= fPatternOwned
;
148 init2(RegexStaticSets::gStaticSets
->fEmptyText
, status
);
154 RegexMatcher::~RegexMatcher() {
156 if (fData
!= fSmallData
) {
161 delete fPatternOwned
;
162 fPatternOwned
= NULL
;
170 utext_close(fInputText
);
173 utext_close(fAltInputText
);
176 #if UCONFIG_NO_BREAK_ITERATION==0
177 delete fWordBreakItr
;
182 // init() common initialization for use by all constructors.
183 // Initialize all fields, get the object into a consistent state.
184 // This must be done even when the initial status shows an error,
185 // so that the object is initialized sufficiently well for the destructor
188 void RegexMatcher::init(UErrorCode
&status
) {
190 fPatternOwned
= NULL
;
200 fTransparentBounds
= FALSE
;
201 fAnchoringBounds
= TRUE
;
214 fStackLimit
= DEFAULT_BACKTRACK_STACK_CAPACITY
;
216 fCallbackContext
= NULL
;
217 fFindProgressCallbackFn
= NULL
;
218 fFindProgressCallbackContext
= NULL
;
220 fDeferredStatus
= status
;
222 fWordBreakItr
= NULL
;
226 fAltInputText
= NULL
;
229 fInputUniStrMaybeMutable
= FALSE
;
233 // init2() Common initialization for use by RegexMatcher constructors, part 2.
234 // This handles the common setup to be done after the Pattern is available.
236 void RegexMatcher::init2(UText
*input
, UErrorCode
&status
) {
237 if (U_FAILURE(status
)) {
238 fDeferredStatus
= status
;
242 if (fPattern
->fDataSize
> UPRV_LENGTHOF(fSmallData
)) {
243 fData
= (int64_t *)uprv_malloc(fPattern
->fDataSize
* sizeof(int64_t));
245 status
= fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
250 fStack
= new UVector64(status
);
251 if (fStack
== NULL
) {
252 status
= fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
257 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY
, status
);
258 if (U_FAILURE(status
)) {
259 fDeferredStatus
= status
;
265 static const UChar BACKSLASH
= 0x5c;
266 static const UChar DOLLARSIGN
= 0x24;
267 static const UChar LEFTBRACKET
= 0x7b;
268 static const UChar RIGHTBRACKET
= 0x7d;
270 //--------------------------------------------------------------------------------
274 //--------------------------------------------------------------------------------
275 RegexMatcher
&RegexMatcher::appendReplacement(UnicodeString
&dest
,
276 const UnicodeString
&replacement
,
277 UErrorCode
&status
) {
278 UText replacementText
= UTEXT_INITIALIZER
;
280 utext_openConstUnicodeString(&replacementText
, &replacement
, &status
);
281 if (U_SUCCESS(status
)) {
282 UText resultText
= UTEXT_INITIALIZER
;
283 utext_openUnicodeString(&resultText
, &dest
, &status
);
285 if (U_SUCCESS(status
)) {
286 appendReplacement(&resultText
, &replacementText
, status
);
287 utext_close(&resultText
);
289 utext_close(&replacementText
);
296 // appendReplacement, UText mode
298 RegexMatcher
&RegexMatcher::appendReplacement(UText
*dest
,
300 UErrorCode
&status
) {
301 if (U_FAILURE(status
)) {
304 if (U_FAILURE(fDeferredStatus
)) {
305 status
= fDeferredStatus
;
308 if (fMatch
== FALSE
) {
309 status
= U_REGEX_INVALID_STATE
;
313 // Copy input string from the end of previous match to start of current match
314 int64_t destLen
= utext_nativeLength(dest
);
315 if (fMatchStart
> fAppendPosition
) {
316 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
317 destLen
+= utext_replace(dest
, destLen
, destLen
, fInputText
->chunkContents
+fAppendPosition
,
318 (int32_t)(fMatchStart
-fAppendPosition
), &status
);
321 if (UTEXT_USES_U16(fInputText
)) {
322 len16
= (int32_t)(fMatchStart
-fAppendPosition
);
324 UErrorCode lengthStatus
= U_ZERO_ERROR
;
325 len16
= utext_extract(fInputText
, fAppendPosition
, fMatchStart
, NULL
, 0, &lengthStatus
);
327 UChar
*inputChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(len16
+1));
328 if (inputChars
== NULL
) {
329 status
= U_MEMORY_ALLOCATION_ERROR
;
332 utext_extract(fInputText
, fAppendPosition
, fMatchStart
, inputChars
, len16
+1, &status
);
333 destLen
+= utext_replace(dest
, destLen
, destLen
, inputChars
, len16
, &status
);
334 uprv_free(inputChars
);
337 fAppendPosition
= fMatchEnd
;
340 // scan the replacement text, looking for substitutions ($n) and \escapes.
341 // TODO: optimize this loop by efficiently scanning for '$' or '\',
342 // move entire ranges not containing substitutions.
343 UTEXT_SETNATIVEINDEX(replacement
, 0);
344 for (UChar32 c
= UTEXT_NEXT32(replacement
); U_SUCCESS(status
) && c
!= U_SENTINEL
; c
= UTEXT_NEXT32(replacement
)) {
345 if (c
== BACKSLASH
) {
346 // Backslash Escape. Copy the following char out without further checks.
347 // Note: Surrogate pairs don't need any special handling
348 // The second half won't be a '$' or a '\', and
349 // will move to the dest normally on the next
351 c
= UTEXT_CURRENT32(replacement
);
352 if (c
== U_SENTINEL
) {
356 if (c
==0x55/*U*/ || c
==0x75/*u*/) {
357 // We have a \udddd or \Udddddddd escape sequence.
359 struct URegexUTextUnescapeCharContext context
= U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement
);
360 UChar32 escapedChar
= u_unescapeAt(uregex_utext_unescape_charAt
, &offset
, INT32_MAX
, &context
);
361 if (escapedChar
!= (UChar32
)0xFFFFFFFF) {
362 if (U_IS_BMP(escapedChar
)) {
363 UChar c16
= (UChar
)escapedChar
;
364 destLen
+= utext_replace(dest
, destLen
, destLen
, &c16
, 1, &status
);
367 surrogate
[0] = U16_LEAD(escapedChar
);
368 surrogate
[1] = U16_TRAIL(escapedChar
);
369 if (U_SUCCESS(status
)) {
370 destLen
+= utext_replace(dest
, destLen
, destLen
, surrogate
, 2, &status
);
373 // TODO: Report errors for mal-formed \u escapes?
374 // As this is, the original sequence is output, which may be OK.
375 if (context
.lastOffset
== offset
) {
376 (void)UTEXT_PREVIOUS32(replacement
);
377 } else if (context
.lastOffset
!= offset
-1) {
378 utext_moveIndex32(replacement
, offset
- context
.lastOffset
- 1);
382 (void)UTEXT_NEXT32(replacement
);
383 // Plain backslash escape. Just put out the escaped character.
385 UChar c16
= (UChar
)c
;
386 destLen
+= utext_replace(dest
, destLen
, destLen
, &c16
, 1, &status
);
389 surrogate
[0] = U16_LEAD(c
);
390 surrogate
[1] = U16_TRAIL(c
);
391 if (U_SUCCESS(status
)) {
392 destLen
+= utext_replace(dest
, destLen
, destLen
, surrogate
, 2, &status
);
396 } else if (c
!= DOLLARSIGN
) {
397 // Normal char, not a $. Copy it out without further checks.
399 UChar c16
= (UChar
)c
;
400 destLen
+= utext_replace(dest
, destLen
, destLen
, &c16
, 1, &status
);
403 surrogate
[0] = U16_LEAD(c
);
404 surrogate
[1] = U16_TRAIL(c
);
405 if (U_SUCCESS(status
)) {
406 destLen
+= utext_replace(dest
, destLen
, destLen
, surrogate
, 2, &status
);
410 // We've got a $. Pick up a capture group name or number if one follows.
411 // Consume digits so long as the resulting group number <= the number of
412 // capture groups in the pattern.
414 int32_t groupNum
= 0;
415 int32_t numDigits
= 0;
416 UChar32 nextChar
= utext_current32(replacement
);
417 if (nextChar
== LEFTBRACKET
) {
418 // Scan for a Named Capture Group, ${name}.
419 UnicodeString groupName
;
420 utext_next32(replacement
);
421 while(U_SUCCESS(status
) && nextChar
!= RIGHTBRACKET
) {
422 nextChar
= utext_next32(replacement
);
423 if (nextChar
== U_SENTINEL
) {
424 status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
425 } else if ((nextChar
>= 0x41 && nextChar
<= 0x5a) || // A..Z
426 (nextChar
>= 0x61 && nextChar
<= 0x7a) || // a..z
427 (nextChar
>= 0x31 && nextChar
<= 0x39)) { // 1..9 as coded. NOTE(review): '0' (0x30) is rejected here; confirm whether 0x30..0x39 was intended
428 groupName
.append(nextChar
);
429 } else if (nextChar
== RIGHTBRACKET
) {
430 groupNum
= uhash_geti(fPattern
->fNamedCaptureMap
, &groupName
);
432 status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
435 // Character was something other than a name char or a closing '}'
436 status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
440 } else if (u_isdigit(nextChar
)) {
441 // $n Scan for a capture group number
442 int32_t numCaptureGroups
= fPattern
->fGroupMap
->size();
444 nextChar
= UTEXT_CURRENT32(replacement
);
445 if (nextChar
== U_SENTINEL
) {
448 if (u_isdigit(nextChar
) == FALSE
) {
451 int32_t nextDigitVal
= u_charDigitValue(nextChar
);
452 if (groupNum
*10 + nextDigitVal
> numCaptureGroups
) {
453 // Don't consume the next digit if it makes the capture group number too big.
454 if (numDigits
== 0) {
455 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
459 (void)UTEXT_NEXT32(replacement
);
460 groupNum
=groupNum
*10 + nextDigitVal
;
464 // $ not followed by capture group name or number.
465 status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
468 if (U_SUCCESS(status
)) {
469 destLen
+= appendGroup(groupNum
, dest
, status
);
471 } // End of $ capture group handling
472 } // End of per-character loop through the replacement string.
479 //--------------------------------------------------------------------------------
481 // appendTail Intended to be used in conjunction with appendReplacement()
482 // To the destination string, append everything following
483 // the last match position from the input string.
485 // Note: Match ranges do not affect appendTail or appendReplacement
487 //--------------------------------------------------------------------------------
488 UnicodeString
&RegexMatcher::appendTail(UnicodeString
&dest
) {
489 UErrorCode status
= U_ZERO_ERROR
;
490 UText resultText
= UTEXT_INITIALIZER
;
491 utext_openUnicodeString(&resultText
, &dest
, &status
);
493 if (U_SUCCESS(status
)) {
494 appendTail(&resultText
, status
);
495 utext_close(&resultText
);
502 // appendTail, UText mode
504 UText
*RegexMatcher::appendTail(UText
*dest
, UErrorCode
&status
) {
505 if (U_FAILURE(status
)) {
508 if (U_FAILURE(fDeferredStatus
)) {
509 status
= fDeferredStatus
;
513 if (fInputLength
> fAppendPosition
) {
514 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
515 int64_t destLen
= utext_nativeLength(dest
);
516 utext_replace(dest
, destLen
, destLen
, fInputText
->chunkContents
+fAppendPosition
,
517 (int32_t)(fInputLength
-fAppendPosition
), &status
);
520 if (UTEXT_USES_U16(fInputText
)) {
521 len16
= (int32_t)(fInputLength
-fAppendPosition
);
523 len16
= utext_extract(fInputText
, fAppendPosition
, fInputLength
, NULL
, 0, &status
);
524 status
= U_ZERO_ERROR
; // buffer overflow
527 UChar
*inputChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(len16
));
528 if (inputChars
== NULL
) {
529 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
531 utext_extract(fInputText
, fAppendPosition
, fInputLength
, inputChars
, len16
, &status
); // unterminated
532 int64_t destLen
= utext_nativeLength(dest
);
533 utext_replace(dest
, destLen
, destLen
, inputChars
, len16
, &status
);
534 uprv_free(inputChars
);
543 //--------------------------------------------------------------------------------
547 //--------------------------------------------------------------------------------
548 int32_t RegexMatcher::end(UErrorCode
&err
) const {
552 int64_t RegexMatcher::end64(UErrorCode
&err
) const {
553 return end64(0, err
);
556 int64_t RegexMatcher::end64(int32_t group
, UErrorCode
&err
) const {
557 if (U_FAILURE(err
)) {
560 if (fMatch
== FALSE
) {
561 err
= U_REGEX_INVALID_STATE
;
564 if (group
< 0 || group
> fPattern
->fGroupMap
->size()) {
565 err
= U_INDEX_OUTOFBOUNDS_ERROR
;
572 // Get the position within the stack frame of the variables for
573 // this capture group.
574 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(group
-1);
575 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
576 U_ASSERT(groupOffset
>= 0);
577 e
= fFrame
->fExtra
[groupOffset
+ 1];
583 int32_t RegexMatcher::end(int32_t group
, UErrorCode
&err
) const {
584 return (int32_t)end64(group
, err
);
587 //--------------------------------------------------------------------------------
589 // findProgressInterrupt This function is called once for each advance in the target
590 // string from the find() function, and calls the user progress callback
591 // function if there is one installed.
593 // Return: TRUE if the find operation is to be terminated.
594 // FALSE if the find operation is to continue running.
596 //--------------------------------------------------------------------------------
597 UBool
RegexMatcher::findProgressInterrupt(int64_t pos
, UErrorCode
&status
) {
598 if (fFindProgressCallbackFn
&& !(*fFindProgressCallbackFn
)(fFindProgressCallbackContext
, pos
)) {
599 status
= U_REGEX_STOPPED_BY_CALLER
;
605 //--------------------------------------------------------------------------------
609 //--------------------------------------------------------------------------------
610 UBool
RegexMatcher::find() {
611 if (U_FAILURE(fDeferredStatus
)) {
614 UErrorCode status
= U_ZERO_ERROR
;
615 UBool result
= find(status
);
619 //--------------------------------------------------------------------------------
623 //--------------------------------------------------------------------------------
624 UBool
RegexMatcher::find(UErrorCode
&status
) {
625 // Start at the position of the last match end. (Will be zero if the
626 // matcher has been reset.)
628 if (U_FAILURE(status
)) {
631 if (U_FAILURE(fDeferredStatus
)) {
632 status
= fDeferredStatus
;
636 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
637 return findUsingChunk(status
);
640 int64_t startPos
= fMatchEnd
;
642 startPos
= fActiveStart
;
646 // Save the position of any previous successful match.
647 fLastMatchEnd
= fMatchEnd
;
649 if (fMatchStart
== fMatchEnd
) {
650 // Previous match had zero length. Move start position up one position
651 // to avoid sending find() into a loop on zero-length matches.
652 if (startPos
>= fActiveLimit
) {
657 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
658 (void)UTEXT_NEXT32(fInputText
);
659 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
662 if (fLastMatchEnd
>= 0) {
663 // A previous find() failed to match. Don't try again.
664 // (without this test, a pattern with a zero-length match
665 // could match again at the end of an input string.)
672 // Compute the position in the input string beyond which a match can not begin, because
673 // the minimum length match would extend past the end of the input.
674 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
675 // Be aware of possible overflows if making changes here.
676 int64_t testStartLimit
;
677 if (UTEXT_USES_U16(fInputText
)) {
678 testStartLimit
= fActiveLimit
- fPattern
->fMinMatchLen
;
679 if (startPos
> testStartLimit
) {
685 // We don't know exactly how long the minimum match length is in native characters.
686 // Treat anything > 0 as 1.
687 testStartLimit
= fActiveLimit
- (fPattern
->fMinMatchLen
> 0 ? 1 : 0);
691 U_ASSERT(startPos
>= 0);
693 switch (fPattern
->fStartType
) {
695 // No optimization was found.
696 // Try a match at each input position.
698 MatchAt(startPos
, FALSE
, status
);
699 if (U_FAILURE(status
)) {
705 if (startPos
>= testStartLimit
) {
709 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
710 (void)UTEXT_NEXT32(fInputText
);
711 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
712 // Note that it's perfectly OK for a pattern to have a zero-length
713 // match at the end of a string, so we must make sure that the loop
714 // runs with startPos == testStartLimit the last time through.
715 if (findProgressInterrupt(startPos
, status
))
721 // Matches are only possible at the start of the input string
722 // (pattern begins with ^ or \A)
723 if (startPos
> fActiveStart
) {
727 MatchAt(startPos
, FALSE
, status
);
728 if (U_FAILURE(status
)) {
736 // Match may start on any char from a pre-computed set.
737 U_ASSERT(fPattern
->fMinMatchLen
> 0);
738 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
740 int64_t pos
= startPos
;
741 c
= UTEXT_NEXT32(fInputText
);
742 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
743 // c will be -1 (U_SENTINEL) at end of text, in which case we
744 // skip this next block (so we don't have a negative array index)
745 // and handle end of text in the following block.
746 if (c
>= 0 && ((c
<256 && fPattern
->fInitialChars8
->contains(c
)) ||
747 (c
>=256 && fPattern
->fInitialChars
->contains(c
)))) {
748 MatchAt(pos
, FALSE
, status
);
749 if (U_FAILURE(status
)) {
755 UTEXT_SETNATIVEINDEX(fInputText
, pos
);
757 if (startPos
> testStartLimit
) {
762 if (findProgressInterrupt(startPos
, status
))
771 // Match starts on exactly one char.
772 U_ASSERT(fPattern
->fMinMatchLen
> 0);
773 UChar32 theChar
= fPattern
->fInitialChar
;
774 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
776 int64_t pos
= startPos
;
777 c
= UTEXT_NEXT32(fInputText
);
778 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
780 MatchAt(pos
, FALSE
, status
);
781 if (U_FAILURE(status
)) {
787 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
789 if (startPos
> testStartLimit
) {
794 if (findProgressInterrupt(startPos
, status
))
803 if (startPos
== fAnchorStart
) {
804 MatchAt(startPos
, FALSE
, status
);
805 if (U_FAILURE(status
)) {
811 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
812 c
= UTEXT_NEXT32(fInputText
);
813 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
815 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
816 c
= UTEXT_PREVIOUS32(fInputText
);
817 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
820 if (fPattern
->fFlags
& UREGEX_UNIX_LINES
) {
823 MatchAt(startPos
, FALSE
, status
);
824 if (U_FAILURE(status
)) {
830 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
832 if (startPos
>= testStartLimit
) {
837 c
= UTEXT_NEXT32(fInputText
);
838 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
839 // Note that it's perfectly OK for a pattern to have a zero-length
840 // match at the end of a string, so we must make sure that the loop
841 // runs with startPos == testStartLimit the last time through.
842 if (findProgressInterrupt(startPos
, status
))
847 if (isLineTerminator(c
)) {
848 if (c
== 0x0d && startPos
< fActiveLimit
&& UTEXT_CURRENT32(fInputText
) == 0x0a) {
849 (void)UTEXT_NEXT32(fInputText
);
850 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
852 MatchAt(startPos
, FALSE
, status
);
853 if (U_FAILURE(status
)) {
859 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
861 if (startPos
>= testStartLimit
) {
866 c
= UTEXT_NEXT32(fInputText
);
867 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
868 // Note that it's perfectly OK for a pattern to have a zero-length
869 // match at the end of a string, so we must make sure that the loop
870 // runs with startPos == testStartLimit the last time through.
871 if (findProgressInterrupt(startPos
, status
))
887 UBool
RegexMatcher::find(int64_t start
, UErrorCode
&status
) {
888 if (U_FAILURE(status
)) {
891 if (U_FAILURE(fDeferredStatus
)) {
892 status
= fDeferredStatus
;
895 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
896 // This will reset the region to be the full input length.
898 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
902 int64_t nativeStart
= start
;
903 if (nativeStart
< fActiveStart
|| nativeStart
> fActiveLimit
) {
904 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
907 fMatchEnd
= nativeStart
;
912 //--------------------------------------------------------------------------------
914 // findUsingChunk() -- like find(), but with the advance knowledge that the
915 // entire string is available in the UText's chunk buffer.
917 //--------------------------------------------------------------------------------
918 UBool
RegexMatcher::findUsingChunk(UErrorCode
&status
) {
919 // Start at the position of the last match end. (Will be zero if the
920 // matcher has been reset.)
923 int32_t startPos
= (int32_t)fMatchEnd
;
925 startPos
= (int32_t)fActiveStart
;
928 const UChar
*inputBuf
= fInputText
->chunkContents
;
931 // Save the position of any previous successful match.
932 fLastMatchEnd
= fMatchEnd
;
934 if (fMatchStart
== fMatchEnd
) {
935 // Previous match had zero length. Move start position up one position
936 // to avoid sending find() into a loop on zero-length matches.
937 if (startPos
>= fActiveLimit
) {
942 U16_FWD_1(inputBuf
, startPos
, fInputLength
);
945 if (fLastMatchEnd
>= 0) {
946 // A previous find() failed to match. Don't try again.
947 // (without this test, a pattern with a zero-length match
948 // could match again at the end of an input string.)
955 // Compute the position in the input string beyond which a match can not begin, because
956 // the minimum length match would extend past the end of the input.
957 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
958 // Be aware of possible overflows if making changes here.
959 // Note: a match can begin at inputBuf + testLen; it is an inclusive limit.
960 int32_t testLen
= (int32_t)(fActiveLimit
- fPattern
->fMinMatchLen
);
961 if (startPos
> testLen
) {
968 U_ASSERT(startPos
>= 0);
970 switch (fPattern
->fStartType
) {
972 // No optimization was found.
973 // Try a match at each input position.
975 MatchChunkAt(startPos
, FALSE
, status
);
976 if (U_FAILURE(status
)) {
982 if (startPos
>= testLen
) {
986 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
987 // Note that it's perfectly OK for a pattern to have a zero-length
988 // match at the end of a string, so we must make sure that the loop
989 // runs with startPos == testLen the last time through.
990 if (findProgressInterrupt(startPos
, status
))
996 // Matches are only possible at the start of the input string
997 // (pattern begins with ^ or \A)
998 if (startPos
> fActiveStart
) {
1002 MatchChunkAt(startPos
, FALSE
, status
);
1003 if (U_FAILURE(status
)) {
1011 // Match may start on any char from a pre-computed set.
1012 U_ASSERT(fPattern
->fMinMatchLen
> 0);
1014 int32_t pos
= startPos
;
1015 U16_NEXT(inputBuf
, startPos
, fActiveLimit
, c
); // like c = inputBuf[startPos++];
1016 if ((c
<256 && fPattern
->fInitialChars8
->contains(c
)) ||
1017 (c
>=256 && fPattern
->fInitialChars
->contains(c
))) {
1018 MatchChunkAt(pos
, FALSE
, status
);
1019 if (U_FAILURE(status
)) {
1026 if (startPos
> testLen
) {
1031 if (findProgressInterrupt(startPos
, status
))
1040 // Match starts on exactly one char.
1041 U_ASSERT(fPattern
->fMinMatchLen
> 0);
1042 UChar32 theChar
= fPattern
->fInitialChar
;
1044 int32_t pos
= startPos
;
1045 U16_NEXT(inputBuf
, startPos
, fActiveLimit
, c
); // like c = inputBuf[startPos++];
1047 MatchChunkAt(pos
, FALSE
, status
);
1048 if (U_FAILURE(status
)) {
1055 if (startPos
> testLen
) {
1060 if (findProgressInterrupt(startPos
, status
))
1069 if (startPos
== fAnchorStart
) {
1070 MatchChunkAt(startPos
, FALSE
, status
);
1071 if (U_FAILURE(status
)) {
1077 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
1080 if (fPattern
->fFlags
& UREGEX_UNIX_LINES
) {
1082 c
= inputBuf
[startPos
-1];
1084 MatchChunkAt(startPos
, FALSE
, status
);
1085 if (U_FAILURE(status
)) {
1092 if (startPos
>= testLen
) {
1097 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
1098 // Note that it's perfectly OK for a pattern to have a zero-length
1099 // match at the end of a string, so we must make sure that the loop
1100 // runs with startPos == testLen the last time through.
1101 if (findProgressInterrupt(startPos
, status
))
1106 c
= inputBuf
[startPos
-1];
1107 if (isLineTerminator(c
)) {
1108 if (c
== 0x0d && startPos
< fActiveLimit
&& inputBuf
[startPos
] == 0x0a) {
1111 MatchChunkAt(startPos
, FALSE
, status
);
1112 if (U_FAILURE(status
)) {
1119 if (startPos
>= testLen
) {
1124 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
1125 // Note that it's perfectly OK for a pattern to have a zero-length
1126 // match at the end of a string, so we must make sure that the loop
1127 // runs with startPos == testLen the last time through.
1128 if (findProgressInterrupt(startPos
, status
))
1144 //--------------------------------------------------------------------------------
1148 //--------------------------------------------------------------------------------
1149 UnicodeString
RegexMatcher::group(UErrorCode
&status
) const {
1150 return group(0, status
);
1153 // Return immutable shallow clone
1154 UText
*RegexMatcher::group(UText
*dest
, int64_t &group_len
, UErrorCode
&status
) const {
1155 return group(0, dest
, group_len
, status
);
1158 // Return immutable shallow clone
1159 UText
*RegexMatcher::group(int32_t groupNum
, UText
*dest
, int64_t &group_len
, UErrorCode
&status
) const {
1161 if (U_FAILURE(status
)) {
1164 if (U_FAILURE(fDeferredStatus
)) {
1165 status
= fDeferredStatus
;
1166 } else if (fMatch
== FALSE
) {
1167 status
= U_REGEX_INVALID_STATE
;
1168 } else if (groupNum
< 0 || groupNum
> fPattern
->fGroupMap
->size()) {
1169 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1172 if (U_FAILURE(status
)) {
1177 if (groupNum
== 0) {
1181 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(groupNum
-1);
1182 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
1183 U_ASSERT(groupOffset
>= 0);
1184 s
= fFrame
->fExtra
[groupOffset
];
1185 e
= fFrame
->fExtra
[groupOffset
+1];
1189 // A capture group wasn't part of the match
1190 return utext_clone(dest
, fInputText
, FALSE
, TRUE
, &status
);
1195 dest
= utext_clone(dest
, fInputText
, FALSE
, TRUE
, &status
);
1197 UTEXT_SETNATIVEINDEX(dest
, s
);
1201 UnicodeString
RegexMatcher::group(int32_t groupNum
, UErrorCode
&status
) const {
1202 UnicodeString result
;
1203 int64_t groupStart
= start64(groupNum
, status
);
1204 int64_t groupEnd
= end64(groupNum
, status
);
1205 if (U_FAILURE(status
) || groupStart
== -1 || groupStart
== groupEnd
) {
1209 // Get the group length using a utext_extract preflight.
1210 // UText is actually pretty efficient at this when underlying encoding is UTF-16.
1211 int32_t length
= utext_extract(fInputText
, groupStart
, groupEnd
, NULL
, 0, &status
);
1212 if (status
!= U_BUFFER_OVERFLOW_ERROR
) {
1216 status
= U_ZERO_ERROR
;
1217 UChar
*buf
= result
.getBuffer(length
);
1219 status
= U_MEMORY_ALLOCATION_ERROR
;
1221 int32_t extractLength
= utext_extract(fInputText
, groupStart
, groupEnd
, buf
, length
, &status
);
1222 result
.releaseBuffer(extractLength
);
1223 U_ASSERT(length
== extractLength
);
1229 //--------------------------------------------------------------------------------
1231 // appendGroup() -- currently internal only, appends a group to a UText rather
1232 // than replacing its contents
1234 //--------------------------------------------------------------------------------
1236 int64_t RegexMatcher::appendGroup(int32_t groupNum
, UText
*dest
, UErrorCode
&status
) const {
1237 if (U_FAILURE(status
)) {
1240 if (U_FAILURE(fDeferredStatus
)) {
1241 status
= fDeferredStatus
;
1244 int64_t destLen
= utext_nativeLength(dest
);
1246 if (fMatch
== FALSE
) {
1247 status
= U_REGEX_INVALID_STATE
;
1248 return utext_replace(dest
, destLen
, destLen
, NULL
, 0, &status
);
1250 if (groupNum
< 0 || groupNum
> fPattern
->fGroupMap
->size()) {
1251 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1252 return utext_replace(dest
, destLen
, destLen
, NULL
, 0, &status
);
1256 if (groupNum
== 0) {
1260 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(groupNum
-1);
1261 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
1262 U_ASSERT(groupOffset
>= 0);
1263 s
= fFrame
->fExtra
[groupOffset
];
1264 e
= fFrame
->fExtra
[groupOffset
+1];
1268 // A capture group wasn't part of the match
1269 return utext_replace(dest
, destLen
, destLen
, NULL
, 0, &status
);
1274 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1275 U_ASSERT(e
<= fInputLength
);
1276 deltaLen
= utext_replace(dest
, destLen
, destLen
, fInputText
->chunkContents
+s
, (int32_t)(e
-s
), &status
);
1279 if (UTEXT_USES_U16(fInputText
)) {
1280 len16
= (int32_t)(e
-s
);
1282 UErrorCode lengthStatus
= U_ZERO_ERROR
;
1283 len16
= utext_extract(fInputText
, s
, e
, NULL
, 0, &lengthStatus
);
1285 UChar
*groupChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(len16
+1));
1286 if (groupChars
== NULL
) {
1287 status
= U_MEMORY_ALLOCATION_ERROR
;
1290 utext_extract(fInputText
, s
, e
, groupChars
, len16
+1, &status
);
1292 deltaLen
= utext_replace(dest
, destLen
, destLen
, groupChars
, len16
, &status
);
1293 uprv_free(groupChars
);
1300 //--------------------------------------------------------------------------------
1304 //--------------------------------------------------------------------------------
1305 int32_t RegexMatcher::groupCount() const {
1306 return fPattern
->fGroupMap
->size();
1309 //--------------------------------------------------------------------------------
1311 // hasAnchoringBounds()
1313 //--------------------------------------------------------------------------------
1314 UBool
RegexMatcher::hasAnchoringBounds() const {
1315 return fAnchoringBounds
;
1319 //--------------------------------------------------------------------------------
1321 // hasTransparentBounds()
1323 //--------------------------------------------------------------------------------
1324 UBool
RegexMatcher::hasTransparentBounds() const {
1325 return fTransparentBounds
;
1330 //--------------------------------------------------------------------------------
1334 //--------------------------------------------------------------------------------
1335 UBool
RegexMatcher::hitEnd() const {
1340 //--------------------------------------------------------------------------------
1344 //--------------------------------------------------------------------------------
1345 const UnicodeString
&RegexMatcher::input() const {
1347 UErrorCode status
= U_ZERO_ERROR
;
1349 if (UTEXT_USES_U16(fInputText
)) {
1350 len16
= (int32_t)fInputLength
;
1352 len16
= utext_extract(fInputText
, 0, fInputLength
, NULL
, 0, &status
);
1353 status
= U_ZERO_ERROR
; // overflow, length status
1355 UnicodeString
*result
= new UnicodeString(len16
, 0, 0);
1357 UChar
*inputChars
= result
->getBuffer(len16
);
1358 utext_extract(fInputText
, 0, fInputLength
, inputChars
, len16
, &status
); // unterminated warning
1359 result
->releaseBuffer(len16
);
1361 (*(const UnicodeString
**)&fInput
) = result
; // pointer assignment, rather than operator=
1367 //--------------------------------------------------------------------------------
1371 //--------------------------------------------------------------------------------
1372 UText
*RegexMatcher::inputText() const {
1377 //--------------------------------------------------------------------------------
1379 // getInput() -- like inputText(), but makes a clone or copies into another UText
1381 //--------------------------------------------------------------------------------
1382 UText
*RegexMatcher::getInput (UText
*dest
, UErrorCode
&status
) const {
1383 if (U_FAILURE(status
)) {
1386 if (U_FAILURE(fDeferredStatus
)) {
1387 status
= fDeferredStatus
;
1392 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1393 utext_replace(dest
, 0, utext_nativeLength(dest
), fInputText
->chunkContents
, (int32_t)fInputLength
, &status
);
1396 if (UTEXT_USES_U16(fInputText
)) {
1397 input16Len
= (int32_t)fInputLength
;
1399 UErrorCode lengthStatus
= U_ZERO_ERROR
;
1400 input16Len
= utext_extract(fInputText
, 0, fInputLength
, NULL
, 0, &lengthStatus
); // buffer overflow error
1402 UChar
*inputChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(input16Len
));
1403 if (inputChars
== NULL
) {
1407 status
= U_ZERO_ERROR
;
1408 utext_extract(fInputText
, 0, fInputLength
, inputChars
, input16Len
, &status
); // not terminated warning
1409 status
= U_ZERO_ERROR
;
1410 utext_replace(dest
, 0, utext_nativeLength(dest
), inputChars
, input16Len
, &status
);
1412 uprv_free(inputChars
);
1416 return utext_clone(NULL
, fInputText
, FALSE
, TRUE
, &status
);
1421 static UBool
compat_SyncMutableUTextContents(UText
*ut
);
1422 static UBool
compat_SyncMutableUTextContents(UText
*ut
) {
1423 UBool retVal
= FALSE
;
1425 // In the following test, we're really only interested in whether the UText should switch
1426 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
1427 // will still point to the correct data.
1428 if (utext_nativeLength(ut
) != ut
->nativeIndexingLimit
) {
1429 UnicodeString
*us
=(UnicodeString
*)ut
->context
;
1431 // Update to the latest length.
1432 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1433 int32_t newLength
= us
->length();
1435 // Update the chunk description.
1436 // The buffer may have switched between stack- and heap-based.
1437 ut
->chunkContents
= us
->getBuffer();
1438 ut
->chunkLength
= newLength
;
1439 ut
->chunkNativeLimit
= newLength
;
1440 ut
->nativeIndexingLimit
= newLength
;
1447 //--------------------------------------------------------------------------------
1451 //--------------------------------------------------------------------------------
1452 UBool
RegexMatcher::lookingAt(UErrorCode
&status
) {
1453 if (U_FAILURE(status
)) {
1456 if (U_FAILURE(fDeferredStatus
)) {
1457 status
= fDeferredStatus
;
1461 if (fInputUniStrMaybeMutable
) {
1462 if (compat_SyncMutableUTextContents(fInputText
)) {
1463 fInputLength
= utext_nativeLength(fInputText
);
1468 resetPreserveRegion();
1470 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1471 MatchChunkAt((int32_t)fActiveStart
, FALSE
, status
);
1473 MatchAt(fActiveStart
, FALSE
, status
);
1479 UBool
RegexMatcher::lookingAt(int64_t start
, UErrorCode
&status
) {
1480 if (U_FAILURE(status
)) {
1483 if (U_FAILURE(fDeferredStatus
)) {
1484 status
= fDeferredStatus
;
1490 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1494 if (fInputUniStrMaybeMutable
) {
1495 if (compat_SyncMutableUTextContents(fInputText
)) {
1496 fInputLength
= utext_nativeLength(fInputText
);
1501 int64_t nativeStart
;
1502 nativeStart
= start
;
1503 if (nativeStart
< fActiveStart
|| nativeStart
> fActiveLimit
) {
1504 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1508 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1509 MatchChunkAt((int32_t)nativeStart
, FALSE
, status
);
1511 MatchAt(nativeStart
, FALSE
, status
);
1518 //--------------------------------------------------------------------------------
1522 //--------------------------------------------------------------------------------
1523 UBool
RegexMatcher::matches(UErrorCode
&status
) {
1524 if (U_FAILURE(status
)) {
1527 if (U_FAILURE(fDeferredStatus
)) {
1528 status
= fDeferredStatus
;
1532 if (fInputUniStrMaybeMutable
) {
1533 if (compat_SyncMutableUTextContents(fInputText
)) {
1534 fInputLength
= utext_nativeLength(fInputText
);
1539 resetPreserveRegion();
1542 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1543 MatchChunkAt((int32_t)fActiveStart
, TRUE
, status
);
1545 MatchAt(fActiveStart
, TRUE
, status
);
1551 UBool
RegexMatcher::matches(int64_t start
, UErrorCode
&status
) {
1552 if (U_FAILURE(status
)) {
1555 if (U_FAILURE(fDeferredStatus
)) {
1556 status
= fDeferredStatus
;
1562 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1566 if (fInputUniStrMaybeMutable
) {
1567 if (compat_SyncMutableUTextContents(fInputText
)) {
1568 fInputLength
= utext_nativeLength(fInputText
);
1573 int64_t nativeStart
;
1574 nativeStart
= start
;
1575 if (nativeStart
< fActiveStart
|| nativeStart
> fActiveLimit
) {
1576 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1580 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1581 MatchChunkAt((int32_t)nativeStart
, TRUE
, status
);
1583 MatchAt(nativeStart
, TRUE
, status
);
1590 //--------------------------------------------------------------------------------
1594 //--------------------------------------------------------------------------------
1595 const RegexPattern
&RegexMatcher::pattern() const {
1601 //--------------------------------------------------------------------------------
1605 //--------------------------------------------------------------------------------
1606 RegexMatcher
&RegexMatcher::region(int64_t regionStart
, int64_t regionLimit
, int64_t startIndex
, UErrorCode
&status
) {
1607 if (U_FAILURE(status
)) {
1611 if (regionStart
>regionLimit
|| regionStart
<0 || regionLimit
<0) {
1612 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1615 int64_t nativeStart
= regionStart
;
1616 int64_t nativeLimit
= regionLimit
;
1617 if (nativeStart
> fInputLength
|| nativeLimit
> fInputLength
) {
1618 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1621 if (startIndex
== -1)
1624 resetPreserveRegion();
1626 fRegionStart
= nativeStart
;
1627 fRegionLimit
= nativeLimit
;
1628 fActiveStart
= nativeStart
;
1629 fActiveLimit
= nativeLimit
;
1631 if (startIndex
!= -1) {
1632 if (startIndex
< fActiveStart
|| startIndex
> fActiveLimit
) {
1633 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1635 fMatchEnd
= startIndex
;
1638 if (!fTransparentBounds
) {
1639 fLookStart
= nativeStart
;
1640 fLookLimit
= nativeLimit
;
1642 if (fAnchoringBounds
) {
1643 fAnchorStart
= nativeStart
;
1644 fAnchorLimit
= nativeLimit
;
1649 RegexMatcher
&RegexMatcher::region(int64_t start
, int64_t limit
, UErrorCode
&status
) {
1650 return region(start
, limit
, -1, status
);
1653 //--------------------------------------------------------------------------------
1657 //--------------------------------------------------------------------------------
1658 int32_t RegexMatcher::regionEnd() const {
1659 return (int32_t)fRegionLimit
;
1662 int64_t RegexMatcher::regionEnd64() const {
1663 return fRegionLimit
;
1666 //--------------------------------------------------------------------------------
1670 //--------------------------------------------------------------------------------
1671 int32_t RegexMatcher::regionStart() const {
1672 return (int32_t)fRegionStart
;
1675 int64_t RegexMatcher::regionStart64() const {
1676 return fRegionStart
;
1680 //--------------------------------------------------------------------------------
1684 //--------------------------------------------------------------------------------
1685 UnicodeString
RegexMatcher::replaceAll(const UnicodeString
&replacement
, UErrorCode
&status
) {
1686 UText replacementText
= UTEXT_INITIALIZER
;
1687 UText resultText
= UTEXT_INITIALIZER
;
1688 UnicodeString resultString
;
1689 if (U_FAILURE(status
)) {
1690 return resultString
;
1693 utext_openConstUnicodeString(&replacementText
, &replacement
, &status
);
1694 utext_openUnicodeString(&resultText
, &resultString
, &status
);
1696 replaceAll(&replacementText
, &resultText
, status
);
1698 utext_close(&resultText
);
1699 utext_close(&replacementText
);
1701 return resultString
;
1706 // replaceAll, UText mode
1708 UText
*RegexMatcher::replaceAll(UText
*replacement
, UText
*dest
, UErrorCode
&status
) {
1709 if (U_FAILURE(status
)) {
1712 if (U_FAILURE(fDeferredStatus
)) {
1713 status
= fDeferredStatus
;
1718 UnicodeString emptyString
;
1719 UText empty
= UTEXT_INITIALIZER
;
1721 utext_openUnicodeString(&empty
, &emptyString
, &status
);
1722 dest
= utext_clone(NULL
, &empty
, TRUE
, FALSE
, &status
);
1723 utext_close(&empty
);
1726 if (U_SUCCESS(status
)) {
1729 appendReplacement(dest
, replacement
, status
);
1730 if (U_FAILURE(status
)) {
1734 appendTail(dest
, status
);
1741 //--------------------------------------------------------------------------------
1745 //--------------------------------------------------------------------------------
1746 UnicodeString
RegexMatcher::replaceFirst(const UnicodeString
&replacement
, UErrorCode
&status
) {
1747 UText replacementText
= UTEXT_INITIALIZER
;
1748 UText resultText
= UTEXT_INITIALIZER
;
1749 UnicodeString resultString
;
1751 utext_openConstUnicodeString(&replacementText
, &replacement
, &status
);
1752 utext_openUnicodeString(&resultText
, &resultString
, &status
);
1754 replaceFirst(&replacementText
, &resultText
, status
);
1756 utext_close(&resultText
);
1757 utext_close(&replacementText
);
1759 return resultString
;
1763 // replaceFirst, UText mode
1765 UText
*RegexMatcher::replaceFirst(UText
*replacement
, UText
*dest
, UErrorCode
&status
) {
1766 if (U_FAILURE(status
)) {
1769 if (U_FAILURE(fDeferredStatus
)) {
1770 status
= fDeferredStatus
;
1776 return getInput(dest
, status
);
1780 UnicodeString emptyString
;
1781 UText empty
= UTEXT_INITIALIZER
;
1783 utext_openUnicodeString(&empty
, &emptyString
, &status
);
1784 dest
= utext_clone(NULL
, &empty
, TRUE
, FALSE
, &status
);
1785 utext_close(&empty
);
1788 appendReplacement(dest
, replacement
, status
);
1789 appendTail(dest
, status
);
1795 //--------------------------------------------------------------------------------
1799 //--------------------------------------------------------------------------------
1800 UBool
RegexMatcher::requireEnd() const {
1805 //--------------------------------------------------------------------------------
1809 //--------------------------------------------------------------------------------
1810 RegexMatcher
&RegexMatcher::reset() {
1812 fRegionLimit
= fInputLength
;
1814 fActiveLimit
= fInputLength
;
1816 fAnchorLimit
= fInputLength
;
1818 fLookLimit
= fInputLength
;
1819 resetPreserveRegion();
1825 void RegexMatcher::resetPreserveRegion() {
1829 fAppendPosition
= 0;
1832 fRequireEnd
= FALSE
;
1834 fTickCounter
= TIMER_INITIAL_VALUE
;
1835 //resetStack(); // more expensive than it looks...
1839 RegexMatcher
&RegexMatcher::reset(const UnicodeString
&input
) {
1840 fInputText
= utext_openConstUnicodeString(fInputText
, &input
, &fDeferredStatus
);
1841 if (fPattern
->fNeedsAltInput
) {
1842 fAltInputText
= utext_clone(fAltInputText
, fInputText
, FALSE
, TRUE
, &fDeferredStatus
);
1844 if (U_FAILURE(fDeferredStatus
)) {
1847 fInputLength
= utext_nativeLength(fInputText
);
1853 // Do the following for any UnicodeString.
1854 // This is for compatibility for those clients who modify the input string "live" during regex operations.
1855 fInputUniStrMaybeMutable
= TRUE
;
1857 if (fWordBreakItr
!= NULL
) {
1858 #if UCONFIG_NO_BREAK_ITERATION==0
1859 UErrorCode status
= U_ZERO_ERROR
;
1860 fWordBreakItr
->setText(fInputText
, status
);
1867 RegexMatcher
&RegexMatcher::reset(UText
*input
) {
1868 if (fInputText
!= input
) {
1869 fInputText
= utext_clone(fInputText
, input
, FALSE
, TRUE
, &fDeferredStatus
);
1870 if (fPattern
->fNeedsAltInput
) fAltInputText
= utext_clone(fAltInputText
, fInputText
, FALSE
, TRUE
, &fDeferredStatus
);
1871 if (U_FAILURE(fDeferredStatus
)) {
1874 fInputLength
= utext_nativeLength(fInputText
);
1879 if (fWordBreakItr
!= NULL
) {
1880 #if UCONFIG_NO_BREAK_ITERATION==0
1881 UErrorCode status
= U_ZERO_ERROR
;
1882 fWordBreakItr
->setText(input
, status
);
1887 fInputUniStrMaybeMutable
= FALSE
;
1892 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1893 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1897 RegexMatcher
&RegexMatcher::reset(int64_t position
, UErrorCode
&status
) {
1898 if (U_FAILURE(status
)) {
1901 reset(); // Reset also resets the region to be the entire string.
1903 if (position
< 0 || position
> fActiveLimit
) {
1904 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1907 fMatchEnd
= position
;
1912 //--------------------------------------------------------------------------------
1916 //--------------------------------------------------------------------------------
1917 RegexMatcher
&RegexMatcher::refreshInputText(UText
*input
, UErrorCode
&status
) {
1918 if (U_FAILURE(status
)) {
1921 if (input
== NULL
) {
1922 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1925 if (utext_nativeLength(fInputText
) != utext_nativeLength(input
)) {
1926 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1929 int64_t pos
= utext_getNativeIndex(fInputText
);
1930 // Shallow read-only clone of the new UText into the existing input UText
1931 fInputText
= utext_clone(fInputText
, input
, FALSE
, TRUE
, &status
);
1932 if (U_FAILURE(status
)) {
1935 utext_setNativeIndex(fInputText
, pos
);
1937 if (fAltInputText
!= NULL
) {
1938 pos
= utext_getNativeIndex(fAltInputText
);
1939 fAltInputText
= utext_clone(fAltInputText
, input
, FALSE
, TRUE
, &status
);
1940 if (U_FAILURE(status
)) {
1943 utext_setNativeIndex(fAltInputText
, pos
);
1950 //--------------------------------------------------------------------------------
1954 //--------------------------------------------------------------------------------
1955 void RegexMatcher::setTrace(UBool state
) {
1956 fTraceDebug
= state
;
1962 * UText, replace entire contents of the destination UText with a substring of the source UText.
1964 * @param src The source UText
1965 * @param dest The destination UText. Must be writable.
1966 * May be NULL, in which case a new UText will be allocated.
1967 * @param start Start index of source substring.
1968 * @param limit Limit index of source substring.
1969 * @param status An error code.
1971 static UText
*utext_extract_replace(UText
*src
, UText
*dest
, int64_t start
, int64_t limit
, UErrorCode
*status
) {
1972 if (U_FAILURE(*status
)) {
1975 if (start
== limit
) {
1977 utext_replace(dest
, 0, utext_nativeLength(dest
), NULL
, 0, status
);
1980 return utext_openUChars(NULL
, NULL
, 0, status
);
1983 int32_t length
= utext_extract(src
, start
, limit
, NULL
, 0, status
);
1984 if (*status
!= U_BUFFER_OVERFLOW_ERROR
&& U_FAILURE(*status
)) {
1987 *status
= U_ZERO_ERROR
;
1988 MaybeStackArray
<UChar
, 40> buffer
;
1989 if (length
>= buffer
.getCapacity()) {
1990 UChar
*newBuf
= buffer
.resize(length
+1); // Leave space for terminating Nul.
1991 if (newBuf
== NULL
) {
1992 *status
= U_MEMORY_ALLOCATION_ERROR
;
1995 utext_extract(src
, start
, limit
, buffer
.getAlias(), length
+1, status
);
1997 utext_replace(dest
, 0, utext_nativeLength(dest
), buffer
.getAlias(), length
, status
);
2001 // Caller did not provide a prexisting UText.
2002 // Open a new one, and have it adopt the text buffer storage.
2003 if (U_FAILURE(*status
)) {
2006 int32_t ownedLength
= 0;
2007 UChar
*ownedBuf
= buffer
.orphanOrClone(length
+1, ownedLength
);
2008 if (ownedBuf
== NULL
) {
2009 *status
= U_MEMORY_ALLOCATION_ERROR
;
2012 UText
*result
= utext_openUChars(NULL
, ownedBuf
, length
, status
);
2013 if (U_FAILURE(*status
)) {
2014 uprv_free(ownedBuf
);
2017 result
->providerProperties
|= (1 << UTEXT_PROVIDER_OWNS_TEXT
);
2022 //---------------------------------------------------------------------
2026 //---------------------------------------------------------------------
2027 int32_t RegexMatcher::split(const UnicodeString
&input
,
2028 UnicodeString dest
[],
2029 int32_t destCapacity
,
2032 UText inputText
= UTEXT_INITIALIZER
;
2033 utext_openConstUnicodeString(&inputText
, &input
, &status
);
2034 if (U_FAILURE(status
)) {
2038 UText
**destText
= (UText
**)uprv_malloc(sizeof(UText
*)*destCapacity
);
2039 if (destText
== NULL
) {
2040 status
= U_MEMORY_ALLOCATION_ERROR
;
2044 for (i
= 0; i
< destCapacity
; i
++) {
2045 destText
[i
] = utext_openUnicodeString(NULL
, &dest
[i
], &status
);
2048 int32_t fieldCount
= split(&inputText
, destText
, destCapacity
, status
);
2050 for (i
= 0; i
< destCapacity
; i
++) {
2051 utext_close(destText
[i
]);
2054 uprv_free(destText
);
2055 utext_close(&inputText
);
2060 // split, UText mode
2062 int32_t RegexMatcher::split(UText
*input
,
2064 int32_t destCapacity
,
2068 // Check arguements for validity
2070 if (U_FAILURE(status
)) {
2074 if (destCapacity
< 1) {
2075 status
= U_ILLEGAL_ARGUMENT_ERROR
;
2080 // Reset for the input text
2083 int64_t nextOutputStringStart
= 0;
2084 if (fActiveLimit
== 0) {
2089 // Loop through the input text, searching for the delimiter pattern
2092 int32_t numCaptureGroups
= fPattern
->fGroupMap
->size();
2094 if (i
>=destCapacity
-1) {
2095 // There is one or zero output string left.
2096 // Fill the last output string with whatever is left from the input, then exit the loop.
2097 // ( i will be == destCapacity if we filled the output array while processing
2098 // capture groups of the delimiter expression, in which case we will discard the
2099 // last capture group saved in favor of the unprocessed remainder of the
2102 if (fActiveLimit
> nextOutputStringStart
) {
2103 if (UTEXT_FULL_TEXT_IN_CHUNK(input
, fInputLength
)) {
2105 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]),
2106 input
->chunkContents
+nextOutputStringStart
,
2107 (int32_t)(fActiveLimit
-nextOutputStringStart
), &status
);
2109 UText remainingText
= UTEXT_INITIALIZER
;
2110 utext_openUChars(&remainingText
, input
->chunkContents
+nextOutputStringStart
,
2111 fActiveLimit
-nextOutputStringStart
, &status
);
2112 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2113 utext_close(&remainingText
);
2116 UErrorCode lengthStatus
= U_ZERO_ERROR
;
2117 int32_t remaining16Length
=
2118 utext_extract(input
, nextOutputStringStart
, fActiveLimit
, NULL
, 0, &lengthStatus
);
2119 UChar
*remainingChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(remaining16Length
+1));
2120 if (remainingChars
== NULL
) {
2121 status
= U_MEMORY_ALLOCATION_ERROR
;
2125 utext_extract(input
, nextOutputStringStart
, fActiveLimit
, remainingChars
, remaining16Length
+1, &status
);
2127 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), remainingChars
, remaining16Length
, &status
);
2129 UText remainingText
= UTEXT_INITIALIZER
;
2130 utext_openUChars(&remainingText
, remainingChars
, remaining16Length
, &status
);
2131 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2132 utext_close(&remainingText
);
2135 uprv_free(remainingChars
);
2141 // We found another delimiter. Move everything from where we started looking
2142 // up until the start of the delimiter into the next output string.
2143 if (UTEXT_FULL_TEXT_IN_CHUNK(input
, fInputLength
)) {
2145 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]),
2146 input
->chunkContents
+nextOutputStringStart
,
2147 (int32_t)(fMatchStart
-nextOutputStringStart
), &status
);
2149 UText remainingText
= UTEXT_INITIALIZER
;
2150 utext_openUChars(&remainingText
, input
->chunkContents
+nextOutputStringStart
,
2151 fMatchStart
-nextOutputStringStart
, &status
);
2152 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2153 utext_close(&remainingText
);
2156 UErrorCode lengthStatus
= U_ZERO_ERROR
;
2157 int32_t remaining16Length
= utext_extract(input
, nextOutputStringStart
, fMatchStart
, NULL
, 0, &lengthStatus
);
2158 UChar
*remainingChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(remaining16Length
+1));
2159 if (remainingChars
== NULL
) {
2160 status
= U_MEMORY_ALLOCATION_ERROR
;
2163 utext_extract(input
, nextOutputStringStart
, fMatchStart
, remainingChars
, remaining16Length
+1, &status
);
2165 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), remainingChars
, remaining16Length
, &status
);
2167 UText remainingText
= UTEXT_INITIALIZER
;
2168 utext_openUChars(&remainingText
, remainingChars
, remaining16Length
, &status
);
2169 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2170 utext_close(&remainingText
);
2173 uprv_free(remainingChars
);
2175 nextOutputStringStart
= fMatchEnd
;
2177 // If the delimiter pattern has capturing parentheses, the captured
2178 // text goes out into the next n destination strings.
2180 for (groupNum
=1; groupNum
<=numCaptureGroups
; groupNum
++) {
2181 if (i
>= destCapacity
-2) {
2182 // Never fill the last available output string with capture group text.
2183 // It will filled with the last field, the remainder of the
2184 // unsplit input text.
2188 dest
[i
] = utext_extract_replace(fInputText
, dest
[i
],
2189 start64(groupNum
, status
), end64(groupNum
, status
), &status
);
2192 if (nextOutputStringStart
== fActiveLimit
) {
2193 // The delimiter was at the end of the string. We're done, but first
2194 // we output one last empty string, for the empty field following
2195 // the delimiter at the end of input.
2196 if (i
+1 < destCapacity
) {
2198 if (dest
[i
] == NULL
) {
2199 dest
[i
] = utext_openUChars(NULL
, NULL
, 0, &status
);
2201 static UChar emptyString
[] = {(UChar
)0};
2202 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), emptyString
, 0, &status
);
2211 // We ran off the end of the input while looking for the next delimiter.
2212 // All the remaining text goes into the current output string.
2213 if (UTEXT_FULL_TEXT_IN_CHUNK(input
, fInputLength
)) {
2215 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]),
2216 input
->chunkContents
+nextOutputStringStart
,
2217 (int32_t)(fActiveLimit
-nextOutputStringStart
), &status
);
2219 UText remainingText
= UTEXT_INITIALIZER
;
2220 utext_openUChars(&remainingText
, input
->chunkContents
+nextOutputStringStart
,
2221 fActiveLimit
-nextOutputStringStart
, &status
);
2222 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2223 utext_close(&remainingText
);
2226 UErrorCode lengthStatus
= U_ZERO_ERROR
;
2227 int32_t remaining16Length
= utext_extract(input
, nextOutputStringStart
, fActiveLimit
, NULL
, 0, &lengthStatus
);
2228 UChar
*remainingChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(remaining16Length
+1));
2229 if (remainingChars
== NULL
) {
2230 status
= U_MEMORY_ALLOCATION_ERROR
;
2234 utext_extract(input
, nextOutputStringStart
, fActiveLimit
, remainingChars
, remaining16Length
+1, &status
);
2236 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), remainingChars
, remaining16Length
, &status
);
2238 UText remainingText
= UTEXT_INITIALIZER
;
2239 utext_openUChars(&remainingText
, remainingChars
, remaining16Length
, &status
);
2240 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2241 utext_close(&remainingText
);
2244 uprv_free(remainingChars
);
2248 if (U_FAILURE(status
)) {
2251 } // end of for loop
2256 //--------------------------------------------------------------------------------
2260 //--------------------------------------------------------------------------------
2261 int32_t RegexMatcher::start(UErrorCode
&status
) const {
2262 return start(0, status
);
2265 int64_t RegexMatcher::start64(UErrorCode
&status
) const {
2266 return start64(0, status
);
2269 //--------------------------------------------------------------------------------
2271 // start(int32_t group, UErrorCode &status)
2273 //--------------------------------------------------------------------------------
2275 int64_t RegexMatcher::start64(int32_t group
, UErrorCode
&status
) const {
2276 if (U_FAILURE(status
)) {
2279 if (U_FAILURE(fDeferredStatus
)) {
2280 status
= fDeferredStatus
;
2283 if (fMatch
== FALSE
) {
2284 status
= U_REGEX_INVALID_STATE
;
2287 if (group
< 0 || group
> fPattern
->fGroupMap
->size()) {
2288 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
2295 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(group
-1);
2296 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
2297 U_ASSERT(groupOffset
>= 0);
2298 s
= fFrame
->fExtra
[groupOffset
];
2305 int32_t RegexMatcher::start(int32_t group
, UErrorCode
&status
) const {
2306 return (int32_t)start64(group
, status
);
2309 //--------------------------------------------------------------------------------
2311 // useAnchoringBounds
2313 //--------------------------------------------------------------------------------
2314 RegexMatcher
&RegexMatcher::useAnchoringBounds(UBool b
) {
2315 fAnchoringBounds
= b
;
2316 fAnchorStart
= (fAnchoringBounds
? fRegionStart
: 0);
2317 fAnchorLimit
= (fAnchoringBounds
? fRegionLimit
: fInputLength
);
2322 //--------------------------------------------------------------------------------
2324 // useTransparentBounds
2326 //--------------------------------------------------------------------------------
2327 RegexMatcher
&RegexMatcher::useTransparentBounds(UBool b
) {
2328 fTransparentBounds
= b
;
2329 fLookStart
= (fTransparentBounds
? 0 : fRegionStart
);
2330 fLookLimit
= (fTransparentBounds
? fInputLength
: fRegionLimit
);
2334 //--------------------------------------------------------------------------------
2338 //--------------------------------------------------------------------------------
2339 void RegexMatcher::setTimeLimit(int32_t limit
, UErrorCode
&status
) {
2340 if (U_FAILURE(status
)) {
2343 if (U_FAILURE(fDeferredStatus
)) {
2344 status
= fDeferredStatus
;
2348 status
= U_ILLEGAL_ARGUMENT_ERROR
;
2355 //--------------------------------------------------------------------------------
2359 //--------------------------------------------------------------------------------
2360 int32_t RegexMatcher::getTimeLimit() const {
2365 //--------------------------------------------------------------------------------
2369 //--------------------------------------------------------------------------------
2370 void RegexMatcher::setStackLimit(int32_t limit
, UErrorCode
&status
) {
2371 if (U_FAILURE(status
)) {
2374 if (U_FAILURE(fDeferredStatus
)) {
2375 status
= fDeferredStatus
;
2379 status
= U_ILLEGAL_ARGUMENT_ERROR
;
2383 // Reset the matcher. This is needed here in case there is a current match
2384 // whose final stack frame (containing the match results, pointed to by fFrame)
2385 // would be lost by resizing to a smaller stack size.
2389 // Unlimited stack expansion
2390 fStack
->setMaxCapacity(0);
2392 // Change the units of the limit from bytes to ints, and bump the size up
2393 // to be big enough to hold at least one stack frame for the pattern,
2394 // if it isn't there already.
2395 int32_t adjustedLimit
= limit
/ sizeof(int32_t);
2396 if (adjustedLimit
< fPattern
->fFrameSize
) {
2397 adjustedLimit
= fPattern
->fFrameSize
;
2399 fStack
->setMaxCapacity(adjustedLimit
);
2401 fStackLimit
= limit
;
2405 //--------------------------------------------------------------------------------
2409 //--------------------------------------------------------------------------------
2410 int32_t RegexMatcher::getStackLimit() const {
2415 //--------------------------------------------------------------------------------
2419 //--------------------------------------------------------------------------------
2420 void RegexMatcher::setMatchCallback(URegexMatchCallback
*callback
,
2421 const void *context
,
2422 UErrorCode
&status
) {
2423 if (U_FAILURE(status
)) {
2426 fCallbackFn
= callback
;
2427 fCallbackContext
= context
;
2431 //--------------------------------------------------------------------------------
2435 //--------------------------------------------------------------------------------
2436 void RegexMatcher::getMatchCallback(URegexMatchCallback
*&callback
,
2437 const void *&context
,
2438 UErrorCode
&status
) {
2439 if (U_FAILURE(status
)) {
2442 callback
= fCallbackFn
;
2443 context
= fCallbackContext
;
2447 //--------------------------------------------------------------------------------
2451 //--------------------------------------------------------------------------------
2452 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback
*callback
,
2453 const void *context
,
2454 UErrorCode
&status
) {
2455 if (U_FAILURE(status
)) {
2458 fFindProgressCallbackFn
= callback
;
2459 fFindProgressCallbackContext
= context
;
2463 //--------------------------------------------------------------------------------
2467 //--------------------------------------------------------------------------------
2468 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback
*&callback
,
2469 const void *&context
,
2470 UErrorCode
&status
) {
2471 if (U_FAILURE(status
)) {
2474 callback
= fFindProgressCallbackFn
;
2475 context
= fFindProgressCallbackContext
;
2479 //================================================================================
2481 // Code following this point in this file is the internal
2482 // Match Engine Implementation.
2484 //================================================================================
2487 //--------------------------------------------------------------------------------
2490 // Discard any previous contents of the state save stack, and initialize a
2491 // new stack frame to all -1. The -1s are needed for capture group limits,
2492 // where they indicate that a group has not yet matched anything.
2493 //--------------------------------------------------------------------------------
2494 REStackFrame
*RegexMatcher::resetStack() {
2495 // Discard any previous contents of the state save stack, and initialize a
2496 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2497 // where they indicate that a group has not yet matched anything.
2498 fStack
->removeAllElements();
2500 REStackFrame
*iFrame
= (REStackFrame
*)fStack
->reserveBlock(fPattern
->fFrameSize
, fDeferredStatus
);
2501 if(U_FAILURE(fDeferredStatus
)) {
2506 for (i
=0; i
<fPattern
->fFrameSize
-RESTACKFRAME_HDRCOUNT
; i
++) {
2507 iFrame
->fExtra
[i
] = -1;
2514 //--------------------------------------------------------------------------------
2517 // in perl, "xab..cd..", \b is true at positions 0,3,5,7
2519 // If the current char is a combining mark,
2521 // Else Scan backwards to the first non-combining char.
//          We are at a boundary if this char and the original char are
//             opposite in membership in the \w set
2525 // parameters: pos - the current position in the input buffer
2527 // TODO: double-check edge cases at region boundaries.
2529 //--------------------------------------------------------------------------------
2530 UBool
RegexMatcher::isWordBoundary(int64_t pos
) {
2531 UBool isBoundary
= FALSE
;
2532 UBool cIsWord
= FALSE
;
2534 if (pos
>= fLookLimit
) {
2537 // Determine whether char c at current position is a member of the word set of chars.
2538 // If we're off the end of the string, behave as though we're not at a word char.
2539 UTEXT_SETNATIVEINDEX(fInputText
, pos
);
2540 UChar32 c
= UTEXT_CURRENT32(fInputText
);
2541 if (u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
) || u_charType(c
) == U_FORMAT_CHAR
) {
2542 // Current char is a combining one. Not a boundary.
2545 cIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(c
);
2548 // Back up until we come to a non-combining char, determine whether
2549 // that char is a word char.
2550 UBool prevCIsWord
= FALSE
;
2552 if (UTEXT_GETNATIVEINDEX(fInputText
) <= fLookStart
) {
2555 UChar32 prevChar
= UTEXT_PREVIOUS32(fInputText
);
2556 if (!(u_hasBinaryProperty(prevChar
, UCHAR_GRAPHEME_EXTEND
)
2557 || u_charType(prevChar
) == U_FORMAT_CHAR
)) {
2558 prevCIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(prevChar
);
2562 isBoundary
= cIsWord
^ prevCIsWord
;
2566 UBool
RegexMatcher::isChunkWordBoundary(int32_t pos
) {
2567 UBool isBoundary
= FALSE
;
2568 UBool cIsWord
= FALSE
;
2570 const UChar
*inputBuf
= fInputText
->chunkContents
;
2572 if (pos
>= fLookLimit
) {
2575 // Determine whether char c at current position is a member of the word set of chars.
2576 // If we're off the end of the string, behave as though we're not at a word char.
2578 U16_GET(inputBuf
, fLookStart
, pos
, fLookLimit
, c
);
2579 if (u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
) || u_charType(c
) == U_FORMAT_CHAR
) {
2580 // Current char is a combining one. Not a boundary.
2583 cIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(c
);
2586 // Back up until we come to a non-combining char, determine whether
2587 // that char is a word char.
2588 UBool prevCIsWord
= FALSE
;
2590 if (pos
<= fLookStart
) {
2594 U16_PREV(inputBuf
, fLookStart
, pos
, prevChar
);
2595 if (!(u_hasBinaryProperty(prevChar
, UCHAR_GRAPHEME_EXTEND
)
2596 || u_charType(prevChar
) == U_FORMAT_CHAR
)) {
2597 prevCIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(prevChar
);
2601 isBoundary
= cIsWord
^ prevCIsWord
;
2605 //--------------------------------------------------------------------------------
2609 // Test for a word boundary using RBBI word break.
2611 // parameters: pos - the current position in the input buffer
2613 //--------------------------------------------------------------------------------
2614 UBool
RegexMatcher::isUWordBoundary(int64_t pos
) {
2615 UBool returnVal
= FALSE
;
2616 #if UCONFIG_NO_BREAK_ITERATION==0
2618 // If we haven't yet created a break iterator for this matcher, do it now.
2619 if (fWordBreakItr
== NULL
) {
2621 (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus
);
2622 if (U_FAILURE(fDeferredStatus
)) {
2625 fWordBreakItr
->setText(fInputText
, fDeferredStatus
);
2628 if (pos
>= fLookLimit
) {
2630 returnVal
= TRUE
; // With Unicode word rules, only positions within the interior of "real"
2631 // words are not boundaries. All non-word chars stand by themselves,
2632 // with word boundaries on both sides.
2634 if (!UTEXT_USES_U16(fInputText
)) {
2635 // !!!: Would like a better way to do this!
2636 UErrorCode status
= U_ZERO_ERROR
;
2637 pos
= utext_extract(fInputText
, 0, pos
, NULL
, 0, &status
);
2639 returnVal
= fWordBreakItr
->isBoundary((int32_t)pos
);
2645 //--------------------------------------------------------------------------------
2647 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state
2648 // saves. Increment the "time" counter, and call the
2649 // user callback function if there is one installed.
2651 // If the match operation needs to be aborted, either for a time-out
2652 // or because the user callback asked for it, just set an error status.
2653 // The engine will pick that up and stop in its outer loop.
2655 //--------------------------------------------------------------------------------
2656 void RegexMatcher::IncrementTime(UErrorCode
&status
) {
2657 fTickCounter
= TIMER_INITIAL_VALUE
;
2659 if (fCallbackFn
!= NULL
) {
2660 if ((*fCallbackFn
)(fCallbackContext
, fTime
) == FALSE
) {
2661 status
= U_REGEX_STOPPED_BY_CALLER
;
2665 if (fTimeLimit
> 0 && fTime
>= fTimeLimit
) {
2666 status
= U_REGEX_TIME_OUT
;
2670 //--------------------------------------------------------------------------------
2673 // Make a new stack frame, initialized as a copy of the current stack frame.
2674 // Set the pattern index in the original stack frame from the operand value
2675 // in the opcode. Execution of the engine continues with the state in
2676 // the newly created stack frame
2678 // Note that reserveBlock() may grow the stack, resulting in the
2679 // whole thing being relocated in memory.
//      fp           The top frame pointer when called.  At return, a new
//                   frame will be present
2684 // savePatIdx An index into the compiled pattern. Goes into the original
2685 // (not new) frame. If execution ever back-tracks out of the
2686 // new frame, this will be where we continue from in the pattern.
2688 // The new frame pointer.
2690 //--------------------------------------------------------------------------------
2691 inline REStackFrame
*RegexMatcher::StateSave(REStackFrame
*fp
, int64_t savePatIdx
, UErrorCode
&status
) {
2692 if (U_FAILURE(status
)) {
2695 // push storage for a new frame.
2696 int64_t *newFP
= fStack
->reserveBlock(fFrameSize
, status
);
2697 if (U_FAILURE(status
)) {
2698 // Failure on attempted stack expansion.
2699 // Stack function set some other error code, change it to a more
2700 // specific one for regular expressions.
2701 status
= U_REGEX_STACK_OVERFLOW
;
2702 // We need to return a writable stack frame, so just return the
2703 // previous frame. The match operation will stop quickly
2704 // because of the error status, after which the frame will never
2705 // be looked at again.
2708 fp
= (REStackFrame
*)(newFP
- fFrameSize
); // in case of realloc of stack.
2710 // New stack frame = copy of old top frame.
2711 int64_t *source
= (int64_t *)fp
;
2712 int64_t *dest
= newFP
;
2714 *dest
++ = *source
++;
2715 if (source
== newFP
) {
2721 if (fTickCounter
<= 0) {
2722 IncrementTime(status
); // Re-initializes fTickCounter
2724 fp
->fPatIdx
= savePatIdx
;
2725 return (REStackFrame
*)newFP
;
#if defined(REGEX_DEBUG)
namespace {
// Debug-build helper: collects the code points of a UText, starting from
// native index 0 and stopping at the first U_SENTINEL, into a UnicodeString.
// Moves the iteration position of ut.
UnicodeString StringFromUText(UText *ut) {
    UnicodeString text;
    UChar32 ch = utext_next32From(ut, 0);
    while (ch != U_SENTINEL) {
        text.append(ch);
        ch = UTEXT_NEXT32(ut);
    }
    return text;
}
}
#endif // REGEX_DEBUG
2741 //--------------------------------------------------------------------------------
2743 // MatchAt This is the actual matching engine.
//       startIdx:    begin matching at this index.
2746 // toEnd: if true, match must extend to end of the input region
2748 //--------------------------------------------------------------------------------
2749 void RegexMatcher::MatchAt(int64_t startIdx
, UBool toEnd
, UErrorCode
&status
) {
2750 UBool isMatch
= FALSE
; // True if the we have a match.
2752 int64_t backSearchIndex
= U_INT64_MAX
; // used after greedy single-character matches for searching backwards
2754 int32_t op
; // Operation from the compiled pattern, split into
2755 int32_t opType
; // the opcode
2756 int32_t opValue
; // and the operand value.
2758 #ifdef REGEX_RUN_DEBUG
2760 printf("MatchAt(startIdx=%ld)\n", startIdx
);
2761 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern
->fPattern
))());
2762 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText
))());
2766 if (U_FAILURE(status
)) {
2770 // Cache frequently referenced items from the compiled pattern
2772 int64_t *pat
= fPattern
->fCompiledPat
->getBuffer();
2774 const UChar
*litText
= fPattern
->fLiteralText
.getBuffer();
2775 UVector
*sets
= fPattern
->fSets
;
2777 fFrameSize
= fPattern
->fFrameSize
;
2778 REStackFrame
*fp
= resetStack();
2779 if (U_FAILURE(fDeferredStatus
)) {
2780 status
= fDeferredStatus
;
2785 fp
->fInputIdx
= startIdx
;
2787 // Zero out the pattern's static data
2789 for (i
= 0; i
<fPattern
->fDataSize
; i
++) {
2794 // Main loop for interpreting the compiled pattern.
2795 // One iteration of the loop per pattern operation performed.
2798 op
= (int32_t)pat
[fp
->fPatIdx
];
2799 opType
= URX_TYPE(op
);
2800 opValue
= URX_VAL(op
);
2801 #ifdef REGEX_RUN_DEBUG
2803 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2804 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp
->fInputIdx
,
2805 UTEXT_CURRENT32(fInputText
), (int64_t *)fp
-fStack
->getBuffer(), fActiveLimit
);
2806 fPattern
->dumpOp(fp
->fPatIdx
);
2819 // Force a backtrack. In some circumstances, the pattern compiler
2820 // will notice that the pattern can't possibly match anything, and will
2821 // emit one of these at that point.
2822 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2827 if (fp
->fInputIdx
< fActiveLimit
) {
2828 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2829 UChar32 c
= UTEXT_NEXT32(fInputText
);
2831 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
2837 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2843 // Test input against a literal string.
2844 // Strings require two slots in the compiled pattern, one for the
2845 // offset to the string text, and one for the length.
2847 int32_t stringStartIdx
= opValue
;
2848 op
= (int32_t)pat
[fp
->fPatIdx
]; // Fetch the second operand
2850 opType
= URX_TYPE(op
);
2851 int32_t stringLen
= URX_VAL(op
);
2852 U_ASSERT(opType
== URX_STRING_LEN
);
2853 U_ASSERT(stringLen
>= 2);
2855 const UChar
*patternString
= litText
+stringStartIdx
;
2856 int32_t patternStringIndex
= 0;
2857 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2859 UChar32 patternChar
;
2860 UBool success
= TRUE
;
2861 while (patternStringIndex
< stringLen
) {
2862 if (UTEXT_GETNATIVEINDEX(fInputText
) >= fActiveLimit
) {
2867 inputChar
= UTEXT_NEXT32(fInputText
);
2868 U16_NEXT(patternString
, patternStringIndex
, stringLen
, patternChar
);
2869 if (patternChar
!= inputChar
) {
2876 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
2878 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2884 case URX_STATE_SAVE
:
2885 fp
= StateSave(fp
, opValue
, status
);
2890 // The match loop will exit via this path on a successful match,
2891 // when we reach the end of the pattern.
2892 if (toEnd
&& fp
->fInputIdx
!= fActiveLimit
) {
2893 // The pattern matched, but not to the end of input. Try some more.
2894 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2900 // Start and End Capture stack frame variables are laid out out like this:
2901 // fp->fExtra[opValue] - The start of a completed capture group
2902 // opValue+1 - The end of a completed capture group
2903 // opValue+2 - the start of a capture group whose end
2904 // has not yet been reached (and might not ever be).
2905 case URX_START_CAPTURE
:
2906 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
2907 fp
->fExtra
[opValue
+2] = fp
->fInputIdx
;
2911 case URX_END_CAPTURE
:
2912 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
2913 U_ASSERT(fp
->fExtra
[opValue
+2] >= 0); // Start pos for this group must be set.
2914 fp
->fExtra
[opValue
] = fp
->fExtra
[opValue
+2]; // Tentative start becomes real.
2915 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // End position
2916 U_ASSERT(fp
->fExtra
[opValue
] <= fp
->fExtra
[opValue
+1]);
2920 case URX_DOLLAR
: // $, test for End of line
2921 // or for position before new line at end of input
2923 if (fp
->fInputIdx
>= fAnchorLimit
) {
2924 // We really are at the end of input. Success.
2930 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2932 // If we are positioned just before a new-line that is located at the
2933 // end of input, succeed.
2934 UChar32 c
= UTEXT_NEXT32(fInputText
);
2935 if (UTEXT_GETNATIVEINDEX(fInputText
) >= fAnchorLimit
) {
2936 if (isLineTerminator(c
)) {
2937 // If not in the middle of a CR/LF sequence
2938 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& ((void)UTEXT_PREVIOUS32(fInputText
), UTEXT_PREVIOUS32(fInputText
))==0x0d)) {
2939 // At new-line at end of input. Success
2947 UChar32 nextC
= UTEXT_NEXT32(fInputText
);
2948 if (c
== 0x0d && nextC
== 0x0a && UTEXT_GETNATIVEINDEX(fInputText
) >= fAnchorLimit
) {
2951 break; // At CR/LF at end of input. Success
2955 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2960 case URX_DOLLAR_D
: // $, test for End of Line, in UNIX_LINES mode.
2961 if (fp
->fInputIdx
>= fAnchorLimit
) {
2962 // Off the end of input. Success.
2967 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2968 UChar32 c
= UTEXT_NEXT32(fInputText
);
2969 // Either at the last character of input, or off the end.
2970 if (c
== 0x0a && UTEXT_GETNATIVEINDEX(fInputText
) == fAnchorLimit
) {
2977 // Not at end of input. Back-track out.
2978 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2982 case URX_DOLLAR_M
: // $, test for End of line in multi-line mode
2984 if (fp
->fInputIdx
>= fAnchorLimit
) {
2985 // We really are at the end of input. Success.
2990 // If we are positioned just before a new-line, succeed.
2991 // It makes no difference where the new-line is within the input.
2992 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2993 UChar32 c
= UTEXT_CURRENT32(fInputText
);
2994 if (isLineTerminator(c
)) {
2995 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
2996 // In multi-line mode, hitting a new-line just before the end of input does not
2997 // set the hitEnd or requireEnd flags
2998 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& UTEXT_PREVIOUS32(fInputText
)==0x0d)) {
3002 // not at a new line. Fail.
3003 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3008 case URX_DOLLAR_MD
: // $, test for End of line in multi-line and UNIX_LINES mode
3010 if (fp
->fInputIdx
>= fAnchorLimit
) {
3011 // We really are at the end of input. Success.
3013 fRequireEnd
= TRUE
; // Java set requireEnd in this case, even though
3014 break; // adding a new-line would not lose the match.
3016 // If we are not positioned just before a new-line, the test fails; backtrack out.
3017 // It makes no difference where the new-line is within the input.
3018 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3019 if (UTEXT_CURRENT32(fInputText
) != 0x0a) {
3020 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3026 case URX_CARET
: // ^, test for start of line
3027 if (fp
->fInputIdx
!= fAnchorStart
) {
3028 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3033 case URX_CARET_M
: // ^, test for start of line in mulit-line mode
3035 if (fp
->fInputIdx
== fAnchorStart
) {
3036 // We are at the start input. Success.
3039 // Check whether character just before the current pos is a new-line
3040 // unless we are at the end of input
3041 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3042 UChar32 c
= UTEXT_PREVIOUS32(fInputText
);
3043 if ((fp
->fInputIdx
< fAnchorLimit
) && isLineTerminator(c
)) {
3044 // It's a new-line. ^ is true. Success.
3045 // TODO: what should be done with positions between a CR and LF?
3048 // Not at the start of a line. Fail.
3049 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3054 case URX_CARET_M_UNIX
: // ^, test for start of line in mulit-line + Unix-line mode
3056 U_ASSERT(fp
->fInputIdx
>= fAnchorStart
);
3057 if (fp
->fInputIdx
<= fAnchorStart
) {
3058 // We are at the start input. Success.
3061 // Check whether character just before the current pos is a new-line
3062 U_ASSERT(fp
->fInputIdx
<= fAnchorLimit
);
3063 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3064 UChar32 c
= UTEXT_PREVIOUS32(fInputText
);
3066 // Not at the start of a line. Back-track out.
3067 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3072 case URX_BACKSLASH_B
: // Test for word boundaries
3074 UBool success
= isWordBoundary(fp
->fInputIdx
);
3075 success
^= (UBool
)(opValue
!= 0); // flip sense for \B
3077 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3083 case URX_BACKSLASH_BU
: // Test for word boundaries, Unicode-style
3085 UBool success
= isUWordBoundary(fp
->fInputIdx
);
3086 success
^= (UBool
)(opValue
!= 0); // flip sense for \B
3088 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3094 case URX_BACKSLASH_D
: // Test for decimal digit
3096 if (fp
->fInputIdx
>= fActiveLimit
) {
3098 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3102 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3104 UChar32 c
= UTEXT_NEXT32(fInputText
);
3105 int8_t ctype
= u_charType(c
); // TODO: make a unicode set for this. Will be faster.
3106 UBool success
= (ctype
== U_DECIMAL_DIGIT_NUMBER
);
3107 success
^= (UBool
)(opValue
!= 0); // flip sense for \D
3109 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3111 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3117 case URX_BACKSLASH_G
: // Test for position at end of previous match
3118 if (!((fMatch
&& fp
->fInputIdx
==fMatchEnd
) || (fMatch
==FALSE
&& fp
->fInputIdx
==fActiveStart
))) {
3119 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3124 case URX_BACKSLASH_H
: // Test for \h, horizontal white space.
3126 if (fp
->fInputIdx
>= fActiveLimit
) {
3128 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3131 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3132 UChar32 c
= UTEXT_NEXT32(fInputText
);
3133 int8_t ctype
= u_charType(c
);
3134 UBool success
= (ctype
== U_SPACE_SEPARATOR
|| c
== 9); // SPACE_SEPARATOR || TAB
3135 success
^= (UBool
)(opValue
!= 0); // flip sense for \H
3137 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3139 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3145 case URX_BACKSLASH_R
: // Test for \R, any line break sequence.
3147 if (fp
->fInputIdx
>= fActiveLimit
) {
3149 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3152 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3153 UChar32 c
= UTEXT_NEXT32(fInputText
);
3154 if (isLineTerminator(c
)) {
3155 if (c
== 0x0d && utext_current32(fInputText
) == 0x0a) {
3156 utext_next32(fInputText
);
3158 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3160 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3166 case URX_BACKSLASH_V
: // \v, any single line ending character.
3168 if (fp
->fInputIdx
>= fActiveLimit
) {
3170 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3173 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3174 UChar32 c
= UTEXT_NEXT32(fInputText
);
3175 UBool success
= isLineTerminator(c
);
3176 success
^= (UBool
)(opValue
!= 0); // flip sense for \V
3178 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3180 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3186 case URX_BACKSLASH_X
:
3187 // Match a Grapheme, as defined by Unicode TR 29.
3188 // Differs slightly from Perl, which consumes combining marks independently
3192 // Fail if at end of input
3193 if (fp
->fInputIdx
>= fActiveLimit
) {
3195 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3199 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3201 // Examine (and consume) the current char.
3202 // Dispatch into a little state machine, based on the char.
3204 c
= UTEXT_NEXT32(fInputText
);
3205 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3206 UnicodeSet
**sets
= fPattern
->fStaticSets
;
3207 if (sets
[URX_GC_NORMAL
]->contains(c
)) goto GC_Extend
;
3208 if (sets
[URX_GC_CONTROL
]->contains(c
)) goto GC_Control
;
3209 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
3210 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
3211 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
3212 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
3213 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
3219 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
3220 c
= UTEXT_NEXT32(fInputText
);
3221 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3222 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
3223 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
3224 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
3225 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
3226 (void)UTEXT_PREVIOUS32(fInputText
);
3227 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3231 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
3232 c
= UTEXT_NEXT32(fInputText
);
3233 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3234 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
3235 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
3236 (void)UTEXT_PREVIOUS32(fInputText
);
3237 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3241 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
3242 c
= UTEXT_NEXT32(fInputText
);
3243 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3244 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
3245 (void)UTEXT_PREVIOUS32(fInputText
);
3246 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3250 // Combining characters are consumed here
3252 if (fp
->fInputIdx
>= fActiveLimit
) {
3255 c
= UTEXT_CURRENT32(fInputText
);
3256 if (sets
[URX_GC_EXTEND
]->contains(c
) == FALSE
) {
3259 (void)UTEXT_NEXT32(fInputText
);
3260 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3265 // Most control chars stand alone (don't combine with combining chars),
3266 // except for that CR/LF sequence is a single grapheme cluster.
3267 if (c
== 0x0d && fp
->fInputIdx
< fActiveLimit
&& UTEXT_CURRENT32(fInputText
) == 0x0a) {
3268 c
= UTEXT_NEXT32(fInputText
);
3269 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3273 if (fp
->fInputIdx
>= fActiveLimit
) {
3282 case URX_BACKSLASH_Z
: // Test for end of Input
3283 if (fp
->fInputIdx
< fAnchorLimit
) {
3284 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3293 case URX_STATIC_SETREF
:
3295 // Test input character against one of the predefined sets
3296 // (Word Characters, for example)
3297 // The high bit of the op value is a flag for the match polarity.
3298 // 0: success if input char is in set.
3299 // 1: success if input char is not in set.
3300 if (fp
->fInputIdx
>= fActiveLimit
) {
3302 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3306 UBool success
= ((opValue
& URX_NEG_SET
) == URX_NEG_SET
);
3307 opValue
&= ~URX_NEG_SET
;
3308 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
3310 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3311 UChar32 c
= UTEXT_NEXT32(fInputText
);
3313 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
3314 if (s8
->contains(c
)) {
3318 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
3319 if (s
->contains(c
)) {
3324 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3326 // the character wasn't in the set.
3327 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3333 case URX_STAT_SETREF_N
:
3335 // Test input character for NOT being a member of one of
3336 // the predefined sets (Word Characters, for example)
3337 if (fp
->fInputIdx
>= fActiveLimit
) {
3339 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3343 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
3345 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3347 UChar32 c
= UTEXT_NEXT32(fInputText
);
3349 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
3350 if (s8
->contains(c
) == FALSE
) {
3351 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3355 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
3356 if (s
->contains(c
) == FALSE
) {
3357 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3361 // the character wasn't in the set.
3362 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3368 if (fp
->fInputIdx
>= fActiveLimit
) {
3370 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3373 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3375 // There is input left. Pick up one char and test it for set membership.
3376 UChar32 c
= UTEXT_NEXT32(fInputText
);
3377 U_ASSERT(opValue
> 0 && opValue
< sets
->size());
3379 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
3380 if (s8
->contains(c
)) {
3381 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3385 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
3386 if (s
->contains(c
)) {
3387 // The character is in the set. A Match.
3388 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3393 // the character wasn't in the set.
3394 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3401 // . matches anything, but stops at end-of-line.
3402 if (fp
->fInputIdx
>= fActiveLimit
) {
3403 // At end of input. Match failed. Backtrack out.
3405 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3409 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3411 // There is input left. Advance over one char, unless we've hit end-of-line
3412 UChar32 c
= UTEXT_NEXT32(fInputText
);
3413 if (isLineTerminator(c
)) {
3414 // End of line in normal mode. . does not match.
3415 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3418 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3423 case URX_DOTANY_ALL
:
3425 // ., in dot-matches-all (including new lines) mode
3426 if (fp
->fInputIdx
>= fActiveLimit
) {
3427 // At end of input. Match failed. Backtrack out.
3429 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3433 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3435 // There is input left. Advance over one char, except if we are
3436 // at a cr/lf, advance over both of them.
3438 c
= UTEXT_NEXT32(fInputText
);
3439 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3440 if (c
==0x0d && fp
->fInputIdx
< fActiveLimit
) {
3441 // In the case of a CR/LF, we need to advance over both.
3442 UChar32 nextc
= UTEXT_CURRENT32(fInputText
);
3443 if (nextc
== 0x0a) {
3444 (void)UTEXT_NEXT32(fInputText
);
3445 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3452 case URX_DOTANY_UNIX
:
3454 // '.' operator, matches all, but stops at end-of-line.
3455 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
3456 if (fp
->fInputIdx
>= fActiveLimit
) {
3457 // At end of input. Match failed. Backtrack out.
3459 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3463 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3465 // There is input left. Advance over one char, unless we've hit end-of-line
3466 UChar32 c
= UTEXT_NEXT32(fInputText
);
3468 // End of line in normal mode. '.' does not match the \n
3469 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3471 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3478 fp
->fPatIdx
= opValue
;
3486 U_ASSERT(opValue
< fPattern
->fCompiledPat
->size());
3487 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
3488 fp
->fPatIdx
= opValue
; // Then JMP.
3492 // This opcode is used with (x)+, when x can match a zero length string.
3493 // Same as JMP_SAV, except conditional on the match having made forward progress.
3494 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3495 // data address of the input position at the start of the loop.
3497 U_ASSERT(opValue
> 0 && opValue
< fPattern
->fCompiledPat
->size());
3498 int32_t stoOp
= (int32_t)pat
[opValue
-1];
3499 U_ASSERT(URX_TYPE(stoOp
) == URX_STO_INP_LOC
);
3500 int32_t frameLoc
= URX_VAL(stoOp
);
3501 U_ASSERT(frameLoc
>= 0 && frameLoc
< fFrameSize
);
3502 int64_t prevInputIdx
= fp
->fExtra
[frameLoc
];
3503 U_ASSERT(prevInputIdx
<= fp
->fInputIdx
);
3504 if (prevInputIdx
< fp
->fInputIdx
) {
3505 // The match did make progress. Repeat the loop.
3506 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
3507 fp
->fPatIdx
= opValue
;
3508 fp
->fExtra
[frameLoc
] = fp
->fInputIdx
;
3510 // If the input position did not advance, we do nothing here,
3511 // execution will fall out of the loop.
3517 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
3518 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
3520 // Pick up the three extra operands that CTR_INIT has, and
3521 // skip the pattern location counter past
3522 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
3524 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
3525 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
3526 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
3527 U_ASSERT(minCount
>=0);
3528 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
3529 U_ASSERT(loopLoc
>=fp
->fPatIdx
);
3531 if (minCount
== 0) {
3532 fp
= StateSave(fp
, loopLoc
+1, status
);
3534 if (maxCount
== -1) {
3535 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // For loop breaking.
3536 } else if (maxCount
== 0) {
3537 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3544 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
3545 int32_t initOp
= (int32_t)pat
[opValue
];
3546 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT
);
3547 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
3548 int32_t minCount
= (int32_t)pat
[opValue
+2];
3549 int32_t maxCount
= (int32_t)pat
[opValue
+3];
3551 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
&& maxCount
!= -1) {
3552 U_ASSERT(*pCounter
== maxCount
);
3555 if (*pCounter
>= minCount
) {
3556 if (maxCount
== -1) {
3557 // Loop has no hard upper bound.
3558 // Check that it is progressing through the input, break if it is not.
3559 int64_t *pLastInputIdx
= &fp
->fExtra
[URX_VAL(initOp
) + 1];
3560 if (fp
->fInputIdx
== *pLastInputIdx
) {
3563 *pLastInputIdx
= fp
->fInputIdx
;
3566 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
3568 fp
->fPatIdx
= opValue
+ 4; // Loop back.
3572 case URX_CTR_INIT_NG
:
3574 // Initialize a non-greedy loop
3575 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
3576 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
3578 // Pick up the three extra operands that CTR_INIT_NG has, and
3579 // skip the pattern location counter past
3580 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
3582 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
3583 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
3584 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
3585 U_ASSERT(minCount
>=0);
3586 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
3587 U_ASSERT(loopLoc
>fp
->fPatIdx
);
3588 if (maxCount
== -1) {
3589 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // Save initial input index for loop breaking.
3592 if (minCount
== 0) {
3593 if (maxCount
!= 0) {
3594 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
3596 fp
->fPatIdx
= loopLoc
+1; // Continue with stuff after repeated block
3601 case URX_CTR_LOOP_NG
:
3603 // Non-greedy {min, max} loops
3604 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
3605 int32_t initOp
= (int32_t)pat
[opValue
];
3606 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT_NG
);
3607 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
3608 int32_t minCount
= (int32_t)pat
[opValue
+2];
3609 int32_t maxCount
= (int32_t)pat
[opValue
+3];
3612 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
&& maxCount
!= -1) {
3613 // The loop has matched the maximum permitted number of times.
3614 // Break out of here with no action. Matching will
3615 // continue with the following pattern.
3616 U_ASSERT(*pCounter
== maxCount
);
3620 if (*pCounter
< minCount
) {
3621 // We haven't met the minimum number of matches yet.
3622 // Loop back for another one.
3623 fp
->fPatIdx
= opValue
+ 4; // Loop back.
3625 // We do have the minimum number of matches.
3627 // If there is no upper bound on the loop iterations, check that the input index
3628 // is progressing, and stop the loop if it is not.
3629 if (maxCount
== -1) {
3630 int64_t *pLastInputIdx
= &fp
->fExtra
[URX_VAL(initOp
) + 1];
3631 if (fp
->fInputIdx
== *pLastInputIdx
) {
3634 *pLastInputIdx
= fp
->fInputIdx
;
3637 // Loop Continuation: we will fall into the pattern following the loop
3638 // (non-greedy, don't execute loop body first), but first do
3639 // a state save to the top of the loop, so that a match failure
3640 // in the following pattern will try another iteration of the loop.
3641 fp
= StateSave(fp
, opValue
+ 4, status
);
3647 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
3648 fData
[opValue
] = fStack
->size();
3653 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
3654 int32_t newStackSize
= (int32_t)fData
[opValue
];
3655 U_ASSERT(newStackSize
<= fStack
->size());
3656 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
3657 if (newFP
== (int64_t *)fp
) {
3661 for (i
=0; i
<fFrameSize
; i
++) {
3662 newFP
[i
] = ((int64_t *)fp
)[i
];
3664 fp
= (REStackFrame
*)newFP
;
3665 fStack
->setSize(newStackSize
);
3671 U_ASSERT(opValue
< fFrameSize
);
3672 int64_t groupStartIdx
= fp
->fExtra
[opValue
];
3673 int64_t groupEndIdx
= fp
->fExtra
[opValue
+1];
3674 U_ASSERT(groupStartIdx
<= groupEndIdx
);
3675 if (groupStartIdx
< 0) {
3676 // This capture group has not participated in the match thus far,
3677 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
3680 UTEXT_SETNATIVEINDEX(fAltInputText
, groupStartIdx
);
3681 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3683 // Note: if the capture group match was of an empty string the backref
3684 // match succeeds. Verified by testing: Perl matches succeed
3685 // in this case, so we do too.
3687 UBool success
= TRUE
;
3689 if (utext_getNativeIndex(fAltInputText
) >= groupEndIdx
) {
3693 if (utext_getNativeIndex(fInputText
) >= fActiveLimit
) {
3698 UChar32 captureGroupChar
= utext_next32(fAltInputText
);
3699 UChar32 inputChar
= utext_next32(fInputText
);
3700 if (inputChar
!= captureGroupChar
) {
3707 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3709 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3718 U_ASSERT(opValue
< fFrameSize
);
3719 int64_t groupStartIdx
= fp
->fExtra
[opValue
];
3720 int64_t groupEndIdx
= fp
->fExtra
[opValue
+1];
3721 U_ASSERT(groupStartIdx
<= groupEndIdx
);
3722 if (groupStartIdx
< 0) {
3723 // This capture group has not participated in the match thus far,
3724 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
3727 utext_setNativeIndex(fAltInputText
, groupStartIdx
);
3728 utext_setNativeIndex(fInputText
, fp
->fInputIdx
);
3729 CaseFoldingUTextIterator
captureGroupItr(*fAltInputText
);
3730 CaseFoldingUTextIterator
inputItr(*fInputText
);
3732 // Note: if the capture group match was of an empty string the backref
3733 // match succeeds. Verified by testing: Perl matches succeed
3734 // in this case, so we do too.
3736 UBool success
= TRUE
;
3738 if (!captureGroupItr
.inExpansion() && utext_getNativeIndex(fAltInputText
) >= groupEndIdx
) {
3742 if (!inputItr
.inExpansion() && utext_getNativeIndex(fInputText
) >= fActiveLimit
) {
3747 UChar32 captureGroupChar
= captureGroupItr
.next();
3748 UChar32 inputChar
= inputItr
.next();
3749 if (inputChar
!= captureGroupChar
) {
3755 if (success
&& inputItr
.inExpansion()) {
3756 // We obtained a match by consuming part of a string obtained from
3757 // case-folding a single code point of the input text.
3758 // This does not count as an overall match.
3763 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3765 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3771 case URX_STO_INP_LOC
:
3773 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
);
3774 fp
->fExtra
[opValue
] = fp
->fInputIdx
;
3780 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
3782 int32_t dataLoc
= URX_VAL(pat
[instrOperandLoc
]);
3783 U_ASSERT(dataLoc
>= 0 && dataLoc
< fFrameSize
);
3784 int64_t savedInputIdx
= fp
->fExtra
[dataLoc
];
3785 U_ASSERT(savedInputIdx
<= fp
->fInputIdx
);
3786 if (savedInputIdx
< fp
->fInputIdx
) {
3787 fp
->fPatIdx
= opValue
; // JMP
3789 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no progress in loop.
3796 // Entering a lookahead block.
3797 // Save Stack Ptr, Input Pos.
3798 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3799 fData
[opValue
] = fStack
->size();
3800 fData
[opValue
+1] = fp
->fInputIdx
;
3801 fActiveStart
= fLookStart
; // Set the match region change for
3802 fActiveLimit
= fLookLimit
; // transparent bounds.
3808 // Leaving a look-ahead block.
3809 // restore Stack Ptr, Input Pos to positions they had on entry to block.
3810 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3811 int32_t stackSize
= fStack
->size();
3812 int32_t newStackSize
=(int32_t)fData
[opValue
];
3813 U_ASSERT(stackSize
>= newStackSize
);
3814 if (stackSize
> newStackSize
) {
3815 // Copy the current top frame back to the new (cut back) top frame.
3816 // This makes the capture groups from within the look-ahead
3817 // expression available.
3818 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
3820 for (i
=0; i
<fFrameSize
; i
++) {
3821 newFP
[i
] = ((int64_t *)fp
)[i
];
3823 fp
= (REStackFrame
*)newFP
;
3824 fStack
->setSize(newStackSize
);
3826 fp
->fInputIdx
= fData
[opValue
+1];
3828 // Restore the active region bounds in the input string; they may have
3829 // been changed because of transparent bounds on a Region.
3830 fActiveStart
= fRegionStart
;
3831 fActiveLimit
= fRegionLimit
;
3836 // Case insensitive one char. The char from the pattern is already case folded.
3837 // Input text is not, but case folding the input can not reduce two or more code
3839 if (fp
->fInputIdx
< fActiveLimit
) {
3840 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3842 UChar32 c
= UTEXT_NEXT32(fInputText
);
3843 if (u_foldCase(c
, U_FOLD_CASE_DEFAULT
) == opValue
) {
3844 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3851 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3856 // Case-insensitive test input against a literal string.
3857 // Strings require two slots in the compiled pattern, one for the
3858 // offset to the string text, and one for the length.
3859 // The compiled string has already been case folded.
3861 const UChar
*patternString
= litText
+ opValue
;
3862 int32_t patternStringIdx
= 0;
3864 op
= (int32_t)pat
[fp
->fPatIdx
];
3866 opType
= URX_TYPE(op
);
3867 opValue
= URX_VAL(op
);
3868 U_ASSERT(opType
== URX_STRING_LEN
);
3869 int32_t patternStringLen
= opValue
; // Length of the string from the pattern.
3874 UBool success
= TRUE
;
3876 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3877 CaseFoldingUTextIterator
inputIterator(*fInputText
);
3878 while (patternStringIdx
< patternStringLen
) {
3879 if (!inputIterator
.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText
) >= fActiveLimit
) {
3884 U16_NEXT(patternString
, patternStringIdx
, patternStringLen
, cPattern
);
3885 cText
= inputIterator
.next();
3886 if (cText
!= cPattern
) {
3891 if (inputIterator
.inExpansion()) {
3896 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3898 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3906 // Entering a look-behind block.
3907 // Save Stack Ptr, Input Pos.
3908 // TODO: implement transparent bounds. Ticket #6067
3909 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3910 fData
[opValue
] = fStack
->size();
3911 fData
[opValue
+1] = fp
->fInputIdx
;
3912 // Init the variable containing the start index for attempted matches.
3913 fData
[opValue
+2] = -1;
3914 // Save input string length, then reset to pin any matches to end at
3915 // the current position.
3916 fData
[opValue
+3] = fActiveLimit
;
3917 fActiveLimit
= fp
->fInputIdx
;
3924 // Positive Look-Behind, at top of loop checking for matches of LB expression
3925 // at all possible input starting positions.
3927 // Fetch the min and max possible match lengths. They are the operands
3928 // of this op in the pattern.
3929 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
3930 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
3931 if (!UTEXT_USES_U16(fInputText
)) {
3932 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3933 // The max length need not be exact; it just needs to be >= actual maximum.
3936 U_ASSERT(minML
<= maxML
);
3937 U_ASSERT(minML
>= 0);
3939 // Fetch (from data) the last input index where a match was attempted.
3940 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3941 int64_t &lbStartIdx
= fData
[opValue
+2];
3942 if (lbStartIdx
< 0) {
3943 // First time through loop.
3944 lbStartIdx
= fp
->fInputIdx
- minML
;
3945 if (lbStartIdx
> 0) {
3946 // move index to a code point boundary, if it's not on one already.
3947 UTEXT_SETNATIVEINDEX(fInputText
, lbStartIdx
);
3948 lbStartIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3951 // 2nd through nth time through the loop.
3952 // Back up start position for match by one.
3953 if (lbStartIdx
== 0) {
3956 UTEXT_SETNATIVEINDEX(fInputText
, lbStartIdx
);
3957 (void)UTEXT_PREVIOUS32(fInputText
);
3958 lbStartIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3962 if (lbStartIdx
< 0 || lbStartIdx
< fp
->fInputIdx
- maxML
) {
3963 // We have tried all potential match starting points without
3964 // getting a match. Backtrack out, and out of the
3965 // Look Behind altogether.
3966 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3967 int64_t restoreInputLen
= fData
[opValue
+3];
3968 U_ASSERT(restoreInputLen
>= fActiveLimit
);
3969 U_ASSERT(restoreInputLen
<= fInputLength
);
3970 fActiveLimit
= restoreInputLen
;
3974 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3975 // (successful match will fall off the end of the loop.)
3976 fp
= StateSave(fp
, fp
->fPatIdx
-3, status
);
3977 fp
->fInputIdx
= lbStartIdx
;
3982 // End of a look-behind block, after a successful match.
3984 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3985 if (fp
->fInputIdx
!= fActiveLimit
) {
3986 // The look-behind expression matched, but the match did not
3987 // extend all the way to the point that we are looking behind from.
3988 // FAIL out of here, which will take us back to the LB_CONT, which
3989 // will retry the match starting at another position or fail
3990 // the look-behind altogether, whichever is appropriate.
3991 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3995 // Look-behind match is good. Restore the original input string length,
3996 // which had been truncated to pin the end of the lookbehind match to the
3997 // position being looked-behind.
3998 int64_t originalInputLen
= fData
[opValue
+3];
3999 U_ASSERT(originalInputLen
>= fActiveLimit
);
4000 U_ASSERT(originalInputLen
<= fInputLength
);
4001 fActiveLimit
= originalInputLen
;
4008 // Negative Look-Behind, at top of loop checking for matches of LB expression
4009 // at all possible input starting positions.
4011 // Fetch the extra parameters of this op.
4012 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
4013 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
4014 if (!UTEXT_USES_U16(fInputText
)) {
4015 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
4016 // The max length need not be exact; it just needs to be >= actual maximum.
4019 int32_t continueLoc
= (int32_t)pat
[fp
->fPatIdx
++];
4020 continueLoc
= URX_VAL(continueLoc
);
4021 U_ASSERT(minML
<= maxML
);
4022 U_ASSERT(minML
>= 0);
4023 U_ASSERT(continueLoc
> fp
->fPatIdx
);
4025 // Fetch (from data) the last input index where a match was attempted.
4026 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
4027 int64_t &lbStartIdx
= fData
[opValue
+2];
4028 if (lbStartIdx
< 0) {
4029 // First time through loop.
4030 lbStartIdx
= fp
->fInputIdx
- minML
;
4031 if (lbStartIdx
> 0) {
4032 // move index to a code point boundary, if it's not on one already.
4033 UTEXT_SETNATIVEINDEX(fInputText
, lbStartIdx
);
4034 lbStartIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
4037 // 2nd through nth time through the loop.
4038 // Back up start position for match by one.
4039 if (lbStartIdx
== 0) {
4042 UTEXT_SETNATIVEINDEX(fInputText
, lbStartIdx
);
4043 (void)UTEXT_PREVIOUS32(fInputText
);
4044 lbStartIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
4048 if (lbStartIdx
< 0 || lbStartIdx
< fp
->fInputIdx
- maxML
) {
4049 // We have tried all potential match starting points without
4050 // getting a match, which means that the negative lookbehind as
4051 // a whole has succeeded. Jump forward to the continue location
4052 int64_t restoreInputLen
= fData
[opValue
+3];
4053 U_ASSERT(restoreInputLen
>= fActiveLimit
);
4054 U_ASSERT(restoreInputLen
<= fInputLength
);
4055 fActiveLimit
= restoreInputLen
;
4056 fp
->fPatIdx
= continueLoc
;
4060 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4061 // (successful match will cause a FAIL out of the loop altogether.)
4062 fp
= StateSave(fp
, fp
->fPatIdx
-4, status
);
4063 fp
->fInputIdx
= lbStartIdx
;
4068 // End of a negative look-behind block, after a successful match.
4070 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
4071 if (fp
->fInputIdx
!= fActiveLimit
) {
4072 // The look-behind expression matched, but the match did not
4073 // extend all the way to the point that we are looking behind from.
4074 // FAIL out of here, which will take us back to the LB_CONT, which
4075 // will retry the match starting at another position or succeed
4076 // the look-behind altogether, whichever is appropriate.
4077 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4081 // Look-behind expression matched, which means look-behind test as
4084 // Restore the original input string length, which had been truncated
4085 // in order to pin the end of the lookbehind match
4086 // to the position being looked-behind.
4087 int64_t originalInputLen
= fData
[opValue
+3];
4088 U_ASSERT(originalInputLen
>= fActiveLimit
);
4089 U_ASSERT(originalInputLen
<= fInputLength
);
4090 fActiveLimit
= originalInputLen
;
4092 // Restore original stack position, discarding any state saved
4093 // by the successful pattern match.
4094 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
4095 int32_t newStackSize
= (int32_t)fData
[opValue
];
4096 U_ASSERT(fStack
->size() > newStackSize
);
4097 fStack
->setSize(newStackSize
);
4099 // FAIL, which will take control back to someplace
4100 // prior to entering the look-behind test.
4101 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4107 // Loop Initialization for the optimized implementation of
4108 // [some character set]*
4109 // This op scans through all matching input.
4110 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4112 U_ASSERT(opValue
> 0 && opValue
< sets
->size());
4113 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
4114 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
4116 // Loop through input, until either the input is exhausted or
4117 // we reach a character that is not a member of the set.
4118 int64_t ix
= fp
->fInputIdx
;
4119 UTEXT_SETNATIVEINDEX(fInputText
, ix
);
4121 if (ix
>= fActiveLimit
) {
4125 UChar32 c
= UTEXT_NEXT32(fInputText
);
4127 if (s8
->contains(c
) == FALSE
) {
4131 if (s
->contains(c
) == FALSE
) {
4135 ix
= UTEXT_GETNATIVEINDEX(fInputText
);
4138 // If there were no matching characters, skip over the loop altogether.
4139 // The loop doesn't run at all, a * op always succeeds.
4140 if (ix
== fp
->fInputIdx
) {
4141 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
4145 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4146 // must follow. Its operand is the stack location
4147 // that holds the starting input index for the match of this [set]*
4148 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
4149 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
4150 int32_t stackLoc
= URX_VAL(loopcOp
);
4151 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
4152 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
4155 // Save State to the URX_LOOP_C op that follows this one,
4156 // so that match failures in the following code will return to there.
4157 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4158 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
4164 case URX_LOOP_DOT_I
:
4165 // Loop Initialization for the optimized implementation of .*
4166 // This op scans through all remaining input.
4167 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4169 // Loop through input until the input is exhausted (we reach an end-of-line)
4170 // In DOTALL mode, we can just go straight to the end of the input.
4172 if ((opValue
& 1) == 1) {
4173 // Dot-matches-All mode. Jump straight to the end of the string.
4177 // NOT DOT ALL mode. Line endings do not match '.'
4178 // Scan forward until a line ending or end of input.
4180 UTEXT_SETNATIVEINDEX(fInputText
, ix
);
4182 if (ix
>= fActiveLimit
) {
4186 UChar32 c
= UTEXT_NEXT32(fInputText
);
4187 if ((c
& 0x7f) <= 0x29) { // Fast filter of non-new-line-s
4188 if ((c
== 0x0a) || // 0x0a is newline in both modes.
4189 (((opValue
& 2) == 0) && // IF not UNIX_LINES mode
4190 isLineTerminator(c
))) {
4191 // char is a line ending. Exit the scanning loop.
4195 ix
= UTEXT_GETNATIVEINDEX(fInputText
);
4199 // If there were no matching characters, skip over the loop altogether.
4200 // The loop doesn't run at all, a * op always succeeds.
4201 if (ix
== fp
->fInputIdx
) {
4202 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
4206 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4207 // must follow. Its operand is the stack location
4208 // that holds the starting input index for the match of this .*
4209 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
4210 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
4211 int32_t stackLoc
= URX_VAL(loopcOp
);
4212 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
4213 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
4216 // Save State to the URX_LOOP_C op that follows this one,
4217 // so that match failures in the following code will return to there.
4218 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4219 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
4227 U_ASSERT(opValue
>=0 && opValue
<fFrameSize
);
4228 backSearchIndex
= fp
->fExtra
[opValue
];
4229 U_ASSERT(backSearchIndex
<= fp
->fInputIdx
);
4230 if (backSearchIndex
== fp
->fInputIdx
) {
4231 // We've backed up the input idx to the point that the loop started.
4232 // The loop is done. Leave here without saving state.
4233 // Subsequent failures won't come back here.
4236 // Set up for the next iteration of the loop, with input index
4237 // backed up by one from the last time through,
4238 // and a state save to this instruction in case the following code fails again.
4239 // (We're going backwards because this loop emulates stack unwinding, not
4240 // the initial scan forward.)
4241 U_ASSERT(fp
->fInputIdx
> 0);
4242 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
4243 UChar32 prevC
= UTEXT_PREVIOUS32(fInputText
);
4244 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
4246 UChar32 twoPrevC
= UTEXT_PREVIOUS32(fInputText
);
4247 if (prevC
== 0x0a &&
4248 fp
->fInputIdx
> backSearchIndex
&&
4250 int32_t prevOp
= (int32_t)pat
[fp
->fPatIdx
-2];
4251 if (URX_TYPE(prevOp
) == URX_LOOP_DOT_I
) {
4252 // .*, stepping back over CRLF pair.
4253 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
4258 fp
= StateSave(fp
, fp
->fPatIdx
-1, status
);
4265 // Trouble. The compiled pattern contains an entry with an
4266 // unrecognized type tag.
4270 if (U_FAILURE(status
)) {
4279 fLastMatchEnd
= fMatchEnd
;
4280 fMatchStart
= startIdx
;
4281 fMatchEnd
= fp
->fInputIdx
;
4284 #ifdef REGEX_RUN_DEBUG
4287 printf("Match. start=%ld end=%ld\n\n", fMatchStart
, fMatchEnd
);
4289 printf("No match\n\n");
4294 fFrame
= fp
; // The active stack frame when the engine stopped.
4295 // Contains the capture group results that we need to
4301 //--------------------------------------------------------------------------------
4303 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
4304 // assumption that the entire string is available in the UText's
4305 // chunk buffer. For now, that means we can use int32_t indexes,
4306 // except for anything that needs to be saved (like group starts
4309 // startIdx: begin matching at this index.
4310 // toEnd: if true, match must extend to end of the input region
4312 //--------------------------------------------------------------------------------
4313 void RegexMatcher::MatchChunkAt(int32_t startIdx
, UBool toEnd
, UErrorCode
&status
) {
4314 UBool isMatch
= FALSE
; // True if the we have a match.
4316 int32_t backSearchIndex
= INT32_MAX
; // used after greedy single-character matches for searching backwards
4318 int32_t op
; // Operation from the compiled pattern, split into
4319 int32_t opType
; // the opcode
4320 int32_t opValue
; // and the operand value.
4322 #ifdef REGEX_RUN_DEBUG
4324 printf("MatchAt(startIdx=%d)\n", startIdx
);
4325 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern
->fPattern
))());
4326 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText
))());
4330 if (U_FAILURE(status
)) {
4334 // Cache frequently referenced items from the compiled pattern
4336 int64_t *pat
= fPattern
->fCompiledPat
->getBuffer();
4338 const UChar
*litText
= fPattern
->fLiteralText
.getBuffer();
4339 UVector
*sets
= fPattern
->fSets
;
4341 const UChar
*inputBuf
= fInputText
->chunkContents
;
4343 fFrameSize
= fPattern
->fFrameSize
;
4344 REStackFrame
*fp
= resetStack();
4345 if (U_FAILURE(fDeferredStatus
)) {
4346 status
= fDeferredStatus
;
4351 fp
->fInputIdx
= startIdx
;
4353 // Zero out the pattern's static data
4355 for (i
= 0; i
<fPattern
->fDataSize
; i
++) {
4360 // Main loop for interpreting the compiled pattern.
4361 // One iteration of the loop per pattern operation performed.
4364 op
= (int32_t)pat
[fp
->fPatIdx
];
4365 opType
= URX_TYPE(op
);
4366 opValue
= URX_VAL(op
);
4367 #ifdef REGEX_RUN_DEBUG
4369 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
4370 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp
->fInputIdx
,
4371 UTEXT_CURRENT32(fInputText
), (int64_t *)fp
-fStack
->getBuffer(), fActiveLimit
);
4372 fPattern
->dumpOp(fp
->fPatIdx
);
4385 // Force a backtrack. In some circumstances, the pattern compiler
4386 // will notice that the pattern can't possibly match anything, and will
4387 // emit one of these at that point.
4388 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4393 if (fp
->fInputIdx
< fActiveLimit
) {
4395 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4402 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4408 // Test input against a literal string.
4409 // Strings require two slots in the compiled pattern, one for the
4410 // offset to the string text, and one for the length.
4411 int32_t stringStartIdx
= opValue
;
4414 op
= (int32_t)pat
[fp
->fPatIdx
]; // Fetch the second operand
4416 opType
= URX_TYPE(op
);
4417 stringLen
= URX_VAL(op
);
4418 U_ASSERT(opType
== URX_STRING_LEN
);
4419 U_ASSERT(stringLen
>= 2);
4421 const UChar
* pInp
= inputBuf
+ fp
->fInputIdx
;
4422 const UChar
* pInpLimit
= inputBuf
+ fActiveLimit
;
4423 const UChar
* pPat
= litText
+stringStartIdx
;
4424 const UChar
* pEnd
= pInp
+ stringLen
;
4425 UBool success
= TRUE
;
4426 while (pInp
< pEnd
) {
4427 if (pInp
>= pInpLimit
) {
4432 if (*pInp
++ != *pPat
++) {
4439 fp
->fInputIdx
+= stringLen
;
4441 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4447 case URX_STATE_SAVE
:
4448 fp
= StateSave(fp
, opValue
, status
);
4453 // The match loop will exit via this path on a successful match,
4454 // when we reach the end of the pattern.
4455 if (toEnd
&& fp
->fInputIdx
!= fActiveLimit
) {
4456 // The pattern matched, but not to the end of input. Try some more.
4457 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4463 // Start and End Capture stack frame variables are laid out like this:
4464 // fp->fExtra[opValue] - The start of a completed capture group
4465 // opValue+1 - The end of a completed capture group
4466 // opValue+2 - the start of a capture group whose end
4467 // has not yet been reached (and might not ever be).
4468 case URX_START_CAPTURE
:
4469 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
4470 fp
->fExtra
[opValue
+2] = fp
->fInputIdx
;
4474 case URX_END_CAPTURE
:
4475 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
4476 U_ASSERT(fp
->fExtra
[opValue
+2] >= 0); // Start pos for this group must be set.
4477 fp
->fExtra
[opValue
] = fp
->fExtra
[opValue
+2]; // Tentative start becomes real.
4478 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // End position
4479 U_ASSERT(fp
->fExtra
[opValue
] <= fp
->fExtra
[opValue
+1]);
4483 case URX_DOLLAR
: // $, test for End of line
4484 // or for position before new line at end of input
4485 if (fp
->fInputIdx
< fAnchorLimit
-2) {
4486 // We are no where near the end of input. Fail.
4487 // This is the common case. Keep it first.
4488 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4491 if (fp
->fInputIdx
>= fAnchorLimit
) {
4492 // We really are at the end of input. Success.
4498 // If we are positioned just before a new-line that is located at the
4499 // end of input, succeed.
4500 if (fp
->fInputIdx
== fAnchorLimit
-1) {
4502 U16_GET(inputBuf
, fAnchorStart
, fp
->fInputIdx
, fAnchorLimit
, c
);
4504 if (isLineTerminator(c
)) {
4505 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& inputBuf
[fp
->fInputIdx
-1]==0x0d)) {
4506 // At new-line at end of input. Success
4512 } else if (fp
->fInputIdx
== fAnchorLimit
-2 &&
4513 inputBuf
[fp
->fInputIdx
]==0x0d && inputBuf
[fp
->fInputIdx
+1]==0x0a) {
4516 break; // At CR/LF at end of input. Success
4519 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4524 case URX_DOLLAR_D
: // $, test for End of Line, in UNIX_LINES mode.
4525 if (fp
->fInputIdx
>= fAnchorLimit
-1) {
4526 // Either at the last character of input, or off the end.
4527 if (fp
->fInputIdx
== fAnchorLimit
-1) {
4528 // At last char of input. Success if it's a new line.
4529 if (inputBuf
[fp
->fInputIdx
] == 0x0a) {
4535 // Off the end of input. Success.
4542 // Not at end of input. Back-track out.
4543 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4547 case URX_DOLLAR_M
: // $, test for End of line in multi-line mode
4549 if (fp
->fInputIdx
>= fAnchorLimit
) {
4550 // We really are at the end of input. Success.
4555 // If we are positioned just before a new-line, succeed.
4556 // It makes no difference where the new-line is within the input.
4557 UChar32 c
= inputBuf
[fp
->fInputIdx
];
4558 if (isLineTerminator(c
)) {
4559 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4560 // In multi-line mode, hitting a new-line just before the end of input does not
4561 // set the hitEnd or requireEnd flags
4562 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& inputBuf
[fp
->fInputIdx
-1]==0x0d)) {
4566 // not at a new line. Fail.
4567 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4572 case URX_DOLLAR_MD
: // $, test for End of line in multi-line and UNIX_LINES mode
4574 if (fp
->fInputIdx
>= fAnchorLimit
) {
4575 // We really are at the end of input. Success.
4577 fRequireEnd
= TRUE
; // Java set requireEnd in this case, even though
4578 break; // adding a new-line would not lose the match.
4580 // If we are not positioned just before a new-line, the test fails; backtrack out.
4581 // It makes no difference where the new-line is within the input.
4582 if (inputBuf
[fp
->fInputIdx
] != 0x0a) {
4583 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4589 case URX_CARET
: // ^, test for start of line
4590 if (fp
->fInputIdx
!= fAnchorStart
) {
4591 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4596 case URX_CARET_M
: // ^, test for start of line in mulit-line mode
4598 if (fp
->fInputIdx
== fAnchorStart
) {
4599 // We are at the start of input. Success.
4602 // Check whether character just before the current pos is a new-line
4603 // unless we are at the end of input
4604 UChar c
= inputBuf
[fp
->fInputIdx
- 1];
4605 if ((fp
->fInputIdx
< fAnchorLimit
) &&
4606 isLineTerminator(c
)) {
4607 // It's a new-line. ^ is true. Success.
4608 // TODO: what should be done with positions between a CR and LF?
4611 // Not at the start of a line. Fail.
4612 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4617 case URX_CARET_M_UNIX
: // ^, test for start of line in mulit-line + Unix-line mode
4619 U_ASSERT(fp
->fInputIdx
>= fAnchorStart
);
4620 if (fp
->fInputIdx
<= fAnchorStart
) {
4621 // We are at the start of input. Success.
4624 // Check whether character just before the current pos is a new-line
4625 U_ASSERT(fp
->fInputIdx
<= fAnchorLimit
);
4626 UChar c
= inputBuf
[fp
->fInputIdx
- 1];
4628 // Not at the start of a line. Back-track out.
4629 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4634 case URX_BACKSLASH_B
: // Test for word boundaries
4636 UBool success
= isChunkWordBoundary((int32_t)fp
->fInputIdx
);
4637 success
^= (UBool
)(opValue
!= 0); // flip sense for \B
4639 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4645 case URX_BACKSLASH_BU
: // Test for word boundaries, Unicode-style
4647 UBool success
= isUWordBoundary(fp
->fInputIdx
);
4648 success
^= (UBool
)(opValue
!= 0); // flip sense for \B
4650 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4656 case URX_BACKSLASH_D
: // Test for decimal digit
4658 if (fp
->fInputIdx
>= fActiveLimit
) {
4660 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4665 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4666 int8_t ctype
= u_charType(c
); // TODO: make a unicode set for this. Will be faster.
4667 UBool success
= (ctype
== U_DECIMAL_DIGIT_NUMBER
);
4668 success
^= (UBool
)(opValue
!= 0); // flip sense for \D
4670 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4676 case URX_BACKSLASH_G
: // Test for position at end of previous match
4677 if (!((fMatch
&& fp
->fInputIdx
==fMatchEnd
) || (fMatch
==FALSE
&& fp
->fInputIdx
==fActiveStart
))) {
4678 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4683 case URX_BACKSLASH_H
: // Test for \h, horizontal white space.
4685 if (fp
->fInputIdx
>= fActiveLimit
) {
4687 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4691 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4692 int8_t ctype
= u_charType(c
);
4693 UBool success
= (ctype
== U_SPACE_SEPARATOR
|| c
== 9); // SPACE_SEPARATOR || TAB
4694 success
^= (UBool
)(opValue
!= 0); // flip sense for \H
4696 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4702 case URX_BACKSLASH_R
: // Test for \R, any line break sequence.
4704 if (fp
->fInputIdx
>= fActiveLimit
) {
4706 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4710 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4711 if (isLineTerminator(c
)) {
4712 if (c
== 0x0d && fp
->fInputIdx
< fActiveLimit
) {
4713 // Check for CR/LF sequence. Consume both together when found.
4715 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c2
);
4717 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c2
);
4721 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4727 case URX_BACKSLASH_V
: // Any single code point line ending.
4729 if (fp
->fInputIdx
>= fActiveLimit
) {
4731 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4735 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4736 UBool success
= isLineTerminator(c
);
4737 success
^= (UBool
)(opValue
!= 0); // flip sense for \V
4739 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4746 case URX_BACKSLASH_X
:
4747 // Match a Grapheme, as defined by Unicode TR 29.
4748 // Differs slightly from Perl, which consumes combining marks independently
4752 // Fail if at end of input
4753 if (fp
->fInputIdx
>= fActiveLimit
) {
4755 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4759 // Examine (and consume) the current char.
4760 // Dispatch into a little state machine, based on the char.
4762 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4763 UnicodeSet
**sets
= fPattern
->fStaticSets
;
4764 if (sets
[URX_GC_NORMAL
]->contains(c
)) goto GC_Extend
;
4765 if (sets
[URX_GC_CONTROL
]->contains(c
)) goto GC_Control
;
4766 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
4767 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
4768 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
4769 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
4770 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
4776 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
4777 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4778 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
4779 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
4780 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
4781 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
4782 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
4786 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
4787 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4788 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
4789 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
4790 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
4794 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
4795 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4796 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
4797 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
4801 // Combining characters are consumed here
4803 if (fp
->fInputIdx
>= fActiveLimit
) {
4806 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4807 if (sets
[URX_GC_EXTEND
]->contains(c
) == FALSE
) {
4808 U16_BACK_1(inputBuf
, 0, fp
->fInputIdx
);
4815 // Most control chars stand alone (don't combine with combining chars),
4816 // except that a CR/LF sequence is a single grapheme cluster.
4817 if (c
== 0x0d && fp
->fInputIdx
< fActiveLimit
&& inputBuf
[fp
->fInputIdx
] == 0x0a) {
4822 if (fp
->fInputIdx
>= fActiveLimit
) {
4831 case URX_BACKSLASH_Z
: // Test for end of Input
4832 if (fp
->fInputIdx
< fAnchorLimit
) {
4833 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4842 case URX_STATIC_SETREF
:
4844 // Test input character against one of the predefined sets
4845 // (Word Characters, for example)
4846 // The high bit of the op value is a flag for the match polarity.
4847 // 0: success if input char is in set.
4848 // 1: success if input char is not in set.
4849 if (fp
->fInputIdx
>= fActiveLimit
) {
4851 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4855 UBool success
= ((opValue
& URX_NEG_SET
) == URX_NEG_SET
);
4856 opValue
&= ~URX_NEG_SET
;
4857 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
4860 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4862 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
4863 if (s8
->contains(c
)) {
4867 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
4868 if (s
->contains(c
)) {
4873 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4879 case URX_STAT_SETREF_N
:
4881 // Test input character for NOT being a member of one of
4882 // the predefined sets (Word Characters, for example)
4883 if (fp
->fInputIdx
>= fActiveLimit
) {
4885 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4889 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
4892 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4894 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
4895 if (s8
->contains(c
) == FALSE
) {
4899 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
4900 if (s
->contains(c
) == FALSE
) {
4904 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4911 if (fp
->fInputIdx
>= fActiveLimit
) {
4913 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4917 U_ASSERT(opValue
> 0 && opValue
< sets
->size());
4919 // There is input left. Pick up one char and test it for set membership.
4921 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4923 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
4924 if (s8
->contains(c
)) {
4925 // The character is in the set. A Match.
4929 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
4930 if (s
->contains(c
)) {
4931 // The character is in the set. A Match.
4936 // the character wasn't in the set.
4937 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4944 // . matches anything, but stops at end-of-line.
4945 if (fp
->fInputIdx
>= fActiveLimit
) {
4946 // At end of input. Match failed. Backtrack out.
4948 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4952 // There is input left. Advance over one char, unless we've hit end-of-line
4954 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4955 if (isLineTerminator(c
)) {
4956 // End of line in normal mode. . does not match.
4957 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4964 case URX_DOTANY_ALL
:
4966 // . in dot-matches-all (including new lines) mode
4967 if (fp
->fInputIdx
>= fActiveLimit
) {
4968 // At end of input. Match failed. Backtrack out.
4970 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4974 // There is input left. Advance over one char, except if we are
4975 // at a cr/lf, advance over both of them.
4977 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4978 if (c
==0x0d && fp
->fInputIdx
< fActiveLimit
) {
4979 // In the case of a CR/LF, we need to advance over both.
4980 if (inputBuf
[fp
->fInputIdx
] == 0x0a) {
4981 U16_FWD_1(inputBuf
, fp
->fInputIdx
, fActiveLimit
);
4988 case URX_DOTANY_UNIX
:
4990 // '.' operator, matches all, but stops at end-of-line.
4991 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
4992 if (fp
->fInputIdx
>= fActiveLimit
) {
4993 // At end of input. Match failed. Backtrack out.
4995 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4999 // There is input left. Advance over one char, unless we've hit end-of-line
5001 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
5003 // End of line in normal mode. '.' does not match the \n
5004 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5011 fp
->fPatIdx
= opValue
;
5019 U_ASSERT(opValue
< fPattern
->fCompiledPat
->size());
5020 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
5021 fp
->fPatIdx
= opValue
; // Then JMP.
5025 // This opcode is used with (x)+, when x can match a zero length string.
5026 // Same as JMP_SAV, except conditional on the match having made forward progress.
5027 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
5028 // data address of the input position at the start of the loop.
5030 U_ASSERT(opValue
> 0 && opValue
< fPattern
->fCompiledPat
->size());
5031 int32_t stoOp
= (int32_t)pat
[opValue
-1];
5032 U_ASSERT(URX_TYPE(stoOp
) == URX_STO_INP_LOC
);
5033 int32_t frameLoc
= URX_VAL(stoOp
);
5034 U_ASSERT(frameLoc
>= 0 && frameLoc
< fFrameSize
);
5035 int32_t prevInputIdx
= (int32_t)fp
->fExtra
[frameLoc
];
5036 U_ASSERT(prevInputIdx
<= fp
->fInputIdx
);
5037 if (prevInputIdx
< fp
->fInputIdx
) {
5038 // The match did make progress. Repeat the loop.
5039 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
5040 fp
->fPatIdx
= opValue
;
5041 fp
->fExtra
[frameLoc
] = fp
->fInputIdx
;
5043 // If the input position did not advance, we do nothing here,
5044 // execution will fall out of the loop.
5050 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
5051 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
5053 // Pick up the three extra operands that CTR_INIT has, and
5054 // skip the pattern location counter past
5055 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
5057 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
5058 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
5059 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
5060 U_ASSERT(minCount
>=0);
5061 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
5062 U_ASSERT(loopLoc
>=fp
->fPatIdx
);
5064 if (minCount
== 0) {
5065 fp
= StateSave(fp
, loopLoc
+1, status
);
5067 if (maxCount
== -1) {
5068 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // For loop breaking.
5069 } else if (maxCount
== 0) {
5070 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5077 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
5078 int32_t initOp
= (int32_t)pat
[opValue
];
5079 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT
);
5080 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
5081 int32_t minCount
= (int32_t)pat
[opValue
+2];
5082 int32_t maxCount
= (int32_t)pat
[opValue
+3];
5084 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
&& maxCount
!= -1) {
5085 U_ASSERT(*pCounter
== maxCount
);
5088 if (*pCounter
>= minCount
) {
5089 if (maxCount
== -1) {
5090 // Loop has no hard upper bound.
5091 // Check that it is progressing through the input, break if it is not.
5092 int64_t *pLastInputIdx
= &fp
->fExtra
[URX_VAL(initOp
) + 1];
5093 if (fp
->fInputIdx
== *pLastInputIdx
) {
5096 *pLastInputIdx
= fp
->fInputIdx
;
5099 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5101 fp
->fPatIdx
= opValue
+ 4; // Loop back.
5105 case URX_CTR_INIT_NG
:
5107 // Initialize a non-greedy loop
5108 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
5109 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
5111 // Pick up the three extra operands that CTR_INIT_NG has, and
5112 // skip the pattern location counter past
5113 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
5115 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
5116 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
5117 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
5118 U_ASSERT(minCount
>=0);
5119 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
5120 U_ASSERT(loopLoc
>fp
->fPatIdx
);
5121 if (maxCount
== -1) {
5122 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // Save initial input index for loop breaking.
5125 if (minCount
== 0) {
5126 if (maxCount
!= 0) {
5127 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5129 fp
->fPatIdx
= loopLoc
+1; // Continue with stuff after repeated block
5134 case URX_CTR_LOOP_NG
:
5136 // Non-greedy {min, max} loops
5137 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
5138 int32_t initOp
= (int32_t)pat
[opValue
];
5139 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT_NG
);
5140 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
5141 int32_t minCount
= (int32_t)pat
[opValue
+2];
5142 int32_t maxCount
= (int32_t)pat
[opValue
+3];
5145 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
&& maxCount
!= -1) {
5146 // The loop has matched the maximum permitted number of times.
5147 // Break out of here with no action. Matching will
5148 // continue with the following pattern.
5149 U_ASSERT(*pCounter
== maxCount
);
5153 if (*pCounter
< minCount
) {
5154 // We haven't met the minimum number of matches yet.
5155 // Loop back for another one.
5156 fp
->fPatIdx
= opValue
+ 4; // Loop back.
5158 // We do have the minimum number of matches.
5160 // If there is no upper bound on the loop iterations, check that the input index
5161 // is progressing, and stop the loop if it is not.
5162 if (maxCount
== -1) {
5163 int64_t *pLastInputIdx
= &fp
->fExtra
[URX_VAL(initOp
) + 1];
5164 if (fp
->fInputIdx
== *pLastInputIdx
) {
5167 *pLastInputIdx
= fp
->fInputIdx
;
5170 // Loop Continuation: we will fall into the pattern following the loop
5171 // (non-greedy, don't execute loop body first), but first do
5172 // a state save to the top of the loop, so that a match failure
5173 // in the following pattern will try another iteration of the loop.
5174 fp
= StateSave(fp
, opValue
+ 4, status
);
5180 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
5181 fData
[opValue
] = fStack
->size();
5186 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
5187 int32_t newStackSize
= (int32_t)fData
[opValue
];
5188 U_ASSERT(newStackSize
<= fStack
->size());
5189 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
5190 if (newFP
== (int64_t *)fp
) {
5194 for (i
=0; i
<fFrameSize
; i
++) {
5195 newFP
[i
] = ((int64_t *)fp
)[i
];
5197 fp
= (REStackFrame
*)newFP
;
5198 fStack
->setSize(newStackSize
);
5204 U_ASSERT(opValue
< fFrameSize
);
5205 int64_t groupStartIdx
= fp
->fExtra
[opValue
];
5206 int64_t groupEndIdx
= fp
->fExtra
[opValue
+1];
5207 U_ASSERT(groupStartIdx
<= groupEndIdx
);
5208 int64_t inputIndex
= fp
->fInputIdx
;
5209 if (groupStartIdx
< 0) {
5210 // This capture group has not participated in the match thus far,
5211 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
5214 UBool success
= TRUE
;
5215 for (int64_t groupIndex
= groupStartIdx
; groupIndex
< groupEndIdx
; ++groupIndex
,++inputIndex
) {
5216 if (inputIndex
>= fActiveLimit
) {
5221 if (inputBuf
[groupIndex
] != inputBuf
[inputIndex
]) {
5226 if (success
&& groupStartIdx
< groupEndIdx
&& U16_IS_LEAD(inputBuf
[groupEndIdx
-1]) &&
5227 inputIndex
< fActiveLimit
&& U16_IS_TRAIL(inputBuf
[inputIndex
])) {
5228 // Capture group ended with an unpaired lead surrogate.
5229 // Back reference is not permitted to match lead only of a surrogate pair.
5233 fp
->fInputIdx
= inputIndex
;
5235 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5242 U_ASSERT(opValue
< fFrameSize
);
5243 int64_t groupStartIdx
= fp
->fExtra
[opValue
];
5244 int64_t groupEndIdx
= fp
->fExtra
[opValue
+1];
5245 U_ASSERT(groupStartIdx
<= groupEndIdx
);
5246 if (groupStartIdx
< 0) {
5247 // This capture group has not participated in the match thus far,
5248 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
5251 CaseFoldingUCharIterator
captureGroupItr(inputBuf
, groupStartIdx
, groupEndIdx
);
5252 CaseFoldingUCharIterator
inputItr(inputBuf
, fp
->fInputIdx
, fActiveLimit
);
5254 // Note: if the capture group match was of an empty string the backref
5255 // match succeeds. Verified by testing: Perl matches succeed
5256 // in this case, so we do too.
5258 UBool success
= TRUE
;
5260 UChar32 captureGroupChar
= captureGroupItr
.next();
5261 if (captureGroupChar
== U_SENTINEL
) {
5265 UChar32 inputChar
= inputItr
.next();
5266 if (inputChar
== U_SENTINEL
) {
5271 if (inputChar
!= captureGroupChar
) {
5277 if (success
&& inputItr
.inExpansion()) {
5278 // We obtained a match by consuming part of a string obtained from
5279 // case-folding a single code point of the input text.
5280 // This does not count as an overall match.
5285 fp
->fInputIdx
= inputItr
.getIndex();
5287 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5292 case URX_STO_INP_LOC
:
5294 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
);
5295 fp
->fExtra
[opValue
] = fp
->fInputIdx
;
5301 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
5303 int32_t dataLoc
= URX_VAL(pat
[instrOperandLoc
]);
5304 U_ASSERT(dataLoc
>= 0 && dataLoc
< fFrameSize
);
5305 int32_t savedInputIdx
= (int32_t)fp
->fExtra
[dataLoc
];
5306 U_ASSERT(savedInputIdx
<= fp
->fInputIdx
);
5307 if (savedInputIdx
< fp
->fInputIdx
) {
5308 fp
->fPatIdx
= opValue
; // JMP
5310 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no progress in loop.
5317 // Entering a lookahead block.
5318 // Save Stack Ptr, Input Pos.
5319 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5320 fData
[opValue
] = fStack
->size();
5321 fData
[opValue
+1] = fp
->fInputIdx
;
5322 fActiveStart
= fLookStart
; // Set the match region change for
5323 fActiveLimit
= fLookLimit
; // transparent bounds.
5329 // Leaving a look-ahead block.
5330 // restore Stack Ptr, Input Pos to positions they had on entry to block.
5331 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5332 int32_t stackSize
= fStack
->size();
5333 int32_t newStackSize
= (int32_t)fData
[opValue
];
5334 U_ASSERT(stackSize
>= newStackSize
);
5335 if (stackSize
> newStackSize
) {
5336 // Copy the current top frame back to the new (cut back) top frame.
5337 // This makes the capture groups from within the look-ahead
5338 // expression available.
5339 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
5341 for (i
=0; i
<fFrameSize
; i
++) {
5342 newFP
[i
] = ((int64_t *)fp
)[i
];
5344 fp
= (REStackFrame
*)newFP
;
5345 fStack
->setSize(newStackSize
);
5347 fp
->fInputIdx
= fData
[opValue
+1];
5349 // Restore the active region bounds in the input string; they may have
5350 // been changed because of transparent bounds on a Region.
5351 fActiveStart
= fRegionStart
;
5352 fActiveLimit
= fRegionLimit
;
5357 if (fp
->fInputIdx
< fActiveLimit
) {
5359 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
5360 if (u_foldCase(c
, U_FOLD_CASE_DEFAULT
) == opValue
) {
5366 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5370 // Case-insensitive test input against a literal string.
5371 // Strings require two slots in the compiled pattern, one for the
5372 // offset to the string text, and one for the length.
5373 // The compiled string has already been case folded.
5375 const UChar
*patternString
= litText
+ opValue
;
5377 op
= (int32_t)pat
[fp
->fPatIdx
];
5379 opType
= URX_TYPE(op
);
5380 opValue
= URX_VAL(op
);
5381 U_ASSERT(opType
== URX_STRING_LEN
);
5382 int32_t patternStringLen
= opValue
; // Length of the string from the pattern.
5386 UBool success
= TRUE
;
5387 int32_t patternStringIdx
= 0;
5388 CaseFoldingUCharIterator
inputIterator(inputBuf
, fp
->fInputIdx
, fActiveLimit
);
5389 while (patternStringIdx
< patternStringLen
) {
5390 U16_NEXT(patternString
, patternStringIdx
, patternStringLen
, cPattern
);
5391 cText
= inputIterator
.next();
5392 if (cText
!= cPattern
) {
5394 if (cText
== U_SENTINEL
) {
5400 if (inputIterator
.inExpansion()) {
5405 fp
->fInputIdx
= inputIterator
.getIndex();
5407 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5414 // Entering a look-behind block.
5415 // Save Stack Ptr, Input Pos.
5416 // TODO: implement transparent bounds. Ticket #6067
5417 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5418 fData
[opValue
] = fStack
->size();
5419 fData
[opValue
+1] = fp
->fInputIdx
;
5420 // Init the variable containing the start index for attempted matches.
5421 fData
[opValue
+2] = -1;
5422 // Save input string length, then reset to pin any matches to end at
5423 // the current position.
5424 fData
[opValue
+3] = fActiveLimit
;
5425 fActiveLimit
= fp
->fInputIdx
;
5432 // Positive Look-Behind, at top of loop checking for matches of LB expression
5433 // at all possible input starting positions.
5435 // Fetch the min and max possible match lengths. They are the operands
5436 // of this op in the pattern.
5437 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
5438 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
5439 U_ASSERT(minML
<= maxML
);
5440 U_ASSERT(minML
>= 0);
5442 // Fetch (from data) the last input index where a match was attempted.
5443 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5444 int64_t &lbStartIdx
= fData
[opValue
+2];
5445 if (lbStartIdx
< 0) {
5446 // First time through loop.
5447 lbStartIdx
= fp
->fInputIdx
- minML
;
5448 if (lbStartIdx
> 0) {
5449 U16_SET_CP_START(inputBuf
, 0, lbStartIdx
);
5452 // 2nd through nth time through the loop.
5453 // Back up start position for match by one.
5454 if (lbStartIdx
== 0) {
5457 U16_BACK_1(inputBuf
, 0, lbStartIdx
);
5461 if (lbStartIdx
< 0 || lbStartIdx
< fp
->fInputIdx
- maxML
) {
5462 // We have tried all potential match starting points without
5463 // getting a match. Backtrack out, and out of the
5464 // Look Behind altogether.
5465 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5466 int64_t restoreInputLen
= fData
[opValue
+3];
5467 U_ASSERT(restoreInputLen
>= fActiveLimit
);
5468 U_ASSERT(restoreInputLen
<= fInputLength
);
5469 fActiveLimit
= restoreInputLen
;
5473 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5474 // (successful match will fall off the end of the loop.)
5475 fp
= StateSave(fp
, fp
->fPatIdx
-3, status
);
5476 fp
->fInputIdx
= lbStartIdx
;
5481 // End of a look-behind block, after a successful match.
5483 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5484 if (fp
->fInputIdx
!= fActiveLimit
) {
5485 // The look-behind expression matched, but the match did not
5486 // extend all the way to the point that we are looking behind from.
5487 // FAIL out of here, which will take us back to the LB_CONT, which
5488 // will retry the match starting at another position or fail
5489 // the look-behind altogether, whichever is appropriate.
5490 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5494 // Look-behind match is good. Restore the original input string length,
5495 // which had been truncated to pin the end of the lookbehind match to the
5496 // position being looked-behind.
5497 int64_t originalInputLen
= fData
[opValue
+3];
5498 U_ASSERT(originalInputLen
>= fActiveLimit
);
5499 U_ASSERT(originalInputLen
<= fInputLength
);
5500 fActiveLimit
= originalInputLen
;
5507 // Negative Look-Behind, at top of loop checking for matches of LB expression
5508 // at all possible input starting positions.
5510 // Fetch the extra parameters of this op.
5511 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
5512 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
5513 int32_t continueLoc
= (int32_t)pat
[fp
->fPatIdx
++];
5514 continueLoc
= URX_VAL(continueLoc
);
5515 U_ASSERT(minML
<= maxML
);
5516 U_ASSERT(minML
>= 0);
5517 U_ASSERT(continueLoc
> fp
->fPatIdx
);
5519 // Fetch (from data) the last input index where a match was attempted.
5520 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5521 int64_t &lbStartIdx
= fData
[opValue
+2];
5522 if (lbStartIdx
< 0) {
5523 // First time through loop.
5524 lbStartIdx
= fp
->fInputIdx
- minML
;
5525 if (lbStartIdx
> 0) {
5526 U16_SET_CP_START(inputBuf
, 0, lbStartIdx
);
5529 // 2nd through nth time through the loop.
5530 // Back up start position for match by one.
5531 if (lbStartIdx
== 0) {
5532 lbStartIdx
--; // Because U16_BACK is unsafe starting at 0.
5534 U16_BACK_1(inputBuf
, 0, lbStartIdx
);
5538 if (lbStartIdx
< 0 || lbStartIdx
< fp
->fInputIdx
- maxML
) {
5539 // We have tried all potential match starting points without
5540 // getting a match, which means that the negative lookbehind as
5541 // a whole has succeeded. Jump forward to the continue location
5542 int64_t restoreInputLen
= fData
[opValue
+3];
5543 U_ASSERT(restoreInputLen
>= fActiveLimit
);
5544 U_ASSERT(restoreInputLen
<= fInputLength
);
5545 fActiveLimit
= restoreInputLen
;
5546 fp
->fPatIdx
= continueLoc
;
5550 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5551 // (successful match will cause a FAIL out of the loop altogether.)
5552 fp
= StateSave(fp
, fp
->fPatIdx
-4, status
);
5553 fp
->fInputIdx
= lbStartIdx
;
5558 // End of a negative look-behind block, after a successful match.
5560 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5561 if (fp
->fInputIdx
!= fActiveLimit
) {
5562 // The look-behind expression matched, but the match did not
5563 // extend all the way to the point that we are looking behind from.
5564 // FAIL out of here, which will take us back to the LB_CONT, which
5565 // will retry the match starting at another position or succeed
5566 // the look-behind altogether, whichever is appropriate.
5567 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5571 // Look-behind expression matched, which means look-behind test as
5574 // Restore the original input string length, which had been truncated
5575 // in order to pin the end of the lookbehind match
5576 // to the position being looked-behind.
5577 int64_t originalInputLen
= fData
[opValue
+3];
5578 U_ASSERT(originalInputLen
>= fActiveLimit
);
5579 U_ASSERT(originalInputLen
<= fInputLength
);
5580 fActiveLimit
= originalInputLen
;
5582 // Restore original stack position, discarding any state saved
5583 // by the successful pattern match.
5584 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5585 int32_t newStackSize
= (int32_t)fData
[opValue
];
5586 U_ASSERT(fStack
->size() > newStackSize
);
5587 fStack
->setSize(newStackSize
);
5589 // FAIL, which will take control back to someplace
5590 // prior to entering the look-behind test.
5591 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5597 // Loop Initialization for the optimized implementation of
5598 // [some character set]*
5599 // This op scans through all matching input.
5600 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5602 U_ASSERT(opValue
> 0 && opValue
< sets
->size());
5603 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
5604 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
5606 // Loop through input, until either the input is exhausted or
5607 // we reach a character that is not a member of the set.
5608 int32_t ix
= (int32_t)fp
->fInputIdx
;
5610 if (ix
>= fActiveLimit
) {
5615 U16_NEXT(inputBuf
, ix
, fActiveLimit
, c
);
5617 if (s8
->contains(c
) == FALSE
) {
5618 U16_BACK_1(inputBuf
, 0, ix
);
5622 if (s
->contains(c
) == FALSE
) {
5623 U16_BACK_1(inputBuf
, 0, ix
);
5629 // If there were no matching characters, skip over the loop altogether.
5630 // The loop doesn't run at all, a * op always succeeds.
5631 if (ix
== fp
->fInputIdx
) {
5632 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
5636 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5637 // must follow. Its operand is the stack location
5638 // that holds the starting input index for the match of this [set]*
5639 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
5640 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
5641 int32_t stackLoc
= URX_VAL(loopcOp
);
5642 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
5643 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
5646 // Save State to the URX_LOOP_C op that follows this one,
5647 // so that match failures in the following code will return to there.
5648 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5649 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5655 case URX_LOOP_DOT_I
:
5656 // Loop Initialization for the optimized implementation of .*
5657 // This op scans through all remaining input.
5658 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5660 // Loop through input until the input is exhausted (we reach an end-of-line)
5661 // In DOTALL mode, we can just go straight to the end of the input.
5663 if ((opValue
& 1) == 1) {
5664 // Dot-matches-All mode. Jump straight to the end of the string.
5665 ix
= (int32_t)fActiveLimit
;
5668 // NOT DOT ALL mode. Line endings do not match '.'
5669 // Scan forward until a line ending or end of input.
5670 ix
= (int32_t)fp
->fInputIdx
;
5672 if (ix
>= fActiveLimit
) {
5677 U16_NEXT(inputBuf
, ix
, fActiveLimit
, c
); // c = inputBuf[ix++]
5678 if ((c
& 0x7f) <= 0x29) { // Fast filter of non-new-line-s
5679 if ((c
== 0x0a) || // 0x0a is newline in both modes.
5680 (((opValue
& 2) == 0) && // IF not UNIX_LINES mode
5681 isLineTerminator(c
))) {
5682 // char is a line ending. Put the input pos back to the
5683 // line ending char, and exit the scanning loop.
5684 U16_BACK_1(inputBuf
, 0, ix
);
5691 // If there were no matching characters, skip over the loop altogether.
5692 // The loop doesn't run at all, a * op always succeeds.
5693 if (ix
== fp
->fInputIdx
) {
5694 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
5698 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5699 // must follow. Its operand is the stack location
5700 // that holds the starting input index for the match of this .*
5701 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
5702 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
5703 int32_t stackLoc
= URX_VAL(loopcOp
);
5704 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
5705 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
5708 // Save State to the URX_LOOP_C op that follows this one,
5709 // so that match failures in the following code will return to there.
5710 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5711 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5719 U_ASSERT(opValue
>=0 && opValue
<fFrameSize
);
5720 backSearchIndex
= (int32_t)fp
->fExtra
[opValue
];
5721 U_ASSERT(backSearchIndex
<= fp
->fInputIdx
);
5722 if (backSearchIndex
== fp
->fInputIdx
) {
5723 // We've backed up the input idx to the point that the loop started.
5724 // The loop is done. Leave here without saving state.
5725 // Subsequent failures won't come back here.
5728 // Set up for the next iteration of the loop, with input index
5729 // backed up by one from the last time through,
5730 // and a state save to this instruction in case the following code fails again.
5731 // (We're going backwards because this loop emulates stack unwinding, not
5732 // the initial scan forward.)
5733 U_ASSERT(fp
->fInputIdx
> 0);
5735 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, prevC
); // !!!: should this 0 be one of f*Limit?
5737 if (prevC
== 0x0a &&
5738 fp
->fInputIdx
> backSearchIndex
&&
5739 inputBuf
[fp
->fInputIdx
-1] == 0x0d) {
5740 int32_t prevOp
= (int32_t)pat
[fp
->fPatIdx
-2];
5741 if (URX_TYPE(prevOp
) == URX_LOOP_DOT_I
) {
5742 // .*, stepping back over CRLF pair.
5743 U16_BACK_1(inputBuf
, 0, fp
->fInputIdx
);
5748 fp
= StateSave(fp
, fp
->fPatIdx
-1, status
);
5755 // Trouble. The compiled pattern contains an entry with an
5756 // unrecognized type tag.
5760 if (U_FAILURE(status
)) {
5769 fLastMatchEnd
= fMatchEnd
;
5770 fMatchStart
= startIdx
;
5771 fMatchEnd
= fp
->fInputIdx
;
5774 #ifdef REGEX_RUN_DEBUG
5777 printf("Match. start=%ld end=%ld\n\n", fMatchStart
, fMatchEnd
);
5779 printf("No match\n\n");
5784 fFrame
= fp
; // The active stack frame when the engine stopped.
5785 // Contains the capture group results that we need to
5792 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher
)
5796 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS