2 **************************************************************************
3 * Copyright (C) 2002-2015 International Business Machines Corporation *
4 * and others. All rights reserved. *
5 **************************************************************************
10 // Contains the implementation of class RegexMatcher,
11 // which is one of the main API classes for the ICU regular expression package.
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
17 #include "unicode/regex.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ustring.h"
21 #include "unicode/rbbi.h"
22 #include "unicode/utf.h"
23 #include "unicode/utf16.h"
34 // #include <malloc.h> // Needed for heapcheck testing
// Default limit for the size of the back track stack, to avoid system
// failures caused by heap exhaustion. Units are in 32 bit words, not bytes.
// This value puts ICU's limits higher than most other regexp implementations,
// which use recursion rather than the heap, and take more storage per
// backtrack point.
static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;

// Time limit counter constant.
//   Time limits for expression evaluation are in terms of quanta of work by
//   the engine, each of which is 10,000 state saves.
//   This constant sets the number of state saves per tick.
static const int32_t TIMER_INITIAL_VALUE = 10000;
53 // Test for any of the Unicode line terminating characters.
54 static inline UBool
isLineTerminator(UChar32 c
) {
55 if (c
& ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
58 return (c
<=0x0d && c
>=0x0a) || c
==0x85 || c
==0x2028 || c
==0x2029;
61 //-----------------------------------------------------------------------------
63 // Constructor and Destructor
65 //-----------------------------------------------------------------------------
66 RegexMatcher::RegexMatcher(const RegexPattern
*pat
) {
67 fDeferredStatus
= U_ZERO_ERROR
;
68 init(fDeferredStatus
);
69 if (U_FAILURE(fDeferredStatus
)) {
73 fDeferredStatus
= U_ILLEGAL_ARGUMENT_ERROR
;
77 init2(RegexStaticSets::gStaticSets
->fEmptyText
, fDeferredStatus
);
82 RegexMatcher::RegexMatcher(const UnicodeString
®exp
, const UnicodeString
&input
,
83 uint32_t flags
, UErrorCode
&status
) {
85 if (U_FAILURE(status
)) {
89 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
90 fPattern
= fPatternOwned
;
92 UText inputText
= UTEXT_INITIALIZER
;
93 utext_openConstUnicodeString(&inputText
, &input
, &status
);
94 init2(&inputText
, status
);
95 utext_close(&inputText
);
97 fInputUniStrMaybeMutable
= TRUE
;
101 RegexMatcher::RegexMatcher(UText
*regexp
, UText
*input
,
102 uint32_t flags
, UErrorCode
&status
) {
104 if (U_FAILURE(status
)) {
108 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
109 if (U_FAILURE(status
)) {
113 fPattern
= fPatternOwned
;
114 init2(input
, status
);
118 RegexMatcher::RegexMatcher(const UnicodeString
®exp
,
119 uint32_t flags
, UErrorCode
&status
) {
121 if (U_FAILURE(status
)) {
125 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
126 if (U_FAILURE(status
)) {
129 fPattern
= fPatternOwned
;
130 init2(RegexStaticSets::gStaticSets
->fEmptyText
, status
);
133 RegexMatcher::RegexMatcher(UText
*regexp
,
134 uint32_t flags
, UErrorCode
&status
) {
136 if (U_FAILURE(status
)) {
140 fPatternOwned
= RegexPattern::compile(regexp
, flags
, pe
, status
);
141 if (U_FAILURE(status
)) {
145 fPattern
= fPatternOwned
;
146 init2(RegexStaticSets::gStaticSets
->fEmptyText
, status
);
// Destructor.  Releases what the matcher holds: the pattern it owns, the
// UText wrappers for the input, and the word break iterator.
// NOTE(review): several lines of this body are not visible in this excerpt.
152 RegexMatcher::~RegexMatcher() {
// fData gets special handling only when it does not point at the
// in-object fSmallData array.
154 if (fData
!= fSmallData
) {
// Delete the pattern owned by this matcher and clear the pointer.
159 delete fPatternOwned
;
160 fPatternOwned
= NULL
;
// Close the UText for the input and for the alternate input.
168 utext_close(fInputText
);
171 utext_close(fAltInputText
);
// The word break iterator is only deleted when break iteration is compiled in.
174 #if UCONFIG_NO_BREAK_ITERATION==0
175 delete fWordBreakItr
;
180 // init() common initialization for use by all constructors.
181 // Initialize all fields, get the object into a consistent state.
182 // This must be done even when the initial status shows an error,
183 // so that the object is initialized sufficiently well for the destructor
// to run.  The incoming status is copied into fDeferredStatus.
// NOTE(review): many field initializations are not visible in this excerpt.
186 void RegexMatcher::init(UErrorCode
&status
) {
// No owned pattern until a constructor compiles one.
188 fPatternOwned
= NULL
;
// Region bounds defaults: not transparent, anchoring.
198 fTransparentBounds
= FALSE
;
199 fAnchoringBounds
= TRUE
;
// Backtrack stack limit starts at the file-level default capacity.
212 fStackLimit
= DEFAULT_BACKTRACK_STACK_CAPACITY
;
// No callbacks installed.
214 fCallbackContext
= NULL
;
215 fFindProgressCallbackFn
= NULL
;
216 fFindProgressCallbackContext
= NULL
;
// Remember the status the constructor was entered with.
218 fDeferredStatus
= status
;
220 fWordBreakItr
= NULL
;
224 fAltInputText
= NULL
;
227 fInputUniStrMaybeMutable
= FALSE
;
// Record an incoming failure in fDeferredStatus.
229 if (U_FAILURE(status
)) {
230 fDeferredStatus
= status
;
235 // init2() Common initialization for use by RegexMatcher constructors, part 2.
236 // This handles the common setup to be done after the Pattern is available.
238 void RegexMatcher::init2(UText
*input
, UErrorCode
&status
) {
239 if (U_FAILURE(status
)) {
240 fDeferredStatus
= status
;
244 if (fPattern
->fDataSize
> (int32_t)(sizeof(fSmallData
)/sizeof(fSmallData
[0]))) {
245 fData
= (int64_t *)uprv_malloc(fPattern
->fDataSize
* sizeof(int64_t));
247 status
= fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
252 fStack
= new UVector64(status
);
253 if (fStack
== NULL
) {
254 status
= fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
259 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY
, status
);
260 if (U_FAILURE(status
)) {
261 fDeferredStatus
= status
;
267 static const UChar BACKSLASH
= 0x5c;
268 static const UChar DOLLARSIGN
= 0x24;
269 static const UChar LEFTBRACKET
= 0x7b;
270 static const UChar RIGHTBRACKET
= 0x7d;
272 //--------------------------------------------------------------------------------
276 //--------------------------------------------------------------------------------
277 RegexMatcher
&RegexMatcher::appendReplacement(UnicodeString
&dest
,
278 const UnicodeString
&replacement
,
279 UErrorCode
&status
) {
280 UText replacementText
= UTEXT_INITIALIZER
;
282 utext_openConstUnicodeString(&replacementText
, &replacement
, &status
);
283 if (U_SUCCESS(status
)) {
284 UText resultText
= UTEXT_INITIALIZER
;
285 utext_openUnicodeString(&resultText
, &dest
, &status
);
287 if (U_SUCCESS(status
)) {
288 appendReplacement(&resultText
, &replacementText
, status
);
289 utext_close(&resultText
);
291 utext_close(&replacementText
);
298 // appendReplacement, UText mode
// Appends to dest the input text lying between the previous append position
// and the start of the current match, then the replacement text with
// backslash escapes and $ capture group references processed.
// Sets U_REGEX_INVALID_STATE when there is no current match (fMatch == FALSE).
// NOTE(review): the `replacement` parameter line of the signature and several
// body lines are not visible in this excerpt.
300 RegexMatcher
&RegexMatcher::appendReplacement(UText
*dest
,
302 UErrorCode
&status
) {
303 if (U_FAILURE(status
)) {
// A failure deferred from construction is reported through status.
306 if (U_FAILURE(fDeferredStatus
)) {
307 status
= fDeferredStatus
;
310 if (fMatch
== FALSE
) {
311 status
= U_REGEX_INVALID_STATE
;
315 // Copy input string from the end of previous match to start of current match
316 int64_t destLen
= utext_nativeLength(dest
);
317 if (fMatchStart
> fAppendPosition
) {
// When the whole input is in the UText chunk, append straight from the chunk buffer.
318 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
319 destLen
+= utext_replace(dest
, destLen
, destLen
, fInputText
->chunkContents
+fAppendPosition
,
320 (int32_t)(fMatchStart
-fAppendPosition
), &status
);
// Otherwise work out the UTF-16 length of the span and extract it to a temporary buffer.
323 if (UTEXT_USES_U16(fInputText
)) {
324 len16
= (int32_t)(fMatchStart
-fAppendPosition
);
// Preflight extract (NULL buffer) to obtain the length; its status is kept separate.
326 UErrorCode lengthStatus
= U_ZERO_ERROR
;
327 len16
= utext_extract(fInputText
, fAppendPosition
, fMatchStart
, NULL
, 0, &lengthStatus
);
329 UChar
*inputChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(len16
+1));
330 if (inputChars
== NULL
) {
331 status
= U_MEMORY_ALLOCATION_ERROR
;
334 utext_extract(fInputText
, fAppendPosition
, fMatchStart
, inputChars
, len16
+1, &status
);
335 destLen
+= utext_replace(dest
, destLen
, destLen
, inputChars
, len16
, &status
);
336 uprv_free(inputChars
);
// The next append will start after the current match.
339 fAppendPosition
= fMatchEnd
;
342 // scan the replacement text, looking for substitutions ($n) and \escapes.
343 // TODO: optimize this loop by efficiently scanning for '$' or '\',
344 // move entire ranges not containing substitutions.
345 UTEXT_SETNATIVEINDEX(replacement
, 0);
346 for (UChar32 c
= UTEXT_NEXT32(replacement
); U_SUCCESS(status
) && c
!= U_SENTINEL
; c
= UTEXT_NEXT32(replacement
)) {
347 if (c
== BACKSLASH
) {
348 // Backslash Escape. Copy the following char out without further checks.
349 // Note: Surrogate pairs don't need any special handling
350 // The second half won't be a '$' or a '\', and
351 // will move to the dest normally on the next loop iteration.
353 c
= UTEXT_CURRENT32(replacement
);
354 if (c
== U_SENTINEL
) {
358 if (c
==0x55/*U*/ || c
==0x75/*u*/) {
359 // We have a \udddd or \Udddddddd escape sequence.
361 struct URegexUTextUnescapeCharContext context
= U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement
);
362 UChar32 escapedChar
= u_unescapeAt(uregex_utext_unescape_charAt
, &offset
, INT32_MAX
, &context
);
363 if (escapedChar
!= (UChar32
)0xFFFFFFFF) {
// BMP code points are appended as one UChar, others as a surrogate pair.
364 if (U_IS_BMP(escapedChar
)) {
365 UChar c16
= (UChar
)escapedChar
;
366 destLen
+= utext_replace(dest
, destLen
, destLen
, &c16
, 1, &status
);
369 surrogate
[0] = U16_LEAD(escapedChar
);
370 surrogate
[1] = U16_TRAIL(escapedChar
);
371 if (U_SUCCESS(status
)) {
372 destLen
+= utext_replace(dest
, destLen
, destLen
, surrogate
, 2, &status
);
375 // TODO: Report errors for mal-formed \u escapes?
376 // As this is, the original sequence is output, which may be OK.
// Re-position the replacement iterator based on how far the unescape read.
377 if (context
.lastOffset
== offset
) {
378 (void)UTEXT_PREVIOUS32(replacement
);
379 } else if (context
.lastOffset
!= offset
-1) {
380 utext_moveIndex32(replacement
, offset
- context
.lastOffset
- 1);
384 (void)UTEXT_NEXT32(replacement
);
385 // Plain backslash escape. Just put out the escaped character.
387 UChar c16
= (UChar
)c
;
388 destLen
+= utext_replace(dest
, destLen
, destLen
, &c16
, 1, &status
);
391 surrogate
[0] = U16_LEAD(c
);
392 surrogate
[1] = U16_TRAIL(c
);
393 if (U_SUCCESS(status
)) {
394 destLen
+= utext_replace(dest
, destLen
, destLen
, surrogate
, 2, &status
);
398 } else if (c
!= DOLLARSIGN
) {
399 // Normal char, not a $. Copy it out without further checks.
401 UChar c16
= (UChar
)c
;
402 destLen
+= utext_replace(dest
, destLen
, destLen
, &c16
, 1, &status
);
405 surrogate
[0] = U16_LEAD(c
);
406 surrogate
[1] = U16_TRAIL(c
);
407 if (U_SUCCESS(status
)) {
408 destLen
+= utext_replace(dest
, destLen
, destLen
, surrogate
, 2, &status
);
412 // We've got a $. Pick up a capture group name or number if one follows.
413 // Consume digits so long as the resulting group number <= the
414 // number of capture groups in the pattern.
416 int32_t groupNum
= 0;
417 int32_t numDigits
= 0;
418 UChar32 nextChar
= utext_current32(replacement
);
419 if (nextChar
== LEFTBRACKET
) {
420 // Scan for a Named Capture Group, ${name}.
421 UnicodeString groupName
;
422 utext_next32(replacement
);
423 while(U_SUCCESS(status
) && nextChar
!= RIGHTBRACKET
) {
424 nextChar
= utext_next32(replacement
);
// End of replacement text before the closing '}' is an error.
425 if (nextChar
== U_SENTINEL
) {
426 status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
427 } else if ((nextChar
>= 0x41 && nextChar
<= 0x5a) || // A..Z
428 (nextChar
>= 0x61 && nextChar
<= 0x7a) || // a..z
429 (nextChar
>= 0x31 && nextChar
<= 0x39)) { // 1..9 as written. NOTE(review): '0' (0x30) is rejected although this was labeled 0..9 - confirm intent.
430 groupName
.append(nextChar
);
431 } else if (nextChar
== RIGHTBRACKET
) {
// Closing '}': look the collected name up in the pattern's named capture map.
432 groupNum
= uhash_geti(fPattern
->fNamedCaptureMap
, &groupName
);
434 status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
437 // Character was something other than a name char or a closing '}'
438 status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
442 } else if (u_isdigit(nextChar
)) {
443 // $n Scan for a capture group number
444 int32_t numCaptureGroups
= fPattern
->fGroupMap
->size();
446 nextChar
= UTEXT_CURRENT32(replacement
);
447 if (nextChar
== U_SENTINEL
) {
450 if (u_isdigit(nextChar
) == FALSE
) {
453 int32_t nextDigitVal
= u_charDigitValue(nextChar
);
454 if (groupNum
*10 + nextDigitVal
> numCaptureGroups
) {
455 // Don't consume the next digit if it makes the capture group number too big.
// If not even the first digit fits, the group number is out of range.
456 if (numDigits
== 0) {
457 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
461 (void)UTEXT_NEXT32(replacement
);
462 groupNum
=groupNum
*10 + nextDigitVal
;
466 // $ not followed by capture group name or number.
467 status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
// Append the text of the selected capture group to dest.
470 if (U_SUCCESS(status
)) {
471 destLen
+= appendGroup(groupNum
, dest
, status
);
473 } // End of $ capture group handling
474 } // End of per-character loop through the replacement string.
481 //--------------------------------------------------------------------------------
483 // appendTail Intended to be used in conjunction with appendReplacement()
484 // To the destination string, append everything following
485 // the last match position from the input string.
487 // Note: Match ranges do not affect appendTail or appendReplacement
489 //--------------------------------------------------------------------------------
490 UnicodeString
&RegexMatcher::appendTail(UnicodeString
&dest
) {
491 UErrorCode status
= U_ZERO_ERROR
;
492 UText resultText
= UTEXT_INITIALIZER
;
493 utext_openUnicodeString(&resultText
, &dest
, &status
);
495 if (U_SUCCESS(status
)) {
496 appendTail(&resultText
, status
);
497 utext_close(&resultText
);
504 // appendTail, UText mode
506 UText
*RegexMatcher::appendTail(UText
*dest
, UErrorCode
&status
) {
507 if (U_FAILURE(status
)) {
510 if (U_FAILURE(fDeferredStatus
)) {
511 status
= fDeferredStatus
;
515 if (fInputLength
> fAppendPosition
) {
516 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
517 int64_t destLen
= utext_nativeLength(dest
);
518 utext_replace(dest
, destLen
, destLen
, fInputText
->chunkContents
+fAppendPosition
,
519 (int32_t)(fInputLength
-fAppendPosition
), &status
);
522 if (UTEXT_USES_U16(fInputText
)) {
523 len16
= (int32_t)(fInputLength
-fAppendPosition
);
525 len16
= utext_extract(fInputText
, fAppendPosition
, fInputLength
, NULL
, 0, &status
);
526 status
= U_ZERO_ERROR
; // buffer overflow
529 UChar
*inputChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(len16
));
530 if (inputChars
== NULL
) {
531 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
533 utext_extract(fInputText
, fAppendPosition
, fInputLength
, inputChars
, len16
, &status
); // unterminated
534 int64_t destLen
= utext_nativeLength(dest
);
535 utext_replace(dest
, destLen
, destLen
, inputChars
, len16
, &status
);
536 uprv_free(inputChars
);
545 //--------------------------------------------------------------------------------
549 //--------------------------------------------------------------------------------
550 int32_t RegexMatcher::end(UErrorCode
&err
) const {
554 int64_t RegexMatcher::end64(UErrorCode
&err
) const {
555 return end64(0, err
);
558 int64_t RegexMatcher::end64(int32_t group
, UErrorCode
&err
) const {
559 if (U_FAILURE(err
)) {
562 if (fMatch
== FALSE
) {
563 err
= U_REGEX_INVALID_STATE
;
566 if (group
< 0 || group
> fPattern
->fGroupMap
->size()) {
567 err
= U_INDEX_OUTOFBOUNDS_ERROR
;
574 // Get the position within the stack frame of the variables for
575 // this capture group.
576 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(group
-1);
577 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
578 U_ASSERT(groupOffset
>= 0);
579 e
= fFrame
->fExtra
[groupOffset
+ 1];
585 int32_t RegexMatcher::end(int32_t group
, UErrorCode
&err
) const {
586 return (int32_t)end64(group
, err
);
589 //--------------------------------------------------------------------------------
591 // findProgressInterrupt This function is called once for each advance in the target
592 // string from the find() function, and calls the user progress callback
593 // function if there is one installed.
595 // Return: TRUE if the find operation is to be terminated.
596 // FALSE if the find operation is to continue running.
598 //--------------------------------------------------------------------------------
599 UBool
RegexMatcher::findProgressInterrupt(int64_t pos
, UErrorCode
&status
) {
600 if (fFindProgressCallbackFn
&& !(*fFindProgressCallbackFn
)(fFindProgressCallbackContext
, pos
)) {
601 status
= U_REGEX_STOPPED_BY_CALLER
;
607 //--------------------------------------------------------------------------------
611 //--------------------------------------------------------------------------------
612 UBool
RegexMatcher::find() {
613 if (U_FAILURE(fDeferredStatus
)) {
616 UErrorCode status
= U_ZERO_ERROR
;
617 UBool result
= find(status
);
621 //--------------------------------------------------------------------------------
625 //--------------------------------------------------------------------------------
// find(status)   Find the next match, starting from the end of the previous one.
// Delegates to findUsingChunk() when the whole input is available in the UText
// chunk; otherwise walks the UText, with a strategy chosen by fPattern->fStartType.
// NOTE(review): case labels, loop headers and return statements are not
// visible in this excerpt.
626 UBool
RegexMatcher::find(UErrorCode
&status
) {
627 // Start at the position of the last match end. (Will be zero if the
628 // matcher has been reset.)
630 if (U_FAILURE(status
)) {
// A failure deferred from construction is reported through status.
633 if (U_FAILURE(fDeferredStatus
)) {
634 status
= fDeferredStatus
;
// Fast path: the entire input is in the chunk buffer.
638 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
639 return findUsingChunk(status
);
642 int64_t startPos
= fMatchEnd
;
644 startPos
= fActiveStart
;
648 // Save the position of any previous successful match.
649 fLastMatchEnd
= fMatchEnd
;
651 if (fMatchStart
== fMatchEnd
) {
652 // Previous match had zero length. Move start position up one position
653 // to avoid sending find() into a loop on zero-length matches.
654 if (startPos
>= fActiveLimit
) {
// Advance startPos by one code point.
659 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
660 (void)UTEXT_NEXT32(fInputText
);
661 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
664 if (fLastMatchEnd
>= 0) {
665 // A previous find() failed to match. Don't try again.
666 // (without this test, a pattern with a zero-length match
667 // could match again at the end of an input string.)
674 // Compute the position in the input string beyond which a match can not begin, because
675 // the minimum length match would extend past the end of the input.
676 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
677 // Be aware of possible overflows if making changes here.
678 int64_t testStartLimit
;
679 if (UTEXT_USES_U16(fInputText
)) {
680 testStartLimit
= fActiveLimit
- fPattern
->fMinMatchLen
;
681 if (startPos
> testStartLimit
) {
687 // We don't know exactly how long the minimum match length is in native characters.
688 // Treat anything > 0 as 1.
689 testStartLimit
= fActiveLimit
- (fPattern
->fMinMatchLen
> 0 ? 1 : 0);
693 U_ASSERT(startPos
>= 0);
// Dispatch on the start-of-match optimization recorded for the pattern.
695 switch (fPattern
->fStartType
) {
697 // No optimization was found.
698 // Try a match at each input position.
700 MatchAt(startPos
, FALSE
, status
);
701 if (U_FAILURE(status
)) {
707 if (startPos
>= testStartLimit
) {
// Advance startPos by one code point.
711 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
712 (void)UTEXT_NEXT32(fInputText
);
713 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
714 // Note that it's perfectly OK for a pattern to have a zero-length
715 // match at the end of a string, so we must make sure that the loop
716 // runs with startPos == testStartLimit the last time through.
717 if (findProgressInterrupt(startPos
, status
))
723 // Matches are only possible at the start of the input string
724 // (pattern begins with ^ or \A)
725 if (startPos
> fActiveStart
) {
729 MatchAt(startPos
, FALSE
, status
);
730 if (U_FAILURE(status
)) {
738 // Match may start on any char from a pre-computed set.
739 U_ASSERT(fPattern
->fMinMatchLen
> 0);
740 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
// pos remembers where the character just read began; startPos moves past it.
742 int64_t pos
= startPos
;
743 c
= UTEXT_NEXT32(fInputText
);
744 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
745 // c will be -1 (U_SENTINEL) at end of text, in which case we
746 // skip this next block (so we don't have a negative array index)
747 // and handle end of text in the following block.
748 if (c
>= 0 && ((c
<256 && fPattern
->fInitialChars8
->contains(c
)) ||
749 (c
>=256 && fPattern
->fInitialChars
->contains(c
)))) {
750 MatchAt(pos
, FALSE
, status
);
751 if (U_FAILURE(status
)) {
757 UTEXT_SETNATIVEINDEX(fInputText
, pos
);
759 if (startPos
> testStartLimit
) {
764 if (findProgressInterrupt(startPos
, status
))
773 // Match starts on exactly one char.
774 U_ASSERT(fPattern
->fMinMatchLen
> 0);
775 UChar32 theChar
= fPattern
->fInitialChar
;
776 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
778 int64_t pos
= startPos
;
779 c
= UTEXT_NEXT32(fInputText
);
780 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
782 MatchAt(pos
, FALSE
, status
);
783 if (U_FAILURE(status
)) {
789 UTEXT_SETNATIVEINDEX(fInputText
, pos
);
791 if (startPos
> testStartLimit
) {
796 if (findProgressInterrupt(startPos
, status
))
// Line-oriented search: try at the anchor start, then after each line terminator.
805 if (startPos
== fAnchorStart
) {
806 MatchAt(startPos
, FALSE
, status
);
807 if (U_FAILURE(status
)) {
813 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
814 c
= UTEXT_NEXT32(fInputText
);
815 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
// Read the character preceding startPos, then restore the iterator position.
817 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
818 c
= UTEXT_PREVIOUS32(fInputText
);
819 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
// Separate handling when the pattern was compiled with UREGEX_UNIX_LINES.
822 if (fPattern
->fFlags
& UREGEX_UNIX_LINES
) {
825 MatchAt(startPos
, FALSE
, status
);
826 if (U_FAILURE(status
)) {
832 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
834 if (startPos
>= testStartLimit
) {
839 c
= UTEXT_NEXT32(fInputText
);
840 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
841 // Note that it's perfectly OK for a pattern to have a zero-length
842 // match at the end of a string, so we must make sure that the loop
843 // runs with startPos == testStartLimit the last time through.
844 if (findProgressInterrupt(startPos
, status
))
849 if (isLineTerminator(c
)) {
// Treat CR LF as a single terminator: step over the LF that follows a CR.
850 if (c
== 0x0d && startPos
< fActiveLimit
&& UTEXT_CURRENT32(fInputText
) == 0x0a) {
851 (void)UTEXT_NEXT32(fInputText
);
852 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
854 MatchAt(startPos
, FALSE
, status
);
855 if (U_FAILURE(status
)) {
861 UTEXT_SETNATIVEINDEX(fInputText
, startPos
);
863 if (startPos
>= testStartLimit
) {
868 c
= UTEXT_NEXT32(fInputText
);
869 startPos
= UTEXT_GETNATIVEINDEX(fInputText
);
870 // Note that it's perfectly OK for a pattern to have a zero-length
871 // match at the end of a string, so we must make sure that the loop
872 // runs with startPos == testStartLimit the last time through.
873 if (findProgressInterrupt(startPos
, status
))
889 UBool
RegexMatcher::find(int64_t start
, UErrorCode
&status
) {
890 if (U_FAILURE(status
)) {
893 if (U_FAILURE(fDeferredStatus
)) {
894 status
= fDeferredStatus
;
897 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
898 // This will reset the region to be the full input length.
900 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
904 int64_t nativeStart
= start
;
905 if (nativeStart
< fActiveStart
|| nativeStart
> fActiveLimit
) {
906 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
909 fMatchEnd
= nativeStart
;
914 //--------------------------------------------------------------------------------
916 // findUsingChunk() -- like find(), but with the advance knowledge that the
917 // entire string is available in the UText's chunk buffer.
919 //--------------------------------------------------------------------------------
// findUsingChunk(status)   Same search as find(status), but indexes the UTF-16
// chunk buffer (inputBuf) directly with 32 bit positions rather than going
// through the UText iteration macros.
// NOTE(review): case labels, loop headers and return statements are not
// visible in this excerpt.
920 UBool
RegexMatcher::findUsingChunk(UErrorCode
&status
) {
921 // Start at the position of the last match end. (Will be zero if the
922 // matcher has been reset.
925 int32_t startPos
= (int32_t)fMatchEnd
;
927 startPos
= (int32_t)fActiveStart
;
// Direct pointer to the UTF-16 contents of the input's chunk.
930 const UChar
*inputBuf
= fInputText
->chunkContents
;
933 // Save the position of any previous successful match.
934 fLastMatchEnd
= fMatchEnd
;
936 if (fMatchStart
== fMatchEnd
) {
937 // Previous match had zero length. Move start position up one position
938 // to avoid sending find() into a loop on zero-length matches.
939 if (startPos
>= fActiveLimit
) {
// Advance startPos by one code point.
944 U16_FWD_1(inputBuf
, startPos
, fInputLength
);
947 if (fLastMatchEnd
>= 0) {
948 // A previous find() failed to match. Don't try again.
949 // (without this test, a pattern with a zero-length match
950 // could match again at the end of an input string.)
957 // Compute the position in the input string beyond which a match can not begin, because
958 // the minimum length match would extend past the end of the input.
959 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
960 // Be aware of possible overflows if making changes here.
961 // Note: a match can begin at inputBuf + testLen; it is an inclusive limit.
962 int32_t testLen
= (int32_t)(fActiveLimit
- fPattern
->fMinMatchLen
);
963 if (startPos
> testLen
) {
970 U_ASSERT(startPos
>= 0);
// Dispatch on the start-of-match optimization recorded for the pattern.
972 switch (fPattern
->fStartType
) {
974 // No optimization was found.
975 // Try a match at each input position.
977 MatchChunkAt(startPos
, FALSE
, status
);
978 if (U_FAILURE(status
)) {
984 if (startPos
>= testLen
) {
988 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
989 // Note that it's perfectly OK for a pattern to have a zero-length
990 // match at the end of a string, so we must make sure that the loop
991 // runs with startPos == testLen the last time through.
992 if (findProgressInterrupt(startPos
, status
))
998 // Matches are only possible at the start of the input string
999 // (pattern begins with ^ or \A)
1000 if (startPos
> fActiveStart
) {
1004 MatchChunkAt(startPos
, FALSE
, status
);
1005 if (U_FAILURE(status
)) {
1013 // Match may start on any char from a pre-computed set.
1014 U_ASSERT(fPattern
->fMinMatchLen
> 0);
// pos remembers where the character just read began; startPos moves past it.
1016 int32_t pos
= startPos
;
1017 U16_NEXT(inputBuf
, startPos
, fActiveLimit
, c
); // like c = inputBuf[startPos++];
1018 if ((c
<256 && fPattern
->fInitialChars8
->contains(c
)) ||
1019 (c
>=256 && fPattern
->fInitialChars
->contains(c
))) {
1020 MatchChunkAt(pos
, FALSE
, status
);
1021 if (U_FAILURE(status
)) {
1028 if (startPos
> testLen
) {
1033 if (findProgressInterrupt(startPos
, status
))
1042 // Match starts on exactly one char.
1043 U_ASSERT(fPattern
->fMinMatchLen
> 0);
1044 UChar32 theChar
= fPattern
->fInitialChar
;
1046 int32_t pos
= startPos
;
1047 U16_NEXT(inputBuf
, startPos
, fActiveLimit
, c
); // like c = inputBuf[startPos++];
1049 MatchChunkAt(pos
, FALSE
, status
);
1050 if (U_FAILURE(status
)) {
1057 if (startPos
> testLen
) {
1062 if (findProgressInterrupt(startPos
, status
))
// Line-oriented search: try at the anchor start, then after each line terminator.
1071 if (startPos
== fAnchorStart
) {
1072 MatchChunkAt(startPos
, FALSE
, status
);
1073 if (U_FAILURE(status
)) {
1079 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
// Separate handling when the pattern was compiled with UREGEX_UNIX_LINES.
1082 if (fPattern
->fFlags
& UREGEX_UNIX_LINES
) {
// c is the code unit immediately before startPos.
1084 c
= inputBuf
[startPos
-1];
1086 MatchChunkAt(startPos
, FALSE
, status
);
1087 if (U_FAILURE(status
)) {
1094 if (startPos
>= testLen
) {
1099 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
1100 // Note that it's perfectly OK for a pattern to have a zero-length
1101 // match at the end of a string, so we must make sure that the loop
1102 // runs with startPos == testLen the last time through.
1103 if (findProgressInterrupt(startPos
, status
))
1108 c
= inputBuf
[startPos
-1];
1109 if (isLineTerminator(c
)) {
// CR immediately followed by LF is detected here.
1110 if (c
== 0x0d && startPos
< fActiveLimit
&& inputBuf
[startPos
] == 0x0a) {
1113 MatchChunkAt(startPos
, FALSE
, status
);
1114 if (U_FAILURE(status
)) {
1121 if (startPos
>= testLen
) {
1126 U16_FWD_1(inputBuf
, startPos
, fActiveLimit
);
1127 // Note that it's perfectly OK for a pattern to have a zero-length
1128 // match at the end of a string, so we must make sure that the loop
1129 // runs with startPos == testLen the last time through.
1130 if (findProgressInterrupt(startPos
, status
))
1146 //--------------------------------------------------------------------------------
1150 //--------------------------------------------------------------------------------
1151 UnicodeString
RegexMatcher::group(UErrorCode
&status
) const {
1152 return group(0, status
);
1155 // Return immutable shallow clone
1156 UText
*RegexMatcher::group(UText
*dest
, int64_t &group_len
, UErrorCode
&status
) const {
1157 return group(0, dest
, group_len
, status
);
1160 // Return immutable shallow clone
1161 UText
*RegexMatcher::group(int32_t groupNum
, UText
*dest
, int64_t &group_len
, UErrorCode
&status
) const {
1163 if (U_FAILURE(status
)) {
1166 if (U_FAILURE(fDeferredStatus
)) {
1167 status
= fDeferredStatus
;
1168 } else if (fMatch
== FALSE
) {
1169 status
= U_REGEX_INVALID_STATE
;
1170 } else if (groupNum
< 0 || groupNum
> fPattern
->fGroupMap
->size()) {
1171 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1174 if (U_FAILURE(status
)) {
1179 if (groupNum
== 0) {
1183 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(groupNum
-1);
1184 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
1185 U_ASSERT(groupOffset
>= 0);
1186 s
= fFrame
->fExtra
[groupOffset
];
1187 e
= fFrame
->fExtra
[groupOffset
+1];
1191 // A capture group wasn't part of the match
1192 return utext_clone(dest
, fInputText
, FALSE
, TRUE
, &status
);
1197 dest
= utext_clone(dest
, fInputText
, FALSE
, TRUE
, &status
);
1199 UTEXT_SETNATIVEINDEX(dest
, s
);
1203 UnicodeString
RegexMatcher::group(int32_t groupNum
, UErrorCode
&status
) const {
1204 UnicodeString result
;
1205 int64_t groupStart
= start64(groupNum
, status
);
1206 int64_t groupEnd
= end64(groupNum
, status
);
1207 if (U_FAILURE(status
) || groupStart
== -1 || groupStart
== groupEnd
) {
1211 // Get the group length using a utext_extract preflight.
1212 // UText is actually pretty efficient at this when underlying encoding is UTF-16.
1213 int32_t length
= utext_extract(fInputText
, groupStart
, groupEnd
, NULL
, 0, &status
);
1214 if (status
!= U_BUFFER_OVERFLOW_ERROR
) {
1218 status
= U_ZERO_ERROR
;
1219 UChar
*buf
= result
.getBuffer(length
);
1221 status
= U_MEMORY_ALLOCATION_ERROR
;
1223 int32_t extractLength
= utext_extract(fInputText
, groupStart
, groupEnd
, buf
, length
, &status
);
1224 result
.releaseBuffer(extractLength
);
1225 U_ASSERT(length
== extractLength
);
1231 //--------------------------------------------------------------------------------
1233 // appendGroup() -- currently internal only, appends a group to a UText rather
1234 // than replacing its contents
1236 //--------------------------------------------------------------------------------
1238 int64_t RegexMatcher::appendGroup(int32_t groupNum
, UText
*dest
, UErrorCode
&status
) const {
1239 if (U_FAILURE(status
)) {
1242 if (U_FAILURE(fDeferredStatus
)) {
1243 status
= fDeferredStatus
;
1246 int64_t destLen
= utext_nativeLength(dest
);
1248 if (fMatch
== FALSE
) {
1249 status
= U_REGEX_INVALID_STATE
;
1250 return utext_replace(dest
, destLen
, destLen
, NULL
, 0, &status
);
1252 if (groupNum
< 0 || groupNum
> fPattern
->fGroupMap
->size()) {
1253 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1254 return utext_replace(dest
, destLen
, destLen
, NULL
, 0, &status
);
1258 if (groupNum
== 0) {
1262 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(groupNum
-1);
1263 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
1264 U_ASSERT(groupOffset
>= 0);
1265 s
= fFrame
->fExtra
[groupOffset
];
1266 e
= fFrame
->fExtra
[groupOffset
+1];
1270 // A capture group wasn't part of the match
1271 return utext_replace(dest
, destLen
, destLen
, NULL
, 0, &status
);
1276 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1277 U_ASSERT(e
<= fInputLength
);
1278 deltaLen
= utext_replace(dest
, destLen
, destLen
, fInputText
->chunkContents
+s
, (int32_t)(e
-s
), &status
);
1281 if (UTEXT_USES_U16(fInputText
)) {
1282 len16
= (int32_t)(e
-s
);
1284 UErrorCode lengthStatus
= U_ZERO_ERROR
;
1285 len16
= utext_extract(fInputText
, s
, e
, NULL
, 0, &lengthStatus
);
1287 UChar
*groupChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(len16
+1));
1288 if (groupChars
== NULL
) {
1289 status
= U_MEMORY_ALLOCATION_ERROR
;
1292 utext_extract(fInputText
, s
, e
, groupChars
, len16
+1, &status
);
1294 deltaLen
= utext_replace(dest
, destLen
, destLen
, groupChars
, len16
, &status
);
1295 uprv_free(groupChars
);
1302 //--------------------------------------------------------------------------------
1306 //--------------------------------------------------------------------------------
1307 int32_t RegexMatcher::groupCount() const {
1308 return fPattern
->fGroupMap
->size();
1311 //--------------------------------------------------------------------------------
1313 // hasAnchoringBounds()
1315 //--------------------------------------------------------------------------------
1316 UBool
RegexMatcher::hasAnchoringBounds() const {
1317 return fAnchoringBounds
;
1321 //--------------------------------------------------------------------------------
1323 // hasTransparentBounds()
1325 //--------------------------------------------------------------------------------
1326 UBool
RegexMatcher::hasTransparentBounds() const {
1327 return fTransparentBounds
;
1332 //--------------------------------------------------------------------------------
1336 //--------------------------------------------------------------------------------
// hitEnd() -- const query returning a UBool.
// NOTE(review): presumably reports whether the most recent match attempt examined
// input up to the end of the text; confirm against the function body.
1337 UBool
RegexMatcher::hitEnd() const {
1342 //--------------------------------------------------------------------------------
1346 //--------------------------------------------------------------------------------
// input() -- returns the input text as a const UnicodeString reference.
// Builds a UnicodeString copy of fInputText holding len16 UTF-16 code units and
// stores the pointer to it in fInput.
1347 const UnicodeString
&RegexMatcher::input() const {
1349 UErrorCode status
= U_ZERO_ERROR
;
// Work out the UTF-16 length of the input. For UTF-16 indexed text the native
// length is used directly; otherwise a preflighting utext_extract() call with a
// NULL buffer reports the required length.
1351 if (UTEXT_USES_U16(fInputText
)) {
1352 len16
= (int32_t)fInputLength
;
1354 len16
= utext_extract(fInputText
, 0, fInputLength
, NULL
, 0, &status
);
1355 status
= U_ZERO_ERROR
; // overflow, length status
// NOTE(review): the pointer returned by new is used below without a NULL check;
// confirm how allocation failure is expected to be handled here.
1357 UnicodeString
*result
= new UnicodeString(len16
, 0, 0);
// Fill the string's own buffer in place, then commit the length.
1359 UChar
*inputChars
= result
->getBuffer(len16
);
1360 utext_extract(fInputText
, 0, fInputLength
, inputChars
, len16
, &status
); // unterminated warning
1361 result
->releaseBuffer(len16
);
// This is a const member function, so the address of fInput is cast to allow
// the pointer to be stored.
1363 (*(const UnicodeString
**)&fInput
) = result
; // pointer assignment, rather than operator=
1369 //--------------------------------------------------------------------------------
1373 //--------------------------------------------------------------------------------
// inputText() -- const accessor returning a UText pointer.
// NOTE(review): presumably returns the matcher's own input UText (fInputText)
// without copying; confirm against the function body.
1374 UText
*RegexMatcher::inputText() const {
1379 //--------------------------------------------------------------------------------
1381 // getInput() -- like inputText(), but makes a clone or copies into another UText
1383 //--------------------------------------------------------------------------------
// getInput(dest, status)
//   Copies the input text into the caller-supplied UText dest by replacing dest's
//   entire contents, or produces a clone of fInputText via utext_clone().
//   NOTE(review): the clone path is presumably taken when dest is NULL; confirm.
//   A pending fDeferredStatus failure is reported through status.
1384 UText
*RegexMatcher::getInput (UText
*dest
, UErrorCode
&status
) const {
1385 if (U_FAILURE(status
)) {
1388 if (U_FAILURE(fDeferredStatus
)) {
1389 status
= fDeferredStatus
;
// Fast path: the whole input is available in the UText chunk, so dest can be
// overwritten straight from chunkContents.
1394 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1395 utext_replace(dest
, 0, utext_nativeLength(dest
), fInputText
->chunkContents
, (int32_t)fInputLength
, &status
);
// Slow path: determine the UTF-16 length (directly for UTF-16 indexed text, by a
// preflighting extract otherwise), extract into a temporary buffer, and replace
// dest's contents from that buffer.
1398 if (UTEXT_USES_U16(fInputText
)) {
1399 input16Len
= (int32_t)fInputLength
;
1401 UErrorCode lengthStatus
= U_ZERO_ERROR
;
1402 input16Len
= utext_extract(fInputText
, 0, fInputLength
, NULL
, 0, &lengthStatus
); // buffer overflow error
// The buffer has room for exactly input16Len UChars (no terminating NUL).
1404 UChar
*inputChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(input16Len
));
1405 if (inputChars
== NULL
) {
// status is cleared before and after the extract so that the extract's result
// code (see the trailing comment) is not carried into the replace below.
1409 status
= U_ZERO_ERROR
;
1410 utext_extract(fInputText
, 0, fInputLength
, inputChars
, input16Len
, &status
); // not terminated warning
1411 status
= U_ZERO_ERROR
;
1412 utext_replace(dest
, 0, utext_nativeLength(dest
), inputChars
, input16Len
, &status
);
1414 uprv_free(inputChars
);
// Clone of fInputText, requested with deep=FALSE and readOnly=TRUE.
1418 return utext_clone(NULL
, fInputText
, FALSE
, TRUE
, &status
);
// compat_SyncMutableUTextContents(ut)
//   File-local helper (forward declaration followed by the definition).
//   When the UText's native length differs from ut->nativeIndexingLimit, refreshes
//   the chunk description (contents pointer, chunk length and limits) from the
//   UnicodeString reached through ut->context.
//   NOTE(review): assumes ut wraps a UnicodeString (ut->context is cast without a
//   check); confirm that callers only pass such UTexts.
1423 static UBool
compat_SyncMutableUTextContents(UText
*ut
);
1424 static UBool
compat_SyncMutableUTextContents(UText
*ut
) {
1425 UBool retVal
= FALSE
;
1427 // In the following test, we're really only interested in whether the UText should switch
1428 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
1429 // will still point to the correct data.
1430 if (utext_nativeLength(ut
) != ut
->nativeIndexingLimit
) {
1431 UnicodeString
*us
=(UnicodeString
*)ut
->context
;
1433 // Update to the latest length.
1434 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1435 int32_t newLength
= us
->length();
1437 // Update the chunk description.
1438 // The buffer may have switched between stack- and heap-based.
1439 ut
->chunkContents
= us
->getBuffer();
1440 ut
->chunkLength
= newLength
;
1441 ut
->chunkNativeLimit
= newLength
;
1442 ut
->nativeIndexingLimit
= newLength
;
1449 //--------------------------------------------------------------------------------
1453 //--------------------------------------------------------------------------------
// lookingAt(status)
//   Runs the match engine starting at fActiveStart, passing FALSE as the engine's
//   second argument (matches() passes TRUE there).
//   NOTE(review): FALSE presumably means the match is not required to extend to
//   the end of the region; confirm against MatchAt()/MatchChunkAt().
//   A pending fDeferredStatus failure is reported through status.
1454 UBool
RegexMatcher::lookingAt(UErrorCode
&status
) {
1455 if (U_FAILURE(status
)) {
1458 if (U_FAILURE(fDeferredStatus
)) {
1459 status
= fDeferredStatus
;
// For input that came from a UnicodeString the caller may have modified, resync
// the UText and pick up the new length.
1463 if (fInputUniStrMaybeMutable
) {
1464 if (compat_SyncMutableUTextContents(fInputText
)) {
1465 fInputLength
= utext_nativeLength(fInputText
);
1470 resetPreserveRegion();
// Use the chunk-based engine when the whole input is in the UText chunk.
1472 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1473 MatchChunkAt((int32_t)fActiveStart
, FALSE
, status
);
1475 MatchAt(fActiveStart
, FALSE
, status
);
// lookingAt(start, status)
//   Like lookingAt(status), but the match attempt begins at the caller-supplied
//   native index start. A start outside [fActiveStart, fActiveLimit] sets
//   U_INDEX_OUTOFBOUNDS_ERROR. A pending fDeferredStatus failure is reported
//   through status.
1481 UBool
RegexMatcher::lookingAt(int64_t start
, UErrorCode
&status
) {
1482 if (U_FAILURE(status
)) {
1485 if (U_FAILURE(fDeferredStatus
)) {
1486 status
= fDeferredStatus
;
1492 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
// For input that came from a UnicodeString the caller may have modified, resync
// the UText and pick up the new length.
1496 if (fInputUniStrMaybeMutable
) {
1497 if (compat_SyncMutableUTextContents(fInputText
)) {
1498 fInputLength
= utext_nativeLength(fInputText
);
1503 int64_t nativeStart
;
1504 nativeStart
= start
;
// The start position must lie within the active region.
1505 if (nativeStart
< fActiveStart
|| nativeStart
> fActiveLimit
) {
1506 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
// Use the chunk-based engine when the whole input is in the UText chunk.
1510 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1511 MatchChunkAt((int32_t)nativeStart
, FALSE
, status
);
1513 MatchAt(nativeStart
, FALSE
, status
);
1520 //--------------------------------------------------------------------------------
1524 //--------------------------------------------------------------------------------
// matches(status)
//   Runs the match engine starting at fActiveStart, passing TRUE as the engine's
//   second argument (lookingAt() passes FALSE there).
//   NOTE(review): TRUE presumably requires the match to extend to the end of the
//   region; confirm against MatchAt()/MatchChunkAt().
//   A pending fDeferredStatus failure is reported through status.
1525 UBool
RegexMatcher::matches(UErrorCode
&status
) {
1526 if (U_FAILURE(status
)) {
1529 if (U_FAILURE(fDeferredStatus
)) {
1530 status
= fDeferredStatus
;
// For input that came from a UnicodeString the caller may have modified, resync
// the UText and pick up the new length.
1534 if (fInputUniStrMaybeMutable
) {
1535 if (compat_SyncMutableUTextContents(fInputText
)) {
1536 fInputLength
= utext_nativeLength(fInputText
);
1541 resetPreserveRegion();
// Use the chunk-based engine when the whole input is in the UText chunk.
1544 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1545 MatchChunkAt((int32_t)fActiveStart
, TRUE
, status
);
1547 MatchAt(fActiveStart
, TRUE
, status
);
// matches(start, status)
//   Like matches(status), but the match attempt begins at the caller-supplied
//   native index start. A start outside [fActiveStart, fActiveLimit] sets
//   U_INDEX_OUTOFBOUNDS_ERROR. A pending fDeferredStatus failure is reported
//   through status.
1553 UBool
RegexMatcher::matches(int64_t start
, UErrorCode
&status
) {
1554 if (U_FAILURE(status
)) {
1557 if (U_FAILURE(fDeferredStatus
)) {
1558 status
= fDeferredStatus
;
1564 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
// For input that came from a UnicodeString the caller may have modified, resync
// the UText and pick up the new length.
1568 if (fInputUniStrMaybeMutable
) {
1569 if (compat_SyncMutableUTextContents(fInputText
)) {
1570 fInputLength
= utext_nativeLength(fInputText
);
1575 int64_t nativeStart
;
1576 nativeStart
= start
;
// The start position must lie within the active region.
1577 if (nativeStart
< fActiveStart
|| nativeStart
> fActiveLimit
) {
1578 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
// Use the chunk-based engine when the whole input is in the UText chunk.
1582 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText
, fInputLength
)) {
1583 MatchChunkAt((int32_t)nativeStart
, TRUE
, status
);
1585 MatchAt(nativeStart
, TRUE
, status
);
1592 //--------------------------------------------------------------------------------
1596 //--------------------------------------------------------------------------------
// pattern() -- const accessor returning a reference to a const RegexPattern.
// NOTE(review): presumably the pattern this matcher was created from (fPattern);
// confirm against the function body.
1597 const RegexPattern
&RegexMatcher::pattern() const {
1603 //--------------------------------------------------------------------------------
1607 //--------------------------------------------------------------------------------
// region(regionStart, regionLimit, startIndex, status)
//   Sets the matching region and the derived active, look and anchor bounds.
//   regionStart/regionLimit must satisfy 0 <= regionStart <= regionLimit and must
//   not exceed fInputLength, otherwise status is set to U_ILLEGAL_ARGUMENT_ERROR.
//   startIndex == -1 is a sentinel meaning "no explicit start index"; any other
//   value must lie inside the new region (else U_INDEX_OUTOFBOUNDS_ERROR) and is
//   stored in fMatchEnd.
1608 RegexMatcher
&RegexMatcher::region(int64_t regionStart
, int64_t regionLimit
, int64_t startIndex
, UErrorCode
&status
) {
1609 if (U_FAILURE(status
)) {
// Reject inverted or negative bounds.
1613 if (regionStart
>regionLimit
|| regionStart
<0 || regionLimit
<0) {
1614 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1617 int64_t nativeStart
= regionStart
;
1618 int64_t nativeLimit
= regionLimit
;
// Reject bounds that extend past the end of the input.
1619 if (nativeStart
> fInputLength
|| nativeLimit
> fInputLength
) {
1620 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1623 if (startIndex
== -1)
1626 resetPreserveRegion();
// The region and the active range both become [nativeStart, nativeLimit].
1628 fRegionStart
= nativeStart
;
1629 fRegionLimit
= nativeLimit
;
1630 fActiveStart
= nativeStart
;
1631 fActiveLimit
= nativeLimit
;
// An explicit start index must fall within the new active range.
1633 if (startIndex
!= -1) {
1634 if (startIndex
< fActiveStart
|| startIndex
> fActiveLimit
) {
1635 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1637 fMatchEnd
= startIndex
;
// Without transparent bounds, the look bounds are confined to the region.
1640 if (!fTransparentBounds
) {
1641 fLookStart
= nativeStart
;
1642 fLookLimit
= nativeLimit
;
// With anchoring bounds, the anchors sit at the region edges.
1644 if (fAnchoringBounds
) {
1645 fAnchorStart
= nativeStart
;
1646 fAnchorLimit
= nativeLimit
;
1651 RegexMatcher
&RegexMatcher::region(int64_t start
, int64_t limit
, UErrorCode
&status
) {
1652 return region(start
, limit
, -1, status
);
1655 //--------------------------------------------------------------------------------
1659 //--------------------------------------------------------------------------------
1660 int32_t RegexMatcher::regionEnd() const {
1661 return (int32_t)fRegionLimit
;
1664 int64_t RegexMatcher::regionEnd64() const {
1665 return fRegionLimit
;
1668 //--------------------------------------------------------------------------------
1672 //--------------------------------------------------------------------------------
1673 int32_t RegexMatcher::regionStart() const {
1674 return (int32_t)fRegionStart
;
1677 int64_t RegexMatcher::regionStart64() const {
1678 return fRegionStart
;
1682 //--------------------------------------------------------------------------------
1686 //--------------------------------------------------------------------------------
1687 UnicodeString
RegexMatcher::replaceAll(const UnicodeString
&replacement
, UErrorCode
&status
) {
1688 UText replacementText
= UTEXT_INITIALIZER
;
1689 UText resultText
= UTEXT_INITIALIZER
;
1690 UnicodeString resultString
;
1691 if (U_FAILURE(status
)) {
1692 return resultString
;
1695 utext_openConstUnicodeString(&replacementText
, &replacement
, &status
);
1696 utext_openUnicodeString(&resultText
, &resultString
, &status
);
1698 replaceAll(&replacementText
, &resultText
, status
);
1700 utext_close(&resultText
);
1701 utext_close(&replacementText
);
1703 return resultString
;
1708 // replaceAll, UText mode
// replaceAll(replacement, dest, status), UText mode
//   Builds the result in dest using appendReplacement() and a final appendTail().
//   An empty UText is created by cloning a UText opened over an empty
//   UnicodeString (utext_clone called with deep=TRUE, readOnly=FALSE).
//   NOTE(review): that clone is presumably made only when dest is NULL, and
//   appendReplacement() is presumably driven by a find() loop; confirm.
//   A pending fDeferredStatus failure is reported through status.
1710 UText
*RegexMatcher::replaceAll(UText
*replacement
, UText
*dest
, UErrorCode
&status
) {
1711 if (U_FAILURE(status
)) {
1714 if (U_FAILURE(fDeferredStatus
)) {
1715 status
= fDeferredStatus
;
1720 UnicodeString emptyString
;
1721 UText empty
= UTEXT_INITIALIZER
;
1723 utext_openUnicodeString(&empty
, &emptyString
, &status
);
1724 dest
= utext_clone(NULL
, &empty
, TRUE
, FALSE
, &status
);
1725 utext_close(&empty
);
1728 if (U_SUCCESS(status
)) {
1731 appendReplacement(dest
, replacement
, status
);
1732 if (U_FAILURE(status
)) {
1736 appendTail(dest
, status
);
1743 //--------------------------------------------------------------------------------
1747 //--------------------------------------------------------------------------------
1748 UnicodeString
RegexMatcher::replaceFirst(const UnicodeString
&replacement
, UErrorCode
&status
) {
1749 UText replacementText
= UTEXT_INITIALIZER
;
1750 UText resultText
= UTEXT_INITIALIZER
;
1751 UnicodeString resultString
;
1753 utext_openConstUnicodeString(&replacementText
, &replacement
, &status
);
1754 utext_openUnicodeString(&resultText
, &resultString
, &status
);
1756 replaceFirst(&replacementText
, &resultText
, status
);
1758 utext_close(&resultText
);
1759 utext_close(&replacementText
);
1761 return resultString
;
1765 // replaceFirst, UText mode
// replaceFirst(replacement, dest, status), UText mode
//   Produces the input with one replacement applied, using appendReplacement()
//   followed by appendTail(). One path returns getInput(dest, status), i.e. the
//   input text unchanged.
//   NOTE(review): the unchanged-input path is presumably taken when no match is
//   found, and the empty-UText clone is presumably made only when dest is NULL;
//   confirm.
//   A pending fDeferredStatus failure is reported through status.
1767 UText
*RegexMatcher::replaceFirst(UText
*replacement
, UText
*dest
, UErrorCode
&status
) {
1768 if (U_FAILURE(status
)) {
1771 if (U_FAILURE(fDeferredStatus
)) {
1772 status
= fDeferredStatus
;
1778 return getInput(dest
, status
);
// Create an empty, writable UText by cloning one opened over an empty
// UnicodeString (deep=TRUE, readOnly=FALSE).
1782 UnicodeString emptyString
;
1783 UText empty
= UTEXT_INITIALIZER
;
1785 utext_openUnicodeString(&empty
, &emptyString
, &status
);
1786 dest
= utext_clone(NULL
, &empty
, TRUE
, FALSE
, &status
);
1787 utext_close(&empty
);
1790 appendReplacement(dest
, replacement
, status
);
1791 appendTail(dest
, status
);
1797 //--------------------------------------------------------------------------------
1801 //--------------------------------------------------------------------------------
// requireEnd() -- const query returning a UBool.
// NOTE(review): presumably returns the fRequireEnd flag that
// resetPreserveRegion() clears; confirm against the function body.
1802 UBool
RegexMatcher::requireEnd() const {
1807 //--------------------------------------------------------------------------------
1811 //--------------------------------------------------------------------------------
// reset()
//   Extends the region, active, anchor and look limits to the full input length,
//   then clears per-match state via resetPreserveRegion().
//   NOTE(review): the matching start bounds are presumably set to 0 alongside
//   each limit; confirm.
1812 RegexMatcher
&RegexMatcher::reset() {
1814 fRegionLimit
= fInputLength
;
1816 fActiveLimit
= fInputLength
;
1818 fAnchorLimit
= fInputLength
;
1820 fLookLimit
= fInputLength
;
1821 resetPreserveRegion();
// resetPreserveRegion()
//   Clears per-match state without touching the region bounds: the append
//   position returns to 0, fRequireEnd is cleared, and the time-limit tick
//   counter is reloaded with TIMER_INITIAL_VALUE.
1827 void RegexMatcher::resetPreserveRegion() {
1831 fAppendPosition
= 0;
1834 fRequireEnd
= FALSE
;
1836 fTickCounter
= TIMER_INITIAL_VALUE
;
1837 //resetStack(); // more expensive than it looks...
// reset(const UnicodeString &input)
//   Switches the matcher to a new UnicodeString input. fInputText is re-opened
//   over the string (reusing the existing UText), the alternate input UText is
//   re-cloned when the pattern needs one, fInputLength is refreshed, and the
//   input is flagged as possibly mutable by the caller. Errors from the UText
//   calls are recorded in fDeferredStatus.
1841 RegexMatcher
&RegexMatcher::reset(const UnicodeString
&input
) {
1842 fInputText
= utext_openConstUnicodeString(fInputText
, &input
, &fDeferredStatus
);
1843 if (fPattern
->fNeedsAltInput
) {
1844 fAltInputText
= utext_clone(fAltInputText
, fInputText
, FALSE
, TRUE
, &fDeferredStatus
);
1846 if (U_FAILURE(fDeferredStatus
)) {
1849 fInputLength
= utext_nativeLength(fInputText
);
1855 // Do the following for any UnicodeString.
1856 // This is for compatibility for those clients who modify the input string "live" during regex operations.
1857 fInputUniStrMaybeMutable
= TRUE
;
// Keep an existing word break iterator pointed at the new text. A local status
// is used, so a failure here is not propagated to fDeferredStatus.
1859 if (fWordBreakItr
!= NULL
) {
1860 #if UCONFIG_NO_BREAK_ITERATION==0
1861 UErrorCode status
= U_ZERO_ERROR
;
1862 fWordBreakItr
->setText(fInputText
, status
);
// reset(UText *input)
//   Switches the matcher to a new UText input. Unless input is the matcher's own
//   fInputText, a clone (deep=FALSE, readOnly=TRUE) replaces fInputText, the
//   alternate input is re-cloned when the pattern needs one, and fInputLength is
//   refreshed. The input is flagged as not caller-mutable. Errors from the UText
//   calls are recorded in fDeferredStatus.
1869 RegexMatcher
&RegexMatcher::reset(UText
*input
) {
1870 if (fInputText
!= input
) {
1871 fInputText
= utext_clone(fInputText
, input
, FALSE
, TRUE
, &fDeferredStatus
);
1872 if (fPattern
->fNeedsAltInput
) fAltInputText
= utext_clone(fAltInputText
, fInputText
, FALSE
, TRUE
, &fDeferredStatus
);
1873 if (U_FAILURE(fDeferredStatus
)) {
1876 fInputLength
= utext_nativeLength(fInputText
);
// Keep an existing word break iterator pointed at the new text. A local status
// is used, so a failure here is not propagated to fDeferredStatus.
1881 if (fWordBreakItr
!= NULL
) {
1882 #if UCONFIG_NO_BREAK_ITERATION==0
1883 UErrorCode status
= U_ZERO_ERROR
;
1884 fWordBreakItr
->setText(input
, status
);
1889 fInputUniStrMaybeMutable
= FALSE
;
1894 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1895 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
// reset(position, status)
//   Performs a full reset() and then records position in fMatchEnd.
//   position must lie in [0, fActiveLimit]; otherwise status is set to
//   U_INDEX_OUTOFBOUNDS_ERROR.
//   NOTE(review): fMatchEnd is presumably where the next find() resumes; confirm.
1899 RegexMatcher
&RegexMatcher::reset(int64_t position
, UErrorCode
&status
) {
1900 if (U_FAILURE(status
)) {
1903 reset(); // Reset also resets the region to be the entire string.
1905 if (position
< 0 || position
> fActiveLimit
) {
1906 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1909 fMatchEnd
= position
;
1914 //--------------------------------------------------------------------------------
1918 //--------------------------------------------------------------------------------
// refreshInputText(input, status)
//   Re-points the matcher's input UText (and the alternate input UText, if one
//   exists) at input while preserving each UText's current native index.
//   input must be non-NULL and have the same native length as the current input;
//   otherwise status is set to U_ILLEGAL_ARGUMENT_ERROR.
1919 RegexMatcher
&RegexMatcher::refreshInputText(UText
*input
, UErrorCode
&status
) {
1920 if (U_FAILURE(status
)) {
1923 if (input
== NULL
) {
1924 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1927 if (utext_nativeLength(fInputText
) != utext_nativeLength(input
)) {
1928 status
= U_ILLEGAL_ARGUMENT_ERROR
;
// Save the iteration position so it can be restored after the clone.
1931 int64_t pos
= utext_getNativeIndex(fInputText
);
1932 // Shallow read-only clone of the new UText into the existing input UText
1933 fInputText
= utext_clone(fInputText
, input
, FALSE
, TRUE
, &status
);
1934 if (U_FAILURE(status
)) {
1937 utext_setNativeIndex(fInputText
, pos
);
// Same save / clone / restore sequence for the alternate input, when present.
1939 if (fAltInputText
!= NULL
) {
1940 pos
= utext_getNativeIndex(fAltInputText
);
1941 fAltInputText
= utext_clone(fAltInputText
, input
, FALSE
, TRUE
, &status
);
1942 if (U_FAILURE(status
)) {
1945 utext_setNativeIndex(fAltInputText
, pos
);
1952 //--------------------------------------------------------------------------------
1956 //--------------------------------------------------------------------------------
1957 void RegexMatcher::setTrace(UBool state
) {
1958 fTraceDebug
= state
;
1964 * UText, replace entire contents of the destination UText with a substring of the source UText.
1966 * @param src The source UText
1967 * @param dest The destination UText. Must be writable.
1968 * May be NULL, in which case a new UText will be allocated.
1969 * @param start Start index of source substring.
1970 * @param limit Limit index of source substring.
1971 * @param status An error code.
// File-local helper. Extracts src[start, limit) as UTF-16 and either replaces
// the entire contents of dest with it or opens a new UText holding it.
// NOTE(review): the new-UText paths are presumably taken when dest is NULL, as
// the header comment says; confirm.
1973 static UText
*utext_extract_replace(UText
*src
, UText
*dest
, int64_t start
, int64_t limit
, UErrorCode
*status
) {
1974 if (U_FAILURE(*status
)) {
// Empty substring: either empty out dest or open an empty UText.
1977 if (start
== limit
) {
1979 utext_replace(dest
, 0, utext_nativeLength(dest
), NULL
, 0, status
);
1982 return utext_openUChars(NULL
, NULL
, 0, status
);
// Preflight to get the UTF-16 length. A buffer-overflow status is expected from
// the zero-capacity call and is cleared; any other failure is a real error.
1985 int32_t length
= utext_extract(src
, start
, limit
, NULL
, 0, status
);
1986 if (*status
!= U_BUFFER_OVERFLOW_ERROR
&& U_FAILURE(*status
)) {
1989 *status
= U_ZERO_ERROR
;
// Stack buffer with room for 40 UChars; grown on the heap when too small.
1990 MaybeStackArray
<UChar
, 40> buffer
;
1991 if (length
>= buffer
.getCapacity()) {
1992 UChar
*newBuf
= buffer
.resize(length
+1); // Leave space for terminating Nul.
1993 if (newBuf
== NULL
) {
1994 *status
= U_MEMORY_ALLOCATION_ERROR
;
1997 utext_extract(src
, start
, limit
, buffer
.getAlias(), length
+1, status
);
1999 utext_replace(dest
, 0, utext_nativeLength(dest
), buffer
.getAlias(), length
, status
);
2003 // Caller did not provide a preexisting UText.
2004 // Open a new one, and have it adopt the text buffer storage.
2005 if (U_FAILURE(*status
)) {
2008 int32_t ownedLength
= 0;
2009 UChar
*ownedBuf
= buffer
.orphanOrClone(length
+1, ownedLength
);
2010 if (ownedBuf
== NULL
) {
2011 *status
= U_MEMORY_ALLOCATION_ERROR
;
2014 UText
*result
= utext_openUChars(NULL
, ownedBuf
, length
, status
);
// If the open failed, nothing has adopted the buffer, so free it here.
2015 if (U_FAILURE(*status
)) {
2016 uprv_free(ownedBuf
);
// Mark the UText as owning its text so the buffer is released with it.
2019 result
->providerProperties
|= (1 << UTEXT_PROVIDER_OWNS_TEXT
);
2024 //---------------------------------------------------------------------
2028 //---------------------------------------------------------------------
// split, UnicodeString mode
//   Adapter over the UText split(): wraps input in a stack UText, opens one
//   heap-allocated UText over each of the destCapacity destination strings, runs
//   the UText split, then closes every UText and frees the pointer array.
//   NOTE(review): destCapacity is not validated before it sizes the allocation
//   here (the UText overload rejects values < 1); confirm that is acceptable.
2029 int32_t RegexMatcher::split(const UnicodeString
&input
,
2030 UnicodeString dest
[],
2031 int32_t destCapacity
,
2034 UText inputText
= UTEXT_INITIALIZER
;
2035 utext_openConstUnicodeString(&inputText
, &input
, &status
);
2036 if (U_FAILURE(status
)) {
// Array of UText pointers, one per destination string.
2040 UText
**destText
= (UText
**)uprv_malloc(sizeof(UText
*)*destCapacity
);
2041 if (destText
== NULL
) {
2042 status
= U_MEMORY_ALLOCATION_ERROR
;
2046 for (i
= 0; i
< destCapacity
; i
++) {
2047 destText
[i
] = utext_openUnicodeString(NULL
, &dest
[i
], &status
);
2050 int32_t fieldCount
= split(&inputText
, destText
, destCapacity
, status
);
// Release the per-destination UTexts, the pointer array, and the input wrapper.
2052 for (i
= 0; i
< destCapacity
; i
++) {
2053 utext_close(destText
[i
]);
2056 uprv_free(destText
);
2057 utext_close(&inputText
);
2062 // split, UText mode
// split(input, dest, destCapacity, status), UText mode
//   Splits input into fields delimited by matches of this matcher's pattern and
//   stores them in dest[]. Text of the delimiter's capture groups is also written
//   to dest[], and the last available slot always receives the unsplit remainder.
//   destCapacity < 1 sets U_ILLEGAL_ARGUMENT_ERROR.
//   Each field is written with one of three strategies: a direct utext_replace()
//   from chunkContents when the whole input is in the chunk, a clone of a
//   temporary UChars UText, or extraction into a temporary heap buffer.
//   NOTE(review): the clone-based variants presumably apply when dest[i] is NULL;
//   confirm.
2064 int32_t RegexMatcher::split(UText
*input
,
2066 int32_t destCapacity
,
2070 // Check arguments for validity
2072 if (U_FAILURE(status
)) {
2076 if (destCapacity
< 1) {
2077 status
= U_ILLEGAL_ARGUMENT_ERROR
;
2082 // Reset for the input text
2085 int64_t nextOutputStringStart
= 0;
2086 if (fActiveLimit
== 0) {
2091 // Loop through the input text, searching for the delimiter pattern
2094 int32_t numCaptureGroups
= fPattern
->fGroupMap
->size();
2096 if (i
>=destCapacity
-1) {
2097 // There is one or zero output string left.
2098 // Fill the last output string with whatever is left from the input, then exit the loop.
2099 // ( i will be == destCapacity if we filled the output array while processing
2100 // capture groups of the delimiter expression, in which case we will discard the
2101 // last capture group saved in favor of the unprocessed remainder of the
2104 if (fActiveLimit
> nextOutputStringStart
) {
2105 if (UTEXT_FULL_TEXT_IN_CHUNK(input
, fInputLength
)) {
2107 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]),
2108 input
->chunkContents
+nextOutputStringStart
,
2109 (int32_t)(fActiveLimit
-nextOutputStringStart
), &status
);
2111 UText remainingText
= UTEXT_INITIALIZER
;
2112 utext_openUChars(&remainingText
, input
->chunkContents
+nextOutputStringStart
,
2113 fActiveLimit
-nextOutputStringStart
, &status
);
2114 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2115 utext_close(&remainingText
);
2118 UErrorCode lengthStatus
= U_ZERO_ERROR
;
2119 int32_t remaining16Length
=
2120 utext_extract(input
, nextOutputStringStart
, fActiveLimit
, NULL
, 0, &lengthStatus
);
2121 UChar
*remainingChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(remaining16Length
+1));
2122 if (remainingChars
== NULL
) {
2123 status
= U_MEMORY_ALLOCATION_ERROR
;
2127 utext_extract(input
, nextOutputStringStart
, fActiveLimit
, remainingChars
, remaining16Length
+1, &status
);
2129 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), remainingChars
, remaining16Length
, &status
);
2131 UText remainingText
= UTEXT_INITIALIZER
;
2132 utext_openUChars(&remainingText
, remainingChars
, remaining16Length
, &status
);
2133 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2134 utext_close(&remainingText
);
2137 uprv_free(remainingChars
);
2143 // We found another delimiter. Move everything from where we started looking
2144 // up until the start of the delimiter into the next output string.
2145 if (UTEXT_FULL_TEXT_IN_CHUNK(input
, fInputLength
)) {
2147 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]),
2148 input
->chunkContents
+nextOutputStringStart
,
2149 (int32_t)(fMatchStart
-nextOutputStringStart
), &status
);
2151 UText remainingText
= UTEXT_INITIALIZER
;
2152 utext_openUChars(&remainingText
, input
->chunkContents
+nextOutputStringStart
,
2153 fMatchStart
-nextOutputStringStart
, &status
);
2154 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2155 utext_close(&remainingText
);
2158 UErrorCode lengthStatus
= U_ZERO_ERROR
;
2159 int32_t remaining16Length
= utext_extract(input
, nextOutputStringStart
, fMatchStart
, NULL
, 0, &lengthStatus
);
2160 UChar
*remainingChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(remaining16Length
+1));
2161 if (remainingChars
== NULL
) {
2162 status
= U_MEMORY_ALLOCATION_ERROR
;
2165 utext_extract(input
, nextOutputStringStart
, fMatchStart
, remainingChars
, remaining16Length
+1, &status
);
2167 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), remainingChars
, remaining16Length
, &status
);
2169 UText remainingText
= UTEXT_INITIALIZER
;
2170 utext_openUChars(&remainingText
, remainingChars
, remaining16Length
, &status
);
2171 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2172 utext_close(&remainingText
);
2175 uprv_free(remainingChars
);
2177 nextOutputStringStart
= fMatchEnd
;
2179 // If the delimiter pattern has capturing parentheses, the captured
2180 // text goes out into the next n destination strings.
2182 for (groupNum
=1; groupNum
<=numCaptureGroups
; groupNum
++) {
2183 if (i
>= destCapacity
-2) {
2184 // Never fill the last available output string with capture group text.
2185 // It will be filled with the last field, the remainder of the
2186 // unsplit input text.
2190 dest
[i
] = utext_extract_replace(fInputText
, dest
[i
],
2191 start64(groupNum
, status
), end64(groupNum
, status
), &status
);
2194 if (nextOutputStringStart
== fActiveLimit
) {
2195 // The delimiter was at the end of the string. We're done, but first
2196 // we output one last empty string, for the empty field following
2197 // the delimiter at the end of input.
2198 if (i
+1 < destCapacity
) {
2200 if (dest
[i
] == NULL
) {
2201 dest
[i
] = utext_openUChars(NULL
, NULL
, 0, &status
);
2203 static UChar emptyString
[] = {(UChar
)0};
2204 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), emptyString
, 0, &status
);
2213 // We ran off the end of the input while looking for the next delimiter.
2214 // All the remaining text goes into the current output string.
2215 if (UTEXT_FULL_TEXT_IN_CHUNK(input
, fInputLength
)) {
2217 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]),
2218 input
->chunkContents
+nextOutputStringStart
,
2219 (int32_t)(fActiveLimit
-nextOutputStringStart
), &status
);
2221 UText remainingText
= UTEXT_INITIALIZER
;
2222 utext_openUChars(&remainingText
, input
->chunkContents
+nextOutputStringStart
,
2223 fActiveLimit
-nextOutputStringStart
, &status
);
2224 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2225 utext_close(&remainingText
);
2228 UErrorCode lengthStatus
= U_ZERO_ERROR
;
2229 int32_t remaining16Length
= utext_extract(input
, nextOutputStringStart
, fActiveLimit
, NULL
, 0, &lengthStatus
);
2230 UChar
*remainingChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(remaining16Length
+1));
2231 if (remainingChars
== NULL
) {
2232 status
= U_MEMORY_ALLOCATION_ERROR
;
2236 utext_extract(input
, nextOutputStringStart
, fActiveLimit
, remainingChars
, remaining16Length
+1, &status
);
2238 utext_replace(dest
[i
], 0, utext_nativeLength(dest
[i
]), remainingChars
, remaining16Length
, &status
);
2240 UText remainingText
= UTEXT_INITIALIZER
;
2241 utext_openUChars(&remainingText
, remainingChars
, remaining16Length
, &status
);
2242 dest
[i
] = utext_clone(NULL
, &remainingText
, TRUE
, FALSE
, &status
);
2243 utext_close(&remainingText
);
2246 uprv_free(remainingChars
);
2250 if (U_FAILURE(status
)) {
2253 } // end of for loop
2258 //--------------------------------------------------------------------------------
2262 //--------------------------------------------------------------------------------
2263 int32_t RegexMatcher::start(UErrorCode
&status
) const {
2264 return start(0, status
);
2267 int64_t RegexMatcher::start64(UErrorCode
&status
) const {
2268 return start64(0, status
);
2271 //--------------------------------------------------------------------------------
2273 // start(int32_t group, UErrorCode &status)
2275 //--------------------------------------------------------------------------------
// start64(group, status)
//   Start index of a capture group in the current match.
//   Error reporting through status:
//     - a pending fDeferredStatus failure is passed on;
//     - U_REGEX_INVALID_STATE when there is no current match (fMatch is FALSE);
//     - U_INDEX_OUTOFBOUNDS_ERROR when group is negative or greater than the
//       number of entries in the pattern's group map.
//   For a capture group, the group map entry at index group-1 gives the offset
//   of the group's start position within the current frame's fExtra array.
//   NOTE(review): group 0 presumably denotes the whole match and is handled
//   separately from the group map lookup; confirm.
2277 int64_t RegexMatcher::start64(int32_t group
, UErrorCode
&status
) const {
2278 if (U_FAILURE(status
)) {
2281 if (U_FAILURE(fDeferredStatus
)) {
2282 status
= fDeferredStatus
;
2285 if (fMatch
== FALSE
) {
2286 status
= U_REGEX_INVALID_STATE
;
2289 if (group
< 0 || group
> fPattern
->fGroupMap
->size()) {
2290 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
2297 int32_t groupOffset
= fPattern
->fGroupMap
->elementAti(group
-1);
2298 U_ASSERT(groupOffset
< fPattern
->fFrameSize
);
2299 U_ASSERT(groupOffset
>= 0);
2300 s
= fFrame
->fExtra
[groupOffset
];
2307 int32_t RegexMatcher::start(int32_t group
, UErrorCode
&status
) const {
2308 return (int32_t)start64(group
, status
);
2311 //--------------------------------------------------------------------------------
2313 // useAnchoringBounds
2315 //--------------------------------------------------------------------------------
// useAnchoringBounds(b)
//   Records the anchoring-bounds flag and recomputes the anchor positions:
//   the region bounds when the flag is set, otherwise 0 and fInputLength.
2316 RegexMatcher
&RegexMatcher::useAnchoringBounds(UBool b
) {
2317 fAnchoringBounds
= b
;
2318 fAnchorStart
= (fAnchoringBounds
? fRegionStart
: 0);
2319 fAnchorLimit
= (fAnchoringBounds
? fRegionLimit
: fInputLength
);
2324 //--------------------------------------------------------------------------------
2326 // useTransparentBounds
2328 //--------------------------------------------------------------------------------
// useTransparentBounds(b)
//   Records the transparent-bounds flag and recomputes the look bounds:
//   0 and fInputLength when the flag is set, otherwise the region bounds.
2329 RegexMatcher
&RegexMatcher::useTransparentBounds(UBool b
) {
2330 fTransparentBounds
= b
;
2331 fLookStart
= (fTransparentBounds
? 0 : fRegionStart
);
2332 fLookLimit
= (fTransparentBounds
? fInputLength
: fRegionLimit
);
2336 //--------------------------------------------------------------------------------
2340 //--------------------------------------------------------------------------------
// setTimeLimit(limit, status)
//   A pending fDeferredStatus failure is reported through status, and an invalid
//   limit sets U_ILLEGAL_ARGUMENT_ERROR.
//   NOTE(review): the rejected values are presumably negative limits, and the
//   accepted value is presumably stored for getTimeLimit(); confirm.
2341 void RegexMatcher::setTimeLimit(int32_t limit
, UErrorCode
&status
) {
2342 if (U_FAILURE(status
)) {
2345 if (U_FAILURE(fDeferredStatus
)) {
2346 status
= fDeferredStatus
;
2350 status
= U_ILLEGAL_ARGUMENT_ERROR
;
2357 //--------------------------------------------------------------------------------
2361 //--------------------------------------------------------------------------------
// getTimeLimit() -- const accessor returning an int32_t.
// NOTE(review): presumably returns the value set by setTimeLimit(); confirm.
2362 int32_t RegexMatcher::getTimeLimit() const {
2367 //--------------------------------------------------------------------------------
2371 //--------------------------------------------------------------------------------
// setStackLimit(limit, status)
//   Sets the maximum size of the backtracking stack. limit is in bytes; it is
//   converted to a count of int32_t-sized units and raised, if necessary, to at
//   least one stack frame (fPattern->fFrameSize) before being applied to fStack.
//   One path passes 0 to setMaxCapacity(), meaning unlimited expansion.
//   The caller's original limit is stored in fStackLimit.
//   A pending fDeferredStatus failure is reported through status, and an invalid
//   limit sets U_ILLEGAL_ARGUMENT_ERROR.
//   NOTE(review): the unlimited path is presumably selected by limit == 0, and
//   the rejected values are presumably negative limits; confirm.
2372 void RegexMatcher::setStackLimit(int32_t limit
, UErrorCode
&status
) {
2373 if (U_FAILURE(status
)) {
2376 if (U_FAILURE(fDeferredStatus
)) {
2377 status
= fDeferredStatus
;
2381 status
= U_ILLEGAL_ARGUMENT_ERROR
;
2385 // Reset the matcher. This is needed here in case there is a current match
2386 // whose final stack frame (containing the match results, pointed to by fFrame)
2387 // would be lost by resizing to a smaller stack size.
2391 // Unlimited stack expansion
2392 fStack
->setMaxCapacity(0);
2394 // Change the units of the limit from bytes to ints, and bump the size up
2395 // to be big enough to hold at least one stack frame for the pattern,
2396 // if it isn't there already.
2397 int32_t adjustedLimit
= limit
/ sizeof(int32_t);
2398 if (adjustedLimit
< fPattern
->fFrameSize
) {
2399 adjustedLimit
= fPattern
->fFrameSize
;
2401 fStack
->setMaxCapacity(adjustedLimit
);
2403 fStackLimit
= limit
;
2407 //--------------------------------------------------------------------------------
2411 //--------------------------------------------------------------------------------
// getStackLimit() -- const accessor returning an int32_t.
// NOTE(review): presumably returns fStackLimit as stored by setStackLimit();
// confirm.
2412 int32_t RegexMatcher::getStackLimit() const {
2417 //--------------------------------------------------------------------------------
2421 //--------------------------------------------------------------------------------
// setMatchCallback(callback, context, status)
//   Stores the match callback function pointer and its opaque context pointer
//   in fCallbackFn / fCallbackContext.
//   NOTE(review): presumably a no-op when status already indicates failure;
//   confirm.
2422 void RegexMatcher::setMatchCallback(URegexMatchCallback
*callback
,
2423 const void *context
,
2424 UErrorCode
&status
) {
2425 if (U_FAILURE(status
)) {
2428 fCallbackFn
= callback
;
2429 fCallbackContext
= context
;
2433 //--------------------------------------------------------------------------------
2437 //--------------------------------------------------------------------------------
// getMatchCallback(callback, context, status)
//   Returns, through the reference parameters, the match callback function
//   pointer and context pointer held in fCallbackFn / fCallbackContext.
//   NOTE(review): presumably leaves the outputs untouched when status already
//   indicates failure; confirm.
2438 void RegexMatcher::getMatchCallback(URegexMatchCallback
*&callback
,
2439 const void *&context
,
2440 UErrorCode
&status
) {
2441 if (U_FAILURE(status
)) {
2444 callback
= fCallbackFn
;
2445 context
= fCallbackContext
;
2449 //--------------------------------------------------------------------------------
2453 //--------------------------------------------------------------------------------
// setFindProgressCallback(callback, context, status)
//   Stores the find-progress callback function pointer and its opaque context
//   pointer in fFindProgressCallbackFn / fFindProgressCallbackContext.
//   NOTE(review): presumably a no-op when status already indicates failure;
//   confirm.
2454 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback
*callback
,
2455 const void *context
,
2456 UErrorCode
&status
) {
2457 if (U_FAILURE(status
)) {
2460 fFindProgressCallbackFn
= callback
;
2461 fFindProgressCallbackContext
= context
;
2465 //--------------------------------------------------------------------------------
2469 //--------------------------------------------------------------------------------
// getFindProgressCallback(callback, context, status)
//   Returns, through the reference parameters, the find-progress callback
//   function pointer and context pointer held in fFindProgressCallbackFn /
//   fFindProgressCallbackContext.
//   NOTE(review): presumably leaves the outputs untouched when status already
//   indicates failure; confirm.
2470 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback
*&callback
,
2471 const void *&context
,
2472 UErrorCode
&status
) {
2473 if (U_FAILURE(status
)) {
2476 callback
= fFindProgressCallbackFn
;
2477 context
= fFindProgressCallbackContext
;
2481 //================================================================================
2483 // Code following this point in this file is the internal
2484 // Match Engine Implementation.
2486 //================================================================================
2489 //--------------------------------------------------------------------------------
2492 // Discard any previous contents of the state save stack, and initialize a
2493 // new stack frame to all -1. The -1s are needed for capture group limits,
2494 // where they indicate that a group has not yet matched anything.
2495 //--------------------------------------------------------------------------------
2496 REStackFrame
*RegexMatcher::resetStack() {
2497 // Discard any previous contents of the state save stack, and initialize a
2498 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2499 // where they indicate that a group has not yet matched anything.
2500 fStack
->removeAllElements();
2502 REStackFrame
*iFrame
= (REStackFrame
*)fStack
->reserveBlock(fPattern
->fFrameSize
, fDeferredStatus
);
2504 for (i
=0; i
<fPattern
->fFrameSize
-RESTACKFRAME_HDRCOUNT
; i
++) {
2505 iFrame
->fExtra
[i
] = -1;
2512 //--------------------------------------------------------------------------------
2515 // in perl, "xab..cd..", \b is true at positions 0,3,5,7
2517 // If the current char is a combining mark,
2519 // Else Scan backwards to the first non-combining char.
2520 // We are at a boundary if this char and the original char are
2521 // opposite in membership in \w set
2523 // parameters: pos - the current position in the input buffer
2525 // TODO: double-check edge cases at region boundaries.
2527 //--------------------------------------------------------------------------------
// Word-boundary test at native index pos of fInputText, using UText access.
// The result is computed as (current char is in URX_ISWORD_SET) XOR
// (nearest preceding non-combining char is in URX_ISWORD_SET).
// A char counts as "combining" here when it has UCHAR_GRAPHEME_EXTEND or its
// general category is U_FORMAT_CHAR.
//   pos - native index into fInputText; compared against fLookLimit / fLookStart.
// NOTE(review): the return statement, the backward-scan loop header and the bodies of
// several guards are not visible here; the notes below mark each such spot.
2528 UBool
RegexMatcher::isWordBoundary(int64_t pos
) {
2529 UBool isBoundary
= FALSE
;
2530 UBool cIsWord
= FALSE
;
// At or beyond the look limit: cIsWord keeps its initial FALSE unless the hidden
// body of this branch changes it.
// NOTE(review): branch body not visible -- confirm what else it records.
2532 if (pos
>= fLookLimit
) {
2535 // Determine whether char c at current position is a member of the word set of chars.
2536 // If we're off the end of the string, behave as though we're not at a word char.
2537 UTEXT_SETNATIVEINDEX(fInputText
, pos
);
2538 UChar32 c
= UTEXT_CURRENT32(fInputText
);
// NOTE(review): per the comment inside, this case is "not a boundary"; the statement
// that ends the function here is not visible -- presumably an early return of FALSE.
2539 if (u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
) || u_charType(c
) == U_FORMAT_CHAR
) {
2540 // Current char is a combining one. Not a boundary.
2543 cIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(c
);
2546 // Back up until we come to a non-combining char, determine whether
2547 // that char is a word char.
2548 UBool prevCIsWord
= FALSE
;
// Stop condition for the backward scan: the text iterator has reached fLookStart,
// in which case prevCIsWord stays FALSE.
// NOTE(review): the enclosing loop header and the exits are not visible -- confirm.
2550 if (UTEXT_GETNATIVEINDEX(fInputText
) <= fLookStart
) {
2553 UChar32 prevChar
= UTEXT_PREVIOUS32(fInputText
);
// Only a non-combining previous char decides prevCIsWord.
2554 if (!(u_hasBinaryProperty(prevChar
, UCHAR_GRAPHEME_EXTEND
)
2555 || u_charType(prevChar
) == U_FORMAT_CHAR
)) {
2556 prevCIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(prevChar
);
// Boundary exactly when the two word-membership flags differ.
2560 isBoundary
= cIsWord
^ prevCIsWord
;
// Word-boundary test that reads UTF-16 code units directly from
// fInputText->chunkContents with the U16_* macros instead of going through the
// UText access macros. The decision rule is the same XOR of word-set membership
// used by isWordBoundary(): current char versus nearest preceding non-combining char.
//   pos - index into the chunk buffer (int32_t); compared against fLookLimit / fLookStart.
// NOTE(review): assumes the whole look region lies inside the current chunk --
// nothing in this function checks that; verify against callers.
// NOTE(review): the return statement, the backward-scan loop header, the declarations
// of c and prevChar, and the bodies of several guards are not visible here.
2564 UBool
RegexMatcher::isChunkWordBoundary(int32_t pos
) {
2565 UBool isBoundary
= FALSE
;
2566 UBool cIsWord
= FALSE
;
2568 const UChar
*inputBuf
= fInputText
->chunkContents
;
// At or beyond the look limit: cIsWord keeps its initial FALSE unless the hidden
// body of this branch changes it.
// NOTE(review): branch body not visible -- confirm what else it records.
2570 if (pos
>= fLookLimit
) {
2573 // Determine whether char c at current position is a member of the word set of chars.
2574 // If we're off the end of the string, behave as though we're not at a word char.
// U16_GET reads the code point containing index pos, bounded by [fLookStart, fLookLimit).
2576 U16_GET(inputBuf
, fLookStart
, pos
, fLookLimit
, c
);
// NOTE(review): per the comment inside, this case is "not a boundary"; the statement
// that ends the function here is not visible -- presumably an early return of FALSE.
2577 if (u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
) || u_charType(c
) == U_FORMAT_CHAR
) {
2578 // Current char is a combining one. Not a boundary.
2581 cIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(c
);
2584 // Back up until we come to a non-combining char, determine whether
2585 // that char is a word char.
2586 UBool prevCIsWord
= FALSE
;
// Stop condition for the backward scan: pos has reached fLookStart, in which case
// prevCIsWord stays FALSE.
// NOTE(review): the enclosing loop header and the exits are not visible -- confirm.
2588 if (pos
<= fLookStart
) {
// U16_PREV moves pos back over one code point (not below fLookStart) and
// stores that code point in prevChar.
2592 U16_PREV(inputBuf
, fLookStart
, pos
, prevChar
);
// Only a non-combining previous char decides prevCIsWord.
2593 if (!(u_hasBinaryProperty(prevChar
, UCHAR_GRAPHEME_EXTEND
)
2594 || u_charType(prevChar
) == U_FORMAT_CHAR
)) {
2595 prevCIsWord
= fPattern
->fStaticSets
[URX_ISWORD_SET
]->contains(prevChar
);
// Boundary exactly when the two word-membership flags differ.
2599 isBoundary
= cIsWord
^ prevCIsWord
;
2603 //--------------------------------------------------------------------------------
2607 // Test for a word boundary using RBBI word break.
2609 // parameters: pos - the current position in the input buffer
2611 //--------------------------------------------------------------------------------
// Word-boundary test delegated to a word BreakIterator (fWordBreakItr).
// The iterator logic is compiled only when UCONFIG_NO_BREAK_ITERATION==0; the
// result variable starts out FALSE.
//   pos - native index into fInputText; compared against fLookLimit.
// NOTE(review): the closing #endif and the return statement are not visible here;
// presumably returnVal is returned -- confirm.
2612 UBool
RegexMatcher::isUWordBoundary(int64_t pos
) {
2613 UBool returnVal
= FALSE
;
2614 #if UCONFIG_NO_BREAK_ITERATION==0
2616 // If we haven't yet created a break iterator for this matcher, do it now.
2617 if (fWordBreakItr
== NULL
) {
// The word instance is created for Locale::getEnglish(); creation errors go to
// fDeferredStatus.
// NOTE(review): the left-hand side of this assignment is not visible; presumably
// the result is stored in fWordBreakItr -- confirm.
2619 (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus
);
// NOTE(review): body of this failure check is not visible; presumably an early
// return so that the iterator is not used after a failed creation -- confirm.
2620 if (U_FAILURE(fDeferredStatus
)) {
2623 fWordBreakItr
->setText(fInputText
, fDeferredStatus
);
// At or past the look limit the position is reported as a boundary.
2626 if (pos
>= fLookLimit
) {
2628 returnVal
= TRUE
; // With Unicode word rules, only positions within the interior of "real"
2629 // words are not boundaries. All non-word chars stand by themselves,
2630 // with word boundaries on both sides.
// When the text's native indexes are not UTF-16 indexes, pos is replaced by the value
// utext_extract() returns for the range [0, pos) with a NULL destination of
// capacity 0. The local status from that call is not examined afterwards.
2632 if (!UTEXT_USES_U16(fInputText
)) {
2633 // !!!: Would like a better way to do this!
2634 UErrorCode status
= U_ZERO_ERROR
;
2635 pos
= utext_extract(fInputText
, 0, pos
, NULL
, 0, &status
);
// pos is narrowed from int64_t to int32_t for the BreakIterator call.
2637 returnVal
= fWordBreakItr
->isBoundary((int32_t)pos
);
2643 //--------------------------------------------------------------------------------
2645 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state
2646 // saves. Increment the "time" counter, and call the
2647 // user callback function if there is one installed.
2649 // If the match operation needs to be aborted, either for a time-out
2650 // or because the user callback asked for it, just set an error status.
2651 // The engine will pick that up and stop in its outer loop.
2653 //--------------------------------------------------------------------------------
// Timer tick handler: re-arms fTickCounter, gives the user callback (if any) the
// chance to stop the match, and enforces fTimeLimit.
//   status - out: set to U_REGEX_STOPPED_BY_CALLER when the callback returns FALSE,
//            or to U_REGEX_TIME_OUT when a positive fTimeLimit has been reached.
// NOTE(review): the statement that advances fTime is not visible here; presumably
// fTime is incremented once per call before it is passed to the callback -- confirm.
2654 void RegexMatcher::IncrementTime(UErrorCode
&status
) {
// Restart the countdown to the next tick.
2655 fTickCounter
= TIMER_INITIAL_VALUE
;
2657 if (fCallbackFn
!= NULL
) {
// The callback receives its registered context and the current fTime value;
// a FALSE result is turned into U_REGEX_STOPPED_BY_CALLER.
2658 if ((*fCallbackFn
)(fCallbackContext
, fTime
) == FALSE
) {
2659 status
= U_REGEX_STOPPED_BY_CALLER
;
// A time limit of zero or less disables the time-out check.
2663 if (fTimeLimit
> 0 && fTime
>= fTimeLimit
) {
2664 status
= U_REGEX_TIME_OUT
;
2668 //--------------------------------------------------------------------------------
2671 // Make a new stack frame, initialized as a copy of the current stack frame.
2672 // Set the pattern index in the original stack frame from the operand value
2673 // in the opcode. Execution of the engine continues with the state in
2674 // the newly created stack frame
2676 // Note that reserveBlock() may grow the stack, resulting in the
2677 // whole thing being relocated in memory.
2680 // fp The top frame pointer when called. At return, a new
2681 // frame will be present
2682 // savePatIdx An index into the compiled pattern. Goes into the original
2683 // (not new) frame. If execution ever back-tracks out of the
2684 // new frame, this will be where we continue from in the pattern.
2686 // The new frame pointer.
2688 //--------------------------------------------------------------------------------
// Pushes a new frame of fFrameSize slots onto fStack as a copy of the current top
// frame, stores savePatIdx into the ORIGINAL frame's fPatIdx, and returns the new frame.
//   fp         - current top frame; recomputed after the push because the stack
//                storage may have moved.
//   savePatIdx - pattern index written into the original frame.
//   status     - passed to reserveBlock(); set to U_REGEX_STACK_OVERFLOW when
//                the reservation returns NULL. Also passed on to IncrementTime().
// Returns the new frame pointer on success.
2689 inline REStackFrame
*RegexMatcher::StateSave(REStackFrame
*fp
, int64_t savePatIdx
, UErrorCode
&status
) {
2690 // push storage for a new frame.
2691 int64_t *newFP
= fStack
->reserveBlock(fFrameSize
, status
);
2692 if (newFP
== NULL
) {
2693 // Failure on attempted stack expansion.
2694 // Stack function set some other error code, change it to a more
2695 // specific one for regular expressions.
2696 status
= U_REGEX_STACK_OVERFLOW
;
2697 // We need to return a writable stack frame, so just return the
2698 // previous frame. The match operation will stop quickly
2699 // because of the error status, after which the frame will never
2700 // be looked at again.
// NOTE(review): the return of the previous frame described above is not visible
// here -- confirm that the failure path exits before the copy below.
// The old top frame sits immediately below the new block, so it is re-derived
// from newFP rather than trusting the incoming fp.
2703 fp
= (REStackFrame
*)(newFP
- fFrameSize
); // in case of realloc of stack.
2705 // New stack frame = copy of old top frame.
2706 int64_t *source
= (int64_t *)fp
;
2707 int64_t *dest
= newFP
;
// Slot-by-slot copy; it ends when source reaches newFP, i.e. after fFrameSize slots.
// NOTE(review): the loop header and the exit statement are not visible here.
2709 *dest
++ = *source
++;
2710 if (source
== newFP
) {
// NOTE(review): the statement that counts fTickCounter down is not visible here;
// presumably it is decremented once per state save -- confirm.
2716 if (fTickCounter
<= 0) {
2717 IncrementTime(status
); // Re-initializes fTickCounter
// Record the pattern index in the old frame, not in the copy.
2719 fp
->fPatIdx
= savePatIdx
;
2720 return (REStackFrame
*)newFP
;
2724 //--------------------------------------------------------------------------------
2726 // MatchAt This is the actual matching engine.
2728 // startIdx: begin matching at this index.
2729 // toEnd: if true, match must extend to end of the input region
2731 //--------------------------------------------------------------------------------
2732 void RegexMatcher::MatchAt(int64_t startIdx
, UBool toEnd
, UErrorCode
&status
) {
2733 UBool isMatch
= FALSE
; // True if the we have a match.
2735 int64_t backSearchIndex
= U_INT64_MAX
; // used after greedy single-character matches for searching backwards
2737 int32_t op
; // Operation from the compiled pattern, split into
2738 int32_t opType
; // the opcode
2739 int32_t opValue
; // and the operand value.
2741 #ifdef REGEX_RUN_DEBUG
2744 printf("MatchAt(startIdx=%ld)\n", startIdx
);
2745 printf("Original Pattern: ");
2746 UChar32 c
= utext_next32From(fPattern
->fPattern
, 0);
2747 while (c
!= U_SENTINEL
) {
2748 if (c
<32 || c
>256) {
2753 c
= UTEXT_NEXT32(fPattern
->fPattern
);
2756 printf("Input String: ");
2757 c
= utext_next32From(fInputText
, 0);
2758 while (c
!= U_SENTINEL
) {
2759 if (c
<32 || c
>256) {
2764 c
= UTEXT_NEXT32(fInputText
);
2771 if (U_FAILURE(status
)) {
2775 // Cache frequently referenced items from the compiled pattern
2777 int64_t *pat
= fPattern
->fCompiledPat
->getBuffer();
2779 const UChar
*litText
= fPattern
->fLiteralText
.getBuffer();
2780 UVector
*sets
= fPattern
->fSets
;
2782 fFrameSize
= fPattern
->fFrameSize
;
2783 REStackFrame
*fp
= resetStack();
2786 fp
->fInputIdx
= startIdx
;
2788 // Zero out the pattern's static data
2790 for (i
= 0; i
<fPattern
->fDataSize
; i
++) {
2795 // Main loop for interpreting the compiled pattern.
2796 // One iteration of the loop per pattern operation performed.
2799 op
= (int32_t)pat
[fp
->fPatIdx
];
2800 opType
= URX_TYPE(op
);
2801 opValue
= URX_VAL(op
);
2802 #ifdef REGEX_RUN_DEBUG
2804 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2805 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp
->fInputIdx
,
2806 UTEXT_CURRENT32(fInputText
), (int64_t *)fp
-fStack
->getBuffer(), fActiveLimit
);
2807 fPattern
->dumpOp(fp
->fPatIdx
);
2820 // Force a backtrack. In some circumstances, the pattern compiler
2821 // will notice that the pattern can't possibly match anything, and will
2822 // emit one of these at that point.
2823 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2828 if (fp
->fInputIdx
< fActiveLimit
) {
2829 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2830 UChar32 c
= UTEXT_NEXT32(fInputText
);
2832 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
2838 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2844 // Test input against a literal string.
2845 // Strings require two slots in the compiled pattern, one for the
2846 // offset to the string text, and one for the length.
2848 int32_t stringStartIdx
= opValue
;
2849 op
= (int32_t)pat
[fp
->fPatIdx
]; // Fetch the second operand
2851 opType
= URX_TYPE(op
);
2852 int32_t stringLen
= URX_VAL(op
);
2853 U_ASSERT(opType
== URX_STRING_LEN
);
2854 U_ASSERT(stringLen
>= 2);
2856 const UChar
*patternString
= litText
+stringStartIdx
;
2857 int32_t patternStringIndex
= 0;
2858 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2860 UChar32 patternChar
;
2861 UBool success
= TRUE
;
2862 while (patternStringIndex
< stringLen
) {
2863 if (UTEXT_GETNATIVEINDEX(fInputText
) >= fActiveLimit
) {
2868 inputChar
= UTEXT_NEXT32(fInputText
);
2869 U16_NEXT(patternString
, patternStringIndex
, stringLen
, patternChar
);
2870 if (patternChar
!= inputChar
) {
2877 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
2879 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2885 case URX_STATE_SAVE
:
2886 fp
= StateSave(fp
, opValue
, status
);
2891 // The match loop will exit via this path on a successful match,
2892 // when we reach the end of the pattern.
2893 if (toEnd
&& fp
->fInputIdx
!= fActiveLimit
) {
2894 // The pattern matched, but not to the end of input. Try some more.
2895 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2901 // Start and End Capture stack frame variables are laid out out like this:
2902 // fp->fExtra[opValue] - The start of a completed capture group
2903 // opValue+1 - The end of a completed capture group
2904 // opValue+2 - the start of a capture group whose end
2905 // has not yet been reached (and might not ever be).
2906 case URX_START_CAPTURE
:
2907 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
2908 fp
->fExtra
[opValue
+2] = fp
->fInputIdx
;
2912 case URX_END_CAPTURE
:
2913 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
2914 U_ASSERT(fp
->fExtra
[opValue
+2] >= 0); // Start pos for this group must be set.
2915 fp
->fExtra
[opValue
] = fp
->fExtra
[opValue
+2]; // Tentative start becomes real.
2916 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // End position
2917 U_ASSERT(fp
->fExtra
[opValue
] <= fp
->fExtra
[opValue
+1]);
2921 case URX_DOLLAR
: // $, test for End of line
2922 // or for position before new line at end of input
2924 if (fp
->fInputIdx
>= fAnchorLimit
) {
2925 // We really are at the end of input. Success.
2931 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2933 // If we are positioned just before a new-line that is located at the
2934 // end of input, succeed.
2935 UChar32 c
= UTEXT_NEXT32(fInputText
);
2936 if (UTEXT_GETNATIVEINDEX(fInputText
) >= fAnchorLimit
) {
2937 if (isLineTerminator(c
)) {
2938 // If not in the middle of a CR/LF sequence
2939 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& ((void)UTEXT_PREVIOUS32(fInputText
), UTEXT_PREVIOUS32(fInputText
))==0x0d)) {
2940 // At new-line at end of input. Success
2948 UChar32 nextC
= UTEXT_NEXT32(fInputText
);
2949 if (c
== 0x0d && nextC
== 0x0a && UTEXT_GETNATIVEINDEX(fInputText
) >= fAnchorLimit
) {
2952 break; // At CR/LF at end of input. Success
2956 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2961 case URX_DOLLAR_D
: // $, test for End of Line, in UNIX_LINES mode.
2962 if (fp
->fInputIdx
>= fAnchorLimit
) {
2963 // Off the end of input. Success.
2968 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2969 UChar32 c
= UTEXT_NEXT32(fInputText
);
2970 // Either at the last character of input, or off the end.
2971 if (c
== 0x0a && UTEXT_GETNATIVEINDEX(fInputText
) == fAnchorLimit
) {
2978 // Not at end of input. Back-track out.
2979 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
2983 case URX_DOLLAR_M
: // $, test for End of line in multi-line mode
2985 if (fp
->fInputIdx
>= fAnchorLimit
) {
2986 // We really are at the end of input. Success.
2991 // If we are positioned just before a new-line, succeed.
2992 // It makes no difference where the new-line is within the input.
2993 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
2994 UChar32 c
= UTEXT_CURRENT32(fInputText
);
2995 if (isLineTerminator(c
)) {
2996 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
2997 // In multi-line mode, hitting a new-line just before the end of input does not
2998 // set the hitEnd or requireEnd flags
2999 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& UTEXT_PREVIOUS32(fInputText
)==0x0d)) {
3003 // not at a new line. Fail.
3004 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3009 case URX_DOLLAR_MD
: // $, test for End of line in multi-line and UNIX_LINES mode
3011 if (fp
->fInputIdx
>= fAnchorLimit
) {
3012 // We really are at the end of input. Success.
3014 fRequireEnd
= TRUE
; // Java set requireEnd in this case, even though
3015 break; // adding a new-line would not lose the match.
3017 // If we are not positioned just before a new-line, the test fails; backtrack out.
3018 // It makes no difference where the new-line is within the input.
3019 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3020 if (UTEXT_CURRENT32(fInputText
) != 0x0a) {
3021 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3027 case URX_CARET
: // ^, test for start of line
3028 if (fp
->fInputIdx
!= fAnchorStart
) {
3029 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3034 case URX_CARET_M
: // ^, test for start of line in mulit-line mode
3036 if (fp
->fInputIdx
== fAnchorStart
) {
3037 // We are at the start input. Success.
3040 // Check whether character just before the current pos is a new-line
3041 // unless we are at the end of input
3042 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3043 UChar32 c
= UTEXT_PREVIOUS32(fInputText
);
3044 if ((fp
->fInputIdx
< fAnchorLimit
) && isLineTerminator(c
)) {
3045 // It's a new-line. ^ is true. Success.
3046 // TODO: what should be done with positions between a CR and LF?
3049 // Not at the start of a line. Fail.
3050 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3055 case URX_CARET_M_UNIX
: // ^, test for start of line in mulit-line + Unix-line mode
3057 U_ASSERT(fp
->fInputIdx
>= fAnchorStart
);
3058 if (fp
->fInputIdx
<= fAnchorStart
) {
3059 // We are at the start input. Success.
3062 // Check whether character just before the current pos is a new-line
3063 U_ASSERT(fp
->fInputIdx
<= fAnchorLimit
);
3064 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3065 UChar32 c
= UTEXT_PREVIOUS32(fInputText
);
3067 // Not at the start of a line. Back-track out.
3068 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3073 case URX_BACKSLASH_B
: // Test for word boundaries
3075 UBool success
= isWordBoundary(fp
->fInputIdx
);
3076 success
^= (UBool
)(opValue
!= 0); // flip sense for \B
3078 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3084 case URX_BACKSLASH_BU
: // Test for word boundaries, Unicode-style
3086 UBool success
= isUWordBoundary(fp
->fInputIdx
);
3087 success
^= (UBool
)(opValue
!= 0); // flip sense for \B
3089 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3095 case URX_BACKSLASH_D
: // Test for decimal digit
3097 if (fp
->fInputIdx
>= fActiveLimit
) {
3099 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3103 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3105 UChar32 c
= UTEXT_NEXT32(fInputText
);
3106 int8_t ctype
= u_charType(c
); // TODO: make a unicode set for this. Will be faster.
3107 UBool success
= (ctype
== U_DECIMAL_DIGIT_NUMBER
);
3108 success
^= (UBool
)(opValue
!= 0); // flip sense for \D
3110 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3112 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3118 case URX_BACKSLASH_G
: // Test for position at end of previous match
3119 if (!((fMatch
&& fp
->fInputIdx
==fMatchEnd
) || (fMatch
==FALSE
&& fp
->fInputIdx
==fActiveStart
))) {
3120 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3125 case URX_BACKSLASH_H
: // Test for \h, horizontal white space.
3127 if (fp
->fInputIdx
>= fActiveLimit
) {
3129 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3132 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3133 UChar32 c
= UTEXT_NEXT32(fInputText
);
3134 int8_t ctype
= u_charType(c
);
3135 UBool success
= (ctype
== U_SPACE_SEPARATOR
|| c
== 9); // SPACE_SEPARATOR || TAB
3136 success
^= (UBool
)(opValue
!= 0); // flip sense for \H
3138 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3140 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3146 case URX_BACKSLASH_R
: // Test for \R, any line break sequence.
3148 if (fp
->fInputIdx
>= fActiveLimit
) {
3150 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3153 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3154 UChar32 c
= UTEXT_NEXT32(fInputText
);
3155 if (isLineTerminator(c
)) {
3156 if (c
== 0x0d && utext_current32(fInputText
) == 0x0a) {
3157 utext_next32(fInputText
);
3159 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3161 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3167 case URX_BACKSLASH_V
: // \v, any single line ending character.
3169 if (fp
->fInputIdx
>= fActiveLimit
) {
3171 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3174 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3175 UChar32 c
= UTEXT_NEXT32(fInputText
);
3176 UBool success
= isLineTerminator(c
);
3177 success
^= (UBool
)(opValue
!= 0); // flip sense for \V
3179 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3181 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3187 case URX_BACKSLASH_X
:
3188 // Match a Grapheme, as defined by Unicode TR 29.
3189 // Differs slightly from Perl, which consumes combining marks independently
3193 // Fail if at end of input
3194 if (fp
->fInputIdx
>= fActiveLimit
) {
3196 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3200 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3202 // Examine (and consume) the current char.
3203 // Dispatch into a little state machine, based on the char.
3205 c
= UTEXT_NEXT32(fInputText
);
3206 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3207 UnicodeSet
**sets
= fPattern
->fStaticSets
;
3208 if (sets
[URX_GC_NORMAL
]->contains(c
)) goto GC_Extend
;
3209 if (sets
[URX_GC_CONTROL
]->contains(c
)) goto GC_Control
;
3210 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
3211 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
3212 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
3213 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
3214 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
3220 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
3221 c
= UTEXT_NEXT32(fInputText
);
3222 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3223 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
3224 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
3225 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
3226 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
3227 (void)UTEXT_PREVIOUS32(fInputText
);
3228 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3232 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
3233 c
= UTEXT_NEXT32(fInputText
);
3234 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3235 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
3236 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
3237 (void)UTEXT_PREVIOUS32(fInputText
);
3238 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3242 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
3243 c
= UTEXT_NEXT32(fInputText
);
3244 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3245 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
3246 (void)UTEXT_PREVIOUS32(fInputText
);
3247 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3251 // Combining characters are consumed here
3253 if (fp
->fInputIdx
>= fActiveLimit
) {
3256 c
= UTEXT_CURRENT32(fInputText
);
3257 if (sets
[URX_GC_EXTEND
]->contains(c
) == FALSE
) {
3260 (void)UTEXT_NEXT32(fInputText
);
3261 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3266 // Most control chars stand alone (don't combine with combining chars),
3267 // except for that CR/LF sequence is a single grapheme cluster.
3268 if (c
== 0x0d && fp
->fInputIdx
< fActiveLimit
&& UTEXT_CURRENT32(fInputText
) == 0x0a) {
3269 c
= UTEXT_NEXT32(fInputText
);
3270 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3274 if (fp
->fInputIdx
>= fActiveLimit
) {
3283 case URX_BACKSLASH_Z
: // Test for end of Input
3284 if (fp
->fInputIdx
< fAnchorLimit
) {
3285 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3294 case URX_STATIC_SETREF
:
3296 // Test input character against one of the predefined sets
3297 // (Word Characters, for example)
3298 // The high bit of the op value is a flag for the match polarity.
3299 // 0: success if input char is in set.
3300 // 1: success if input char is not in set.
3301 if (fp
->fInputIdx
>= fActiveLimit
) {
3303 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3307 UBool success
= ((opValue
& URX_NEG_SET
) == URX_NEG_SET
);
3308 opValue
&= ~URX_NEG_SET
;
3309 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
3311 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3312 UChar32 c
= UTEXT_NEXT32(fInputText
);
3314 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
3315 if (s8
->contains(c
)) {
3319 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
3320 if (s
->contains(c
)) {
3325 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3327 // the character wasn't in the set.
3328 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3334 case URX_STAT_SETREF_N
:
3336 // Test input character for NOT being a member of one of
3337 // the predefined sets (Word Characters, for example)
3338 if (fp
->fInputIdx
>= fActiveLimit
) {
3340 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3344 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
3346 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3348 UChar32 c
= UTEXT_NEXT32(fInputText
);
3350 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
3351 if (s8
->contains(c
) == FALSE
) {
3352 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3356 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
3357 if (s
->contains(c
) == FALSE
) {
3358 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3362 // the character wasn't in the set.
3363 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3369 if (fp
->fInputIdx
>= fActiveLimit
) {
3371 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3374 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3376 // There is input left. Pick up one char and test it for set membership.
3377 UChar32 c
= UTEXT_NEXT32(fInputText
);
3378 U_ASSERT(opValue
> 0 && opValue
< sets
->size());
3380 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
3381 if (s8
->contains(c
)) {
3382 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3386 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
3387 if (s
->contains(c
)) {
3388 // The character is in the set. A Match.
3389 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3394 // the character wasn't in the set.
3395 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3402 // . matches anything, but stops at end-of-line.
3403 if (fp
->fInputIdx
>= fActiveLimit
) {
3404 // At end of input. Match failed. Backtrack out.
3406 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3410 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3412 // There is input left. Advance over one char, unless we've hit end-of-line
3413 UChar32 c
= UTEXT_NEXT32(fInputText
);
3414 if (isLineTerminator(c
)) {
3415 // End of line in normal mode. . does not match.
3416 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3419 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3424 case URX_DOTANY_ALL
:
3426 // ., in dot-matches-all (including new lines) mode
3427 if (fp
->fInputIdx
>= fActiveLimit
) {
3428 // At end of input. Match failed. Backtrack out.
3430 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3434 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3436 // There is input left. Advance over one char, except if we are
3437 // at a cr/lf, advance over both of them.
3439 c
= UTEXT_NEXT32(fInputText
);
3440 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3441 if (c
==0x0d && fp
->fInputIdx
< fActiveLimit
) {
3442 // In the case of a CR/LF, we need to advance over both.
3443 UChar32 nextc
= UTEXT_CURRENT32(fInputText
);
3444 if (nextc
== 0x0a) {
3445 (void)UTEXT_NEXT32(fInputText
);
3446 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3453 case URX_DOTANY_UNIX
:
3455 // '.' operator, matches all, but stops at end-of-line.
3456 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
3457 if (fp
->fInputIdx
>= fActiveLimit
) {
3458 // At end of input. Match failed. Backtrack out.
3460 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3464 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3466 // There is input left. Advance over one char, unless we've hit end-of-line
3467 UChar32 c
= UTEXT_NEXT32(fInputText
);
3469 // End of line in normal mode. '.' does not match the \n
3470 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3472 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3479 fp
->fPatIdx
= opValue
;
3487 U_ASSERT(opValue
< fPattern
->fCompiledPat
->size());
3488 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
3489 fp
->fPatIdx
= opValue
; // Then JMP.
3493 // This opcode is used with (x)+, when x can match a zero length string.
3494 // Same as JMP_SAV, except conditional on the match having made forward progress.
3495 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3496 // data address of the input position at the start of the loop.
3498 U_ASSERT(opValue
> 0 && opValue
< fPattern
->fCompiledPat
->size());
3499 int32_t stoOp
= (int32_t)pat
[opValue
-1];
3500 U_ASSERT(URX_TYPE(stoOp
) == URX_STO_INP_LOC
);
3501 int32_t frameLoc
= URX_VAL(stoOp
);
3502 U_ASSERT(frameLoc
>= 0 && frameLoc
< fFrameSize
);
3503 int64_t prevInputIdx
= fp
->fExtra
[frameLoc
];
3504 U_ASSERT(prevInputIdx
<= fp
->fInputIdx
);
3505 if (prevInputIdx
< fp
->fInputIdx
) {
3506 // The match did make progress. Repeat the loop.
3507 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
3508 fp
->fPatIdx
= opValue
;
3509 fp
->fExtra
[frameLoc
] = fp
->fInputIdx
;
3511 // If the input position did not advance, we do nothing here,
3512 // execution will fall out of the loop.
3518 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
3519 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
3521 // Pick up the three extra operands that CTR_INIT has, and
3522 // skip the pattern location counter past
3523 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
3525 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
3526 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
3527 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
3528 U_ASSERT(minCount
>=0);
3529 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
3530 U_ASSERT(loopLoc
>=fp
->fPatIdx
);
3532 if (minCount
== 0) {
3533 fp
= StateSave(fp
, loopLoc
+1, status
);
3535 if (maxCount
== -1) {
3536 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // For loop breaking.
3537 } else if (maxCount
== 0) {
3538 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3545 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
3546 int32_t initOp
= (int32_t)pat
[opValue
];
3547 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT
);
3548 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
3549 int32_t minCount
= (int32_t)pat
[opValue
+2];
3550 int32_t maxCount
= (int32_t)pat
[opValue
+3];
3552 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
&& maxCount
!= -1) {
3553 U_ASSERT(*pCounter
== maxCount
);
3556 if (*pCounter
>= minCount
) {
3557 if (maxCount
== -1) {
3558 // Loop has no hard upper bound.
3559 // Check that it is progressing through the input, break if it is not.
3560 int64_t *pLastInputIdx
= &fp
->fExtra
[URX_VAL(initOp
) + 1];
3561 if (fp
->fInputIdx
== *pLastInputIdx
) {
3564 *pLastInputIdx
= fp
->fInputIdx
;
3567 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
3569 fp
->fPatIdx
= opValue
+ 4; // Loop back.
3573 case URX_CTR_INIT_NG
:
3575 // Initialize a non-greedy loop
3576 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
3577 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
3579 // Pick up the three extra operands that CTR_INIT_NG has, and
3580 // skip the pattern location counter past
3581 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
3583 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
3584 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
3585 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
3586 U_ASSERT(minCount
>=0);
3587 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
3588 U_ASSERT(loopLoc
>fp
->fPatIdx
);
3589 if (maxCount
== -1) {
3590 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // Save initial input index for loop breaking.
3593 if (minCount
== 0) {
3594 if (maxCount
!= 0) {
3595 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
3597 fp
->fPatIdx
= loopLoc
+1; // Continue with stuff after repeated block
3602 case URX_CTR_LOOP_NG
:
3604 // Non-greedy {min, max} loops
3605 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
3606 int32_t initOp
= (int32_t)pat
[opValue
];
3607 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT_NG
);
3608 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
3609 int32_t minCount
= (int32_t)pat
[opValue
+2];
3610 int32_t maxCount
= (int32_t)pat
[opValue
+3];
3613 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
&& maxCount
!= -1) {
3614 // The loop has matched the maximum permitted number of times.
3615 // Break out of here with no action. Matching will
3616 // continue with the following pattern.
3617 U_ASSERT(*pCounter
== maxCount
);
3621 if (*pCounter
< minCount
) {
3622 // We haven't met the minimum number of matches yet.
3623 // Loop back for another one.
3624 fp
->fPatIdx
= opValue
+ 4; // Loop back.
3626 // We do have the minimum number of matches.
3628 // If there is no upper bound on the loop iterations, check that the input index
3629 // is progressing, and stop the loop if it is not.
3630 if (maxCount
== -1) {
3631 int64_t *pLastInputIdx
= &fp
->fExtra
[URX_VAL(initOp
) + 1];
3632 if (fp
->fInputIdx
== *pLastInputIdx
) {
3635 *pLastInputIdx
= fp
->fInputIdx
;
3638 // Loop Continuation: we will fall into the pattern following the loop
3639 // (non-greedy, don't execute loop body first), but first do
3640 // a state save to the top of the loop, so that a match failure
3641 // in the following pattern will try another iteration of the loop.
3642 fp
= StateSave(fp
, opValue
+ 4, status
);
3648 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
3649 fData
[opValue
] = fStack
->size();
3654 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
3655 int32_t newStackSize
= (int32_t)fData
[opValue
];
3656 U_ASSERT(newStackSize
<= fStack
->size());
3657 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
3658 if (newFP
== (int64_t *)fp
) {
3662 for (i
=0; i
<fFrameSize
; i
++) {
3663 newFP
[i
] = ((int64_t *)fp
)[i
];
3665 fp
= (REStackFrame
*)newFP
;
3666 fStack
->setSize(newStackSize
);
3672 U_ASSERT(opValue
< fFrameSize
);
3673 int64_t groupStartIdx
= fp
->fExtra
[opValue
];
3674 int64_t groupEndIdx
= fp
->fExtra
[opValue
+1];
3675 U_ASSERT(groupStartIdx
<= groupEndIdx
);
3676 if (groupStartIdx
< 0) {
3677 // This capture group has not participated in the match thus far,
3678 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
3681 UTEXT_SETNATIVEINDEX(fAltInputText
, groupStartIdx
);
3682 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3684 // Note: if the capture group match was of an empty string the backref
3685 // match succeeds. Verified by testing: Perl matches succeed
3686 // in this case, so we do too.
3688 UBool success
= TRUE
;
3690 if (utext_getNativeIndex(fAltInputText
) >= groupEndIdx
) {
3694 if (utext_getNativeIndex(fInputText
) >= fActiveLimit
) {
3699 UChar32 captureGroupChar
= utext_next32(fAltInputText
);
3700 UChar32 inputChar
= utext_next32(fInputText
);
3701 if (inputChar
!= captureGroupChar
) {
3708 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3710 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3719 U_ASSERT(opValue
< fFrameSize
);
3720 int64_t groupStartIdx
= fp
->fExtra
[opValue
];
3721 int64_t groupEndIdx
= fp
->fExtra
[opValue
+1];
3722 U_ASSERT(groupStartIdx
<= groupEndIdx
);
3723 if (groupStartIdx
< 0) {
3724 // This capture group has not participated in the match thus far,
3725 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
3728 utext_setNativeIndex(fAltInputText
, groupStartIdx
);
3729 utext_setNativeIndex(fInputText
, fp
->fInputIdx
);
3730 CaseFoldingUTextIterator
captureGroupItr(*fAltInputText
);
3731 CaseFoldingUTextIterator
inputItr(*fInputText
);
3733 // Note: if the capture group match was of an empty string the backref
3734 // match succeeds. Verified by testing: Perl matches succeed
3735 // in this case, so we do too.
3737 UBool success
= TRUE
;
3739 if (!captureGroupItr
.inExpansion() && utext_getNativeIndex(fAltInputText
) >= groupEndIdx
) {
3743 if (!inputItr
.inExpansion() && utext_getNativeIndex(fInputText
) >= fActiveLimit
) {
3748 UChar32 captureGroupChar
= captureGroupItr
.next();
3749 UChar32 inputChar
= inputItr
.next();
3750 if (inputChar
!= captureGroupChar
) {
3756 if (success
&& inputItr
.inExpansion()) {
3757 // We otained a match by consuming part of a string obtained from
3758 // case-folding a single code point of the input text.
3759 // This does not count as an overall match.
3764 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3766 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3772 case URX_STO_INP_LOC
:
3774 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
);
3775 fp
->fExtra
[opValue
] = fp
->fInputIdx
;
3781 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
3783 int32_t dataLoc
= URX_VAL(pat
[instrOperandLoc
]);
3784 U_ASSERT(dataLoc
>= 0 && dataLoc
< fFrameSize
);
3785 int64_t savedInputIdx
= fp
->fExtra
[dataLoc
];
3786 U_ASSERT(savedInputIdx
<= fp
->fInputIdx
);
3787 if (savedInputIdx
< fp
->fInputIdx
) {
3788 fp
->fPatIdx
= opValue
; // JMP
3790 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no progress in loop.
3797 // Entering a lookahead block.
3798 // Save Stack Ptr, Input Pos.
3799 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3800 fData
[opValue
] = fStack
->size();
3801 fData
[opValue
+1] = fp
->fInputIdx
;
3802 fActiveStart
= fLookStart
; // Set the match region change for
3803 fActiveLimit
= fLookLimit
; // transparent bounds.
3809 // Leaving a look-ahead block.
3810 // restore Stack Ptr, Input Pos to positions they had on entry to block.
3811 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3812 int32_t stackSize
= fStack
->size();
3813 int32_t newStackSize
=(int32_t)fData
[opValue
];
3814 U_ASSERT(stackSize
>= newStackSize
);
3815 if (stackSize
> newStackSize
) {
3816 // Copy the current top frame back to the new (cut back) top frame.
3817 // This makes the capture groups from within the look-ahead
3818 // expression available.
3819 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
3821 for (i
=0; i
<fFrameSize
; i
++) {
3822 newFP
[i
] = ((int64_t *)fp
)[i
];
3824 fp
= (REStackFrame
*)newFP
;
3825 fStack
->setSize(newStackSize
);
3827 fp
->fInputIdx
= fData
[opValue
+1];
3829 // Restore the active region bounds in the input string; they may have
3830 // been changed because of transparent bounds on a Region.
3831 fActiveStart
= fRegionStart
;
3832 fActiveLimit
= fRegionLimit
;
3837 // Case insensitive one char. The char from the pattern is already case folded.
3838 // Input text is not, but case folding the input can not reduce two or more code
3840 if (fp
->fInputIdx
< fActiveLimit
) {
3841 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3843 UChar32 c
= UTEXT_NEXT32(fInputText
);
3844 if (u_foldCase(c
, U_FOLD_CASE_DEFAULT
) == opValue
) {
3845 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3852 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3857 // Case-insensitive test input against a literal string.
3858 // Strings require two slots in the compiled pattern, one for the
3859 // offset to the string text, and one for the length.
3860 // The compiled string has already been case folded.
3862 const UChar
*patternString
= litText
+ opValue
;
3863 int32_t patternStringIdx
= 0;
3865 op
= (int32_t)pat
[fp
->fPatIdx
];
3867 opType
= URX_TYPE(op
);
3868 opValue
= URX_VAL(op
);
3869 U_ASSERT(opType
== URX_STRING_LEN
);
3870 int32_t patternStringLen
= opValue
; // Length of the string from the pattern.
3875 UBool success
= TRUE
;
3877 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
3878 CaseFoldingUTextIterator
inputIterator(*fInputText
);
3879 while (patternStringIdx
< patternStringLen
) {
3880 if (!inputIterator
.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText
) >= fActiveLimit
) {
3885 U16_NEXT(patternString
, patternStringIdx
, patternStringLen
, cPattern
);
3886 cText
= inputIterator
.next();
3887 if (cText
!= cPattern
) {
3892 if (inputIterator
.inExpansion()) {
3897 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3899 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3907 // Entering a look-behind block.
3908 // Save Stack Ptr, Input Pos.
3909 // TODO: implement transparent bounds. Ticket #6067
3910 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3911 fData
[opValue
] = fStack
->size();
3912 fData
[opValue
+1] = fp
->fInputIdx
;
3913 // Init the variable containing the start index for attempted matches.
3914 fData
[opValue
+2] = -1;
3915 // Save input string length, then reset to pin any matches to end at
3916 // the current position.
3917 fData
[opValue
+3] = fActiveLimit
;
3918 fActiveLimit
= fp
->fInputIdx
;
3925 // Positive Look-Behind, at top of loop checking for matches of LB expression
3926 // at all possible input starting positions.
3928 // Fetch the min and max possible match lengths. They are the operands
3929 // of this op in the pattern.
3930 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
3931 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
3932 U_ASSERT(minML
<= maxML
);
3933 U_ASSERT(minML
>= 0);
3935 // Fetch (from data) the last input index where a match was attempted.
3936 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3937 int64_t *lbStartIdx
= &fData
[opValue
+2];
3938 if (*lbStartIdx
< 0) {
3939 // First time through loop.
3940 *lbStartIdx
= fp
->fInputIdx
- minML
;
3942 // 2nd through nth time through the loop.
3943 // Back up start position for match by one.
3944 if (*lbStartIdx
== 0) {
3947 UTEXT_SETNATIVEINDEX(fInputText
, *lbStartIdx
);
3948 (void)UTEXT_PREVIOUS32(fInputText
);
3949 *lbStartIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
3953 if (*lbStartIdx
< 0 || *lbStartIdx
< fp
->fInputIdx
- maxML
) {
3954 // We have tried all potential match starting points without
3955 // getting a match. Backtrack out, and out of the
3956 // Look Behind altogether.
3957 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3958 int64_t restoreInputLen
= fData
[opValue
+3];
3959 U_ASSERT(restoreInputLen
>= fActiveLimit
);
3960 U_ASSERT(restoreInputLen
<= fInputLength
);
3961 fActiveLimit
= restoreInputLen
;
3965 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3966 // (successful match will fall off the end of the loop.)
3967 fp
= StateSave(fp
, fp
->fPatIdx
-3, status
);
3968 fp
->fInputIdx
= *lbStartIdx
;
3973 // End of a look-behind block, after a successful match.
3975 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
3976 if (fp
->fInputIdx
!= fActiveLimit
) {
3977 // The look-behind expression matched, but the match did not
3978 // extend all the way to the point that we are looking behind from.
3979 // FAIL out of here, which will take us back to the LB_CONT, which
3980 // will retry the match starting at another position or fail
3981 // the look-behind altogether, whichever is appropriate.
3982 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
3986 // Look-behind match is good. Restore the orignal input string length,
3987 // which had been truncated to pin the end of the lookbehind match to the
3988 // position being looked-behind.
3989 int64_t originalInputLen
= fData
[opValue
+3];
3990 U_ASSERT(originalInputLen
>= fActiveLimit
);
3991 U_ASSERT(originalInputLen
<= fInputLength
);
3992 fActiveLimit
= originalInputLen
;
3999 // Negative Look-Behind, at top of loop checking for matches of LB expression
4000 // at all possible input starting positions.
4002 // Fetch the extra parameters of this op.
4003 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
4004 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
4005 int32_t continueLoc
= (int32_t)pat
[fp
->fPatIdx
++];
4006 continueLoc
= URX_VAL(continueLoc
);
4007 U_ASSERT(minML
<= maxML
);
4008 U_ASSERT(minML
>= 0);
4009 U_ASSERT(continueLoc
> fp
->fPatIdx
);
4011 // Fetch (from data) the last input index where a match was attempted.
4012 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
4013 int64_t *lbStartIdx
= &fData
[opValue
+2];
4014 if (*lbStartIdx
< 0) {
4015 // First time through loop.
4016 *lbStartIdx
= fp
->fInputIdx
- minML
;
4018 // 2nd through nth time through the loop.
4019 // Back up start position for match by one.
4020 if (*lbStartIdx
== 0) {
4023 UTEXT_SETNATIVEINDEX(fInputText
, *lbStartIdx
);
4024 (void)UTEXT_PREVIOUS32(fInputText
);
4025 *lbStartIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
4029 if (*lbStartIdx
< 0 || *lbStartIdx
< fp
->fInputIdx
- maxML
) {
4030 // We have tried all potential match starting points without
4031 // getting a match, which means that the negative lookbehind as
4032 // a whole has succeeded. Jump forward to the continue location
4033 int64_t restoreInputLen
= fData
[opValue
+3];
4034 U_ASSERT(restoreInputLen
>= fActiveLimit
);
4035 U_ASSERT(restoreInputLen
<= fInputLength
);
4036 fActiveLimit
= restoreInputLen
;
4037 fp
->fPatIdx
= continueLoc
;
4041 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4042 // (successful match will cause a FAIL out of the loop altogether.)
4043 fp
= StateSave(fp
, fp
->fPatIdx
-4, status
);
4044 fp
->fInputIdx
= *lbStartIdx
;
4049 // End of a negative look-behind block, after a successful match.
4051 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
4052 if (fp
->fInputIdx
!= fActiveLimit
) {
4053 // The look-behind expression matched, but the match did not
4054 // extend all the way to the point that we are looking behind from.
4055 // FAIL out of here, which will take us back to the LB_CONT, which
4056 // will retry the match starting at another position or succeed
4057 // the look-behind altogether, whichever is appropriate.
4058 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4062 // Look-behind expression matched, which means look-behind test as
4065 // Restore the orignal input string length, which had been truncated
4066 // inorder to pin the end of the lookbehind match
4067 // to the position being looked-behind.
4068 int64_t originalInputLen
= fData
[opValue
+3];
4069 U_ASSERT(originalInputLen
>= fActiveLimit
);
4070 U_ASSERT(originalInputLen
<= fInputLength
);
4071 fActiveLimit
= originalInputLen
;
4073 // Restore original stack position, discarding any state saved
4074 // by the successful pattern match.
4075 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
4076 int32_t newStackSize
= (int32_t)fData
[opValue
];
4077 U_ASSERT(fStack
->size() > newStackSize
);
4078 fStack
->setSize(newStackSize
);
4080 // FAIL, which will take control back to someplace
4081 // prior to entering the look-behind test.
4082 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4088 // Loop Initialization for the optimized implementation of
4089 // [some character set]*
4090 // This op scans through all matching input.
4091 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4093 U_ASSERT(opValue
> 0 && opValue
< sets
->size());
4094 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
4095 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
4097 // Loop through input, until either the input is exhausted or
4098 // we reach a character that is not a member of the set.
4099 int64_t ix
= fp
->fInputIdx
;
4100 UTEXT_SETNATIVEINDEX(fInputText
, ix
);
4102 if (ix
>= fActiveLimit
) {
4106 UChar32 c
= UTEXT_NEXT32(fInputText
);
4108 if (s8
->contains(c
) == FALSE
) {
4112 if (s
->contains(c
) == FALSE
) {
4116 ix
= UTEXT_GETNATIVEINDEX(fInputText
);
4119 // If there were no matching characters, skip over the loop altogether.
4120 // The loop doesn't run at all, a * op always succeeds.
4121 if (ix
== fp
->fInputIdx
) {
4122 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
4126 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4127 // must follow. It's operand is the stack location
4128 // that holds the starting input index for the match of this [set]*
4129 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
4130 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
4131 int32_t stackLoc
= URX_VAL(loopcOp
);
4132 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
4133 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
4136 // Save State to the URX_LOOP_C op that follows this one,
4137 // so that match failures in the following code will return to there.
4138 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4139 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
4145 case URX_LOOP_DOT_I
:
4146 // Loop Initialization for the optimized implementation of .*
4147 // This op scans through all remaining input.
4148 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4150 // Loop through input until the input is exhausted (we reach an end-of-line)
4151 // In DOTALL mode, we can just go straight to the end of the input.
4153 if ((opValue
& 1) == 1) {
4154 // Dot-matches-All mode. Jump straight to the end of the string.
4158 // NOT DOT ALL mode. Line endings do not match '.'
4159 // Scan forward until a line ending or end of input.
4161 UTEXT_SETNATIVEINDEX(fInputText
, ix
);
4163 if (ix
>= fActiveLimit
) {
4167 UChar32 c
= UTEXT_NEXT32(fInputText
);
4168 if ((c
& 0x7f) <= 0x29) { // Fast filter of non-new-line-s
4169 if ((c
== 0x0a) || // 0x0a is newline in both modes.
4170 (((opValue
& 2) == 0) && // IF not UNIX_LINES mode
4171 isLineTerminator(c
))) {
4172 // char is a line ending. Exit the scanning loop.
4176 ix
= UTEXT_GETNATIVEINDEX(fInputText
);
4180 // If there were no matching characters, skip over the loop altogether.
4181 // The loop doesn't run at all, a * op always succeeds.
4182 if (ix
== fp
->fInputIdx
) {
4183 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
4187 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4188 // must follow. It's operand is the stack location
4189 // that holds the starting input index for the match of this .*
4190 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
4191 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
4192 int32_t stackLoc
= URX_VAL(loopcOp
);
4193 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
4194 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
4197 // Save State to the URX_LOOP_C op that follows this one,
4198 // so that match failures in the following code will return to there.
4199 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4200 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
4208 U_ASSERT(opValue
>=0 && opValue
<fFrameSize
);
4209 backSearchIndex
= fp
->fExtra
[opValue
];
4210 U_ASSERT(backSearchIndex
<= fp
->fInputIdx
);
4211 if (backSearchIndex
== fp
->fInputIdx
) {
4212 // We've backed up the input idx to the point that the loop started.
4213 // The loop is done. Leave here without saving state.
4214 // Subsequent failures won't come back here.
4217 // Set up for the next iteration of the loop, with input index
4218 // backed up by one from the last time through,
4219 // and a state save to this instruction in case the following code fails again.
4220 // (We're going backwards because this loop emulates stack unwinding, not
4221 // the initial scan forward.)
4222 U_ASSERT(fp
->fInputIdx
> 0);
4223 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
4224 UChar32 prevC
= UTEXT_PREVIOUS32(fInputText
);
4225 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
4227 UChar32 twoPrevC
= UTEXT_PREVIOUS32(fInputText
);
4228 if (prevC
== 0x0a &&
4229 fp
->fInputIdx
> backSearchIndex
&&
4231 int32_t prevOp
= (int32_t)pat
[fp
->fPatIdx
-2];
4232 if (URX_TYPE(prevOp
) == URX_LOOP_DOT_I
) {
4233 // .*, stepping back over CRLF pair.
4234 fp
->fInputIdx
= UTEXT_GETNATIVEINDEX(fInputText
);
4239 fp
= StateSave(fp
, fp
->fPatIdx
-1, status
);
4246 // Trouble. The compiled pattern contains an entry with an
4247 // unrecognized type tag.
4251 if (U_FAILURE(status
)) {
4260 fLastMatchEnd
= fMatchEnd
;
4261 fMatchStart
= startIdx
;
4262 fMatchEnd
= fp
->fInputIdx
;
4265 #ifdef REGEX_RUN_DEBUG
4268 printf("Match. start=%ld end=%ld\n\n", fMatchStart
, fMatchEnd
);
4270 printf("No match\n\n");
4275 fFrame
= fp
; // The active stack frame when the engine stopped.
4276 // Contains the capture group results that we need to
4282 //--------------------------------------------------------------------------------
4284 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
4285 // assumption that the entire string is available in the UText's
4286 // chunk buffer. For now, that means we can use int32_t indexes,
4287 // except for anything that needs to be saved (like group starts
4290 // startIdx: begin matching at this index.
4291 // toEnd: if true, match must extend to end of the input region
4293 //--------------------------------------------------------------------------------
4294 void RegexMatcher::MatchChunkAt(int32_t startIdx
, UBool toEnd
, UErrorCode
&status
) {
4295 UBool isMatch
= FALSE
; // True if the we have a match.
4297 int32_t backSearchIndex
= INT32_MAX
; // used after greedy single-character matches for searching backwards
4299 int32_t op
; // Operation from the compiled pattern, split into
4300 int32_t opType
; // the opcode
4301 int32_t opValue
; // and the operand value.
4303 #ifdef REGEX_RUN_DEBUG
4305 printf("MatchAt(startIdx=%d)\n", startIdx
);
4306 printf("Original Pattern: ");
4307 UChar32 c
= utext_next32From(fPattern
->fPattern
, 0);
4308 while (c
!= U_SENTINEL
) {
4309 if (c
<32 || c
>256) {
4314 c
= UTEXT_NEXT32(fPattern
->fPattern
);
4317 printf("Input String: ");
4318 c
= utext_next32From(fInputText
, 0);
4319 while (c
!= U_SENTINEL
) {
4320 if (c
<32 || c
>256) {
4325 c
= UTEXT_NEXT32(fInputText
);
4332 if (U_FAILURE(status
)) {
4336 // Cache frequently referenced items from the compiled pattern
4338 int64_t *pat
= fPattern
->fCompiledPat
->getBuffer();
4340 const UChar
*litText
= fPattern
->fLiteralText
.getBuffer();
4341 UVector
*sets
= fPattern
->fSets
;
4343 const UChar
*inputBuf
= fInputText
->chunkContents
;
4345 fFrameSize
= fPattern
->fFrameSize
;
4346 REStackFrame
*fp
= resetStack();
4349 fp
->fInputIdx
= startIdx
;
4351 // Zero out the pattern's static data
4353 for (i
= 0; i
<fPattern
->fDataSize
; i
++) {
4358 // Main loop for interpreting the compiled pattern.
4359 // One iteration of the loop per pattern operation performed.
4362 op
= (int32_t)pat
[fp
->fPatIdx
];
4363 opType
= URX_TYPE(op
);
4364 opValue
= URX_VAL(op
);
4365 #ifdef REGEX_RUN_DEBUG
4367 UTEXT_SETNATIVEINDEX(fInputText
, fp
->fInputIdx
);
4368 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp
->fInputIdx
,
4369 UTEXT_CURRENT32(fInputText
), (int64_t *)fp
-fStack
->getBuffer(), fActiveLimit
);
4370 fPattern
->dumpOp(fp
->fPatIdx
);
4383 // Force a backtrack. In some circumstances, the pattern compiler
4384 // will notice that the pattern can't possibly match anything, and will
4385 // emit one of these at that point.
4386 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4391 if (fp
->fInputIdx
< fActiveLimit
) {
4393 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4400 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4406 // Test input against a literal string.
4407 // Strings require two slots in the compiled pattern, one for the
4408 // offset to the string text, and one for the length.
4409 int32_t stringStartIdx
= opValue
;
4412 op
= (int32_t)pat
[fp
->fPatIdx
]; // Fetch the second operand
4414 opType
= URX_TYPE(op
);
4415 stringLen
= URX_VAL(op
);
4416 U_ASSERT(opType
== URX_STRING_LEN
);
4417 U_ASSERT(stringLen
>= 2);
4419 const UChar
* pInp
= inputBuf
+ fp
->fInputIdx
;
4420 const UChar
* pInpLimit
= inputBuf
+ fActiveLimit
;
4421 const UChar
* pPat
= litText
+stringStartIdx
;
4422 const UChar
* pEnd
= pInp
+ stringLen
;
4423 UBool success
= TRUE
;
4424 while (pInp
< pEnd
) {
4425 if (pInp
>= pInpLimit
) {
4430 if (*pInp
++ != *pPat
++) {
4437 fp
->fInputIdx
+= stringLen
;
4439 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4445 case URX_STATE_SAVE
:
4446 fp
= StateSave(fp
, opValue
, status
);
4451 // The match loop will exit via this path on a successful match,
4452 // when we reach the end of the pattern.
4453 if (toEnd
&& fp
->fInputIdx
!= fActiveLimit
) {
4454 // The pattern matched, but not to the end of input. Try some more.
4455 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4461 // Start and End Capture stack frame variables are laid out out like this:
4462 // fp->fExtra[opValue] - The start of a completed capture group
4463 // opValue+1 - The end of a completed capture group
4464 // opValue+2 - the start of a capture group whose end
4465 // has not yet been reached (and might not ever be).
4466 case URX_START_CAPTURE
:
4467 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
4468 fp
->fExtra
[opValue
+2] = fp
->fInputIdx
;
4472 case URX_END_CAPTURE
:
4473 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-3);
4474 U_ASSERT(fp
->fExtra
[opValue
+2] >= 0); // Start pos for this group must be set.
4475 fp
->fExtra
[opValue
] = fp
->fExtra
[opValue
+2]; // Tentative start becomes real.
4476 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // End position
4477 U_ASSERT(fp
->fExtra
[opValue
] <= fp
->fExtra
[opValue
+1]);
4481 case URX_DOLLAR
: // $, test for End of line
4482 // or for position before new line at end of input
4483 if (fp
->fInputIdx
< fAnchorLimit
-2) {
4484 // We are no where near the end of input. Fail.
4485 // This is the common case. Keep it first.
4486 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4489 if (fp
->fInputIdx
>= fAnchorLimit
) {
4490 // We really are at the end of input. Success.
4496 // If we are positioned just before a new-line that is located at the
4497 // end of input, succeed.
4498 if (fp
->fInputIdx
== fAnchorLimit
-1) {
4500 U16_GET(inputBuf
, fAnchorStart
, fp
->fInputIdx
, fAnchorLimit
, c
);
4502 if (isLineTerminator(c
)) {
4503 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& inputBuf
[fp
->fInputIdx
-1]==0x0d)) {
4504 // At new-line at end of input. Success
4510 } else if (fp
->fInputIdx
== fAnchorLimit
-2 &&
4511 inputBuf
[fp
->fInputIdx
]==0x0d && inputBuf
[fp
->fInputIdx
+1]==0x0a) {
4514 break; // At CR/LF at end of input. Success
4517 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4522 case URX_DOLLAR_D
: // $, test for End of Line, in UNIX_LINES mode.
4523 if (fp
->fInputIdx
>= fAnchorLimit
-1) {
4524 // Either at the last character of input, or off the end.
4525 if (fp
->fInputIdx
== fAnchorLimit
-1) {
4526 // At last char of input. Success if it's a new line.
4527 if (inputBuf
[fp
->fInputIdx
] == 0x0a) {
4533 // Off the end of input. Success.
4540 // Not at end of input. Back-track out.
4541 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4545 case URX_DOLLAR_M
: // $, test for End of line in multi-line mode
4547 if (fp
->fInputIdx
>= fAnchorLimit
) {
4548 // We really are at the end of input. Success.
4553 // If we are positioned just before a new-line, succeed.
4554 // It makes no difference where the new-line is within the input.
4555 UChar32 c
= inputBuf
[fp
->fInputIdx
];
4556 if (isLineTerminator(c
)) {
4557 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4558 // In multi-line mode, hitting a new-line just before the end of input does not
4559 // set the hitEnd or requireEnd flags
4560 if ( !(c
==0x0a && fp
->fInputIdx
>fAnchorStart
&& inputBuf
[fp
->fInputIdx
-1]==0x0d)) {
4564 // not at a new line. Fail.
4565 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4570 case URX_DOLLAR_MD
: // $, test for End of line in multi-line and UNIX_LINES mode
4572 if (fp
->fInputIdx
>= fAnchorLimit
) {
4573 // We really are at the end of input. Success.
4575 fRequireEnd
= TRUE
; // Java set requireEnd in this case, even though
4576 break; // adding a new-line would not lose the match.
4578 // If we are not positioned just before a new-line, the test fails; backtrack out.
4579 // It makes no difference where the new-line is within the input.
4580 if (inputBuf
[fp
->fInputIdx
] != 0x0a) {
4581 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4587 case URX_CARET
: // ^, test for start of line
4588 if (fp
->fInputIdx
!= fAnchorStart
) {
4589 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4594 case URX_CARET_M
: // ^, test for start of line in mulit-line mode
4596 if (fp
->fInputIdx
== fAnchorStart
) {
4597 // We are at the start input. Success.
4600 // Check whether character just before the current pos is a new-line
4601 // unless we are at the end of input
4602 UChar c
= inputBuf
[fp
->fInputIdx
- 1];
4603 if ((fp
->fInputIdx
< fAnchorLimit
) &&
4604 isLineTerminator(c
)) {
4605 // It's a new-line. ^ is true. Success.
4606 // TODO: what should be done with positions between a CR and LF?
4609 // Not at the start of a line. Fail.
4610 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4615 case URX_CARET_M_UNIX
: // ^, test for start of line in mulit-line + Unix-line mode
4617 U_ASSERT(fp
->fInputIdx
>= fAnchorStart
);
4618 if (fp
->fInputIdx
<= fAnchorStart
) {
4619 // We are at the start input. Success.
4622 // Check whether character just before the current pos is a new-line
4623 U_ASSERT(fp
->fInputIdx
<= fAnchorLimit
);
4624 UChar c
= inputBuf
[fp
->fInputIdx
- 1];
4626 // Not at the start of a line. Back-track out.
4627 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4632 case URX_BACKSLASH_B
: // Test for word boundaries
4634 UBool success
= isChunkWordBoundary((int32_t)fp
->fInputIdx
);
4635 success
^= (UBool
)(opValue
!= 0); // flip sense for \B
4637 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4643 case URX_BACKSLASH_BU
: // Test for word boundaries, Unicode-style
4645 UBool success
= isUWordBoundary(fp
->fInputIdx
);
4646 success
^= (UBool
)(opValue
!= 0); // flip sense for \B
4648 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4654 case URX_BACKSLASH_D
: // Test for decimal digit
4656 if (fp
->fInputIdx
>= fActiveLimit
) {
4658 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4663 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4664 int8_t ctype
= u_charType(c
); // TODO: make a unicode set for this. Will be faster.
4665 UBool success
= (ctype
== U_DECIMAL_DIGIT_NUMBER
);
4666 success
^= (UBool
)(opValue
!= 0); // flip sense for \D
4668 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4674 case URX_BACKSLASH_G
: // Test for position at end of previous match
4675 if (!((fMatch
&& fp
->fInputIdx
==fMatchEnd
) || (fMatch
==FALSE
&& fp
->fInputIdx
==fActiveStart
))) {
4676 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4681 case URX_BACKSLASH_H
: // Test for \h, horizontal white space.
4683 if (fp
->fInputIdx
>= fActiveLimit
) {
4685 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4689 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4690 int8_t ctype
= u_charType(c
);
4691 UBool success
= (ctype
== U_SPACE_SEPARATOR
|| c
== 9); // SPACE_SEPARATOR || TAB
4692 success
^= (UBool
)(opValue
!= 0); // flip sense for \H
4694 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4700 case URX_BACKSLASH_R
: // Test for \R, any line break sequence.
4702 if (fp
->fInputIdx
>= fActiveLimit
) {
4704 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4708 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4709 if (isLineTerminator(c
)) {
4710 if (c
== 0x0d && fp
->fInputIdx
< fActiveLimit
) {
4711 // Check for CR/LF sequence. Consume both together when found.
4713 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c2
);
4715 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c2
);
4719 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4725 case URX_BACKSLASH_V
: // Any single code point line ending.
4727 if (fp
->fInputIdx
>= fActiveLimit
) {
4729 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4733 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4734 UBool success
= isLineTerminator(c
);
4735 success
^= (UBool
)(opValue
!= 0); // flip sense for \V
4737 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4744 case URX_BACKSLASH_X
:
4745 // Match a Grapheme, as defined by Unicode TR 29.
4746 // Differs slightly from Perl, which consumes combining marks independently
4750 // Fail if at end of input
4751 if (fp
->fInputIdx
>= fActiveLimit
) {
4753 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4757 // Examine (and consume) the current char.
4758 // Dispatch into a little state machine, based on the char.
4760 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4761 UnicodeSet
**sets
= fPattern
->fStaticSets
;
4762 if (sets
[URX_GC_NORMAL
]->contains(c
)) goto GC_Extend
;
4763 if (sets
[URX_GC_CONTROL
]->contains(c
)) goto GC_Control
;
4764 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
4765 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
4766 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
4767 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
4768 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
4774 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
4775 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4776 if (sets
[URX_GC_L
]->contains(c
)) goto GC_L
;
4777 if (sets
[URX_GC_LV
]->contains(c
)) goto GC_V
;
4778 if (sets
[URX_GC_LVT
]->contains(c
)) goto GC_T
;
4779 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
4780 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
4784 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
4785 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4786 if (sets
[URX_GC_V
]->contains(c
)) goto GC_V
;
4787 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
4788 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
4792 if (fp
->fInputIdx
>= fActiveLimit
) goto GC_Done
;
4793 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4794 if (sets
[URX_GC_T
]->contains(c
)) goto GC_T
;
4795 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, c
);
4799 // Combining characters are consumed here
4801 if (fp
->fInputIdx
>= fActiveLimit
) {
4804 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4805 if (sets
[URX_GC_EXTEND
]->contains(c
) == FALSE
) {
4806 U16_BACK_1(inputBuf
, 0, fp
->fInputIdx
);
4813 // Most control chars stand alone (don't combine with combining chars),
4814 // except that a CR/LF sequence is a single grapheme cluster.
4815 if (c
== 0x0d && fp
->fInputIdx
< fActiveLimit
&& inputBuf
[fp
->fInputIdx
] == 0x0a) {
4820 if (fp
->fInputIdx
>= fActiveLimit
) {
4829 case URX_BACKSLASH_Z
: // Test for end of Input
4830 if (fp
->fInputIdx
< fAnchorLimit
) {
4831 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4840 case URX_STATIC_SETREF
:
4842 // Test input character against one of the predefined sets
4843 // (Word Characters, for example)
4844 // The high bit of the op value is a flag for the match polarity.
4845 // 0: success if input char is in set.
4846 // 1: success if input char is not in set.
4847 if (fp
->fInputIdx
>= fActiveLimit
) {
4849 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4853 UBool success
= ((opValue
& URX_NEG_SET
) == URX_NEG_SET
);
4854 opValue
&= ~URX_NEG_SET
;
4855 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
4858 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4860 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
4861 if (s8
->contains(c
)) {
4865 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
4866 if (s
->contains(c
)) {
4871 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4877 case URX_STAT_SETREF_N
:
4879 // Test input character for NOT being a member of one of
4880 // the predefined sets (Word Characters, for example)
4881 if (fp
->fInputIdx
>= fActiveLimit
) {
4883 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4887 U_ASSERT(opValue
> 0 && opValue
< URX_LAST_SET
);
4890 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4892 Regex8BitSet
*s8
= &fPattern
->fStaticSets8
[opValue
];
4893 if (s8
->contains(c
) == FALSE
) {
4897 const UnicodeSet
*s
= fPattern
->fStaticSets
[opValue
];
4898 if (s
->contains(c
) == FALSE
) {
4902 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4909 if (fp
->fInputIdx
>= fActiveLimit
) {
4911 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4915 U_ASSERT(opValue
> 0 && opValue
< sets
->size());
4917 // There is input left. Pick up one char and test it for set membership.
4919 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4921 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
4922 if (s8
->contains(c
)) {
4923 // The character is in the set. A Match.
4927 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
4928 if (s
->contains(c
)) {
4929 // The character is in the set. A Match.
4934 // the character wasn't in the set.
4935 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4942 // . matches anything, but stops at end-of-line.
4943 if (fp
->fInputIdx
>= fActiveLimit
) {
4944 // At end of input. Match failed. Backtrack out.
4946 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4950 // There is input left. Advance over one char, unless we've hit end-of-line
4952 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4953 if (isLineTerminator(c
)) {
4954 // End of line in normal mode. . does not match.
4955 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4962 case URX_DOTANY_ALL
:
4964 // . in dot-matches-all (including new lines) mode
4965 if (fp
->fInputIdx
>= fActiveLimit
) {
4966 // At end of input. Match failed. Backtrack out.
4968 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4972 // There is input left. Advance over one char, except if we are
4973 // at a cr/lf, advance over both of them.
4975 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
4976 if (c
==0x0d && fp
->fInputIdx
< fActiveLimit
) {
4977 // In the case of a CR/LF, we need to advance over both.
4978 if (inputBuf
[fp
->fInputIdx
] == 0x0a) {
4979 U16_FWD_1(inputBuf
, fp
->fInputIdx
, fActiveLimit
);
4986 case URX_DOTANY_UNIX
:
4988 // '.' operator, matches all, but stops at end-of-line.
4989 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
4990 if (fp
->fInputIdx
>= fActiveLimit
) {
4991 // At end of input. Match failed. Backtrack out.
4993 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
4997 // There is input left. Advance over one char, unless we've hit end-of-line
4999 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
5001 // End of line in normal mode. '.' does not match the \n
5002 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5009 fp
->fPatIdx
= opValue
;
5017 U_ASSERT(opValue
< fPattern
->fCompiledPat
->size());
5018 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
5019 fp
->fPatIdx
= opValue
; // Then JMP.
5023 // This opcode is used with (x)+, when x can match a zero length string.
5024 // Same as JMP_SAV, except conditional on the match having made forward progress.
5025 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
5026 // data address of the input position at the start of the loop.
5028 U_ASSERT(opValue
> 0 && opValue
< fPattern
->fCompiledPat
->size());
5029 int32_t stoOp
= (int32_t)pat
[opValue
-1];
5030 U_ASSERT(URX_TYPE(stoOp
) == URX_STO_INP_LOC
);
5031 int32_t frameLoc
= URX_VAL(stoOp
);
5032 U_ASSERT(frameLoc
>= 0 && frameLoc
< fFrameSize
);
5033 int32_t prevInputIdx
= (int32_t)fp
->fExtra
[frameLoc
];
5034 U_ASSERT(prevInputIdx
<= fp
->fInputIdx
);
5035 if (prevInputIdx
< fp
->fInputIdx
) {
5036 // The match did make progress. Repeat the loop.
5037 fp
= StateSave(fp
, fp
->fPatIdx
, status
); // State save to loc following current
5038 fp
->fPatIdx
= opValue
;
5039 fp
->fExtra
[frameLoc
] = fp
->fInputIdx
;
5041 // If the input position did not advance, we do nothing here,
5042 // execution will fall out of the loop.
5048 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
5049 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
5051 // Pick up the three extra operands that CTR_INIT has, and
5052 // skip the pattern location counter past
5053 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
5055 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
5056 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
5057 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
5058 U_ASSERT(minCount
>=0);
5059 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
5060 U_ASSERT(loopLoc
>=fp
->fPatIdx
);
5062 if (minCount
== 0) {
5063 fp
= StateSave(fp
, loopLoc
+1, status
);
5065 if (maxCount
== -1) {
5066 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // For loop breaking.
5067 } else if (maxCount
== 0) {
5068 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5075 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
5076 int32_t initOp
= (int32_t)pat
[opValue
];
5077 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT
);
5078 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
5079 int32_t minCount
= (int32_t)pat
[opValue
+2];
5080 int32_t maxCount
= (int32_t)pat
[opValue
+3];
5082 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
&& maxCount
!= -1) {
5083 U_ASSERT(*pCounter
== maxCount
);
5086 if (*pCounter
>= minCount
) {
5087 if (maxCount
== -1) {
5088 // Loop has no hard upper bound.
5089 // Check that it is progressing through the input, break if it is not.
5090 int64_t *pLastInputIdx
= &fp
->fExtra
[URX_VAL(initOp
) + 1];
5091 if (fp
->fInputIdx
== *pLastInputIdx
) {
5094 *pLastInputIdx
= fp
->fInputIdx
;
5097 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5099 fp
->fPatIdx
= opValue
+ 4; // Loop back.
5103 case URX_CTR_INIT_NG
:
5105 // Initialize a non-greedy loop
5106 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
-2);
5107 fp
->fExtra
[opValue
] = 0; // Set the loop counter variable to zero
5109 // Pick up the three extra operands that CTR_INIT_NG has, and
5110 // skip the pattern location counter past
5111 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
5113 int32_t loopLoc
= URX_VAL(pat
[instrOperandLoc
]);
5114 int32_t minCount
= (int32_t)pat
[instrOperandLoc
+1];
5115 int32_t maxCount
= (int32_t)pat
[instrOperandLoc
+2];
5116 U_ASSERT(minCount
>=0);
5117 U_ASSERT(maxCount
>=minCount
|| maxCount
==-1);
5118 U_ASSERT(loopLoc
>fp
->fPatIdx
);
5119 if (maxCount
== -1) {
5120 fp
->fExtra
[opValue
+1] = fp
->fInputIdx
; // Save initial input index for loop breaking.
5123 if (minCount
== 0) {
5124 if (maxCount
!= 0) {
5125 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5127 fp
->fPatIdx
= loopLoc
+1; // Continue with stuff after repeated block
5132 case URX_CTR_LOOP_NG
:
5134 // Non-greedy {min, max} loops
5135 U_ASSERT(opValue
>0 && opValue
< fp
->fPatIdx
-2);
5136 int32_t initOp
= (int32_t)pat
[opValue
];
5137 U_ASSERT(URX_TYPE(initOp
) == URX_CTR_INIT_NG
);
5138 int64_t *pCounter
= &fp
->fExtra
[URX_VAL(initOp
)];
5139 int32_t minCount
= (int32_t)pat
[opValue
+2];
5140 int32_t maxCount
= (int32_t)pat
[opValue
+3];
5143 if ((uint64_t)*pCounter
>= (uint32_t)maxCount
&& maxCount
!= -1) {
5144 // The loop has matched the maximum permitted number of times.
5145 // Break out of here with no action. Matching will
5146 // continue with the following pattern.
5147 U_ASSERT(*pCounter
== maxCount
);
5151 if (*pCounter
< minCount
) {
5152 // We haven't met the minimum number of matches yet.
5153 // Loop back for another one.
5154 fp
->fPatIdx
= opValue
+ 4; // Loop back.
5156 // We do have the minimum number of matches.
5158 // If there is no upper bound on the loop iterations, check that the input index
5159 // is progressing, and stop the loop if it is not.
5160 if (maxCount
== -1) {
5161 int64_t *pLastInputIdx
= &fp
->fExtra
[URX_VAL(initOp
) + 1];
5162 if (fp
->fInputIdx
== *pLastInputIdx
) {
5165 *pLastInputIdx
= fp
->fInputIdx
;
5168 // Loop Continuation: we will fall into the pattern following the loop
5169 // (non-greedy, don't execute loop body first), but first do
5170 // a state save to the top of the loop, so that a match failure
5171 // in the following pattern will try another iteration of the loop.
5172 fp
= StateSave(fp
, opValue
+ 4, status
);
5178 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
5179 fData
[opValue
] = fStack
->size();
5184 U_ASSERT(opValue
>= 0 && opValue
< fPattern
->fDataSize
);
5185 int32_t newStackSize
= (int32_t)fData
[opValue
];
5186 U_ASSERT(newStackSize
<= fStack
->size());
5187 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
5188 if (newFP
== (int64_t *)fp
) {
5192 for (i
=0; i
<fFrameSize
; i
++) {
5193 newFP
[i
] = ((int64_t *)fp
)[i
];
5195 fp
= (REStackFrame
*)newFP
;
5196 fStack
->setSize(newStackSize
);
5202 U_ASSERT(opValue
< fFrameSize
);
5203 int64_t groupStartIdx
= fp
->fExtra
[opValue
];
5204 int64_t groupEndIdx
= fp
->fExtra
[opValue
+1];
5205 U_ASSERT(groupStartIdx
<= groupEndIdx
);
5206 int64_t inputIndex
= fp
->fInputIdx
;
5207 if (groupStartIdx
< 0) {
5208 // This capture group has not participated in the match thus far,
5209 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
5212 UBool success
= TRUE
;
5213 for (int64_t groupIndex
= groupStartIdx
; groupIndex
< groupEndIdx
; ++groupIndex
,++inputIndex
) {
5214 if (inputIndex
>= fActiveLimit
) {
5219 if (inputBuf
[groupIndex
] != inputBuf
[inputIndex
]) {
5225 fp
->fInputIdx
= inputIndex
;
5227 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5234 U_ASSERT(opValue
< fFrameSize
);
5235 int64_t groupStartIdx
= fp
->fExtra
[opValue
];
5236 int64_t groupEndIdx
= fp
->fExtra
[opValue
+1];
5237 U_ASSERT(groupStartIdx
<= groupEndIdx
);
5238 if (groupStartIdx
< 0) {
5239 // This capture group has not participated in the match thus far,
5240 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no match.
5243 CaseFoldingUCharIterator
captureGroupItr(inputBuf
, groupStartIdx
, groupEndIdx
);
5244 CaseFoldingUCharIterator
inputItr(inputBuf
, fp
->fInputIdx
, fActiveLimit
);
5246 // Note: if the capture group match was of an empty string the backref
5247 // match succeeds. Verified by testing: Perl matches succeed
5248 // in this case, so we do too.
5250 UBool success
= TRUE
;
5252 UChar32 captureGroupChar
= captureGroupItr
.next();
5253 if (captureGroupChar
== U_SENTINEL
) {
5257 UChar32 inputChar
= inputItr
.next();
5258 if (inputChar
== U_SENTINEL
) {
5263 if (inputChar
!= captureGroupChar
) {
5269 if (success
&& inputItr
.inExpansion()) {
5270 // We obtained a match by consuming part of a string obtained from
5271 // case-folding a single code point of the input text.
5272 // This does not count as an overall match.
5277 fp
->fInputIdx
= inputItr
.getIndex();
5279 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5284 case URX_STO_INP_LOC
:
5286 U_ASSERT(opValue
>= 0 && opValue
< fFrameSize
);
5287 fp
->fExtra
[opValue
] = fp
->fInputIdx
;
5293 int32_t instrOperandLoc
= (int32_t)fp
->fPatIdx
;
5295 int32_t dataLoc
= URX_VAL(pat
[instrOperandLoc
]);
5296 U_ASSERT(dataLoc
>= 0 && dataLoc
< fFrameSize
);
5297 int32_t savedInputIdx
= (int32_t)fp
->fExtra
[dataLoc
];
5298 U_ASSERT(savedInputIdx
<= fp
->fInputIdx
);
5299 if (savedInputIdx
< fp
->fInputIdx
) {
5300 fp
->fPatIdx
= opValue
; // JMP
5302 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
); // FAIL, no progress in loop.
5309 // Entering a lookahead block.
5310 // Save Stack Ptr, Input Pos.
5311 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5312 fData
[opValue
] = fStack
->size();
5313 fData
[opValue
+1] = fp
->fInputIdx
;
5314 fActiveStart
= fLookStart
; // Set the match region change for
5315 fActiveLimit
= fLookLimit
; // transparent bounds.
5321 // Leaving a look-ahead block.
5322 // restore Stack Ptr, Input Pos to positions they had on entry to block.
5323 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5324 int32_t stackSize
= fStack
->size();
5325 int32_t newStackSize
= (int32_t)fData
[opValue
];
5326 U_ASSERT(stackSize
>= newStackSize
);
5327 if (stackSize
> newStackSize
) {
5328 // Copy the current top frame back to the new (cut back) top frame.
5329 // This makes the capture groups from within the look-ahead
5330 // expression available.
5331 int64_t *newFP
= fStack
->getBuffer() + newStackSize
- fFrameSize
;
5333 for (i
=0; i
<fFrameSize
; i
++) {
5334 newFP
[i
] = ((int64_t *)fp
)[i
];
5336 fp
= (REStackFrame
*)newFP
;
5337 fStack
->setSize(newStackSize
);
5339 fp
->fInputIdx
= fData
[opValue
+1];
5341 // Restore the active region bounds in the input string; they may have
5342 // been changed because of transparent bounds on a Region.
5343 fActiveStart
= fRegionStart
;
5344 fActiveLimit
= fRegionLimit
;
5349 if (fp
->fInputIdx
< fActiveLimit
) {
5351 U16_NEXT(inputBuf
, fp
->fInputIdx
, fActiveLimit
, c
);
5352 if (u_foldCase(c
, U_FOLD_CASE_DEFAULT
) == opValue
) {
5358 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5362 // Case-insensitive test input against a literal string.
5363 // Strings require two slots in the compiled pattern, one for the
5364 // offset to the string text, and one for the length.
5365 // The compiled string has already been case folded.
5367 const UChar
*patternString
= litText
+ opValue
;
5369 op
= (int32_t)pat
[fp
->fPatIdx
];
5371 opType
= URX_TYPE(op
);
5372 opValue
= URX_VAL(op
);
5373 U_ASSERT(opType
== URX_STRING_LEN
);
5374 int32_t patternStringLen
= opValue
; // Length of the string from the pattern.
5378 UBool success
= TRUE
;
5379 int32_t patternStringIdx
= 0;
5380 CaseFoldingUCharIterator
inputIterator(inputBuf
, fp
->fInputIdx
, fActiveLimit
);
5381 while (patternStringIdx
< patternStringLen
) {
5382 U16_NEXT(patternString
, patternStringIdx
, patternStringLen
, cPattern
);
5383 cText
= inputIterator
.next();
5384 if (cText
!= cPattern
) {
5386 if (cText
== U_SENTINEL
) {
5392 if (inputIterator
.inExpansion()) {
5397 fp
->fInputIdx
= inputIterator
.getIndex();
5399 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5406 // Entering a look-behind block.
5407 // Save Stack Ptr, Input Pos.
5408 // TODO: implement transparent bounds. Ticket #6067
5409 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5410 fData
[opValue
] = fStack
->size();
5411 fData
[opValue
+1] = fp
->fInputIdx
;
5412 // Init the variable containing the start index for attempted matches.
5413 fData
[opValue
+2] = -1;
5414 // Save input string length, then reset to pin any matches to end at
5415 // the current position.
5416 fData
[opValue
+3] = fActiveLimit
;
5417 fActiveLimit
= fp
->fInputIdx
;
5424 // Positive Look-Behind, at top of loop checking for matches of LB expression
5425 // at all possible input starting positions.
5427 // Fetch the min and max possible match lengths. They are the operands
5428 // of this op in the pattern.
5429 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
5430 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
5431 U_ASSERT(minML
<= maxML
);
5432 U_ASSERT(minML
>= 0);
5434 // Fetch (from data) the last input index where a match was attempted.
5435 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5436 int64_t *lbStartIdx
= &fData
[opValue
+2];
5437 if (*lbStartIdx
< 0) {
5438 // First time through loop.
5439 *lbStartIdx
= fp
->fInputIdx
- minML
;
5441 // 2nd through nth time through the loop.
5442 // Back up start position for match by one.
5443 if (*lbStartIdx
== 0) {
5446 U16_BACK_1(inputBuf
, 0, *lbStartIdx
);
5450 if (*lbStartIdx
< 0 || *lbStartIdx
< fp
->fInputIdx
- maxML
) {
5451 // We have tried all potential match starting points without
5452 // getting a match. Backtrack out, and out of the
5453 // Look Behind altogether.
5454 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5455 int64_t restoreInputLen
= fData
[opValue
+3];
5456 U_ASSERT(restoreInputLen
>= fActiveLimit
);
5457 U_ASSERT(restoreInputLen
<= fInputLength
);
5458 fActiveLimit
= restoreInputLen
;
5462 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5463 // (successful match will fall off the end of the loop.)
5464 fp
= StateSave(fp
, fp
->fPatIdx
-3, status
);
5465 fp
->fInputIdx
= *lbStartIdx
;
5470 // End of a look-behind block, after a successful match.
5472 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5473 if (fp
->fInputIdx
!= fActiveLimit
) {
5474 // The look-behind expression matched, but the match did not
5475 // extend all the way to the point that we are looking behind from.
5476 // FAIL out of here, which will take us back to the LB_CONT, which
5477 // will retry the match starting at another position or fail
5478 // the look-behind altogether, whichever is appropriate.
5479 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5483 // Look-behind match is good. Restore the original input string length,
5484 // which had been truncated to pin the end of the lookbehind match to the
5485 // position being looked-behind.
5486 int64_t originalInputLen
= fData
[opValue
+3];
5487 U_ASSERT(originalInputLen
>= fActiveLimit
);
5488 U_ASSERT(originalInputLen
<= fInputLength
);
5489 fActiveLimit
= originalInputLen
;
5496 // Negative Look-Behind, at top of loop checking for matches of LB expression
5497 // at all possible input starting positions.
5499 // Fetch the extra parameters of this op.
5500 int32_t minML
= (int32_t)pat
[fp
->fPatIdx
++];
5501 int32_t maxML
= (int32_t)pat
[fp
->fPatIdx
++];
5502 int32_t continueLoc
= (int32_t)pat
[fp
->fPatIdx
++];
5503 continueLoc
= URX_VAL(continueLoc
);
5504 U_ASSERT(minML
<= maxML
);
5505 U_ASSERT(minML
>= 0);
5506 U_ASSERT(continueLoc
> fp
->fPatIdx
);
5508 // Fetch (from data) the last input index where a match was attempted.
5509 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5510 int64_t *lbStartIdx
= &fData
[opValue
+2];
5511 if (*lbStartIdx
< 0) {
5512 // First time through loop.
5513 *lbStartIdx
= fp
->fInputIdx
- minML
;
5515 // 2nd through nth time through the loop.
5516 // Back up start position for match by one.
5517 if (*lbStartIdx
== 0) {
5518 (*lbStartIdx
)--; // Because U16_BACK is unsafe starting at 0.
5520 U16_BACK_1(inputBuf
, 0, *lbStartIdx
);
5524 if (*lbStartIdx
< 0 || *lbStartIdx
< fp
->fInputIdx
- maxML
) {
5525 // We have tried all potential match starting points without
5526 // getting a match, which means that the negative lookbehind as
5527 // a whole has succeeded. Jump forward to the continue location
5528 int64_t restoreInputLen
= fData
[opValue
+3];
5529 U_ASSERT(restoreInputLen
>= fActiveLimit
);
5530 U_ASSERT(restoreInputLen
<= fInputLength
);
5531 fActiveLimit
= restoreInputLen
;
5532 fp
->fPatIdx
= continueLoc
;
5536 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5537 // (successful match will cause a FAIL out of the loop altogether.)
5538 fp
= StateSave(fp
, fp
->fPatIdx
-4, status
);
5539 fp
->fInputIdx
= *lbStartIdx
;
5544 // End of a negative look-behind block, after a successful match.
5546 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5547 if (fp
->fInputIdx
!= fActiveLimit
) {
5548 // The look-behind expression matched, but the match did not
5549 // extend all the way to the point that we are looking behind from.
5550 // FAIL out of here, which will take us back to the LB_CONT, which
5551 // will retry the match starting at another position or succeed
5552 // the look-behind altogether, whichever is appropriate.
5553 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5557 // Look-behind expression matched, which means look-behind test as
5560 // Restore the original input string length, which had been truncated
5561 // in order to pin the end of the lookbehind match
5562 // to the position being looked-behind.
5563 int64_t originalInputLen
= fData
[opValue
+3];
5564 U_ASSERT(originalInputLen
>= fActiveLimit
);
5565 U_ASSERT(originalInputLen
<= fInputLength
);
5566 fActiveLimit
= originalInputLen
;
5568 // Restore original stack position, discarding any state saved
5569 // by the successful pattern match.
5570 U_ASSERT(opValue
>=0 && opValue
+1<fPattern
->fDataSize
);
5571 int32_t newStackSize
= (int32_t)fData
[opValue
];
5572 U_ASSERT(fStack
->size() > newStackSize
);
5573 fStack
->setSize(newStackSize
);
5575 // FAIL, which will take control back to someplace
5576 // prior to entering the look-behind test.
5577 fp
= (REStackFrame
*)fStack
->popFrame(fFrameSize
);
5583 // Loop Initialization for the optimized implementation of
5584 // [some character set]*
5585 // This op scans through all matching input.
5586 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5588 U_ASSERT(opValue
> 0 && opValue
< sets
->size());
5589 Regex8BitSet
*s8
= &fPattern
->fSets8
[opValue
];
5590 UnicodeSet
*s
= (UnicodeSet
*)sets
->elementAt(opValue
);
5592 // Loop through input, until either the input is exhausted or
5593 // we reach a character that is not a member of the set.
5594 int32_t ix
= (int32_t)fp
->fInputIdx
;
5596 if (ix
>= fActiveLimit
) {
5601 U16_NEXT(inputBuf
, ix
, fActiveLimit
, c
);
5603 if (s8
->contains(c
) == FALSE
) {
5604 U16_BACK_1(inputBuf
, 0, ix
);
5608 if (s
->contains(c
) == FALSE
) {
5609 U16_BACK_1(inputBuf
, 0, ix
);
5615 // If there were no matching characters, skip over the loop altogether.
5616 // The loop doesn't run at all, a * op always succeeds.
5617 if (ix
== fp
->fInputIdx
) {
5618 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
5622 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5623 // must follow. Its operand is the stack location
5624 // that holds the starting input index for the match of this [set]*
5625 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
5626 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
5627 int32_t stackLoc
= URX_VAL(loopcOp
);
5628 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
5629 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
5632 // Save State to the URX_LOOP_C op that follows this one,
5633 // so that match failures in the following code will return to there.
5634 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5635 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5641 case URX_LOOP_DOT_I
:
5642 // Loop Initialization for the optimized implementation of .*
5643 // This op scans through all remaining input.
5644 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5646 // Loop through input until the input is exhausted (we reach an end-of-line)
5647 // In DOTALL mode, we can just go straight to the end of the input.
5649 if ((opValue
& 1) == 1) {
5650 // Dot-matches-All mode. Jump straight to the end of the string.
5651 ix
= (int32_t)fActiveLimit
;
5654 // NOT DOT ALL mode. Line endings do not match '.'
5655 // Scan forward until a line ending or end of input.
5656 ix
= (int32_t)fp
->fInputIdx
;
5658 if (ix
>= fActiveLimit
) {
5663 U16_NEXT(inputBuf
, ix
, fActiveLimit
, c
); // c = inputBuf[ix++]
5664 if ((c
& 0x7f) <= 0x29) { // Fast filter of non-new-line-s
5665 if ((c
== 0x0a) || // 0x0a is newline in both modes.
5666 (((opValue
& 2) == 0) && // IF not UNIX_LINES mode
5667 isLineTerminator(c
))) {
5668 // char is a line ending. Put the input pos back to the
5669 // line ending char, and exit the scanning loop.
5670 U16_BACK_1(inputBuf
, 0, ix
);
5677 // If there were no matching characters, skip over the loop altogether.
5678 // The loop doesn't run at all, a * op always succeeds.
5679 if (ix
== fp
->fInputIdx
) {
5680 fp
->fPatIdx
++; // skip the URX_LOOP_C op.
5684 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5685 // must follow. Its operand is the stack location
5686 // that holds the starting input index for the match of this .*
5687 int32_t loopcOp
= (int32_t)pat
[fp
->fPatIdx
];
5688 U_ASSERT(URX_TYPE(loopcOp
) == URX_LOOP_C
);
5689 int32_t stackLoc
= URX_VAL(loopcOp
);
5690 U_ASSERT(stackLoc
>= 0 && stackLoc
< fFrameSize
);
5691 fp
->fExtra
[stackLoc
] = fp
->fInputIdx
;
5694 // Save State to the URX_LOOP_C op that follows this one,
5695 // so that match failures in the following code will return to there.
5696 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5697 fp
= StateSave(fp
, fp
->fPatIdx
, status
);
5705 U_ASSERT(opValue
>=0 && opValue
<fFrameSize
);
5706 backSearchIndex
= (int32_t)fp
->fExtra
[opValue
];
5707 U_ASSERT(backSearchIndex
<= fp
->fInputIdx
);
5708 if (backSearchIndex
== fp
->fInputIdx
) {
5709 // We've backed up the input idx to the point that the loop started.
5710 // The loop is done. Leave here without saving state.
5711 // Subsequent failures won't come back here.
5714 // Set up for the next iteration of the loop, with input index
5715 // backed up by one from the last time through,
5716 // and a state save to this instruction in case the following code fails again.
5717 // (We're going backwards because this loop emulates stack unwinding, not
5718 // the initial scan forward.)
5719 U_ASSERT(fp
->fInputIdx
> 0);
5721 U16_PREV(inputBuf
, 0, fp
->fInputIdx
, prevC
); // !!!: should this 0 be one of f*Limit?
5723 if (prevC
== 0x0a &&
5724 fp
->fInputIdx
> backSearchIndex
&&
5725 inputBuf
[fp
->fInputIdx
-1] == 0x0d) {
5726 int32_t prevOp
= (int32_t)pat
[fp
->fPatIdx
-2];
5727 if (URX_TYPE(prevOp
) == URX_LOOP_DOT_I
) {
5728 // .*, stepping back over CRLF pair.
5729 U16_BACK_1(inputBuf
, 0, fp
->fInputIdx
);
5734 fp
= StateSave(fp
, fp
->fPatIdx
-1, status
);
5741 // Trouble. The compiled pattern contains an entry with an
5742 // unrecognized type tag.
5746 if (U_FAILURE(status
)) {
5755 fLastMatchEnd
= fMatchEnd
;
5756 fMatchStart
= startIdx
;
5757 fMatchEnd
= fp
->fInputIdx
;
5760 #ifdef REGEX_RUN_DEBUG
5763 printf("Match. start=%ld end=%ld\n\n", fMatchStart
, fMatchEnd
);
5765 printf("No match\n\n");
5770 fFrame
= fp
; // The active stack frame when the engine stopped.
5771 // Contains the capture group results that we need to
5778 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher
)
5782 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS