]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/rematch.cpp
ICU-400.38.tar.gz
[apple/icu.git] / icuSources / i18n / rematch.cpp
1 /*
2 **************************************************************************
3 * Copyright (C) 2002-2008 International Business Machines Corporation *
4 * and others. All rights reserved. *
5 **************************************************************************
6 */
7 //
8 // file: rematch.cpp
9 //
10 // Contains the implementation of class RegexMatcher,
11 // which is one of the main API classes for the ICU regular expression package.
12 //
13
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
16
17 #include "unicode/regex.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ustring.h"
21 #include "unicode/rbbi.h"
22 #include "uassert.h"
23 #include "cmemory.h"
24 #include "uvector.h"
25 #include "uvectr32.h"
26 #include "regeximp.h"
27 #include "regexst.h"
28
29 // #include <malloc.h> // Needed for heapcheck testing
30
31 U_NAMESPACE_BEGIN
32
33 // Default limit for the size of the back track stack, to avoid system
34 // failures causedby heap exhaustion. Units are in 32 bit words, not bytes.
35 // This value puts ICU's limits higher than most other regexp implementations,
36 // which use recursion rather than the heap, and take more storage per
37 // backtrack point.
38 //
39 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
40
41 // Time limit counter constant.
42 // Time limits for expression evaluation are in terms of quanta of work by
43 // the engine, each of which is 10,000 state saves.
44 // This constant determines that state saves per tick number.
45 static const int32_t TIMER_INITIAL_VALUE = 10000;
46
47 //-----------------------------------------------------------------------------
48 //
49 // Constructor and Destructor
50 //
51 //-----------------------------------------------------------------------------
52 RegexMatcher::RegexMatcher(const RegexPattern *pat) {
53 fDeferredStatus = U_ZERO_ERROR;
54 init(fDeferredStatus);
55 if (U_FAILURE(fDeferredStatus)) {
56 return;
57 }
58 if (pat==NULL) {
59 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
60 return;
61 }
62 fPattern = pat;
63 init2(RegexStaticSets::gStaticSets->fEmptyString, fDeferredStatus);
64 }
65
66
67
68 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
69 uint32_t flags, UErrorCode &status) {
70 init(status);
71 if (U_FAILURE(status)) {
72 return;
73 }
74 UParseError pe;
75 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
76 fPattern = fPatternOwned;
77 init2(input, status);
78 }
79
80
81 RegexMatcher::RegexMatcher(const UnicodeString &regexp,
82 uint32_t flags, UErrorCode &status) {
83 init(status);
84 if (U_FAILURE(status)) {
85 return;
86 }
87 UParseError pe;
88 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
89 fPattern = fPatternOwned;
90 init2(RegexStaticSets::gStaticSets->fEmptyString, status);
91 }
92
93
94
95
96 RegexMatcher::~RegexMatcher() {
97 delete fStack;
98 if (fData != fSmallData) {
99 uprv_free(fData);
100 fData = NULL;
101 }
102 if (fPatternOwned) {
103 delete fPatternOwned;
104 fPatternOwned = NULL;
105 fPattern = NULL;
106 }
107 #if UCONFIG_NO_BREAK_ITERATION==0
108 delete fWordBreakItr;
109 #endif
110 }
111
112 //
113 // init() common initialization for use by all constructors.
114 // Initialize all fields, get the object into a consistent state.
115 // This must be done even when the initial status shows an error,
116 // so that the object is initialized sufficiently well for the destructor
117 // to run safely.
118 //
119 void RegexMatcher::init(UErrorCode &status) {
120 fPattern = NULL;
121 fPatternOwned = NULL;
122 fInput = NULL;
123 fFrameSize = 0;
124 fRegionStart = 0;
125 fRegionLimit = 0;
126 fAnchorStart = 0;
127 fAnchorLimit = 0;
128 fLookStart = 0;
129 fLookLimit = 0;
130 fActiveStart = 0;
131 fActiveLimit = 0;
132 fTransparentBounds = FALSE;
133 fAnchoringBounds = TRUE;
134 fMatch = FALSE;
135 fMatchStart = 0;
136 fMatchEnd = 0;
137 fLastMatchEnd = -1;
138 fAppendPosition = 0;
139 fHitEnd = FALSE;
140 fRequireEnd = FALSE;
141 fStack = NULL;
142 fFrame = NULL;
143 fTimeLimit = 0;
144 fTime = 0;
145 fTickCounter = 0;
146 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY;
147 fCallbackFn = NULL;
148 fCallbackContext = NULL;
149 fTraceDebug = FALSE;
150 fDeferredStatus = status;
151 fData = fSmallData;
152 fWordBreakItr = NULL;
153
154 fStack = new UVector32(status);
155 if (U_FAILURE(status)) {
156 fDeferredStatus = status;
157 }
158 }
159
160 //
161 // init2() Common initialization for use by RegexMatcher constructors, part 2.
162 // This handles the common setup to be done after the Pattern is available.
163 //
164 void RegexMatcher::init2(const UnicodeString &input, UErrorCode &status) {
165 if (U_FAILURE(status)) {
166 fDeferredStatus = status;
167 return;
168 }
169
170 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) {
171 fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t));
172 if (fData == NULL) {
173 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
174 return;
175 }
176 }
177
178 reset(input);
179 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
180 if (U_FAILURE(status)) {
181 fDeferredStatus = status;
182 return;
183 }
184 }
185
186
187 static const UChar BACKSLASH = 0x5c;
188 static const UChar DOLLARSIGN = 0x24;
189 //--------------------------------------------------------------------------------
190 //
191 // appendReplacement
192 //
193 //--------------------------------------------------------------------------------
194 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
195 const UnicodeString &replacement,
196 UErrorCode &status) {
197 if (U_FAILURE(status)) {
198 return *this;
199 }
200 if (U_FAILURE(fDeferredStatus)) {
201 status = fDeferredStatus;
202 return *this;
203 }
204 if (fMatch == FALSE) {
205 status = U_REGEX_INVALID_STATE;
206 return *this;
207 }
208
209 // Copy input string from the end of previous match to start of current match
210 int32_t len = fMatchStart-fAppendPosition;
211 if (len > 0) {
212 dest.append(*fInput, fAppendPosition, len);
213 }
214 fAppendPosition = fMatchEnd;
215
216
217 // scan the replacement text, looking for substitutions ($n) and \escapes.
218 // TODO: optimize this loop by efficiently scanning for '$' or '\',
219 // move entire ranges not containing substitutions.
220 int32_t replLen = replacement.length();
221 int32_t replIdx = 0;
222 while (replIdx<replLen) {
223 UChar c = replacement.charAt(replIdx);
224 replIdx++;
225 if (c == BACKSLASH) {
226 // Backslash Escape. Copy the following char out without further checks.
227 // Note: Surrogate pairs don't need any special handling
228 // The second half wont be a '$' or a '\', and
229 // will move to the dest normally on the next
230 // loop iteration.
231 if (replIdx >= replLen) {
232 break;
233 }
234 c = replacement.charAt(replIdx);
235
236 if (c==0x55/*U*/ || c==0x75/*u*/) {
237 // We have a \udddd or \Udddddddd escape sequence.
238 UChar32 escapedChar = replacement.unescapeAt(replIdx);
239 if (escapedChar != (UChar32)0xFFFFFFFF) {
240 dest.append(escapedChar);
241 // TODO: Report errors for mal-formed \u escapes?
242 // As this is, the original sequence is output, which may be OK.
243 continue;
244 }
245 }
246
247 // Plain backslash escape. Just put out the escaped character.
248 dest.append(c);
249 replIdx++;
250 continue;
251 }
252
253 if (c != DOLLARSIGN) {
254 // Normal char, not a $. Copy it out without further checks.
255 dest.append(c);
256 continue;
257 }
258
259 // We've got a $. Pick up a capture group number if one follows.
260 // Consume at most the number of digits necessary for the largest capture
261 // number that is valid for this pattern.
262
263 int32_t numDigits = 0;
264 int32_t groupNum = 0;
265 UChar32 digitC;
266 for (;;) {
267 if (replIdx >= replLen) {
268 break;
269 }
270 digitC = replacement.char32At(replIdx);
271 if (u_isdigit(digitC) == FALSE) {
272 break;
273 }
274 replIdx = replacement.moveIndex32(replIdx, 1);
275 groupNum=groupNum*10 + u_charDigitValue(digitC);
276 numDigits++;
277 if (numDigits >= fPattern->fMaxCaptureDigits) {
278 break;
279 }
280 }
281
282
283 if (numDigits == 0) {
284 // The $ didn't introduce a group number at all.
285 // Treat it as just part of the substitution text.
286 dest.append(DOLLARSIGN);
287 continue;
288 }
289
290 // Finally, append the capture group data to the destination.
291 dest.append(group(groupNum, status));
292 if (U_FAILURE(status)) {
293 // Can fail if group number is out of range.
294 break;
295 }
296
297 }
298
299 return *this;
300 }
301
302
303
304 //--------------------------------------------------------------------------------
305 //
306 // appendTail Intended to be used in conjunction with appendReplacement()
307 // To the destination string, append everything following
308 // the last match position from the input string.
309 //
310 // Note: Match ranges do not affect appendTail or appendReplacement
311 //
312 //--------------------------------------------------------------------------------
313 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
314 int32_t len = fInput->length() - fAppendPosition;
315 if (len > 0) {
316 dest.append(*fInput, fAppendPosition, len);
317 }
318 return dest;
319 }
320
321
322
323 //--------------------------------------------------------------------------------
324 //
325 // end
326 //
327 //--------------------------------------------------------------------------------
328 int32_t RegexMatcher::end(UErrorCode &err) const {
329 return end(0, err);
330 }
331
332
333
334 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
335 if (U_FAILURE(err)) {
336 return -1;
337 }
338 if (fMatch == FALSE) {
339 err = U_REGEX_INVALID_STATE;
340 return -1;
341 }
342 if (group < 0 || group > fPattern->fGroupMap->size()) {
343 err = U_INDEX_OUTOFBOUNDS_ERROR;
344 return -1;
345 }
346 int32_t e = -1;
347 if (group == 0) {
348 e = fMatchEnd;
349 } else {
350 // Get the position within the stack frame of the variables for
351 // this capture group.
352 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
353 U_ASSERT(groupOffset < fPattern->fFrameSize);
354 U_ASSERT(groupOffset >= 0);
355 e = fFrame->fExtra[groupOffset + 1];
356 }
357 return e;
358 }
359
360
361
362 //--------------------------------------------------------------------------------
363 //
364 // find()
365 //
366 //--------------------------------------------------------------------------------
367 UBool RegexMatcher::find() {
368 // Start at the position of the last match end. (Will be zero if the
369 // matcher has been reset.
370 //
371 if (U_FAILURE(fDeferredStatus)) {
372 return FALSE;
373 }
374
375 int32_t startPos = fMatchEnd;
376 if (startPos==0) {
377 startPos = fActiveStart;
378 }
379
380 if (fMatch) {
381 // Save the position of any previous successful match.
382 fLastMatchEnd = fMatchEnd;
383
384 if (fMatchStart == fMatchEnd) {
385 // Previous match had zero length. Move start position up one position
386 // to avoid sending find() into a loop on zero-length matches.
387 if (startPos >= fActiveLimit) {
388 fMatch = FALSE;
389 fHitEnd = TRUE;
390 return FALSE;
391 }
392 startPos = fInput->moveIndex32(startPos, 1);
393 }
394 } else {
395 if (fLastMatchEnd >= 0) {
396 // A previous find() failed to match. Don't try again.
397 // (without this test, a pattern with a zero-length match
398 // could match again at the end of an input string.)
399 fHitEnd = TRUE;
400 return FALSE;
401 }
402 }
403
404
405 // Compute the position in the input string beyond which a match can not begin, because
406 // the minimum length match would extend past the end of the input.
407 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
408 // Be aware of possible overflows if making changes here.
409 int32_t testLen = fActiveLimit - fPattern->fMinMatchLen;
410 if (startPos > testLen) {
411 fMatch = FALSE;
412 fHitEnd = TRUE;
413 return FALSE;
414 }
415
416 const UChar *inputBuf = fInput->getBuffer();
417 UChar32 c;
418 U_ASSERT(startPos >= 0);
419
420 switch (fPattern->fStartType) {
421 case START_NO_INFO:
422 // No optimization was found.
423 // Try a match at each input position.
424 for (;;) {
425 MatchAt(startPos, FALSE, fDeferredStatus);
426 if (U_FAILURE(fDeferredStatus)) {
427 return FALSE;
428 }
429 if (fMatch) {
430 return TRUE;
431 }
432 if (startPos >= testLen) {
433 fHitEnd = TRUE;
434 return FALSE;
435 }
436 U16_FWD_1(inputBuf, startPos, fActiveLimit);
437 // Note that it's perfectly OK for a pattern to have a zero-length
438 // match at the end of a string, so we must make sure that the loop
439 // runs with startPos == testLen the last time through.
440 }
441 U_ASSERT(FALSE);
442
443 case START_START:
444 // Matches are only possible at the start of the input string
445 // (pattern begins with ^ or \A)
446 if (startPos > fActiveStart) {
447 fMatch = FALSE;
448 return FALSE;
449 }
450 MatchAt(startPos, FALSE, fDeferredStatus);
451 if (U_FAILURE(fDeferredStatus)) {
452 return FALSE;
453 }
454 return fMatch;
455
456
457 case START_SET:
458 {
459 // Match may start on any char from a pre-computed set.
460 U_ASSERT(fPattern->fMinMatchLen > 0);
461 for (;;) {
462 int32_t pos = startPos;
463 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
464 if (c<256 && fPattern->fInitialChars8->contains(c) ||
465 c>=256 && fPattern->fInitialChars->contains(c)) {
466 MatchAt(pos, FALSE, fDeferredStatus);
467 if (U_FAILURE(fDeferredStatus)) {
468 return FALSE;
469 }
470 if (fMatch) {
471 return TRUE;
472 }
473 }
474 if (pos >= testLen) {
475 fMatch = FALSE;
476 fHitEnd = TRUE;
477 return FALSE;
478 }
479 }
480 }
481 U_ASSERT(FALSE);
482
483 case START_STRING:
484 case START_CHAR:
485 {
486 // Match starts on exactly one char.
487 U_ASSERT(fPattern->fMinMatchLen > 0);
488 UChar32 theChar = fPattern->fInitialChar;
489 for (;;) {
490 int32_t pos = startPos;
491 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
492 if (c == theChar) {
493 MatchAt(pos, FALSE, fDeferredStatus);
494 if (U_FAILURE(fDeferredStatus)) {
495 return FALSE;
496 }
497 if (fMatch) {
498 return TRUE;
499 }
500 }
501 if (pos >= testLen) {
502 fMatch = FALSE;
503 fHitEnd = TRUE;
504 return FALSE;
505 }
506 }
507 }
508 U_ASSERT(FALSE);
509
510 case START_LINE:
511 {
512 UChar32 c;
513 if (startPos == fAnchorStart) {
514 MatchAt(startPos, FALSE, fDeferredStatus);
515 if (U_FAILURE(fDeferredStatus)) {
516 return FALSE;
517 }
518 if (fMatch) {
519 return TRUE;
520 }
521 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
522 }
523
524 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
525 for (;;) {
526 c = inputBuf[startPos-1];
527 if (c == 0x0a) {
528 MatchAt(startPos, FALSE, fDeferredStatus);
529 if (U_FAILURE(fDeferredStatus)) {
530 return FALSE;
531 }
532 if (fMatch) {
533 return TRUE;
534 }
535 }
536 if (startPos >= testLen) {
537 fMatch = FALSE;
538 fHitEnd = TRUE;
539 return FALSE;
540 }
541 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
542 // Note that it's perfectly OK for a pattern to have a zero-length
543 // match at the end of a string, so we must make sure that the loop
544 // runs with startPos == testLen the last time through.
545 }
546 } else {
547 for (;;) {
548 c = inputBuf[startPos-1];
549 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
550 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
551 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
552 startPos++;
553 }
554 MatchAt(startPos, FALSE, fDeferredStatus);
555 if (U_FAILURE(fDeferredStatus)) {
556 return FALSE;
557 }
558 if (fMatch) {
559 return TRUE;
560 }
561 }
562 if (startPos >= testLen) {
563 fMatch = FALSE;
564 fHitEnd = TRUE;
565 return FALSE;
566 }
567 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
568 // Note that it's perfectly OK for a pattern to have a zero-length
569 // match at the end of a string, so we must make sure that the loop
570 // runs with startPos == testLen the last time through.
571 }
572 }
573 }
574
575 default:
576 U_ASSERT(FALSE);
577 }
578
579 U_ASSERT(FALSE);
580 return FALSE;
581 }
582
583
584
585 UBool RegexMatcher::find(int32_t start, UErrorCode &status) {
586 if (U_FAILURE(status)) {
587 return FALSE;
588 }
589 if (U_FAILURE(fDeferredStatus)) {
590 status = fDeferredStatus;
591 return FALSE;
592 }
593 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
594 // This will reset the region to be the full input length.
595 if (start < fActiveStart || start > fActiveLimit) {
596 status = U_INDEX_OUTOFBOUNDS_ERROR;
597 return FALSE;
598 }
599 fMatchEnd = start;
600 return find();
601 }
602
603
604
605 //--------------------------------------------------------------------------------
606 //
607 // group()
608 //
609 //--------------------------------------------------------------------------------
610 UnicodeString RegexMatcher::group(UErrorCode &status) const {
611 return group(0, status);
612 }
613
614
615
616 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
617 int32_t s = start(groupNum, status);
618 int32_t e = end(groupNum, status);
619
620 // Note: calling start() and end() above will do all necessary checking that
621 // the group number is OK and that a match exists. status will be set.
622 if (U_FAILURE(status)) {
623 return UnicodeString();
624 }
625 if (U_FAILURE(fDeferredStatus)) {
626 status = fDeferredStatus;
627 return UnicodeString();
628 }
629
630 if (s < 0) {
631 // A capture group wasn't part of the match
632 return UnicodeString();
633 }
634 U_ASSERT(s <= e);
635 return UnicodeString(*fInput, s, e-s);
636 }
637
638
639
640
641 int32_t RegexMatcher::groupCount() const {
642 return fPattern->fGroupMap->size();
643 }
644
645
646
647 const UnicodeString &RegexMatcher::input() const {
648 return *fInput;
649 }
650
651
652 //--------------------------------------------------------------------------------
653 //
654 // hasAnchoringBounds()
655 //
656 //--------------------------------------------------------------------------------
657 UBool RegexMatcher::hasAnchoringBounds() const {
658 return fAnchoringBounds;
659 }
660
661
662 //--------------------------------------------------------------------------------
663 //
664 // hasTransparentBounds()
665 //
666 //--------------------------------------------------------------------------------
667 UBool RegexMatcher::hasTransparentBounds() const {
668 return fTransparentBounds;
669 }
670
671
672
673 //--------------------------------------------------------------------------------
674 //
675 // hitEnd()
676 //
677 //--------------------------------------------------------------------------------
678 UBool RegexMatcher::hitEnd() const {
679 return fHitEnd;
680 }
681
682 //--------------------------------------------------------------------------------
683 //
684 // lookingAt()
685 //
686 //--------------------------------------------------------------------------------
687 UBool RegexMatcher::lookingAt(UErrorCode &status) {
688 if (U_FAILURE(status)) {
689 return FALSE;
690 }
691 if (U_FAILURE(fDeferredStatus)) {
692 status = fDeferredStatus;
693 return FALSE;
694 }
695 resetPreserveRegion();
696 MatchAt(fActiveStart, FALSE, status);
697 return fMatch;
698 }
699
700
701 UBool RegexMatcher::lookingAt(int32_t start, UErrorCode &status) {
702 if (U_FAILURE(status)) {
703 return FALSE;
704 }
705 if (U_FAILURE(fDeferredStatus)) {
706 status = fDeferredStatus;
707 return FALSE;
708 }
709 reset();
710 if (start < fActiveStart || start > fActiveLimit) {
711 status = U_INDEX_OUTOFBOUNDS_ERROR;
712 return FALSE;
713 }
714 MatchAt(start, FALSE, status);
715 return fMatch;
716 }
717
718
719
720 //--------------------------------------------------------------------------------
721 //
722 // matches()
723 //
724 //--------------------------------------------------------------------------------
725 UBool RegexMatcher::matches(UErrorCode &status) {
726 if (U_FAILURE(status)) {
727 return FALSE;
728 }
729 if (U_FAILURE(fDeferredStatus)) {
730 status = fDeferredStatus;
731 return FALSE;
732 }
733 resetPreserveRegion();
734 MatchAt(fActiveStart, TRUE, status);
735 return fMatch;
736 }
737
738
739 UBool RegexMatcher::matches(int32_t start, UErrorCode &status) {
740 if (U_FAILURE(status)) {
741 return FALSE;
742 }
743 if (U_FAILURE(fDeferredStatus)) {
744 status = fDeferredStatus;
745 return FALSE;
746 }
747 reset();
748 if (start < fActiveStart || start > fActiveLimit) {
749 status = U_INDEX_OUTOFBOUNDS_ERROR;
750 return FALSE;
751 }
752 MatchAt(start, TRUE, status);
753 return fMatch;
754 }
755
756
757
758 //--------------------------------------------------------------------------------
759 //
760 // pattern
761 //
762 //--------------------------------------------------------------------------------
763 const RegexPattern &RegexMatcher::pattern() const {
764 return *fPattern;
765 }
766
767
768
769 //--------------------------------------------------------------------------------
770 //
771 // region
772 //
773 //--------------------------------------------------------------------------------
774 RegexMatcher &RegexMatcher::region(int32_t start, int32_t limit, UErrorCode &status) {
775 if (U_FAILURE(status)) {
776 return *this;
777 }
778 if (start>limit || start<0 || limit<0 || limit>fInput->length()) {
779 status = U_ILLEGAL_ARGUMENT_ERROR;
780 }
781 this->reset();
782 fRegionStart = start;
783 fRegionLimit = limit;
784 fActiveStart = start;
785 fActiveLimit = limit;
786 if (!fTransparentBounds) {
787 fLookStart = start;
788 fLookLimit = limit;
789 }
790 if (fAnchoringBounds) {
791 fAnchorStart = start;
792 fAnchorLimit = limit;
793 }
794 return *this;
795 }
796
797
798
799 //--------------------------------------------------------------------------------
800 //
801 // regionEnd
802 //
803 //--------------------------------------------------------------------------------
804 int32_t RegexMatcher::regionEnd() const {
805 return fRegionLimit;
806 }
807
808
809 //--------------------------------------------------------------------------------
810 //
811 // regionStart
812 //
813 //--------------------------------------------------------------------------------
814 int32_t RegexMatcher::regionStart() const {
815 return fRegionStart;
816 }
817
818
819 //--------------------------------------------------------------------------------
820 //
821 // replaceAll
822 //
823 //--------------------------------------------------------------------------------
824 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
825 if (U_FAILURE(status)) {
826 return *fInput;
827 }
828 if (U_FAILURE(fDeferredStatus)) {
829 status = fDeferredStatus;
830 return *fInput;
831 }
832 UnicodeString destString;
833 reset();
834 while (find()) {
835 appendReplacement(destString, replacement, status);
836 if (U_FAILURE(status)) {
837 break;
838 }
839 }
840 appendTail(destString);
841 return destString;
842 }
843
844
845
846
847 //--------------------------------------------------------------------------------
848 //
849 // replaceFirst
850 //
851 //--------------------------------------------------------------------------------
852 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
853 if (U_FAILURE(status)) {
854 return *fInput;
855 }
856 if (U_FAILURE(fDeferredStatus)) {
857 status = fDeferredStatus;
858 return *fInput;
859 }
860
861 reset();
862 if (!find()) {
863 return *fInput;
864 }
865
866 UnicodeString destString;
867 appendReplacement(destString, replacement, status);
868 appendTail(destString);
869 return destString;
870 }
871
872
873 //--------------------------------------------------------------------------------
874 //
875 // requireEnd
876 //
877 //--------------------------------------------------------------------------------
878 UBool RegexMatcher::requireEnd() const {
879 return fRequireEnd;
880 }
881
882
883 //--------------------------------------------------------------------------------
884 //
885 // reset
886 //
887 //--------------------------------------------------------------------------------
888 RegexMatcher &RegexMatcher::reset() {
889 fRegionStart = 0;
890 fRegionLimit = fInput->length();
891 fActiveStart = 0;
892 fActiveLimit = fRegionLimit;
893 fAnchorStart = 0;
894 fAnchorLimit = fRegionLimit;
895 fLookStart = 0;
896 fLookLimit = fRegionLimit;
897 resetPreserveRegion();
898 return *this;
899 }
900
901
902
903 void RegexMatcher::resetPreserveRegion() {
904 fMatchStart = 0;
905 fMatchEnd = 0;
906 fLastMatchEnd = -1;
907 fAppendPosition = 0;
908 fMatch = FALSE;
909 fHitEnd = FALSE;
910 fRequireEnd = FALSE;
911 fTime = 0;
912 fTickCounter = TIMER_INITIAL_VALUE;
913 resetStack();
914 }
915
916
917 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
918 fInput = &input;
919 reset();
920 if (fWordBreakItr != NULL) {
921 #if UCONFIG_NO_BREAK_ITERATION==0
922 fWordBreakItr->setText(input);
923 #endif
924 }
925 return *this;
926 }
927
928 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
929 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
930 return *this;
931 }*/
932
933
934 RegexMatcher &RegexMatcher::reset(int32_t position, UErrorCode &status) {
935 if (U_FAILURE(status)) {
936 return *this;
937 }
938 reset(); // Reset also resets the region to be the entire string.
939 if (position < 0 || position >= fActiveLimit) {
940 status = U_INDEX_OUTOFBOUNDS_ERROR;
941 return *this;
942 }
943 fMatchEnd = position;
944 return *this;
945 }
946
947
948
949
950
951 //--------------------------------------------------------------------------------
952 //
953 // setTrace
954 //
955 //--------------------------------------------------------------------------------
956 void RegexMatcher::setTrace(UBool state) {
957 fTraceDebug = state;
958 }
959
960
961
962 //---------------------------------------------------------------------
963 //
964 // split
965 //
966 //---------------------------------------------------------------------
967 int32_t RegexMatcher::split(const UnicodeString &input,
968 UnicodeString dest[],
969 int32_t destCapacity,
970 UErrorCode &status)
971 {
972 //
973 // Check arguements for validity
974 //
975 if (U_FAILURE(status)) {
976 return 0;
977 };
978
979 if (destCapacity < 1) {
980 status = U_ILLEGAL_ARGUMENT_ERROR;
981 return 0;
982 }
983
984 //
985 // Reset for the input text
986 //
987 reset(input);
988 int32_t nextOutputStringStart = 0;
989 if (fActiveLimit == 0) {
990 return 0;
991 }
992
993 //
994 // Loop through the input text, searching for the delimiter pattern
995 //
996 int32_t i;
997 int32_t numCaptureGroups = fPattern->fGroupMap->size();
998 for (i=0; ; i++) {
999 if (i>=destCapacity-1) {
1000 // There is one or zero output string left.
1001 // Fill the last output string with whatever is left from the input, then exit the loop.
1002 // ( i will be == destCapicity if we filled the output array while processing
1003 // capture groups of the delimiter expression, in which case we will discard the
1004 // last capture group saved in favor of the unprocessed remainder of the
1005 // input string.)
1006 i = destCapacity-1;
1007 int32_t remainingLength = fActiveLimit-nextOutputStringStart;
1008 if (remainingLength > 0) {
1009 dest[i].setTo(input, nextOutputStringStart, remainingLength);
1010 }
1011 break;
1012 }
1013 if (find()) {
1014 // We found another delimiter. Move everything from where we started looking
1015 // up until the start of the delimiter into the next output string.
1016 int32_t fieldLen = fMatchStart - nextOutputStringStart;
1017 dest[i].setTo(input, nextOutputStringStart, fieldLen);
1018 nextOutputStringStart = fMatchEnd;
1019
1020 // If the delimiter pattern has capturing parentheses, the captured
1021 // text goes out into the next n destination strings.
1022 int32_t groupNum;
1023 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
1024 if (i==destCapacity-1) {
1025 break;
1026 }
1027 i++;
1028 dest[i] = group(groupNum, status);
1029 }
1030
1031 if (nextOutputStringStart == fActiveLimit) {
1032 // The delimiter was at the end of the string. We're done.
1033 break;
1034 }
1035
1036 }
1037 else
1038 {
1039 // We ran off the end of the input while looking for the next delimiter.
1040 // All the remaining text goes into the current output string.
1041 dest[i].setTo(input, nextOutputStringStart, fActiveLimit-nextOutputStringStart);
1042 break;
1043 }
1044 }
1045 return i+1;
1046 }
1047
1048
1049
1050 //--------------------------------------------------------------------------------
1051 //
1052 // start
1053 //
1054 //--------------------------------------------------------------------------------
1055 int32_t RegexMatcher::start(UErrorCode &status) const {
1056 return start(0, status);
1057 }
1058
1059
1060
1061
1062 //--------------------------------------------------------------------------------
1063 //
1064 // start(int32_t group, UErrorCode &status)
1065 //
1066 //--------------------------------------------------------------------------------
1067 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
1068 if (U_FAILURE(status)) {
1069 return -1;
1070 }
1071 if (U_FAILURE(fDeferredStatus)) {
1072 status = fDeferredStatus;
1073 return -1;
1074 }
1075 if (fMatch == FALSE) {
1076 status = U_REGEX_INVALID_STATE;
1077 return -1;
1078 }
1079 if (group < 0 || group > fPattern->fGroupMap->size()) {
1080 status = U_INDEX_OUTOFBOUNDS_ERROR;
1081 return -1;
1082 }
1083 int32_t s;
1084 if (group == 0) {
1085 s = fMatchStart;
1086 } else {
1087 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
1088 U_ASSERT(groupOffset < fPattern->fFrameSize);
1089 U_ASSERT(groupOffset >= 0);
1090 s = fFrame->fExtra[groupOffset];
1091 }
1092 return s;
1093 }
1094
1095
1096
1097 //--------------------------------------------------------------------------------
1098 //
1099 // useAnchoringBounds
1100 //
1101 //--------------------------------------------------------------------------------
1102 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
1103 fAnchoringBounds = b;
1104 UErrorCode status = U_ZERO_ERROR;
1105 region(fRegionStart, fRegionLimit, status);
1106 U_ASSERT(U_SUCCESS(status));
1107 return *this;
1108 }
1109
1110
1111 //--------------------------------------------------------------------------------
1112 //
1113 // useTransparentBounds
1114 //
1115 //--------------------------------------------------------------------------------
1116 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
1117 fTransparentBounds = b;
1118 UErrorCode status = U_ZERO_ERROR;
1119 region(fRegionStart, fRegionLimit, status);
1120 U_ASSERT(U_SUCCESS(status));
1121 return *this;
1122 }
1123
1124 //--------------------------------------------------------------------------------
1125 //
1126 // setTimeLimit
1127 //
1128 //--------------------------------------------------------------------------------
1129 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
1130 if (U_FAILURE(status)) {
1131 return;
1132 }
1133 if (U_FAILURE(fDeferredStatus)) {
1134 status = fDeferredStatus;
1135 return;
1136 }
1137 if (limit < 0) {
1138 status = U_ILLEGAL_ARGUMENT_ERROR;
1139 return;
1140 }
1141 fTimeLimit = limit;
1142 }
1143
1144
1145 //--------------------------------------------------------------------------------
1146 //
1147 // getTimeLimit
1148 //
1149 //--------------------------------------------------------------------------------
1150 int32_t RegexMatcher::getTimeLimit() const {
1151 return fTimeLimit;
1152 }
1153
1154
1155 //--------------------------------------------------------------------------------
1156 //
1157 // setStackLimit
1158 //
1159 //--------------------------------------------------------------------------------
1160 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
1161 if (U_FAILURE(status)) {
1162 return;
1163 }
1164 if (U_FAILURE(fDeferredStatus)) {
1165 status = fDeferredStatus;
1166 return;
1167 }
1168 if (limit < 0) {
1169 status = U_ILLEGAL_ARGUMENT_ERROR;
1170 return;
1171 }
1172
1173 // Reset the matcher. This is needed here in case there is a current match
1174 // whose final stack frame (containing the match results, pointed to by fFrame)
1175 // would be lost by resizing to a smaller stack size.
1176 reset();
1177
1178 if (limit == 0) {
1179 // Unlimited stack expansion
1180 fStack->setMaxCapacity(0);
1181 } else {
1182 // Change the units of the limit from bytes to ints, and bump the size up
1183 // to be big enough to hold at least one stack frame for the pattern,
1184 // if it isn't there already.
1185 int32_t adjustedLimit = limit / sizeof(int32_t);
1186 if (adjustedLimit < fPattern->fFrameSize) {
1187 adjustedLimit = fPattern->fFrameSize;
1188 }
1189 fStack->setMaxCapacity(adjustedLimit);
1190 }
1191 fStackLimit = limit;
1192 }
1193
1194
1195 //--------------------------------------------------------------------------------
1196 //
1197 // getStackLimit
1198 //
1199 //--------------------------------------------------------------------------------
1200 int32_t RegexMatcher::getStackLimit() const {
1201 return fStackLimit;
1202 }
1203
1204
1205 //--------------------------------------------------------------------------------
1206 //
1207 // setMatchCallback
1208 //
1209 //--------------------------------------------------------------------------------
1210 void RegexMatcher::setMatchCallback(URegexMatchCallback *callback,
1211 const void *context,
1212 UErrorCode &status) {
1213 if (U_FAILURE(status)) {
1214 return;
1215 }
1216 fCallbackFn = callback;
1217 fCallbackContext = context;
1218 }
1219
1220
1221 //--------------------------------------------------------------------------------
1222 //
1223 // getMatchCallback
1224 //
1225 //--------------------------------------------------------------------------------
1226 void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback,
1227 const void *&context,
1228 UErrorCode &status) {
1229 if (U_FAILURE(status)) {
1230 return;
1231 }
1232 callback = fCallbackFn;
1233 context = fCallbackContext;
1234 }
1235
1236
1237 //================================================================================
1238 //
1239 // Code following this point in this file is the internal
1240 // Match Engine Implementation.
1241 //
1242 //================================================================================
1243
1244
1245 //--------------------------------------------------------------------------------
1246 //
1247 // resetStack
1248 // Discard any previous contents of the state save stack, and initialize a
1249 // new stack frame to all -1. The -1s are needed for capture group limits,
1250 // where they indicate that a group has not yet matched anything.
1251 //--------------------------------------------------------------------------------
1252 REStackFrame *RegexMatcher::resetStack() {
1253 // Discard any previous contents of the state save stack, and initialize a
1254 // new stack frame to all -1. The -1s are needed for capture group limits, where
1255 // they indicate that a group has not yet matched anything.
1256 fStack->removeAllElements();
1257
1258 int32_t *iFrame = fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
1259 int32_t i;
1260 for (i=0; i<fPattern->fFrameSize; i++) {
1261 iFrame[i] = -1;
1262 }
1263 return (REStackFrame *)iFrame;
1264 }
1265
1266
1267
1268 //--------------------------------------------------------------------------------
1269 //
1270 // isWordBoundary
1271 // in perl, "xab..cd..", \b is true at positions 0,3,5,7
1272 // For us,
1273 // If the current char is a combining mark,
1274 // \b is FALSE.
1275 // Else Scan backwards to the first non-combining char.
1276 // We are at a boundary if the this char and the original chars are
1277 // opposite in membership in \w set
1278 //
1279 // parameters: pos - the current position in the input buffer
1280 //
1281 // TODO: double-check edge cases at region boundaries.
1282 //
1283 //--------------------------------------------------------------------------------
1284 UBool RegexMatcher::isWordBoundary(int32_t pos) {
1285 UBool isBoundary = FALSE;
1286 UBool cIsWord = FALSE;
1287
1288 if (pos >= fLookLimit) {
1289 fHitEnd = TRUE;
1290 } else {
1291 // Determine whether char c at current position is a member of the word set of chars.
1292 // If we're off the end of the string, behave as though we're not at a word char.
1293 UChar32 c = fInput->char32At(pos);
1294 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
1295 // Current char is a combining one. Not a boundary.
1296 return FALSE;
1297 }
1298 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
1299 }
1300
1301 // Back up until we come to a non-combining char, determine whether
1302 // that char is a word char.
1303 UBool prevCIsWord = FALSE;
1304 int32_t prevPos = pos;
1305 for (;;) {
1306 if (prevPos <= fLookStart) {
1307 break;
1308 }
1309 prevPos = fInput->moveIndex32(prevPos, -1);
1310 UChar32 prevChar = fInput->char32At(prevPos);
1311 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
1312 || u_charType(prevChar) == U_FORMAT_CHAR)) {
1313 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
1314 break;
1315 }
1316 }
1317 isBoundary = cIsWord ^ prevCIsWord;
1318 return isBoundary;
1319 }
1320
1321 //--------------------------------------------------------------------------------
1322 //
1323 // isUWordBoundary
1324 //
1325 // Test for a word boundary using RBBI word break.
1326 //
1327 // parameters: pos - the current position in the input buffer
1328 //
1329 //--------------------------------------------------------------------------------
1330 UBool RegexMatcher::isUWordBoundary(int32_t pos) {
1331 UBool returnVal = FALSE;
1332 #if UCONFIG_NO_BREAK_ITERATION==0
1333
1334 // If we haven't yet created a break iterator for this matcher, do it now.
1335 if (fWordBreakItr == NULL) {
1336 fWordBreakItr =
1337 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
1338 if (U_FAILURE(fDeferredStatus)) {
1339 return FALSE;
1340 }
1341 fWordBreakItr->setText(*fInput);
1342 }
1343
1344 if (pos >= fLookLimit) {
1345 fHitEnd = TRUE;
1346 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real"
1347 // words are not boundaries. All non-word chars stand by themselves,
1348 // with word boundaries on both sides.
1349 } else {
1350 returnVal = fWordBreakItr->isBoundary(pos);
1351 }
1352 #endif
1353 return returnVal;
1354 }
1355
1356 //--------------------------------------------------------------------------------
1357 //
1358 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state
1359 // saves. Increment the "time" counter, and call the
1360 // user callback function if there is one installed.
1361 //
1362 // If the match operation needs to be aborted, either for a time-out
1363 // or because the user callback asked for it, just set an error status.
1364 // The engine will pick that up and stop in its outer loop.
1365 //
1366 //--------------------------------------------------------------------------------
1367 void RegexMatcher::IncrementTime(UErrorCode &status) {
1368 fTickCounter = TIMER_INITIAL_VALUE;
1369 fTime++;
1370 if (fCallbackFn != NULL) {
1371 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
1372 status = U_REGEX_STOPPED_BY_CALLER;
1373 return;
1374 }
1375 }
1376 if (fTimeLimit > 0 && fTime >= fTimeLimit) {
1377 status = U_REGEX_TIME_OUT;
1378 }
1379 }
1380
1381 //--------------------------------------------------------------------------------
1382 //
1383 // StateSave
1384 // Make a new stack frame, initialized as a copy of the current stack frame.
1385 // Set the pattern index in the original stack frame from the operand value
1386 // in the opcode. Execution of the engine continues with the state in
1387 // the newly created stack frame
1388 //
1389 // Note that reserveBlock() may grow the stack, resulting in the
1390 // whole thing being relocated in memory.
1391 //
1392 // Parameters:
1393 // fp The top frame pointer when called. At return, a new
1394 // fame will be present
1395 // savePatIdx An index into the compiled pattern. Goes into the original
1396 // (not new) frame. If execution ever back-tracks out of the
1397 // new frame, this will be where we continue from in the pattern.
1398 // Return
1399 // The new frame pointer.
1400 //
1401 //--------------------------------------------------------------------------------
1402 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int32_t savePatIdx, UErrorCode &status) {
1403 // push storage for a new frame.
1404 int32_t *newFP = fStack->reserveBlock(fFrameSize, status);
1405 if (newFP == NULL) {
1406 // Failure on attempted stack expansion.
1407 // Stack function set some other error code, change it to a more
1408 // specific one for regular expressions.
1409 status = U_REGEX_STACK_OVERFLOW;
1410 // We need to return a writable stack frame, so just return the
1411 // previous frame. The match operation will stop quickly
1412 // because of the error status, after which the frame will never
1413 // be looked at again.
1414 return fp;
1415 }
1416 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack.
1417
1418 // New stack frame = copy of old top frame.
1419 int32_t *source = (int32_t *)fp;
1420 int32_t *dest = newFP;
1421 for (;;) {
1422 *dest++ = *source++;
1423 if (source == newFP) {
1424 break;
1425 }
1426 }
1427
1428 fTickCounter--;
1429 if (fTickCounter <= 0) {
1430 IncrementTime(status); // Re-initializes fTickCounter
1431 }
1432 fp->fPatIdx = savePatIdx;
1433 return (REStackFrame *)newFP;
1434 }
1435
1436
1437 //--------------------------------------------------------------------------------
1438 //
1439 // MatchAt This is the actual matching engine.
1440 //
1441 // startIdx: begin matching a this index.
1442 // toEnd: if true, match must extend to end of the input region
1443 //
1444 //--------------------------------------------------------------------------------
1445 void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
1446 UBool isMatch = FALSE; // True if the we have a match.
1447
1448 int32_t op; // Operation from the compiled pattern, split into
1449 int32_t opType; // the opcode
1450 int32_t opValue; // and the operand value.
1451
1452 #ifdef REGEX_RUN_DEBUG
1453 if (fTraceDebug)
1454 {
1455 printf("MatchAt(startIdx=%d)\n", startIdx);
1456 printf("Original Pattern: ");
1457 int32_t i;
1458 for (i=0; i<fPattern->fPattern.length(); i++) {
1459 printf("%c", fPattern->fPattern.charAt(i));
1460 }
1461 printf("\n");
1462 printf("Input String: ");
1463 for (i=0; i<fInput->length(); i++) {
1464 UChar c = fInput->charAt(i);
1465 if (c<32 || c>256) {
1466 c = '.';
1467 }
1468 printf("%c", c);
1469 }
1470 printf("\n");
1471 printf("\n");
1472 }
1473 #endif
1474
1475 if (U_FAILURE(status)) {
1476 return;
1477 }
1478
1479 // Cache frequently referenced items from the compiled pattern
1480 //
1481 int32_t *pat = fPattern->fCompiledPat->getBuffer();
1482
1483 const UChar *litText = fPattern->fLiteralText.getBuffer();
1484 UVector *sets = fPattern->fSets;
1485
1486 const UChar *inputBuf = fInput->getBuffer();
1487
1488 fFrameSize = fPattern->fFrameSize;
1489 REStackFrame *fp = resetStack();
1490
1491 fp->fPatIdx = 0;
1492 fp->fInputIdx = startIdx;
1493
1494 // Zero out the pattern's static data
1495 int32_t i;
1496 for (i = 0; i<fPattern->fDataSize; i++) {
1497 fData[i] = 0;
1498 }
1499
1500 //
1501 // Main loop for interpreting the compiled pattern.
1502 // One iteration of the loop per pattern operation performed.
1503 //
1504 for (;;) {
1505 #if 0
1506 if (_heapchk() != _HEAPOK) {
1507 fprintf(stderr, "Heap Trouble\n");
1508 }
1509 #endif
1510 op = pat[fp->fPatIdx];
1511 opType = URX_TYPE(op);
1512 opValue = URX_VAL(op);
1513 #ifdef REGEX_RUN_DEBUG
1514 if (fTraceDebug) {
1515 printf("inputIdx=%d inputChar=%c sp=%3d ", fp->fInputIdx,
1516 fInput->char32At(fp->fInputIdx), (int32_t *)fp-fStack->getBuffer());
1517 fPattern->dumpOp(fp->fPatIdx);
1518 }
1519 #endif
1520 fp->fPatIdx++;
1521
1522 switch (opType) {
1523
1524
1525 case URX_NOP:
1526 break;
1527
1528
1529 case URX_BACKTRACK:
1530 // Force a backtrack. In some circumstances, the pattern compiler
1531 // will notice that the pattern can't possibly match anything, and will
1532 // emit one of these at that point.
1533 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1534 break;
1535
1536
1537 case URX_ONECHAR:
1538 if (fp->fInputIdx < fActiveLimit) {
1539 UChar32 c;
1540 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
1541 if (c == opValue) {
1542 break;
1543 }
1544 } else {
1545 fHitEnd = TRUE;
1546 }
1547 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1548 break;
1549
1550
1551 case URX_STRING:
1552 {
1553 // Test input against a literal string.
1554 // Strings require two slots in the compiled pattern, one for the
1555 // offset to the string text, and one for the length.
1556 int32_t stringStartIdx = opValue;
1557 int32_t stringLen;
1558
1559 op = pat[fp->fPatIdx]; // Fetch the second operand
1560 fp->fPatIdx++;
1561 opType = URX_TYPE(op);
1562 stringLen = URX_VAL(op);
1563 U_ASSERT(opType == URX_STRING_LEN);
1564 U_ASSERT(stringLen >= 2);
1565
1566 if (fp->fInputIdx + stringLen > fActiveLimit) {
1567 // No match. String is longer than the remaining input text.
1568 fHitEnd = TRUE; // TODO: See ticket 6074
1569 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1570 break;
1571 }
1572
1573 const UChar * pInp = inputBuf + fp->fInputIdx;
1574 const UChar * pPat = litText+stringStartIdx;
1575 const UChar * pEnd = pInp + stringLen;
1576 for(;;) {
1577 if (*pInp == *pPat) {
1578 pInp++;
1579 pPat++;
1580 if (pInp == pEnd) {
1581 // Successful Match.
1582 fp->fInputIdx += stringLen;
1583 break;
1584 }
1585 } else {
1586 // Match failed.
1587 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1588 break;
1589 }
1590 }
1591 }
1592 break;
1593
1594
1595
1596 case URX_STATE_SAVE:
1597 fp = StateSave(fp, opValue, status);
1598 break;
1599
1600
1601 case URX_END:
1602 // The match loop will exit via this path on a successful match,
1603 // when we reach the end of the pattern.
1604 if (toEnd && fp->fInputIdx != fActiveLimit) {
1605 // The pattern matched, but not to the end of input. Try some more.
1606 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1607 break;
1608 }
1609 isMatch = TRUE;
1610 goto breakFromLoop;
1611
1612 // Start and End Capture stack frame variables are layout out like this:
1613 // fp->fExtra[opValue] - The start of a completed capture group
1614 // opValue+1 - The end of a completed capture group
1615 // opValue+2 - the start of a capture group whose end
1616 // has not yet been reached (and might not ever be).
1617 case URX_START_CAPTURE:
1618 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
1619 fp->fExtra[opValue+2] = fp->fInputIdx;
1620 break;
1621
1622
1623 case URX_END_CAPTURE:
1624 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
1625 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
1626 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
1627 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
1628 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
1629 break;
1630
1631
1632 case URX_DOLLAR: // $, test for End of line
1633 // or for position before new line at end of input
1634 if (fp->fInputIdx < fAnchorLimit-2) {
1635 // We are no where near the end of input. Fail.
1636 // This is the common case. Keep it first.
1637 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1638 break;
1639 }
1640 if (fp->fInputIdx >= fAnchorLimit) {
1641 // We really are at the end of input. Success.
1642 fHitEnd = TRUE;
1643 fRequireEnd = TRUE;
1644 break;
1645 }
1646 // If we are positioned just before a new-line that is located at the
1647 // end of input, succeed.
1648 if (fp->fInputIdx == fAnchorLimit-1) {
1649 UChar32 c = fInput->char32At(fp->fInputIdx);
1650 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
1651 // If not in the middle of a CR/LF sequence
1652 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
1653 // At new-line at end of input. Success
1654 fHitEnd = TRUE;
1655 fRequireEnd = TRUE;
1656 break;
1657 }
1658 }
1659 }
1660
1661 if (fp->fInputIdx == fAnchorLimit-2 &&
1662 fInput->char32At(fp->fInputIdx) == 0x0d && fInput->char32At(fp->fInputIdx+1) == 0x0a) {
1663 fHitEnd = TRUE;
1664 fRequireEnd = TRUE;
1665 break; // At CR/LF at end of input. Success
1666 }
1667
1668 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1669
1670 break;
1671
1672
1673 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
1674 if (fp->fInputIdx >= fAnchorLimit-1) {
1675 // Either at the last character of input, or off the end.
1676 if (fp->fInputIdx == fAnchorLimit-1) {
1677 // At last char of input. Success if it's a new line.
1678 if (fInput->char32At(fp->fInputIdx) == 0x0a) {
1679 fHitEnd = TRUE;
1680 fRequireEnd = TRUE;
1681 break;
1682 }
1683 } else {
1684 // Off the end of input. Success.
1685 fHitEnd = TRUE;
1686 fRequireEnd = TRUE;
1687 break;
1688 }
1689 }
1690
1691 // Not at end of input. Back-track out.
1692 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1693 break;
1694
1695
1696 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
1697 {
1698 if (fp->fInputIdx >= fAnchorLimit) {
1699 // We really are at the end of input. Success.
1700 fHitEnd = TRUE;
1701 fRequireEnd = TRUE;
1702 break;
1703 }
1704 // If we are positioned just before a new-line, succeed.
1705 // It makes no difference where the new-line is within the input.
1706 UChar32 c = inputBuf[fp->fInputIdx];
1707 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
1708 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
1709 // In multi-line mode, hitting a new-line just before the end of input does not
1710 // set the hitEnd or requireEnd flags
1711 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
1712 break;
1713 }
1714 }
1715 // not at a new line. Fail.
1716 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1717 }
1718 break;
1719
1720
1721 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
1722 {
1723 if (fp->fInputIdx >= fAnchorLimit) {
1724 // We really are at the end of input. Success.
1725 fHitEnd = TRUE;
1726 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
1727 break; // adding a new-line would not lose the match.
1728 }
1729 // If we are not positioned just before a new-line, the test fails; backtrack out.
1730 // It makes no difference where the new-line is within the input.
1731 if (inputBuf[fp->fInputIdx] != 0x0a) {
1732 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1733 }
1734 }
1735 break;
1736
1737
1738 case URX_CARET: // ^, test for start of line
1739 if (fp->fInputIdx != fAnchorStart) {
1740 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1741 }
1742 break;
1743
1744
1745 case URX_CARET_M: // ^, test for start of line in mulit-line mode
1746 {
1747 if (fp->fInputIdx == fAnchorStart) {
1748 // We are at the start input. Success.
1749 break;
1750 }
1751 // Check whether character just before the current pos is a new-line
1752 // unless we are at the end of input
1753 UChar c = inputBuf[fp->fInputIdx - 1];
1754 if ((fp->fInputIdx < fAnchorLimit) &&
1755 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
1756 // It's a new-line. ^ is true. Success.
1757 // TODO: what should be done with positions between a CR and LF?
1758 break;
1759 }
1760 // Not at the start of a line. Fail.
1761 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1762 }
1763 break;
1764
1765
1766 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
1767 {
1768 U_ASSERT(fp->fInputIdx >= fAnchorStart);
1769 if (fp->fInputIdx <= fAnchorStart) {
1770 // We are at the start input. Success.
1771 break;
1772 }
1773 // Check whether character just before the current pos is a new-line
1774 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
1775 UChar c = inputBuf[fp->fInputIdx - 1];
1776 if (c != 0x0a) {
1777 // Not at the start of a line. Back-track out.
1778 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1779 }
1780 }
1781 break;
1782
1783 case URX_BACKSLASH_B: // Test for word boundaries
1784 {
1785 UBool success = isWordBoundary(fp->fInputIdx);
1786 success ^= (opValue != 0); // flip sense for \B
1787 if (!success) {
1788 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1789 }
1790 }
1791 break;
1792
1793
1794 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
1795 {
1796 UBool success = isUWordBoundary(fp->fInputIdx);
1797 success ^= (opValue != 0); // flip sense for \B
1798 if (!success) {
1799 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1800 }
1801 }
1802 break;
1803
1804
1805 case URX_BACKSLASH_D: // Test for decimal digit
1806 {
1807 if (fp->fInputIdx >= fActiveLimit) {
1808 fHitEnd = TRUE;
1809 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1810 break;
1811 }
1812
1813 UChar32 c = fInput->char32At(fp->fInputIdx);
1814 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
1815 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
1816 success ^= (opValue != 0); // flip sense for \D
1817 if (success) {
1818 fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
1819 } else {
1820 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1821 }
1822 }
1823 break;
1824
1825
1826 case URX_BACKSLASH_G: // Test for position at end of previous match
1827 if (!((fMatch && fp->fInputIdx==fMatchEnd) || fMatch==FALSE && fp->fInputIdx==fActiveStart)) {
1828 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1829 }
1830 break;
1831
1832
1833 case URX_BACKSLASH_X:
1834 // Match a Grapheme, as defined by Unicode TR 29.
1835 // Differs slightly from Perl, which consumes combining marks independently
1836 // of context.
1837 {
1838
1839 // Fail if at end of input
1840 if (fp->fInputIdx >= fActiveLimit) {
1841 fHitEnd = TRUE;
1842 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1843 break;
1844 }
1845
1846 // Examine (and consume) the current char.
1847 // Dispatch into a little state machine, based on the char.
1848 UChar32 c;
1849 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
1850 UnicodeSet **sets = fPattern->fStaticSets;
1851 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
1852 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
1853 if (sets[URX_GC_L]->contains(c)) goto GC_L;
1854 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
1855 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
1856 if (sets[URX_GC_V]->contains(c)) goto GC_V;
1857 if (sets[URX_GC_T]->contains(c)) goto GC_T;
1858 goto GC_Extend;
1859
1860
1861
1862 GC_L:
1863 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
1864 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
1865 if (sets[URX_GC_L]->contains(c)) goto GC_L;
1866 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
1867 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
1868 if (sets[URX_GC_V]->contains(c)) goto GC_V;
1869 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
1870 goto GC_Extend;
1871
1872 GC_V:
1873 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
1874 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
1875 if (sets[URX_GC_V]->contains(c)) goto GC_V;
1876 if (sets[URX_GC_T]->contains(c)) goto GC_T;
1877 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
1878 goto GC_Extend;
1879
1880 GC_T:
1881 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
1882 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
1883 if (sets[URX_GC_T]->contains(c)) goto GC_T;
1884 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
1885 goto GC_Extend;
1886
1887 GC_Extend:
1888 // Combining characters are consumed here
1889 for (;;) {
1890 if (fp->fInputIdx >= fActiveLimit) {
1891 break;
1892 }
1893 U16_GET(inputBuf, 0, fp->fInputIdx, fActiveLimit, c);
1894 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
1895 break;
1896 }
1897 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
1898 }
1899 goto GC_Done;
1900
1901 GC_Control:
1902 // Most control chars stand alone (don't combine with combining chars),
1903 // except for that CR/LF sequence is a single grapheme cluster.
1904 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
1905 fp->fInputIdx++;
1906 }
1907
1908 GC_Done:
1909 if (fp->fInputIdx >= fActiveLimit) {
1910 fHitEnd = TRUE;
1911 }
1912 break;
1913 }
1914
1915
1916
1917
1918 case URX_BACKSLASH_Z: // Test for end of Input
1919 if (fp->fInputIdx < fAnchorLimit) {
1920 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1921 } else {
1922 fHitEnd = TRUE;
1923 fRequireEnd = TRUE;
1924 }
1925 break;
1926
1927
1928
1929 case URX_STATIC_SETREF:
1930 {
1931 // Test input character against one of the predefined sets
1932 // (Word Characters, for example)
1933 // The high bit of the op value is a flag for the match polarity.
1934 // 0: success if input char is in set.
1935 // 1: success if input char is not in set.
1936 if (fp->fInputIdx >= fActiveLimit) {
1937 fHitEnd = TRUE;
1938 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1939 break;
1940 }
1941
1942 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
1943 opValue &= ~URX_NEG_SET;
1944 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
1945 UChar32 c;
1946 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
1947 if (c < 256) {
1948 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
1949 if (s8->contains(c)) {
1950 success = !success;
1951 }
1952 } else {
1953 const UnicodeSet *s = fPattern->fStaticSets[opValue];
1954 if (s->contains(c)) {
1955 success = !success;
1956 }
1957 }
1958 if (!success) {
1959 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1960 }
1961 }
1962 break;
1963
1964
1965 case URX_STAT_SETREF_N:
1966 {
1967 // Test input character for NOT being a member of one of
1968 // the predefined sets (Word Characters, for example)
1969 if (fp->fInputIdx >= fActiveLimit) {
1970 fHitEnd = TRUE;
1971 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1972 break;
1973 }
1974
1975 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
1976 UChar32 c;
1977 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
1978 if (c < 256) {
1979 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
1980 if (s8->contains(c) == FALSE) {
1981 break;
1982 }
1983 } else {
1984 const UnicodeSet *s = fPattern->fStaticSets[opValue];
1985 if (s->contains(c) == FALSE) {
1986 break;
1987 }
1988 }
1989
1990 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1991 }
1992 break;
1993
1994
1995 case URX_SETREF:
1996 if (fp->fInputIdx >= fActiveLimit) {
1997 fHitEnd = TRUE;
1998 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1999 break;
2000 }
2001 // There is input left. Pick up one char and test it for set membership.
2002 UChar32 c;
2003 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
2004 U_ASSERT(opValue > 0 && opValue < sets->size());
2005 if (c<256) {
2006 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
2007 if (s8->contains(c)) {
2008 break;
2009 }
2010 } else {
2011 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
2012 if (s->contains(c)) {
2013 // The character is in the set. A Match.
2014 break;
2015 }
2016 }
2017 // the character wasn't in the set. Back track out.
2018 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2019 break;
2020
2021
2022 case URX_DOTANY:
2023 {
2024 // . matches anything, but stops at end-of-line.
2025 if (fp->fInputIdx >= fActiveLimit) {
2026 // At end of input. Match failed. Backtrack out.
2027 fHitEnd = TRUE;
2028 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2029 break;
2030 }
2031 // There is input left. Advance over one char, unless we've hit end-of-line
2032 UChar32 c;
2033 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
2034 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
2035 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
2036 // End of line in normal mode. . does not match.
2037 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2038 break;
2039 }
2040 }
2041 break;
2042
2043
2044 case URX_DOTANY_ALL:
2045 {
2046 // ., in dot-matches-all (including new lines) mode
2047 if (fp->fInputIdx >= fActiveLimit) {
2048 // At end of input. Match failed. Backtrack out.
2049 fHitEnd = TRUE;
2050 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2051 break;
2052 }
2053 // There is input left. Advance over one char, except if we are
2054 // at a cr/lf, advance over both of them.
2055 UChar32 c;
2056 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
2057 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
2058 // In the case of a CR/LF, we need to advance over both.
2059 UChar nextc = inputBuf[fp->fInputIdx];
2060 if (nextc == 0x0a) {
2061 fp->fInputIdx++;
2062 }
2063 }
2064 }
2065 break;
2066
2067
2068 case URX_DOTANY_UNIX:
2069 {
2070 // '.' operator, matches all, but stops at end-of-line.
2071 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
2072 if (fp->fInputIdx >= fActiveLimit) {
2073 // At end of input. Match failed. Backtrack out.
2074 fHitEnd = TRUE;
2075 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2076 break;
2077 }
2078 // There is input left. Advance over one char, unless we've hit end-of-line
2079 UChar32 c;
2080 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
2081 if (c == 0x0a) {
2082 // End of line in normal mode. '.' does not match the \n
2083 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2084 }
2085 }
2086 break;
2087
2088
2089 case URX_JMP:
2090 fp->fPatIdx = opValue;
2091 break;
2092
2093 case URX_FAIL:
2094 isMatch = FALSE;
2095 goto breakFromLoop;
2096
2097 case URX_JMP_SAV:
2098 U_ASSERT(opValue < fPattern->fCompiledPat->size());
2099 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
2100 fp->fPatIdx = opValue; // Then JMP.
2101 break;
2102
2103 case URX_JMP_SAV_X:
2104 // This opcode is used with (x)+, when x can match a zero length string.
2105 // Same as JMP_SAV, except conditional on the match having made forward progress.
2106 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
2107 // data address of the input position at the start of the loop.
2108 {
2109 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
2110 int32_t stoOp = pat[opValue-1];
2111 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
2112 int32_t frameLoc = URX_VAL(stoOp);
2113 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
2114 int32_t prevInputIdx = fp->fExtra[frameLoc];
2115 U_ASSERT(prevInputIdx <= fp->fInputIdx);
2116 if (prevInputIdx < fp->fInputIdx) {
2117 // The match did make progress. Repeat the loop.
2118 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
2119 fp->fPatIdx = opValue;
2120 fp->fExtra[frameLoc] = fp->fInputIdx;
2121 }
2122 // If the input position did not advance, we do nothing here,
2123 // execution will fall out of the loop.
2124 }
2125 break;
2126
2127 case URX_CTR_INIT:
2128 {
2129 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
2130 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
2131
2132 // Pick up the three extra operands that CTR_INIT has, and
2133 // skip the pattern location counter past
2134 int32_t instrOperandLoc = fp->fPatIdx;
2135 fp->fPatIdx += 3;
2136 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
2137 int32_t minCount = pat[instrOperandLoc+1];
2138 int32_t maxCount = pat[instrOperandLoc+2];
2139 U_ASSERT(minCount>=0);
2140 U_ASSERT(maxCount>=minCount || maxCount==-1);
2141 U_ASSERT(loopLoc>fp->fPatIdx);
2142
2143 if (minCount == 0) {
2144 fp = StateSave(fp, loopLoc+1, status);
2145 }
2146 if (maxCount == 0) {
2147 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2148 }
2149 }
2150 break;
2151
2152 case URX_CTR_LOOP:
2153 {
2154 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
2155 int32_t initOp = pat[opValue];
2156 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
2157 int32_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
2158 int32_t minCount = pat[opValue+2];
2159 int32_t maxCount = pat[opValue+3];
2160 // Increment the counter. Note: we're not worrying about counter
2161 // overflow, since the data comes from UnicodeStrings, which
2162 // stores its length in an int32_t.
2163 (*pCounter)++;
2164 U_ASSERT(*pCounter > 0);
2165 if ((uint32_t)*pCounter >= (uint32_t)maxCount) {
2166 U_ASSERT(*pCounter == maxCount || maxCount == -1);
2167 break;
2168 }
2169 if (*pCounter >= minCount) {
2170 fp = StateSave(fp, fp->fPatIdx, status);
2171 }
2172 fp->fPatIdx = opValue + 4; // Loop back.
2173 }
2174 break;
2175
2176 case URX_CTR_INIT_NG:
2177 {
2178 // Initialize a non-greedy loop
2179 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
2180 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
2181
2182 // Pick up the three extra operands that CTR_INIT has, and
2183 // skip the pattern location counter past
2184 int32_t instrOperandLoc = fp->fPatIdx;
2185 fp->fPatIdx += 3;
2186 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
2187 int32_t minCount = pat[instrOperandLoc+1];
2188 int32_t maxCount = pat[instrOperandLoc+2];
2189 U_ASSERT(minCount>=0);
2190 U_ASSERT(maxCount>=minCount || maxCount==-1);
2191 U_ASSERT(loopLoc>fp->fPatIdx);
2192
2193 if (minCount == 0) {
2194 if (maxCount != 0) {
2195 fp = StateSave(fp, fp->fPatIdx, status);
2196 }
2197 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
2198 }
2199 }
2200 break;
2201
2202 case URX_CTR_LOOP_NG:
2203 {
2204 // Non-greedy {min, max} loops
2205 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
2206 int32_t initOp = pat[opValue];
2207 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
2208 int32_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
2209 int32_t minCount = pat[opValue+2];
2210 int32_t maxCount = pat[opValue+3];
2211 // Increment the counter. Note: we're not worrying about counter
2212 // overflow, since the data comes from UnicodeStrings, which
2213 // stores its length in an int32_t.
2214 (*pCounter)++;
2215 U_ASSERT(*pCounter > 0);
2216
2217 if ((uint32_t)*pCounter >= (uint32_t)maxCount) {
2218 // The loop has matched the maximum permitted number of times.
2219 // Break out of here with no action. Matching will
2220 // continue with the following pattern.
2221 U_ASSERT(*pCounter == maxCount || maxCount == -1);
2222 break;
2223 }
2224
2225 if (*pCounter < minCount) {
2226 // We haven't met the minimum number of matches yet.
2227 // Loop back for another one.
2228 fp->fPatIdx = opValue + 4; // Loop back.
2229 } else {
2230 // We do have the minimum number of matches.
2231 // Fall into the following pattern, but first do
2232 // a state save to the top of the loop, so that a failure
2233 // in the following pattern will try another iteration of the loop.
2234 fp = StateSave(fp, opValue + 4, status);
2235 }
2236 }
2237 break;
2238
2239 case URX_STO_SP:
2240 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
2241 fData[opValue] = fStack->size();
2242 break;
2243
2244 case URX_LD_SP:
2245 {
2246 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
2247 int32_t newStackSize = fData[opValue];
2248 U_ASSERT(newStackSize <= fStack->size());
2249 int32_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
2250 if (newFP == (int32_t *)fp) {
2251 break;
2252 }
2253 int32_t i;
2254 for (i=0; i<fFrameSize; i++) {
2255 newFP[i] = ((int32_t *)fp)[i];
2256 }
2257 fp = (REStackFrame *)newFP;
2258 fStack->setSize(newStackSize);
2259 }
2260 break;
2261
2262 case URX_BACKREF:
2263 case URX_BACKREF_I:
2264 {
2265 U_ASSERT(opValue < fFrameSize);
2266 int32_t groupStartIdx = fp->fExtra[opValue];
2267 int32_t groupEndIdx = fp->fExtra[opValue+1];
2268 U_ASSERT(groupStartIdx <= groupEndIdx);
2269 int32_t len = groupEndIdx-groupStartIdx;
2270 if (groupStartIdx < 0) {
2271 // This capture group has not participated in the match thus far,
2272 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
2273 }
2274
2275 if (len == 0) {
2276 // The capture group match was of an empty string.
2277 // Verified by testing: Perl matches succeed in this case, so
2278 // we do too.
2279 break;
2280 }
2281
2282 UBool haveMatch = FALSE;
2283 if (fp->fInputIdx + len <= fActiveLimit) {
2284 if (opType == URX_BACKREF) {
2285 if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, len) == 0) {
2286 haveMatch = TRUE;
2287 }
2288 } else {
2289 if (u_strncasecmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx,
2290 len, U_FOLD_CASE_DEFAULT) == 0) {
2291 haveMatch = TRUE;
2292 }
2293 }
2294 } else {
2295 // TODO: probably need to do a partial string comparison, and only
2296 // set HitEnd if the available input matched. Ticket #6074
2297 fHitEnd = TRUE;
2298 }
2299 if (haveMatch) {
2300 fp->fInputIdx += len; // Match. Advance current input position.
2301 } else {
2302 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
2303 }
2304 }
2305 break;
2306
2307 case URX_STO_INP_LOC:
2308 {
2309 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
2310 fp->fExtra[opValue] = fp->fInputIdx;
2311 }
2312 break;
2313
2314 case URX_JMPX:
2315 {
2316 int32_t instrOperandLoc = fp->fPatIdx;
2317 fp->fPatIdx += 1;
2318 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
2319 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
2320 int32_t savedInputIdx = fp->fExtra[dataLoc];
2321 U_ASSERT(savedInputIdx <= fp->fInputIdx);
2322 if (savedInputIdx < fp->fInputIdx) {
2323 fp->fPatIdx = opValue; // JMP
2324 } else {
2325 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
2326 }
2327 }
2328 break;
2329
2330 case URX_LA_START:
2331 {
2332 // Entering a lookahead block.
2333 // Save Stack Ptr, Input Pos.
2334 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2335 fData[opValue] = fStack->size();
2336 fData[opValue+1] = fp->fInputIdx;
2337 fActiveStart = fLookStart; // Set the match region change for
2338 fActiveLimit = fLookLimit; // transparent bounds.
2339 }
2340 break;
2341
2342 case URX_LA_END:
2343 {
2344 // Leaving a look-ahead block.
2345 // restore Stack Ptr, Input Pos to positions they had on entry to block.
2346 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2347 int32_t stackSize = fStack->size();
2348 int32_t newStackSize = fData[opValue];
2349 U_ASSERT(stackSize >= newStackSize);
2350 if (stackSize > newStackSize) {
2351 // Copy the current top frame back to the new (cut back) top frame.
2352 // This makes the capture groups from within the look-ahead
2353 // expression available.
2354 int32_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
2355 int32_t i;
2356 for (i=0; i<fFrameSize; i++) {
2357 newFP[i] = ((int32_t *)fp)[i];
2358 }
2359 fp = (REStackFrame *)newFP;
2360 fStack->setSize(newStackSize);
2361 }
2362 fp->fInputIdx = fData[opValue+1];
2363
2364 // Restore the active region bounds in the input string; they may have
2365 // been changed because of transparent bounds on a Region.
2366 fActiveStart = fRegionStart;
2367 fActiveLimit = fRegionLimit;
2368 }
2369 break;
2370
2371 case URX_ONECHAR_I:
2372 if (fp->fInputIdx < fActiveLimit) {
2373 UChar32 c;
2374 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
2375 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
2376 break;
2377 }
2378 } else {
2379 fHitEnd = TRUE;
2380 }
2381 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2382 break;
2383
2384 case URX_STRING_I:
2385 {
2386 // Test input against a literal string.
2387 // Strings require two slots in the compiled pattern, one for the
2388 // offset to the string text, and one for the length.
2389 int32_t stringStartIdx, stringLen;
2390 stringStartIdx = opValue;
2391
2392 op = pat[fp->fPatIdx];
2393 fp->fPatIdx++;
2394 opType = URX_TYPE(op);
2395 opValue = URX_VAL(op);
2396 U_ASSERT(opType == URX_STRING_LEN);
2397 stringLen = opValue;
2398
2399 int32_t stringEndIndex = fp->fInputIdx + stringLen;
2400 if (stringEndIndex <= fActiveLimit) {
2401 if (u_strncasecmp(inputBuf+fp->fInputIdx, litText+stringStartIdx,
2402 stringLen, U_FOLD_CASE_DEFAULT) == 0) {
2403 // Success. Advance the current input position.
2404 fp->fInputIdx = stringEndIndex;
2405 break;
2406 }
2407 } else {
2408 // Insufficent input left for a match.
2409 fHitEnd = TRUE; // See ticket 6074
2410 }
2411 // No match. Back up matching to a saved state
2412 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2413 }
2414 break;
2415
2416 case URX_LB_START:
2417 {
2418 // Entering a look-behind block.
2419 // Save Stack Ptr, Input Pos.
2420 // TODO: implement transparent bounds. Ticket #6067
2421 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2422 fData[opValue] = fStack->size();
2423 fData[opValue+1] = fp->fInputIdx;
2424 // Init the variable containing the start index for attempted matches.
2425 fData[opValue+2] = -1;
2426 // Save input string length, then reset to pin any matches to end at
2427 // the current position.
2428 fData[opValue+3] = fActiveLimit;
2429 fActiveLimit = fp->fInputIdx;
2430 }
2431 break;
2432
2433
2434 case URX_LB_CONT:
2435 {
2436 // Positive Look-Behind, at top of loop checking for matches of LB expression
2437 // at all possible input starting positions.
2438
2439 // Fetch the min and max possible match lengths. They are the operands
2440 // of this op in the pattern.
2441 int32_t minML = pat[fp->fPatIdx++];
2442 int32_t maxML = pat[fp->fPatIdx++];
2443 U_ASSERT(minML <= maxML);
2444 U_ASSERT(minML >= 0);
2445
2446 // Fetch (from data) the last input index where a match was attempted.
2447 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2448 int32_t *lbStartIdx = &fData[opValue+2];
2449 if (*lbStartIdx < 0) {
2450 // First time through loop.
2451 *lbStartIdx = fp->fInputIdx - minML;
2452 } else {
2453 // 2nd through nth time through the loop.
2454 // Back up start position for match by one.
2455 if (*lbStartIdx == 0) {
2456 (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0.
2457 } else {
2458 U16_BACK_1(inputBuf, 0, *lbStartIdx);
2459 }
2460 }
2461
2462 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
2463 // We have tried all potential match starting points without
2464 // getting a match. Backtrack out, and out of the
2465 // Look Behind altogether.
2466 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2467 int32_t restoreInputLen = fData[opValue+3];
2468 U_ASSERT(restoreInputLen >= fActiveLimit);
2469 U_ASSERT(restoreInputLen <= fInput->length());
2470 fActiveLimit = restoreInputLen;
2471 break;
2472 }
2473
2474 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
2475 // (successful match will fall off the end of the loop.)
2476 fp = StateSave(fp, fp->fPatIdx-3, status);
2477 fp->fInputIdx = *lbStartIdx;
2478 }
2479 break;
2480
2481 case URX_LB_END:
2482 // End of a look-behind block, after a successful match.
2483 {
2484 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2485 if (fp->fInputIdx != fActiveLimit) {
2486 // The look-behind expression matched, but the match did not
2487 // extend all the way to the point that we are looking behind from.
2488 // FAIL out of here, which will take us back to the LB_CONT, which
2489 // will retry the match starting at another position or fail
2490 // the look-behind altogether, whichever is appropriate.
2491 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2492 break;
2493 }
2494
2495 // Look-behind match is good. Restore the orignal input string length,
2496 // which had been truncated to pin the end of the lookbehind match to the
2497 // position being looked-behind.
2498 int32_t originalInputLen = fData[opValue+3];
2499 U_ASSERT(originalInputLen >= fActiveLimit);
2500 U_ASSERT(originalInputLen <= fInput->length());
2501 fActiveLimit = originalInputLen;
2502 }
2503 break;
2504
2505
2506 case URX_LBN_CONT:
2507 {
2508 // Negative Look-Behind, at top of loop checking for matches of LB expression
2509 // at all possible input starting positions.
2510
2511 // Fetch the extra parameters of this op.
2512 int32_t minML = pat[fp->fPatIdx++];
2513 int32_t maxML = pat[fp->fPatIdx++];
2514 int32_t continueLoc = pat[fp->fPatIdx++];
2515 continueLoc = URX_VAL(continueLoc);
2516 U_ASSERT(minML <= maxML);
2517 U_ASSERT(minML >= 0);
2518 U_ASSERT(continueLoc > fp->fPatIdx);
2519
2520 // Fetch (from data) the last input index where a match was attempted.
2521 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2522 int32_t *lbStartIdx = &fData[opValue+2];
2523 if (*lbStartIdx < 0) {
2524 // First time through loop.
2525 *lbStartIdx = fp->fInputIdx - minML;
2526 } else {
2527 // 2nd through nth time through the loop.
2528 // Back up start position for match by one.
2529 if (*lbStartIdx == 0) {
2530 (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0.
2531 } else {
2532 U16_BACK_1(inputBuf, 0, *lbStartIdx);
2533 }
2534 }
2535
2536 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
2537 // We have tried all potential match starting points without
2538 // getting a match, which means that the negative lookbehind as
2539 // a whole has succeeded. Jump forward to the continue location
2540 int32_t restoreInputLen = fData[opValue+3];
2541 U_ASSERT(restoreInputLen >= fActiveLimit);
2542 U_ASSERT(restoreInputLen <= fInput->length());
2543 fActiveLimit = restoreInputLen;
2544 fp->fPatIdx = continueLoc;
2545 break;
2546 }
2547
2548 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
2549 // (successful match will cause a FAIL out of the loop altogether.)
2550 fp = StateSave(fp, fp->fPatIdx-4, status);
2551 fp->fInputIdx = *lbStartIdx;
2552 }
2553 break;
2554
2555 case URX_LBN_END:
2556 // End of a negative look-behind block, after a successful match.
2557 {
2558 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2559 if (fp->fInputIdx != fActiveLimit) {
2560 // The look-behind expression matched, but the match did not
2561 // extend all the way to the point that we are looking behind from.
2562 // FAIL out of here, which will take us back to the LB_CONT, which
2563 // will retry the match starting at another position or succeed
2564 // the look-behind altogether, whichever is appropriate.
2565 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2566 break;
2567 }
2568
2569 // Look-behind expression matched, which means look-behind test as
2570 // a whole Fails
2571
2572 // Restore the orignal input string length, which had been truncated
2573 // inorder to pin the end of the lookbehind match
2574 // to the position being looked-behind.
2575 int32_t originalInputLen = fData[opValue+3];
2576 U_ASSERT(originalInputLen >= fActiveLimit);
2577 U_ASSERT(originalInputLen <= fInput->length());
2578 fActiveLimit = originalInputLen;
2579
2580 // Restore original stack position, discarding any state saved
2581 // by the successful pattern match.
2582 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2583 int32_t newStackSize = fData[opValue];
2584 U_ASSERT(fStack->size() > newStackSize);
2585 fStack->setSize(newStackSize);
2586
2587 // FAIL, which will take control back to someplace
2588 // prior to entering the look-behind test.
2589 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2590 }
2591 break;
2592
2593
2594 case URX_LOOP_SR_I:
2595 // Loop Initialization for the optimized implementation of
2596 // [some character set]*
2597 // This op scans through all matching input.
2598 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
2599 {
2600 U_ASSERT(opValue > 0 && opValue < sets->size());
2601 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
2602 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
2603
2604 // Loop through input, until either the input is exhausted or
2605 // we reach a character that is not a member of the set.
2606 int32_t ix = fp->fInputIdx;
2607 for (;;) {
2608 if (ix >= fActiveLimit) {
2609 fHitEnd = TRUE;
2610 break;
2611 }
2612 UChar32 c;
2613 U16_NEXT(inputBuf, ix, fActiveLimit, c);
2614 if (c<256) {
2615 if (s8->contains(c) == FALSE) {
2616 U16_BACK_1(inputBuf, 0, ix);
2617 break;
2618 }
2619 } else {
2620 if (s->contains(c) == FALSE) {
2621 U16_BACK_1(inputBuf, 0, ix);
2622 break;
2623 }
2624 }
2625 }
2626
2627 // If there were no matching characters, skip over the loop altogether.
2628 // The loop doesn't run at all, a * op always succeeds.
2629 if (ix == fp->fInputIdx) {
2630 fp->fPatIdx++; // skip the URX_LOOP_C op.
2631 break;
2632 }
2633
2634 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
2635 // must follow. It's operand is the stack location
2636 // that holds the starting input index for the match of this [set]*
2637 int32_t loopcOp = pat[fp->fPatIdx];
2638 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
2639 int32_t stackLoc = URX_VAL(loopcOp);
2640 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
2641 fp->fExtra[stackLoc] = fp->fInputIdx;
2642 fp->fInputIdx = ix;
2643
2644 // Save State to the URX_LOOP_C op that follows this one,
2645 // so that match failures in the following code will return to there.
2646 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
2647 fp = StateSave(fp, fp->fPatIdx, status);
2648 fp->fPatIdx++;
2649 }
2650 break;
2651
2652
2653 case URX_LOOP_DOT_I:
2654 // Loop Initialization for the optimized implementation of .*
2655 // This op scans through all remaining input.
2656 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
2657 {
2658 // Loop through input until the input is exhausted (we reach an end-of-line)
2659 // In DOTALL mode, we can just go straight to the end of the input.
2660 int32_t ix;
2661 if ((opValue & 1) == 1) {
2662 // Dot-matches-All mode. Jump straight to the end of the string.
2663 ix = fActiveLimit;
2664 fHitEnd = TRUE;
2665 } else {
2666 // NOT DOT ALL mode. Line endings do not match '.'
2667 // Scan forward until a line ending or end of input.
2668 ix = fp->fInputIdx;
2669 for (;;) {
2670 if (ix >= fActiveLimit) {
2671 fHitEnd = TRUE;
2672 ix = fActiveLimit;
2673 break;
2674 }
2675 UChar32 c;
2676 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++]
2677 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
2678 if ((c == 0x0a) || // 0x0a is newline in both modes.
2679 ((opValue & 2) == 0) && // IF not UNIX_LINES mode
2680 (c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029) {
2681 // char is a line ending. Put the input pos back to the
2682 // line ending char, and exit the scanning loop.
2683 U16_BACK_1(inputBuf, 0, ix);
2684 break;
2685 }
2686 }
2687 }
2688 }
2689
2690 // If there were no matching characters, skip over the loop altogether.
2691 // The loop doesn't run at all, a * op always succeeds.
2692 if (ix == fp->fInputIdx) {
2693 fp->fPatIdx++; // skip the URX_LOOP_C op.
2694 break;
2695 }
2696
2697 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
2698 // must follow. It's operand is the stack location
2699 // that holds the starting input index for the match of this .*
2700 int32_t loopcOp = pat[fp->fPatIdx];
2701 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
2702 int32_t stackLoc = URX_VAL(loopcOp);
2703 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
2704 fp->fExtra[stackLoc] = fp->fInputIdx;
2705 fp->fInputIdx = ix;
2706
2707 // Save State to the URX_LOOP_C op that follows this one,
2708 // so that match failures in the following code will return to there.
2709 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
2710 fp = StateSave(fp, fp->fPatIdx, status);
2711 fp->fPatIdx++;
2712 }
2713 break;
2714
2715
2716 case URX_LOOP_C:
2717 {
2718 U_ASSERT(opValue>=0 && opValue<fFrameSize);
2719 int32_t terminalIdx = fp->fExtra[opValue];
2720 U_ASSERT(terminalIdx <= fp->fInputIdx);
2721 if (terminalIdx == fp->fInputIdx) {
2722 // We've backed up the input idx to the point that the loop started.
2723 // The loop is done. Leave here without saving state.
2724 // Subsequent failures won't come back here.
2725 break;
2726 }
2727 // Set up for the next iteration of the loop, with input index
2728 // backed up by one from the last time through,
2729 // and a state save to this instruction in case the following code fails again.
2730 // (We're going backwards because this loop emulates stack unwinding, not
2731 // the initial scan forward.)
2732 U_ASSERT(fp->fInputIdx > 0);
2733 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
2734 if (inputBuf[fp->fInputIdx] == 0x0a &&
2735 fp->fInputIdx > terminalIdx &&
2736 inputBuf[fp->fInputIdx-1] == 0x0d) {
2737 int32_t prevOp = pat[fp->fPatIdx-2];
2738 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
2739 // .*, stepping back over CRLF pair.
2740 fp->fInputIdx--;
2741 }
2742 }
2743
2744
2745 fp = StateSave(fp, fp->fPatIdx-1, status);
2746 }
2747 break;
2748
2749
2750
2751 default:
2752 // Trouble. The compiled pattern contains an entry with an
2753 // unrecognized type tag.
2754 U_ASSERT(FALSE);
2755 }
2756
2757 if (U_FAILURE(status)) {
2758 isMatch = FALSE;
2759 break;
2760 }
2761 }
2762
2763 breakFromLoop:
2764 fMatch = isMatch;
2765 if (isMatch) {
2766 fLastMatchEnd = fMatchEnd;
2767 fMatchStart = startIdx;
2768 fMatchEnd = fp->fInputIdx;
2769 if (fTraceDebug) {
2770 REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd));
2771 }
2772 }
2773 else
2774 {
2775 if (fTraceDebug) {
2776 REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
2777 }
2778 }
2779
2780 fFrame = fp; // The active stack frame when the engine stopped.
2781 // Contains the capture group results that we need to
2782 // access later.
2783
2784 return;
2785 }
2786
2787
2788
2789 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
2790
2791 U_NAMESPACE_END
2792
2793 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
2794