]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/rematch.cpp
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / i18n / rematch.cpp
CommitLineData
b75a7d8f
A
1//
2// file: rematch.cpp
3//
4// Contains the implementation of class RegexMatcher,
5// which is one of the main API classes for the ICU regular expression package.
6//
7/*
8**************************************************************************
9* Copyright (C) 2002-2003 International Business Machines Corporation *
10* and others. All rights reserved. *
11**************************************************************************
12*/
13
14#include "unicode/utypes.h"
15#if !UCONFIG_NO_REGULAR_EXPRESSIONS
16
17#include "unicode/regex.h"
18#include "unicode/uniset.h"
19#include "unicode/uchar.h"
20#include "unicode/ustring.h"
21#include "uassert.h"
22#include "cmemory.h"
23#include "uvector.h"
24#include "uvectr32.h"
25#include "regeximp.h"
26#include "regexst.h"
27
28// #include <malloc.h> // Needed for heapcheck testing
29
30U_NAMESPACE_BEGIN
31
32//-----------------------------------------------------------------------------
33//
34// Constructor and Destructor
35//
36//-----------------------------------------------------------------------------
37RegexMatcher::RegexMatcher(const RegexPattern *pat) {
38 fPattern = pat;
39 fPatternOwned = NULL;
40 fInput = NULL;
41 fTraceDebug = FALSE;
42 fDeferredStatus = U_ZERO_ERROR;
43 fStack = new UVector32(fDeferredStatus);
44 fData = fSmallData;
45 if (pat==NULL) {
46 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
47 return;
48 }
49 if (pat->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) {
50 fData = (int32_t *)uprv_malloc(pat->fDataSize * sizeof(int32_t));
51 }
52 if (fStack == NULL || fData == NULL) {
53 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
54 }
55
56 reset(*RegexStaticSets::gStaticSets->fEmptyString);
57}
58
59
60
61RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
62 uint32_t flags, UErrorCode &status) {
63 UParseError pe;
64 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
65 fPattern = fPatternOwned;
66 fTraceDebug = FALSE;
67 fDeferredStatus = U_ZERO_ERROR;
68 fStack = new UVector32(status);
69 fData = fSmallData;
70 if (U_FAILURE(status)) {
71 return;
72 }
73 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) {
74 fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t));
75 }
76 if (fStack == NULL || fData == NULL) {
77 status = U_MEMORY_ALLOCATION_ERROR;
78 }
79 reset(input);
80}
81
82
83RegexMatcher::RegexMatcher(const UnicodeString &regexp,
84 uint32_t flags, UErrorCode &status) {
85 UParseError pe;
86 fTraceDebug = FALSE;
87 fDeferredStatus = U_ZERO_ERROR;
88 fStack = new UVector32(status);
89 fData = fSmallData;
90 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
91 fPattern = fPatternOwned;
92 if (U_FAILURE(status)) {
93 return;
94 }
95
96 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) {
97 fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t));
98 }
99 if (fStack == NULL || fData == NULL) {
100 status = U_MEMORY_ALLOCATION_ERROR;
101 }
102 reset(*RegexStaticSets::gStaticSets->fEmptyString);
103}
104
105
106
107RegexMatcher::~RegexMatcher() {
108 delete fStack;
109 if (fData != fSmallData) {
110 delete fData;
111 fData = NULL;
112 }
113 if (fPatternOwned) {
114 delete fPatternOwned;
115 fPatternOwned = NULL;
116 fPattern = NULL;
117 }
118}
119
120
121
122static const UChar BACKSLASH = 0x5c;
123static const UChar DOLLARSIGN = 0x24;
124//--------------------------------------------------------------------------------
125//
126// appendReplacement
127//
128//--------------------------------------------------------------------------------
129RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
130 const UnicodeString &replacement,
131 UErrorCode &status) {
132 if (U_FAILURE(status)) {
133 return *this;
134 }
135 if (U_FAILURE(fDeferredStatus)) {
136 status = fDeferredStatus;
137 return *this;
138 }
139 if (fMatch == FALSE) {
140 status = U_REGEX_INVALID_STATE;
141 return *this;
142 }
143
144 // Copy input string from the end of previous match to start of current match
145 int32_t len = fMatchStart-fLastMatchEnd;
146 if (len > 0) {
147 dest.append(*fInput, fLastMatchEnd, len);
148 }
149
150
151 // scan the replacement text, looking for substitutions ($n) and \escapes.
152 // TODO: optimize this loop by efficiently scanning for '$' or '\',
153 // move entire ranges not containing substitutions.
154 int32_t replLen = replacement.length();
155 int32_t replIdx = 0;
156 while (replIdx<replLen) {
157 UChar c = replacement.charAt(replIdx);
158 replIdx++;
159 if (c == BACKSLASH) {
160 // Backslash Escape. Copy the following char out without further checks.
161 // Note: Surrogate pairs don't need any special handling
162 // The second half wont be a '$' or a '\', and
163 // will move to the dest normally on the next
164 // loop iteration.
165 if (replIdx >= replLen) {
166 break;
167 }
168 c = replacement.charAt(replIdx);
169
170 if (c==0x55/*U*/ || c==0x75/*u*/) {
171 // We have a \udddd or \Udddddddd escape sequence.
172 UChar32 escapedChar = replacement.unescapeAt(replIdx);
173 if (escapedChar != (UChar32)0xFFFFFFFF) {
174 dest.append(escapedChar);
175 replIdx += (c==0x55? 9: 5);
176 // TODO: Report errors for mal-formed \u escapes?
177 // As this is, the original sequence is output, which may be OK.
178 continue;
179 }
180 }
181
182 // Plain backslash escape. Just put out the escaped character.
183 dest.append(c);
184 replIdx++;
185 continue;
186 }
187
188 if (c != DOLLARSIGN) {
189 // Normal char, not a $. Copy it out without further checks.
190 dest.append(c);
191 continue;
192 }
193
194 // We've got a $. Pick up a capture group number if one follows.
195 // Consume at most the number of digits necessary for the largest capture
196 // number that is valid for this pattern.
197
198 int32_t numDigits = 0;
199 int32_t groupNum = 0;
200 UChar32 digitC;
201 for (;;) {
202 if (replIdx >= replLen) {
203 break;
204 }
205 digitC = replacement.char32At(replIdx);
206 if (u_isdigit(digitC) == FALSE) {
207 break;
208 }
209 replIdx = replacement.moveIndex32(replIdx, 1);
210 groupNum=groupNum*10 + u_charDigitValue(digitC);
211 numDigits++;
212 if (numDigits >= fPattern->fMaxCaptureDigits) {
213 break;
214 }
215 }
216
217
218 if (numDigits == 0) {
219 // The $ didn't introduce a group number at all.
220 // Treat it as just part of the substitution text.
221 dest.append(DOLLARSIGN);
222 continue;
223 }
224
225 // Finally, append the capture group data to the destination.
226 dest.append(group(groupNum, status));
227 if (U_FAILURE(status)) {
228 // Can fail if group number is out of range.
229 break;
230 }
231
232 }
233
234 return *this;
235}
236
237
238
239//--------------------------------------------------------------------------------
240//
241// appendTail Intended to be used in conjunction with appendReplacement()
242// To the destination string, append everything following
243// the last match position from the input string.
244//
245//--------------------------------------------------------------------------------
246UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
247 int32_t len = fInput->length()-fMatchEnd;
248 if (len > 0) {
249 dest.append(*fInput, fMatchEnd, len);
250 }
251 return dest;
252}
253
254
255
256//--------------------------------------------------------------------------------
257//
258// end
259//
260//--------------------------------------------------------------------------------
261int32_t RegexMatcher::end(UErrorCode &err) const {
262 return end(0, err);
263}
264
265
266
267int32_t RegexMatcher::end(int group, UErrorCode &err) const {
268 if (U_FAILURE(err)) {
269 return -1;
270 }
271 if (fMatch == FALSE) {
272 err = U_REGEX_INVALID_STATE;
273 return -1;
274 }
275 if (group < 0 || group > fPattern->fGroupMap->size()) {
276 err = U_INDEX_OUTOFBOUNDS_ERROR;
277 return -1;
278 }
279 int32_t e = -1;
280 if (group == 0) {
281 e = fMatchEnd;
282 } else {
283 // Get the position within the stack frame of the variables for
284 // this capture group.
285 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
286 U_ASSERT(groupOffset < fPattern->fFrameSize);
287 U_ASSERT(groupOffset >= 0);
288 e = fFrame->fExtra[groupOffset + 1];
289 }
290 return e;
291}
292
293
294
295//--------------------------------------------------------------------------------
296//
297// find()
298//
299//--------------------------------------------------------------------------------
300UBool RegexMatcher::find() {
301 // Start at the position of the last match end. (Will be zero if the
302 // matcher has been reset.
303 //
304 if (U_FAILURE(fDeferredStatus)) {
305 return FALSE;
306 }
307
308 int32_t startPos = fMatchEnd;
309 int32_t inputLen = fInput->length();
310 int32_t testLen = inputLen - fPattern->fMinMatchLen;
311 if (startPos > testLen) {
312 return FALSE;
313 }
314
315 const UChar *inputBuf = fInput->getBuffer();
316 UChar32 c;
317 U_ASSERT(startPos >= 0);
318
319 switch (fPattern->fStartType) {
320 case START_NO_INFO:
321 // No optimization was found.
322 // Try a match at each input position.
323 for (;;) {
324 MatchAt(startPos, fDeferredStatus);
325 if (U_FAILURE(fDeferredStatus)) {
326 return FALSE;
327 }
328 if (fMatch) {
329 return TRUE;
330 }
331 if (startPos >= testLen) {
332 return FALSE;
333 }
334 U16_FWD_1(inputBuf, startPos, inputLen);
335 // Note that it's perfectly OK for a pattern to have a zero-length
336 // match at the end of a string, so we must make sure that the loop
337 // runs with startPos == testLen the last time through.
338 }
339 U_ASSERT(FALSE);
340
341 case START_START:
342 // Matches are only possible at the start of the input string
343 // (pattern begins with ^ or \A)
344 if (startPos > 0) {
345 return FALSE;
346 }
347 MatchAt(startPos, fDeferredStatus);
348 if (U_FAILURE(fDeferredStatus)) {
349 return FALSE;
350 }
351 return fMatch;
352
353
354 case START_SET:
355 {
356 // Match may start on any char from a pre-computed set.
357 U_ASSERT(fPattern->fMinMatchLen > 0);
358 for (;;) {
359 int32_t pos = startPos;
360 U16_NEXT(inputBuf, startPos, inputLen, c); // like c = inputBuf[startPos++];
361 if (c<256 && fPattern->fInitialChars8->contains(c) ||
362 c>=256 && fPattern->fInitialChars->contains(c)) {
363 MatchAt(pos, fDeferredStatus);
364 if (U_FAILURE(fDeferredStatus)) {
365 return FALSE;
366 }
367 if (fMatch) {
368 return TRUE;
369 }
370 }
371 if (pos >= testLen) {
372 return FALSE;
373 }
374 }
375 }
376 U_ASSERT(FALSE);
377
378 case START_STRING:
379 case START_CHAR:
380 {
381 // Match starts on exactly one char.
382 U_ASSERT(fPattern->fMinMatchLen > 0);
383 UChar32 theChar = fPattern->fInitialChar;
384 for (;;) {
385 int32_t pos = startPos;
386 U16_NEXT(inputBuf, startPos, inputLen, c); // like c = inputBuf[startPos++];
387 if (c == theChar) {
388 MatchAt(pos, fDeferredStatus);
389 if (U_FAILURE(fDeferredStatus)) {
390 return FALSE;
391 }
392 if (fMatch) {
393 return TRUE;
394 }
395 }
396 if (pos >= testLen) {
397 return FALSE;
398 }
399 }
400 }
401 U_ASSERT(FALSE);
402
403 case START_LINE:
404 {
405 UChar32 c;
406 if (startPos == 0) {
407 MatchAt(startPos, fDeferredStatus);
408 if (U_FAILURE(fDeferredStatus)) {
409 return FALSE;
410 }
411 if (fMatch) {
412 return TRUE;
413 }
414 U16_NEXT(inputBuf, startPos, inputLen, c); // like c = inputBuf[startPos++];
415 }
416
417 for (;;) {
418 UChar32 c = inputBuf[startPos-1];
419 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
420 (c == 0x0a || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029 ||
421 c == 0x0d && startPos+1 < inputLen && inputBuf[startPos+1] != 0x0a)) {
422 MatchAt(startPos, fDeferredStatus);
423 if (U_FAILURE(fDeferredStatus)) {
424 return FALSE;
425 }
426 if (fMatch) {
427 return TRUE;
428 }
429 }
430 if (startPos >= testLen) {
431 return FALSE;
432 }
433 U16_NEXT(inputBuf, startPos, inputLen, c); // like c = inputBuf[startPos++];
434 // Note that it's perfectly OK for a pattern to have a zero-length
435 // match at the end of a string, so we must make sure that the loop
436 // runs with startPos == testLen the last time through.
437 }
438 }
439
440 default:
441 U_ASSERT(FALSE);
442 }
443
444 U_ASSERT(FALSE);
445 return FALSE;
446}
447
448
449
450UBool RegexMatcher::find(int32_t start, UErrorCode &status) {
451 if (U_FAILURE(status)) {
452 return FALSE;
453 }
454 if (U_FAILURE(fDeferredStatus)) {
455 status = fDeferredStatus;
456 return FALSE;
457 }
458 int32_t inputLen = fInput->length();
459 if (start < 0 || start >= inputLen) {
460 status = U_INDEX_OUTOFBOUNDS_ERROR;
461 return FALSE;
462 }
463 this->reset();
464 fMatchEnd = start;
465 return find();
466}
467
468
469
470//--------------------------------------------------------------------------------
471//
472// group()
473//
474//--------------------------------------------------------------------------------
475UnicodeString RegexMatcher::group(UErrorCode &status) const {
476 return group(0, status);
477}
478
479
480
481UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
482 int32_t s = start(groupNum, status);
483 int32_t e = end(groupNum, status);
484
485 // Note: calling start() and end() above will do all necessary checking that
486 // the group number is OK and that a match exists. status will be set.
487 if (U_FAILURE(status)) {
488 return UnicodeString();
489 }
490 if (U_FAILURE(fDeferredStatus)) {
491 status = fDeferredStatus;
492 return UnicodeString();
493 }
494
495 if (s < 0) {
496 // A capture group wasn't part of the match
497 return UnicodeString();
498 }
499 U_ASSERT(s <= e);
500 return UnicodeString(*fInput, s, e-s);
501}
502
503
504
505
506int32_t RegexMatcher::groupCount() const {
507 return fPattern->fGroupMap->size();
508}
509
510
511
512const UnicodeString &RegexMatcher::input() const {
513 return *fInput;
514}
515
516
517
518
519UBool RegexMatcher::lookingAt(UErrorCode &status) {
520 if (U_FAILURE(status)) {
521 return FALSE;
522 }
523 if (U_FAILURE(fDeferredStatus)) {
524 status = fDeferredStatus;
525 return FALSE;
526 }
527 reset();
528 MatchAt(0, status);
529 return fMatch;
530}
531
532
533
534UBool RegexMatcher::matches(UErrorCode &status) {
535 if (U_FAILURE(status)) {
536 return FALSE;
537 }
538 if (U_FAILURE(fDeferredStatus)) {
539 status = fDeferredStatus;
540 return FALSE;
541 }
542 reset();
543 MatchAt(0, status);
544 UBool success = (fMatch && fMatchEnd==fInput->length());
545 return success;
546}
547
548
549
550
551const RegexPattern &RegexMatcher::pattern() const {
552 return *fPattern;
553}
554
555
556
557//--------------------------------------------------------------------------------
558//
559// replaceAll
560//
561//--------------------------------------------------------------------------------
562UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
563 if (U_FAILURE(status)) {
564 return *fInput;
565 }
566 if (U_FAILURE(fDeferredStatus)) {
567 status = fDeferredStatus;
568 return *fInput;
569 }
570 UnicodeString destString;
571 for (reset(); find(); ) {
572 appendReplacement(destString, replacement, status);
573 if (U_FAILURE(status)) {
574 break;
575 }
576 }
577 appendTail(destString);
578 return destString;
579}
580
581
582
583
584//--------------------------------------------------------------------------------
585//
586// replaceFirst
587//
588//--------------------------------------------------------------------------------
589UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
590 if (U_FAILURE(status)) {
591 return *fInput;
592 }
593 if (U_FAILURE(fDeferredStatus)) {
594 status = fDeferredStatus;
595 return *fInput;
596 }
597
598 reset();
599 if (!find()) {
600 return *fInput;
601 }
602
603 UnicodeString destString;
604 appendReplacement(destString, replacement, status);
605 appendTail(destString);
606 return destString;
607}
608
609
610
611//--------------------------------------------------------------------------------
612//
613// reset
614//
615//--------------------------------------------------------------------------------
616RegexMatcher &RegexMatcher::reset() {
617 fMatchStart = 0;
618 fMatchEnd = 0;
619 fLastMatchEnd = 0;
620 fMatch = FALSE;
621 resetStack();
622 return *this;
623}
624
625
626
627RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
628 fInput = &input;
629 reset();
630 return *this;
631}
632
633
634
635REStackFrame *RegexMatcher::resetStack() {
636 // Discard any previous contents of the state save stack, and initialize a
637 // new stack frame to all -1. The -1s are needed for capture group limits, where
638 // they indicate that a group has not yet matched anything.
639 fStack->removeAllElements();
640
641 int32_t *iFrame = fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
642 int i;
643 for (i=0; i<fPattern->fFrameSize; i++) {
644 iFrame[i] = -1;
645 }
646 return (REStackFrame *)iFrame;
647}
648
649
650
651//--------------------------------------------------------------------------------
652//
653// setTrace
654//
655//--------------------------------------------------------------------------------
656void RegexMatcher::setTrace(UBool state) {
657 fTraceDebug = state;
658}
659
660
661
662//---------------------------------------------------------------------
663//
664// split
665//
666//---------------------------------------------------------------------
667int32_t RegexMatcher::split(const UnicodeString &input,
668 UnicodeString dest[],
669 int32_t destCapacity,
670 UErrorCode &status)
671{
672 //
673 // Check arguements for validity
674 //
675 if (U_FAILURE(status)) {
676 return 0;
677 };
678
679 if (destCapacity < 1) {
680 status = U_ILLEGAL_ARGUMENT_ERROR;
681 return 0;
682 }
683
684
685 //
686 // Reset for the input text
687 //
688 reset(input);
689 int32_t inputLen = input.length();
690 int32_t nextOutputStringStart = 0;
691 if (inputLen == 0) {
692 return 0;
693 }
694
695
696 //
697 // Loop through the input text, searching for the delimiter pattern
698 //
699 int i;
700 int32_t numCaptureGroups = fPattern->fGroupMap->size();
701 for (i=0; ; i++) {
702 if (i>=destCapacity-1) {
703 // There is one or zero output string left.
704 // Fill the last output string with whatever is left from the input, then exit the loop.
705 // ( i will be == destCapicity if we filled the output array while processing
706 // capture groups of the delimiter expression, in which case we will discard the
707 // last capture group saved in favor of the unprocessed remainder of the
708 // input string.)
709 i = destCapacity-1;
710 int32_t remainingLength = inputLen-nextOutputStringStart;
711 if (remainingLength > 0) {
712 dest[i].setTo(input, nextOutputStringStart, remainingLength);
713 }
714 break;
715 }
716 if (find()) {
717 // We found another delimiter. Move everything from where we started looking
718 // up until the start of the delimiter into the next output string.
719 int32_t fieldLen = fMatchStart - nextOutputStringStart;
720 dest[i].setTo(input, nextOutputStringStart, fieldLen);
721 nextOutputStringStart = fMatchEnd;
722
723 // If the delimiter pattern has capturing parentheses, the captured
724 // text goes out into the next n destination strings.
725 int32_t groupNum;
726 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
727 if (i==destCapacity-1) {
728 break;
729 }
730 i++;
731 dest[i] = group(groupNum, status);
732 }
733
734 if (nextOutputStringStart == inputLen) {
735 // The delimiter was at the end of the string. We're done.
736 break;
737 }
738
739 }
740 else
741 {
742 // We ran off the end of the input while looking for the next delimiter.
743 // All the remaining text goes into the current output string.
744 dest[i].setTo(input, nextOutputStringStart, inputLen-nextOutputStringStart);
745 break;
746 }
747 }
748 return i+1;
749}
750
751
752
753//--------------------------------------------------------------------------------
754//
755// start
756//
757//--------------------------------------------------------------------------------
758int32_t RegexMatcher::start(UErrorCode &status) const {
759 return start(0, status);
760}
761
762
763
764
765int32_t RegexMatcher::start(int group, UErrorCode &status) const {
766 if (U_FAILURE(status)) {
767 return -1;
768 }
769 if (U_FAILURE(fDeferredStatus)) {
770 status = fDeferredStatus;
771 return -1;
772 }
773 if (fMatch == FALSE) {
774 status = U_REGEX_INVALID_STATE;
775 return -1;
776 }
777 if (group < 0 || group > fPattern->fGroupMap->size()) {
778 status = U_INDEX_OUTOFBOUNDS_ERROR;
779 return -1;
780 }
781 int32_t s;
782 if (group == 0) {
783 s = fMatchStart;
784 } else {
785 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
786 U_ASSERT(groupOffset < fPattern->fFrameSize);
787 U_ASSERT(groupOffset >= 0);
788 s = fFrame->fExtra[groupOffset];
789 }
790 return s;
791}
792
793
794
795//--------------------------------------------------------------------------------
796//
797// isWordBoundary
798// in perl, "xab..cd..", \b is true at positions 0,3,5,7
799// For us,
800// If the current char is a combining mark,
801// \b is FALSE.
802// Else Scan backwards to the first non-combining char.
803// We are at a boundary if the this char and the original chars are
804// opposite in membership in \w set
805//
806// parameters: pos - the current position in the input buffer
807// start - the position where the match operation started.
808// don't backup before this position when looking back
809// for a preceding base char.
810//
811//--------------------------------------------------------------------------------
812UBool RegexMatcher::isWordBoundary(int32_t pos) {
813 UBool isBoundary = FALSE;
814 UBool cIsWord = FALSE;
815
816 // Determine whether char c at current position is a member of the word set of chars.
817 // If we're off the end of the string, behave as though we're not at a word char.
818 if (pos < fInput->length()) {
819 UChar32 c = fInput->char32At(pos);
820 int8_t ctype = u_charType(c);
821 if (ctype==U_NON_SPACING_MARK || ctype==U_ENCLOSING_MARK) {
822 // Current char is a combining one. Not a boundary.
823 return FALSE;
824 }
825 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
826 }
827
828 // Back up until we come to a non-combining char, determine whether
829 // that char is a word char.
830 UBool prevCIsWord = FALSE;
831 int32_t prevPos = pos;
832 for (;;) {
833 if (prevPos == 0) {
834 break;
835 }
836 prevPos = fInput->moveIndex32(prevPos, -1);
837 UChar32 prevChar = fInput->char32At(prevPos);
838 int8_t prevCType = u_charType(prevChar);
839 if (!(prevCType==U_NON_SPACING_MARK || prevCType==U_ENCLOSING_MARK)) {
840 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
841 break;
842 }
843 }
844 isBoundary = cIsWord ^ prevCIsWord;
845 return isBoundary;
846}
847
848//--------------------------------------------------------------------------------
849//
850// StateSave
851// Make a new stack frame, initialized as a copy of the current stack frame.
852// Set the pattern index in the original stack frame from the operand value
853// in the opcode. Execution of the engine continues with the state in
854// the newly created stack frame
855//
856// Note that reserveBlock() may grow the stack, resulting in the
857// whole thing being relocated in memory.
858//
859//--------------------------------------------------------------------------------
860inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int32_t savePatIdx, int32_t frameSize, UErrorCode &status) {
861 // push storage for a new frame.
862 int32_t *newFP = fStack->reserveBlock(frameSize, status);
863 fp = (REStackFrame *)(newFP - frameSize); // in case of realloc of stack.
864
865 // New stack frame = copy of old top frame.
866 int32_t *source = (int32_t *)fp;
867 int32_t *dest = newFP;
868 for (;;) {
869 *dest++ = *source++;
870 if (source == newFP) {
871 break;
872 }
873 }
874
875 fp->fPatIdx = savePatIdx;
876 return (REStackFrame *)newFP;
877}
878
879
880//--------------------------------------------------------------------------------
881//
882// MatchAt This is the actual matching engine.
883//
884//--------------------------------------------------------------------------------
885void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
886 UBool isMatch = FALSE; // True if the we have a match.
887
888 int32_t op; // Operation from the compiled pattern, split into
889 int32_t opType; // the opcode
890 int32_t opValue; // and the operand value.
891
892 #ifdef REGEX_RUN_DEBUG
893 if (fTraceDebug)
894 {
895 printf("MatchAt(startIdx=%d)\n", startIdx);
896 printf("Original Pattern: ");
897 int i;
898 for (i=0; i<fPattern->fPattern.length(); i++) {
899 printf("%c", fPattern->fPattern.charAt(i));
900 }
901 printf("\n");
902 printf("Input String: ");
903 for (i=0; i<fInput->length(); i++) {
904 UChar c = fInput->charAt(i);
905 if (c<32 || c>256) {
906 c = '.';
907 }
908 printf("%c", c);
909 }
910 printf("\n");
911 printf("\n");
912 }
913 #endif
914
915 if (U_FAILURE(status)) {
916 return;
917 }
918
919 // Cache frequently referenced items from the compiled pattern
920 // in local variables.
921 //
922 int32_t *pat = fPattern->fCompiledPat->getBuffer();
923
924 const UChar *litText = fPattern->fLiteralText.getBuffer();
925 UVector *sets = fPattern->fSets;
926 int32_t inputLen = fInput->length();
927 const UChar *inputBuf = fInput->getBuffer();
928
929 REStackFrame *fp = resetStack();
930 int32_t frameSize = fPattern->fFrameSize;
931
932 fp->fPatIdx = 0;
933 fp->fInputIdx = startIdx;
934
935 // Zero out the pattern's static data
936 int32_t i;
937 for (i = 0; i<fPattern->fDataSize; i++) {
938 fData[i] = 0;
939 }
940
941 //
942 // Main loop for interpreting the compiled pattern.
943 // One iteration of the loop per pattern operation performed.
944 //
945 for (;;) {
946#if 0
947 if (_heapchk() != _HEAPOK) {
948 fprintf(stderr, "Heap Trouble\n");
949 }
950#endif
951 op = pat[fp->fPatIdx];
952 opType = URX_TYPE(op);
953 opValue = URX_VAL(op);
954 #ifdef REGEX_RUN_DEBUG
955 if (fTraceDebug) {
956 printf("inputIdx=%d inputChar=%c sp=%3d ", fp->fInputIdx,
957 fInput->char32At(fp->fInputIdx), (int32_t *)fp-fStack->getBuffer());
958 fPattern->dumpOp(fp->fPatIdx);
959 }
960 #endif
961 fp->fPatIdx++;
962
963 switch (opType) {
964
965
966 case URX_NOP:
967 break;
968
969
970 case URX_BACKTRACK:
971 // Force a backtrack. In some circumstances, the pattern compiler
972 // will notice that the pattern can't possibly match anything, and will
973 // emit one of these at that point.
974 fp = (REStackFrame *)fStack->popFrame(frameSize);
975 break;
976
977
978 case URX_ONECHAR:
979 if (fp->fInputIdx < inputLen) {
980 UChar32 c;
981 U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
982 if (c == opValue) {
983 break;
984 }
985 }
986 fp = (REStackFrame *)fStack->popFrame(frameSize);
987 break;
988
989
990 case URX_STRING:
991 {
992 // Test input against a literal string.
993 // Strings require two slots in the compiled pattern, one for the
994 // offset to the string text, and one for the length.
995 int32_t stringStartIdx = opValue;
996 int32_t stringLen;
997
998 op = pat[fp->fPatIdx]; // Fetch the second operand
999 fp->fPatIdx++;
1000 opType = URX_TYPE(op);
1001 stringLen = URX_VAL(op);
1002 U_ASSERT(opType == URX_STRING_LEN);
1003 U_ASSERT(stringLen >= 2);
1004
1005 if (fp->fInputIdx + stringLen > inputLen) {
1006 // No match. String is longer than the remaining input text.
1007 fp = (REStackFrame *)fStack->popFrame(frameSize);
1008 break;
1009 }
1010
1011 const UChar * pInp = inputBuf + fp->fInputIdx;
1012 const UChar * pPat = litText+stringStartIdx;
1013 const UChar * pEnd = pInp + stringLen;
1014 for(;;) {
1015 if (*pInp == *pPat) {
1016 pInp++;
1017 pPat++;
1018 if (pInp == pEnd) {
1019 // Successful Match.
1020 fp->fInputIdx += stringLen;
1021 break;
1022 }
1023 } else {
1024 // Match failed.
1025 fp = (REStackFrame *)fStack->popFrame(frameSize);
1026 break;
1027 }
1028 }
1029 break;
1030
1031 }
1032 break;
1033
1034
1035
1036 case URX_STATE_SAVE:
1037 fp = StateSave(fp, opValue, frameSize, status);
1038 break;
1039
1040
1041 case URX_END:
1042 // The match loop will exit via this path on a successful match,
1043 // when we reach the end of the pattern.
1044 isMatch = TRUE;
1045 goto breakFromLoop;
1046
1047 // Start and End Capture stack frame variables are layout out like this:
1048 // fp->fExtra[opValue] - The start of a completed capture group
1049 // opValue+1 - The end of a completed capture group
1050 // opValue+2 - the start of a capture group whose end
1051 // has not yet been reached (and might not ever be).
1052 case URX_START_CAPTURE:
1053 U_ASSERT(opValue >= 0 && opValue < frameSize-3);
1054 fp->fExtra[opValue+2] = fp->fInputIdx;
1055 break;
1056
1057
1058 case URX_END_CAPTURE:
1059 U_ASSERT(opValue >= 0 && opValue < frameSize-3);
1060 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
1061 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
1062 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
1063 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
1064 break;
1065
1066
1067 case URX_DOLLAR: // $, test for End of line
1068 // or for position before new line at end of input
1069 if (fp->fInputIdx < inputLen-2) {
1070 // We are no where near the end of input. Fail.
1071 fp = (REStackFrame *)fStack->popFrame(frameSize);
1072 break;
1073 }
1074 if (fp->fInputIdx >= inputLen) {
1075 // We really are at the end of input. Success.
1076 break;
1077 }
1078 // If we are positioned just before a new-line that is located at the
1079 // end of input, succeed.
1080 if (fp->fInputIdx == inputLen-1) {
1081 UChar32 c = fInput->char32At(fp->fInputIdx);
1082 if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
1083 break; // At new-line at end of input. Success
1084 }
1085 }
1086
1087 if (fp->fInputIdx == inputLen-2) {
1088 if (fInput->char32At(fp->fInputIdx) == 0x0d && fInput->char32At(fp->fInputIdx+1) == 0x0a) {
1089 break; // At CR/LF at end of input. Success
1090 }
1091 }
1092
1093 fp = (REStackFrame *)fStack->popFrame(frameSize);
1094
1095 break;
1096
1097
1098 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
1099 {
1100 if (fp->fInputIdx >= inputLen) {
1101 // We really are at the end of input. Success.
1102 break;
1103 }
1104 // If we are positioned just before a new-line , succeed.
1105 // It makes no difference where the new-line is within the input.
1106 UChar32 c = inputBuf[fp->fInputIdx];
1107 if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
1108 break; // At new-line at end of input. Success
1109 }
1110 // not at a new line. Fail.
1111 fp = (REStackFrame *)fStack->popFrame(frameSize);
1112 }
1113 break;
1114
1115
1116 case URX_CARET: // ^, test for start of line
1117 if (fp->fInputIdx != 0) {
1118 fp = (REStackFrame *)fStack->popFrame(frameSize);
1119 }
1120 break;
1121
1122
1123 case URX_CARET_M: // ^, test for start of line in mulit-line mode
1124 {
1125 if (fp->fInputIdx == 0) {
1126 // We are at the start input. Success.
1127 break;
1128 }
1129 // Check whether character just before the current pos is a new-line
1130 // unless we are at the end of input
1131 UChar c = inputBuf[fp->fInputIdx - 1];
1132 if ((fp->fInputIdx < inputLen) &&
1133 (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029)) {
1134 // It's a new-line. ^ is true. Success.
1135 break;
1136 }
1137 // Not at the start of a line. Fail.
1138 fp = (REStackFrame *)fStack->popFrame(frameSize);
1139 }
1140 break;
1141
1142
1143 case URX_BACKSLASH_B: // Test for word boundaries
1144 {
1145 UBool success = isWordBoundary(fp->fInputIdx);
1146 success ^= (opValue != 0); // flip sense for \B
1147 if (!success) {
1148 fp = (REStackFrame *)fStack->popFrame(frameSize);
1149 }
1150 }
1151 break;
1152
1153
1154 case URX_BACKSLASH_D: // Test for decimal digit
1155 {
1156 if (fp->fInputIdx >= inputLen) {
1157 fp = (REStackFrame *)fStack->popFrame(frameSize);
1158 break;
1159 }
1160
1161 UChar32 c = fInput->char32At(fp->fInputIdx);
1162 int8_t ctype = u_charType(c);
1163 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
1164 success ^= (opValue != 0); // flip sense for \D
1165 if (success) {
1166 fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
1167 } else {
1168 fp = (REStackFrame *)fStack->popFrame(frameSize);
1169 }
1170 }
1171 break;
1172
1173
1174
1175
1176 case URX_BACKSLASH_G: // Test for position at end of previous match
1177 if (!((fMatch && fp->fInputIdx==fMatchEnd) || fMatch==FALSE && fp->fInputIdx==0)) {
1178 fp = (REStackFrame *)fStack->popFrame(frameSize);
1179 }
1180 break;
1181
1182
1183 case URX_BACKSLASH_X:
1184 // Match a Grapheme, as defined by Unicode TR 29.
1185 // Differs slightly from Perl, which consumes combining marks independently
1186 // of context.
1187 {
1188
1189 // Fail if at end of input
1190 if (fp->fInputIdx >= inputLen) {
1191 fp = (REStackFrame *)fStack->popFrame(frameSize);
1192 break;
1193 }
1194
1195 // Examine (and consume) the current char.
1196 // Dispatch into a little state machine, based on the char.
1197 UChar32 c;
1198 U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
1199 UnicodeSet **sets = fPattern->fStaticSets;
1200 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
1201 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
1202 if (sets[URX_GC_L]->contains(c)) goto GC_L;
1203 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
1204 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
1205 if (sets[URX_GC_V]->contains(c)) goto GC_V;
1206 if (sets[URX_GC_T]->contains(c)) goto GC_T;
1207 goto GC_Extend;
1208
1209
1210
1211GC_L:
1212 if (fp->fInputIdx >= inputLen) goto GC_Done;
1213 U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
1214 if (sets[URX_GC_L]->contains(c)) goto GC_L;
1215 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
1216 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
1217 if (sets[URX_GC_V]->contains(c)) goto GC_V;
1218 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
1219 goto GC_Extend;
1220
1221GC_V:
1222 if (fp->fInputIdx >= inputLen) goto GC_Done;
1223 U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
1224 if (sets[URX_GC_V]->contains(c)) goto GC_V;
1225 if (sets[URX_GC_T]->contains(c)) goto GC_T;
1226 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
1227 goto GC_Extend;
1228
1229GC_T:
1230 if (fp->fInputIdx >= inputLen) goto GC_Done;
1231 U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
1232 if (sets[URX_GC_T]->contains(c)) goto GC_T;
1233 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
1234 goto GC_Extend;
1235
1236GC_Extend:
1237 // Combining characters are consumed here
1238 for (;;) {
1239 if (fp->fInputIdx >= inputLen) {
1240 break;
1241 }
1242 U16_GET(inputBuf, 0, fp->fInputIdx, inputLen, c);
1243 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
1244 break;
1245 }
1246 U16_FWD_1(inputBuf, fp->fInputIdx, inputLen);
1247 }
1248 goto GC_Done;
1249
1250GC_Control:
1251 // Most control chars stand alone (don't combine with combining chars),
1252 // except that a CR/LF sequence is a single grapheme cluster.
1253 if (c == 0x0d && fp->fInputIdx < inputLen && inputBuf[fp->fInputIdx] == 0x0a) {
1254 fp->fInputIdx++;
1255 }
1256
1257GC_Done:
1258 break;
1259 }
1260
1261
1262
1263
1264 case URX_BACKSLASH_Z: // Test for end of line
1265 if (fp->fInputIdx < inputLen) {
1266 fp = (REStackFrame *)fStack->popFrame(frameSize);
1267 }
1268 break;
1269
1270
1271
1272 case URX_STATIC_SETREF:
1273 {
1274 // Test input character against one of the predefined sets
1275 // (Word Characters, for example)
1276 // The high bit of the op value is a flag for the match polarity.
1277 // 0: success if input char is in set.
1278 // 1: success if input char is not in set.
1279 if (fp->fInputIdx >= inputLen) {
1280 fp = (REStackFrame *)fStack->popFrame(frameSize);
1281 break;
1282 }
1283
1284 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
1285 opValue &= ~URX_NEG_SET;
1286 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
1287 UChar32 c;
1288 U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
1289 if (c < 256) {
1290 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
1291 if (s8->contains(c)) {
1292 success = !success;
1293 }
1294 } else {
1295 const UnicodeSet *s = fPattern->fStaticSets[opValue];
1296 if (s->contains(c)) {
1297 success = !success;
1298 }
1299 }
1300 if (!success) {
1301 fp = (REStackFrame *)fStack->popFrame(frameSize);
1302 }
1303 }
1304 break;
1305
1306
1307 case URX_STAT_SETREF_N:
1308 {
1309 // Test input character for NOT being a member of one of
1310 // the predefined sets (Word Characters, for example)
1311 if (fp->fInputIdx >= inputLen) {
1312 fp = (REStackFrame *)fStack->popFrame(frameSize);
1313 break;
1314 }
1315
1316 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
1317 UChar32 c;
1318 U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
1319 if (c < 256) {
1320 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
1321 if (s8->contains(c) == FALSE) {
1322 break;
1323 }
1324 } else {
1325 const UnicodeSet *s = fPattern->fStaticSets[opValue];
1326 if (s->contains(c) == FALSE) {
1327 break;
1328 }
1329 }
1330
1331 fp = (REStackFrame *)fStack->popFrame(frameSize);
1332 }
1333 break;
1334
1335
1336 case URX_SETREF:
1337 if (fp->fInputIdx < inputLen) {
1338 // There is input left. Pick up one char and test it for set membership.
1339 UChar32 c;
1340 U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
1341 U_ASSERT(opValue > 0 && opValue < sets->size());
1342 if (c<256) {
1343 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
1344 if (s8->contains(c)) {
1345 break;
1346 }
1347 } else {
1348
1349 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
1350 if (s->contains(c)) {
1351 // The character is in the set. A Match.
1352 break;
1353 }
1354 }
1355 }
1356 // Either at end of input, or the character wasn't in the set.
1357 // Either way, we need to back track out.
1358 fp = (REStackFrame *)fStack->popFrame(frameSize);
1359 break;
1360
1361
1362 case URX_DOTANY:
1363 {
1364 // . matches anything, but stops at end-of-line.
1365 if (fp->fInputIdx >= inputLen) {
1366 // At end of input. Match failed. Backtrack out.
1367 fp = (REStackFrame *)fStack->popFrame(frameSize);
1368 break;
1369 }
1370 // There is input left. Advance over one char, unless we've hit end-of-line
1371 UChar32 c;
1372 U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
1373 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
1374 (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029)) {
1375 // End of line in normal mode. . does not match.
1376 fp = (REStackFrame *)fStack->popFrame(frameSize);
1377 break;
1378 }
1379 }
1380 break;
1381
1382
1383 case URX_DOTANY_ALL:
1384 {
1385 // ., in dot-matches-all (including new lines) mode
1386 if (fp->fInputIdx >= inputLen) {
1387 // At end of input. Match failed. Backtrack out.
1388 fp = (REStackFrame *)fStack->popFrame(frameSize);
1389 break;
1390 }
1391 // There is input left. Advance over one char, except if we are
1392 // at a cr/lf, advance over both of them.
1393 UChar32 c;
1394 U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
1395 if (c==0x0d) {
1396 // In the case of a CR/LF, we need to advance over both.
1397 UChar nextc = inputBuf[fp->fInputIdx];
1398 if (nextc == 0x0a) {
1399 fp->fInputIdx++;
1400 }
1401 }
1402 }
1403 break;
1404
1405 case URX_DOTANY_PL:
1406 // Match all up to an end-of-line or end-of-input.
1407 {
1408 // Fail if input already exhausted.
1409 if (fp->fInputIdx >= inputLen) {
1410 fp = (REStackFrame *)fStack->popFrame(frameSize);
1411 break;
1412 }
1413
1414 // There is input left. Fail if we are at the end of a line.
1415 UChar32 c;
1416 U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
1417 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
1418 (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029)) {
1419 // End of line in normal mode. . does not match.
1420 fp = (REStackFrame *)fStack->popFrame(frameSize);
1421 break;
1422 }
1423
1424 // There was input left. Consume it until we hit the end of a line,
1425 // or until it's exhausted.
1426 while (fp->fInputIdx < inputLen) {
1427 U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
1428 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
1429 (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029)) {
1430 U16_BACK_1(inputBuf, 0, fp->fInputIdx)
1431 // Scan has reached a line-end. We are done.
1432 break;
1433 }
1434 }
1435 }
1436 break;
1437
1438 case URX_DOTANY_ALL_PL:
1439 {
1440 // Match up to end of input. Fail if already at end of input.
1441 if (fp->fInputIdx >= inputLen) {
1442 fp = (REStackFrame *)fStack->popFrame(frameSize);
1443 } else {
1444 fp->fInputIdx = inputLen;
1445 }
1446 }
1447 break;
1448
1449
1450 case URX_JMP:
1451 fp->fPatIdx = opValue;
1452 break;
1453
1454 case URX_FAIL:
1455 isMatch = FALSE;
1456 goto breakFromLoop;
1457
1458 case URX_JMP_SAV:
1459 U_ASSERT(opValue < fPattern->fCompiledPat->size());
1460 fp = StateSave(fp, fp->fPatIdx, frameSize, status); // State save to loc following current
1461 fp->fPatIdx = opValue; // Then JMP.
1462 break;
1463
1464 case URX_JMP_SAV_X:
1465 // This opcode is used with (x)+, when x can match a zero length string.
1466 // Same as JMP_SAV, except conditional on the match having made forward progress.
1467 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
1468 // data address of the input position at the start of the loop.
1469 {
1470 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
1471 int32_t stoOp = pat[opValue-1];
1472 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
1473 int32_t frameLoc = URX_VAL(stoOp);
1474 U_ASSERT(frameLoc >= 0 && frameLoc < frameSize);
1475 int32_t prevInputIdx = fp->fExtra[frameLoc];
1476 U_ASSERT(prevInputIdx <= fp->fInputIdx);
1477 if (prevInputIdx < fp->fInputIdx) {
1478 // The match did make progress. Repeat the loop.
1479 fp = StateSave(fp, fp->fPatIdx, frameSize, status); // State save to loc following current
1480 fp->fPatIdx = opValue;
1481 fp->fExtra[frameLoc] = fp->fInputIdx;
1482 }
1483 // If the input position did not advance, we do nothing here,
1484 // execution will fall out of the loop.
1485 }
1486 break;
1487
1488 case URX_CTR_INIT:
1489 {
1490 U_ASSERT(opValue >= 0 && opValue < frameSize-2);
1491 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
1492
1493 // Pick up the three extra operands that CTR_INIT has, and
1494 // skip the pattern location counter past
1495 int32_t instrOperandLoc = fp->fPatIdx;
1496 fp->fPatIdx += 3;
1497 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
1498 int32_t minCount = pat[instrOperandLoc+1];
1499 int32_t maxCount = pat[instrOperandLoc+2];
1500 U_ASSERT(minCount>=0);
1501 U_ASSERT(maxCount>=minCount || maxCount==-1);
1502 U_ASSERT(loopLoc>fp->fPatIdx);
1503
1504 if (minCount == 0) {
1505 fp = StateSave(fp, loopLoc+1, frameSize, status);
1506 }
1507 if (maxCount == 0) {
1508 fp = (REStackFrame *)fStack->popFrame(frameSize);
1509 }
1510 }
1511 break;
1512
1513 case URX_CTR_LOOP:
1514 {
1515 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
1516 int32_t initOp = pat[opValue];
1517 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
1518 int32_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
1519 int32_t minCount = pat[opValue+2];
1520 int32_t maxCount = pat[opValue+3];
1521 // Increment the counter. Note: we're not worrying about counter
1522 // overflow, since the data comes from UnicodeStrings, which
1523 // stores its length in an int32_t.
1524 (*pCounter)++;
1525 U_ASSERT(*pCounter > 0);
1526 if ((uint32_t)*pCounter >= (uint32_t)maxCount) {
1527 U_ASSERT(*pCounter == maxCount || maxCount == -1);
1528 break;
1529 }
1530 if (*pCounter >= minCount) {
1531 fp = StateSave(fp, fp->fPatIdx, frameSize, status);
1532 }
1533 fp->fPatIdx = opValue + 4; // Loop back.
1534 }
1535 break;
1536
1537 case URX_CTR_INIT_NG:
1538 {
1539 U_ASSERT(opValue >= 0 && opValue < frameSize-2);
1540 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
1541
1542 // Pick up the three extra operands that CTR_INIT has, and
1543 // skip the pattern location counter past
1544 int32_t instrOperandLoc = fp->fPatIdx;
1545 fp->fPatIdx += 3;
1546 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
1547 int32_t minCount = pat[instrOperandLoc+1];
1548 int32_t maxCount = pat[instrOperandLoc+2];
1549 U_ASSERT(minCount>=0);
1550 U_ASSERT(maxCount>=minCount || maxCount==-1);
1551 U_ASSERT(loopLoc>fp->fPatIdx);
1552
1553 if (minCount == 0) {
1554 if (maxCount != 0) {
1555 fp = StateSave(fp, fp->fPatIdx, frameSize, status);
1556 }
1557 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
1558 }
1559 }
1560 break;
1561
1562 case URX_CTR_LOOP_NG:
1563 {
1564 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
1565 int32_t initOp = pat[opValue];
1566 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
1567 int32_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
1568 int32_t minCount = pat[opValue+2];
1569 int32_t maxCount = pat[opValue+3];
1570 // Increment the counter. Note: we're not worrying about counter
1571 // overflow, since the data comes from UnicodeStrings, which
1572 // stores its length in an int32_t.
1573 (*pCounter)++;
1574 U_ASSERT(*pCounter > 0);
1575
1576 if ((uint32_t)*pCounter >= (uint32_t)maxCount) {
1577 // The loop has matched the maximum permitted number of times.
1578 // Break out of here with no action. Matching will
1579 // continue with the following pattern.
1580 U_ASSERT(*pCounter == maxCount || maxCount == -1);
1581 break;
1582 }
1583
1584 if (*pCounter < minCount) {
1585 // We haven't met the minimum number of matches yet.
1586 // Loop back for another one.
1587 fp->fPatIdx = opValue + 4; // Loop back.
1588 } else {
1589 // We do have the minimum number of matches.
1590 // Fall into the following pattern, but first do
1591 // a state save to the top of the loop, so that a failure
1592 // in the following pattern will try another iteration of the loop.
1593 fp = StateSave(fp, opValue + 4, frameSize, status);
1594 }
1595 }
1596 break;
1597
1598 // TODO: Possessive flavor of loop ops, or take them out if no longer needed.
1599
1600 case URX_STO_SP:
1601 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
1602 fData[opValue] = fStack->size();
1603 break;
1604
1605 case URX_LD_SP:
1606 {
1607 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
1608 int32_t newStackSize = fData[opValue];
1609 U_ASSERT(newStackSize <= fStack->size());
1610 int32_t *newFP = fStack->getBuffer() + newStackSize - frameSize;
1611 if (newFP == (int32_t *)fp) {
1612 break;
1613 }
1614 int32_t i;
1615 for (i=0; i<frameSize; i++) {
1616 newFP[i] = ((int32_t *)fp)[i];
1617 }
1618 fp = (REStackFrame *)newFP;
1619 fStack->setSize(newStackSize);
1620 }
1621 break;
1622
1623 case URX_BACKREF:
1624 case URX_BACKREF_I:
1625 {
1626 U_ASSERT(opValue < frameSize);
1627 int32_t groupStartIdx = fp->fExtra[opValue];
1628 int32_t groupEndIdx = fp->fExtra[opValue+1];
1629 U_ASSERT(groupStartIdx <= groupEndIdx);
1630 int32_t len = groupEndIdx-groupStartIdx;
1631 if (groupStartIdx < 0) {
1632 // This capture group has not participated in the match thus far,
1633 fp = (REStackFrame *)fStack->popFrame(frameSize); // FAIL, no match.
1634 }
1635
1636 if (len == 0) {
1637 // The capture group match was of an empty string.
1638 // Verified by testing: Perl matches succeed in this case, so
1639 // we do too.
1640 break;
1641 }
1642 /*
1643 if ((fp->fInputIdx + len > inputLen) ||
1644 u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, len) != 0) {
1645 fp = (REStackFrame *)fStack->popFrame(frameSize); // FAIL, no match.
1646 } else {
1647 fp->fInputIdx += len; // Match. Advance current input position.
1648 }
1649 */
1650 UBool haveMatch = FALSE;
1651 if (fp->fInputIdx + len <= inputLen) {
1652 if (opType == URX_BACKREF) {
1653 if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, len) == 0) {
1654 haveMatch = TRUE;
1655 }
1656 } else {
1657 if (u_strncasecmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx,
1658 len, U_FOLD_CASE_DEFAULT) == 0) {
1659 haveMatch = TRUE;
1660 }
1661 }
1662 }
1663 if (haveMatch) {
1664 fp->fInputIdx += len; // Match. Advance current input position.
1665 } else {
1666 fp = (REStackFrame *)fStack->popFrame(frameSize); // FAIL, no match.
1667 }
1668 }
1669 break;
1670
1671 case URX_STO_INP_LOC:
1672 {
1673 U_ASSERT(opValue >= 0 && opValue < frameSize);
1674 fp->fExtra[opValue] = fp->fInputIdx;
1675 }
1676 break;
1677
1678 case URX_JMPX:
1679 {
1680 int32_t instrOperandLoc = fp->fPatIdx;
1681 fp->fPatIdx += 1;
1682 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
1683 U_ASSERT(dataLoc >= 0 && dataLoc < frameSize);
1684 int32_t savedInputIdx = fp->fExtra[dataLoc];
1685 U_ASSERT(savedInputIdx <= fp->fInputIdx);
1686 if (savedInputIdx < fp->fInputIdx) {
1687 fp->fPatIdx = opValue; // JMP
1688 } else {
1689 fp = (REStackFrame *)fStack->popFrame(frameSize); // FAIL, no progress in loop.
1690 }
1691 }
1692 break;
1693
1694 case URX_LA_START:
1695 {
1696 // Entering a lookahead block.
1697 // Save Stack Ptr, Input Pos.
1698 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
1699 fData[opValue] = fStack->size();
1700 fData[opValue+1] = fp->fInputIdx;
1701 }
1702 break;
1703
1704 case URX_LA_END:
1705 {
1706 // Leaving a look-ahead block.
1707 // restore Stack Ptr, Input Pos to positions they had on entry to block.
1708 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
1709 int32_t stackSize = fStack->size();
1710 int32_t newStackSize = fData[opValue];
1711 U_ASSERT(stackSize >= newStackSize);
1712 if (stackSize > newStackSize) {
1713 int32_t *newFP = fStack->getBuffer() + newStackSize - frameSize;
1714 int32_t i;
1715 for (i=0; i<frameSize; i++) {
1716 newFP[i] = ((int32_t *)fp)[i];
1717 }
1718 fp = (REStackFrame *)newFP;
1719 fStack->setSize(newStackSize);
1720 }
1721 fp->fInputIdx = fData[opValue+1];
1722 }
1723 break;
1724
1725 case URX_ONECHAR_I:
1726 if (fp->fInputIdx < inputLen) {
1727 UChar32 c;
1728 U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
1729 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
1730 break;
1731 }
1732 }
1733 fp = (REStackFrame *)fStack->popFrame(frameSize);
1734 break;
1735
1736 case URX_STRING_I:
1737 {
1738 // Test input against a literal string.
1739 // Strings require two slots in the compiled pattern, one for the
1740 // offset to the string text, and one for the length.
1741 int32_t stringStartIdx, stringLen;
1742 stringStartIdx = opValue;
1743
1744 op = pat[fp->fPatIdx];
1745 fp->fPatIdx++;
1746 opType = URX_TYPE(op);
1747 opValue = URX_VAL(op);
1748 U_ASSERT(opType == URX_STRING_LEN);
1749 stringLen = opValue;
1750
1751 int32_t stringEndIndex = fp->fInputIdx + stringLen;
1752 if (stringEndIndex <= inputLen &&
1753 u_strncasecmp(inputBuf+fp->fInputIdx, litText+stringStartIdx,
1754 stringLen, U_FOLD_CASE_DEFAULT) == 0) {
1755 // Success. Advance the current input position.
1756 fp->fInputIdx = stringEndIndex;
1757 } else {
1758 // No match. Back up matching to a saved state
1759 fp = (REStackFrame *)fStack->popFrame(frameSize);
1760 }
1761 }
1762 break;
1763
1764 case URX_LB_START:
1765 {
1766 // Entering a look-behind block.
1767 // Save Stack Ptr, Input Pos.
1768 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
1769 fData[opValue] = fStack->size();
1770 fData[opValue+1] = fp->fInputIdx;
1771 // Init the variable containing the start index for attempted matches.
1772 fData[opValue+2] = -1;
1773 // Save input string length, then reset to pin any matches to end at
1774 // the current position.
1775 fData[opValue+3] = inputLen;
1776 inputLen = fp->fInputIdx;
1777 }
1778 break;
1779
1780
1781 case URX_LB_CONT:
1782 {
1783 // Positive Look-Behind, at top of loop checking for matches of LB expression
1784 // at all possible input starting positions.
1785
1786 // Fetch the min and max possible match lengths. They are the operands
1787 // of this op in the pattern.
1788 int32_t minML = pat[fp->fPatIdx++];
1789 int32_t maxML = pat[fp->fPatIdx++];
1790 U_ASSERT(minML <= maxML);
1791 U_ASSERT(minML >= 0);
1792
1793 // Fetch (from data) the last input index where a match was attempted.
1794 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
1795 int32_t *lbStartIdx = &fData[opValue+2];
1796 if (*lbStartIdx < 0) {
1797 // First time through loop.
1798 *lbStartIdx = fp->fInputIdx - minML;
1799 } else {
1800 // 2nd through nth time through the loop.
1801 // Back up start position for match by one.
1802 if (*lbStartIdx == 0) {
1803 (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0.
1804 } else {
1805 U16_BACK_1(inputBuf, 0, *lbStartIdx);
1806 }
1807 }
1808
1809 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
1810 // We have tried all potential match starting points without
1811 // getting a match. Backtrack out, and out of the
1812 // Look Behind altogether.
1813 fp = (REStackFrame *)fStack->popFrame(frameSize);
1814 int32_t restoreInputLen = fData[opValue+3];
1815 U_ASSERT(restoreInputLen >= inputLen);
1816 U_ASSERT(restoreInputLen <= fInput->length());
1817 inputLen = restoreInputLen;
1818 break;
1819 }
1820
1821 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
1822 // (successful match will fall off the end of the loop.)
1823 fp = StateSave(fp, fp->fPatIdx-3, frameSize, status);
1824 fp->fInputIdx = *lbStartIdx;
1825 }
1826 break;
1827
1828 case URX_LB_END:
1829 // End of a look-behind block, after a successful match.
1830 {
1831 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
1832 if (fp->fInputIdx != inputLen) {
1833 // The look-behind expression matched, but the match did not
1834 // extend all the way to the point that we are looking behind from.
1835 // FAIL out of here, which will take us back to the LB_CONT, which
1836 // will retry the match starting at another position or fail
1837 // the look-behind altogether, whichever is appropriate.
1838 fp = (REStackFrame *)fStack->popFrame(frameSize);
1839 break;
1840 }
1841
1842 // Look-behind match is good. Restore the original input string length,
1843 // which had been truncated to pin the end of the lookbehind match to the
1844 // position being looked-behind.
1845 int32_t originalInputLen = fData[opValue+3];
1846 U_ASSERT(originalInputLen >= inputLen);
1847 U_ASSERT(originalInputLen <= fInput->length());
1848 inputLen = originalInputLen;
1849 }
1850 break;
1851
1852
1853 case URX_LBN_CONT:
1854 {
1855 // Negative Look-Behind, at top of loop checking for matches of LB expression
1856 // at all possible input starting positions.
1857
1858 // Fetch the extra parameters of this op.
1859 int32_t minML = pat[fp->fPatIdx++];
1860 int32_t maxML = pat[fp->fPatIdx++];
1861 int32_t continueLoc = pat[fp->fPatIdx++];
1862 continueLoc = URX_VAL(continueLoc);
1863 U_ASSERT(minML <= maxML);
1864 U_ASSERT(minML >= 0);
1865 U_ASSERT(continueLoc > fp->fPatIdx);
1866
1867 // Fetch (from data) the last input index where a match was attempted.
1868 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
1869 int32_t *lbStartIdx = &fData[opValue+2];
1870 if (*lbStartIdx < 0) {
1871 // First time through loop.
1872 *lbStartIdx = fp->fInputIdx - minML;
1873 } else {
1874 // 2nd through nth time through the loop.
1875 // Back up start position for match by one.
1876 if (*lbStartIdx == 0) {
1877 (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0.
1878 } else {
1879 U16_BACK_1(inputBuf, 0, *lbStartIdx);
1880 }
1881 }
1882
1883 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
1884 // We have tried all potential match starting points without
1885 // getting a match, which means that the negative lookbehind as
1886 // a whole has succeeded. Jump forward to the continue location
1887 int32_t restoreInputLen = fData[opValue+3];
1888 U_ASSERT(restoreInputLen >= inputLen);
1889 U_ASSERT(restoreInputLen <= fInput->length());
1890 inputLen = restoreInputLen;
1891 fp->fPatIdx = continueLoc;
1892 break;
1893 }
1894
1895 // Save state to this URX_LBN_CONT op, so failure to match will repeat the loop.
1896 // (successful match will cause a FAIL out of the loop altogether.)
1897 fp = StateSave(fp, fp->fPatIdx-4, frameSize, status);
1898 fp->fInputIdx = *lbStartIdx;
1899 }
1900 break;
1901
1902 case URX_LBN_END:
1903 // End of a negative look-behind block, after a successful match.
1904 {
1905 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
1906 if (fp->fInputIdx != inputLen) {
1907 // The look-behind expression matched, but the match did not
1908 // extend all the way to the point that we are looking behind from.
1909 // FAIL out of here, which will take us back to the LB_CONT, which
1910 // will retry the match starting at another position or succeed
1911 // the look-behind altogether, whichever is appropriate.
1912 fp = (REStackFrame *)fStack->popFrame(frameSize);
1913 break;
1914 }
1915
1916 // Look-behind expression matched, which means look-behind test as
1917 // a whole Fails
1918
1919 // Restore the original input string length, which had been truncated
1920 // in order to pin the end of the lookbehind match
1921 // to the position being looked-behind.
1922 int32_t originalInputLen = fData[opValue+3];
1923 U_ASSERT(originalInputLen >= inputLen);
1924 U_ASSERT(originalInputLen <= fInput->length());
1925 inputLen = originalInputLen;
1926
1927 // Restore original stack position, discarding any state saved
1928 // by the successful pattern match.
1929 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
1930 int32_t newStackSize = fData[opValue];
1931 U_ASSERT(fStack->size() > newStackSize);
1932 fStack->setSize(newStackSize);
1933
1934 // FAIL, which will take control back to someplace
1935 // prior to entering the look-behind test.
1936 fp = (REStackFrame *)fStack->popFrame(frameSize);
1937 }
1938 break;
1939
1940
1941 case URX_LOOP_SR_I:
1942 // Loop Initialization for the optimized implementation of
1943 // [some character set]*
1944 // This op scans through all matching input.
1945 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
1946 {
1947 U_ASSERT(opValue > 0 && opValue < sets->size());
1948 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
1949 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
1950
1951 // Loop through input, until either the input is exhausted or
1952 // we reach a character that is not a member of the set.
1953 int32_t ix = fp->fInputIdx;
1954 for (;;) {
1955 if (ix >= inputLen) {
1956 break;
1957 }
1958 UChar32 c;
1959 U16_NEXT(inputBuf, ix, inputLen, c);
1960 if (c<256) {
1961 if (s8->contains(c) == FALSE) {
1962 U16_BACK_1(inputBuf, 0, ix);
1963 break;
1964 }
1965 } else {
1966 if (s->contains(c) == FALSE) {
1967 U16_BACK_1(inputBuf, 0, ix);
1968 break;
1969 }
1970 }
1971 }
1972
1973 // If there were no matching characters, skip over the loop altogether.
1974 // The loop doesn't run at all, a * op always succeeds.
1975 if (ix == fp->fInputIdx) {
1976 fp->fPatIdx++; // skip the URX_LOOP_C op.
1977 break;
1978 }
1979
1980 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
1981 // must follow. Its operand is the stack location
1982 // that holds the starting input index for the match of this [set]*
1983 int32_t loopcOp = pat[fp->fPatIdx];
1984 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
1985 int32_t stackLoc = URX_VAL(loopcOp);
1986 U_ASSERT(stackLoc >= 0 && stackLoc < frameSize);
1987 fp->fExtra[stackLoc] = fp->fInputIdx;
1988 fp->fInputIdx = ix;
1989
1990 // Save State to the URX_LOOP_C op that follows this one,
1991 // so that match failures in the following code will return to there.
1992 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
1993 fp = StateSave(fp, fp->fPatIdx, frameSize, status);
1994 fp->fPatIdx++;
1995 }
1996 break;
1997
1998
1999 case URX_LOOP_DOT_I:
2000 // Loop Initialization for the optimized implementation of .*
2001 // This op scans through all remaining input.
2002 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
2003 {
2004 // Loop through input until the input is exhausted (we reach an end-of-line)
2005 // In multi-line mode, we can just go straight to the end of the input.
2006 int32_t ix = inputLen;
2007 if (opValue == 0) {
2008 // NOT multi-line mode. Line endings do not match '.'
2009 // Scan forward until a line ending or end of input.
2010 ix = fp->fInputIdx;
2011 for (;;) {
2012 if (ix >= inputLen) {
2013 break;
2014 }
2015 UChar32 c;
2016 U16_NEXT(inputBuf, ix, inputLen, c); // c = inputBuf[ix++]
2017 if (((c & 0x7f) <= 0x29) &&
2018 (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029)) {
2019 // char is a line ending. Put the input pos back to the
2020 // line ending char, and exit the scanning loop.
2021 U16_BACK_1(inputBuf, 0, ix);
2022 break;
2023 }
2024 }
2025 }
2026
2027 // If there were no matching characters, skip over the loop altogether.
2028 // The loop doesn't run at all, a * op always succeeds.
2029 if (ix == fp->fInputIdx) {
2030 fp->fPatIdx++; // skip the URX_LOOP_C op.
2031 break;
2032 }
2033
2034 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
2035 // must follow. Its operand is the stack location
2036 // that holds the starting input index for the match of this .*
2037 int32_t loopcOp = pat[fp->fPatIdx];
2038 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
2039 int32_t stackLoc = URX_VAL(loopcOp);
2040 U_ASSERT(stackLoc >= 0 && stackLoc < frameSize);
2041 fp->fExtra[stackLoc] = fp->fInputIdx;
2042 fp->fInputIdx = ix;
2043
2044 // Save State to the URX_LOOP_C op that follows this one,
2045 // so that match failures in the following code will return to there.
2046 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
2047 fp = StateSave(fp, fp->fPatIdx, frameSize, status);
2048 fp->fPatIdx++;
2049 }
2050 break;
2051
2052
2053 case URX_LOOP_C:
2054 {
2055 U_ASSERT(opValue>=0 && opValue<frameSize);
2056 int32_t terminalIdx = fp->fExtra[opValue];
2057 U_ASSERT(terminalIdx <= fp->fInputIdx);
2058 if (terminalIdx == fp->fInputIdx) {
2059 // We've backed up the input idx to the point that the loop started.
2060 // The loop is done. Leave here without saving state.
2061 // Subsequent failures won't come back here.
2062 break;
2063 }
2064 // Set up for the next iteration of the loop, with input index
2065 // backed up by one from the last time through,
2066 // and a state save to this instruction in case the following code fails again.
2067 // (We're going backwards because this loop emulates stack unwinding, not
2068 // the initial scan forward.)
2069 U_ASSERT(fp->fInputIdx > 0);
2070 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
2071 if (inputBuf[fp->fInputIdx] == 0x0a &&
2072 fp->fInputIdx > terminalIdx &&
2073 inputBuf[fp->fInputIdx-1] == 0x0d) {
2074 int32_t prevOp = pat[fp->fPatIdx-2];
2075 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
2076 // .*, stepping back over CRLF pair.
2077 fp->fInputIdx--;
2078 }
2079 }
2080
2081
2082 fp = StateSave(fp, fp->fPatIdx-1, frameSize, status);
2083 }
2084 break;
2085
2086
2087
2088 default:
2089 // Trouble. The compiled pattern contains an entry with an
2090 // unrecognized type tag.
2091 U_ASSERT(FALSE);
2092 }
2093
2094 if (U_FAILURE(status)) {
2095 break;
2096 }
2097 }
2098
2099breakFromLoop:
2100 fMatch = isMatch;
2101 if (isMatch) {
2102 fLastMatchEnd = fMatchEnd;
2103 fMatchStart = startIdx;
2104 fMatchEnd = fp->fInputIdx;
2105 if (fTraceDebug) {
2106 REGEX_RUN_DEBUG_PRINTF("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd);
2107 }
2108 }
2109 else
2110 {
2111 if (fTraceDebug) {
2112 REGEX_RUN_DEBUG_PRINTF("No match\n\n");
2113 }
2114 }
2115
2116 fFrame = fp; // The active stack frame when the engine stopped.
2117 // Contains the capture group results that we need to
2118 // access later.
2119
2120 return;
2121}
2122
2123
2124
// Out-of-class definition of the static member RegexMatcher::fgClassID,
// initialized to zero.
// NOTE(review): presumably only the address of this byte matters, serving as
// the per-class identifier for ICU's class-ID mechanism -- confirm against
// the class declaration in unicode/regex.h.
2125const char RegexMatcher::fgClassID = 0;
2126
2127U_NAMESPACE_END
2128
2129#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
2130