]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/rematch.cpp
ICU-400.40.tar.gz
[apple/icu.git] / icuSources / i18n / rematch.cpp
CommitLineData
b75a7d8f
A
1/*
2**************************************************************************
46f4442e 3* Copyright (C) 2002-2008 International Business Machines Corporation *
b75a7d8f
A
4* and others. All rights reserved. *
5**************************************************************************
6*/
46f4442e
A
7//
8// file: rematch.cpp
9//
10// Contains the implementation of class RegexMatcher,
11// which is one of the main API classes for the ICU regular expression package.
12//
b75a7d8f
A
13
14#include "unicode/utypes.h"
15#if !UCONFIG_NO_REGULAR_EXPRESSIONS
16
17#include "unicode/regex.h"
18#include "unicode/uniset.h"
19#include "unicode/uchar.h"
20#include "unicode/ustring.h"
374ca955 21#include "unicode/rbbi.h"
b75a7d8f
A
22#include "uassert.h"
23#include "cmemory.h"
24#include "uvector.h"
25#include "uvectr32.h"
26#include "regeximp.h"
27#include "regexst.h"
28
29// #include <malloc.h> // Needed for heapcheck testing
30
31U_NAMESPACE_BEGIN
32
46f4442e
A
33// Default limit for the size of the back track stack, to avoid system
34// failures causedby heap exhaustion. Units are in 32 bit words, not bytes.
35// This value puts ICU's limits higher than most other regexp implementations,
36// which use recursion rather than the heap, and take more storage per
37// backtrack point.
38//
39static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
40
41// Time limit counter constant.
42// Time limits for expression evaluation are in terms of quanta of work by
43// the engine, each of which is 10,000 state saves.
44// This constant determines that state saves per tick number.
45static const int32_t TIMER_INITIAL_VALUE = 10000;
46
b75a7d8f
A
47//-----------------------------------------------------------------------------
48//
49// Constructor and Destructor
50//
51//-----------------------------------------------------------------------------
52RegexMatcher::RegexMatcher(const RegexPattern *pat) {
46f4442e
A
53 fDeferredStatus = U_ZERO_ERROR;
54 init(fDeferredStatus);
55 if (U_FAILURE(fDeferredStatus)) {
56 return;
57 }
b75a7d8f
A
58 if (pat==NULL) {
59 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
60 return;
61 }
46f4442e
A
62 fPattern = pat;
63 init2(RegexStaticSets::gStaticSets->fEmptyString, fDeferredStatus);
b75a7d8f
A
64}
65
66
67
68RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
69 uint32_t flags, UErrorCode &status) {
46f4442e 70 init(status);
b75a7d8f
A
71 if (U_FAILURE(status)) {
72 return;
73 }
46f4442e
A
74 UParseError pe;
75 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
76 fPattern = fPatternOwned;
77 init2(input, status);
b75a7d8f
A
78}
79
80
81RegexMatcher::RegexMatcher(const UnicodeString &regexp,
82 uint32_t flags, UErrorCode &status) {
46f4442e 83 init(status);
b75a7d8f
A
84 if (U_FAILURE(status)) {
85 return;
86 }
46f4442e
A
87 UParseError pe;
88 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
89 fPattern = fPatternOwned;
90 init2(RegexStaticSets::gStaticSets->fEmptyString, status);
b75a7d8f
A
91}
92
93
94
46f4442e 95
b75a7d8f
A
96RegexMatcher::~RegexMatcher() {
97 delete fStack;
98 if (fData != fSmallData) {
374ca955 99 uprv_free(fData);
b75a7d8f
A
100 fData = NULL;
101 }
102 if (fPatternOwned) {
103 delete fPatternOwned;
104 fPatternOwned = NULL;
105 fPattern = NULL;
106 }
374ca955
A
107 #if UCONFIG_NO_BREAK_ITERATION==0
108 delete fWordBreakItr;
109 #endif
b75a7d8f
A
110}
111
46f4442e
A
112//
113// init() common initialization for use by all constructors.
114// Initialize all fields, get the object into a consistent state.
115// This must be done even when the initial status shows an error,
116// so that the object is initialized sufficiently well for the destructor
117// to run safely.
118//
119void RegexMatcher::init(UErrorCode &status) {
120 fPattern = NULL;
121 fPatternOwned = NULL;
122 fInput = NULL;
123 fFrameSize = 0;
124 fRegionStart = 0;
125 fRegionLimit = 0;
126 fAnchorStart = 0;
127 fAnchorLimit = 0;
128 fLookStart = 0;
129 fLookLimit = 0;
130 fActiveStart = 0;
131 fActiveLimit = 0;
132 fTransparentBounds = FALSE;
133 fAnchoringBounds = TRUE;
134 fMatch = FALSE;
135 fMatchStart = 0;
136 fMatchEnd = 0;
137 fLastMatchEnd = -1;
138 fAppendPosition = 0;
139 fHitEnd = FALSE;
140 fRequireEnd = FALSE;
141 fStack = NULL;
142 fFrame = NULL;
143 fTimeLimit = 0;
144 fTime = 0;
145 fTickCounter = 0;
146 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY;
147 fCallbackFn = NULL;
148 fCallbackContext = NULL;
149 fTraceDebug = FALSE;
150 fDeferredStatus = status;
151 fData = fSmallData;
152 fWordBreakItr = NULL;
153
154 fStack = new UVector32(status);
155 if (U_FAILURE(status)) {
156 fDeferredStatus = status;
157 }
158}
159
160//
161// init2() Common initialization for use by RegexMatcher constructors, part 2.
162// This handles the common setup to be done after the Pattern is available.
163//
164void RegexMatcher::init2(const UnicodeString &input, UErrorCode &status) {
165 if (U_FAILURE(status)) {
166 fDeferredStatus = status;
167 return;
168 }
169
170 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) {
171 fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t));
172 if (fData == NULL) {
173 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
174 return;
175 }
176 }
177
178 reset(input);
179 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
180 if (U_FAILURE(status)) {
181 fDeferredStatus = status;
182 return;
183 }
184}
b75a7d8f
A
185
186
187static const UChar BACKSLASH = 0x5c;
188static const UChar DOLLARSIGN = 0x24;
189//--------------------------------------------------------------------------------
190//
191// appendReplacement
192//
193//--------------------------------------------------------------------------------
194RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
195 const UnicodeString &replacement,
196 UErrorCode &status) {
197 if (U_FAILURE(status)) {
198 return *this;
199 }
200 if (U_FAILURE(fDeferredStatus)) {
201 status = fDeferredStatus;
202 return *this;
203 }
204 if (fMatch == FALSE) {
205 status = U_REGEX_INVALID_STATE;
206 return *this;
207 }
208
209 // Copy input string from the end of previous match to start of current match
46f4442e 210 int32_t len = fMatchStart-fAppendPosition;
b75a7d8f 211 if (len > 0) {
46f4442e 212 dest.append(*fInput, fAppendPosition, len);
b75a7d8f 213 }
46f4442e 214 fAppendPosition = fMatchEnd;
b75a7d8f
A
215
216
217 // scan the replacement text, looking for substitutions ($n) and \escapes.
218 // TODO: optimize this loop by efficiently scanning for '$' or '\',
219 // move entire ranges not containing substitutions.
220 int32_t replLen = replacement.length();
221 int32_t replIdx = 0;
222 while (replIdx<replLen) {
223 UChar c = replacement.charAt(replIdx);
224 replIdx++;
225 if (c == BACKSLASH) {
226 // Backslash Escape. Copy the following char out without further checks.
227 // Note: Surrogate pairs don't need any special handling
228 // The second half wont be a '$' or a '\', and
229 // will move to the dest normally on the next
230 // loop iteration.
231 if (replIdx >= replLen) {
232 break;
233 }
234 c = replacement.charAt(replIdx);
235
236 if (c==0x55/*U*/ || c==0x75/*u*/) {
237 // We have a \udddd or \Udddddddd escape sequence.
238 UChar32 escapedChar = replacement.unescapeAt(replIdx);
239 if (escapedChar != (UChar32)0xFFFFFFFF) {
240 dest.append(escapedChar);
b75a7d8f
A
241 // TODO: Report errors for mal-formed \u escapes?
242 // As this is, the original sequence is output, which may be OK.
243 continue;
244 }
245 }
246
247 // Plain backslash escape. Just put out the escaped character.
248 dest.append(c);
249 replIdx++;
250 continue;
251 }
252
253 if (c != DOLLARSIGN) {
254 // Normal char, not a $. Copy it out without further checks.
255 dest.append(c);
256 continue;
257 }
258
259 // We've got a $. Pick up a capture group number if one follows.
260 // Consume at most the number of digits necessary for the largest capture
261 // number that is valid for this pattern.
262
263 int32_t numDigits = 0;
264 int32_t groupNum = 0;
265 UChar32 digitC;
266 for (;;) {
267 if (replIdx >= replLen) {
268 break;
269 }
270 digitC = replacement.char32At(replIdx);
271 if (u_isdigit(digitC) == FALSE) {
272 break;
273 }
274 replIdx = replacement.moveIndex32(replIdx, 1);
275 groupNum=groupNum*10 + u_charDigitValue(digitC);
276 numDigits++;
277 if (numDigits >= fPattern->fMaxCaptureDigits) {
278 break;
279 }
280 }
281
282
283 if (numDigits == 0) {
284 // The $ didn't introduce a group number at all.
285 // Treat it as just part of the substitution text.
286 dest.append(DOLLARSIGN);
287 continue;
288 }
289
290 // Finally, append the capture group data to the destination.
291 dest.append(group(groupNum, status));
292 if (U_FAILURE(status)) {
293 // Can fail if group number is out of range.
294 break;
295 }
296
297 }
298
299 return *this;
300}
301
302
303
304//--------------------------------------------------------------------------------
305//
306// appendTail Intended to be used in conjunction with appendReplacement()
307// To the destination string, append everything following
308// the last match position from the input string.
309//
46f4442e
A
310// Note: Match ranges do not affect appendTail or appendReplacement
311//
b75a7d8f
A
312//--------------------------------------------------------------------------------
313UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
46f4442e 314 int32_t len = fInput->length() - fAppendPosition;
b75a7d8f 315 if (len > 0) {
46f4442e 316 dest.append(*fInput, fAppendPosition, len);
b75a7d8f
A
317 }
318 return dest;
319}
320
321
322
323//--------------------------------------------------------------------------------
324//
325// end
326//
327//--------------------------------------------------------------------------------
328int32_t RegexMatcher::end(UErrorCode &err) const {
329 return end(0, err);
330}
331
332
333
73c04bcf 334int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
b75a7d8f
A
335 if (U_FAILURE(err)) {
336 return -1;
337 }
338 if (fMatch == FALSE) {
339 err = U_REGEX_INVALID_STATE;
340 return -1;
341 }
342 if (group < 0 || group > fPattern->fGroupMap->size()) {
343 err = U_INDEX_OUTOFBOUNDS_ERROR;
344 return -1;
345 }
346 int32_t e = -1;
347 if (group == 0) {
348 e = fMatchEnd;
349 } else {
350 // Get the position within the stack frame of the variables for
351 // this capture group.
352 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
353 U_ASSERT(groupOffset < fPattern->fFrameSize);
354 U_ASSERT(groupOffset >= 0);
355 e = fFrame->fExtra[groupOffset + 1];
356 }
357 return e;
358}
359
360
361
362//--------------------------------------------------------------------------------
363//
364// find()
365//
366//--------------------------------------------------------------------------------
367UBool RegexMatcher::find() {
368 // Start at the position of the last match end. (Will be zero if the
369 // matcher has been reset.
370 //
371 if (U_FAILURE(fDeferredStatus)) {
372 return FALSE;
373 }
374
375 int32_t startPos = fMatchEnd;
46f4442e
A
376 if (startPos==0) {
377 startPos = fActiveStart;
378 }
374ca955
A
379
380 if (fMatch) {
381 // Save the position of any previous successful match.
382 fLastMatchEnd = fMatchEnd;
383
384 if (fMatchStart == fMatchEnd) {
385 // Previous match had zero length. Move start position up one position
386 // to avoid sending find() into a loop on zero-length matches.
46f4442e 387 if (startPos >= fActiveLimit) {
374ca955 388 fMatch = FALSE;
46f4442e 389 fHitEnd = TRUE;
374ca955
A
390 return FALSE;
391 }
392 startPos = fInput->moveIndex32(startPos, 1);
393 }
394 } else {
395 if (fLastMatchEnd >= 0) {
396 // A previous find() failed to match. Don't try again.
397 // (without this test, a pattern with a zero-length match
398 // could match again at the end of an input string.)
46f4442e 399 fHitEnd = TRUE;
374ca955
A
400 return FALSE;
401 }
402 }
403
374ca955
A
404
405 // Compute the position in the input string beyond which a match can not begin, because
406 // the minimum length match would extend past the end of the input.
46f4442e
A
407 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
408 // Be aware of possible overflows if making changes here.
409 int32_t testLen = fActiveLimit - fPattern->fMinMatchLen;
b75a7d8f 410 if (startPos > testLen) {
374ca955 411 fMatch = FALSE;
46f4442e 412 fHitEnd = TRUE;
b75a7d8f
A
413 return FALSE;
414 }
415
416 const UChar *inputBuf = fInput->getBuffer();
417 UChar32 c;
418 U_ASSERT(startPos >= 0);
419
420 switch (fPattern->fStartType) {
421 case START_NO_INFO:
422 // No optimization was found.
423 // Try a match at each input position.
424 for (;;) {
46f4442e 425 MatchAt(startPos, FALSE, fDeferredStatus);
b75a7d8f
A
426 if (U_FAILURE(fDeferredStatus)) {
427 return FALSE;
428 }
429 if (fMatch) {
430 return TRUE;
431 }
432 if (startPos >= testLen) {
46f4442e 433 fHitEnd = TRUE;
b75a7d8f
A
434 return FALSE;
435 }
46f4442e 436 U16_FWD_1(inputBuf, startPos, fActiveLimit);
b75a7d8f
A
437 // Note that it's perfectly OK for a pattern to have a zero-length
438 // match at the end of a string, so we must make sure that the loop
439 // runs with startPos == testLen the last time through.
440 }
441 U_ASSERT(FALSE);
442
443 case START_START:
444 // Matches are only possible at the start of the input string
445 // (pattern begins with ^ or \A)
46f4442e 446 if (startPos > fActiveStart) {
374ca955 447 fMatch = FALSE;
b75a7d8f
A
448 return FALSE;
449 }
46f4442e 450 MatchAt(startPos, FALSE, fDeferredStatus);
b75a7d8f
A
451 if (U_FAILURE(fDeferredStatus)) {
452 return FALSE;
453 }
454 return fMatch;
455
456
457 case START_SET:
458 {
459 // Match may start on any char from a pre-computed set.
460 U_ASSERT(fPattern->fMinMatchLen > 0);
461 for (;;) {
462 int32_t pos = startPos;
46f4442e 463 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
b75a7d8f
A
464 if (c<256 && fPattern->fInitialChars8->contains(c) ||
465 c>=256 && fPattern->fInitialChars->contains(c)) {
46f4442e 466 MatchAt(pos, FALSE, fDeferredStatus);
b75a7d8f
A
467 if (U_FAILURE(fDeferredStatus)) {
468 return FALSE;
469 }
470 if (fMatch) {
471 return TRUE;
472 }
473 }
474 if (pos >= testLen) {
374ca955 475 fMatch = FALSE;
46f4442e 476 fHitEnd = TRUE;
b75a7d8f
A
477 return FALSE;
478 }
479 }
480 }
481 U_ASSERT(FALSE);
482
483 case START_STRING:
484 case START_CHAR:
485 {
486 // Match starts on exactly one char.
487 U_ASSERT(fPattern->fMinMatchLen > 0);
488 UChar32 theChar = fPattern->fInitialChar;
489 for (;;) {
490 int32_t pos = startPos;
46f4442e 491 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
b75a7d8f 492 if (c == theChar) {
46f4442e 493 MatchAt(pos, FALSE, fDeferredStatus);
b75a7d8f
A
494 if (U_FAILURE(fDeferredStatus)) {
495 return FALSE;
496 }
497 if (fMatch) {
498 return TRUE;
499 }
500 }
501 if (pos >= testLen) {
374ca955 502 fMatch = FALSE;
46f4442e 503 fHitEnd = TRUE;
b75a7d8f
A
504 return FALSE;
505 }
506 }
507 }
508 U_ASSERT(FALSE);
509
510 case START_LINE:
511 {
512 UChar32 c;
46f4442e
A
513 if (startPos == fAnchorStart) {
514 MatchAt(startPos, FALSE, fDeferredStatus);
b75a7d8f
A
515 if (U_FAILURE(fDeferredStatus)) {
516 return FALSE;
517 }
518 if (fMatch) {
519 return TRUE;
520 }
46f4442e 521 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
b75a7d8f
A
522 }
523
46f4442e
A
524 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
525 for (;;) {
526 c = inputBuf[startPos-1];
527 if (c == 0x0a) {
528 MatchAt(startPos, FALSE, fDeferredStatus);
529 if (U_FAILURE(fDeferredStatus)) {
530 return FALSE;
531 }
532 if (fMatch) {
533 return TRUE;
534 }
535 }
536 if (startPos >= testLen) {
537 fMatch = FALSE;
538 fHitEnd = TRUE;
539 return FALSE;
540 }
541 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
542 // Note that it's perfectly OK for a pattern to have a zero-length
543 // match at the end of a string, so we must make sure that the loop
544 // runs with startPos == testLen the last time through.
b75a7d8f 545 }
46f4442e
A
546 } else {
547 for (;;) {
548 c = inputBuf[startPos-1];
549 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
550 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
551 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
552 startPos++;
553 }
554 MatchAt(startPos, FALSE, fDeferredStatus);
555 if (U_FAILURE(fDeferredStatus)) {
556 return FALSE;
557 }
558 if (fMatch) {
559 return TRUE;
560 }
561 }
562 if (startPos >= testLen) {
563 fMatch = FALSE;
564 fHitEnd = TRUE;
565 return FALSE;
566 }
567 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
568 // Note that it's perfectly OK for a pattern to have a zero-length
569 // match at the end of a string, so we must make sure that the loop
570 // runs with startPos == testLen the last time through.
b75a7d8f 571 }
b75a7d8f
A
572 }
573 }
574
575 default:
576 U_ASSERT(FALSE);
577 }
578
579 U_ASSERT(FALSE);
580 return FALSE;
581}
582
583
584
585UBool RegexMatcher::find(int32_t start, UErrorCode &status) {
586 if (U_FAILURE(status)) {
587 return FALSE;
588 }
589 if (U_FAILURE(fDeferredStatus)) {
590 status = fDeferredStatus;
591 return FALSE;
592 }
46f4442e
A
593 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
594 // This will reset the region to be the full input length.
595 if (start < fActiveStart || start > fActiveLimit) {
b75a7d8f
A
596 status = U_INDEX_OUTOFBOUNDS_ERROR;
597 return FALSE;
598 }
46f4442e 599 fMatchEnd = start;
b75a7d8f
A
600 return find();
601}
602
603
604
605//--------------------------------------------------------------------------------
606//
607// group()
608//
609//--------------------------------------------------------------------------------
610UnicodeString RegexMatcher::group(UErrorCode &status) const {
611 return group(0, status);
612}
613
614
615
616UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
617 int32_t s = start(groupNum, status);
618 int32_t e = end(groupNum, status);
619
620 // Note: calling start() and end() above will do all necessary checking that
621 // the group number is OK and that a match exists. status will be set.
622 if (U_FAILURE(status)) {
623 return UnicodeString();
624 }
625 if (U_FAILURE(fDeferredStatus)) {
626 status = fDeferredStatus;
627 return UnicodeString();
628 }
629
630 if (s < 0) {
631 // A capture group wasn't part of the match
632 return UnicodeString();
633 }
634 U_ASSERT(s <= e);
635 return UnicodeString(*fInput, s, e-s);
636}
637
638
639
640
641int32_t RegexMatcher::groupCount() const {
642 return fPattern->fGroupMap->size();
643}
644
645
646
647const UnicodeString &RegexMatcher::input() const {
648 return *fInput;
649}
650
651
46f4442e
A
652//--------------------------------------------------------------------------------
653//
654// hasAnchoringBounds()
655//
656//--------------------------------------------------------------------------------
657UBool RegexMatcher::hasAnchoringBounds() const {
658 return fAnchoringBounds;
659}
660
661
662//--------------------------------------------------------------------------------
663//
664// hasTransparentBounds()
665//
666//--------------------------------------------------------------------------------
667UBool RegexMatcher::hasTransparentBounds() const {
668 return fTransparentBounds;
669}
670
671
b75a7d8f 672
46f4442e
A
673//--------------------------------------------------------------------------------
674//
675// hitEnd()
676//
677//--------------------------------------------------------------------------------
678UBool RegexMatcher::hitEnd() const {
679 return fHitEnd;
680}
b75a7d8f 681
374ca955
A
682//--------------------------------------------------------------------------------
683//
684// lookingAt()
685//
686//--------------------------------------------------------------------------------
b75a7d8f
A
687UBool RegexMatcher::lookingAt(UErrorCode &status) {
688 if (U_FAILURE(status)) {
689 return FALSE;
690 }
691 if (U_FAILURE(fDeferredStatus)) {
692 status = fDeferredStatus;
693 return FALSE;
694 }
46f4442e
A
695 resetPreserveRegion();
696 MatchAt(fActiveStart, FALSE, status);
b75a7d8f
A
697 return fMatch;
698}
699
700
374ca955
A
701UBool RegexMatcher::lookingAt(int32_t start, UErrorCode &status) {
702 if (U_FAILURE(status)) {
703 return FALSE;
704 }
705 if (U_FAILURE(fDeferredStatus)) {
706 status = fDeferredStatus;
707 return FALSE;
708 }
46f4442e
A
709 reset();
710 if (start < fActiveStart || start > fActiveLimit) {
374ca955
A
711 status = U_INDEX_OUTOFBOUNDS_ERROR;
712 return FALSE;
713 }
46f4442e 714 MatchAt(start, FALSE, status);
374ca955
A
715 return fMatch;
716}
717
718
b75a7d8f 719
374ca955
A
720//--------------------------------------------------------------------------------
721//
722// matches()
723//
724//--------------------------------------------------------------------------------
b75a7d8f
A
725UBool RegexMatcher::matches(UErrorCode &status) {
726 if (U_FAILURE(status)) {
727 return FALSE;
728 }
729 if (U_FAILURE(fDeferredStatus)) {
730 status = fDeferredStatus;
731 return FALSE;
732 }
46f4442e
A
733 resetPreserveRegion();
734 MatchAt(fActiveStart, TRUE, status);
735 return fMatch;
b75a7d8f
A
736}
737
738
374ca955
A
739UBool RegexMatcher::matches(int32_t start, UErrorCode &status) {
740 if (U_FAILURE(status)) {
741 return FALSE;
742 }
743 if (U_FAILURE(fDeferredStatus)) {
744 status = fDeferredStatus;
745 return FALSE;
746 }
46f4442e
A
747 reset();
748 if (start < fActiveStart || start > fActiveLimit) {
374ca955
A
749 status = U_INDEX_OUTOFBOUNDS_ERROR;
750 return FALSE;
751 }
46f4442e
A
752 MatchAt(start, TRUE, status);
753 return fMatch;
374ca955
A
754}
755
b75a7d8f
A
756
757
46f4442e
A
758//--------------------------------------------------------------------------------
759//
760// pattern
761//
762//--------------------------------------------------------------------------------
b75a7d8f
A
763const RegexPattern &RegexMatcher::pattern() const {
764 return *fPattern;
765}
766
767
768
46f4442e
A
769//--------------------------------------------------------------------------------
770//
771// region
772//
773//--------------------------------------------------------------------------------
774RegexMatcher &RegexMatcher::region(int32_t start, int32_t limit, UErrorCode &status) {
775 if (U_FAILURE(status)) {
776 return *this;
777 }
778 if (start>limit || start<0 || limit<0 || limit>fInput->length()) {
779 status = U_ILLEGAL_ARGUMENT_ERROR;
780 }
781 this->reset();
782 fRegionStart = start;
783 fRegionLimit = limit;
784 fActiveStart = start;
785 fActiveLimit = limit;
786 if (!fTransparentBounds) {
787 fLookStart = start;
788 fLookLimit = limit;
789 }
790 if (fAnchoringBounds) {
791 fAnchorStart = start;
792 fAnchorLimit = limit;
793 }
794 return *this;
795}
796
797
798
799//--------------------------------------------------------------------------------
800//
801// regionEnd
802//
803//--------------------------------------------------------------------------------
804int32_t RegexMatcher::regionEnd() const {
805 return fRegionLimit;
806}
807
808
809//--------------------------------------------------------------------------------
810//
811// regionStart
812//
813//--------------------------------------------------------------------------------
814int32_t RegexMatcher::regionStart() const {
815 return fRegionStart;
816}
817
818
b75a7d8f
A
819//--------------------------------------------------------------------------------
820//
821// replaceAll
822//
823//--------------------------------------------------------------------------------
824UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
825 if (U_FAILURE(status)) {
826 return *fInput;
827 }
828 if (U_FAILURE(fDeferredStatus)) {
829 status = fDeferredStatus;
830 return *fInput;
831 }
832 UnicodeString destString;
46f4442e
A
833 reset();
834 while (find()) {
b75a7d8f
A
835 appendReplacement(destString, replacement, status);
836 if (U_FAILURE(status)) {
837 break;
838 }
839 }
840 appendTail(destString);
841 return destString;
842}
843
844
845
846
847//--------------------------------------------------------------------------------
848//
849// replaceFirst
850//
851//--------------------------------------------------------------------------------
852UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
853 if (U_FAILURE(status)) {
854 return *fInput;
855 }
856 if (U_FAILURE(fDeferredStatus)) {
857 status = fDeferredStatus;
858 return *fInput;
859 }
860
861 reset();
862 if (!find()) {
863 return *fInput;
864 }
865
866 UnicodeString destString;
867 appendReplacement(destString, replacement, status);
868 appendTail(destString);
869 return destString;
870}
871
872
46f4442e
A
873//--------------------------------------------------------------------------------
874//
875// requireEnd
876//
877//--------------------------------------------------------------------------------
878UBool RegexMatcher::requireEnd() const {
879 return fRequireEnd;
880}
881
b75a7d8f
A
882
883//--------------------------------------------------------------------------------
884//
885// reset
886//
887//--------------------------------------------------------------------------------
888RegexMatcher &RegexMatcher::reset() {
46f4442e
A
889 fRegionStart = 0;
890 fRegionLimit = fInput->length();
891 fActiveStart = 0;
892 fActiveLimit = fRegionLimit;
893 fAnchorStart = 0;
894 fAnchorLimit = fRegionLimit;
895 fLookStart = 0;
896 fLookLimit = fRegionLimit;
897 resetPreserveRegion();
898 return *this;
899}
900
901
902
903void RegexMatcher::resetPreserveRegion() {
374ca955
A
904 fMatchStart = 0;
905 fMatchEnd = 0;
906 fLastMatchEnd = -1;
46f4442e 907 fAppendPosition = 0;
374ca955 908 fMatch = FALSE;
46f4442e
A
909 fHitEnd = FALSE;
910 fRequireEnd = FALSE;
911 fTime = 0;
912 fTickCounter = TIMER_INITIAL_VALUE;
b75a7d8f 913 resetStack();
b75a7d8f
A
914}
915
916
b75a7d8f
A
917RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
918 fInput = &input;
919 reset();
374ca955
A
920 if (fWordBreakItr != NULL) {
921 #if UCONFIG_NO_BREAK_ITERATION==0
922 fWordBreakItr->setText(input);
923 #endif
924 }
b75a7d8f
A
925 return *this;
926}
927
46f4442e 928/*RegexMatcher &RegexMatcher::reset(const UChar *) {
374ca955
A
929 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
930 return *this;
46f4442e 931}*/
b75a7d8f
A
932
933
374ca955
A
934RegexMatcher &RegexMatcher::reset(int32_t position, UErrorCode &status) {
935 if (U_FAILURE(status)) {
936 return *this;
b75a7d8f 937 }
46f4442e
A
938 reset(); // Reset also resets the region to be the entire string.
939 if (position < 0 || position >= fActiveLimit) {
374ca955
A
940 status = U_INDEX_OUTOFBOUNDS_ERROR;
941 return *this;
942 }
943 fMatchEnd = position;
944 return *this;
b75a7d8f
A
945}
946
947
948
374ca955
A
949
950
b75a7d8f
A
951//--------------------------------------------------------------------------------
952//
953// setTrace
954//
955//--------------------------------------------------------------------------------
956void RegexMatcher::setTrace(UBool state) {
957 fTraceDebug = state;
958}
959
960
961
962//---------------------------------------------------------------------
963//
964// split
965//
966//---------------------------------------------------------------------
967int32_t RegexMatcher::split(const UnicodeString &input,
968 UnicodeString dest[],
969 int32_t destCapacity,
970 UErrorCode &status)
971{
972 //
973 // Check arguements for validity
974 //
975 if (U_FAILURE(status)) {
976 return 0;
977 };
978
979 if (destCapacity < 1) {
980 status = U_ILLEGAL_ARGUMENT_ERROR;
981 return 0;
982 }
983
b75a7d8f
A
984 //
985 // Reset for the input text
986 //
987 reset(input);
b75a7d8f 988 int32_t nextOutputStringStart = 0;
46f4442e 989 if (fActiveLimit == 0) {
b75a7d8f
A
990 return 0;
991 }
992
b75a7d8f
A
993 //
994 // Loop through the input text, searching for the delimiter pattern
995 //
73c04bcf 996 int32_t i;
b75a7d8f
A
997 int32_t numCaptureGroups = fPattern->fGroupMap->size();
998 for (i=0; ; i++) {
999 if (i>=destCapacity-1) {
1000 // There is one or zero output string left.
1001 // Fill the last output string with whatever is left from the input, then exit the loop.
1002 // ( i will be == destCapicity if we filled the output array while processing
1003 // capture groups of the delimiter expression, in which case we will discard the
1004 // last capture group saved in favor of the unprocessed remainder of the
1005 // input string.)
1006 i = destCapacity-1;
46f4442e 1007 int32_t remainingLength = fActiveLimit-nextOutputStringStart;
b75a7d8f
A
1008 if (remainingLength > 0) {
1009 dest[i].setTo(input, nextOutputStringStart, remainingLength);
1010 }
1011 break;
1012 }
1013 if (find()) {
1014 // We found another delimiter. Move everything from where we started looking
1015 // up until the start of the delimiter into the next output string.
1016 int32_t fieldLen = fMatchStart - nextOutputStringStart;
1017 dest[i].setTo(input, nextOutputStringStart, fieldLen);
1018 nextOutputStringStart = fMatchEnd;
1019
1020 // If the delimiter pattern has capturing parentheses, the captured
1021 // text goes out into the next n destination strings.
1022 int32_t groupNum;
1023 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
1024 if (i==destCapacity-1) {
1025 break;
1026 }
1027 i++;
1028 dest[i] = group(groupNum, status);
1029 }
1030
46f4442e 1031 if (nextOutputStringStart == fActiveLimit) {
b75a7d8f
A
1032 // The delimiter was at the end of the string. We're done.
1033 break;
1034 }
1035
1036 }
1037 else
1038 {
1039 // We ran off the end of the input while looking for the next delimiter.
1040 // All the remaining text goes into the current output string.
46f4442e 1041 dest[i].setTo(input, nextOutputStringStart, fActiveLimit-nextOutputStringStart);
b75a7d8f
A
1042 break;
1043 }
1044 }
1045 return i+1;
1046}
1047
1048
1049
1050//--------------------------------------------------------------------------------
1051//
1052// start
1053//
1054//--------------------------------------------------------------------------------
1055int32_t RegexMatcher::start(UErrorCode &status) const {
1056 return start(0, status);
1057}
1058
1059
1060
1061
46f4442e
A
1062//--------------------------------------------------------------------------------
1063//
1064// start(int32_t group, UErrorCode &status)
1065//
1066//--------------------------------------------------------------------------------
73c04bcf 1067int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
b75a7d8f
A
1068 if (U_FAILURE(status)) {
1069 return -1;
1070 }
1071 if (U_FAILURE(fDeferredStatus)) {
1072 status = fDeferredStatus;
1073 return -1;
1074 }
1075 if (fMatch == FALSE) {
1076 status = U_REGEX_INVALID_STATE;
1077 return -1;
1078 }
1079 if (group < 0 || group > fPattern->fGroupMap->size()) {
1080 status = U_INDEX_OUTOFBOUNDS_ERROR;
1081 return -1;
1082 }
1083 int32_t s;
1084 if (group == 0) {
1085 s = fMatchStart;
1086 } else {
1087 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
1088 U_ASSERT(groupOffset < fPattern->fFrameSize);
1089 U_ASSERT(groupOffset >= 0);
1090 s = fFrame->fExtra[groupOffset];
1091 }
1092 return s;
1093}
1094
1095
1096
46f4442e
A
1097//--------------------------------------------------------------------------------
1098//
1099// useAnchoringBounds
1100//
1101//--------------------------------------------------------------------------------
1102RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
1103 fAnchoringBounds = b;
1104 UErrorCode status = U_ZERO_ERROR;
1105 region(fRegionStart, fRegionLimit, status);
1106 U_ASSERT(U_SUCCESS(status));
1107 return *this;
1108}
1109
1110
1111//--------------------------------------------------------------------------------
1112//
1113// useTransparentBounds
1114//
1115//--------------------------------------------------------------------------------
1116RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
1117 fTransparentBounds = b;
1118 UErrorCode status = U_ZERO_ERROR;
1119 region(fRegionStart, fRegionLimit, status);
1120 U_ASSERT(U_SUCCESS(status));
1121 return *this;
1122}
1123
1124//--------------------------------------------------------------------------------
1125//
1126// setTimeLimit
1127//
1128//--------------------------------------------------------------------------------
1129void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
1130 if (U_FAILURE(status)) {
1131 return;
1132 }
1133 if (U_FAILURE(fDeferredStatus)) {
1134 status = fDeferredStatus;
1135 return;
1136 }
1137 if (limit < 0) {
1138 status = U_ILLEGAL_ARGUMENT_ERROR;
1139 return;
1140 }
1141 fTimeLimit = limit;
1142}
1143
1144
1145//--------------------------------------------------------------------------------
1146//
1147// getTimeLimit
1148//
1149//--------------------------------------------------------------------------------
1150int32_t RegexMatcher::getTimeLimit() const {
1151 return fTimeLimit;
1152}
1153
1154
1155//--------------------------------------------------------------------------------
1156//
1157// setStackLimit
1158//
1159//--------------------------------------------------------------------------------
1160void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
1161 if (U_FAILURE(status)) {
1162 return;
1163 }
1164 if (U_FAILURE(fDeferredStatus)) {
1165 status = fDeferredStatus;
1166 return;
1167 }
1168 if (limit < 0) {
1169 status = U_ILLEGAL_ARGUMENT_ERROR;
1170 return;
1171 }
1172
1173 // Reset the matcher. This is needed here in case there is a current match
1174 // whose final stack frame (containing the match results, pointed to by fFrame)
1175 // would be lost by resizing to a smaller stack size.
1176 reset();
1177
1178 if (limit == 0) {
1179 // Unlimited stack expansion
1180 fStack->setMaxCapacity(0);
1181 } else {
1182 // Change the units of the limit from bytes to ints, and bump the size up
1183 // to be big enough to hold at least one stack frame for the pattern,
1184 // if it isn't there already.
1185 int32_t adjustedLimit = limit / sizeof(int32_t);
1186 if (adjustedLimit < fPattern->fFrameSize) {
1187 adjustedLimit = fPattern->fFrameSize;
1188 }
1189 fStack->setMaxCapacity(adjustedLimit);
1190 }
1191 fStackLimit = limit;
1192}
1193
1194
1195//--------------------------------------------------------------------------------
1196//
1197// getStackLimit
1198//
1199//--------------------------------------------------------------------------------
1200int32_t RegexMatcher::getStackLimit() const {
1201 return fStackLimit;
1202}
1203
1204
1205//--------------------------------------------------------------------------------
1206//
1207// setMatchCallback
1208//
1209//--------------------------------------------------------------------------------
1210void RegexMatcher::setMatchCallback(URegexMatchCallback *callback,
1211 const void *context,
1212 UErrorCode &status) {
1213 if (U_FAILURE(status)) {
1214 return;
1215 }
1216 fCallbackFn = callback;
1217 fCallbackContext = context;
1218}
1219
1220
1221//--------------------------------------------------------------------------------
1222//
1223// getMatchCallback
1224//
1225//--------------------------------------------------------------------------------
1226void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback,
1227 const void *&context,
1228 UErrorCode &status) {
1229 if (U_FAILURE(status)) {
1230 return;
1231 }
1232 callback = fCallbackFn;
1233 context = fCallbackContext;
1234}
1235
1236
374ca955
A
1237//================================================================================
1238//
1239// Code following this point in this file is the internal
1240// Match Engine Implementation.
1241//
1242//================================================================================
1243
1244
1245//--------------------------------------------------------------------------------
1246//
1247// resetStack
1248// Discard any previous contents of the state save stack, and initialize a
1249// new stack frame to all -1. The -1s are needed for capture group limits,
1250// where they indicate that a group has not yet matched anything.
1251//--------------------------------------------------------------------------------
1252REStackFrame *RegexMatcher::resetStack() {
1253 // Discard any previous contents of the state save stack, and initialize a
1254 // new stack frame to all -1. The -1s are needed for capture group limits, where
1255 // they indicate that a group has not yet matched anything.
1256 fStack->removeAllElements();
1257
1258 int32_t *iFrame = fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
73c04bcf 1259 int32_t i;
374ca955
A
1260 for (i=0; i<fPattern->fFrameSize; i++) {
1261 iFrame[i] = -1;
1262 }
1263 return (REStackFrame *)iFrame;
1264}
1265
1266
1267
b75a7d8f
A
1268//--------------------------------------------------------------------------------
1269//
1270// isWordBoundary
1271// in perl, "xab..cd..", \b is true at positions 0,3,5,7
1272// For us,
1273// If the current char is a combining mark,
1274// \b is FALSE.
1275// Else Scan backwards to the first non-combining char.
1276// We are at a boundary if the this char and the original chars are
1277// opposite in membership in \w set
1278//
1279// parameters: pos - the current position in the input buffer
b75a7d8f 1280//
46f4442e
A
1281// TODO: double-check edge cases at region boundaries.
1282//
b75a7d8f
A
1283//--------------------------------------------------------------------------------
1284UBool RegexMatcher::isWordBoundary(int32_t pos) {
1285 UBool isBoundary = FALSE;
1286 UBool cIsWord = FALSE;
1287
46f4442e
A
1288 if (pos >= fLookLimit) {
1289 fHitEnd = TRUE;
1290 } else {
1291 // Determine whether char c at current position is a member of the word set of chars.
1292 // If we're off the end of the string, behave as though we're not at a word char.
b75a7d8f 1293 UChar32 c = fInput->char32At(pos);
46f4442e 1294 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
b75a7d8f
A
1295 // Current char is a combining one. Not a boundary.
1296 return FALSE;
1297 }
1298 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
1299 }
1300
1301 // Back up until we come to a non-combining char, determine whether
1302 // that char is a word char.
1303 UBool prevCIsWord = FALSE;
1304 int32_t prevPos = pos;
1305 for (;;) {
46f4442e 1306 if (prevPos <= fLookStart) {
b75a7d8f
A
1307 break;
1308 }
1309 prevPos = fInput->moveIndex32(prevPos, -1);
1310 UChar32 prevChar = fInput->char32At(prevPos);
46f4442e
A
1311 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
1312 || u_charType(prevChar) == U_FORMAT_CHAR)) {
b75a7d8f
A
1313 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
1314 break;
1315 }
1316 }
1317 isBoundary = cIsWord ^ prevCIsWord;
1318 return isBoundary;
1319}
1320
374ca955
A
1321//--------------------------------------------------------------------------------
1322//
1323// isUWordBoundary
1324//
1325// Test for a word boundary using RBBI word break.
1326//
1327// parameters: pos - the current position in the input buffer
1328//
1329//--------------------------------------------------------------------------------
1330UBool RegexMatcher::isUWordBoundary(int32_t pos) {
1331 UBool returnVal = FALSE;
1332#if UCONFIG_NO_BREAK_ITERATION==0
374ca955
A
1333
1334 // If we haven't yet created a break iterator for this matcher, do it now.
1335 if (fWordBreakItr == NULL) {
1336 fWordBreakItr =
46f4442e
A
1337 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
1338 if (U_FAILURE(fDeferredStatus)) {
374ca955
A
1339 return FALSE;
1340 }
1341 fWordBreakItr->setText(*fInput);
1342 }
1343
46f4442e
A
1344 if (pos >= fLookLimit) {
1345 fHitEnd = TRUE;
1346 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real"
1347 // words are not boundaries. All non-word chars stand by themselves,
1348 // with word boundaries on both sides.
1349 } else {
1350 returnVal = fWordBreakItr->isBoundary(pos);
1351 }
374ca955
A
1352#endif
1353 return returnVal;
1354}
1355
b75a7d8f
A
1356//--------------------------------------------------------------------------------
1357//
46f4442e
A
1358// IncrementTime This function is called once each TIMER_INITIAL_VALUE state
1359// saves. Increment the "time" counter, and call the
1360// user callback function if there is one installed.
1361//
1362// If the match operation needs to be aborted, either for a time-out
1363// or because the user callback asked for it, just set an error status.
1364// The engine will pick that up and stop in its outer loop.
1365//
1366//--------------------------------------------------------------------------------
1367void RegexMatcher::IncrementTime(UErrorCode &status) {
1368 fTickCounter = TIMER_INITIAL_VALUE;
1369 fTime++;
1370 if (fCallbackFn != NULL) {
1371 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
1372 status = U_REGEX_STOPPED_BY_CALLER;
1373 return;
1374 }
1375 }
1376 if (fTimeLimit > 0 && fTime >= fTimeLimit) {
1377 status = U_REGEX_TIME_OUT;
1378 }
1379}
1380
1381//--------------------------------------------------------------------------------
1382//
1383// StateSave
b75a7d8f
A
1384// Make a new stack frame, initialized as a copy of the current stack frame.
1385// Set the pattern index in the original stack frame from the operand value
1386// in the opcode. Execution of the engine continues with the state in
1387// the newly created stack frame
1388//
1389// Note that reserveBlock() may grow the stack, resulting in the
46f4442e
A
1390// whole thing being relocated in memory.
1391//
1392// Parameters:
1393// fp The top frame pointer when called. At return, a new
1394// fame will be present
1395// savePatIdx An index into the compiled pattern. Goes into the original
1396// (not new) frame. If execution ever back-tracks out of the
1397// new frame, this will be where we continue from in the pattern.
1398// Return
1399// The new frame pointer.
b75a7d8f
A
1400//
1401//--------------------------------------------------------------------------------
46f4442e 1402inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int32_t savePatIdx, UErrorCode &status) {
b75a7d8f 1403 // push storage for a new frame.
46f4442e
A
1404 int32_t *newFP = fStack->reserveBlock(fFrameSize, status);
1405 if (newFP == NULL) {
1406 // Failure on attempted stack expansion.
1407 // Stack function set some other error code, change it to a more
1408 // specific one for regular expressions.
1409 status = U_REGEX_STACK_OVERFLOW;
1410 // We need to return a writable stack frame, so just return the
1411 // previous frame. The match operation will stop quickly
1412 // because of the error status, after which the frame will never
1413 // be looked at again.
1414 return fp;
1415 }
1416 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack.
b75a7d8f
A
1417
1418 // New stack frame = copy of old top frame.
1419 int32_t *source = (int32_t *)fp;
1420 int32_t *dest = newFP;
1421 for (;;) {
1422 *dest++ = *source++;
1423 if (source == newFP) {
1424 break;
1425 }
1426 }
1427
46f4442e
A
1428 fTickCounter--;
1429 if (fTickCounter <= 0) {
1430 IncrementTime(status); // Re-initializes fTickCounter
1431 }
b75a7d8f
A
1432 fp->fPatIdx = savePatIdx;
1433 return (REStackFrame *)newFP;
1434}
46f4442e
A
1435
1436
b75a7d8f
A
1437//--------------------------------------------------------------------------------
1438//
1439// MatchAt This is the actual matching engine.
1440//
46f4442e
A
1441// startIdx: begin matching a this index.
1442// toEnd: if true, match must extend to end of the input region
1443//
b75a7d8f 1444//--------------------------------------------------------------------------------
46f4442e 1445void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
b75a7d8f
A
1446 UBool isMatch = FALSE; // True if the we have a match.
1447
1448 int32_t op; // Operation from the compiled pattern, split into
1449 int32_t opType; // the opcode
1450 int32_t opValue; // and the operand value.
1451
1452 #ifdef REGEX_RUN_DEBUG
1453 if (fTraceDebug)
1454 {
1455 printf("MatchAt(startIdx=%d)\n", startIdx);
1456 printf("Original Pattern: ");
73c04bcf 1457 int32_t i;
b75a7d8f
A
1458 for (i=0; i<fPattern->fPattern.length(); i++) {
1459 printf("%c", fPattern->fPattern.charAt(i));
1460 }
1461 printf("\n");
1462 printf("Input String: ");
1463 for (i=0; i<fInput->length(); i++) {
1464 UChar c = fInput->charAt(i);
1465 if (c<32 || c>256) {
1466 c = '.';
1467 }
1468 printf("%c", c);
1469 }
1470 printf("\n");
1471 printf("\n");
1472 }
1473 #endif
1474
1475 if (U_FAILURE(status)) {
1476 return;
1477 }
1478
1479 // Cache frequently referenced items from the compiled pattern
b75a7d8f
A
1480 //
1481 int32_t *pat = fPattern->fCompiledPat->getBuffer();
1482
1483 const UChar *litText = fPattern->fLiteralText.getBuffer();
1484 UVector *sets = fPattern->fSets;
46f4442e 1485
b75a7d8f
A
1486 const UChar *inputBuf = fInput->getBuffer();
1487
46f4442e 1488 fFrameSize = fPattern->fFrameSize;
b75a7d8f 1489 REStackFrame *fp = resetStack();
b75a7d8f
A
1490
1491 fp->fPatIdx = 0;
1492 fp->fInputIdx = startIdx;
1493
1494 // Zero out the pattern's static data
1495 int32_t i;
1496 for (i = 0; i<fPattern->fDataSize; i++) {
1497 fData[i] = 0;
1498 }
1499
1500 //
1501 // Main loop for interpreting the compiled pattern.
1502 // One iteration of the loop per pattern operation performed.
1503 //
1504 for (;;) {
1505#if 0
1506 if (_heapchk() != _HEAPOK) {
1507 fprintf(stderr, "Heap Trouble\n");
1508 }
1509#endif
1510 op = pat[fp->fPatIdx];
1511 opType = URX_TYPE(op);
1512 opValue = URX_VAL(op);
1513 #ifdef REGEX_RUN_DEBUG
1514 if (fTraceDebug) {
1515 printf("inputIdx=%d inputChar=%c sp=%3d ", fp->fInputIdx,
1516 fInput->char32At(fp->fInputIdx), (int32_t *)fp-fStack->getBuffer());
1517 fPattern->dumpOp(fp->fPatIdx);
1518 }
1519 #endif
1520 fp->fPatIdx++;
1521
1522 switch (opType) {
1523
1524
1525 case URX_NOP:
1526 break;
1527
1528
1529 case URX_BACKTRACK:
1530 // Force a backtrack. In some circumstances, the pattern compiler
1531 // will notice that the pattern can't possibly match anything, and will
1532 // emit one of these at that point.
46f4442e 1533 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
1534 break;
1535
1536
1537 case URX_ONECHAR:
46f4442e 1538 if (fp->fInputIdx < fActiveLimit) {
b75a7d8f 1539 UChar32 c;
46f4442e
A
1540 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
1541 if (c == opValue) {
b75a7d8f
A
1542 break;
1543 }
46f4442e
A
1544 } else {
1545 fHitEnd = TRUE;
b75a7d8f 1546 }
46f4442e 1547 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
1548 break;
1549
1550
1551 case URX_STRING:
1552 {
1553 // Test input against a literal string.
1554 // Strings require two slots in the compiled pattern, one for the
1555 // offset to the string text, and one for the length.
1556 int32_t stringStartIdx = opValue;
1557 int32_t stringLen;
1558
1559 op = pat[fp->fPatIdx]; // Fetch the second operand
1560 fp->fPatIdx++;
1561 opType = URX_TYPE(op);
1562 stringLen = URX_VAL(op);
1563 U_ASSERT(opType == URX_STRING_LEN);
1564 U_ASSERT(stringLen >= 2);
1565
46f4442e 1566 if (fp->fInputIdx + stringLen > fActiveLimit) {
b75a7d8f 1567 // No match. String is longer than the remaining input text.
46f4442e
A
1568 fHitEnd = TRUE; // TODO: See ticket 6074
1569 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
1570 break;
1571 }
1572
1573 const UChar * pInp = inputBuf + fp->fInputIdx;
1574 const UChar * pPat = litText+stringStartIdx;
1575 const UChar * pEnd = pInp + stringLen;
1576 for(;;) {
1577 if (*pInp == *pPat) {
1578 pInp++;
1579 pPat++;
1580 if (pInp == pEnd) {
1581 // Successful Match.
1582 fp->fInputIdx += stringLen;
1583 break;
1584 }
1585 } else {
1586 // Match failed.
46f4442e 1587 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
1588 break;
1589 }
1590 }
b75a7d8f
A
1591 }
1592 break;
1593
1594
1595
1596 case URX_STATE_SAVE:
46f4442e 1597 fp = StateSave(fp, opValue, status);
b75a7d8f
A
1598 break;
1599
1600
1601 case URX_END:
1602 // The match loop will exit via this path on a successful match,
1603 // when we reach the end of the pattern.
46f4442e
A
1604 if (toEnd && fp->fInputIdx != fActiveLimit) {
1605 // The pattern matched, but not to the end of input. Try some more.
1606 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1607 break;
1608 }
b75a7d8f
A
1609 isMatch = TRUE;
1610 goto breakFromLoop;
1611
1612 // Start and End Capture stack frame variables are layout out like this:
1613 // fp->fExtra[opValue] - The start of a completed capture group
1614 // opValue+1 - The end of a completed capture group
1615 // opValue+2 - the start of a capture group whose end
1616 // has not yet been reached (and might not ever be).
1617 case URX_START_CAPTURE:
46f4442e 1618 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
b75a7d8f
A
1619 fp->fExtra[opValue+2] = fp->fInputIdx;
1620 break;
1621
1622
1623 case URX_END_CAPTURE:
46f4442e 1624 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
b75a7d8f
A
1625 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
1626 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
1627 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
1628 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
1629 break;
1630
46f4442e 1631
b75a7d8f
A
1632 case URX_DOLLAR: // $, test for End of line
1633 // or for position before new line at end of input
46f4442e 1634 if (fp->fInputIdx < fAnchorLimit-2) {
b75a7d8f 1635 // We are no where near the end of input. Fail.
46f4442e
A
1636 // This is the common case. Keep it first.
1637 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
1638 break;
1639 }
46f4442e 1640 if (fp->fInputIdx >= fAnchorLimit) {
b75a7d8f 1641 // We really are at the end of input. Success.
46f4442e
A
1642 fHitEnd = TRUE;
1643 fRequireEnd = TRUE;
b75a7d8f
A
1644 break;
1645 }
1646 // If we are positioned just before a new-line that is located at the
1647 // end of input, succeed.
46f4442e 1648 if (fp->fInputIdx == fAnchorLimit-1) {
b75a7d8f 1649 UChar32 c = fInput->char32At(fp->fInputIdx);
46f4442e 1650 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
374ca955 1651 // If not in the middle of a CR/LF sequence
46f4442e 1652 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
374ca955 1653 // At new-line at end of input. Success
46f4442e
A
1654 fHitEnd = TRUE;
1655 fRequireEnd = TRUE;
1656 break;
374ca955 1657 }
b75a7d8f
A
1658 }
1659 }
1660
46f4442e
A
1661 if (fp->fInputIdx == fAnchorLimit-2 &&
1662 fInput->char32At(fp->fInputIdx) == 0x0d && fInput->char32At(fp->fInputIdx+1) == 0x0a) {
1663 fHitEnd = TRUE;
1664 fRequireEnd = TRUE;
b75a7d8f 1665 break; // At CR/LF at end of input. Success
b75a7d8f
A
1666 }
1667
46f4442e
A
1668 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1669
1670 break;
1671
1672
1673 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
1674 if (fp->fInputIdx >= fAnchorLimit-1) {
1675 // Either at the last character of input, or off the end.
1676 if (fp->fInputIdx == fAnchorLimit-1) {
1677 // At last char of input. Success if it's a new line.
1678 if (fInput->char32At(fp->fInputIdx) == 0x0a) {
1679 fHitEnd = TRUE;
1680 fRequireEnd = TRUE;
1681 break;
1682 }
1683 } else {
1684 // Off the end of input. Success.
1685 fHitEnd = TRUE;
1686 fRequireEnd = TRUE;
1687 break;
1688 }
1689 }
b75a7d8f 1690
46f4442e
A
1691 // Not at end of input. Back-track out.
1692 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
1693 break;
1694
1695
1696 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
1697 {
46f4442e 1698 if (fp->fInputIdx >= fAnchorLimit) {
b75a7d8f 1699 // We really are at the end of input. Success.
46f4442e
A
1700 fHitEnd = TRUE;
1701 fRequireEnd = TRUE;
b75a7d8f
A
1702 break;
1703 }
374ca955 1704 // If we are positioned just before a new-line, succeed.
b75a7d8f
A
1705 // It makes no difference where the new-line is within the input.
1706 UChar32 c = inputBuf[fp->fInputIdx];
46f4442e 1707 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
374ca955 1708 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
46f4442e
A
1709 // In multi-line mode, hitting a new-line just before the end of input does not
1710 // set the hitEnd or requireEnd flags
1711 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
1712 break;
374ca955 1713 }
b75a7d8f
A
1714 }
1715 // not at a new line. Fail.
46f4442e
A
1716 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1717 }
1718 break;
1719
1720
1721 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
1722 {
1723 if (fp->fInputIdx >= fAnchorLimit) {
1724 // We really are at the end of input. Success.
1725 fHitEnd = TRUE;
1726 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
1727 break; // adding a new-line would not lose the match.
1728 }
1729 // If we are not positioned just before a new-line, the test fails; backtrack out.
1730 // It makes no difference where the new-line is within the input.
1731 if (inputBuf[fp->fInputIdx] != 0x0a) {
1732 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1733 }
b75a7d8f
A
1734 }
1735 break;
1736
1737
1738 case URX_CARET: // ^, test for start of line
46f4442e
A
1739 if (fp->fInputIdx != fAnchorStart) {
1740 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1741 }
b75a7d8f
A
1742 break;
1743
1744
1745 case URX_CARET_M: // ^, test for start of line in mulit-line mode
1746 {
46f4442e 1747 if (fp->fInputIdx == fAnchorStart) {
b75a7d8f
A
1748 // We are at the start input. Success.
1749 break;
1750 }
1751 // Check whether character just before the current pos is a new-line
1752 // unless we are at the end of input
1753 UChar c = inputBuf[fp->fInputIdx - 1];
46f4442e 1754 if ((fp->fInputIdx < fAnchorLimit) &&
73c04bcf 1755 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
b75a7d8f 1756 // It's a new-line. ^ is true. Success.
46f4442e
A
1757 // TODO: what should be done with positions between a CR and LF?
1758 break;
b75a7d8f
A
1759 }
1760 // Not at the start of a line. Fail.
46f4442e
A
1761 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1762 }
b75a7d8f
A
1763 break;
1764
1765
46f4442e
A
1766 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
1767 {
1768 U_ASSERT(fp->fInputIdx >= fAnchorStart);
1769 if (fp->fInputIdx <= fAnchorStart) {
1770 // We are at the start input. Success.
1771 break;
1772 }
1773 // Check whether character just before the current pos is a new-line
1774 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
1775 UChar c = inputBuf[fp->fInputIdx - 1];
1776 if (c != 0x0a) {
1777 // Not at the start of a line. Back-track out.
1778 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1779 }
1780 }
1781 break;
1782
b75a7d8f
A
1783 case URX_BACKSLASH_B: // Test for word boundaries
1784 {
1785 UBool success = isWordBoundary(fp->fInputIdx);
1786 success ^= (opValue != 0); // flip sense for \B
1787 if (!success) {
46f4442e 1788 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
1789 }
1790 }
1791 break;
1792
1793
374ca955
A
1794 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
1795 {
1796 UBool success = isUWordBoundary(fp->fInputIdx);
1797 success ^= (opValue != 0); // flip sense for \B
1798 if (!success) {
46f4442e 1799 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
374ca955
A
1800 }
1801 }
1802 break;
1803
1804
b75a7d8f
A
1805 case URX_BACKSLASH_D: // Test for decimal digit
1806 {
46f4442e
A
1807 if (fp->fInputIdx >= fActiveLimit) {
1808 fHitEnd = TRUE;
1809 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
1810 break;
1811 }
1812
46f4442e
A
1813 UChar32 c = fInput->char32At(fp->fInputIdx);
1814 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
b75a7d8f
A
1815 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
1816 success ^= (opValue != 0); // flip sense for \D
1817 if (success) {
1818 fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
1819 } else {
46f4442e 1820 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
1821 }
1822 }
1823 break;
1824
1825
b75a7d8f 1826 case URX_BACKSLASH_G: // Test for position at end of previous match
46f4442e
A
1827 if (!((fMatch && fp->fInputIdx==fMatchEnd) || fMatch==FALSE && fp->fInputIdx==fActiveStart)) {
1828 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
1829 }
1830 break;
1831
1832
1833 case URX_BACKSLASH_X:
1834 // Match a Grapheme, as defined by Unicode TR 29.
1835 // Differs slightly from Perl, which consumes combining marks independently
1836 // of context.
46f4442e 1837 {
b75a7d8f
A
1838
1839 // Fail if at end of input
46f4442e
A
1840 if (fp->fInputIdx >= fActiveLimit) {
1841 fHitEnd = TRUE;
1842 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
1843 break;
1844 }
1845
1846 // Examine (and consume) the current char.
1847 // Dispatch into a little state machine, based on the char.
1848 UChar32 c;
46f4442e 1849 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
b75a7d8f
A
1850 UnicodeSet **sets = fPattern->fStaticSets;
1851 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
1852 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
1853 if (sets[URX_GC_L]->contains(c)) goto GC_L;
1854 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
1855 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
1856 if (sets[URX_GC_V]->contains(c)) goto GC_V;
1857 if (sets[URX_GC_T]->contains(c)) goto GC_T;
1858 goto GC_Extend;
1859
1860
1861
1862GC_L:
46f4442e
A
1863 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
1864 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
b75a7d8f
A
1865 if (sets[URX_GC_L]->contains(c)) goto GC_L;
1866 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
1867 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
1868 if (sets[URX_GC_V]->contains(c)) goto GC_V;
1869 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
1870 goto GC_Extend;
1871
1872GC_V:
46f4442e
A
1873 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
1874 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
b75a7d8f
A
1875 if (sets[URX_GC_V]->contains(c)) goto GC_V;
1876 if (sets[URX_GC_T]->contains(c)) goto GC_T;
1877 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
1878 goto GC_Extend;
1879
1880GC_T:
46f4442e
A
1881 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
1882 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
b75a7d8f
A
1883 if (sets[URX_GC_T]->contains(c)) goto GC_T;
1884 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
1885 goto GC_Extend;
1886
1887GC_Extend:
1888 // Combining characters are consumed here
1889 for (;;) {
46f4442e 1890 if (fp->fInputIdx >= fActiveLimit) {
b75a7d8f
A
1891 break;
1892 }
46f4442e 1893 U16_GET(inputBuf, 0, fp->fInputIdx, fActiveLimit, c);
b75a7d8f
A
1894 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
1895 break;
1896 }
46f4442e 1897 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
b75a7d8f
A
1898 }
1899 goto GC_Done;
1900
1901GC_Control:
1902 // Most control chars stand alone (don't combine with combining chars),
1903 // except for that CR/LF sequence is a single grapheme cluster.
46f4442e 1904 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
b75a7d8f
A
1905 fp->fInputIdx++;
1906 }
1907
1908GC_Done:
46f4442e
A
1909 if (fp->fInputIdx >= fActiveLimit) {
1910 fHitEnd = TRUE;
1911 }
b75a7d8f
A
1912 break;
1913 }
1914
1915
1916
1917
46f4442e
A
1918 case URX_BACKSLASH_Z: // Test for end of Input
1919 if (fp->fInputIdx < fAnchorLimit) {
1920 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1921 } else {
1922 fHitEnd = TRUE;
1923 fRequireEnd = TRUE;
b75a7d8f
A
1924 }
1925 break;
1926
1927
1928
1929 case URX_STATIC_SETREF:
1930 {
1931 // Test input character against one of the predefined sets
1932 // (Word Characters, for example)
1933 // The high bit of the op value is a flag for the match polarity.
1934 // 0: success if input char is in set.
1935 // 1: success if input char is not in set.
46f4442e
A
1936 if (fp->fInputIdx >= fActiveLimit) {
1937 fHitEnd = TRUE;
1938 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
1939 break;
1940 }
1941
1942 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
1943 opValue &= ~URX_NEG_SET;
1944 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
1945 UChar32 c;
46f4442e 1946 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
b75a7d8f
A
1947 if (c < 256) {
1948 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
1949 if (s8->contains(c)) {
1950 success = !success;
1951 }
1952 } else {
1953 const UnicodeSet *s = fPattern->fStaticSets[opValue];
1954 if (s->contains(c)) {
1955 success = !success;
1956 }
1957 }
1958 if (!success) {
46f4442e 1959 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
1960 }
1961 }
1962 break;
1963
1964
1965 case URX_STAT_SETREF_N:
1966 {
1967 // Test input character for NOT being a member of one of
1968 // the predefined sets (Word Characters, for example)
46f4442e
A
1969 if (fp->fInputIdx >= fActiveLimit) {
1970 fHitEnd = TRUE;
1971 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
1972 break;
1973 }
1974
1975 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
1976 UChar32 c;
46f4442e 1977 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
b75a7d8f
A
1978 if (c < 256) {
1979 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
1980 if (s8->contains(c) == FALSE) {
1981 break;
1982 }
1983 } else {
1984 const UnicodeSet *s = fPattern->fStaticSets[opValue];
1985 if (s->contains(c) == FALSE) {
1986 break;
1987 }
1988 }
1989
46f4442e 1990 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
1991 }
1992 break;
1993
1994
1995 case URX_SETREF:
46f4442e
A
1996 if (fp->fInputIdx >= fActiveLimit) {
1997 fHitEnd = TRUE;
1998 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
1999 break;
2000 }
2001 // There is input left. Pick up one char and test it for set membership.
2002 UChar32 c;
2003 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
2004 U_ASSERT(opValue > 0 && opValue < sets->size());
2005 if (c<256) {
2006 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
2007 if (s8->contains(c)) {
2008 break;
2009 }
2010 } else {
2011 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
2012 if (s->contains(c)) {
2013 // The character is in the set. A Match.
2014 break;
b75a7d8f 2015 }
46f4442e
A
2016 }
2017 // the character wasn't in the set. Back track out.
2018 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f 2019 break;
46f4442e 2020
b75a7d8f
A
2021
2022 case URX_DOTANY:
2023 {
2024 // . matches anything, but stops at end-of-line.
46f4442e 2025 if (fp->fInputIdx >= fActiveLimit) {
b75a7d8f 2026 // At end of input. Match failed. Backtrack out.
46f4442e
A
2027 fHitEnd = TRUE;
2028 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
2029 break;
2030 }
2031 // There is input left. Advance over one char, unless we've hit end-of-line
2032 UChar32 c;
46f4442e 2033 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
b75a7d8f 2034 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
73c04bcf 2035 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
b75a7d8f 2036 // End of line in normal mode. . does not match.
46f4442e 2037 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
2038 break;
2039 }
2040 }
2041 break;
46f4442e
A
2042
2043
b75a7d8f
A
2044 case URX_DOTANY_ALL:
2045 {
2046 // ., in dot-matches-all (including new lines) mode
46f4442e 2047 if (fp->fInputIdx >= fActiveLimit) {
b75a7d8f 2048 // At end of input. Match failed. Backtrack out.
46f4442e
A
2049 fHitEnd = TRUE;
2050 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
2051 break;
2052 }
2053 // There is input left. Advance over one char, except if we are
2054 // at a cr/lf, advance over both of them.
2055 UChar32 c;
46f4442e
A
2056 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
2057 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
b75a7d8f
A
2058 // In the case of a CR/LF, we need to advance over both.
2059 UChar nextc = inputBuf[fp->fInputIdx];
2060 if (nextc == 0x0a) {
2061 fp->fInputIdx++;
2062 }
2063 }
2064 }
2065 break;
2066
46f4442e
A
2067
2068 case URX_DOTANY_UNIX:
b75a7d8f 2069 {
46f4442e
A
2070 // '.' operator, matches all, but stops at end-of-line.
2071 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
2072 if (fp->fInputIdx >= fActiveLimit) {
2073 // At end of input. Match failed. Backtrack out.
2074 fHitEnd = TRUE;
2075 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
2076 break;
2077 }
46f4442e 2078 // There is input left. Advance over one char, unless we've hit end-of-line
b75a7d8f 2079 UChar32 c;
46f4442e
A
2080 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
2081 if (c == 0x0a) {
2082 // End of line in normal mode. '.' does not match the \n
2083 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
2084 }
2085 }
2086 break;
2087
2088
2089 case URX_JMP:
2090 fp->fPatIdx = opValue;
2091 break;
2092
2093 case URX_FAIL:
2094 isMatch = FALSE;
2095 goto breakFromLoop;
2096
2097 case URX_JMP_SAV:
2098 U_ASSERT(opValue < fPattern->fCompiledPat->size());
46f4442e
A
2099 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
2100 fp->fPatIdx = opValue; // Then JMP.
b75a7d8f
A
2101 break;
2102
2103 case URX_JMP_SAV_X:
2104 // This opcode is used with (x)+, when x can match a zero length string.
2105 // Same as JMP_SAV, except conditional on the match having made forward progress.
2106 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
2107 // data address of the input position at the start of the loop.
2108 {
2109 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
2110 int32_t stoOp = pat[opValue-1];
2111 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
2112 int32_t frameLoc = URX_VAL(stoOp);
46f4442e 2113 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
b75a7d8f
A
2114 int32_t prevInputIdx = fp->fExtra[frameLoc];
2115 U_ASSERT(prevInputIdx <= fp->fInputIdx);
2116 if (prevInputIdx < fp->fInputIdx) {
2117 // The match did make progress. Repeat the loop.
46f4442e 2118 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
b75a7d8f
A
2119 fp->fPatIdx = opValue;
2120 fp->fExtra[frameLoc] = fp->fInputIdx;
2121 }
2122 // If the input position did not advance, we do nothing here,
2123 // execution will fall out of the loop.
2124 }
2125 break;
2126
2127 case URX_CTR_INIT:
2128 {
46f4442e 2129 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
b75a7d8f
A
2130 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
2131
2132 // Pick up the three extra operands that CTR_INIT has, and
2133 // skip the pattern location counter past
2134 int32_t instrOperandLoc = fp->fPatIdx;
2135 fp->fPatIdx += 3;
2136 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
2137 int32_t minCount = pat[instrOperandLoc+1];
2138 int32_t maxCount = pat[instrOperandLoc+2];
2139 U_ASSERT(minCount>=0);
2140 U_ASSERT(maxCount>=minCount || maxCount==-1);
2141 U_ASSERT(loopLoc>fp->fPatIdx);
2142
2143 if (minCount == 0) {
46f4442e 2144 fp = StateSave(fp, loopLoc+1, status);
b75a7d8f
A
2145 }
2146 if (maxCount == 0) {
46f4442e 2147 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
2148 }
2149 }
2150 break;
2151
2152 case URX_CTR_LOOP:
2153 {
2154 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
2155 int32_t initOp = pat[opValue];
2156 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
2157 int32_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
2158 int32_t minCount = pat[opValue+2];
2159 int32_t maxCount = pat[opValue+3];
2160 // Increment the counter. Note: we're not worrying about counter
2161 // overflow, since the data comes from UnicodeStrings, which
2162 // stores its length in an int32_t.
2163 (*pCounter)++;
2164 U_ASSERT(*pCounter > 0);
2165 if ((uint32_t)*pCounter >= (uint32_t)maxCount) {
2166 U_ASSERT(*pCounter == maxCount || maxCount == -1);
2167 break;
2168 }
2169 if (*pCounter >= minCount) {
46f4442e 2170 fp = StateSave(fp, fp->fPatIdx, status);
b75a7d8f
A
2171 }
2172 fp->fPatIdx = opValue + 4; // Loop back.
2173 }
2174 break;
2175
2176 case URX_CTR_INIT_NG:
2177 {
46f4442e
A
2178 // Initialize a non-greedy loop
2179 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
b75a7d8f
A
2180 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
2181
2182 // Pick up the three extra operands that CTR_INIT has, and
2183 // skip the pattern location counter past
2184 int32_t instrOperandLoc = fp->fPatIdx;
2185 fp->fPatIdx += 3;
2186 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
2187 int32_t minCount = pat[instrOperandLoc+1];
2188 int32_t maxCount = pat[instrOperandLoc+2];
2189 U_ASSERT(minCount>=0);
2190 U_ASSERT(maxCount>=minCount || maxCount==-1);
2191 U_ASSERT(loopLoc>fp->fPatIdx);
2192
2193 if (minCount == 0) {
2194 if (maxCount != 0) {
46f4442e 2195 fp = StateSave(fp, fp->fPatIdx, status);
b75a7d8f
A
2196 }
2197 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
2198 }
2199 }
2200 break;
2201
2202 case URX_CTR_LOOP_NG:
2203 {
46f4442e 2204 // Non-greedy {min, max} loops
b75a7d8f
A
2205 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
2206 int32_t initOp = pat[opValue];
2207 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
2208 int32_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
2209 int32_t minCount = pat[opValue+2];
2210 int32_t maxCount = pat[opValue+3];
2211 // Increment the counter. Note: we're not worrying about counter
2212 // overflow, since the data comes from UnicodeStrings, which
2213 // stores its length in an int32_t.
2214 (*pCounter)++;
2215 U_ASSERT(*pCounter > 0);
2216
2217 if ((uint32_t)*pCounter >= (uint32_t)maxCount) {
2218 // The loop has matched the maximum permitted number of times.
2219 // Break out of here with no action. Matching will
2220 // continue with the following pattern.
2221 U_ASSERT(*pCounter == maxCount || maxCount == -1);
2222 break;
2223 }
2224
2225 if (*pCounter < minCount) {
2226 // We haven't met the minimum number of matches yet.
2227 // Loop back for another one.
2228 fp->fPatIdx = opValue + 4; // Loop back.
2229 } else {
2230 // We do have the minimum number of matches.
2231 // Fall into the following pattern, but first do
2232 // a state save to the top of the loop, so that a failure
2233 // in the following pattern will try another iteration of the loop.
46f4442e 2234 fp = StateSave(fp, opValue + 4, status);
b75a7d8f
A
2235 }
2236 }
2237 break;
2238
b75a7d8f
A
2239 case URX_STO_SP:
2240 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
2241 fData[opValue] = fStack->size();
2242 break;
2243
2244 case URX_LD_SP:
2245 {
2246 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
2247 int32_t newStackSize = fData[opValue];
2248 U_ASSERT(newStackSize <= fStack->size());
46f4442e 2249 int32_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
b75a7d8f
A
2250 if (newFP == (int32_t *)fp) {
2251 break;
2252 }
2253 int32_t i;
46f4442e 2254 for (i=0; i<fFrameSize; i++) {
b75a7d8f
A
2255 newFP[i] = ((int32_t *)fp)[i];
2256 }
2257 fp = (REStackFrame *)newFP;
2258 fStack->setSize(newStackSize);
2259 }
2260 break;
2261
2262 case URX_BACKREF:
2263 case URX_BACKREF_I:
2264 {
46f4442e 2265 U_ASSERT(opValue < fFrameSize);
b75a7d8f
A
2266 int32_t groupStartIdx = fp->fExtra[opValue];
2267 int32_t groupEndIdx = fp->fExtra[opValue+1];
2268 U_ASSERT(groupStartIdx <= groupEndIdx);
2269 int32_t len = groupEndIdx-groupStartIdx;
2270 if (groupStartIdx < 0) {
2271 // This capture group has not participated in the match thus far,
46f4442e 2272 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
b75a7d8f
A
2273 }
2274
2275 if (len == 0) {
2276 // The capture group match was of an empty string.
2277 // Verified by testing: Perl matches succeed in this case, so
2278 // we do too.
2279 break;
2280 }
374ca955 2281
b75a7d8f 2282 UBool haveMatch = FALSE;
46f4442e 2283 if (fp->fInputIdx + len <= fActiveLimit) {
b75a7d8f
A
2284 if (opType == URX_BACKREF) {
2285 if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, len) == 0) {
2286 haveMatch = TRUE;
2287 }
2288 } else {
2289 if (u_strncasecmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx,
2290 len, U_FOLD_CASE_DEFAULT) == 0) {
2291 haveMatch = TRUE;
2292 }
2293 }
46f4442e
A
2294 } else {
2295 // TODO: probably need to do a partial string comparison, and only
2296 // set HitEnd if the available input matched. Ticket #6074
2297 fHitEnd = TRUE;
b75a7d8f
A
2298 }
2299 if (haveMatch) {
2300 fp->fInputIdx += len; // Match. Advance current input position.
2301 } else {
46f4442e 2302 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
b75a7d8f
A
2303 }
2304 }
2305 break;
2306
2307 case URX_STO_INP_LOC:
2308 {
46f4442e 2309 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
b75a7d8f
A
2310 fp->fExtra[opValue] = fp->fInputIdx;
2311 }
2312 break;
2313
2314 case URX_JMPX:
2315 {
2316 int32_t instrOperandLoc = fp->fPatIdx;
2317 fp->fPatIdx += 1;
2318 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
46f4442e 2319 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
b75a7d8f
A
2320 int32_t savedInputIdx = fp->fExtra[dataLoc];
2321 U_ASSERT(savedInputIdx <= fp->fInputIdx);
2322 if (savedInputIdx < fp->fInputIdx) {
2323 fp->fPatIdx = opValue; // JMP
2324 } else {
46f4442e 2325 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
b75a7d8f
A
2326 }
2327 }
2328 break;
2329
2330 case URX_LA_START:
2331 {
2332 // Entering a lookahead block.
2333 // Save Stack Ptr, Input Pos.
2334 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2335 fData[opValue] = fStack->size();
2336 fData[opValue+1] = fp->fInputIdx;
46f4442e
A
2337 fActiveStart = fLookStart; // Set the match region change for
2338 fActiveLimit = fLookLimit; // transparent bounds.
b75a7d8f
A
2339 }
2340 break;
2341
2342 case URX_LA_END:
2343 {
2344 // Leaving a look-ahead block.
2345 // restore Stack Ptr, Input Pos to positions they had on entry to block.
2346 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2347 int32_t stackSize = fStack->size();
2348 int32_t newStackSize = fData[opValue];
2349 U_ASSERT(stackSize >= newStackSize);
2350 if (stackSize > newStackSize) {
46f4442e
A
2351 // Copy the current top frame back to the new (cut back) top frame.
2352 // This makes the capture groups from within the look-ahead
2353 // expression available.
2354 int32_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
b75a7d8f 2355 int32_t i;
46f4442e 2356 for (i=0; i<fFrameSize; i++) {
b75a7d8f
A
2357 newFP[i] = ((int32_t *)fp)[i];
2358 }
2359 fp = (REStackFrame *)newFP;
2360 fStack->setSize(newStackSize);
2361 }
2362 fp->fInputIdx = fData[opValue+1];
46f4442e
A
2363
2364 // Restore the active region bounds in the input string; they may have
2365 // been changed because of transparent bounds on a Region.
2366 fActiveStart = fRegionStart;
2367 fActiveLimit = fRegionLimit;
b75a7d8f
A
2368 }
2369 break;
2370
2371 case URX_ONECHAR_I:
46f4442e 2372 if (fp->fInputIdx < fActiveLimit) {
b75a7d8f 2373 UChar32 c;
46f4442e
A
2374 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
2375 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
b75a7d8f
A
2376 break;
2377 }
46f4442e
A
2378 } else {
2379 fHitEnd = TRUE;
2380 }
2381 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
2382 break;
2383
2384 case URX_STRING_I:
2385 {
2386 // Test input against a literal string.
2387 // Strings require two slots in the compiled pattern, one for the
2388 // offset to the string text, and one for the length.
2389 int32_t stringStartIdx, stringLen;
2390 stringStartIdx = opValue;
2391
2392 op = pat[fp->fPatIdx];
2393 fp->fPatIdx++;
2394 opType = URX_TYPE(op);
2395 opValue = URX_VAL(op);
2396 U_ASSERT(opType == URX_STRING_LEN);
2397 stringLen = opValue;
374ca955 2398
b75a7d8f 2399 int32_t stringEndIndex = fp->fInputIdx + stringLen;
46f4442e 2400 if (stringEndIndex <= fActiveLimit) {
374ca955
A
2401 if (u_strncasecmp(inputBuf+fp->fInputIdx, litText+stringStartIdx,
2402 stringLen, U_FOLD_CASE_DEFAULT) == 0) {
2403 // Success. Advance the current input position.
2404 fp->fInputIdx = stringEndIndex;
2405 break;
2406 }
46f4442e
A
2407 } else {
2408 // Insufficent input left for a match.
2409 fHitEnd = TRUE; // See ticket 6074
2410 }
374ca955 2411 // No match. Back up matching to a saved state
46f4442e 2412 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
2413 }
2414 break;
2415
2416 case URX_LB_START:
2417 {
2418 // Entering a look-behind block.
2419 // Save Stack Ptr, Input Pos.
46f4442e 2420 // TODO: implement transparent bounds. Ticket #6067
b75a7d8f
A
2421 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2422 fData[opValue] = fStack->size();
2423 fData[opValue+1] = fp->fInputIdx;
2424 // Init the variable containing the start index for attempted matches.
2425 fData[opValue+2] = -1;
2426 // Save input string length, then reset to pin any matches to end at
2427 // the current position.
46f4442e
A
2428 fData[opValue+3] = fActiveLimit;
2429 fActiveLimit = fp->fInputIdx;
b75a7d8f
A
2430 }
2431 break;
2432
2433
2434 case URX_LB_CONT:
2435 {
2436 // Positive Look-Behind, at top of loop checking for matches of LB expression
2437 // at all possible input starting positions.
2438
2439 // Fetch the min and max possible match lengths. They are the operands
2440 // of this op in the pattern.
2441 int32_t minML = pat[fp->fPatIdx++];
2442 int32_t maxML = pat[fp->fPatIdx++];
2443 U_ASSERT(minML <= maxML);
2444 U_ASSERT(minML >= 0);
2445
2446 // Fetch (from data) the last input index where a match was attempted.
2447 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2448 int32_t *lbStartIdx = &fData[opValue+2];
2449 if (*lbStartIdx < 0) {
2450 // First time through loop.
2451 *lbStartIdx = fp->fInputIdx - minML;
2452 } else {
2453 // 2nd through nth time through the loop.
2454 // Back up start position for match by one.
2455 if (*lbStartIdx == 0) {
2456 (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0.
2457 } else {
2458 U16_BACK_1(inputBuf, 0, *lbStartIdx);
2459 }
2460 }
2461
2462 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
2463 // We have tried all potential match starting points without
2464 // getting a match. Backtrack out, and out of the
2465 // Look Behind altogether.
46f4442e 2466 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f 2467 int32_t restoreInputLen = fData[opValue+3];
46f4442e 2468 U_ASSERT(restoreInputLen >= fActiveLimit);
b75a7d8f 2469 U_ASSERT(restoreInputLen <= fInput->length());
46f4442e 2470 fActiveLimit = restoreInputLen;
b75a7d8f
A
2471 break;
2472 }
2473
2474 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
2475 // (successful match will fall off the end of the loop.)
46f4442e 2476 fp = StateSave(fp, fp->fPatIdx-3, status);
b75a7d8f
A
2477 fp->fInputIdx = *lbStartIdx;
2478 }
2479 break;
2480
2481 case URX_LB_END:
2482 // End of a look-behind block, after a successful match.
2483 {
2484 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
46f4442e 2485 if (fp->fInputIdx != fActiveLimit) {
b75a7d8f
A
2486 // The look-behind expression matched, but the match did not
2487 // extend all the way to the point that we are looking behind from.
2488 // FAIL out of here, which will take us back to the LB_CONT, which
2489 // will retry the match starting at another position or fail
2490 // the look-behind altogether, whichever is appropriate.
46f4442e 2491 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
2492 break;
2493 }
2494
2495 // Look-behind match is good. Restore the orignal input string length,
2496 // which had been truncated to pin the end of the lookbehind match to the
2497 // position being looked-behind.
2498 int32_t originalInputLen = fData[opValue+3];
46f4442e 2499 U_ASSERT(originalInputLen >= fActiveLimit);
b75a7d8f 2500 U_ASSERT(originalInputLen <= fInput->length());
46f4442e 2501 fActiveLimit = originalInputLen;
b75a7d8f
A
2502 }
2503 break;
2504
2505
2506 case URX_LBN_CONT:
2507 {
2508 // Negative Look-Behind, at top of loop checking for matches of LB expression
2509 // at all possible input starting positions.
2510
2511 // Fetch the extra parameters of this op.
2512 int32_t minML = pat[fp->fPatIdx++];
2513 int32_t maxML = pat[fp->fPatIdx++];
2514 int32_t continueLoc = pat[fp->fPatIdx++];
2515 continueLoc = URX_VAL(continueLoc);
2516 U_ASSERT(minML <= maxML);
2517 U_ASSERT(minML >= 0);
2518 U_ASSERT(continueLoc > fp->fPatIdx);
2519
2520 // Fetch (from data) the last input index where a match was attempted.
2521 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2522 int32_t *lbStartIdx = &fData[opValue+2];
2523 if (*lbStartIdx < 0) {
2524 // First time through loop.
2525 *lbStartIdx = fp->fInputIdx - minML;
2526 } else {
2527 // 2nd through nth time through the loop.
2528 // Back up start position for match by one.
2529 if (*lbStartIdx == 0) {
2530 (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0.
2531 } else {
2532 U16_BACK_1(inputBuf, 0, *lbStartIdx);
2533 }
2534 }
2535
2536 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
2537 // We have tried all potential match starting points without
2538 // getting a match, which means that the negative lookbehind as
2539 // a whole has succeeded. Jump forward to the continue location
2540 int32_t restoreInputLen = fData[opValue+3];
46f4442e 2541 U_ASSERT(restoreInputLen >= fActiveLimit);
b75a7d8f 2542 U_ASSERT(restoreInputLen <= fInput->length());
46f4442e 2543 fActiveLimit = restoreInputLen;
b75a7d8f
A
2544 fp->fPatIdx = continueLoc;
2545 break;
2546 }
2547
2548 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
2549 // (successful match will cause a FAIL out of the loop altogether.)
46f4442e 2550 fp = StateSave(fp, fp->fPatIdx-4, status);
b75a7d8f
A
2551 fp->fInputIdx = *lbStartIdx;
2552 }
2553 break;
2554
2555 case URX_LBN_END:
2556 // End of a negative look-behind block, after a successful match.
2557 {
2558 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
46f4442e 2559 if (fp->fInputIdx != fActiveLimit) {
b75a7d8f
A
2560 // The look-behind expression matched, but the match did not
2561 // extend all the way to the point that we are looking behind from.
2562 // FAIL out of here, which will take us back to the LB_CONT, which
2563 // will retry the match starting at another position or succeed
2564 // the look-behind altogether, whichever is appropriate.
46f4442e 2565 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
2566 break;
2567 }
2568
2569 // Look-behind expression matched, which means look-behind test as
2570 // a whole Fails
2571
2572 // Restore the orignal input string length, which had been truncated
2573 // inorder to pin the end of the lookbehind match
2574 // to the position being looked-behind.
2575 int32_t originalInputLen = fData[opValue+3];
46f4442e 2576 U_ASSERT(originalInputLen >= fActiveLimit);
b75a7d8f 2577 U_ASSERT(originalInputLen <= fInput->length());
46f4442e 2578 fActiveLimit = originalInputLen;
b75a7d8f
A
2579
2580 // Restore original stack position, discarding any state saved
2581 // by the successful pattern match.
2582 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2583 int32_t newStackSize = fData[opValue];
2584 U_ASSERT(fStack->size() > newStackSize);
2585 fStack->setSize(newStackSize);
2586
2587 // FAIL, which will take control back to someplace
2588 // prior to entering the look-behind test.
46f4442e 2589 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
2590 }
2591 break;
2592
2593
2594 case URX_LOOP_SR_I:
2595 // Loop Initialization for the optimized implementation of
2596 // [some character set]*
2597 // This op scans through all matching input.
2598 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
2599 {
2600 U_ASSERT(opValue > 0 && opValue < sets->size());
2601 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
2602 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
2603
2604 // Loop through input, until either the input is exhausted or
2605 // we reach a character that is not a member of the set.
2606 int32_t ix = fp->fInputIdx;
2607 for (;;) {
46f4442e
A
2608 if (ix >= fActiveLimit) {
2609 fHitEnd = TRUE;
b75a7d8f
A
2610 break;
2611 }
2612 UChar32 c;
46f4442e 2613 U16_NEXT(inputBuf, ix, fActiveLimit, c);
b75a7d8f
A
2614 if (c<256) {
2615 if (s8->contains(c) == FALSE) {
2616 U16_BACK_1(inputBuf, 0, ix);
2617 break;
2618 }
2619 } else {
2620 if (s->contains(c) == FALSE) {
2621 U16_BACK_1(inputBuf, 0, ix);
2622 break;
2623 }
2624 }
2625 }
2626
2627 // If there were no matching characters, skip over the loop altogether.
2628 // The loop doesn't run at all, a * op always succeeds.
2629 if (ix == fp->fInputIdx) {
2630 fp->fPatIdx++; // skip the URX_LOOP_C op.
2631 break;
2632 }
2633
2634 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
2635 // must follow. It's operand is the stack location
2636 // that holds the starting input index for the match of this [set]*
2637 int32_t loopcOp = pat[fp->fPatIdx];
2638 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
2639 int32_t stackLoc = URX_VAL(loopcOp);
46f4442e 2640 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
b75a7d8f
A
2641 fp->fExtra[stackLoc] = fp->fInputIdx;
2642 fp->fInputIdx = ix;
2643
2644 // Save State to the URX_LOOP_C op that follows this one,
2645 // so that match failures in the following code will return to there.
2646 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
46f4442e 2647 fp = StateSave(fp, fp->fPatIdx, status);
b75a7d8f
A
2648 fp->fPatIdx++;
2649 }
2650 break;
2651
2652
2653 case URX_LOOP_DOT_I:
2654 // Loop Initialization for the optimized implementation of .*
2655 // This op scans through all remaining input.
2656 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
2657 {
2658 // Loop through input until the input is exhausted (we reach an end-of-line)
46f4442e 2659 // In DOTALL mode, we can just go straight to the end of the input.
374ca955 2660 int32_t ix;
46f4442e
A
2661 if ((opValue & 1) == 1) {
2662 // Dot-matches-All mode. Jump straight to the end of the string.
2663 ix = fActiveLimit;
2664 fHitEnd = TRUE;
374ca955 2665 } else {
46f4442e 2666 // NOT DOT ALL mode. Line endings do not match '.'
b75a7d8f
A
2667 // Scan forward until a line ending or end of input.
2668 ix = fp->fInputIdx;
2669 for (;;) {
46f4442e
A
2670 if (ix >= fActiveLimit) {
2671 fHitEnd = TRUE;
2672 ix = fActiveLimit;
b75a7d8f
A
2673 break;
2674 }
2675 UChar32 c;
46f4442e
A
2676 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++]
2677 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
2678 if ((c == 0x0a) || // 0x0a is newline in both modes.
2679 ((opValue & 2) == 0) && // IF not UNIX_LINES mode
2680 (c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029) {
2681 // char is a line ending. Put the input pos back to the
2682 // line ending char, and exit the scanning loop.
2683 U16_BACK_1(inputBuf, 0, ix);
2684 break;
2685 }
b75a7d8f
A
2686 }
2687 }
2688 }
46f4442e 2689
b75a7d8f
A
2690 // If there were no matching characters, skip over the loop altogether.
2691 // The loop doesn't run at all, a * op always succeeds.
2692 if (ix == fp->fInputIdx) {
2693 fp->fPatIdx++; // skip the URX_LOOP_C op.
2694 break;
2695 }
2696
2697 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
2698 // must follow. It's operand is the stack location
46f4442e 2699 // that holds the starting input index for the match of this .*
b75a7d8f
A
2700 int32_t loopcOp = pat[fp->fPatIdx];
2701 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
2702 int32_t stackLoc = URX_VAL(loopcOp);
46f4442e 2703 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
b75a7d8f
A
2704 fp->fExtra[stackLoc] = fp->fInputIdx;
2705 fp->fInputIdx = ix;
2706
2707 // Save State to the URX_LOOP_C op that follows this one,
2708 // so that match failures in the following code will return to there.
2709 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
46f4442e 2710 fp = StateSave(fp, fp->fPatIdx, status);
b75a7d8f
A
2711 fp->fPatIdx++;
2712 }
2713 break;
2714
2715
2716 case URX_LOOP_C:
2717 {
46f4442e 2718 U_ASSERT(opValue>=0 && opValue<fFrameSize);
b75a7d8f
A
2719 int32_t terminalIdx = fp->fExtra[opValue];
2720 U_ASSERT(terminalIdx <= fp->fInputIdx);
2721 if (terminalIdx == fp->fInputIdx) {
2722 // We've backed up the input idx to the point that the loop started.
2723 // The loop is done. Leave here without saving state.
2724 // Subsequent failures won't come back here.
2725 break;
2726 }
2727 // Set up for the next iteration of the loop, with input index
2728 // backed up by one from the last time through,
2729 // and a state save to this instruction in case the following code fails again.
2730 // (We're going backwards because this loop emulates stack unwinding, not
2731 // the initial scan forward.)
2732 U_ASSERT(fp->fInputIdx > 0);
2733 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
2734 if (inputBuf[fp->fInputIdx] == 0x0a &&
2735 fp->fInputIdx > terminalIdx &&
2736 inputBuf[fp->fInputIdx-1] == 0x0d) {
2737 int32_t prevOp = pat[fp->fPatIdx-2];
2738 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
2739 // .*, stepping back over CRLF pair.
2740 fp->fInputIdx--;
2741 }
2742 }
2743
2744
46f4442e 2745 fp = StateSave(fp, fp->fPatIdx-1, status);
b75a7d8f
A
2746 }
2747 break;
2748
2749
2750
2751 default:
2752 // Trouble. The compiled pattern contains an entry with an
2753 // unrecognized type tag.
2754 U_ASSERT(FALSE);
2755 }
2756
2757 if (U_FAILURE(status)) {
46f4442e 2758 isMatch = FALSE;
b75a7d8f
A
2759 break;
2760 }
2761 }
2762
2763breakFromLoop:
2764 fMatch = isMatch;
2765 if (isMatch) {
2766 fLastMatchEnd = fMatchEnd;
2767 fMatchStart = startIdx;
2768 fMatchEnd = fp->fInputIdx;
2769 if (fTraceDebug) {
374ca955 2770 REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd));
b75a7d8f
A
2771 }
2772 }
2773 else
2774 {
2775 if (fTraceDebug) {
374ca955 2776 REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
b75a7d8f
A
2777 }
2778 }
2779
2780 fFrame = fp; // The active stack frame when the engine stopped.
2781 // Contains the capture group results that we need to
2782 // access later.
2783
2784 return;
2785}
2786
2787
2788
374ca955 2789UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
b75a7d8f
A
2790
2791U_NAMESPACE_END
2792
2793#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
2794