]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/rematch.cpp
ICU-62141.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / rematch.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **************************************************************************
5 * Copyright (C) 2002-2016 International Business Machines Corporation
6 * and others. All rights reserved.
7 **************************************************************************
8 */
9 //
10 // file: rematch.cpp
11 //
12 // Contains the implementation of class RegexMatcher,
13 // which is one of the main API classes for the ICU regular expression package.
14 //
15
16 #include "unicode/utypes.h"
17 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
18
19 #include "unicode/regex.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/ustring.h"
23 #include "unicode/rbbi.h"
24 #include "unicode/utf.h"
25 #include "unicode/utf16.h"
26 #include "uassert.h"
27 #include "cmemory.h"
28 #include "cstr.h"
29 #include "uvector.h"
30 #include "uvectr32.h"
31 #include "uvectr64.h"
32 #include "regeximp.h"
33 #include "regexst.h"
34 #include "regextxt.h"
35 #include "ucase.h"
36
37 // #include <malloc.h> // Needed for heapcheck testing
38
39
40 U_NAMESPACE_BEGIN
41
42 // Default limit for the size of the backtrack stack, to avoid system
43 // failures caused by heap exhaustion. Units are in 32 bit words, not bytes.
44 // This value puts ICU's limits higher than most other regexp implementations,
45 // which use recursion rather than the heap, and take more storage per
46 // backtrack point.
47 // NOTE(review): init2() allocates the stack as a UVector64 -- confirm the stated unit.
48 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
49
50 // Time limit counter constant.
51 // Time limits for expression evaluation are in terms of quanta of work by
52 // the engine, each of which is 10,000 state saves.
53 // This constant is the number of state saves per timer tick.
54 static const int32_t TIMER_INITIAL_VALUE = 10000;
55
56
57 // Test for any of the Unicode line terminating characters.
58 static inline UBool isLineTerminator(UChar32 c) {
59 if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
60 return false;
61 }
62 return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029;
63 }
64
65 //-----------------------------------------------------------------------------
66 //
67 // Constructor and Destructor
68 //
69 //-----------------------------------------------------------------------------
70 RegexMatcher::RegexMatcher(const RegexPattern *pat) {
71 fDeferredStatus = U_ZERO_ERROR;
72 init(fDeferredStatus);
73 if (U_FAILURE(fDeferredStatus)) {
74 return;
75 }
76 if (pat==NULL) {
77 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
78 return;
79 }
80 fPattern = pat;
81 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
82 }
83
84
85
86 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
87 uint32_t flags, UErrorCode &status) {
88 init(status);
89 if (U_FAILURE(status)) {
90 return;
91 }
92 UParseError pe;
93 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
94 fPattern = fPatternOwned;
95
96 UText inputText = UTEXT_INITIALIZER;
97 utext_openConstUnicodeString(&inputText, &input, &status);
98 init2(&inputText, status);
99 utext_close(&inputText);
100
101 fInputUniStrMaybeMutable = TRUE;
102 }
103
104
105 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
106 uint32_t flags, UErrorCode &status) {
107 init(status);
108 if (U_FAILURE(status)) {
109 return;
110 }
111 UParseError pe;
112 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
113 if (U_FAILURE(status)) {
114 return;
115 }
116
117 fPattern = fPatternOwned;
118 init2(input, status);
119 }
120
121
122 RegexMatcher::RegexMatcher(const UnicodeString &regexp,
123 uint32_t flags, UErrorCode &status) {
124 init(status);
125 if (U_FAILURE(status)) {
126 return;
127 }
128 UParseError pe;
129 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
130 if (U_FAILURE(status)) {
131 return;
132 }
133 fPattern = fPatternOwned;
134 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
135 }
136
137 RegexMatcher::RegexMatcher(UText *regexp,
138 uint32_t flags, UErrorCode &status) {
139 init(status);
140 if (U_FAILURE(status)) {
141 return;
142 }
143 UParseError pe;
144 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
145 if (U_FAILURE(status)) {
146 return;
147 }
148
149 fPattern = fPatternOwned;
150 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
151 }
152
153
154
155
156 RegexMatcher::~RegexMatcher() {
157 delete fStack;
158 if (fData != fSmallData) {
159 uprv_free(fData);
160 fData = NULL;
161 }
162 if (fPatternOwned) {
163 delete fPatternOwned;
164 fPatternOwned = NULL;
165 fPattern = NULL;
166 }
167
168 if (fInput) {
169 delete fInput;
170 }
171 if (fInputText) {
172 utext_close(fInputText);
173 }
174 if (fAltInputText) {
175 utext_close(fAltInputText);
176 }
177
178 #if UCONFIG_NO_BREAK_ITERATION==0
179 delete fWordBreakItr;
180 #endif
181 }
182
183 //
184 // init() common initialization for use by all constructors.
185 // Initialize all fields, get the object into a consistent state.
186 // This must be done even when the initial status shows an error,
187 // so that the object is initialized sufficiently well for the destructor
188 // to run safely.
189 //
190 void RegexMatcher::init(UErrorCode &status) {
191 fPattern = NULL;
192 fPatternOwned = NULL;
193 fFrameSize = 0;
194 fRegionStart = 0;
195 fRegionLimit = 0;
196 fAnchorStart = 0;
197 fAnchorLimit = 0;
198 fLookStart = 0;
199 fLookLimit = 0;
200 fActiveStart = 0;
201 fActiveLimit = 0;
202 fTransparentBounds = FALSE;
203 fAnchoringBounds = TRUE;
204 fMatch = FALSE;
205 fMatchStart = 0;
206 fMatchEnd = 0;
207 fLastMatchEnd = -1;
208 fAppendPosition = 0;
209 fHitEnd = FALSE;
210 fRequireEnd = FALSE;
211 fStack = NULL;
212 fFrame = NULL;
213 fTimeLimit = 0;
214 fTime = 0;
215 fTickCounter = 0;
216 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY;
217 fCallbackFn = NULL;
218 fCallbackContext = NULL;
219 fFindProgressCallbackFn = NULL;
220 fFindProgressCallbackContext = NULL;
221 fTraceDebug = FALSE;
222 fDeferredStatus = status;
223 fData = fSmallData;
224 fWordBreakItr = NULL;
225
226 fStack = NULL;
227 fInputText = NULL;
228 fAltInputText = NULL;
229 fInput = NULL;
230 fInputLength = 0;
231 fInputUniStrMaybeMutable = FALSE;
232 }
233
234 //
235 // init2() Common initialization for use by RegexMatcher constructors, part 2.
236 // This handles the common setup to be done after the Pattern is available.
237 //
238 void RegexMatcher::init2(UText *input, UErrorCode &status) {
239 if (U_FAILURE(status)) {
240 fDeferredStatus = status;
241 return;
242 }
243
244 if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) {
245 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
246 if (fData == NULL) {
247 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
248 return;
249 }
250 }
251
252 fStack = new UVector64(status);
253 if (fStack == NULL) {
254 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
255 return;
256 }
257
258 reset(input);
259 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
260 if (U_FAILURE(status)) {
261 fDeferredStatus = status;
262 return;
263 }
264 }
265
266
267 static const UChar BACKSLASH = 0x5c;  // U+005C REVERSE SOLIDUS, starts an escape in replacement text
268 static const UChar DOLLARSIGN = 0x24;  // U+0024 DOLLAR SIGN, starts a capture group reference
269 static const UChar LEFTBRACKET = 0x7b;  // U+007B LEFT CURLY BRACKET, opens a ${name} reference
270 static const UChar RIGHTBRACKET = 0x7d;  // U+007D RIGHT CURLY BRACKET, closes a ${name} reference
271
272 //--------------------------------------------------------------------------------
273 //
274 // appendReplacement
275 //
276 //--------------------------------------------------------------------------------
277 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
278 const UnicodeString &replacement,
279 UErrorCode &status) {
280 UText replacementText = UTEXT_INITIALIZER;
281
282 utext_openConstUnicodeString(&replacementText, &replacement, &status);
283 if (U_SUCCESS(status)) {
284 UText resultText = UTEXT_INITIALIZER;
285 utext_openUnicodeString(&resultText, &dest, &status);
286
287 if (U_SUCCESS(status)) {
288 appendReplacement(&resultText, &replacementText, status);
289 utext_close(&resultText);
290 }
291 utext_close(&replacementText);
292 }
293
294 return *this;
295 }
296
297 //
298 // appendReplacement, UText mode
299 //
300 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
301 UText *replacement,
302 UErrorCode &status) {
303 if (U_FAILURE(status)) {
304 return *this;
305 }
306 if (U_FAILURE(fDeferredStatus)) {
307 status = fDeferredStatus;
308 return *this;
309 }
310 if (fMatch == FALSE) {
311 status = U_REGEX_INVALID_STATE;
312 return *this;
313 }
314
315 // Copy input string from the end of previous match to start of current match
316 int64_t destLen = utext_nativeLength(dest);
317 if (fMatchStart > fAppendPosition) {
318 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
319 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
320 (int32_t)(fMatchStart-fAppendPosition), &status);
321 } else {
322 int32_t len16;
323 if (UTEXT_USES_U16(fInputText)) {
324 len16 = (int32_t)(fMatchStart-fAppendPosition);
325 } else {
326 UErrorCode lengthStatus = U_ZERO_ERROR;
327 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
328 }
329 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
330 if (inputChars == NULL) {
331 status = U_MEMORY_ALLOCATION_ERROR;
332 return *this;
333 }
334 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
335 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
336 uprv_free(inputChars);
337 }
338 }
339 fAppendPosition = fMatchEnd;
340
341
342 // scan the replacement text, looking for substitutions ($n) and \escapes.
343 // TODO: optimize this loop by efficiently scanning for '$' or '\',
344 // move entire ranges not containing substitutions.
345 UTEXT_SETNATIVEINDEX(replacement, 0);
346 for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL; c = UTEXT_NEXT32(replacement)) {
347 if (c == BACKSLASH) {
348 // Backslash Escape. Copy the following char out without further checks.
349 // Note: Surrogate pairs don't need any special handling
350 // The second half wont be a '$' or a '\', and
351 // will move to the dest normally on the next
352 // loop iteration.
353 c = UTEXT_CURRENT32(replacement);
354 if (c == U_SENTINEL) {
355 break;
356 }
357
358 if (c==0x55/*U*/ || c==0x75/*u*/) {
359 // We have a \udddd or \Udddddddd escape sequence.
360 int32_t offset = 0;
361 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
362 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
363 if (escapedChar != (UChar32)0xFFFFFFFF) {
364 if (U_IS_BMP(escapedChar)) {
365 UChar c16 = (UChar)escapedChar;
366 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
367 } else {
368 UChar surrogate[2];
369 surrogate[0] = U16_LEAD(escapedChar);
370 surrogate[1] = U16_TRAIL(escapedChar);
371 if (U_SUCCESS(status)) {
372 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
373 }
374 }
375 // TODO: Report errors for mal-formed \u escapes?
376 // As this is, the original sequence is output, which may be OK.
377 if (context.lastOffset == offset) {
378 (void)UTEXT_PREVIOUS32(replacement);
379 } else if (context.lastOffset != offset-1) {
380 utext_moveIndex32(replacement, offset - context.lastOffset - 1);
381 }
382 }
383 } else {
384 (void)UTEXT_NEXT32(replacement);
385 // Plain backslash escape. Just put out the escaped character.
386 if (U_IS_BMP(c)) {
387 UChar c16 = (UChar)c;
388 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
389 } else {
390 UChar surrogate[2];
391 surrogate[0] = U16_LEAD(c);
392 surrogate[1] = U16_TRAIL(c);
393 if (U_SUCCESS(status)) {
394 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
395 }
396 }
397 }
398 } else if (c != DOLLARSIGN) {
399 // Normal char, not a $. Copy it out without further checks.
400 if (U_IS_BMP(c)) {
401 UChar c16 = (UChar)c;
402 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
403 } else {
404 UChar surrogate[2];
405 surrogate[0] = U16_LEAD(c);
406 surrogate[1] = U16_TRAIL(c);
407 if (U_SUCCESS(status)) {
408 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
409 }
410 }
411 } else {
412 // We've got a $. Pick up a capture group name or number if one follows.
413 // Consume digits so long as the resulting group number <= the number of
414 // number of capture groups in the pattern.
415
416 int32_t groupNum = 0;
417 int32_t numDigits = 0;
418 UChar32 nextChar = utext_current32(replacement);
419 if (nextChar == LEFTBRACKET) {
420 // Scan for a Named Capture Group, ${name}.
421 UnicodeString groupName;
422 utext_next32(replacement);
423 while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
424 nextChar = utext_next32(replacement);
425 if (nextChar == U_SENTINEL) {
426 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
427 } else if ((nextChar >= 0x41 && nextChar <= 0x5a) || // A..Z
428 (nextChar >= 0x61 && nextChar <= 0x7a) || // a..z
429 (nextChar >= 0x31 && nextChar <= 0x39)) { // 0..9
430 groupName.append(nextChar);
431 } else if (nextChar == RIGHTBRACKET) {
432 groupNum = uhash_geti(fPattern->fNamedCaptureMap, &groupName);
433 if (groupNum == 0) {
434 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
435 }
436 } else {
437 // Character was something other than a name char or a closing '}'
438 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
439 }
440 }
441
442 } else if (u_isdigit(nextChar)) {
443 // $n Scan for a capture group number
444 int32_t numCaptureGroups = fPattern->fGroupMap->size();
445 for (;;) {
446 nextChar = UTEXT_CURRENT32(replacement);
447 if (nextChar == U_SENTINEL) {
448 break;
449 }
450 if (u_isdigit(nextChar) == FALSE) {
451 break;
452 }
453 int32_t nextDigitVal = u_charDigitValue(nextChar);
454 if (groupNum*10 + nextDigitVal > numCaptureGroups) {
455 // Don't consume the next digit if it makes the capture group number too big.
456 if (numDigits == 0) {
457 status = U_INDEX_OUTOFBOUNDS_ERROR;
458 }
459 break;
460 }
461 (void)UTEXT_NEXT32(replacement);
462 groupNum=groupNum*10 + nextDigitVal;
463 ++numDigits;
464 }
465 } else {
466 // $ not followed by capture group name or number.
467 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
468 }
469
470 if (U_SUCCESS(status)) {
471 destLen += appendGroup(groupNum, dest, status);
472 }
473 } // End of $ capture group handling
474 } // End of per-character loop through the replacement string.
475
476 return *this;
477 }
478
479
480
481 //--------------------------------------------------------------------------------
482 //
483 // appendTail Intended to be used in conjunction with appendReplacement()
484 // To the destination string, append everything following
485 // the last match position from the input string.
486 //
487 // Note: Match ranges do not affect appendTail or appendReplacement
488 //
489 //--------------------------------------------------------------------------------
490 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
491 UErrorCode status = U_ZERO_ERROR;
492 UText resultText = UTEXT_INITIALIZER;
493 utext_openUnicodeString(&resultText, &dest, &status);
494
495 if (U_SUCCESS(status)) {
496 appendTail(&resultText, status);
497 utext_close(&resultText);
498 }
499
500 return dest;
501 }
502
503 //
504 // appendTail, UText mode
505 //
506 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
507 if (U_FAILURE(status)) {
508 return dest;
509 }
510 if (U_FAILURE(fDeferredStatus)) {
511 status = fDeferredStatus;
512 return dest;
513 }
514
515 if (fInputLength > fAppendPosition) {
516 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
517 int64_t destLen = utext_nativeLength(dest);
518 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
519 (int32_t)(fInputLength-fAppendPosition), &status);
520 } else {
521 int32_t len16;
522 if (UTEXT_USES_U16(fInputText)) {
523 len16 = (int32_t)(fInputLength-fAppendPosition);
524 } else {
525 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
526 status = U_ZERO_ERROR; // buffer overflow
527 }
528
529 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
530 if (inputChars == NULL) {
531 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
532 } else {
533 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
534 int64_t destLen = utext_nativeLength(dest);
535 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
536 uprv_free(inputChars);
537 }
538 }
539 }
540 return dest;
541 }
542
543
544
545 //--------------------------------------------------------------------------------
546 //
547 // end
548 //
549 //--------------------------------------------------------------------------------
550 int32_t RegexMatcher::end(UErrorCode &err) const {
551 return end(0, err);
552 }
553
554 int64_t RegexMatcher::end64(UErrorCode &err) const {
555 return end64(0, err);
556 }
557
558 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
559 if (U_FAILURE(err)) {
560 return -1;
561 }
562 if (fMatch == FALSE) {
563 err = U_REGEX_INVALID_STATE;
564 return -1;
565 }
566 if (group < 0 || group > fPattern->fGroupMap->size()) {
567 err = U_INDEX_OUTOFBOUNDS_ERROR;
568 return -1;
569 }
570 int64_t e = -1;
571 if (group == 0) {
572 e = fMatchEnd;
573 } else {
574 // Get the position within the stack frame of the variables for
575 // this capture group.
576 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
577 U_ASSERT(groupOffset < fPattern->fFrameSize);
578 U_ASSERT(groupOffset >= 0);
579 e = fFrame->fExtra[groupOffset + 1];
580 }
581
582 return e;
583 }
584
585 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
586 return (int32_t)end64(group, err);
587 }
588
589 //--------------------------------------------------------------------------------
590 //
591 // findProgressInterrupt This function is called once for each advance in the target
592 // string from the find() function, and calls the user progress callback
593 // function if there is one installed.
594 //
595 // Return: TRUE if the find operation is to be terminated.
596 // FALSE if the find operation is to continue running.
597 //
598 //--------------------------------------------------------------------------------
599 UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) {
600 if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) {
601 status = U_REGEX_STOPPED_BY_CALLER;
602 return TRUE;
603 }
604 return FALSE;
605 }
606
607 //--------------------------------------------------------------------------------
608 //
609 // find()
610 //
611 //--------------------------------------------------------------------------------
612 UBool RegexMatcher::find() {
613 if (U_FAILURE(fDeferredStatus)) {
614 return FALSE;
615 }
616 UErrorCode status = U_ZERO_ERROR;
617 UBool result = find(status);
618 return result;
619 }
620
621 //--------------------------------------------------------------------------------
622 //
623 // find()
624 //
625 //--------------------------------------------------------------------------------
626 UBool RegexMatcher::find(UErrorCode &status) {
627 // Start at the position of the last match end. (Will be zero if the
628 // matcher has been reset.)
629 //
630 if (U_FAILURE(status)) {
631 return FALSE;
632 }
633 if (U_FAILURE(fDeferredStatus)) {
634 status = fDeferredStatus;
635 return FALSE;
636 }
637
638 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
639 return findUsingChunk(status);
640 }
641
642 int64_t startPos = fMatchEnd;
643 if (startPos==0) {
644 startPos = fActiveStart;
645 }
646
647 if (fMatch) {
648 // Save the position of any previous successful match.
649 fLastMatchEnd = fMatchEnd;
650
651 if (fMatchStart == fMatchEnd) {
652 // Previous match had zero length. Move start position up one position
653 // to avoid sending find() into a loop on zero-length matches.
654 if (startPos >= fActiveLimit) {
655 fMatch = FALSE;
656 fHitEnd = TRUE;
657 return FALSE;
658 }
659 UTEXT_SETNATIVEINDEX(fInputText, startPos);
660 (void)UTEXT_NEXT32(fInputText);
661 startPos = UTEXT_GETNATIVEINDEX(fInputText);
662 }
663 } else {
664 if (fLastMatchEnd >= 0) {
665 // A previous find() failed to match. Don't try again.
666 // (without this test, a pattern with a zero-length match
667 // could match again at the end of an input string.)
668 fHitEnd = TRUE;
669 return FALSE;
670 }
671 }
672
673
674 // Compute the position in the input string beyond which a match can not begin, because
675 // the minimum length match would extend past the end of the input.
676 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
677 // Be aware of possible overflows if making changes here.
678 int64_t testStartLimit;
679 if (UTEXT_USES_U16(fInputText)) {
680 testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
681 if (startPos > testStartLimit) {
682 fMatch = FALSE;
683 fHitEnd = TRUE;
684 return FALSE;
685 }
686 } else {
687 // We don't know exactly how long the minimum match length is in native characters.
688 // Treat anything > 0 as 1.
689 testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
690 }
691
692 UChar32 c;
693 U_ASSERT(startPos >= 0);
694
695 switch (fPattern->fStartType) {
696 case START_NO_INFO:
697 // No optimization was found.
698 // Try a match at each input position.
699 for (;;) {
700 MatchAt(startPos, FALSE, status);
701 if (U_FAILURE(status)) {
702 return FALSE;
703 }
704 if (fMatch) {
705 return TRUE;
706 }
707 if (startPos >= testStartLimit) {
708 fHitEnd = TRUE;
709 return FALSE;
710 }
711 UTEXT_SETNATIVEINDEX(fInputText, startPos);
712 (void)UTEXT_NEXT32(fInputText);
713 startPos = UTEXT_GETNATIVEINDEX(fInputText);
714 // Note that it's perfectly OK for a pattern to have a zero-length
715 // match at the end of a string, so we must make sure that the loop
716 // runs with startPos == testStartLimit the last time through.
717 if (findProgressInterrupt(startPos, status))
718 return FALSE;
719 }
720 U_ASSERT(FALSE);
721
722 case START_START:
723 // Matches are only possible at the start of the input string
724 // (pattern begins with ^ or \A)
725 if (startPos > fActiveStart) {
726 fMatch = FALSE;
727 return FALSE;
728 }
729 MatchAt(startPos, FALSE, status);
730 if (U_FAILURE(status)) {
731 return FALSE;
732 }
733 return fMatch;
734
735
736 case START_SET:
737 {
738 // Match may start on any char from a pre-computed set.
739 U_ASSERT(fPattern->fMinMatchLen > 0);
740 UTEXT_SETNATIVEINDEX(fInputText, startPos);
741 for (;;) {
742 int64_t pos = startPos;
743 c = UTEXT_NEXT32(fInputText);
744 startPos = UTEXT_GETNATIVEINDEX(fInputText);
745 // c will be -1 (U_SENTINEL) at end of text, in which case we
746 // skip this next block (so we don't have a negative array index)
747 // and handle end of text in the following block.
748 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
749 (c>=256 && fPattern->fInitialChars->contains(c)))) {
750 MatchAt(pos, FALSE, status);
751 if (U_FAILURE(status)) {
752 return FALSE;
753 }
754 if (fMatch) {
755 return TRUE;
756 }
757 UTEXT_SETNATIVEINDEX(fInputText, pos);
758 }
759 if (startPos > testStartLimit) {
760 fMatch = FALSE;
761 fHitEnd = TRUE;
762 return FALSE;
763 }
764 if (findProgressInterrupt(startPos, status))
765 return FALSE;
766 }
767 }
768 U_ASSERT(FALSE);
769
770 case START_STRING:
771 case START_CHAR:
772 {
773 // Match starts on exactly one char.
774 U_ASSERT(fPattern->fMinMatchLen > 0);
775 UChar32 theChar = fPattern->fInitialChar;
776 UTEXT_SETNATIVEINDEX(fInputText, startPos);
777 for (;;) {
778 int64_t pos = startPos;
779 c = UTEXT_NEXT32(fInputText);
780 startPos = UTEXT_GETNATIVEINDEX(fInputText);
781 if (c == theChar) {
782 MatchAt(pos, FALSE, status);
783 if (U_FAILURE(status)) {
784 return FALSE;
785 }
786 if (fMatch) {
787 return TRUE;
788 }
789 UTEXT_SETNATIVEINDEX(fInputText, startPos);
790 }
791 if (startPos > testStartLimit) {
792 fMatch = FALSE;
793 fHitEnd = TRUE;
794 return FALSE;
795 }
796 if (findProgressInterrupt(startPos, status))
797 return FALSE;
798 }
799 }
800 U_ASSERT(FALSE);
801
802 case START_LINE:
803 {
804 UChar32 c;
805 if (startPos == fAnchorStart) {
806 MatchAt(startPos, FALSE, status);
807 if (U_FAILURE(status)) {
808 return FALSE;
809 }
810 if (fMatch) {
811 return TRUE;
812 }
813 UTEXT_SETNATIVEINDEX(fInputText, startPos);
814 c = UTEXT_NEXT32(fInputText);
815 startPos = UTEXT_GETNATIVEINDEX(fInputText);
816 } else {
817 UTEXT_SETNATIVEINDEX(fInputText, startPos);
818 c = UTEXT_PREVIOUS32(fInputText);
819 UTEXT_SETNATIVEINDEX(fInputText, startPos);
820 }
821
822 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
823 for (;;) {
824 if (c == 0x0a) {
825 MatchAt(startPos, FALSE, status);
826 if (U_FAILURE(status)) {
827 return FALSE;
828 }
829 if (fMatch) {
830 return TRUE;
831 }
832 UTEXT_SETNATIVEINDEX(fInputText, startPos);
833 }
834 if (startPos >= testStartLimit) {
835 fMatch = FALSE;
836 fHitEnd = TRUE;
837 return FALSE;
838 }
839 c = UTEXT_NEXT32(fInputText);
840 startPos = UTEXT_GETNATIVEINDEX(fInputText);
841 // Note that it's perfectly OK for a pattern to have a zero-length
842 // match at the end of a string, so we must make sure that the loop
843 // runs with startPos == testStartLimit the last time through.
844 if (findProgressInterrupt(startPos, status))
845 return FALSE;
846 }
847 } else {
848 for (;;) {
849 if (isLineTerminator(c)) {
850 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
851 (void)UTEXT_NEXT32(fInputText);
852 startPos = UTEXT_GETNATIVEINDEX(fInputText);
853 }
854 MatchAt(startPos, FALSE, status);
855 if (U_FAILURE(status)) {
856 return FALSE;
857 }
858 if (fMatch) {
859 return TRUE;
860 }
861 UTEXT_SETNATIVEINDEX(fInputText, startPos);
862 }
863 if (startPos >= testStartLimit) {
864 fMatch = FALSE;
865 fHitEnd = TRUE;
866 return FALSE;
867 }
868 c = UTEXT_NEXT32(fInputText);
869 startPos = UTEXT_GETNATIVEINDEX(fInputText);
870 // Note that it's perfectly OK for a pattern to have a zero-length
871 // match at the end of a string, so we must make sure that the loop
872 // runs with startPos == testStartLimit the last time through.
873 if (findProgressInterrupt(startPos, status))
874 return FALSE;
875 }
876 }
877 }
878
879 default:
880 U_ASSERT(FALSE);
881 }
882
883 U_ASSERT(FALSE);
884 return FALSE;
885 }
886
887
888
889 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
890 if (U_FAILURE(status)) {
891 return FALSE;
892 }
893 if (U_FAILURE(fDeferredStatus)) {
894 status = fDeferredStatus;
895 return FALSE;
896 }
897 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
898 // This will reset the region to be the full input length.
899 if (start < 0) {
900 status = U_INDEX_OUTOFBOUNDS_ERROR;
901 return FALSE;
902 }
903
904 int64_t nativeStart = start;
905 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
906 status = U_INDEX_OUTOFBOUNDS_ERROR;
907 return FALSE;
908 }
909 fMatchEnd = nativeStart;
910 return find(status);
911 }
912
913
914 //--------------------------------------------------------------------------------
915 //
916 // findUsingChunk() -- like find(), but with the advance knowledge that the
917 // entire string is available in the UText's chunk buffer.
918 //
919 //--------------------------------------------------------------------------------
920 UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
921 // Start at the position of the last match end. (Will be zero if the
922 // matcher has been reset.
923 //
924
925 int32_t startPos = (int32_t)fMatchEnd;
926 if (startPos==0) {
927 startPos = (int32_t)fActiveStart;
928 }
929
930 const UChar *inputBuf = fInputText->chunkContents;
931
932 if (fMatch) {
933 // Save the position of any previous successful match.
934 fLastMatchEnd = fMatchEnd;
935
936 if (fMatchStart == fMatchEnd) {
937 // Previous match had zero length. Move start position up one position
938 // to avoid sending find() into a loop on zero-length matches.
939 if (startPos >= fActiveLimit) {
940 fMatch = FALSE;
941 fHitEnd = TRUE;
942 return FALSE;
943 }
944 U16_FWD_1(inputBuf, startPos, fInputLength);
945 }
946 } else {
947 if (fLastMatchEnd >= 0) {
948 // A previous find() failed to match. Don't try again.
949 // (without this test, a pattern with a zero-length match
950 // could match again at the end of an input string.)
951 fHitEnd = TRUE;
952 return FALSE;
953 }
954 }
955
956
957 // Compute the position in the input string beyond which a match can not begin, because
958 // the minimum length match would extend past the end of the input.
959 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
960 // Be aware of possible overflows if making changes here.
961 // Note: a match can begin at inputBuf + testLen; it is an inclusive limit.
962 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
963 if (startPos > testLen) {
964 fMatch = FALSE;
965 fHitEnd = TRUE;
966 return FALSE;
967 }
968
969 UChar32 c;
970 U_ASSERT(startPos >= 0);
971
972 switch (fPattern->fStartType) {
973 case START_NO_INFO:
974 // No optimization was found.
975 // Try a match at each input position.
976 for (;;) {
977 MatchChunkAt(startPos, FALSE, status);
978 if (U_FAILURE(status)) {
979 return FALSE;
980 }
981 if (fMatch) {
982 return TRUE;
983 }
984 if (startPos >= testLen) {
985 fHitEnd = TRUE;
986 return FALSE;
987 }
988 U16_FWD_1(inputBuf, startPos, fActiveLimit);
989 // Note that it's perfectly OK for a pattern to have a zero-length
990 // match at the end of a string, so we must make sure that the loop
991 // runs with startPos == testLen the last time through.
992 if (findProgressInterrupt(startPos, status))
993 return FALSE;
994 }
995 U_ASSERT(FALSE);
996
997 case START_START:
998 // Matches are only possible at the start of the input string
999 // (pattern begins with ^ or \A)
1000 if (startPos > fActiveStart) {
1001 fMatch = FALSE;
1002 return FALSE;
1003 }
1004 MatchChunkAt(startPos, FALSE, status);
1005 if (U_FAILURE(status)) {
1006 return FALSE;
1007 }
1008 return fMatch;
1009
1010
1011 case START_SET:
1012 {
1013 // Match may start on any char from a pre-computed set.
1014 U_ASSERT(fPattern->fMinMatchLen > 0);
1015 for (;;) {
1016 int32_t pos = startPos;
1017 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1018 if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
1019 (c>=256 && fPattern->fInitialChars->contains(c))) {
1020 MatchChunkAt(pos, FALSE, status);
1021 if (U_FAILURE(status)) {
1022 return FALSE;
1023 }
1024 if (fMatch) {
1025 return TRUE;
1026 }
1027 }
1028 if (startPos > testLen) {
1029 fMatch = FALSE;
1030 fHitEnd = TRUE;
1031 return FALSE;
1032 }
1033 if (findProgressInterrupt(startPos, status))
1034 return FALSE;
1035 }
1036 }
1037 U_ASSERT(FALSE);
1038
1039 case START_STRING:
1040 case START_CHAR:
1041 {
1042 // Match starts on exactly one char.
1043 U_ASSERT(fPattern->fMinMatchLen > 0);
1044 UChar32 theChar = fPattern->fInitialChar;
1045 for (;;) {
1046 int32_t pos = startPos;
1047 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1048 if (c == theChar) {
1049 MatchChunkAt(pos, FALSE, status);
1050 if (U_FAILURE(status)) {
1051 return FALSE;
1052 }
1053 if (fMatch) {
1054 return TRUE;
1055 }
1056 }
1057 if (startPos > testLen) {
1058 fMatch = FALSE;
1059 fHitEnd = TRUE;
1060 return FALSE;
1061 }
1062 if (findProgressInterrupt(startPos, status))
1063 return FALSE;
1064 }
1065 }
1066 U_ASSERT(FALSE);
1067
1068 case START_LINE:
1069 {
1070 UChar32 c;
1071 if (startPos == fAnchorStart) {
1072 MatchChunkAt(startPos, FALSE, status);
1073 if (U_FAILURE(status)) {
1074 return FALSE;
1075 }
1076 if (fMatch) {
1077 return TRUE;
1078 }
1079 // In bug 31063104 which has a zero-length text buffer we get here with
1080 // inputBuf=NULL, startPos=fActiveLimit=0 (and fMatch F) which violates the
1081 // requirement for U16_FWD_1 (utf16.h) that startPos < fActiveLimit. Having
1082 // inputBuf=NULL (chunkContexts NULL) is probably due to an error in the
1083 // CFStringUText functions. Nevertheless, to be defensive, add test below.
1084 if (startPos >= testLen) {
1085 fHitEnd = TRUE;
1086 return FALSE;
1087 }
1088 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1089 }
1090
1091 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1092 for (;;) {
1093 c = inputBuf[startPos-1];
1094 if (c == 0x0a) {
1095 MatchChunkAt(startPos, FALSE, status);
1096 if (U_FAILURE(status)) {
1097 return FALSE;
1098 }
1099 if (fMatch) {
1100 return TRUE;
1101 }
1102 }
1103 if (startPos >= testLen) {
1104 fMatch = FALSE;
1105 fHitEnd = TRUE;
1106 return FALSE;
1107 }
1108 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1109 // Note that it's perfectly OK for a pattern to have a zero-length
1110 // match at the end of a string, so we must make sure that the loop
1111 // runs with startPos == testLen the last time through.
1112 if (findProgressInterrupt(startPos, status))
1113 return FALSE;
1114 }
1115 } else {
1116 for (;;) {
1117 c = inputBuf[startPos-1];
1118 if (isLineTerminator(c)) {
1119 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1120 startPos++;
1121 }
1122 MatchChunkAt(startPos, FALSE, status);
1123 if (U_FAILURE(status)) {
1124 return FALSE;
1125 }
1126 if (fMatch) {
1127 return TRUE;
1128 }
1129 }
1130 if (startPos >= testLen) {
1131 fMatch = FALSE;
1132 fHitEnd = TRUE;
1133 return FALSE;
1134 }
1135 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1136 // Note that it's perfectly OK for a pattern to have a zero-length
1137 // match at the end of a string, so we must make sure that the loop
1138 // runs with startPos == testLen the last time through.
1139 if (findProgressInterrupt(startPos, status))
1140 return FALSE;
1141 }
1142 }
1143 }
1144
1145 default:
1146 U_ASSERT(FALSE);
1147 }
1148
1149 U_ASSERT(FALSE);
1150 return FALSE;
1151 }
1152
1153
1154
1155 //--------------------------------------------------------------------------------
1156 //
1157 // group()
1158 //
1159 //--------------------------------------------------------------------------------
1160 UnicodeString RegexMatcher::group(UErrorCode &status) const {
1161 return group(0, status);
1162 }
1163
1164 // Return immutable shallow clone
1165 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1166 return group(0, dest, group_len, status);
1167 }
1168
1169 // Return immutable shallow clone
1170 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1171 group_len = 0;
1172 if (U_FAILURE(status)) {
1173 return dest;
1174 }
1175 if (U_FAILURE(fDeferredStatus)) {
1176 status = fDeferredStatus;
1177 } else if (fMatch == FALSE) {
1178 status = U_REGEX_INVALID_STATE;
1179 } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1180 status = U_INDEX_OUTOFBOUNDS_ERROR;
1181 }
1182
1183 if (U_FAILURE(status)) {
1184 return dest;
1185 }
1186
1187 int64_t s, e;
1188 if (groupNum == 0) {
1189 s = fMatchStart;
1190 e = fMatchEnd;
1191 } else {
1192 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1193 U_ASSERT(groupOffset < fPattern->fFrameSize);
1194 U_ASSERT(groupOffset >= 0);
1195 s = fFrame->fExtra[groupOffset];
1196 e = fFrame->fExtra[groupOffset+1];
1197 }
1198
1199 if (s < 0) {
1200 // A capture group wasn't part of the match
1201 return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1202 }
1203 U_ASSERT(s <= e);
1204 group_len = e - s;
1205
1206 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1207 if (dest)
1208 UTEXT_SETNATIVEINDEX(dest, s);
1209 return dest;
1210 }
1211
1212 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1213 UnicodeString result;
1214 int64_t groupStart = start64(groupNum, status);
1215 int64_t groupEnd = end64(groupNum, status);
1216 if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) {
1217 return result;
1218 }
1219
1220 // Get the group length using a utext_extract preflight.
1221 // UText is actually pretty efficient at this when underlying encoding is UTF-16.
1222 int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &status);
1223 if (status != U_BUFFER_OVERFLOW_ERROR) {
1224 return result;
1225 }
1226
1227 status = U_ZERO_ERROR;
1228 UChar *buf = result.getBuffer(length);
1229 if (buf == NULL) {
1230 status = U_MEMORY_ALLOCATION_ERROR;
1231 } else {
1232 int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status);
1233 result.releaseBuffer(extractLength);
1234 U_ASSERT(length == extractLength);
1235 }
1236 return result;
1237 }
1238
1239
1240 //--------------------------------------------------------------------------------
1241 //
1242 // appendGroup() -- currently internal only, appends a group to a UText rather
1243 // than replacing its contents
1244 //
1245 //--------------------------------------------------------------------------------
1246
1247 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1248 if (U_FAILURE(status)) {
1249 return 0;
1250 }
1251 if (U_FAILURE(fDeferredStatus)) {
1252 status = fDeferredStatus;
1253 return 0;
1254 }
1255 int64_t destLen = utext_nativeLength(dest);
1256
1257 if (fMatch == FALSE) {
1258 status = U_REGEX_INVALID_STATE;
1259 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1260 }
1261 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1262 status = U_INDEX_OUTOFBOUNDS_ERROR;
1263 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1264 }
1265
1266 int64_t s, e;
1267 if (groupNum == 0) {
1268 s = fMatchStart;
1269 e = fMatchEnd;
1270 } else {
1271 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1272 U_ASSERT(groupOffset < fPattern->fFrameSize);
1273 U_ASSERT(groupOffset >= 0);
1274 s = fFrame->fExtra[groupOffset];
1275 e = fFrame->fExtra[groupOffset+1];
1276 }
1277
1278 if (s < 0) {
1279 // A capture group wasn't part of the match
1280 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1281 }
1282 U_ASSERT(s <= e);
1283
1284 int64_t deltaLen;
1285 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1286 U_ASSERT(e <= fInputLength);
1287 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1288 } else {
1289 int32_t len16;
1290 if (UTEXT_USES_U16(fInputText)) {
1291 len16 = (int32_t)(e-s);
1292 } else {
1293 UErrorCode lengthStatus = U_ZERO_ERROR;
1294 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1295 }
1296 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1297 if (groupChars == NULL) {
1298 status = U_MEMORY_ALLOCATION_ERROR;
1299 return 0;
1300 }
1301 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1302
1303 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1304 uprv_free(groupChars);
1305 }
1306 return deltaLen;
1307 }
1308
1309
1310
1311 //--------------------------------------------------------------------------------
1312 //
1313 // groupCount()
1314 //
1315 //--------------------------------------------------------------------------------
1316 int32_t RegexMatcher::groupCount() const {
1317 return fPattern->fGroupMap->size();
1318 }
1319
1320 //--------------------------------------------------------------------------------
1321 //
1322 // hasAnchoringBounds()
1323 //
1324 //--------------------------------------------------------------------------------
1325 UBool RegexMatcher::hasAnchoringBounds() const {
1326 return fAnchoringBounds;
1327 }
1328
1329
1330 //--------------------------------------------------------------------------------
1331 //
1332 // hasTransparentBounds()
1333 //
1334 //--------------------------------------------------------------------------------
1335 UBool RegexMatcher::hasTransparentBounds() const {
1336 return fTransparentBounds;
1337 }
1338
1339
1340
1341 //--------------------------------------------------------------------------------
1342 //
1343 // hitEnd()
1344 //
1345 //--------------------------------------------------------------------------------
1346 UBool RegexMatcher::hitEnd() const {
1347 return fHitEnd;
1348 }
1349
1350
1351 //--------------------------------------------------------------------------------
1352 //
1353 // input()
1354 //
1355 //--------------------------------------------------------------------------------
1356 const UnicodeString &RegexMatcher::input() const {
1357 if (!fInput) {
1358 UErrorCode status = U_ZERO_ERROR;
1359 int32_t len16;
1360 if (UTEXT_USES_U16(fInputText)) {
1361 len16 = (int32_t)fInputLength;
1362 } else {
1363 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1364 status = U_ZERO_ERROR; // overflow, length status
1365 }
1366 UnicodeString *result = new UnicodeString(len16, 0, 0);
1367
1368 UChar *inputChars = result->getBuffer(len16);
1369 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1370 result->releaseBuffer(len16);
1371
1372 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1373 }
1374
1375 return *fInput;
1376 }
1377
1378 //--------------------------------------------------------------------------------
1379 //
1380 // inputText()
1381 //
1382 //--------------------------------------------------------------------------------
1383 UText *RegexMatcher::inputText() const {
1384 return fInputText;
1385 }
1386
1387
1388 //--------------------------------------------------------------------------------
1389 //
1390 // getInput() -- like inputText(), but makes a clone or copies into another UText
1391 //
1392 //--------------------------------------------------------------------------------
1393 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1394 if (U_FAILURE(status)) {
1395 return dest;
1396 }
1397 if (U_FAILURE(fDeferredStatus)) {
1398 status = fDeferredStatus;
1399 return dest;
1400 }
1401
1402 if (dest) {
1403 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1404 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1405 } else {
1406 int32_t input16Len;
1407 if (UTEXT_USES_U16(fInputText)) {
1408 input16Len = (int32_t)fInputLength;
1409 } else {
1410 UErrorCode lengthStatus = U_ZERO_ERROR;
1411 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1412 }
1413 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1414 if (inputChars == NULL) {
1415 return dest;
1416 }
1417
1418 status = U_ZERO_ERROR;
1419 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1420 status = U_ZERO_ERROR;
1421 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1422
1423 uprv_free(inputChars);
1424 }
1425 return dest;
1426 } else {
1427 return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1428 }
1429 }
1430
1431
1432 static UBool compat_SyncMutableUTextContents(UText *ut);
1433 static UBool compat_SyncMutableUTextContents(UText *ut) {
1434 UBool retVal = FALSE;
1435
1436 // In the following test, we're really only interested in whether the UText should switch
1437 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
1438 // will still point to the correct data.
1439 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1440 UnicodeString *us=(UnicodeString *)ut->context;
1441
1442 // Update to the latest length.
1443 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1444 int32_t newLength = us->length();
1445
1446 // Update the chunk description.
1447 // The buffer may have switched between stack- and heap-based.
1448 ut->chunkContents = us->getBuffer();
1449 ut->chunkLength = newLength;
1450 ut->chunkNativeLimit = newLength;
1451 ut->nativeIndexingLimit = newLength;
1452 retVal = TRUE;
1453 }
1454
1455 return retVal;
1456 }
1457
1458 //--------------------------------------------------------------------------------
1459 //
1460 // lookingAt()
1461 //
1462 //--------------------------------------------------------------------------------
1463 UBool RegexMatcher::lookingAt(UErrorCode &status) {
1464 if (U_FAILURE(status)) {
1465 return FALSE;
1466 }
1467 if (U_FAILURE(fDeferredStatus)) {
1468 status = fDeferredStatus;
1469 return FALSE;
1470 }
1471
1472 if (fInputUniStrMaybeMutable) {
1473 if (compat_SyncMutableUTextContents(fInputText)) {
1474 fInputLength = utext_nativeLength(fInputText);
1475 reset();
1476 }
1477 }
1478 else {
1479 resetPreserveRegion();
1480 }
1481 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1482 MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1483 } else {
1484 MatchAt(fActiveStart, FALSE, status);
1485 }
1486 return fMatch;
1487 }
1488
1489
1490 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1491 if (U_FAILURE(status)) {
1492 return FALSE;
1493 }
1494 if (U_FAILURE(fDeferredStatus)) {
1495 status = fDeferredStatus;
1496 return FALSE;
1497 }
1498 reset();
1499
1500 if (start < 0) {
1501 status = U_INDEX_OUTOFBOUNDS_ERROR;
1502 return FALSE;
1503 }
1504
1505 if (fInputUniStrMaybeMutable) {
1506 if (compat_SyncMutableUTextContents(fInputText)) {
1507 fInputLength = utext_nativeLength(fInputText);
1508 reset();
1509 }
1510 }
1511
1512 int64_t nativeStart;
1513 nativeStart = start;
1514 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1515 status = U_INDEX_OUTOFBOUNDS_ERROR;
1516 return FALSE;
1517 }
1518
1519 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1520 MatchChunkAt((int32_t)nativeStart, FALSE, status);
1521 } else {
1522 MatchAt(nativeStart, FALSE, status);
1523 }
1524 return fMatch;
1525 }
1526
1527
1528
1529 //--------------------------------------------------------------------------------
1530 //
1531 // matches()
1532 //
1533 //--------------------------------------------------------------------------------
1534 UBool RegexMatcher::matches(UErrorCode &status) {
1535 if (U_FAILURE(status)) {
1536 return FALSE;
1537 }
1538 if (U_FAILURE(fDeferredStatus)) {
1539 status = fDeferredStatus;
1540 return FALSE;
1541 }
1542
1543 if (fInputUniStrMaybeMutable) {
1544 if (compat_SyncMutableUTextContents(fInputText)) {
1545 fInputLength = utext_nativeLength(fInputText);
1546 reset();
1547 }
1548 }
1549 else {
1550 resetPreserveRegion();
1551 }
1552
1553 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1554 MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1555 } else {
1556 MatchAt(fActiveStart, TRUE, status);
1557 }
1558 return fMatch;
1559 }
1560
1561
1562 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1563 if (U_FAILURE(status)) {
1564 return FALSE;
1565 }
1566 if (U_FAILURE(fDeferredStatus)) {
1567 status = fDeferredStatus;
1568 return FALSE;
1569 }
1570 reset();
1571
1572 if (start < 0) {
1573 status = U_INDEX_OUTOFBOUNDS_ERROR;
1574 return FALSE;
1575 }
1576
1577 if (fInputUniStrMaybeMutable) {
1578 if (compat_SyncMutableUTextContents(fInputText)) {
1579 fInputLength = utext_nativeLength(fInputText);
1580 reset();
1581 }
1582 }
1583
1584 int64_t nativeStart;
1585 nativeStart = start;
1586 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1587 status = U_INDEX_OUTOFBOUNDS_ERROR;
1588 return FALSE;
1589 }
1590
1591 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1592 MatchChunkAt((int32_t)nativeStart, TRUE, status);
1593 } else {
1594 MatchAt(nativeStart, TRUE, status);
1595 }
1596 return fMatch;
1597 }
1598
1599
1600
1601 //--------------------------------------------------------------------------------
1602 //
1603 // pattern
1604 //
1605 //--------------------------------------------------------------------------------
1606 const RegexPattern &RegexMatcher::pattern() const {
1607 return *fPattern;
1608 }
1609
1610
1611
1612 //--------------------------------------------------------------------------------
1613 //
1614 // region
1615 //
1616 //--------------------------------------------------------------------------------
1617 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1618 if (U_FAILURE(status)) {
1619 return *this;
1620 }
1621
1622 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1623 status = U_ILLEGAL_ARGUMENT_ERROR;
1624 }
1625
1626 int64_t nativeStart = regionStart;
1627 int64_t nativeLimit = regionLimit;
1628 if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1629 status = U_ILLEGAL_ARGUMENT_ERROR;
1630 }
1631
1632 if (startIndex == -1)
1633 this->reset();
1634 else
1635 resetPreserveRegion();
1636
1637 fRegionStart = nativeStart;
1638 fRegionLimit = nativeLimit;
1639 fActiveStart = nativeStart;
1640 fActiveLimit = nativeLimit;
1641
1642 if (startIndex != -1) {
1643 if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1644 status = U_INDEX_OUTOFBOUNDS_ERROR;
1645 }
1646 fMatchEnd = startIndex;
1647 }
1648
1649 if (!fTransparentBounds) {
1650 fLookStart = nativeStart;
1651 fLookLimit = nativeLimit;
1652 }
1653 if (fAnchoringBounds) {
1654 fAnchorStart = nativeStart;
1655 fAnchorLimit = nativeLimit;
1656 }
1657 return *this;
1658 }
1659
1660 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1661 return region(start, limit, -1, status);
1662 }
1663
1664 //--------------------------------------------------------------------------------
1665 //
1666 // regionEnd
1667 //
1668 //--------------------------------------------------------------------------------
1669 int32_t RegexMatcher::regionEnd() const {
1670 return (int32_t)fRegionLimit;
1671 }
1672
1673 int64_t RegexMatcher::regionEnd64() const {
1674 return fRegionLimit;
1675 }
1676
1677 //--------------------------------------------------------------------------------
1678 //
1679 // regionStart
1680 //
1681 //--------------------------------------------------------------------------------
1682 int32_t RegexMatcher::regionStart() const {
1683 return (int32_t)fRegionStart;
1684 }
1685
1686 int64_t RegexMatcher::regionStart64() const {
1687 return fRegionStart;
1688 }
1689
1690
1691 //--------------------------------------------------------------------------------
1692 //
1693 // replaceAll
1694 //
1695 //--------------------------------------------------------------------------------
1696 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1697 UText replacementText = UTEXT_INITIALIZER;
1698 UText resultText = UTEXT_INITIALIZER;
1699 UnicodeString resultString;
1700 if (U_FAILURE(status)) {
1701 return resultString;
1702 }
1703
1704 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1705 utext_openUnicodeString(&resultText, &resultString, &status);
1706
1707 replaceAll(&replacementText, &resultText, status);
1708
1709 utext_close(&resultText);
1710 utext_close(&replacementText);
1711
1712 return resultString;
1713 }
1714
1715
1716 //
1717 // replaceAll, UText mode
1718 //
1719 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1720 if (U_FAILURE(status)) {
1721 return dest;
1722 }
1723 if (U_FAILURE(fDeferredStatus)) {
1724 status = fDeferredStatus;
1725 return dest;
1726 }
1727
1728 if (dest == NULL) {
1729 UnicodeString emptyString;
1730 UText empty = UTEXT_INITIALIZER;
1731
1732 utext_openUnicodeString(&empty, &emptyString, &status);
1733 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1734 utext_close(&empty);
1735 }
1736
1737 if (U_SUCCESS(status)) {
1738 reset();
1739 while (find()) {
1740 appendReplacement(dest, replacement, status);
1741 if (U_FAILURE(status)) {
1742 break;
1743 }
1744 }
1745 appendTail(dest, status);
1746 }
1747
1748 return dest;
1749 }
1750
1751
1752 //--------------------------------------------------------------------------------
1753 //
1754 // replaceFirst
1755 //
1756 //--------------------------------------------------------------------------------
1757 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1758 UText replacementText = UTEXT_INITIALIZER;
1759 UText resultText = UTEXT_INITIALIZER;
1760 UnicodeString resultString;
1761
1762 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1763 utext_openUnicodeString(&resultText, &resultString, &status);
1764
1765 replaceFirst(&replacementText, &resultText, status);
1766
1767 utext_close(&resultText);
1768 utext_close(&replacementText);
1769
1770 return resultString;
1771 }
1772
1773 //
1774 // replaceFirst, UText mode
1775 //
1776 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1777 if (U_FAILURE(status)) {
1778 return dest;
1779 }
1780 if (U_FAILURE(fDeferredStatus)) {
1781 status = fDeferredStatus;
1782 return dest;
1783 }
1784
1785 reset();
1786 if (!find()) {
1787 return getInput(dest, status);
1788 }
1789
1790 if (dest == NULL) {
1791 UnicodeString emptyString;
1792 UText empty = UTEXT_INITIALIZER;
1793
1794 utext_openUnicodeString(&empty, &emptyString, &status);
1795 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1796 utext_close(&empty);
1797 }
1798
1799 appendReplacement(dest, replacement, status);
1800 appendTail(dest, status);
1801
1802 return dest;
1803 }
1804
1805
1806 //--------------------------------------------------------------------------------
1807 //
1808 // requireEnd
1809 //
1810 //--------------------------------------------------------------------------------
1811 UBool RegexMatcher::requireEnd() const {
1812 return fRequireEnd;
1813 }
1814
1815
1816 //--------------------------------------------------------------------------------
1817 //
1818 // reset
1819 //
1820 //--------------------------------------------------------------------------------
1821 RegexMatcher &RegexMatcher::reset() {
1822 fRegionStart = 0;
1823 fRegionLimit = fInputLength;
1824 fActiveStart = 0;
1825 fActiveLimit = fInputLength;
1826 fAnchorStart = 0;
1827 fAnchorLimit = fInputLength;
1828 fLookStart = 0;
1829 fLookLimit = fInputLength;
1830 resetPreserveRegion();
1831 return *this;
1832 }
1833
1834
1835
1836 void RegexMatcher::resetPreserveRegion() {
1837 fMatchStart = 0;
1838 fMatchEnd = 0;
1839 fLastMatchEnd = -1;
1840 fAppendPosition = 0;
1841 fMatch = FALSE;
1842 fHitEnd = FALSE;
1843 fRequireEnd = FALSE;
1844 fTime = 0;
1845 fTickCounter = TIMER_INITIAL_VALUE;
1846 //resetStack(); // more expensive than it looks...
1847 }
1848
1849
1850 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1851 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1852 if (fPattern->fNeedsAltInput) {
1853 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1854 }
1855 if (U_FAILURE(fDeferredStatus)) {
1856 return *this;
1857 }
1858 fInputLength = utext_nativeLength(fInputText);
1859
1860 reset();
1861 delete fInput;
1862 fInput = NULL;
1863
1864 // Do the following for any UnicodeString.
1865 // This is for compatibility for those clients who modify the input string "live" during regex operations.
1866 fInputUniStrMaybeMutable = TRUE;
1867
1868 if (fWordBreakItr != NULL) {
1869 #if UCONFIG_NO_BREAK_ITERATION==0
1870 UErrorCode status = U_ZERO_ERROR;
1871 fWordBreakItr->setText(fInputText, status);
1872 #endif
1873 }
1874 return *this;
1875 }
1876
1877
1878 RegexMatcher &RegexMatcher::reset(UText *input) {
1879 if (fInputText != input) {
1880 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1881 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1882 if (U_FAILURE(fDeferredStatus)) {
1883 return *this;
1884 }
1885 fInputLength = utext_nativeLength(fInputText);
1886
1887 delete fInput;
1888 fInput = NULL;
1889
1890 if (fWordBreakItr != NULL) {
1891 #if UCONFIG_NO_BREAK_ITERATION==0
1892 UErrorCode status = U_ZERO_ERROR;
1893 fWordBreakItr->setText(input, status);
1894 #endif
1895 }
1896 }
1897 reset();
1898 fInputUniStrMaybeMutable = FALSE;
1899
1900 return *this;
1901 }
1902
1903 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1904 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1905 return *this;
1906 }*/
1907
1908 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1909 if (U_FAILURE(status)) {
1910 return *this;
1911 }
1912 reset(); // Reset also resets the region to be the entire string.
1913
1914 if (position < 0 || position > fActiveLimit) {
1915 status = U_INDEX_OUTOFBOUNDS_ERROR;
1916 return *this;
1917 }
1918 fMatchEnd = position;
1919 return *this;
1920 }
1921
1922
1923 //--------------------------------------------------------------------------------
1924 //
1925 // refresh
1926 //
1927 //--------------------------------------------------------------------------------
1928 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1929 if (U_FAILURE(status)) {
1930 return *this;
1931 }
1932 if (input == NULL) {
1933 status = U_ILLEGAL_ARGUMENT_ERROR;
1934 return *this;
1935 }
1936 if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1937 status = U_ILLEGAL_ARGUMENT_ERROR;
1938 return *this;
1939 }
1940 int64_t pos = utext_getNativeIndex(fInputText);
1941 // Shallow read-only clone of the new UText into the existing input UText
1942 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1943 if (U_FAILURE(status)) {
1944 return *this;
1945 }
1946 utext_setNativeIndex(fInputText, pos);
1947
1948 if (fAltInputText != NULL) {
1949 pos = utext_getNativeIndex(fAltInputText);
1950 fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
1951 if (U_FAILURE(status)) {
1952 return *this;
1953 }
1954 utext_setNativeIndex(fAltInputText, pos);
1955 }
1956 return *this;
1957 }
1958
1959
1960
1961 //--------------------------------------------------------------------------------
1962 //
1963 // setTrace
1964 //
1965 //--------------------------------------------------------------------------------
1966 void RegexMatcher::setTrace(UBool state) {
1967 fTraceDebug = state;
1968 }
1969
1970
1971
1972 /**
1973 * UText, replace entire contents of the destination UText with a substring of the source UText.
1974 *
1975 * @param src The source UText
1976 * @param dest The destination UText. Must be writable.
1977 * May be NULL, in which case a new UText will be allocated.
1978 * @param start Start index of source substring.
1979 * @param limit Limit index of source substring.
1980 * @param status An error code.
1981 */
1982 static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) {
1983 if (U_FAILURE(*status)) {
1984 return dest;
1985 }
1986 if (start == limit) {
1987 if (dest) {
1988 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status);
1989 return dest;
1990 } else {
1991 return utext_openUChars(NULL, NULL, 0, status);
1992 }
1993 }
1994 int32_t length = utext_extract(src, start, limit, NULL, 0, status);
1995 if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
1996 return dest;
1997 }
1998 *status = U_ZERO_ERROR;
1999 MaybeStackArray<UChar, 40> buffer;
2000 if (length >= buffer.getCapacity()) {
2001 UChar *newBuf = buffer.resize(length+1); // Leave space for terminating Nul.
2002 if (newBuf == NULL) {
2003 *status = U_MEMORY_ALLOCATION_ERROR;
2004 }
2005 }
2006 utext_extract(src, start, limit, buffer.getAlias(), length+1, status);
2007 if (dest) {
2008 utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status);
2009 return dest;
2010 }
2011
2012 // Caller did not provide a prexisting UText.
2013 // Open a new one, and have it adopt the text buffer storage.
2014 if (U_FAILURE(*status)) {
2015 return NULL;
2016 }
2017 int32_t ownedLength = 0;
2018 UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);
2019 if (ownedBuf == NULL) {
2020 *status = U_MEMORY_ALLOCATION_ERROR;
2021 return NULL;
2022 }
2023 UText *result = utext_openUChars(NULL, ownedBuf, length, status);
2024 if (U_FAILURE(*status)) {
2025 uprv_free(ownedBuf);
2026 return NULL;
2027 }
2028 result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT);
2029 return result;
2030 }
2031
2032
2033 //---------------------------------------------------------------------
2034 //
2035 // split
2036 //
2037 //---------------------------------------------------------------------
2038 int32_t RegexMatcher::split(const UnicodeString &input,
2039 UnicodeString dest[],
2040 int32_t destCapacity,
2041 UErrorCode &status)
2042 {
2043 UText inputText = UTEXT_INITIALIZER;
2044 utext_openConstUnicodeString(&inputText, &input, &status);
2045 if (U_FAILURE(status)) {
2046 return 0;
2047 }
2048
2049 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2050 if (destText == NULL) {
2051 status = U_MEMORY_ALLOCATION_ERROR;
2052 return 0;
2053 }
2054 int32_t i;
2055 for (i = 0; i < destCapacity; i++) {
2056 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2057 }
2058
2059 int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2060
2061 for (i = 0; i < destCapacity; i++) {
2062 utext_close(destText[i]);
2063 }
2064
2065 uprv_free(destText);
2066 utext_close(&inputText);
2067 return fieldCount;
2068 }
2069
2070 //
2071 // split, UText mode
2072 //
2073 int32_t RegexMatcher::split(UText *input,
2074 UText *dest[],
2075 int32_t destCapacity,
2076 UErrorCode &status)
2077 {
2078 //
2079 // Check arguements for validity
2080 //
2081 if (U_FAILURE(status)) {
2082 return 0;
2083 };
2084
2085 if (destCapacity < 1) {
2086 status = U_ILLEGAL_ARGUMENT_ERROR;
2087 return 0;
2088 }
2089
2090 //
2091 // Reset for the input text
2092 //
2093 reset(input);
2094 int64_t nextOutputStringStart = 0;
2095 if (fActiveLimit == 0) {
2096 return 0;
2097 }
2098
2099 //
2100 // Loop through the input text, searching for the delimiter pattern
2101 //
2102 int32_t i;
2103 int32_t numCaptureGroups = fPattern->fGroupMap->size();
2104 for (i=0; ; i++) {
2105 if (i>=destCapacity-1) {
2106 // There is one or zero output string left.
2107 // Fill the last output string with whatever is left from the input, then exit the loop.
2108 // ( i will be == destCapacity if we filled the output array while processing
2109 // capture groups of the delimiter expression, in which case we will discard the
2110 // last capture group saved in favor of the unprocessed remainder of the
2111 // input string.)
2112 i = destCapacity-1;
2113 if (fActiveLimit > nextOutputStringStart) {
2114 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2115 if (dest[i]) {
2116 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2117 input->chunkContents+nextOutputStringStart,
2118 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2119 } else {
2120 UText remainingText = UTEXT_INITIALIZER;
2121 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2122 fActiveLimit-nextOutputStringStart, &status);
2123 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2124 utext_close(&remainingText);
2125 }
2126 } else {
2127 UErrorCode lengthStatus = U_ZERO_ERROR;
2128 int32_t remaining16Length =
2129 utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2130 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2131 if (remainingChars == NULL) {
2132 status = U_MEMORY_ALLOCATION_ERROR;
2133 break;
2134 }
2135
2136 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2137 if (dest[i]) {
2138 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2139 } else {
2140 UText remainingText = UTEXT_INITIALIZER;
2141 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2142 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2143 utext_close(&remainingText);
2144 }
2145
2146 uprv_free(remainingChars);
2147 }
2148 }
2149 break;
2150 }
2151 if (find()) {
2152 // We found another delimiter. Move everything from where we started looking
2153 // up until the start of the delimiter into the next output string.
2154 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2155 if (dest[i]) {
2156 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2157 input->chunkContents+nextOutputStringStart,
2158 (int32_t)(fMatchStart-nextOutputStringStart), &status);
2159 } else {
2160 UText remainingText = UTEXT_INITIALIZER;
2161 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2162 fMatchStart-nextOutputStringStart, &status);
2163 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2164 utext_close(&remainingText);
2165 }
2166 } else {
2167 UErrorCode lengthStatus = U_ZERO_ERROR;
2168 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2169 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2170 if (remainingChars == NULL) {
2171 status = U_MEMORY_ALLOCATION_ERROR;
2172 break;
2173 }
2174 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2175 if (dest[i]) {
2176 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2177 } else {
2178 UText remainingText = UTEXT_INITIALIZER;
2179 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2180 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2181 utext_close(&remainingText);
2182 }
2183
2184 uprv_free(remainingChars);
2185 }
2186 nextOutputStringStart = fMatchEnd;
2187
2188 // If the delimiter pattern has capturing parentheses, the captured
2189 // text goes out into the next n destination strings.
2190 int32_t groupNum;
2191 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2192 if (i >= destCapacity-2) {
2193 // Never fill the last available output string with capture group text.
2194 // It will filled with the last field, the remainder of the
2195 // unsplit input text.
2196 break;
2197 }
2198 i++;
2199 dest[i] = utext_extract_replace(fInputText, dest[i],
2200 start64(groupNum, status), end64(groupNum, status), &status);
2201 }
2202
2203 if (nextOutputStringStart == fActiveLimit) {
2204 // The delimiter was at the end of the string. We're done, but first
2205 // we output one last empty string, for the empty field following
2206 // the delimiter at the end of input.
2207 if (i+1 < destCapacity) {
2208 ++i;
2209 if (dest[i] == NULL) {
2210 dest[i] = utext_openUChars(NULL, NULL, 0, &status);
2211 } else {
2212 static const UChar emptyString[] = {(UChar)0};
2213 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2214 }
2215 }
2216 break;
2217
2218 }
2219 }
2220 else
2221 {
2222 // We ran off the end of the input while looking for the next delimiter.
2223 // All the remaining text goes into the current output string.
2224 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2225 if (dest[i]) {
2226 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2227 input->chunkContents+nextOutputStringStart,
2228 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2229 } else {
2230 UText remainingText = UTEXT_INITIALIZER;
2231 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2232 fActiveLimit-nextOutputStringStart, &status);
2233 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2234 utext_close(&remainingText);
2235 }
2236 } else {
2237 UErrorCode lengthStatus = U_ZERO_ERROR;
2238 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2239 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2240 if (remainingChars == NULL) {
2241 status = U_MEMORY_ALLOCATION_ERROR;
2242 break;
2243 }
2244
2245 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2246 if (dest[i]) {
2247 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2248 } else {
2249 UText remainingText = UTEXT_INITIALIZER;
2250 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2251 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2252 utext_close(&remainingText);
2253 }
2254
2255 uprv_free(remainingChars);
2256 }
2257 break;
2258 }
2259 if (U_FAILURE(status)) {
2260 break;
2261 }
2262 } // end of for loop
2263 return i+1;
2264 }
2265
2266
2267 //--------------------------------------------------------------------------------
2268 //
2269 // start
2270 //
2271 //--------------------------------------------------------------------------------
2272 int32_t RegexMatcher::start(UErrorCode &status) const {
2273 return start(0, status);
2274 }
2275
2276 int64_t RegexMatcher::start64(UErrorCode &status) const {
2277 return start64(0, status);
2278 }
2279
2280 //--------------------------------------------------------------------------------
2281 //
2282 // start(int32_t group, UErrorCode &status)
2283 //
2284 //--------------------------------------------------------------------------------
2285
2286 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2287 if (U_FAILURE(status)) {
2288 return -1;
2289 }
2290 if (U_FAILURE(fDeferredStatus)) {
2291 status = fDeferredStatus;
2292 return -1;
2293 }
2294 if (fMatch == FALSE) {
2295 status = U_REGEX_INVALID_STATE;
2296 return -1;
2297 }
2298 if (group < 0 || group > fPattern->fGroupMap->size()) {
2299 status = U_INDEX_OUTOFBOUNDS_ERROR;
2300 return -1;
2301 }
2302 int64_t s;
2303 if (group == 0) {
2304 s = fMatchStart;
2305 } else {
2306 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2307 U_ASSERT(groupOffset < fPattern->fFrameSize);
2308 U_ASSERT(groupOffset >= 0);
2309 s = fFrame->fExtra[groupOffset];
2310 }
2311
2312 return s;
2313 }
2314
2315
2316 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2317 return (int32_t)start64(group, status);
2318 }
2319
2320 //--------------------------------------------------------------------------------
2321 //
2322 // useAnchoringBounds
2323 //
2324 //--------------------------------------------------------------------------------
2325 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2326 fAnchoringBounds = b;
2327 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2328 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2329 return *this;
2330 }
2331
2332
2333 //--------------------------------------------------------------------------------
2334 //
2335 // useTransparentBounds
2336 //
2337 //--------------------------------------------------------------------------------
2338 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2339 fTransparentBounds = b;
2340 fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2341 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2342 return *this;
2343 }
2344
2345 //--------------------------------------------------------------------------------
2346 //
2347 // setTimeLimit
2348 //
2349 //--------------------------------------------------------------------------------
2350 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2351 if (U_FAILURE(status)) {
2352 return;
2353 }
2354 if (U_FAILURE(fDeferredStatus)) {
2355 status = fDeferredStatus;
2356 return;
2357 }
2358 if (limit < 0) {
2359 status = U_ILLEGAL_ARGUMENT_ERROR;
2360 return;
2361 }
2362 fTimeLimit = limit;
2363 }
2364
2365
2366 //--------------------------------------------------------------------------------
2367 //
2368 // getTimeLimit
2369 //
2370 //--------------------------------------------------------------------------------
2371 int32_t RegexMatcher::getTimeLimit() const {
2372 return fTimeLimit;
2373 }
2374
2375
2376 //--------------------------------------------------------------------------------
2377 //
2378 // setStackLimit
2379 //
2380 //--------------------------------------------------------------------------------
2381 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2382 if (U_FAILURE(status)) {
2383 return;
2384 }
2385 if (U_FAILURE(fDeferredStatus)) {
2386 status = fDeferredStatus;
2387 return;
2388 }
2389 if (limit < 0) {
2390 status = U_ILLEGAL_ARGUMENT_ERROR;
2391 return;
2392 }
2393
2394 // Reset the matcher. This is needed here in case there is a current match
2395 // whose final stack frame (containing the match results, pointed to by fFrame)
2396 // would be lost by resizing to a smaller stack size.
2397 reset();
2398
2399 if (limit == 0) {
2400 // Unlimited stack expansion
2401 fStack->setMaxCapacity(0);
2402 } else {
2403 // Change the units of the limit from bytes to ints, and bump the size up
2404 // to be big enough to hold at least one stack frame for the pattern,
2405 // if it isn't there already.
2406 int32_t adjustedLimit = limit / sizeof(int32_t);
2407 if (adjustedLimit < fPattern->fFrameSize) {
2408 adjustedLimit = fPattern->fFrameSize;
2409 }
2410 fStack->setMaxCapacity(adjustedLimit);
2411 }
2412 fStackLimit = limit;
2413 }
2414
2415
2416 //--------------------------------------------------------------------------------
2417 //
2418 // getStackLimit
2419 //
2420 //--------------------------------------------------------------------------------
2421 int32_t RegexMatcher::getStackLimit() const {
2422 return fStackLimit;
2423 }
2424
2425
2426 //--------------------------------------------------------------------------------
2427 //
2428 // setMatchCallback
2429 //
2430 //--------------------------------------------------------------------------------
2431 void RegexMatcher::setMatchCallback(URegexMatchCallback *callback,
2432 const void *context,
2433 UErrorCode &status) {
2434 if (U_FAILURE(status)) {
2435 return;
2436 }
2437 fCallbackFn = callback;
2438 fCallbackContext = context;
2439 }
2440
2441
2442 //--------------------------------------------------------------------------------
2443 //
2444 // getMatchCallback
2445 //
2446 //--------------------------------------------------------------------------------
2447 void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback,
2448 const void *&context,
2449 UErrorCode &status) {
2450 if (U_FAILURE(status)) {
2451 return;
2452 }
2453 callback = fCallbackFn;
2454 context = fCallbackContext;
2455 }
2456
2457
2458 //--------------------------------------------------------------------------------
2459 //
2460 // setFindProgressCallback
2461 //
2462 //--------------------------------------------------------------------------------
2463 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback,
2464 const void *context,
2465 UErrorCode &status) {
2466 if (U_FAILURE(status)) {
2467 return;
2468 }
2469 fFindProgressCallbackFn = callback;
2470 fFindProgressCallbackContext = context;
2471 }
2472
2473
2474 //--------------------------------------------------------------------------------
2475 //
2476 // getFindProgressCallback
2477 //
2478 //--------------------------------------------------------------------------------
2479 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback,
2480 const void *&context,
2481 UErrorCode &status) {
2482 if (U_FAILURE(status)) {
2483 return;
2484 }
2485 callback = fFindProgressCallbackFn;
2486 context = fFindProgressCallbackContext;
2487 }
2488
2489
2490 //================================================================================
2491 //
2492 // Code following this point in this file is the internal
2493 // Match Engine Implementation.
2494 //
2495 //================================================================================
2496
2497
2498 //--------------------------------------------------------------------------------
2499 //
2500 // resetStack
2501 // Discard any previous contents of the state save stack, and initialize a
2502 // new stack frame to all -1. The -1s are needed for capture group limits,
2503 // where they indicate that a group has not yet matched anything.
2504 //--------------------------------------------------------------------------------
2505 REStackFrame *RegexMatcher::resetStack() {
2506 // Discard any previous contents of the state save stack, and initialize a
2507 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2508 // where they indicate that a group has not yet matched anything.
2509 fStack->removeAllElements();
2510
2511 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2512 if(U_FAILURE(fDeferredStatus)) {
2513 return NULL;
2514 }
2515
2516 int32_t i;
2517 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2518 iFrame->fExtra[i] = -1;
2519 }
2520 return iFrame;
2521 }
2522
2523
2524
2525 //--------------------------------------------------------------------------------
2526 //
2527 // isWordBoundary
2528 // in perl, "xab..cd..", \b is true at positions 0,3,5,7
2529 // For us,
2530 // If the current char is a combining mark,
2531 // \b is FALSE.
2532 // Else Scan backwards to the first non-combining char.
2533 // We are at a boundary if this char and the original char are
2534 // opposite in membership in the \w set
2535 //
2536 // parameters: pos - the current position in the input buffer
2537 //
2538 // TODO: double-check edge cases at region boundaries.
2539 //
2540 //--------------------------------------------------------------------------------
2541 UBool RegexMatcher::isWordBoundary(int64_t pos) {
2542 UBool isBoundary = FALSE;
2543 UBool cIsWord = FALSE;
2544
2545 if (pos >= fLookLimit) {
2546 fHitEnd = TRUE;
2547 } else {
2548 // Determine whether char c at current position is a member of the word set of chars.
2549 // If we're off the end of the string, behave as though we're not at a word char.
2550 UTEXT_SETNATIVEINDEX(fInputText, pos);
2551 UChar32 c = UTEXT_CURRENT32(fInputText);
2552 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2553 // Current char is a combining one. Not a boundary.
2554 return FALSE;
2555 }
2556 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2557 }
2558
2559 // Back up until we come to a non-combining char, determine whether
2560 // that char is a word char.
2561 UBool prevCIsWord = FALSE;
2562 for (;;) {
2563 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2564 break;
2565 }
2566 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2567 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2568 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2569 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2570 break;
2571 }
2572 }
2573 isBoundary = cIsWord ^ prevCIsWord;
2574 return isBoundary;
2575 }
2576
2577 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2578 UBool isBoundary = FALSE;
2579 UBool cIsWord = FALSE;
2580
2581 const UChar *inputBuf = fInputText->chunkContents;
2582
2583 if (pos >= fLookLimit) {
2584 fHitEnd = TRUE;
2585 } else {
2586 // Determine whether char c at current position is a member of the word set of chars.
2587 // If we're off the end of the string, behave as though we're not at a word char.
2588 UChar32 c;
2589 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2590 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2591 // Current char is a combining one. Not a boundary.
2592 return FALSE;
2593 }
2594 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2595 }
2596
2597 // Back up until we come to a non-combining char, determine whether
2598 // that char is a word char.
2599 UBool prevCIsWord = FALSE;
2600 for (;;) {
2601 if (pos <= fLookStart) {
2602 break;
2603 }
2604 UChar32 prevChar;
2605 U16_PREV(inputBuf, fLookStart, pos, prevChar);
2606 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2607 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2608 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2609 break;
2610 }
2611 }
2612 isBoundary = cIsWord ^ prevCIsWord;
2613 return isBoundary;
2614 }
2615
2616 //--------------------------------------------------------------------------------
2617 //
2618 // isUWordBoundary
2619 //
2620 // Test for a word boundary using RBBI word break.
2621 //
2622 // parameters: pos - the current position in the input buffer
2623 //
2624 //--------------------------------------------------------------------------------
2625 UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2626 UBool returnVal = FALSE;
2627 #if UCONFIG_NO_BREAK_ITERATION==0
2628
2629 // If we haven't yet created a break iterator for this matcher, do it now.
2630 if (fWordBreakItr == NULL) {
2631 fWordBreakItr =
2632 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
2633 if (U_FAILURE(fDeferredStatus)) {
2634 return FALSE;
2635 }
2636 fWordBreakItr->setText(fInputText, fDeferredStatus);
2637 }
2638
2639 if (pos >= fLookLimit) {
2640 fHitEnd = TRUE;
2641 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real"
2642 // words are not boundaries. All non-word chars stand by themselves,
2643 // with word boundaries on both sides.
2644 } else {
2645 if (!UTEXT_USES_U16(fInputText)) {
2646 // !!!: Would like a better way to do this!
2647 UErrorCode status = U_ZERO_ERROR;
2648 pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
2649 }
2650 returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2651 }
2652 #endif
2653 return returnVal;
2654 }
2655
2656 //--------------------------------------------------------------------------------
2657 //
2658 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state
2659 // saves. Increment the "time" counter, and call the
2660 // user callback function if there is one installed.
2661 //
2662 // If the match operation needs to be aborted, either for a time-out
2663 // or because the user callback asked for it, just set an error status.
2664 // The engine will pick that up and stop in its outer loop.
2665 //
2666 //--------------------------------------------------------------------------------
2667 void RegexMatcher::IncrementTime(UErrorCode &status) {
2668 fTickCounter = TIMER_INITIAL_VALUE;
2669 fTime++;
2670 if (fCallbackFn != NULL) {
2671 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2672 status = U_REGEX_STOPPED_BY_CALLER;
2673 return;
2674 }
2675 }
2676 if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2677 status = U_REGEX_TIME_OUT;
2678 }
2679 }
2680
2681 //--------------------------------------------------------------------------------
2682 //
2683 // StateSave
2684 // Make a new stack frame, initialized as a copy of the current stack frame.
2685 // Set the pattern index in the original stack frame from the operand value
2686 // in the opcode. Execution of the engine continues with the state in
2687 // the newly created stack frame
2688 //
2689 // Note that reserveBlock() may grow the stack, resulting in the
2690 // whole thing being relocated in memory.
2691 //
2692 // Parameters:
2693 // fp The top frame pointer when called. At return, a new
2694 // frame will be present
2695 // savePatIdx An index into the compiled pattern. Goes into the original
2696 // (not new) frame. If execution ever back-tracks out of the
2697 // new frame, this will be where we continue from in the pattern.
2698 // Return
2699 // The new frame pointer.
2700 //
2701 //--------------------------------------------------------------------------------
2702 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2703 if (U_FAILURE(status)) {
2704 return fp;
2705 }
2706 // push storage for a new frame.
2707 int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2708 if (U_FAILURE(status)) {
2709 // Failure on attempted stack expansion.
2710 // Stack function set some other error code, change it to a more
2711 // specific one for regular expressions.
2712 status = U_REGEX_STACK_OVERFLOW;
2713 // We need to return a writable stack frame, so just return the
2714 // previous frame. The match operation will stop quickly
2715 // because of the error status, after which the frame will never
2716 // be looked at again.
2717 return fp;
2718 }
2719 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack.
2720
2721 // New stack frame = copy of old top frame.
2722 int64_t *source = (int64_t *)fp;
2723 int64_t *dest = newFP;
2724 for (;;) {
2725 *dest++ = *source++;
2726 if (source == newFP) {
2727 break;
2728 }
2729 }
2730
2731 fTickCounter--;
2732 if (fTickCounter <= 0) {
2733 IncrementTime(status); // Re-initializes fTickCounter
2734 }
2735 fp->fPatIdx = savePatIdx;
2736 return (REStackFrame *)newFP;
2737 }
2738
#if defined(REGEX_DEBUG)
namespace {
// Debug helper: builds a UnicodeString holding every code point of 'ut',
// reading from index 0 until U_SENTINEL is returned.
// Side effect: moves the iteration position of 'ut'.
UnicodeString StringFromUText(UText *ut) {
    UnicodeString text;
    UChar32 codePoint = utext_next32From(ut, 0);
    while (codePoint != U_SENTINEL) {
        text.append(codePoint);
        codePoint = UTEXT_NEXT32(ut);
    }
    return text;
}
}
#endif // REGEX_DEBUG
2750
2751
2752 //--------------------------------------------------------------------------------
2753 //
2754 // MatchAt This is the actual matching engine.
2755 //
2756 // startIdx: begin matching at this index.
2757 // toEnd: if true, match must extend to end of the input region
2758 //
2759 //--------------------------------------------------------------------------------
2760 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2761 UBool isMatch = FALSE; // True if the we have a match.
2762
2763 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2764
2765 int32_t op; // Operation from the compiled pattern, split into
2766 int32_t opType; // the opcode
2767 int32_t opValue; // and the operand value.
2768
2769 #ifdef REGEX_RUN_DEBUG
2770 if (fTraceDebug) {
2771 printf("MatchAt(startIdx=%ld)\n", startIdx);
2772 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
2773 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
2774 }
2775 #endif
2776
2777 if (U_FAILURE(status)) {
2778 return;
2779 }
2780
2781 // Cache frequently referenced items from the compiled pattern
2782 //
2783 int64_t *pat = fPattern->fCompiledPat->getBuffer();
2784
2785 const UChar *litText = fPattern->fLiteralText.getBuffer();
2786 UVector *sets = fPattern->fSets;
2787
2788 fFrameSize = fPattern->fFrameSize;
2789 REStackFrame *fp = resetStack();
2790 if (U_FAILURE(fDeferredStatus)) {
2791 status = fDeferredStatus;
2792 return;
2793 }
2794
2795 fp->fPatIdx = 0;
2796 fp->fInputIdx = startIdx;
2797
2798 // Zero out the pattern's static data
2799 int32_t i;
2800 for (i = 0; i<fPattern->fDataSize; i++) {
2801 fData[i] = 0;
2802 }
2803
2804 //
2805 // Main loop for interpreting the compiled pattern.
2806 // One iteration of the loop per pattern operation performed.
2807 //
2808 for (;;) {
2809 op = (int32_t)pat[fp->fPatIdx];
2810 opType = URX_TYPE(op);
2811 opValue = URX_VAL(op);
2812 #ifdef REGEX_RUN_DEBUG
2813 if (fTraceDebug) {
2814 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2815 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
2816 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2817 fPattern->dumpOp(fp->fPatIdx);
2818 }
2819 #endif
2820 fp->fPatIdx++;
2821
2822 switch (opType) {
2823
2824
2825 case URX_NOP:
2826 break;
2827
2828
2829 case URX_BACKTRACK:
2830 // Force a backtrack. In some circumstances, the pattern compiler
2831 // will notice that the pattern can't possibly match anything, and will
2832 // emit one of these at that point.
2833 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2834 break;
2835
2836
2837 case URX_ONECHAR:
2838 if (fp->fInputIdx < fActiveLimit) {
2839 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2840 UChar32 c = UTEXT_NEXT32(fInputText);
2841 if (c == opValue) {
2842 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2843 break;
2844 }
2845 } else {
2846 fHitEnd = TRUE;
2847 }
2848 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2849 break;
2850
2851
2852 case URX_STRING:
2853 {
2854 // Test input against a literal string.
2855 // Strings require two slots in the compiled pattern, one for the
2856 // offset to the string text, and one for the length.
2857
2858 int32_t stringStartIdx = opValue;
2859 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
2860 fp->fPatIdx++;
2861 opType = URX_TYPE(op);
2862 int32_t stringLen = URX_VAL(op);
2863 U_ASSERT(opType == URX_STRING_LEN);
2864 U_ASSERT(stringLen >= 2);
2865
2866 const UChar *patternString = litText+stringStartIdx;
2867 int32_t patternStringIndex = 0;
2868 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2869 UChar32 inputChar;
2870 UChar32 patternChar;
2871 UBool success = TRUE;
2872 while (patternStringIndex < stringLen) {
2873 if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
2874 success = FALSE;
2875 fHitEnd = TRUE;
2876 break;
2877 }
2878 inputChar = UTEXT_NEXT32(fInputText);
2879 U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
2880 if (patternChar != inputChar) {
2881 success = FALSE;
2882 break;
2883 }
2884 }
2885
2886 if (success) {
2887 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2888 } else {
2889 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2890 }
2891 }
2892 break;
2893
2894
2895 case URX_STATE_SAVE:
2896 fp = StateSave(fp, opValue, status);
2897 break;
2898
2899
2900 case URX_END:
2901 // The match loop will exit via this path on a successful match,
2902 // when we reach the end of the pattern.
2903 if (toEnd && fp->fInputIdx != fActiveLimit) {
2904 // The pattern matched, but not to the end of input. Try some more.
2905 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2906 break;
2907 }
2908 isMatch = TRUE;
2909 goto breakFromLoop;
2910
2911 // Start and End Capture stack frame variables are laid out out like this:
2912 // fp->fExtra[opValue] - The start of a completed capture group
2913 // opValue+1 - The end of a completed capture group
2914 // opValue+2 - the start of a capture group whose end
2915 // has not yet been reached (and might not ever be).
2916 case URX_START_CAPTURE:
2917 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2918 fp->fExtra[opValue+2] = fp->fInputIdx;
2919 break;
2920
2921
2922 case URX_END_CAPTURE:
2923 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2924 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
2925 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
2926 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
2927 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2928 break;
2929
2930
2931 case URX_DOLLAR: // $, test for End of line
2932 // or for position before new line at end of input
2933 {
2934 if (fp->fInputIdx >= fAnchorLimit) {
2935 // We really are at the end of input. Success.
2936 fHitEnd = TRUE;
2937 fRequireEnd = TRUE;
2938 break;
2939 }
2940
2941 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2942
2943 // If we are positioned just before a new-line that is located at the
2944 // end of input, succeed.
2945 UChar32 c = UTEXT_NEXT32(fInputText);
2946 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2947 if (isLineTerminator(c)) {
2948 // If not in the middle of a CR/LF sequence
2949 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
2950 // At new-line at end of input. Success
2951 fHitEnd = TRUE;
2952 fRequireEnd = TRUE;
2953
2954 break;
2955 }
2956 }
2957 } else {
2958 UChar32 nextC = UTEXT_NEXT32(fInputText);
2959 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2960 fHitEnd = TRUE;
2961 fRequireEnd = TRUE;
2962 break; // At CR/LF at end of input. Success
2963 }
2964 }
2965
2966 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2967 }
2968 break;
2969
2970
2971 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
2972 if (fp->fInputIdx >= fAnchorLimit) {
2973 // Off the end of input. Success.
2974 fHitEnd = TRUE;
2975 fRequireEnd = TRUE;
2976 break;
2977 } else {
2978 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2979 UChar32 c = UTEXT_NEXT32(fInputText);
2980 // Either at the last character of input, or off the end.
2981 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
2982 fHitEnd = TRUE;
2983 fRequireEnd = TRUE;
2984 break;
2985 }
2986 }
2987
2988 // Not at end of input. Back-track out.
2989 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2990 break;
2991
2992
2993 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
2994 {
2995 if (fp->fInputIdx >= fAnchorLimit) {
2996 // We really are at the end of input. Success.
2997 fHitEnd = TRUE;
2998 fRequireEnd = TRUE;
2999 break;
3000 }
3001 // If we are positioned just before a new-line, succeed.
3002 // It makes no difference where the new-line is within the input.
3003 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3004 UChar32 c = UTEXT_CURRENT32(fInputText);
3005 if (isLineTerminator(c)) {
3006 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
3007 // In multi-line mode, hitting a new-line just before the end of input does not
3008 // set the hitEnd or requireEnd flags
3009 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3010 break;
3011 }
3012 }
3013 // not at a new line. Fail.
3014 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3015 }
3016 break;
3017
3018
3019 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
3020 {
3021 if (fp->fInputIdx >= fAnchorLimit) {
3022 // We really are at the end of input. Success.
3023 fHitEnd = TRUE;
3024 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
3025 break; // adding a new-line would not lose the match.
3026 }
3027 // If we are not positioned just before a new-line, the test fails; backtrack out.
3028 // It makes no difference where the new-line is within the input.
3029 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3030 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3031 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3032 }
3033 }
3034 break;
3035
3036
3037 case URX_CARET: // ^, test for start of line
3038 if (fp->fInputIdx != fAnchorStart) {
3039 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3040 }
3041 break;
3042
3043
3044 case URX_CARET_M: // ^, test for start of line in mulit-line mode
3045 {
3046 if (fp->fInputIdx == fAnchorStart) {
3047 // We are at the start input. Success.
3048 break;
3049 }
3050 // Check whether character just before the current pos is a new-line
3051 // unless we are at the end of input
3052 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3053 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3054 if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
3055 // It's a new-line. ^ is true. Success.
3056 // TODO: what should be done with positions between a CR and LF?
3057 break;
3058 }
3059 // Not at the start of a line. Fail.
3060 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3061 }
3062 break;
3063
3064
3065 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
3066 {
3067 U_ASSERT(fp->fInputIdx >= fAnchorStart);
3068 if (fp->fInputIdx <= fAnchorStart) {
3069 // We are at the start input. Success.
3070 break;
3071 }
3072 // Check whether character just before the current pos is a new-line
3073 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3074 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3075 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3076 if (c != 0x0a) {
3077 // Not at the start of a line. Back-track out.
3078 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3079 }
3080 }
3081 break;
3082
3083 case URX_BACKSLASH_B: // Test for word boundaries
3084 {
3085 UBool success = isWordBoundary(fp->fInputIdx);
3086 success ^= (UBool)(opValue != 0); // flip sense for \B
3087 if (!success) {
3088 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3089 }
3090 }
3091 break;
3092
3093
3094 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
3095 {
3096 UBool success = isUWordBoundary(fp->fInputIdx);
3097 success ^= (UBool)(opValue != 0); // flip sense for \B
3098 if (!success) {
3099 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3100 }
3101 }
3102 break;
3103
3104
3105 case URX_BACKSLASH_D: // Test for decimal digit
3106 {
3107 if (fp->fInputIdx >= fActiveLimit) {
3108 fHitEnd = TRUE;
3109 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3110 break;
3111 }
3112
3113 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3114
3115 UChar32 c = UTEXT_NEXT32(fInputText);
3116 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
3117 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3118 success ^= (UBool)(opValue != 0); // flip sense for \D
3119 if (success) {
3120 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3121 } else {
3122 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3123 }
3124 }
3125 break;
3126
3127
3128 case URX_BACKSLASH_G: // Test for position at end of previous match
3129 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3130 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3131 }
3132 break;
3133
3134
3135 case URX_BACKSLASH_H: // Test for \h, horizontal white space.
3136 {
3137 if (fp->fInputIdx >= fActiveLimit) {
3138 fHitEnd = TRUE;
3139 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3140 break;
3141 }
3142 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3143 UChar32 c = UTEXT_NEXT32(fInputText);
3144 int8_t ctype = u_charType(c);
3145 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
3146 success ^= (UBool)(opValue != 0); // flip sense for \H
3147 if (success) {
3148 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3149 } else {
3150 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3151 }
3152 }
3153 break;
3154
3155
3156 case URX_BACKSLASH_R: // Test for \R, any line break sequence.
3157 {
3158 if (fp->fInputIdx >= fActiveLimit) {
3159 fHitEnd = TRUE;
3160 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3161 break;
3162 }
3163 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3164 UChar32 c = UTEXT_NEXT32(fInputText);
3165 if (isLineTerminator(c)) {
3166 if (c == 0x0d && utext_current32(fInputText) == 0x0a) {
3167 utext_next32(fInputText);
3168 }
3169 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3170 } else {
3171 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3172 }
3173 }
3174 break;
3175
3176
3177 case URX_BACKSLASH_V: // \v, any single line ending character.
3178 {
3179 if (fp->fInputIdx >= fActiveLimit) {
3180 fHitEnd = TRUE;
3181 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3182 break;
3183 }
3184 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3185 UChar32 c = UTEXT_NEXT32(fInputText);
3186 UBool success = isLineTerminator(c);
3187 success ^= (UBool)(opValue != 0); // flip sense for \V
3188 if (success) {
3189 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3190 } else {
3191 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3192 }
3193 }
3194 break;
3195
3196
3197 case URX_BACKSLASH_X:
3198 // Match a Grapheme, as defined by Unicode TR 29.
3199 // Differs slightly from Perl, which consumes combining marks independently
3200 // of context.
3201 {
3202
3203 // Fail if at end of input
3204 if (fp->fInputIdx >= fActiveLimit) {
3205 fHitEnd = TRUE;
3206 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3207 break;
3208 }
3209
3210 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3211
3212 // Examine (and consume) the current char.
3213 // Dispatch into a little state machine, based on the char.
3214 UChar32 c;
3215 c = UTEXT_NEXT32(fInputText);
3216 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3217 UnicodeSet **sets = fPattern->fStaticSets;
3218 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
3219 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
3220 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3221 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3222 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3223 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3224 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3225 goto GC_Extend;
3226
3227
3228
3229 GC_L:
3230 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3231 c = UTEXT_NEXT32(fInputText);
3232 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3233 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3234 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3235 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3236 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3237 (void)UTEXT_PREVIOUS32(fInputText);
3238 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3239 goto GC_Extend;
3240
3241 GC_V:
3242 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3243 c = UTEXT_NEXT32(fInputText);
3244 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3245 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3246 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3247 (void)UTEXT_PREVIOUS32(fInputText);
3248 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3249 goto GC_Extend;
3250
3251 GC_T:
3252 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3253 c = UTEXT_NEXT32(fInputText);
3254 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3255 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3256 (void)UTEXT_PREVIOUS32(fInputText);
3257 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3258 goto GC_Extend;
3259
3260 GC_Extend:
3261 // Combining characters are consumed here
3262 for (;;) {
3263 if (fp->fInputIdx >= fActiveLimit) {
3264 break;
3265 }
3266 c = UTEXT_CURRENT32(fInputText);
3267 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3268 break;
3269 }
3270 (void)UTEXT_NEXT32(fInputText);
3271 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3272 }
3273 goto GC_Done;
3274
3275 GC_Control:
3276 // Most control chars stand alone (don't combine with combining chars),
3277 // except that a CR/LF sequence is a single grapheme cluster.
3278 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
3279 c = UTEXT_NEXT32(fInputText);
3280 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3281 }
3282
3283 GC_Done:
3284 if (fp->fInputIdx >= fActiveLimit) {
3285 fHitEnd = TRUE;
3286 }
3287 break;
3288 }
3289
3290
3291
3292
3293 case URX_BACKSLASH_Z: // Test for end of Input
3294 if (fp->fInputIdx < fAnchorLimit) {
3295 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3296 } else {
3297 fHitEnd = TRUE;
3298 fRequireEnd = TRUE;
3299 }
3300 break;
3301
3302
3303
3304 case URX_STATIC_SETREF:
3305 {
3306 // Test input character against one of the predefined sets
3307 // (Word Characters, for example)
3308 // The high bit of the op value is a flag for the match polarity.
3309 // 0: success if input char is in set.
3310 // 1: success if input char is not in set.
3311 if (fp->fInputIdx >= fActiveLimit) {
3312 fHitEnd = TRUE;
3313 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3314 break;
3315 }
3316
3317 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3318 opValue &= ~URX_NEG_SET;
3319 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3320
3321 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3322 UChar32 c = UTEXT_NEXT32(fInputText);
3323 if (c < 256) {
3324 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3325 if (s8->contains(c)) {
3326 success = !success;
3327 }
3328 } else {
3329 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3330 if (s->contains(c)) {
3331 success = !success;
3332 }
3333 }
3334 if (success) {
3335 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3336 } else {
3337 // the character wasn't in the set.
3338 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3339 }
3340 }
3341 break;
3342
3343
3344 case URX_STAT_SETREF_N:
3345 {
3346 // Test input character for NOT being a member of one of
3347 // the predefined sets (Word Characters, for example)
3348 if (fp->fInputIdx >= fActiveLimit) {
3349 fHitEnd = TRUE;
3350 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3351 break;
3352 }
3353
3354 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3355
3356 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3357
3358 UChar32 c = UTEXT_NEXT32(fInputText);
3359 if (c < 256) {
3360 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3361 if (s8->contains(c) == FALSE) {
3362 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3363 break;
3364 }
3365 } else {
3366 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3367 if (s->contains(c) == FALSE) {
3368 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3369 break;
3370 }
3371 }
3372 // the character wasn't in the set.
3373 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3374 }
3375 break;
3376
3377
3378 case URX_SETREF:
3379 if (fp->fInputIdx >= fActiveLimit) {
3380 fHitEnd = TRUE;
3381 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3382 break;
3383 } else {
3384 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3385
3386 // There is input left. Pick up one char and test it for set membership.
3387 UChar32 c = UTEXT_NEXT32(fInputText);
3388 U_ASSERT(opValue > 0 && opValue < sets->size());
3389 if (c<256) {
3390 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3391 if (s8->contains(c)) {
3392 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3393 break;
3394 }
3395 } else {
3396 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3397 if (s->contains(c)) {
3398 // The character is in the set. A Match.
3399 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3400 break;
3401 }
3402 }
3403
3404 // the character wasn't in the set.
3405 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3406 }
3407 break;
3408
3409
3410 case URX_DOTANY:
3411 {
3412 // . matches anything, but stops at end-of-line.
3413 if (fp->fInputIdx >= fActiveLimit) {
3414 // At end of input. Match failed. Backtrack out.
3415 fHitEnd = TRUE;
3416 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3417 break;
3418 }
3419
3420 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3421
3422 // There is input left. Advance over one char, unless we've hit end-of-line
3423 UChar32 c = UTEXT_NEXT32(fInputText);
3424 if (isLineTerminator(c)) {
3425 // End of line in normal mode. . does not match.
3426 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3427 break;
3428 }
3429 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3430 }
3431 break;
3432
3433
3434 case URX_DOTANY_ALL:
3435 {
3436 // ., in dot-matches-all (including new lines) mode
3437 if (fp->fInputIdx >= fActiveLimit) {
3438 // At end of input. Match failed. Backtrack out.
3439 fHitEnd = TRUE;
3440 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3441 break;
3442 }
3443
3444 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3445
3446 // There is input left. Advance over one char, except if we are
3447 // at a cr/lf, advance over both of them.
3448 UChar32 c;
3449 c = UTEXT_NEXT32(fInputText);
3450 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3451 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3452 // In the case of a CR/LF, we need to advance over both.
3453 UChar32 nextc = UTEXT_CURRENT32(fInputText);
3454 if (nextc == 0x0a) {
3455 (void)UTEXT_NEXT32(fInputText);
3456 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3457 }
3458 }
3459 }
3460 break;
3461
3462
3463 case URX_DOTANY_UNIX:
3464 {
3465 // '.' operator, matches all, but stops at end-of-line.
3466 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
3467 if (fp->fInputIdx >= fActiveLimit) {
3468 // At end of input. Match failed. Backtrack out.
3469 fHitEnd = TRUE;
3470 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3471 break;
3472 }
3473
3474 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3475
3476 // There is input left. Advance over one char, unless we've hit end-of-line
3477 UChar32 c = UTEXT_NEXT32(fInputText);
3478 if (c == 0x0a) {
3479 // End of line in UNIX_LINES mode. '.' does not match the \n
3480 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3481 } else {
3482 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3483 }
3484 }
3485 break;
3486
3487
3488 case URX_JMP:
3489 fp->fPatIdx = opValue;
3490 break;
3491
3492 case URX_FAIL:
3493 isMatch = FALSE;
3494 goto breakFromLoop;
3495
3496 case URX_JMP_SAV:
3497 U_ASSERT(opValue < fPattern->fCompiledPat->size());
3498 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3499 fp->fPatIdx = opValue; // Then JMP.
3500 break;
3501
3502 case URX_JMP_SAV_X:
3503 // This opcode is used with (x)+, when x can match a zero length string.
3504 // Same as JMP_SAV, except conditional on the match having made forward progress.
3505 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3506 // data address of the input position at the start of the loop.
3507 {
3508 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3509 int32_t stoOp = (int32_t)pat[opValue-1];
3510 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3511 int32_t frameLoc = URX_VAL(stoOp);
3512 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3513 int64_t prevInputIdx = fp->fExtra[frameLoc];
3514 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3515 if (prevInputIdx < fp->fInputIdx) {
3516 // The match did make progress. Repeat the loop.
3517 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3518 fp->fPatIdx = opValue;
3519 fp->fExtra[frameLoc] = fp->fInputIdx;
3520 }
3521 // If the input position did not advance, we do nothing here,
3522 // execution will fall out of the loop.
3523 }
3524 break;
3525
3526 case URX_CTR_INIT:
3527 {
3528 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3529 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3530
3531 // Pick up the three extra operands that CTR_INIT has, and
3532 // skip the pattern location counter past
3533 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3534 fp->fPatIdx += 3;
3535 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3536 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3537 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3538 U_ASSERT(minCount>=0);
3539 U_ASSERT(maxCount>=minCount || maxCount==-1);
3540 U_ASSERT(loopLoc>=fp->fPatIdx);
3541
3542 if (minCount == 0) {
3543 fp = StateSave(fp, loopLoc+1, status);
3544 }
3545 if (maxCount == -1) {
3546 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
3547 } else if (maxCount == 0) {
3548 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3549 }
3550 }
3551 break;
3552
3553 case URX_CTR_LOOP:
3554 {
3555 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3556 int32_t initOp = (int32_t)pat[opValue];
3557 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3558 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3559 int32_t minCount = (int32_t)pat[opValue+2];
3560 int32_t maxCount = (int32_t)pat[opValue+3];
3561 (*pCounter)++;
3562 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3563 U_ASSERT(*pCounter == maxCount);
3564 break;
3565 }
3566 if (*pCounter >= minCount) {
3567 if (maxCount == -1) {
3568 // Loop has no hard upper bound.
3569 // Check that it is progressing through the input, break if it is not.
3570 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3571 if (fp->fInputIdx == *pLastInputIdx) {
3572 break;
3573 } else {
3574 *pLastInputIdx = fp->fInputIdx;
3575 }
3576 }
3577 fp = StateSave(fp, fp->fPatIdx, status);
3578 } else {
3579 // Increment time-out counter. (StateSave() does it if count >= minCount)
3580 fTickCounter--;
3581 if (fTickCounter <= 0) {
3582 IncrementTime(status); // Re-initializes fTickCounter
3583 }
3584 }
3585
3586 fp->fPatIdx = opValue + 4; // Loop back.
3587 }
3588 break;
3589
3590 case URX_CTR_INIT_NG:
3591 {
3592 // Initialize a non-greedy loop
3593 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3594 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3595
3596 // Pick up the three extra operands that CTR_INIT_NG has, and
3597 // skip the pattern location counter past
3598 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3599 fp->fPatIdx += 3;
3600 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3601 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3602 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3603 U_ASSERT(minCount>=0);
3604 U_ASSERT(maxCount>=minCount || maxCount==-1);
3605 U_ASSERT(loopLoc>fp->fPatIdx);
3606 if (maxCount == -1) {
3607 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
3608 }
3609
3610 if (minCount == 0) {
3611 if (maxCount != 0) {
3612 fp = StateSave(fp, fp->fPatIdx, status);
3613 }
3614 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
3615 }
3616 }
3617 break;
3618
3619 case URX_CTR_LOOP_NG:
3620 {
3621 // Non-greedy {min, max} loops
3622 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3623 int32_t initOp = (int32_t)pat[opValue];
3624 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3625 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3626 int32_t minCount = (int32_t)pat[opValue+2];
3627 int32_t maxCount = (int32_t)pat[opValue+3];
3628
3629 (*pCounter)++;
3630 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3631 // The loop has matched the maximum permitted number of times.
3632 // Break out of here with no action. Matching will
3633 // continue with the following pattern.
3634 U_ASSERT(*pCounter == maxCount);
3635 break;
3636 }
3637
3638 if (*pCounter < minCount) {
3639 // We haven't met the minimum number of matches yet.
3640 // Loop back for another one.
3641 fp->fPatIdx = opValue + 4; // Loop back.
3642 // Increment time-out counter. (StateSave() does it if count >= minCount)
3643 fTickCounter--;
3644 if (fTickCounter <= 0) {
3645 IncrementTime(status); // Re-initializes fTickCounter
3646 }
3647 } else {
3648 // We do have the minimum number of matches.
3649
3650 // If there is no upper bound on the loop iterations, check that the input index
3651 // is progressing, and stop the loop if it is not.
3652 if (maxCount == -1) {
3653 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3654 if (fp->fInputIdx == *pLastInputIdx) {
3655 break;
3656 }
3657 *pLastInputIdx = fp->fInputIdx;
3658 }
3659
3660 // Loop Continuation: we will fall into the pattern following the loop
3661 // (non-greedy, don't execute loop body first), but first do
3662 // a state save to the top of the loop, so that a match failure
3663 // in the following pattern will try another iteration of the loop.
3664 fp = StateSave(fp, opValue + 4, status);
3665 }
3666 }
3667 break;
3668
3669 case URX_STO_SP:
3670 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3671 fData[opValue] = fStack->size();
3672 break;
3673
3674 case URX_LD_SP:
3675 {
3676 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3677 int32_t newStackSize = (int32_t)fData[opValue];
3678 U_ASSERT(newStackSize <= fStack->size());
3679 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3680 if (newFP == (int64_t *)fp) {
3681 break;
3682 }
3683 int32_t i;
3684 for (i=0; i<fFrameSize; i++) {
3685 newFP[i] = ((int64_t *)fp)[i];
3686 }
3687 fp = (REStackFrame *)newFP;
3688 fStack->setSize(newStackSize);
3689 }
3690 break;
3691
3692 case URX_BACKREF:
3693 {
3694 U_ASSERT(opValue < fFrameSize);
3695 int64_t groupStartIdx = fp->fExtra[opValue];
3696 int64_t groupEndIdx = fp->fExtra[opValue+1];
3697 U_ASSERT(groupStartIdx <= groupEndIdx);
3698 if (groupStartIdx < 0) {
3699 // This capture group has not participated in the match thus far,
3700 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3701 break;
3702 }
3703 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3704 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3705
3706 // Note: if the capture group match was of an empty string the backref
3707 // match succeeds. Verified by testing: Perl matches succeed
3708 // in this case, so we do too.
3709
3710 UBool success = TRUE;
3711 for (;;) {
3712 if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3713 success = TRUE;
3714 break;
3715 }
3716 if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
3717 success = FALSE;
3718 fHitEnd = TRUE;
3719 break;
3720 }
3721 UChar32 captureGroupChar = utext_next32(fAltInputText);
3722 UChar32 inputChar = utext_next32(fInputText);
3723 if (inputChar != captureGroupChar) {
3724 success = FALSE;
3725 break;
3726 }
3727 }
3728
3729 if (success) {
3730 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3731 } else {
3732 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3733 }
3734 }
3735 break;
3736
3737
3738
3739 case URX_BACKREF_I:
3740 {
3741 U_ASSERT(opValue < fFrameSize);
3742 int64_t groupStartIdx = fp->fExtra[opValue];
3743 int64_t groupEndIdx = fp->fExtra[opValue+1];
3744 U_ASSERT(groupStartIdx <= groupEndIdx);
3745 if (groupStartIdx < 0) {
3746 // This capture group has not participated in the match thus far,
3747 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3748 break;
3749 }
3750 utext_setNativeIndex(fAltInputText, groupStartIdx);
3751 utext_setNativeIndex(fInputText, fp->fInputIdx);
3752 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3753 CaseFoldingUTextIterator inputItr(*fInputText);
3754
3755 // Note: if the capture group match was of an empty string the backref
3756 // match succeeds. Verified by testing: Perl matches succeed
3757 // in this case, so we do too.
3758
3759 UBool success = TRUE;
3760 for (;;) {
3761 if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3762 success = TRUE;
3763 break;
3764 }
3765 if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
3766 success = FALSE;
3767 fHitEnd = TRUE;
3768 break;
3769 }
3770 UChar32 captureGroupChar = captureGroupItr.next();
3771 UChar32 inputChar = inputItr.next();
3772 if (inputChar != captureGroupChar) {
3773 success = FALSE;
3774 break;
3775 }
3776 }
3777
3778 if (success && inputItr.inExpansion()) {
3779 // We obtained a match by consuming part of a string obtained from
3780 // case-folding a single code point of the input text.
3781 // This does not count as an overall match.
3782 success = FALSE;
3783 }
3784
3785 if (success) {
3786 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3787 } else {
3788 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3789 }
3790
3791 }
3792 break;
3793
3794 case URX_STO_INP_LOC:
3795 {
3796 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3797 fp->fExtra[opValue] = fp->fInputIdx;
3798 }
3799 break;
3800
3801 case URX_JMPX:
3802 {
3803 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3804 fp->fPatIdx += 1;
3805 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
3806 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3807 int64_t savedInputIdx = fp->fExtra[dataLoc];
3808 U_ASSERT(savedInputIdx <= fp->fInputIdx);
3809 if (savedInputIdx < fp->fInputIdx) {
3810 fp->fPatIdx = opValue; // JMP
3811 } else {
3812 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
3813 }
3814 }
3815 break;
3816
3817 case URX_LA_START:
3818 {
3819 // Entering a lookahead block.
3820 // Save Stack Ptr, Input Pos.
3821 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3822 fData[opValue] = fStack->size();
3823 fData[opValue+1] = fp->fInputIdx;
3824 fActiveStart = fLookStart; // Set the match region change for
3825 fActiveLimit = fLookLimit; // transparent bounds.
3826 }
3827 break;
3828
3829 case URX_LA_END:
3830 {
3831 // Leaving a look-ahead block.
3832 // restore Stack Ptr, Input Pos to positions they had on entry to block.
3833 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3834 int32_t stackSize = fStack->size();
3835 int32_t newStackSize =(int32_t)fData[opValue];
3836 U_ASSERT(stackSize >= newStackSize);
3837 if (stackSize > newStackSize) {
3838 // Copy the current top frame back to the new (cut back) top frame.
3839 // This makes the capture groups from within the look-ahead
3840 // expression available.
3841 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3842 int32_t i;
3843 for (i=0; i<fFrameSize; i++) {
3844 newFP[i] = ((int64_t *)fp)[i];
3845 }
3846 fp = (REStackFrame *)newFP;
3847 fStack->setSize(newStackSize);
3848 }
3849 fp->fInputIdx = fData[opValue+1];
3850
3851 // Restore the active region bounds in the input string; they may have
3852 // been changed because of transparent bounds on a Region.
3853 fActiveStart = fRegionStart;
3854 fActiveLimit = fRegionLimit;
3855 }
3856 break;
3857
3858 case URX_ONECHAR_I:
3859 // Case insensitive one char. The char from the pattern is already case folded.
3860 // Input text is not, but case folding the input can not reduce two or more code
3861 // points to one.
3862 if (fp->fInputIdx < fActiveLimit) {
3863 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3864
3865 UChar32 c = UTEXT_NEXT32(fInputText);
3866 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3867 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3868 break;
3869 }
3870 } else {
3871 fHitEnd = TRUE;
3872 }
3873
3874 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3875 break;
3876
3877 case URX_STRING_I:
3878 {
3879 // Case-insensitive test input against a literal string.
3880 // Strings require two slots in the compiled pattern, one for the
3881 // offset to the string text, and one for the length.
3882 // The compiled string has already been case folded.
3883 {
3884 const UChar *patternString = litText + opValue;
3885 int32_t patternStringIdx = 0;
3886
3887 op = (int32_t)pat[fp->fPatIdx];
3888 fp->fPatIdx++;
3889 opType = URX_TYPE(op);
3890 opValue = URX_VAL(op);
3891 U_ASSERT(opType == URX_STRING_LEN);
3892 int32_t patternStringLen = opValue; // Length of the string from the pattern.
3893
3894
3895 UChar32 cPattern;
3896 UChar32 cText;
3897 UBool success = TRUE;
3898
3899 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3900 CaseFoldingUTextIterator inputIterator(*fInputText);
3901 while (patternStringIdx < patternStringLen) {
3902 if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
3903 success = FALSE;
3904 fHitEnd = TRUE;
3905 break;
3906 }
3907 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
3908 cText = inputIterator.next();
3909 if (cText != cPattern) {
3910 success = FALSE;
3911 break;
3912 }
3913 }
3914 if (inputIterator.inExpansion()) {
3915 success = FALSE;
3916 }
3917
3918 if (success) {
3919 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3920 } else {
3921 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3922 }
3923 }
3924 }
3925 break;
3926
3927 case URX_LB_START:
3928 {
3929 // Entering a look-behind block.
3930 // Save Stack Ptr, Input Pos.
3931 // TODO: implement transparent bounds. Ticket #6067
3932 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3933 fData[opValue] = fStack->size();
3934 fData[opValue+1] = fp->fInputIdx;
3935 // Init the variable containing the start index for attempted matches.
3936 fData[opValue+2] = -1;
3937 // Save input string length, then reset to pin any matches to end at
3938 // the current position.
3939 fData[opValue+3] = fActiveLimit;
3940 fActiveLimit = fp->fInputIdx;
3941 }
3942 break;
3943
3944
3945 case URX_LB_CONT:
3946 {
3947 // Positive Look-Behind, at top of loop checking for matches of LB expression
3948 // at all possible input starting positions.
3949
3950 // Fetch the min and max possible match lengths. They are the operands
3951 // of this op in the pattern.
3952 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3953 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3954 if (!UTEXT_USES_U16(fInputText)) {
3955 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3956 // The max length need not be exact; it just needs to be >= actual maximum.
3957 maxML *= 3;
3958 }
3959 U_ASSERT(minML <= maxML);
3960 U_ASSERT(minML >= 0);
3961
3962 // Fetch (from data) the last input index where a match was attempted.
3963 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3964 int64_t &lbStartIdx = fData[opValue+2];
3965 if (lbStartIdx < 0) {
3966 // First time through loop.
3967 lbStartIdx = fp->fInputIdx - minML;
3968 if (lbStartIdx > 0) {
3969 // move index to a code point boundary, if it's not on one already.
3970 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3971 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3972 }
3973 } else {
3974 // 2nd through nth time through the loop.
3975 // Back up start position for match by one.
3976 if (lbStartIdx == 0) {
3977 (lbStartIdx)--;
3978 } else {
3979 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3980 (void)UTEXT_PREVIOUS32(fInputText);
3981 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3982 }
3983 }
3984
3985 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
3986 // We have tried all potential match starting points without
3987 // getting a match. Backtrack out, and out of the
3988 // Look Behind altogether.
3989 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3990 int64_t restoreInputLen = fData[opValue+3];
3991 U_ASSERT(restoreInputLen >= fActiveLimit);
3992 U_ASSERT(restoreInputLen <= fInputLength);
3993 fActiveLimit = restoreInputLen;
3994 break;
3995 }
3996
3997 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3998 // (successful match will fall off the end of the loop.)
3999 fp = StateSave(fp, fp->fPatIdx-3, status);
4000 fp->fInputIdx = lbStartIdx;
4001 }
4002 break;
4003
4004 case URX_LB_END:
4005 // End of a look-behind block, after a successful match.
4006 {
4007 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4008 if (fp->fInputIdx != fActiveLimit) {
4009 // The look-behind expression matched, but the match did not
4010 // extend all the way to the point that we are looking behind from.
4011 // FAIL out of here, which will take us back to the LB_CONT, which
4012 // will retry the match starting at another position or fail
4013 // the look-behind altogether, whichever is appropriate.
4014 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4015 break;
4016 }
4017
4018 // Look-behind match is good. Restore the original input string length,
4019 // which had been truncated to pin the end of the lookbehind match to the
4020 // position being looked-behind.
4021 int64_t originalInputLen = fData[opValue+3];
4022 U_ASSERT(originalInputLen >= fActiveLimit);
4023 U_ASSERT(originalInputLen <= fInputLength);
4024 fActiveLimit = originalInputLen;
4025 }
4026 break;
4027
4028
4029 case URX_LBN_CONT:
4030 {
4031 // Negative Look-Behind, at top of loop checking for matches of LB expression
4032 // at all possible input starting positions.
4033
4034 // Fetch the extra parameters of this op.
4035 int32_t minML = (int32_t)pat[fp->fPatIdx++];
4036 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
4037 if (!UTEXT_USES_U16(fInputText)) {
4038 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
4039 // The max length need not be exact; it just needs to be >= actual maximum.
4040 maxML *= 3;
4041 }
4042 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
4043 continueLoc = URX_VAL(continueLoc);
4044 U_ASSERT(minML <= maxML);
4045 U_ASSERT(minML >= 0);
4046 U_ASSERT(continueLoc > fp->fPatIdx);
4047
4048 // Fetch (from data) the last input index where a match was attempted.
4049 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4050 int64_t &lbStartIdx = fData[opValue+2];
4051 if (lbStartIdx < 0) {
4052 // First time through loop.
4053 lbStartIdx = fp->fInputIdx - minML;
4054 if (lbStartIdx > 0) {
4055 // move index to a code point boundary, if it's not on one already.
4056 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4057 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4058 }
4059 } else {
4060 // 2nd through nth time through the loop.
4061 // Back up start position for match by one.
4062 if (lbStartIdx == 0) {
4063 (lbStartIdx)--;
4064 } else {
4065 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4066 (void)UTEXT_PREVIOUS32(fInputText);
4067 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4068 }
4069 }
4070
4071 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
4072 // We have tried all potential match starting points without
4073 // getting a match, which means that the negative lookbehind as
4074 // a whole has succeeded. Jump forward to the continue location
4075 int64_t restoreInputLen = fData[opValue+3];
4076 U_ASSERT(restoreInputLen >= fActiveLimit);
4077 U_ASSERT(restoreInputLen <= fInputLength);
4078 fActiveLimit = restoreInputLen;
4079 fp->fPatIdx = continueLoc;
4080 break;
4081 }
4082
4083 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4084 // (successful match will cause a FAIL out of the loop altogether.)
4085 fp = StateSave(fp, fp->fPatIdx-4, status);
4086 fp->fInputIdx = lbStartIdx;
4087 }
4088 break;
4089
4090 case URX_LBN_END:
4091 // End of a negative look-behind block, after a successful match.
4092 {
4093 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4094 if (fp->fInputIdx != fActiveLimit) {
4095 // The look-behind expression matched, but the match did not
4096 // extend all the way to the point that we are looking behind from.
4097 // FAIL out of here, which will take us back to the LB_CONT, which
4098 // will retry the match starting at another position or succeed
4099 // the look-behind altogether, whichever is appropriate.
4100 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4101 break;
4102 }
4103
4104 // Look-behind expression matched, which means look-behind test as
4105 // a whole Fails
4106
4107 // Restore the orignal input string length, which had been truncated
4108 // inorder to pin the end of the lookbehind match
4109 // to the position being looked-behind.
4110 int64_t originalInputLen = fData[opValue+3];
4111 U_ASSERT(originalInputLen >= fActiveLimit);
4112 U_ASSERT(originalInputLen <= fInputLength);
4113 fActiveLimit = originalInputLen;
4114
4115 // Restore original stack position, discarding any state saved
4116 // by the successful pattern match.
4117 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4118 int32_t newStackSize = (int32_t)fData[opValue];
4119 U_ASSERT(fStack->size() > newStackSize);
4120 fStack->setSize(newStackSize);
4121
4122 // FAIL, which will take control back to someplace
4123 // prior to entering the look-behind test.
4124 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4125 }
4126 break;
4127
4128
4129 case URX_LOOP_SR_I:
4130 // Loop Initialization for the optimized implementation of
4131 // [some character set]*
4132 // This op scans through all matching input.
4133 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4134 {
4135 U_ASSERT(opValue > 0 && opValue < sets->size());
4136 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4137 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4138
4139 // Loop through input, until either the input is exhausted or
4140 // we reach a character that is not a member of the set.
4141 int64_t ix = fp->fInputIdx;
4142 UTEXT_SETNATIVEINDEX(fInputText, ix);
4143 for (;;) {
4144 if (ix >= fActiveLimit) {
4145 fHitEnd = TRUE;
4146 break;
4147 }
4148 UChar32 c = UTEXT_NEXT32(fInputText);
4149 if (c<256) {
4150 if (s8->contains(c) == FALSE) {
4151 break;
4152 }
4153 } else {
4154 if (s->contains(c) == FALSE) {
4155 break;
4156 }
4157 }
4158 ix = UTEXT_GETNATIVEINDEX(fInputText);
4159 }
4160
4161 // If there were no matching characters, skip over the loop altogether.
4162 // The loop doesn't run at all, a * op always succeeds.
4163 if (ix == fp->fInputIdx) {
4164 fp->fPatIdx++; // skip the URX_LOOP_C op.
4165 break;
4166 }
4167
4168 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4169 // must follow. It's operand is the stack location
4170 // that holds the starting input index for the match of this [set]*
4171 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4172 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4173 int32_t stackLoc = URX_VAL(loopcOp);
4174 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4175 fp->fExtra[stackLoc] = fp->fInputIdx;
4176 fp->fInputIdx = ix;
4177
4178 // Save State to the URX_LOOP_C op that follows this one,
4179 // so that match failures in the following code will return to there.
4180 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4181 fp = StateSave(fp, fp->fPatIdx, status);
4182 fp->fPatIdx++;
4183 }
4184 break;
4185
4186
4187 case URX_LOOP_DOT_I:
4188 // Loop Initialization for the optimized implementation of .*
4189 // This op scans through all remaining input.
4190 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4191 {
4192 // Loop through input until the input is exhausted (we reach an end-of-line)
4193 // In DOTALL mode, we can just go straight to the end of the input.
4194 int64_t ix;
4195 if ((opValue & 1) == 1) {
4196 // Dot-matches-All mode. Jump straight to the end of the string.
4197 ix = fActiveLimit;
4198 fHitEnd = TRUE;
4199 } else {
4200 // NOT DOT ALL mode. Line endings do not match '.'
4201 // Scan forward until a line ending or end of input.
4202 ix = fp->fInputIdx;
4203 UTEXT_SETNATIVEINDEX(fInputText, ix);
4204 for (;;) {
4205 if (ix >= fActiveLimit) {
4206 fHitEnd = TRUE;
4207 break;
4208 }
4209 UChar32 c = UTEXT_NEXT32(fInputText);
4210 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
4211 if ((c == 0x0a) || // 0x0a is newline in both modes.
4212 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
4213 isLineTerminator(c))) {
4214 // char is a line ending. Exit the scanning loop.
4215 break;
4216 }
4217 }
4218 ix = UTEXT_GETNATIVEINDEX(fInputText);
4219 }
4220 }
4221
4222 // If there were no matching characters, skip over the loop altogether.
4223 // The loop doesn't run at all, a * op always succeeds.
4224 if (ix == fp->fInputIdx) {
4225 fp->fPatIdx++; // skip the URX_LOOP_C op.
4226 break;
4227 }
4228
4229 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4230 // must follow. It's operand is the stack location
4231 // that holds the starting input index for the match of this .*
4232 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4233 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4234 int32_t stackLoc = URX_VAL(loopcOp);
4235 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4236 fp->fExtra[stackLoc] = fp->fInputIdx;
4237 fp->fInputIdx = ix;
4238
4239 // Save State to the URX_LOOP_C op that follows this one,
4240 // so that match failures in the following code will return to there.
4241 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4242 fp = StateSave(fp, fp->fPatIdx, status);
4243 fp->fPatIdx++;
4244 }
4245 break;
4246
4247
4248 case URX_LOOP_C:
4249 {
4250 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4251 backSearchIndex = fp->fExtra[opValue];
4252 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4253 if (backSearchIndex == fp->fInputIdx) {
4254 // We've backed up the input idx to the point that the loop started.
4255 // The loop is done. Leave here without saving state.
4256 // Subsequent failures won't come back here.
4257 break;
4258 }
4259 // Set up for the next iteration of the loop, with input index
4260 // backed up by one from the last time through,
4261 // and a state save to this instruction in case the following code fails again.
4262 // (We're going backwards because this loop emulates stack unwinding, not
4263 // the initial scan forward.)
4264 U_ASSERT(fp->fInputIdx > 0);
4265 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4266 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4267 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4268
4269 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4270 if (prevC == 0x0a &&
4271 fp->fInputIdx > backSearchIndex &&
4272 twoPrevC == 0x0d) {
4273 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4274 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4275 // .*, stepping back over CRLF pair.
4276 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4277 }
4278 }
4279
4280
4281 fp = StateSave(fp, fp->fPatIdx-1, status);
4282 }
4283 break;
4284
4285
4286
4287 default:
4288 // Trouble. The compiled pattern contains an entry with an
4289 // unrecognized type tag.
4290 U_ASSERT(FALSE);
4291 }
4292
4293 if (U_FAILURE(status)) {
4294 isMatch = FALSE;
4295 break;
4296 }
4297 }
4298
4299 breakFromLoop:
4300 fMatch = isMatch;
4301 if (isMatch) {
4302 fLastMatchEnd = fMatchEnd;
4303 fMatchStart = startIdx;
4304 fMatchEnd = fp->fInputIdx;
4305 }
4306
4307 #ifdef REGEX_RUN_DEBUG
4308 if (fTraceDebug) {
4309 if (isMatch) {
4310 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
4311 } else {
4312 printf("No match\n\n");
4313 }
4314 }
4315 #endif
4316
4317 fFrame = fp; // The active stack frame when the engine stopped.
4318 // Contains the capture group results that we need to
4319 // access later.
4320 return;
4321 }
4322
4323
4324 //--------------------------------------------------------------------------------
4325 //
4326 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
4327 // assumption that the entire string is available in the UText's
4328 // chunk buffer. For now, that means we can use int32_t indexes,
4329 // except for anything that needs to be saved (like group starts
4330 // and ends).
4331 //
4332 // startIdx: begin matching at this index.
4333 // toEnd: if true, match must extend to end of the input region
4334 //
4335 //--------------------------------------------------------------------------------
4336 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4337 UBool isMatch = FALSE; // True if the we have a match.
4338
4339 int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4340
4341 int32_t op; // Operation from the compiled pattern, split into
4342 int32_t opType; // the opcode
4343 int32_t opValue; // and the operand value.
4344
4345 #ifdef REGEX_RUN_DEBUG
4346 if (fTraceDebug) {
4347 printf("MatchAt(startIdx=%d)\n", startIdx);
4348 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
4349 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
4350 }
4351 #endif
4352
4353 if (U_FAILURE(status)) {
4354 return;
4355 }
4356
4357 // Cache frequently referenced items from the compiled pattern
4358 //
4359 int64_t *pat = fPattern->fCompiledPat->getBuffer();
4360
4361 const UChar *litText = fPattern->fLiteralText.getBuffer();
4362 UVector *sets = fPattern->fSets;
4363
4364 const UChar *inputBuf = fInputText->chunkContents;
4365
4366 fFrameSize = fPattern->fFrameSize;
4367 REStackFrame *fp = resetStack();
4368 if (U_FAILURE(fDeferredStatus)) {
4369 status = fDeferredStatus;
4370 return;
4371 }
4372
4373 fp->fPatIdx = 0;
4374 fp->fInputIdx = startIdx;
4375
4376 // Zero out the pattern's static data
4377 int32_t i;
4378 for (i = 0; i<fPattern->fDataSize; i++) {
4379 fData[i] = 0;
4380 }
4381
4382 //
4383 // Main loop for interpreting the compiled pattern.
4384 // One iteration of the loop per pattern operation performed.
4385 //
4386 for (;;) {
4387 op = (int32_t)pat[fp->fPatIdx];
4388 opType = URX_TYPE(op);
4389 opValue = URX_VAL(op);
4390 #ifdef REGEX_RUN_DEBUG
4391 if (fTraceDebug) {
4392 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4393 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
4394 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4395 fPattern->dumpOp(fp->fPatIdx);
4396 }
4397 #endif
4398 fp->fPatIdx++;
4399
4400 switch (opType) {
4401
4402
4403 case URX_NOP:
4404 break;
4405
4406
4407 case URX_BACKTRACK:
4408 // Force a backtrack. In some circumstances, the pattern compiler
4409 // will notice that the pattern can't possibly match anything, and will
4410 // emit one of these at that point.
4411 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4412 break;
4413
4414
4415 case URX_ONECHAR:
4416 if (fp->fInputIdx < fActiveLimit) {
4417 UChar32 c;
4418 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4419 if (c == opValue) {
4420 break;
4421 }
4422 } else {
4423 fHitEnd = TRUE;
4424 }
4425 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4426 break;
4427
4428
4429 case URX_STRING:
4430 {
4431 // Test input against a literal string.
4432 // Strings require two slots in the compiled pattern, one for the
4433 // offset to the string text, and one for the length.
4434 int32_t stringStartIdx = opValue;
4435 int32_t stringLen;
4436
4437 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
4438 fp->fPatIdx++;
4439 opType = URX_TYPE(op);
4440 stringLen = URX_VAL(op);
4441 U_ASSERT(opType == URX_STRING_LEN);
4442 U_ASSERT(stringLen >= 2);
4443
4444 const UChar * pInp = inputBuf + fp->fInputIdx;
4445 const UChar * pInpLimit = inputBuf + fActiveLimit;
4446 const UChar * pPat = litText+stringStartIdx;
4447 const UChar * pEnd = pInp + stringLen;
4448 UBool success = TRUE;
4449 while (pInp < pEnd) {
4450 if (pInp >= pInpLimit) {
4451 fHitEnd = TRUE;
4452 success = FALSE;
4453 break;
4454 }
4455 if (*pInp++ != *pPat++) {
4456 success = FALSE;
4457 break;
4458 }
4459 }
4460
4461 if (success) {
4462 fp->fInputIdx += stringLen;
4463 } else {
4464 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4465 }
4466 }
4467 break;
4468
4469
4470 case URX_STATE_SAVE:
4471 fp = StateSave(fp, opValue, status);
4472 break;
4473
4474
4475 case URX_END:
4476 // The match loop will exit via this path on a successful match,
4477 // when we reach the end of the pattern.
4478 if (toEnd && fp->fInputIdx != fActiveLimit) {
4479 // The pattern matched, but not to the end of input. Try some more.
4480 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4481 break;
4482 }
4483 isMatch = TRUE;
4484 goto breakFromLoop;
4485
4486 // Start and End Capture stack frame variables are laid out out like this:
4487 // fp->fExtra[opValue] - The start of a completed capture group
4488 // opValue+1 - The end of a completed capture group
4489 // opValue+2 - the start of a capture group whose end
4490 // has not yet been reached (and might not ever be).
4491 case URX_START_CAPTURE:
4492 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4493 fp->fExtra[opValue+2] = fp->fInputIdx;
4494 break;
4495
4496
4497 case URX_END_CAPTURE:
4498 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4499 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
4500 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
4501 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
4502 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4503 break;
4504
4505
4506 case URX_DOLLAR: // $, test for End of line
4507 // or for position before new line at end of input
4508 if (fp->fInputIdx < fAnchorLimit-2) {
4509 // We are no where near the end of input. Fail.
4510 // This is the common case. Keep it first.
4511 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4512 break;
4513 }
4514 if (fp->fInputIdx >= fAnchorLimit) {
4515 // We really are at the end of input. Success.
4516 fHitEnd = TRUE;
4517 fRequireEnd = TRUE;
4518 break;
4519 }
4520
4521 // If we are positioned just before a new-line that is located at the
4522 // end of input, succeed.
4523 if (fp->fInputIdx == fAnchorLimit-1) {
4524 UChar32 c;
4525 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4526
4527 if (isLineTerminator(c)) {
4528 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4529 // At new-line at end of input. Success
4530 fHitEnd = TRUE;
4531 fRequireEnd = TRUE;
4532 break;
4533 }
4534 }
4535 } else if (fp->fInputIdx == fAnchorLimit-2 &&
4536 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4537 fHitEnd = TRUE;
4538 fRequireEnd = TRUE;
4539 break; // At CR/LF at end of input. Success
4540 }
4541
4542 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4543
4544 break;
4545
4546
4547 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
4548 if (fp->fInputIdx >= fAnchorLimit-1) {
4549 // Either at the last character of input, or off the end.
4550 if (fp->fInputIdx == fAnchorLimit-1) {
4551 // At last char of input. Success if it's a new line.
4552 if (inputBuf[fp->fInputIdx] == 0x0a) {
4553 fHitEnd = TRUE;
4554 fRequireEnd = TRUE;
4555 break;
4556 }
4557 } else {
4558 // Off the end of input. Success.
4559 fHitEnd = TRUE;
4560 fRequireEnd = TRUE;
4561 break;
4562 }
4563 }
4564
4565 // Not at end of input. Back-track out.
4566 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4567 break;
4568
4569
4570 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
4571 {
4572 if (fp->fInputIdx >= fAnchorLimit) {
4573 // We really are at the end of input. Success.
4574 fHitEnd = TRUE;
4575 fRequireEnd = TRUE;
4576 break;
4577 }
4578 // If we are positioned just before a new-line, succeed.
4579 // It makes no difference where the new-line is within the input.
4580 UChar32 c = inputBuf[fp->fInputIdx];
4581 if (isLineTerminator(c)) {
4582 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4583 // In multi-line mode, hitting a new-line just before the end of input does not
4584 // set the hitEnd or requireEnd flags
4585 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4586 break;
4587 }
4588 }
4589 // not at a new line. Fail.
4590 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4591 }
4592 break;
4593
4594
4595 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
4596 {
4597 if (fp->fInputIdx >= fAnchorLimit) {
4598 // We really are at the end of input. Success.
4599 fHitEnd = TRUE;
4600 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
4601 break; // adding a new-line would not lose the match.
4602 }
4603 // If we are not positioned just before a new-line, the test fails; backtrack out.
4604 // It makes no difference where the new-line is within the input.
4605 if (inputBuf[fp->fInputIdx] != 0x0a) {
4606 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4607 }
4608 }
4609 break;
4610
4611
4612 case URX_CARET: // ^, test for start of line
4613 if (fp->fInputIdx != fAnchorStart) {
4614 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4615 }
4616 break;
4617
4618
4619 case URX_CARET_M: // ^, test for start of line in mulit-line mode
4620 {
4621 if (fp->fInputIdx == fAnchorStart) {
4622 // We are at the start input. Success.
4623 break;
4624 }
4625 // Check whether character just before the current pos is a new-line
4626 // unless we are at the end of input
4627 UChar c = inputBuf[fp->fInputIdx - 1];
4628 if ((fp->fInputIdx < fAnchorLimit) &&
4629 isLineTerminator(c)) {
4630 // It's a new-line. ^ is true. Success.
4631 // TODO: what should be done with positions between a CR and LF?
4632 break;
4633 }
4634 // Not at the start of a line. Fail.
4635 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4636 }
4637 break;
4638
4639
4640 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
4641 {
4642 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4643 if (fp->fInputIdx <= fAnchorStart) {
4644 // We are at the start input. Success.
4645 break;
4646 }
4647 // Check whether character just before the current pos is a new-line
4648 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4649 UChar c = inputBuf[fp->fInputIdx - 1];
4650 if (c != 0x0a) {
4651 // Not at the start of a line. Back-track out.
4652 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4653 }
4654 }
4655 break;
4656
4657 case URX_BACKSLASH_B: // Test for word boundaries
4658 {
4659 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4660 success ^= (UBool)(opValue != 0); // flip sense for \B
4661 if (!success) {
4662 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4663 }
4664 }
4665 break;
4666
4667
4668 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
4669 {
4670 UBool success = isUWordBoundary(fp->fInputIdx);
4671 success ^= (UBool)(opValue != 0); // flip sense for \B
4672 if (!success) {
4673 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4674 }
4675 }
4676 break;
4677
4678
4679 case URX_BACKSLASH_D: // Test for decimal digit
4680 {
4681 if (fp->fInputIdx >= fActiveLimit) {
4682 fHitEnd = TRUE;
4683 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4684 break;
4685 }
4686
4687 UChar32 c;
4688 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4689 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
4690 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4691 success ^= (UBool)(opValue != 0); // flip sense for \D
4692 if (!success) {
4693 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4694 }
4695 }
4696 break;
4697
4698
4699 case URX_BACKSLASH_G: // Test for position at end of previous match
4700 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
4701 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4702 }
4703 break;
4704
4705
4706 case URX_BACKSLASH_H: // Test for \h, horizontal white space.
4707 {
4708 if (fp->fInputIdx >= fActiveLimit) {
4709 fHitEnd = TRUE;
4710 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4711 break;
4712 }
4713 UChar32 c;
4714 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4715 int8_t ctype = u_charType(c);
4716 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
4717 success ^= (UBool)(opValue != 0); // flip sense for \H
4718 if (!success) {
4719 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4720 }
4721 }
4722 break;
4723
4724
4725 case URX_BACKSLASH_R: // Test for \R, any line break sequence.
4726 {
4727 if (fp->fInputIdx >= fActiveLimit) {
4728 fHitEnd = TRUE;
4729 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4730 break;
4731 }
4732 UChar32 c;
4733 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4734 if (isLineTerminator(c)) {
4735 if (c == 0x0d && fp->fInputIdx < fActiveLimit) {
4736 // Check for CR/LF sequence. Consume both together when found.
4737 UChar c2;
4738 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2);
4739 if (c2 != 0x0a) {
4740 U16_PREV(inputBuf, 0, fp->fInputIdx, c2);
4741 }
4742 }
4743 } else {
4744 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4745 }
4746 }
4747 break;
4748
4749
4750 case URX_BACKSLASH_V: // Any single code point line ending.
4751 {
4752 if (fp->fInputIdx >= fActiveLimit) {
4753 fHitEnd = TRUE;
4754 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4755 break;
4756 }
4757 UChar32 c;
4758 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4759 UBool success = isLineTerminator(c);
4760 success ^= (UBool)(opValue != 0); // flip sense for \V
4761 if (!success) {
4762 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4763 }
4764 }
4765 break;
4766
4767
4768
4769 case URX_BACKSLASH_X:
4770 // Match a Grapheme, as defined by Unicode TR 29.
4771 // Differs slightly from Perl, which consumes combining marks independently
4772 // of context.
4773 {
4774
4775 // Fail if at end of input
4776 if (fp->fInputIdx >= fActiveLimit) {
4777 fHitEnd = TRUE;
4778 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4779 break;
4780 }
4781
4782 // Examine (and consume) the current char.
4783 // Dispatch into a little state machine, based on the char.
4784 UChar32 c;
4785 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4786 UnicodeSet **sets = fPattern->fStaticSets;
4787 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
4788 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
4789 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4790 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4791 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4792 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4793 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4794 goto GC_Extend;
4795
4796
4797
4798 GC_L:
4799 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4800 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4801 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4802 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4803 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4804 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4805 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4806 goto GC_Extend;
4807
4808 GC_V:
4809 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4810 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4811 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4812 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4813 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4814 goto GC_Extend;
4815
4816 GC_T:
4817 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4818 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4819 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4820 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4821 goto GC_Extend;
4822
4823 GC_Extend:
4824 // Combining characters are consumed here
4825 for (;;) {
4826 if (fp->fInputIdx >= fActiveLimit) {
4827 break;
4828 }
4829 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4830 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4831 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4832 break;
4833 }
4834 }
4835 goto GC_Done;
4836
4837 GC_Control:
4838 // Most control chars stand alone (don't combine with combining chars),
4839 // except for that CR/LF sequence is a single grapheme cluster.
4840 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
4841 fp->fInputIdx++;
4842 }
4843
4844 GC_Done:
4845 if (fp->fInputIdx >= fActiveLimit) {
4846 fHitEnd = TRUE;
4847 }
4848 break;
4849 }
4850
4851
4852
4853
4854 case URX_BACKSLASH_Z: // Test for end of Input
4855 if (fp->fInputIdx < fAnchorLimit) {
4856 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4857 } else {
4858 fHitEnd = TRUE;
4859 fRequireEnd = TRUE;
4860 }
4861 break;
4862
4863
4864
4865 case URX_STATIC_SETREF:
4866 {
4867 // Test input character against one of the predefined sets
4868 // (Word Characters, for example)
4869 // The high bit of the op value is a flag for the match polarity.
4870 // 0: success if input char is in set.
4871 // 1: success if input char is not in set.
4872 if (fp->fInputIdx >= fActiveLimit) {
4873 fHitEnd = TRUE;
4874 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4875 break;
4876 }
4877
4878 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
4879 opValue &= ~URX_NEG_SET;
4880 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4881
4882 UChar32 c;
4883 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4884 if (c < 256) {
4885 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4886 if (s8->contains(c)) {
4887 success = !success;
4888 }
4889 } else {
4890 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4891 if (s->contains(c)) {
4892 success = !success;
4893 }
4894 }
4895 if (!success) {
4896 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4897 }
4898 }
4899 break;
4900
4901
4902 case URX_STAT_SETREF_N:
4903 {
4904 // Test input character for NOT being a member of one of
4905 // the predefined sets (Word Characters, for example)
4906 if (fp->fInputIdx >= fActiveLimit) {
4907 fHitEnd = TRUE;
4908 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4909 break;
4910 }
4911
4912 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4913
4914 UChar32 c;
4915 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4916 if (c < 256) {
4917 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4918 if (s8->contains(c) == FALSE) {
4919 break;
4920 }
4921 } else {
4922 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4923 if (s->contains(c) == FALSE) {
4924 break;
4925 }
4926 }
4927 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4928 }
4929 break;
4930
4931
4932 case URX_SETREF:
4933 {
4934 if (fp->fInputIdx >= fActiveLimit) {
4935 fHitEnd = TRUE;
4936 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4937 break;
4938 }
4939
4940 U_ASSERT(opValue > 0 && opValue < sets->size());
4941
4942 // There is input left. Pick up one char and test it for set membership.
4943 UChar32 c;
4944 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4945 if (c<256) {
4946 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4947 if (s8->contains(c)) {
4948 // The character is in the set. A Match.
4949 break;
4950 }
4951 } else {
4952 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4953 if (s->contains(c)) {
4954 // The character is in the set. A Match.
4955 break;
4956 }
4957 }
4958
4959 // the character wasn't in the set.
4960 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4961 }
4962 break;
4963
4964
4965 case URX_DOTANY:
4966 {
4967 // . matches anything, but stops at end-of-line.
4968 if (fp->fInputIdx >= fActiveLimit) {
4969 // At end of input. Match failed. Backtrack out.
4970 fHitEnd = TRUE;
4971 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4972 break;
4973 }
4974
4975 // There is input left. Advance over one char, unless we've hit end-of-line
4976 UChar32 c;
4977 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4978 if (isLineTerminator(c)) {
4979 // End of line in normal mode. . does not match.
4980 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4981 break;
4982 }
4983 }
4984 break;
4985
4986
4987 case URX_DOTANY_ALL:
4988 {
4989 // . in dot-matches-all (including new lines) mode
4990 if (fp->fInputIdx >= fActiveLimit) {
4991 // At end of input. Match failed. Backtrack out.
4992 fHitEnd = TRUE;
4993 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4994 break;
4995 }
4996
4997 // There is input left. Advance over one char, except if we are
4998 // at a cr/lf, advance over both of them.
4999 UChar32 c;
5000 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5001 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
5002 // In the case of a CR/LF, we need to advance over both.
5003 if (inputBuf[fp->fInputIdx] == 0x0a) {
5004 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
5005 }
5006 }
5007 }
5008 break;
5009
5010
5011 case URX_DOTANY_UNIX:
5012 {
5013 // '.' operator, matches all, but stops at end-of-line.
5014 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
5015 if (fp->fInputIdx >= fActiveLimit) {
5016 // At end of input. Match failed. Backtrack out.
5017 fHitEnd = TRUE;
5018 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5019 break;
5020 }
5021
5022 // There is input left. Advance over one char, unless we've hit end-of-line
5023 UChar32 c;
5024 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5025 if (c == 0x0a) {
5026 // End of line in normal mode. '.' does not match the \n
5027 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5028 }
5029 }
5030 break;
5031
5032
5033 case URX_JMP:
5034 fp->fPatIdx = opValue;
5035 break;
5036
5037 case URX_FAIL:
5038 isMatch = FALSE;
5039 goto breakFromLoop;
5040
5041 case URX_JMP_SAV:
5042 U_ASSERT(opValue < fPattern->fCompiledPat->size());
5043 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
5044 fp->fPatIdx = opValue; // Then JMP.
5045 break;
5046
5047 case URX_JMP_SAV_X:
5048 // This opcode is used with (x)+, when x can match a zero length string.
5049 // Same as JMP_SAV, except conditional on the match having made forward progress.
5050 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
5051 // data address of the input position at the start of the loop.
5052 {
5053 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
5054 int32_t stoOp = (int32_t)pat[opValue-1];
5055 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
5056 int32_t frameLoc = URX_VAL(stoOp);
5057 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
5058 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
5059 U_ASSERT(prevInputIdx <= fp->fInputIdx);
5060 if (prevInputIdx < fp->fInputIdx) {
5061 // The match did make progress. Repeat the loop.
5062 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
5063 fp->fPatIdx = opValue;
5064 fp->fExtra[frameLoc] = fp->fInputIdx;
5065 }
5066 // If the input position did not advance, we do nothing here,
5067 // execution will fall out of the loop.
5068 }
5069 break;
5070
5071 case URX_CTR_INIT:
5072 {
5073 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5074 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5075
5076 // Pick up the three extra operands that CTR_INIT has, and
5077 // skip the pattern location counter past
5078 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5079 fp->fPatIdx += 3;
5080 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
5081 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5082 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5083 U_ASSERT(minCount>=0);
5084 U_ASSERT(maxCount>=minCount || maxCount==-1);
5085 U_ASSERT(loopLoc>=fp->fPatIdx);
5086
5087 if (minCount == 0) {
5088 fp = StateSave(fp, loopLoc+1, status);
5089 }
5090 if (maxCount == -1) {
5091 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
5092 } else if (maxCount == 0) {
5093 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5094 }
5095 }
5096 break;
5097
5098 case URX_CTR_LOOP:
5099 {
5100 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5101 int32_t initOp = (int32_t)pat[opValue];
5102 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
5103 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5104 int32_t minCount = (int32_t)pat[opValue+2];
5105 int32_t maxCount = (int32_t)pat[opValue+3];
5106 (*pCounter)++;
5107 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5108 U_ASSERT(*pCounter == maxCount);
5109 break;
5110 }
5111 if (*pCounter >= minCount) {
5112 if (maxCount == -1) {
5113 // Loop has no hard upper bound.
5114 // Check that it is progressing through the input, break if it is not.
5115 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5116 if (fp->fInputIdx == *pLastInputIdx) {
5117 break;
5118 } else {
5119 *pLastInputIdx = fp->fInputIdx;
5120 }
5121 }
5122 fp = StateSave(fp, fp->fPatIdx, status);
5123 } else {
5124 // Increment time-out counter. (StateSave() does it if count >= minCount)
5125 fTickCounter--;
5126 if (fTickCounter <= 0) {
5127 IncrementTime(status); // Re-initializes fTickCounter
5128 }
5129 }
5130 fp->fPatIdx = opValue + 4; // Loop back.
5131 }
5132 break;
5133
5134 case URX_CTR_INIT_NG:
5135 {
5136 // Initialize a non-greedy loop
5137 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5138 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5139
5140 // Pick up the three extra operands that CTR_INIT_NG has, and
5141 // skip the pattern location counter past
5142 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5143 fp->fPatIdx += 3;
5144 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
5145 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5146 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5147 U_ASSERT(minCount>=0);
5148 U_ASSERT(maxCount>=minCount || maxCount==-1);
5149 U_ASSERT(loopLoc>fp->fPatIdx);
5150 if (maxCount == -1) {
5151 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
5152 }
5153
5154 if (minCount == 0) {
5155 if (maxCount != 0) {
5156 fp = StateSave(fp, fp->fPatIdx, status);
5157 }
5158 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
5159 }
5160 }
5161 break;
5162
5163 case URX_CTR_LOOP_NG:
5164 {
5165 // Non-greedy {min, max} loops
5166 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5167 int32_t initOp = (int32_t)pat[opValue];
5168 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5169 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5170 int32_t minCount = (int32_t)pat[opValue+2];
5171 int32_t maxCount = (int32_t)pat[opValue+3];
5172
5173 (*pCounter)++;
5174 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5175 // The loop has matched the maximum permitted number of times.
5176 // Break out of here with no action. Matching will
5177 // continue with the following pattern.
5178 U_ASSERT(*pCounter == maxCount);
5179 break;
5180 }
5181
5182 if (*pCounter < minCount) {
5183 // We haven't met the minimum number of matches yet.
5184 // Loop back for another one.
5185 fp->fPatIdx = opValue + 4; // Loop back.
5186 fTickCounter--;
5187 if (fTickCounter <= 0) {
5188 IncrementTime(status); // Re-initializes fTickCounter
5189 }
5190 } else {
5191 // We do have the minimum number of matches.
5192
5193 // If there is no upper bound on the loop iterations, check that the input index
5194 // is progressing, and stop the loop if it is not.
5195 if (maxCount == -1) {
5196 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5197 if (fp->fInputIdx == *pLastInputIdx) {
5198 break;
5199 }
5200 *pLastInputIdx = fp->fInputIdx;
5201 }
5202
5203 // Loop Continuation: we will fall into the pattern following the loop
5204 // (non-greedy, don't execute loop body first), but first do
5205 // a state save to the top of the loop, so that a match failure
5206 // in the following pattern will try another iteration of the loop.
5207 fp = StateSave(fp, opValue + 4, status);
5208 }
5209 }
5210 break;
5211
5212 case URX_STO_SP:
5213 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5214 fData[opValue] = fStack->size();
5215 break;
5216
5217 case URX_LD_SP:
5218 {
5219 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5220 int32_t newStackSize = (int32_t)fData[opValue];
5221 U_ASSERT(newStackSize <= fStack->size());
5222 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5223 if (newFP == (int64_t *)fp) {
5224 break;
5225 }
5226 int32_t i;
5227 for (i=0; i<fFrameSize; i++) {
5228 newFP[i] = ((int64_t *)fp)[i];
5229 }
5230 fp = (REStackFrame *)newFP;
5231 fStack->setSize(newStackSize);
5232 }
5233 break;
5234
5235 case URX_BACKREF:
5236 {
5237 U_ASSERT(opValue < fFrameSize);
5238 int64_t groupStartIdx = fp->fExtra[opValue];
5239 int64_t groupEndIdx = fp->fExtra[opValue+1];
5240 U_ASSERT(groupStartIdx <= groupEndIdx);
5241 int64_t inputIndex = fp->fInputIdx;
5242 if (groupStartIdx < 0) {
5243 // This capture group has not participated in the match thus far,
5244 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5245 break;
5246 }
5247 UBool success = TRUE;
5248 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
5249 if (inputIndex >= fActiveLimit) {
5250 success = FALSE;
5251 fHitEnd = TRUE;
5252 break;
5253 }
5254 if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
5255 success = FALSE;
5256 break;
5257 }
5258 }
5259 if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) &&
5260 inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) {
5261 // Capture group ended with an unpaired lead surrogate.
5262 // Back reference is not permitted to match lead only of a surrogate pair.
5263 success = FALSE;
5264 }
5265 if (success) {
5266 fp->fInputIdx = inputIndex;
5267 } else {
5268 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5269 }
5270 }
5271 break;
5272
5273 case URX_BACKREF_I:
5274 {
5275 U_ASSERT(opValue < fFrameSize);
5276 int64_t groupStartIdx = fp->fExtra[opValue];
5277 int64_t groupEndIdx = fp->fExtra[opValue+1];
5278 U_ASSERT(groupStartIdx <= groupEndIdx);
5279 if (groupStartIdx < 0) {
5280 // This capture group has not participated in the match thus far,
5281 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5282 break;
5283 }
5284 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
5285 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
5286
5287 // Note: if the capture group match was of an empty string the backref
5288 // match succeeds. Verified by testing: Perl matches succeed
5289 // in this case, so we do too.
5290
5291 UBool success = TRUE;
5292 for (;;) {
5293 UChar32 captureGroupChar = captureGroupItr.next();
5294 if (captureGroupChar == U_SENTINEL) {
5295 success = TRUE;
5296 break;
5297 }
5298 UChar32 inputChar = inputItr.next();
5299 if (inputChar == U_SENTINEL) {
5300 success = FALSE;
5301 fHitEnd = TRUE;
5302 break;
5303 }
5304 if (inputChar != captureGroupChar) {
5305 success = FALSE;
5306 break;
5307 }
5308 }
5309
5310 if (success && inputItr.inExpansion()) {
5311 // We obtained a match by consuming part of a string obtained from
5312 // case-folding a single code point of the input text.
5313 // This does not count as an overall match.
5314 success = FALSE;
5315 }
5316
5317 if (success) {
5318 fp->fInputIdx = inputItr.getIndex();
5319 } else {
5320 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5321 }
5322 }
5323 break;
5324
5325 case URX_STO_INP_LOC:
5326 {
5327 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5328 fp->fExtra[opValue] = fp->fInputIdx;
5329 }
5330 break;
5331
5332 case URX_JMPX:
5333 {
5334 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5335 fp->fPatIdx += 1;
5336 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
5337 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5338 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5339 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5340 if (savedInputIdx < fp->fInputIdx) {
5341 fp->fPatIdx = opValue; // JMP
5342 } else {
5343 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
5344 }
5345 }
5346 break;
5347
5348 case URX_LA_START:
5349 {
5350 // Entering a lookahead block.
5351 // Save Stack Ptr, Input Pos.
5352 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5353 fData[opValue] = fStack->size();
5354 fData[opValue+1] = fp->fInputIdx;
5355 fActiveStart = fLookStart; // Set the match region change for
5356 fActiveLimit = fLookLimit; // transparent bounds.
5357 }
5358 break;
5359
5360 case URX_LA_END:
5361 {
5362 // Leaving a look-ahead block.
5363 // restore Stack Ptr, Input Pos to positions they had on entry to block.
5364 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5365 int32_t stackSize = fStack->size();
5366 int32_t newStackSize = (int32_t)fData[opValue];
5367 U_ASSERT(stackSize >= newStackSize);
5368 if (stackSize > newStackSize) {
5369 // Copy the current top frame back to the new (cut back) top frame.
5370 // This makes the capture groups from within the look-ahead
5371 // expression available.
5372 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5373 int32_t i;
5374 for (i=0; i<fFrameSize; i++) {
5375 newFP[i] = ((int64_t *)fp)[i];
5376 }
5377 fp = (REStackFrame *)newFP;
5378 fStack->setSize(newStackSize);
5379 }
5380 fp->fInputIdx = fData[opValue+1];
5381
5382 // Restore the active region bounds in the input string; they may have
5383 // been changed because of transparent bounds on a Region.
5384 fActiveStart = fRegionStart;
5385 fActiveLimit = fRegionLimit;
5386 }
5387 break;
5388
5389 case URX_ONECHAR_I:
5390 if (fp->fInputIdx < fActiveLimit) {
5391 UChar32 c;
5392 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5393 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5394 break;
5395 }
5396 } else {
5397 fHitEnd = TRUE;
5398 }
5399 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5400 break;
5401
5402 case URX_STRING_I:
5403 // Case-insensitive test input against a literal string.
5404 // Strings require two slots in the compiled pattern, one for the
5405 // offset to the string text, and one for the length.
5406 // The compiled string has already been case folded.
5407 {
5408 const UChar *patternString = litText + opValue;
5409
5410 op = (int32_t)pat[fp->fPatIdx];
5411 fp->fPatIdx++;
5412 opType = URX_TYPE(op);
5413 opValue = URX_VAL(op);
5414 U_ASSERT(opType == URX_STRING_LEN);
5415 int32_t patternStringLen = opValue; // Length of the string from the pattern.
5416
5417 UChar32 cText;
5418 UChar32 cPattern;
5419 UBool success = TRUE;
5420 int32_t patternStringIdx = 0;
5421 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5422 while (patternStringIdx < patternStringLen) {
5423 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
5424 cText = inputIterator.next();
5425 if (cText != cPattern) {
5426 success = FALSE;
5427 if (cText == U_SENTINEL) {
5428 fHitEnd = TRUE;
5429 }
5430 break;
5431 }
5432 }
5433 if (inputIterator.inExpansion()) {
5434 success = FALSE;
5435 }
5436
5437 if (success) {
5438 fp->fInputIdx = inputIterator.getIndex();
5439 } else {
5440 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5441 }
5442 }
5443 break;
5444
5445 case URX_LB_START:
5446 {
5447 // Entering a look-behind block.
5448 // Save Stack Ptr, Input Pos.
5449 // TODO: implement transparent bounds. Ticket #6067
5450 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5451 fData[opValue] = fStack->size();
5452 fData[opValue+1] = fp->fInputIdx;
5453 // Init the variable containing the start index for attempted matches.
5454 fData[opValue+2] = -1;
5455 // Save input string length, then reset to pin any matches to end at
5456 // the current position.
5457 fData[opValue+3] = fActiveLimit;
5458 fActiveLimit = fp->fInputIdx;
5459 }
5460 break;
5461
5462
5463 case URX_LB_CONT:
5464 {
5465 // Positive Look-Behind, at top of loop checking for matches of LB expression
5466 // at all possible input starting positions.
5467
5468 // Fetch the min and max possible match lengths. They are the operands
5469 // of this op in the pattern.
5470 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5471 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5472 U_ASSERT(minML <= maxML);
5473 U_ASSERT(minML >= 0);
5474
5475 // Fetch (from data) the last input index where a match was attempted.
5476 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5477 int64_t &lbStartIdx = fData[opValue+2];
5478 if (lbStartIdx < 0) {
5479 // First time through loop.
5480 lbStartIdx = fp->fInputIdx - minML;
5481 if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
5482 U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5483 }
5484 } else {
5485 // 2nd through nth time through the loop.
5486 // Back up start position for match by one.
5487 if (lbStartIdx == 0) {
5488 lbStartIdx--;
5489 } else {
5490 U16_BACK_1(inputBuf, 0, lbStartIdx);
5491 }
5492 }
5493
5494 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5495 // We have tried all potential match starting points without
5496 // getting a match. Backtrack out, and out of the
5497 // Look Behind altogether.
5498 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5499 int64_t restoreInputLen = fData[opValue+3];
5500 U_ASSERT(restoreInputLen >= fActiveLimit);
5501 U_ASSERT(restoreInputLen <= fInputLength);
5502 fActiveLimit = restoreInputLen;
5503 break;
5504 }
5505
5506 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5507 // (successful match will fall off the end of the loop.)
5508 fp = StateSave(fp, fp->fPatIdx-3, status);
5509 fp->fInputIdx = lbStartIdx;
5510 }
5511 break;
5512
5513 case URX_LB_END:
5514 // End of a look-behind block, after a successful match.
5515 {
5516 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5517 if (fp->fInputIdx != fActiveLimit) {
5518 // The look-behind expression matched, but the match did not
5519 // extend all the way to the point that we are looking behind from.
5520 // FAIL out of here, which will take us back to the LB_CONT, which
5521 // will retry the match starting at another position or fail
5522 // the look-behind altogether, whichever is appropriate.
5523 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5524 break;
5525 }
5526
5527 // Look-behind match is good. Restore the original input string length,
5528 // which had been truncated to pin the end of the lookbehind match to the
5529 // position being looked-behind.
5530 int64_t originalInputLen = fData[opValue+3];
5531 U_ASSERT(originalInputLen >= fActiveLimit);
5532 U_ASSERT(originalInputLen <= fInputLength);
5533 fActiveLimit = originalInputLen;
5534 }
5535 break;
5536
5537
5538 case URX_LBN_CONT:
5539 {
5540 // Negative Look-Behind, at top of loop checking for matches of LB expression
5541 // at all possible input starting positions.
5542
5543 // Fetch the extra parameters of this op.
5544 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5545 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5546 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5547 continueLoc = URX_VAL(continueLoc);
5548 U_ASSERT(minML <= maxML);
5549 U_ASSERT(minML >= 0);
5550 U_ASSERT(continueLoc > fp->fPatIdx);
5551
5552 // Fetch (from data) the last input index where a match was attempted.
5553 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5554 int64_t &lbStartIdx = fData[opValue+2];
5555 if (lbStartIdx < 0) {
5556 // First time through loop.
5557 lbStartIdx = fp->fInputIdx - minML;
5558 if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
5559 U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5560 }
5561 } else {
5562 // 2nd through nth time through the loop.
5563 // Back up start position for match by one.
5564 if (lbStartIdx == 0) {
5565 lbStartIdx--; // Because U16_BACK is unsafe starting at 0.
5566 } else {
5567 U16_BACK_1(inputBuf, 0, lbStartIdx);
5568 }
5569 }
5570
5571 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5572 // We have tried all potential match starting points without
5573 // getting a match, which means that the negative lookbehind as
5574 // a whole has succeeded. Jump forward to the continue location
5575 int64_t restoreInputLen = fData[opValue+3];
5576 U_ASSERT(restoreInputLen >= fActiveLimit);
5577 U_ASSERT(restoreInputLen <= fInputLength);
5578 fActiveLimit = restoreInputLen;
5579 fp->fPatIdx = continueLoc;
5580 break;
5581 }
5582
5583 // Save state to this URX_LBN_CONT op, so failure to match will repeat the loop.
5584 // (successful match will cause a FAIL out of the loop altogether.)
5585 fp = StateSave(fp, fp->fPatIdx-4, status);
5586 fp->fInputIdx = lbStartIdx;
5587 }
5588 break;
5589
5590 case URX_LBN_END:
5591 // End of a negative look-behind block, after a successful match.
5592 {
5593 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5594 if (fp->fInputIdx != fActiveLimit) {
5595 // The look-behind expression matched, but the match did not
5596 // extend all the way to the point that we are looking behind from.
5597 // FAIL out of here, which will take us back to the LBN_CONT, which
5598 // will retry the match starting at another position or succeed
5599 // the look-behind altogether, whichever is appropriate.
5600 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5601 break;
5602 }
5603
5604 // Look-behind expression matched, which means look-behind test as
5605 // a whole Fails
5606
5607 // Restore the original input string length, which had been truncated
5608 // in order to pin the end of the lookbehind match
5609 // to the position being looked-behind.
5610 int64_t originalInputLen = fData[opValue+3];
5611 U_ASSERT(originalInputLen >= fActiveLimit);
5612 U_ASSERT(originalInputLen <= fInputLength);
5613 fActiveLimit = originalInputLen;
5614
5615 // Restore original stack position, discarding any state saved
5616 // by the successful pattern match.
5617 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5618 int32_t newStackSize = (int32_t)fData[opValue];
5619 U_ASSERT(fStack->size() > newStackSize);
5620 fStack->setSize(newStackSize);
5621
5622 // FAIL, which will take control back to someplace
5623 // prior to entering the look-behind test.
5624 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5625 }
5626 break;
5627
5628
5629 case URX_LOOP_SR_I:
5630 // Loop Initialization for the optimized implementation of
5631 // [some character set]*
5632 // This op scans through all matching input.
5633 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5634 {
5635 U_ASSERT(opValue > 0 && opValue < sets->size());
5636 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5637 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
5638
5639 // Loop through input, until either the input is exhausted or
5640 // we reach a character that is not a member of the set.
5641 int32_t ix = (int32_t)fp->fInputIdx;
5642 for (;;) {
5643 if (ix >= fActiveLimit) {
5644 fHitEnd = TRUE;
5645 break;
5646 }
5647 UChar32 c;
5648 U16_NEXT(inputBuf, ix, fActiveLimit, c);
5649 if (c<256) {
5650 if (s8->contains(c) == FALSE) {
5651 U16_BACK_1(inputBuf, 0, ix);
5652 break;
5653 }
5654 } else {
5655 if (s->contains(c) == FALSE) {
5656 U16_BACK_1(inputBuf, 0, ix);
5657 break;
5658 }
5659 }
5660 }
5661
5662 // If there were no matching characters, skip over the loop altogether.
5663 // The loop doesn't run at all, a * op always succeeds.
5664 if (ix == fp->fInputIdx) {
5665 fp->fPatIdx++; // skip the URX_LOOP_C op.
5666 break;
5667 }
5668
5669 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5670 // must follow. Its operand is the stack location
5671 // that holds the starting input index for the match of this [set]*
5672 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5673 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5674 int32_t stackLoc = URX_VAL(loopcOp);
5675 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5676 fp->fExtra[stackLoc] = fp->fInputIdx;
5677 fp->fInputIdx = ix;
5678
5679 // Save State to the URX_LOOP_C op that follows this one,
5680 // so that match failures in the following code will return to there.
5681 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5682 fp = StateSave(fp, fp->fPatIdx, status);
5683 fp->fPatIdx++;
5684 }
5685 break;
5686
5687
5688 case URX_LOOP_DOT_I:
5689 // Loop Initialization for the optimized implementation of .*
5690 // This op scans through all remaining input.
5691 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5692 {
5693 // Loop through input until the input is exhausted (we reach an end-of-line)
5694 // In DOTALL mode, we can just go straight to the end of the input.
5695 int32_t ix;
5696 if ((opValue & 1) == 1) {
5697 // Dot-matches-All mode. Jump straight to the end of the string.
5698 ix = (int32_t)fActiveLimit;
5699 fHitEnd = TRUE;
5700 } else {
5701 // NOT DOT ALL mode. Line endings do not match '.'
5702 // Scan forward until a line ending or end of input.
5703 ix = (int32_t)fp->fInputIdx;
5704 for (;;) {
5705 if (ix >= fActiveLimit) {
5706 fHitEnd = TRUE;
5707 break;
5708 }
5709 UChar32 c;
5710 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++]
5711 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
5712 if ((c == 0x0a) || // 0x0a is newline in both modes.
5713 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
5714 isLineTerminator(c))) {
5715 // char is a line ending. Put the input pos back to the
5716 // line ending char, and exit the scanning loop.
5717 U16_BACK_1(inputBuf, 0, ix);
5718 break;
5719 }
5720 }
5721 }
5722 }
5723
5724 // If there were no matching characters, skip over the loop altogether.
5725 // The loop doesn't run at all, a * op always succeeds.
5726 if (ix == fp->fInputIdx) {
5727 fp->fPatIdx++; // skip the URX_LOOP_C op.
5728 break;
5729 }
5730
5731 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5732 // must follow. Its operand is the stack location
5733 // that holds the starting input index for the match of this .*
5734 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5735 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5736 int32_t stackLoc = URX_VAL(loopcOp);
5737 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5738 fp->fExtra[stackLoc] = fp->fInputIdx;
5739 fp->fInputIdx = ix;
5740
5741 // Save State to the URX_LOOP_C op that follows this one,
5742 // so that match failures in the following code will return to there.
5743 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5744 fp = StateSave(fp, fp->fPatIdx, status);
5745 fp->fPatIdx++;
5746 }
5747 break;
5748
5749
5750 case URX_LOOP_C:
5751 {
5752 U_ASSERT(opValue>=0 && opValue<fFrameSize);
5753 backSearchIndex = (int32_t)fp->fExtra[opValue];
5754 U_ASSERT(backSearchIndex <= fp->fInputIdx);
5755 if (backSearchIndex == fp->fInputIdx) {
5756 // We've backed up the input idx to the point that the loop started.
5757 // The loop is done. Leave here without saving state.
5758 // Subsequent failures won't come back here.
5759 break;
5760 }
5761 // Set up for the next iteration of the loop, with input index
5762 // backed up by one from the last time through,
5763 // and a state save to this instruction in case the following code fails again.
5764 // (We're going backwards because this loop emulates stack unwinding, not
5765 // the initial scan forward.)
5766 U_ASSERT(fp->fInputIdx > 0);
5767 UChar32 prevC;
5768 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
5769
5770 if (prevC == 0x0a &&
5771 fp->fInputIdx > backSearchIndex &&
5772 inputBuf[fp->fInputIdx-1] == 0x0d) {
5773 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
5774 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
5775 // .*, stepping back over CRLF pair.
5776 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
5777 }
5778 }
5779
5780
5781 fp = StateSave(fp, fp->fPatIdx-1, status);
5782 }
5783 break;
5784
5785
5786
5787 default:
5788 // Trouble. The compiled pattern contains an entry with an
5789 // unrecognized type tag.
5790 U_ASSERT(FALSE);
5791 }
5792
5793 if (U_FAILURE(status)) {
5794 isMatch = FALSE;
5795 break;
5796 }
5797 }
5798
5799 breakFromLoop:
5800 fMatch = isMatch;
5801 if (isMatch) {
5802 fLastMatchEnd = fMatchEnd;
5803 fMatchStart = startIdx;
5804 fMatchEnd = fp->fInputIdx;
5805 }
5806
5807 #ifdef REGEX_RUN_DEBUG
5808 if (fTraceDebug) {
5809 if (isMatch) {
5810 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
5811 } else {
5812 printf("No match\n\n");
5813 }
5814 }
5815 #endif
5816
5817 fFrame = fp; // The active stack frame when the engine stopped.
5818 // Contains the capture group results that we need to
5819 // access later.
5820
5821 return;
5822 }
5823
5824
5825 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
5826
5827 U_NAMESPACE_END
5828
5829 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
5830