]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/rematch.cpp
ICU-531.48.tar.gz
[apple/icu.git] / icuSources / i18n / rematch.cpp
1 /*
2 **************************************************************************
3 * Copyright (C) 2002-2013 International Business Machines Corporation *
4 * and others. All rights reserved. *
5 **************************************************************************
6 */
7 //
8 // file: rematch.cpp
9 //
10 // Contains the implementation of class RegexMatcher,
11 // which is one of the main API classes for the ICU regular expression package.
12 //
13
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
16
17 #include "unicode/regex.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ustring.h"
21 #include "unicode/rbbi.h"
22 #include "unicode/utf.h"
23 #include "unicode/utf16.h"
24 #include "uassert.h"
25 #include "cmemory.h"
26 #include "uvector.h"
27 #include "uvectr32.h"
28 #include "uvectr64.h"
29 #include "regeximp.h"
30 #include "regexst.h"
31 #include "regextxt.h"
32 #include "ucase.h"
33
34 // #include <malloc.h> // Needed for heapcheck testing
35
36
37 // Find progress callback
38 // ----------------------
39 // Macro to inline test & call to ReportFindProgress(). Eliminates unnecessary function call.
40 //
41 #define REGEXFINDPROGRESS_INTERRUPT(pos, status) \
42 (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE)
43
44
45 // Smart Backtracking
46 // ------------------
47 // When a failure would go back to a LOOP_C instruction,
48 // strings, characters, and setrefs scan backwards for a valid start
49 // character themselves, pop the stack, and save state, emulating the
50 // LOOP_C's effect but assured that the next character of input is a
51 // possible matching character.
52 //
53 // Good idea in theory; unfortunately it only helps out a few specific
54 // cases and slows the engine down a little in the rest.
55
56 U_NAMESPACE_BEGIN
57
58 // Default limit for the size of the back track stack, to avoid system
59 // failures causedby heap exhaustion. Units are in 32 bit words, not bytes.
60 // This value puts ICU's limits higher than most other regexp implementations,
61 // which use recursion rather than the heap, and take more storage per
62 // backtrack point.
63 //
64 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
65
66 // Time limit counter constant.
67 // Time limits for expression evaluation are in terms of quanta of work by
68 // the engine, each of which is 10,000 state saves.
69 // This constant determines that state saves per tick number.
70 static const int32_t TIMER_INITIAL_VALUE = 10000;
71
72 //-----------------------------------------------------------------------------
73 //
74 // Constructor and Destructor
75 //
76 //-----------------------------------------------------------------------------
77 RegexMatcher::RegexMatcher(const RegexPattern *pat) {
78 fDeferredStatus = U_ZERO_ERROR;
79 init(fDeferredStatus);
80 if (U_FAILURE(fDeferredStatus)) {
81 return;
82 }
83 if (pat==NULL) {
84 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
85 return;
86 }
87 fPattern = pat;
88 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
89 }
90
91
92
93 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
94 uint32_t flags, UErrorCode &status) {
95 init(status);
96 if (U_FAILURE(status)) {
97 return;
98 }
99 UParseError pe;
100 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
101 fPattern = fPatternOwned;
102
103 UText inputText = UTEXT_INITIALIZER;
104 utext_openConstUnicodeString(&inputText, &input, &status);
105 init2(&inputText, status);
106 utext_close(&inputText);
107
108 fInputUniStrMaybeMutable = TRUE;
109 }
110
111
112 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
113 uint32_t flags, UErrorCode &status) {
114 init(status);
115 if (U_FAILURE(status)) {
116 return;
117 }
118 UParseError pe;
119 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
120 if (U_FAILURE(status)) {
121 return;
122 }
123
124 fPattern = fPatternOwned;
125 init2(input, status);
126 }
127
128
129 RegexMatcher::RegexMatcher(const UnicodeString &regexp,
130 uint32_t flags, UErrorCode &status) {
131 init(status);
132 if (U_FAILURE(status)) {
133 return;
134 }
135 UParseError pe;
136 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
137 if (U_FAILURE(status)) {
138 return;
139 }
140 fPattern = fPatternOwned;
141 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
142 }
143
144 RegexMatcher::RegexMatcher(UText *regexp,
145 uint32_t flags, UErrorCode &status) {
146 init(status);
147 if (U_FAILURE(status)) {
148 return;
149 }
150 UParseError pe;
151 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
152 if (U_FAILURE(status)) {
153 return;
154 }
155
156 fPattern = fPatternOwned;
157 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
158 }
159
160
161
162
163 RegexMatcher::~RegexMatcher() {
164 delete fStack;
165 if (fData != fSmallData) {
166 uprv_free(fData);
167 fData = NULL;
168 }
169 if (fPatternOwned) {
170 delete fPatternOwned;
171 fPatternOwned = NULL;
172 fPattern = NULL;
173 }
174
175 if (fInput) {
176 delete fInput;
177 }
178 if (fInputText) {
179 utext_close(fInputText);
180 }
181 if (fAltInputText) {
182 utext_close(fAltInputText);
183 }
184
185 #if UCONFIG_NO_BREAK_ITERATION==0
186 delete fWordBreakItr;
187 #endif
188 }
189
190 //
191 // init() common initialization for use by all constructors.
192 // Initialize all fields, get the object into a consistent state.
193 // This must be done even when the initial status shows an error,
194 // so that the object is initialized sufficiently well for the destructor
195 // to run safely.
196 //
197 void RegexMatcher::init(UErrorCode &status) {
198 fPattern = NULL;
199 fPatternOwned = NULL;
200 fFrameSize = 0;
201 fRegionStart = 0;
202 fRegionLimit = 0;
203 fAnchorStart = 0;
204 fAnchorLimit = 0;
205 fLookStart = 0;
206 fLookLimit = 0;
207 fActiveStart = 0;
208 fActiveLimit = 0;
209 fTransparentBounds = FALSE;
210 fAnchoringBounds = TRUE;
211 fMatch = FALSE;
212 fMatchStart = 0;
213 fMatchEnd = 0;
214 fLastMatchEnd = -1;
215 fAppendPosition = 0;
216 fHitEnd = FALSE;
217 fRequireEnd = FALSE;
218 fStack = NULL;
219 fFrame = NULL;
220 fTimeLimit = 0;
221 fTime = 0;
222 fTickCounter = 0;
223 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY;
224 fCallbackFn = NULL;
225 fCallbackContext = NULL;
226 fFindProgressCallbackFn = NULL;
227 fFindProgressCallbackContext = NULL;
228 fTraceDebug = FALSE;
229 fDeferredStatus = status;
230 fData = fSmallData;
231 fWordBreakItr = NULL;
232
233 fStack = NULL;
234 fInputText = NULL;
235 fAltInputText = NULL;
236 fInput = NULL;
237 fInputLength = 0;
238 fInputUniStrMaybeMutable = FALSE;
239
240 if (U_FAILURE(status)) {
241 fDeferredStatus = status;
242 }
243 }
244
245 //
246 // init2() Common initialization for use by RegexMatcher constructors, part 2.
247 // This handles the common setup to be done after the Pattern is available.
248 //
249 void RegexMatcher::init2(UText *input, UErrorCode &status) {
250 if (U_FAILURE(status)) {
251 fDeferredStatus = status;
252 return;
253 }
254
255 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
256 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
257 if (fData == NULL) {
258 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
259 return;
260 }
261 }
262
263 fStack = new UVector64(status);
264 if (fStack == NULL) {
265 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
266 return;
267 }
268
269 reset(input);
270 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
271 if (U_FAILURE(status)) {
272 fDeferredStatus = status;
273 return;
274 }
275 }
276
277
278 static const UChar BACKSLASH = 0x5c;
279 static const UChar DOLLARSIGN = 0x24;
280 //--------------------------------------------------------------------------------
281 //
282 // appendReplacement
283 //
284 //--------------------------------------------------------------------------------
285 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
286 const UnicodeString &replacement,
287 UErrorCode &status) {
288 UText replacementText = UTEXT_INITIALIZER;
289
290 utext_openConstUnicodeString(&replacementText, &replacement, &status);
291 if (U_SUCCESS(status)) {
292 UText resultText = UTEXT_INITIALIZER;
293 utext_openUnicodeString(&resultText, &dest, &status);
294
295 if (U_SUCCESS(status)) {
296 appendReplacement(&resultText, &replacementText, status);
297 utext_close(&resultText);
298 }
299 utext_close(&replacementText);
300 }
301
302 return *this;
303 }
304
305 //
306 // appendReplacement, UText mode
307 //
308 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
309 UText *replacement,
310 UErrorCode &status) {
311 if (U_FAILURE(status)) {
312 return *this;
313 }
314 if (U_FAILURE(fDeferredStatus)) {
315 status = fDeferredStatus;
316 return *this;
317 }
318 if (fMatch == FALSE) {
319 status = U_REGEX_INVALID_STATE;
320 return *this;
321 }
322
323 // Copy input string from the end of previous match to start of current match
324 int64_t destLen = utext_nativeLength(dest);
325 if (fMatchStart > fAppendPosition) {
326 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
327 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
328 (int32_t)(fMatchStart-fAppendPosition), &status);
329 } else {
330 int32_t len16;
331 if (UTEXT_USES_U16(fInputText)) {
332 len16 = (int32_t)(fMatchStart-fAppendPosition);
333 } else {
334 UErrorCode lengthStatus = U_ZERO_ERROR;
335 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
336 }
337 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
338 if (inputChars == NULL) {
339 status = U_MEMORY_ALLOCATION_ERROR;
340 return *this;
341 }
342 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
343 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
344 uprv_free(inputChars);
345 }
346 }
347 fAppendPosition = fMatchEnd;
348
349
350 // scan the replacement text, looking for substitutions ($n) and \escapes.
351 // TODO: optimize this loop by efficiently scanning for '$' or '\',
352 // move entire ranges not containing substitutions.
353 UTEXT_SETNATIVEINDEX(replacement, 0);
354 UChar32 c = UTEXT_NEXT32(replacement);
355 while (c != U_SENTINEL) {
356 if (c == BACKSLASH) {
357 // Backslash Escape. Copy the following char out without further checks.
358 // Note: Surrogate pairs don't need any special handling
359 // The second half wont be a '$' or a '\', and
360 // will move to the dest normally on the next
361 // loop iteration.
362 c = UTEXT_CURRENT32(replacement);
363 if (c == U_SENTINEL) {
364 break;
365 }
366
367 if (c==0x55/*U*/ || c==0x75/*u*/) {
368 // We have a \udddd or \Udddddddd escape sequence.
369 int32_t offset = 0;
370 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
371 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
372 if (escapedChar != (UChar32)0xFFFFFFFF) {
373 if (U_IS_BMP(escapedChar)) {
374 UChar c16 = (UChar)escapedChar;
375 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
376 } else {
377 UChar surrogate[2];
378 surrogate[0] = U16_LEAD(escapedChar);
379 surrogate[1] = U16_TRAIL(escapedChar);
380 if (U_SUCCESS(status)) {
381 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
382 }
383 }
384 // TODO: Report errors for mal-formed \u escapes?
385 // As this is, the original sequence is output, which may be OK.
386 if (context.lastOffset == offset) {
387 (void)UTEXT_PREVIOUS32(replacement);
388 } else if (context.lastOffset != offset-1) {
389 utext_moveIndex32(replacement, offset - context.lastOffset - 1);
390 }
391 }
392 } else {
393 (void)UTEXT_NEXT32(replacement);
394 // Plain backslash escape. Just put out the escaped character.
395 if (U_IS_BMP(c)) {
396 UChar c16 = (UChar)c;
397 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
398 } else {
399 UChar surrogate[2];
400 surrogate[0] = U16_LEAD(c);
401 surrogate[1] = U16_TRAIL(c);
402 if (U_SUCCESS(status)) {
403 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
404 }
405 }
406 }
407 } else if (c != DOLLARSIGN) {
408 // Normal char, not a $. Copy it out without further checks.
409 if (U_IS_BMP(c)) {
410 UChar c16 = (UChar)c;
411 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
412 } else {
413 UChar surrogate[2];
414 surrogate[0] = U16_LEAD(c);
415 surrogate[1] = U16_TRAIL(c);
416 if (U_SUCCESS(status)) {
417 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
418 }
419 }
420 } else {
421 // We've got a $. Pick up a capture group number if one follows.
422 // Consume at most the number of digits necessary for the largest capture
423 // number that is valid for this pattern.
424
425 int32_t numDigits = 0;
426 int32_t groupNum = 0;
427 UChar32 digitC;
428 for (;;) {
429 digitC = UTEXT_CURRENT32(replacement);
430 if (digitC == U_SENTINEL) {
431 break;
432 }
433 if (u_isdigit(digitC) == FALSE) {
434 break;
435 }
436 (void)UTEXT_NEXT32(replacement);
437 groupNum=groupNum*10 + u_charDigitValue(digitC);
438 numDigits++;
439 if (numDigits >= fPattern->fMaxCaptureDigits) {
440 break;
441 }
442 }
443
444
445 if (numDigits == 0) {
446 // The $ didn't introduce a group number at all.
447 // Treat it as just part of the substitution text.
448 UChar c16 = DOLLARSIGN;
449 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
450 } else {
451 // Finally, append the capture group data to the destination.
452 destLen += appendGroup(groupNum, dest, status);
453 if (U_FAILURE(status)) {
454 // Can fail if group number is out of range.
455 break;
456 }
457 }
458 }
459
460 if (U_FAILURE(status)) {
461 break;
462 } else {
463 c = UTEXT_NEXT32(replacement);
464 }
465 }
466
467 return *this;
468 }
469
470
471
472 //--------------------------------------------------------------------------------
473 //
474 // appendTail Intended to be used in conjunction with appendReplacement()
475 // To the destination string, append everything following
476 // the last match position from the input string.
477 //
478 // Note: Match ranges do not affect appendTail or appendReplacement
479 //
480 //--------------------------------------------------------------------------------
481 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
482 UErrorCode status = U_ZERO_ERROR;
483 UText resultText = UTEXT_INITIALIZER;
484 utext_openUnicodeString(&resultText, &dest, &status);
485
486 if (U_SUCCESS(status)) {
487 appendTail(&resultText, status);
488 utext_close(&resultText);
489 }
490
491 return dest;
492 }
493
494 //
495 // appendTail, UText mode
496 //
497 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
498 if (U_FAILURE(status)) {
499 return dest;
500 }
501 if (U_FAILURE(fDeferredStatus)) {
502 status = fDeferredStatus;
503 return dest;
504 }
505
506 if (fInputLength > fAppendPosition) {
507 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
508 int64_t destLen = utext_nativeLength(dest);
509 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
510 (int32_t)(fInputLength-fAppendPosition), &status);
511 } else {
512 int32_t len16;
513 if (UTEXT_USES_U16(fInputText)) {
514 len16 = (int32_t)(fInputLength-fAppendPosition);
515 } else {
516 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
517 status = U_ZERO_ERROR; // buffer overflow
518 }
519
520 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
521 if (inputChars == NULL) {
522 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
523 } else {
524 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
525 int64_t destLen = utext_nativeLength(dest);
526 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
527 uprv_free(inputChars);
528 }
529 }
530 }
531 return dest;
532 }
533
534
535
536 //--------------------------------------------------------------------------------
537 //
538 // end
539 //
540 //--------------------------------------------------------------------------------
541 int32_t RegexMatcher::end(UErrorCode &err) const {
542 return end(0, err);
543 }
544
545 int64_t RegexMatcher::end64(UErrorCode &err) const {
546 return end64(0, err);
547 }
548
549 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
550 if (U_FAILURE(err)) {
551 return -1;
552 }
553 if (fMatch == FALSE) {
554 err = U_REGEX_INVALID_STATE;
555 return -1;
556 }
557 if (group < 0 || group > fPattern->fGroupMap->size()) {
558 err = U_INDEX_OUTOFBOUNDS_ERROR;
559 return -1;
560 }
561 int64_t e = -1;
562 if (group == 0) {
563 e = fMatchEnd;
564 } else {
565 // Get the position within the stack frame of the variables for
566 // this capture group.
567 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
568 U_ASSERT(groupOffset < fPattern->fFrameSize);
569 U_ASSERT(groupOffset >= 0);
570 e = fFrame->fExtra[groupOffset + 1];
571 }
572
573 return e;
574 }
575
576 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
577 return (int32_t)end64(group, err);
578 }
579
580
581 //--------------------------------------------------------------------------------
582 //
583 // find()
584 //
585 //--------------------------------------------------------------------------------
586 UBool RegexMatcher::find() {
587 // Start at the position of the last match end. (Will be zero if the
588 // matcher has been reset.)
589 //
590 if (U_FAILURE(fDeferredStatus)) {
591 return FALSE;
592 }
593
594 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
595 return findUsingChunk();
596 }
597
598 int64_t startPos = fMatchEnd;
599 if (startPos==0) {
600 startPos = fActiveStart;
601 }
602
603 if (fMatch) {
604 // Save the position of any previous successful match.
605 fLastMatchEnd = fMatchEnd;
606
607 if (fMatchStart == fMatchEnd) {
608 // Previous match had zero length. Move start position up one position
609 // to avoid sending find() into a loop on zero-length matches.
610 if (startPos >= fActiveLimit) {
611 fMatch = FALSE;
612 fHitEnd = TRUE;
613 return FALSE;
614 }
615 UTEXT_SETNATIVEINDEX(fInputText, startPos);
616 (void)UTEXT_NEXT32(fInputText);
617 startPos = UTEXT_GETNATIVEINDEX(fInputText);
618 }
619 } else {
620 if (fLastMatchEnd >= 0) {
621 // A previous find() failed to match. Don't try again.
622 // (without this test, a pattern with a zero-length match
623 // could match again at the end of an input string.)
624 fHitEnd = TRUE;
625 return FALSE;
626 }
627 }
628
629
630 // Compute the position in the input string beyond which a match can not begin, because
631 // the minimum length match would extend past the end of the input.
632 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
633 // Be aware of possible overflows if making changes here.
634 int64_t testStartLimit;
635 if (UTEXT_USES_U16(fInputText)) {
636 testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
637 if (startPos > testStartLimit) {
638 fMatch = FALSE;
639 fHitEnd = TRUE;
640 return FALSE;
641 }
642 } else {
643 // For now, let the matcher discover that it can't match on its own
644 // We don't know how long the match len is in native characters
645 testStartLimit = fActiveLimit;
646 }
647
648 UChar32 c;
649 U_ASSERT(startPos >= 0);
650
651 switch (fPattern->fStartType) {
652 case START_NO_INFO:
653 // No optimization was found.
654 // Try a match at each input position.
655 for (;;) {
656 MatchAt(startPos, FALSE, fDeferredStatus);
657 if (U_FAILURE(fDeferredStatus)) {
658 return FALSE;
659 }
660 if (fMatch) {
661 return TRUE;
662 }
663 if (startPos >= testStartLimit) {
664 fHitEnd = TRUE;
665 return FALSE;
666 }
667 UTEXT_SETNATIVEINDEX(fInputText, startPos);
668 (void)UTEXT_NEXT32(fInputText);
669 startPos = UTEXT_GETNATIVEINDEX(fInputText);
670 // Note that it's perfectly OK for a pattern to have a zero-length
671 // match at the end of a string, so we must make sure that the loop
672 // runs with startPos == testStartLimit the last time through.
673 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
674 return FALSE;
675 }
676 U_ASSERT(FALSE);
677
678 case START_START:
679 // Matches are only possible at the start of the input string
680 // (pattern begins with ^ or \A)
681 if (startPos > fActiveStart) {
682 fMatch = FALSE;
683 return FALSE;
684 }
685 MatchAt(startPos, FALSE, fDeferredStatus);
686 if (U_FAILURE(fDeferredStatus)) {
687 return FALSE;
688 }
689 return fMatch;
690
691
692 case START_SET:
693 {
694 // Match may start on any char from a pre-computed set.
695 U_ASSERT(fPattern->fMinMatchLen > 0);
696 int64_t pos;
697 UTEXT_SETNATIVEINDEX(fInputText, startPos);
698 for (;;) {
699 c = UTEXT_NEXT32(fInputText);
700 pos = UTEXT_GETNATIVEINDEX(fInputText);
701 // c will be -1 (U_SENTINEL) at end of text, in which case we
702 // skip this next block (so we don't have a negative array index)
703 // and handle end of text in the following block.
704 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
705 (c>=256 && fPattern->fInitialChars->contains(c)))) {
706 MatchAt(startPos, FALSE, fDeferredStatus);
707 if (U_FAILURE(fDeferredStatus)) {
708 return FALSE;
709 }
710 if (fMatch) {
711 return TRUE;
712 }
713 UTEXT_SETNATIVEINDEX(fInputText, pos);
714 }
715 if (startPos >= testStartLimit) {
716 fMatch = FALSE;
717 fHitEnd = TRUE;
718 return FALSE;
719 }
720 startPos = pos;
721 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
722 return FALSE;
723 }
724 }
725 U_ASSERT(FALSE);
726
727 case START_STRING:
728 case START_CHAR:
729 {
730 // Match starts on exactly one char.
731 U_ASSERT(fPattern->fMinMatchLen > 0);
732 UChar32 theChar = fPattern->fInitialChar;
733 int64_t pos;
734 UTEXT_SETNATIVEINDEX(fInputText, startPos);
735 for (;;) {
736 c = UTEXT_NEXT32(fInputText);
737 pos = UTEXT_GETNATIVEINDEX(fInputText);
738 if (c == theChar) {
739 MatchAt(startPos, FALSE, fDeferredStatus);
740 if (U_FAILURE(fDeferredStatus)) {
741 return FALSE;
742 }
743 if (fMatch) {
744 return TRUE;
745 }
746 UTEXT_SETNATIVEINDEX(fInputText, pos);
747 }
748 if (startPos >= testStartLimit) {
749 fMatch = FALSE;
750 fHitEnd = TRUE;
751 return FALSE;
752 }
753 startPos = pos;
754 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
755 return FALSE;
756 }
757 }
758 U_ASSERT(FALSE);
759
760 case START_LINE:
761 {
762 UChar32 c;
763 if (startPos == fAnchorStart) {
764 MatchAt(startPos, FALSE, fDeferredStatus);
765 if (U_FAILURE(fDeferredStatus)) {
766 return FALSE;
767 }
768 if (fMatch) {
769 return TRUE;
770 }
771 UTEXT_SETNATIVEINDEX(fInputText, startPos);
772 c = UTEXT_NEXT32(fInputText);
773 startPos = UTEXT_GETNATIVEINDEX(fInputText);
774 } else {
775 UTEXT_SETNATIVEINDEX(fInputText, startPos);
776 c = UTEXT_PREVIOUS32(fInputText);
777 UTEXT_SETNATIVEINDEX(fInputText, startPos);
778 }
779
780 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
781 for (;;) {
782 if (c == 0x0a) {
783 MatchAt(startPos, FALSE, fDeferredStatus);
784 if (U_FAILURE(fDeferredStatus)) {
785 return FALSE;
786 }
787 if (fMatch) {
788 return TRUE;
789 }
790 UTEXT_SETNATIVEINDEX(fInputText, startPos);
791 }
792 if (startPos >= testStartLimit) {
793 fMatch = FALSE;
794 fHitEnd = TRUE;
795 return FALSE;
796 }
797 c = UTEXT_NEXT32(fInputText);
798 startPos = UTEXT_GETNATIVEINDEX(fInputText);
799 // Note that it's perfectly OK for a pattern to have a zero-length
800 // match at the end of a string, so we must make sure that the loop
801 // runs with startPos == testStartLimit the last time through.
802 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
803 return FALSE;
804 }
805 } else {
806 for (;;) {
807 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
808 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
809 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
810 (void)UTEXT_NEXT32(fInputText);
811 startPos = UTEXT_GETNATIVEINDEX(fInputText);
812 }
813 MatchAt(startPos, FALSE, fDeferredStatus);
814 if (U_FAILURE(fDeferredStatus)) {
815 return FALSE;
816 }
817 if (fMatch) {
818 return TRUE;
819 }
820 UTEXT_SETNATIVEINDEX(fInputText, startPos);
821 }
822 if (startPos >= testStartLimit) {
823 fMatch = FALSE;
824 fHitEnd = TRUE;
825 return FALSE;
826 }
827 c = UTEXT_NEXT32(fInputText);
828 startPos = UTEXT_GETNATIVEINDEX(fInputText);
829 // Note that it's perfectly OK for a pattern to have a zero-length
830 // match at the end of a string, so we must make sure that the loop
831 // runs with startPos == testStartLimit the last time through.
832 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
833 return FALSE;
834 }
835 }
836 }
837
838 default:
839 U_ASSERT(FALSE);
840 }
841
842 U_ASSERT(FALSE);
843 return FALSE;
844 }
845
846
847
848 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
849 if (U_FAILURE(status)) {
850 return FALSE;
851 }
852 if (U_FAILURE(fDeferredStatus)) {
853 status = fDeferredStatus;
854 return FALSE;
855 }
856 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
857 // This will reset the region to be the full input length.
858 if (start < 0) {
859 status = U_INDEX_OUTOFBOUNDS_ERROR;
860 return FALSE;
861 }
862
863 int64_t nativeStart = start;
864 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
865 status = U_INDEX_OUTOFBOUNDS_ERROR;
866 return FALSE;
867 }
868 fMatchEnd = nativeStart;
869 return find();
870 }
871
872
873 //--------------------------------------------------------------------------------
874 //
875 // findUsingChunk() -- like find(), but with the advance knowledge that the
876 // entire string is available in the UText's chunk buffer.
877 //
878 //--------------------------------------------------------------------------------
879 UBool RegexMatcher::findUsingChunk() {
880 // Start at the position of the last match end. (Will be zero if the
881 // matcher has been reset.
882 //
883
884 int32_t startPos = (int32_t)fMatchEnd;
885 if (startPos==0) {
886 startPos = (int32_t)fActiveStart;
887 }
888
889 const UChar *inputBuf = fInputText->chunkContents;
890
891 if (fMatch) {
892 // Save the position of any previous successful match.
893 fLastMatchEnd = fMatchEnd;
894
895 if (fMatchStart == fMatchEnd) {
896 // Previous match had zero length. Move start position up one position
897 // to avoid sending find() into a loop on zero-length matches.
898 if (startPos >= fActiveLimit) {
899 fMatch = FALSE;
900 fHitEnd = TRUE;
901 return FALSE;
902 }
903 U16_FWD_1(inputBuf, startPos, fInputLength);
904 }
905 } else {
906 if (fLastMatchEnd >= 0) {
907 // A previous find() failed to match. Don't try again.
908 // (without this test, a pattern with a zero-length match
909 // could match again at the end of an input string.)
910 fHitEnd = TRUE;
911 return FALSE;
912 }
913 }
914
915
916 // Compute the position in the input string beyond which a match can not begin, because
917 // the minimum length match would extend past the end of the input.
918 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
919 // Be aware of possible overflows if making changes here.
920 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
921 if (startPos > testLen) {
922 fMatch = FALSE;
923 fHitEnd = TRUE;
924 return FALSE;
925 }
926
927 UChar32 c;
928 U_ASSERT(startPos >= 0);
929
930 switch (fPattern->fStartType) {
931 case START_NO_INFO:
932 // No optimization was found.
933 // Try a match at each input position.
934 for (;;) {
935 MatchChunkAt(startPos, FALSE, fDeferredStatus);
936 if (U_FAILURE(fDeferredStatus)) {
937 return FALSE;
938 }
939 if (fMatch) {
940 return TRUE;
941 }
942 if (startPos >= testLen) {
943 fHitEnd = TRUE;
944 return FALSE;
945 }
946 U16_FWD_1(inputBuf, startPos, fActiveLimit);
947 // Note that it's perfectly OK for a pattern to have a zero-length
948 // match at the end of a string, so we must make sure that the loop
949 // runs with startPos == testLen the last time through.
950 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
951 return FALSE;
952 }
953 U_ASSERT(FALSE);
954
955 case START_START:
956 // Matches are only possible at the start of the input string
957 // (pattern begins with ^ or \A)
958 if (startPos > fActiveStart) {
959 fMatch = FALSE;
960 return FALSE;
961 }
962 MatchChunkAt(startPos, FALSE, fDeferredStatus);
963 if (U_FAILURE(fDeferredStatus)) {
964 return FALSE;
965 }
966 return fMatch;
967
968
969 case START_SET:
970 {
971 // Match may start on any char from a pre-computed set.
972 U_ASSERT(fPattern->fMinMatchLen > 0);
973 for (;;) {
974 int32_t pos = startPos;
975 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
976 if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
977 (c>=256 && fPattern->fInitialChars->contains(c))) {
978 MatchChunkAt(pos, FALSE, fDeferredStatus);
979 if (U_FAILURE(fDeferredStatus)) {
980 return FALSE;
981 }
982 if (fMatch) {
983 return TRUE;
984 }
985 }
986 if (pos >= testLen) {
987 fMatch = FALSE;
988 fHitEnd = TRUE;
989 return FALSE;
990 }
991 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
992 return FALSE;
993 }
994 }
995 U_ASSERT(FALSE);
996
997 case START_STRING:
998 case START_CHAR:
999 {
1000 // Match starts on exactly one char.
1001 U_ASSERT(fPattern->fMinMatchLen > 0);
1002 UChar32 theChar = fPattern->fInitialChar;
1003 for (;;) {
1004 int32_t pos = startPos;
1005 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1006 if (c == theChar) {
1007 MatchChunkAt(pos, FALSE, fDeferredStatus);
1008 if (U_FAILURE(fDeferredStatus)) {
1009 return FALSE;
1010 }
1011 if (fMatch) {
1012 return TRUE;
1013 }
1014 }
1015 if (pos >= testLen) {
1016 fMatch = FALSE;
1017 fHitEnd = TRUE;
1018 return FALSE;
1019 }
1020 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1021 return FALSE;
1022 }
1023 }
1024 U_ASSERT(FALSE);
1025
1026 case START_LINE:
1027 {
1028 UChar32 c;
1029 if (startPos == fAnchorStart) {
1030 MatchChunkAt(startPos, FALSE, fDeferredStatus);
1031 if (U_FAILURE(fDeferredStatus)) {
1032 return FALSE;
1033 }
1034 if (fMatch) {
1035 return TRUE;
1036 }
1037 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1038 }
1039
1040 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1041 for (;;) {
1042 c = inputBuf[startPos-1];
1043 if (c == 0x0a) {
1044 MatchChunkAt(startPos, FALSE, fDeferredStatus);
1045 if (U_FAILURE(fDeferredStatus)) {
1046 return FALSE;
1047 }
1048 if (fMatch) {
1049 return TRUE;
1050 }
1051 }
1052 if (startPos >= testLen) {
1053 fMatch = FALSE;
1054 fHitEnd = TRUE;
1055 return FALSE;
1056 }
1057 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1058 // Note that it's perfectly OK for a pattern to have a zero-length
1059 // match at the end of a string, so we must make sure that the loop
1060 // runs with startPos == testLen the last time through.
1061 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1062 return FALSE;
1063 }
1064 } else {
1065 for (;;) {
1066 c = inputBuf[startPos-1];
1067 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
1068 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
1069 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1070 startPos++;
1071 }
1072 MatchChunkAt(startPos, FALSE, fDeferredStatus);
1073 if (U_FAILURE(fDeferredStatus)) {
1074 return FALSE;
1075 }
1076 if (fMatch) {
1077 return TRUE;
1078 }
1079 }
1080 if (startPos >= testLen) {
1081 fMatch = FALSE;
1082 fHitEnd = TRUE;
1083 return FALSE;
1084 }
1085 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1086 // Note that it's perfectly OK for a pattern to have a zero-length
1087 // match at the end of a string, so we must make sure that the loop
1088 // runs with startPos == testLen the last time through.
1089 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1090 return FALSE;
1091 }
1092 }
1093 }
1094
1095 default:
1096 U_ASSERT(FALSE);
1097 }
1098
1099 U_ASSERT(FALSE);
1100 return FALSE;
1101 }
1102
1103
1104
1105 //--------------------------------------------------------------------------------
1106 //
1107 // group()
1108 //
1109 //--------------------------------------------------------------------------------
1110 UnicodeString RegexMatcher::group(UErrorCode &status) const {
1111 return group(0, status);
1112 }
1113
1114 // Return immutable shallow clone
1115 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1116 return group(0, dest, group_len, status);
1117 }
1118
1119 // Return immutable shallow clone
1120 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1121 group_len = 0;
1122 if (U_FAILURE(status)) {
1123 return dest;
1124 }
1125 if (U_FAILURE(fDeferredStatus)) {
1126 status = fDeferredStatus;
1127 } else if (fMatch == FALSE) {
1128 status = U_REGEX_INVALID_STATE;
1129 } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1130 status = U_INDEX_OUTOFBOUNDS_ERROR;
1131 }
1132
1133 if (U_FAILURE(status)) {
1134 return dest;
1135 }
1136
1137 int64_t s, e;
1138 if (groupNum == 0) {
1139 s = fMatchStart;
1140 e = fMatchEnd;
1141 } else {
1142 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1143 U_ASSERT(groupOffset < fPattern->fFrameSize);
1144 U_ASSERT(groupOffset >= 0);
1145 s = fFrame->fExtra[groupOffset];
1146 e = fFrame->fExtra[groupOffset+1];
1147 }
1148
1149 if (s < 0) {
1150 // A capture group wasn't part of the match
1151 return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1152 }
1153 U_ASSERT(s <= e);
1154 group_len = e - s;
1155
1156 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1157 if (dest)
1158 UTEXT_SETNATIVEINDEX(dest, s);
1159 return dest;
1160 }
1161
1162 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1163 UnicodeString result;
1164 if (U_FAILURE(status)) {
1165 return result;
1166 }
1167 UText resultText = UTEXT_INITIALIZER;
1168 utext_openUnicodeString(&resultText, &result, &status);
1169 group(groupNum, &resultText, status);
1170 utext_close(&resultText);
1171 return result;
1172 }
1173
1174
1175 // Return deep (mutable) clone
1176 // Technology Preview (as an API), but note that the UnicodeString API is implemented
1177 // using this function.
1178 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {
1179 if (U_FAILURE(status)) {
1180 return dest;
1181 }
1182
1183 if (U_FAILURE(fDeferredStatus)) {
1184 status = fDeferredStatus;
1185 } else if (fMatch == FALSE) {
1186 status = U_REGEX_INVALID_STATE;
1187 } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1188 status = U_INDEX_OUTOFBOUNDS_ERROR;
1189 }
1190 if (U_FAILURE(status)) {
1191 return dest;
1192 }
1193
1194 int64_t s, e;
1195 if (groupNum == 0) {
1196 s = fMatchStart;
1197 e = fMatchEnd;
1198 } else {
1199 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1200 U_ASSERT(groupOffset < fPattern->fFrameSize);
1201 U_ASSERT(groupOffset >= 0);
1202 s = fFrame->fExtra[groupOffset];
1203 e = fFrame->fExtra[groupOffset+1];
1204 }
1205
1206 if (s < 0) {
1207 // A capture group wasn't part of the match
1208 if (dest) {
1209 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1210 return dest;
1211 } else {
1212 return utext_openUChars(NULL, NULL, 0, &status);
1213 }
1214 }
1215 U_ASSERT(s <= e);
1216
1217 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1218 U_ASSERT(e <= fInputLength);
1219 if (dest) {
1220 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status);
1221 } else {
1222 UText groupText = UTEXT_INITIALIZER;
1223 utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status);
1224 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1225 utext_close(&groupText);
1226 }
1227 } else {
1228 int32_t len16;
1229 if (UTEXT_USES_U16(fInputText)) {
1230 len16 = (int32_t)(e-s);
1231 } else {
1232 UErrorCode lengthStatus = U_ZERO_ERROR;
1233 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1234 }
1235 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1236 if (groupChars == NULL) {
1237 status = U_MEMORY_ALLOCATION_ERROR;
1238 return dest;
1239 }
1240 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1241
1242 if (dest) {
1243 utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);
1244 } else {
1245 UText groupText = UTEXT_INITIALIZER;
1246 utext_openUChars(&groupText, groupChars, len16, &status);
1247 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1248 utext_close(&groupText);
1249 }
1250
1251 uprv_free(groupChars);
1252 }
1253 return dest;
1254 }
1255
1256 //--------------------------------------------------------------------------------
1257 //
1258 // appendGroup() -- currently internal only, appends a group to a UText rather
1259 // than replacing its contents
1260 //
1261 //--------------------------------------------------------------------------------
1262
1263 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1264 if (U_FAILURE(status)) {
1265 return 0;
1266 }
1267 if (U_FAILURE(fDeferredStatus)) {
1268 status = fDeferredStatus;
1269 return 0;
1270 }
1271 int64_t destLen = utext_nativeLength(dest);
1272
1273 if (fMatch == FALSE) {
1274 status = U_REGEX_INVALID_STATE;
1275 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1276 }
1277 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1278 status = U_INDEX_OUTOFBOUNDS_ERROR;
1279 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1280 }
1281
1282 int64_t s, e;
1283 if (groupNum == 0) {
1284 s = fMatchStart;
1285 e = fMatchEnd;
1286 } else {
1287 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1288 U_ASSERT(groupOffset < fPattern->fFrameSize);
1289 U_ASSERT(groupOffset >= 0);
1290 s = fFrame->fExtra[groupOffset];
1291 e = fFrame->fExtra[groupOffset+1];
1292 }
1293
1294 if (s < 0) {
1295 // A capture group wasn't part of the match
1296 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1297 }
1298 U_ASSERT(s <= e);
1299
1300 int64_t deltaLen;
1301 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1302 U_ASSERT(e <= fInputLength);
1303 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1304 } else {
1305 int32_t len16;
1306 if (UTEXT_USES_U16(fInputText)) {
1307 len16 = (int32_t)(e-s);
1308 } else {
1309 UErrorCode lengthStatus = U_ZERO_ERROR;
1310 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1311 }
1312 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1313 if (groupChars == NULL) {
1314 status = U_MEMORY_ALLOCATION_ERROR;
1315 return 0;
1316 }
1317 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1318
1319 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1320 uprv_free(groupChars);
1321 }
1322 return deltaLen;
1323 }
1324
1325
1326
1327 //--------------------------------------------------------------------------------
1328 //
1329 // groupCount()
1330 //
1331 //--------------------------------------------------------------------------------
1332 int32_t RegexMatcher::groupCount() const {
1333 return fPattern->fGroupMap->size();
1334 }
1335
1336
1337
1338 //--------------------------------------------------------------------------------
1339 //
1340 // hasAnchoringBounds()
1341 //
1342 //--------------------------------------------------------------------------------
1343 UBool RegexMatcher::hasAnchoringBounds() const {
1344 return fAnchoringBounds;
1345 }
1346
1347
1348 //--------------------------------------------------------------------------------
1349 //
1350 // hasTransparentBounds()
1351 //
1352 //--------------------------------------------------------------------------------
1353 UBool RegexMatcher::hasTransparentBounds() const {
1354 return fTransparentBounds;
1355 }
1356
1357
1358
1359 //--------------------------------------------------------------------------------
1360 //
1361 // hitEnd()
1362 //
1363 //--------------------------------------------------------------------------------
1364 UBool RegexMatcher::hitEnd() const {
1365 return fHitEnd;
1366 }
1367
1368
1369 //--------------------------------------------------------------------------------
1370 //
1371 // input()
1372 //
1373 //--------------------------------------------------------------------------------
1374 const UnicodeString &RegexMatcher::input() const {
1375 if (!fInput) {
1376 UErrorCode status = U_ZERO_ERROR;
1377 int32_t len16;
1378 if (UTEXT_USES_U16(fInputText)) {
1379 len16 = (int32_t)fInputLength;
1380 } else {
1381 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1382 status = U_ZERO_ERROR; // overflow, length status
1383 }
1384 UnicodeString *result = new UnicodeString(len16, 0, 0);
1385
1386 UChar *inputChars = result->getBuffer(len16);
1387 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1388 result->releaseBuffer(len16);
1389
1390 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1391 }
1392
1393 return *fInput;
1394 }
1395
1396 //--------------------------------------------------------------------------------
1397 //
1398 // inputText()
1399 //
1400 //--------------------------------------------------------------------------------
1401 UText *RegexMatcher::inputText() const {
1402 return fInputText;
1403 }
1404
1405
1406 //--------------------------------------------------------------------------------
1407 //
1408 // getInput() -- like inputText(), but makes a clone or copies into another UText
1409 //
1410 //--------------------------------------------------------------------------------
1411 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1412 if (U_FAILURE(status)) {
1413 return dest;
1414 }
1415 if (U_FAILURE(fDeferredStatus)) {
1416 status = fDeferredStatus;
1417 return dest;
1418 }
1419
1420 if (dest) {
1421 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1422 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1423 } else {
1424 int32_t input16Len;
1425 if (UTEXT_USES_U16(fInputText)) {
1426 input16Len = (int32_t)fInputLength;
1427 } else {
1428 UErrorCode lengthStatus = U_ZERO_ERROR;
1429 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1430 }
1431 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1432 if (inputChars == NULL) {
1433 return dest;
1434 }
1435
1436 status = U_ZERO_ERROR;
1437 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1438 status = U_ZERO_ERROR;
1439 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1440
1441 uprv_free(inputChars);
1442 }
1443 return dest;
1444 } else {
1445 return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1446 }
1447 }
1448
1449
1450 static UBool compat_SyncMutableUTextContents(UText *ut);
1451 static UBool compat_SyncMutableUTextContents(UText *ut) {
1452 UBool retVal = FALSE;
1453
1454 // In the following test, we're really only interested in whether the UText should switch
1455 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
1456 // will still point to the correct data.
1457 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1458 UnicodeString *us=(UnicodeString *)ut->context;
1459
1460 // Update to the latest length.
1461 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1462 int32_t newLength = us->length();
1463
1464 // Update the chunk description.
1465 // The buffer may have switched between stack- and heap-based.
1466 ut->chunkContents = us->getBuffer();
1467 ut->chunkLength = newLength;
1468 ut->chunkNativeLimit = newLength;
1469 ut->nativeIndexingLimit = newLength;
1470 retVal = TRUE;
1471 }
1472
1473 return retVal;
1474 }
1475
1476 //--------------------------------------------------------------------------------
1477 //
1478 // lookingAt()
1479 //
1480 //--------------------------------------------------------------------------------
1481 UBool RegexMatcher::lookingAt(UErrorCode &status) {
1482 if (U_FAILURE(status)) {
1483 return FALSE;
1484 }
1485 if (U_FAILURE(fDeferredStatus)) {
1486 status = fDeferredStatus;
1487 return FALSE;
1488 }
1489
1490 if (fInputUniStrMaybeMutable) {
1491 if (compat_SyncMutableUTextContents(fInputText)) {
1492 fInputLength = utext_nativeLength(fInputText);
1493 reset();
1494 }
1495 }
1496 else {
1497 resetPreserveRegion();
1498 }
1499 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1500 MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1501 } else {
1502 MatchAt(fActiveStart, FALSE, status);
1503 }
1504 return fMatch;
1505 }
1506
1507
1508 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1509 if (U_FAILURE(status)) {
1510 return FALSE;
1511 }
1512 if (U_FAILURE(fDeferredStatus)) {
1513 status = fDeferredStatus;
1514 return FALSE;
1515 }
1516 reset();
1517
1518 if (start < 0) {
1519 status = U_INDEX_OUTOFBOUNDS_ERROR;
1520 return FALSE;
1521 }
1522
1523 if (fInputUniStrMaybeMutable) {
1524 if (compat_SyncMutableUTextContents(fInputText)) {
1525 fInputLength = utext_nativeLength(fInputText);
1526 reset();
1527 }
1528 }
1529
1530 int64_t nativeStart;
1531 nativeStart = start;
1532 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1533 status = U_INDEX_OUTOFBOUNDS_ERROR;
1534 return FALSE;
1535 }
1536
1537 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1538 MatchChunkAt((int32_t)nativeStart, FALSE, status);
1539 } else {
1540 MatchAt(nativeStart, FALSE, status);
1541 }
1542 return fMatch;
1543 }
1544
1545
1546
1547 //--------------------------------------------------------------------------------
1548 //
1549 // matches()
1550 //
1551 //--------------------------------------------------------------------------------
1552 UBool RegexMatcher::matches(UErrorCode &status) {
1553 if (U_FAILURE(status)) {
1554 return FALSE;
1555 }
1556 if (U_FAILURE(fDeferredStatus)) {
1557 status = fDeferredStatus;
1558 return FALSE;
1559 }
1560
1561 if (fInputUniStrMaybeMutable) {
1562 if (compat_SyncMutableUTextContents(fInputText)) {
1563 fInputLength = utext_nativeLength(fInputText);
1564 reset();
1565 }
1566 }
1567 else {
1568 resetPreserveRegion();
1569 }
1570
1571 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1572 MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1573 } else {
1574 MatchAt(fActiveStart, TRUE, status);
1575 }
1576 return fMatch;
1577 }
1578
1579
1580 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1581 if (U_FAILURE(status)) {
1582 return FALSE;
1583 }
1584 if (U_FAILURE(fDeferredStatus)) {
1585 status = fDeferredStatus;
1586 return FALSE;
1587 }
1588 reset();
1589
1590 if (start < 0) {
1591 status = U_INDEX_OUTOFBOUNDS_ERROR;
1592 return FALSE;
1593 }
1594
1595 if (fInputUniStrMaybeMutable) {
1596 if (compat_SyncMutableUTextContents(fInputText)) {
1597 fInputLength = utext_nativeLength(fInputText);
1598 reset();
1599 }
1600 }
1601
1602 int64_t nativeStart;
1603 nativeStart = start;
1604 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1605 status = U_INDEX_OUTOFBOUNDS_ERROR;
1606 return FALSE;
1607 }
1608
1609 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1610 MatchChunkAt((int32_t)nativeStart, TRUE, status);
1611 } else {
1612 MatchAt(nativeStart, TRUE, status);
1613 }
1614 return fMatch;
1615 }
1616
1617
1618
1619 //--------------------------------------------------------------------------------
1620 //
1621 // pattern
1622 //
1623 //--------------------------------------------------------------------------------
1624 const RegexPattern &RegexMatcher::pattern() const {
1625 return *fPattern;
1626 }
1627
1628
1629
1630 //--------------------------------------------------------------------------------
1631 //
1632 // region
1633 //
1634 //--------------------------------------------------------------------------------
1635 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1636 if (U_FAILURE(status)) {
1637 return *this;
1638 }
1639
1640 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1641 status = U_ILLEGAL_ARGUMENT_ERROR;
1642 }
1643
1644 int64_t nativeStart = regionStart;
1645 int64_t nativeLimit = regionLimit;
1646 if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1647 status = U_ILLEGAL_ARGUMENT_ERROR;
1648 }
1649
1650 if (startIndex == -1)
1651 this->reset();
1652 else
1653 resetPreserveRegion();
1654
1655 fRegionStart = nativeStart;
1656 fRegionLimit = nativeLimit;
1657 fActiveStart = nativeStart;
1658 fActiveLimit = nativeLimit;
1659
1660 if (startIndex != -1) {
1661 if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1662 status = U_INDEX_OUTOFBOUNDS_ERROR;
1663 }
1664 fMatchEnd = startIndex;
1665 }
1666
1667 if (!fTransparentBounds) {
1668 fLookStart = nativeStart;
1669 fLookLimit = nativeLimit;
1670 }
1671 if (fAnchoringBounds) {
1672 fAnchorStart = nativeStart;
1673 fAnchorLimit = nativeLimit;
1674 }
1675 return *this;
1676 }
1677
1678 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1679 return region(start, limit, -1, status);
1680 }
1681
1682 //--------------------------------------------------------------------------------
1683 //
1684 // regionEnd
1685 //
1686 //--------------------------------------------------------------------------------
1687 int32_t RegexMatcher::regionEnd() const {
1688 return (int32_t)fRegionLimit;
1689 }
1690
1691 int64_t RegexMatcher::regionEnd64() const {
1692 return fRegionLimit;
1693 }
1694
1695 //--------------------------------------------------------------------------------
1696 //
1697 // regionStart
1698 //
1699 //--------------------------------------------------------------------------------
1700 int32_t RegexMatcher::regionStart() const {
1701 return (int32_t)fRegionStart;
1702 }
1703
1704 int64_t RegexMatcher::regionStart64() const {
1705 return fRegionStart;
1706 }
1707
1708
1709 //--------------------------------------------------------------------------------
1710 //
1711 // replaceAll
1712 //
1713 //--------------------------------------------------------------------------------
1714 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1715 UText replacementText = UTEXT_INITIALIZER;
1716 UText resultText = UTEXT_INITIALIZER;
1717 UnicodeString resultString;
1718 if (U_FAILURE(status)) {
1719 return resultString;
1720 }
1721
1722 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1723 utext_openUnicodeString(&resultText, &resultString, &status);
1724
1725 replaceAll(&replacementText, &resultText, status);
1726
1727 utext_close(&resultText);
1728 utext_close(&replacementText);
1729
1730 return resultString;
1731 }
1732
1733
1734 //
1735 // replaceAll, UText mode
1736 //
1737 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1738 if (U_FAILURE(status)) {
1739 return dest;
1740 }
1741 if (U_FAILURE(fDeferredStatus)) {
1742 status = fDeferredStatus;
1743 return dest;
1744 }
1745
1746 if (dest == NULL) {
1747 UnicodeString emptyString;
1748 UText empty = UTEXT_INITIALIZER;
1749
1750 utext_openUnicodeString(&empty, &emptyString, &status);
1751 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1752 utext_close(&empty);
1753 }
1754
1755 if (U_SUCCESS(status)) {
1756 reset();
1757 while (find()) {
1758 appendReplacement(dest, replacement, status);
1759 if (U_FAILURE(status)) {
1760 break;
1761 }
1762 }
1763 appendTail(dest, status);
1764 }
1765
1766 return dest;
1767 }
1768
1769
1770 //--------------------------------------------------------------------------------
1771 //
1772 // replaceFirst
1773 //
1774 //--------------------------------------------------------------------------------
1775 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1776 UText replacementText = UTEXT_INITIALIZER;
1777 UText resultText = UTEXT_INITIALIZER;
1778 UnicodeString resultString;
1779
1780 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1781 utext_openUnicodeString(&resultText, &resultString, &status);
1782
1783 replaceFirst(&replacementText, &resultText, status);
1784
1785 utext_close(&resultText);
1786 utext_close(&replacementText);
1787
1788 return resultString;
1789 }
1790
1791 //
1792 // replaceFirst, UText mode
1793 //
1794 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1795 if (U_FAILURE(status)) {
1796 return dest;
1797 }
1798 if (U_FAILURE(fDeferredStatus)) {
1799 status = fDeferredStatus;
1800 return dest;
1801 }
1802
1803 reset();
1804 if (!find()) {
1805 return getInput(dest, status);
1806 }
1807
1808 if (dest == NULL) {
1809 UnicodeString emptyString;
1810 UText empty = UTEXT_INITIALIZER;
1811
1812 utext_openUnicodeString(&empty, &emptyString, &status);
1813 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1814 utext_close(&empty);
1815 }
1816
1817 appendReplacement(dest, replacement, status);
1818 appendTail(dest, status);
1819
1820 return dest;
1821 }
1822
1823
1824 //--------------------------------------------------------------------------------
1825 //
1826 // requireEnd
1827 //
1828 //--------------------------------------------------------------------------------
1829 UBool RegexMatcher::requireEnd() const {
1830 return fRequireEnd;
1831 }
1832
1833
1834 //--------------------------------------------------------------------------------
1835 //
1836 // reset
1837 //
1838 //--------------------------------------------------------------------------------
1839 RegexMatcher &RegexMatcher::reset() {
1840 fRegionStart = 0;
1841 fRegionLimit = fInputLength;
1842 fActiveStart = 0;
1843 fActiveLimit = fInputLength;
1844 fAnchorStart = 0;
1845 fAnchorLimit = fInputLength;
1846 fLookStart = 0;
1847 fLookLimit = fInputLength;
1848 resetPreserveRegion();
1849 return *this;
1850 }
1851
1852
1853
1854 void RegexMatcher::resetPreserveRegion() {
1855 fMatchStart = 0;
1856 fMatchEnd = 0;
1857 fLastMatchEnd = -1;
1858 fAppendPosition = 0;
1859 fMatch = FALSE;
1860 fHitEnd = FALSE;
1861 fRequireEnd = FALSE;
1862 fTime = 0;
1863 fTickCounter = TIMER_INITIAL_VALUE;
1864 //resetStack(); // more expensive than it looks...
1865 }
1866
1867
1868 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1869 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1870 if (fPattern->fNeedsAltInput) {
1871 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1872 }
1873 fInputLength = utext_nativeLength(fInputText);
1874
1875 reset();
1876 delete fInput;
1877 fInput = NULL;
1878
1879 // Do the following for any UnicodeString.
1880 // This is for compatibility for those clients who modify the input string "live" during regex operations.
1881 fInputUniStrMaybeMutable = TRUE;
1882
1883 if (fWordBreakItr != NULL) {
1884 #if UCONFIG_NO_BREAK_ITERATION==0
1885 UErrorCode status = U_ZERO_ERROR;
1886 fWordBreakItr->setText(fInputText, status);
1887 #endif
1888 }
1889 return *this;
1890 }
1891
1892
1893 RegexMatcher &RegexMatcher::reset(UText *input) {
1894 if (fInputText != input) {
1895 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1896 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1897 fInputLength = utext_nativeLength(fInputText);
1898
1899 delete fInput;
1900 fInput = NULL;
1901
1902 if (fWordBreakItr != NULL) {
1903 #if UCONFIG_NO_BREAK_ITERATION==0
1904 UErrorCode status = U_ZERO_ERROR;
1905 fWordBreakItr->setText(input, status);
1906 #endif
1907 }
1908 }
1909 reset();
1910 fInputUniStrMaybeMutable = FALSE;
1911
1912 return *this;
1913 }
1914
1915 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1916 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1917 return *this;
1918 }*/
1919
1920 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1921 if (U_FAILURE(status)) {
1922 return *this;
1923 }
1924 reset(); // Reset also resets the region to be the entire string.
1925
1926 if (position < 0 || position > fActiveLimit) {
1927 status = U_INDEX_OUTOFBOUNDS_ERROR;
1928 return *this;
1929 }
1930 fMatchEnd = position;
1931 return *this;
1932 }
1933
1934
1935 //--------------------------------------------------------------------------------
1936 //
1937 // refresh
1938 //
1939 //--------------------------------------------------------------------------------
1940 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1941 if (U_FAILURE(status)) {
1942 return *this;
1943 }
1944 if (input == NULL) {
1945 status = U_ILLEGAL_ARGUMENT_ERROR;
1946 return *this;
1947 }
1948 if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1949 status = U_ILLEGAL_ARGUMENT_ERROR;
1950 return *this;
1951 }
1952 int64_t pos = utext_getNativeIndex(fInputText);
1953 // Shallow read-only clone of the new UText into the existing input UText
1954 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1955 if (U_FAILURE(status)) {
1956 return *this;
1957 }
1958 utext_setNativeIndex(fInputText, pos);
1959
1960 if (fAltInputText != NULL) {
1961 pos = utext_getNativeIndex(fAltInputText);
1962 fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
1963 if (U_FAILURE(status)) {
1964 return *this;
1965 }
1966 utext_setNativeIndex(fAltInputText, pos);
1967 }
1968 return *this;
1969 }
1970
1971
1972
1973 //--------------------------------------------------------------------------------
1974 //
1975 // setTrace
1976 //
1977 //--------------------------------------------------------------------------------
1978 void RegexMatcher::setTrace(UBool state) {
1979 fTraceDebug = state;
1980 }
1981
1982
1983
1984 //---------------------------------------------------------------------
1985 //
1986 // split
1987 //
1988 //---------------------------------------------------------------------
1989 int32_t RegexMatcher::split(const UnicodeString &input,
1990 UnicodeString dest[],
1991 int32_t destCapacity,
1992 UErrorCode &status)
1993 {
1994 UText inputText = UTEXT_INITIALIZER;
1995 utext_openConstUnicodeString(&inputText, &input, &status);
1996 if (U_FAILURE(status)) {
1997 return 0;
1998 }
1999
2000 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2001 if (destText == NULL) {
2002 status = U_MEMORY_ALLOCATION_ERROR;
2003 return 0;
2004 }
2005 int32_t i;
2006 for (i = 0; i < destCapacity; i++) {
2007 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2008 }
2009
2010 int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2011
2012 for (i = 0; i < destCapacity; i++) {
2013 utext_close(destText[i]);
2014 }
2015
2016 uprv_free(destText);
2017 utext_close(&inputText);
2018 return fieldCount;
2019 }
2020
2021 //
2022 // split, UText mode
2023 //
2024 int32_t RegexMatcher::split(UText *input,
2025 UText *dest[],
2026 int32_t destCapacity,
2027 UErrorCode &status)
2028 {
2029 //
2030 // Check arguements for validity
2031 //
2032 if (U_FAILURE(status)) {
2033 return 0;
2034 };
2035
2036 if (destCapacity < 1) {
2037 status = U_ILLEGAL_ARGUMENT_ERROR;
2038 return 0;
2039 }
2040
2041 //
2042 // Reset for the input text
2043 //
2044 reset(input);
2045 int64_t nextOutputStringStart = 0;
2046 if (fActiveLimit == 0) {
2047 return 0;
2048 }
2049
2050 //
2051 // Loop through the input text, searching for the delimiter pattern
2052 //
2053 int32_t i;
2054 int32_t numCaptureGroups = fPattern->fGroupMap->size();
2055 for (i=0; ; i++) {
2056 if (i>=destCapacity-1) {
2057 // There is one or zero output string left.
2058 // Fill the last output string with whatever is left from the input, then exit the loop.
2059 // ( i will be == destCapacity if we filled the output array while processing
2060 // capture groups of the delimiter expression, in which case we will discard the
2061 // last capture group saved in favor of the unprocessed remainder of the
2062 // input string.)
2063 i = destCapacity-1;
2064 if (fActiveLimit > nextOutputStringStart) {
2065 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2066 if (dest[i]) {
2067 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2068 input->chunkContents+nextOutputStringStart,
2069 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2070 } else {
2071 UText remainingText = UTEXT_INITIALIZER;
2072 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2073 fActiveLimit-nextOutputStringStart, &status);
2074 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2075 utext_close(&remainingText);
2076 }
2077 } else {
2078 UErrorCode lengthStatus = U_ZERO_ERROR;
2079 int32_t remaining16Length =
2080 utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2081 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2082 if (remainingChars == NULL) {
2083 status = U_MEMORY_ALLOCATION_ERROR;
2084 break;
2085 }
2086
2087 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2088 if (dest[i]) {
2089 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2090 } else {
2091 UText remainingText = UTEXT_INITIALIZER;
2092 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2093 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2094 utext_close(&remainingText);
2095 }
2096
2097 uprv_free(remainingChars);
2098 }
2099 }
2100 break;
2101 }
2102 if (find()) {
2103 // We found another delimiter. Move everything from where we started looking
2104 // up until the start of the delimiter into the next output string.
2105 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2106 if (dest[i]) {
2107 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2108 input->chunkContents+nextOutputStringStart,
2109 (int32_t)(fMatchStart-nextOutputStringStart), &status);
2110 } else {
2111 UText remainingText = UTEXT_INITIALIZER;
2112 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2113 fMatchStart-nextOutputStringStart, &status);
2114 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2115 utext_close(&remainingText);
2116 }
2117 } else {
2118 UErrorCode lengthStatus = U_ZERO_ERROR;
2119 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2120 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2121 if (remainingChars == NULL) {
2122 status = U_MEMORY_ALLOCATION_ERROR;
2123 break;
2124 }
2125 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2126 if (dest[i]) {
2127 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2128 } else {
2129 UText remainingText = UTEXT_INITIALIZER;
2130 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2131 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2132 utext_close(&remainingText);
2133 }
2134
2135 uprv_free(remainingChars);
2136 }
2137 nextOutputStringStart = fMatchEnd;
2138
2139 // If the delimiter pattern has capturing parentheses, the captured
2140 // text goes out into the next n destination strings.
2141 int32_t groupNum;
2142 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2143 if (i >= destCapacity-2) {
2144 // Never fill the last available output string with capture group text.
2145 // It will filled with the last field, the remainder of the
2146 // unsplit input text.
2147 break;
2148 }
2149 i++;
2150 dest[i] = group(groupNum, dest[i], status);
2151 }
2152
2153 if (nextOutputStringStart == fActiveLimit) {
2154 // The delimiter was at the end of the string. We're done, but first
2155 // we output one last empty string, for the empty field following
2156 // the delimiter at the end of input.
2157 if (i+1 < destCapacity) {
2158 ++i;
2159 if (dest[i] == NULL) {
2160 dest[i] = utext_openUChars(NULL, NULL, 0, &status);
2161 } else {
2162 static UChar emptyString[] = {(UChar)0};
2163 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2164 }
2165 }
2166 break;
2167
2168 }
2169 }
2170 else
2171 {
2172 // We ran off the end of the input while looking for the next delimiter.
2173 // All the remaining text goes into the current output string.
2174 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2175 if (dest[i]) {
2176 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2177 input->chunkContents+nextOutputStringStart,
2178 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2179 } else {
2180 UText remainingText = UTEXT_INITIALIZER;
2181 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2182 fActiveLimit-nextOutputStringStart, &status);
2183 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2184 utext_close(&remainingText);
2185 }
2186 } else {
2187 UErrorCode lengthStatus = U_ZERO_ERROR;
2188 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2189 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2190 if (remainingChars == NULL) {
2191 status = U_MEMORY_ALLOCATION_ERROR;
2192 break;
2193 }
2194
2195 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2196 if (dest[i]) {
2197 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2198 } else {
2199 UText remainingText = UTEXT_INITIALIZER;
2200 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2201 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2202 utext_close(&remainingText);
2203 }
2204
2205 uprv_free(remainingChars);
2206 }
2207 break;
2208 }
2209 if (U_FAILURE(status)) {
2210 break;
2211 }
2212 } // end of for loop
2213 return i+1;
2214 }
2215
2216
2217 //--------------------------------------------------------------------------------
2218 //
2219 // start
2220 //
2221 //--------------------------------------------------------------------------------
2222 int32_t RegexMatcher::start(UErrorCode &status) const {
2223 return start(0, status);
2224 }
2225
2226 int64_t RegexMatcher::start64(UErrorCode &status) const {
2227 return start64(0, status);
2228 }
2229
2230 //--------------------------------------------------------------------------------
2231 //
2232 // start(int32_t group, UErrorCode &status)
2233 //
2234 //--------------------------------------------------------------------------------
2235
2236 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2237 if (U_FAILURE(status)) {
2238 return -1;
2239 }
2240 if (U_FAILURE(fDeferredStatus)) {
2241 status = fDeferredStatus;
2242 return -1;
2243 }
2244 if (fMatch == FALSE) {
2245 status = U_REGEX_INVALID_STATE;
2246 return -1;
2247 }
2248 if (group < 0 || group > fPattern->fGroupMap->size()) {
2249 status = U_INDEX_OUTOFBOUNDS_ERROR;
2250 return -1;
2251 }
2252 int64_t s;
2253 if (group == 0) {
2254 s = fMatchStart;
2255 } else {
2256 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2257 U_ASSERT(groupOffset < fPattern->fFrameSize);
2258 U_ASSERT(groupOffset >= 0);
2259 s = fFrame->fExtra[groupOffset];
2260 }
2261
2262 return s;
2263 }
2264
2265
2266 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2267 return (int32_t)start64(group, status);
2268 }
2269
2270 //--------------------------------------------------------------------------------
2271 //
2272 // useAnchoringBounds
2273 //
2274 //--------------------------------------------------------------------------------
2275 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2276 fAnchoringBounds = b;
2277 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2278 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2279 return *this;
2280 }
2281
2282
2283 //--------------------------------------------------------------------------------
2284 //
2285 // useTransparentBounds
2286 //
2287 //--------------------------------------------------------------------------------
2288 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2289 fTransparentBounds = b;
2290 fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2291 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2292 return *this;
2293 }
2294
2295 //--------------------------------------------------------------------------------
2296 //
2297 // setTimeLimit
2298 //
2299 //--------------------------------------------------------------------------------
2300 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2301 if (U_FAILURE(status)) {
2302 return;
2303 }
2304 if (U_FAILURE(fDeferredStatus)) {
2305 status = fDeferredStatus;
2306 return;
2307 }
2308 if (limit < 0) {
2309 status = U_ILLEGAL_ARGUMENT_ERROR;
2310 return;
2311 }
2312 fTimeLimit = limit;
2313 }
2314
2315
2316 //--------------------------------------------------------------------------------
2317 //
2318 // getTimeLimit
2319 //
2320 //--------------------------------------------------------------------------------
2321 int32_t RegexMatcher::getTimeLimit() const {
2322 return fTimeLimit;
2323 }
2324
2325
2326 //--------------------------------------------------------------------------------
2327 //
2328 // setStackLimit
2329 //
2330 //--------------------------------------------------------------------------------
2331 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2332 if (U_FAILURE(status)) {
2333 return;
2334 }
2335 if (U_FAILURE(fDeferredStatus)) {
2336 status = fDeferredStatus;
2337 return;
2338 }
2339 if (limit < 0) {
2340 status = U_ILLEGAL_ARGUMENT_ERROR;
2341 return;
2342 }
2343
2344 // Reset the matcher. This is needed here in case there is a current match
2345 // whose final stack frame (containing the match results, pointed to by fFrame)
2346 // would be lost by resizing to a smaller stack size.
2347 reset();
2348
2349 if (limit == 0) {
2350 // Unlimited stack expansion
2351 fStack->setMaxCapacity(0);
2352 } else {
2353 // Change the units of the limit from bytes to ints, and bump the size up
2354 // to be big enough to hold at least one stack frame for the pattern,
2355 // if it isn't there already.
2356 int32_t adjustedLimit = limit / sizeof(int32_t);
2357 if (adjustedLimit < fPattern->fFrameSize) {
2358 adjustedLimit = fPattern->fFrameSize;
2359 }
2360 fStack->setMaxCapacity(adjustedLimit);
2361 }
2362 fStackLimit = limit;
2363 }
2364
2365
2366 //--------------------------------------------------------------------------------
2367 //
2368 // getStackLimit
2369 //
2370 //--------------------------------------------------------------------------------
2371 int32_t RegexMatcher::getStackLimit() const {
2372 return fStackLimit;
2373 }
2374
2375
2376 //--------------------------------------------------------------------------------
2377 //
2378 // setMatchCallback
2379 //
2380 //--------------------------------------------------------------------------------
2381 void RegexMatcher::setMatchCallback(URegexMatchCallback *callback,
2382 const void *context,
2383 UErrorCode &status) {
2384 if (U_FAILURE(status)) {
2385 return;
2386 }
2387 fCallbackFn = callback;
2388 fCallbackContext = context;
2389 }
2390
2391
2392 //--------------------------------------------------------------------------------
2393 //
2394 // getMatchCallback
2395 //
2396 //--------------------------------------------------------------------------------
2397 void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback,
2398 const void *&context,
2399 UErrorCode &status) {
2400 if (U_FAILURE(status)) {
2401 return;
2402 }
2403 callback = fCallbackFn;
2404 context = fCallbackContext;
2405 }
2406
2407
2408 //--------------------------------------------------------------------------------
2409 //
2410 // setMatchCallback
2411 //
2412 //--------------------------------------------------------------------------------
2413 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback,
2414 const void *context,
2415 UErrorCode &status) {
2416 if (U_FAILURE(status)) {
2417 return;
2418 }
2419 fFindProgressCallbackFn = callback;
2420 fFindProgressCallbackContext = context;
2421 }
2422
2423
2424 //--------------------------------------------------------------------------------
2425 //
2426 // getMatchCallback
2427 //
2428 //--------------------------------------------------------------------------------
2429 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback,
2430 const void *&context,
2431 UErrorCode &status) {
2432 if (U_FAILURE(status)) {
2433 return;
2434 }
2435 callback = fFindProgressCallbackFn;
2436 context = fFindProgressCallbackContext;
2437 }
2438
2439
2440 //================================================================================
2441 //
2442 // Code following this point in this file is the internal
2443 // Match Engine Implementation.
2444 //
2445 //================================================================================
2446
2447
2448 //--------------------------------------------------------------------------------
2449 //
2450 // resetStack
2451 // Discard any previous contents of the state save stack, and initialize a
2452 // new stack frame to all -1. The -1s are needed for capture group limits,
2453 // where they indicate that a group has not yet matched anything.
2454 //--------------------------------------------------------------------------------
2455 REStackFrame *RegexMatcher::resetStack() {
2456 // Discard any previous contents of the state save stack, and initialize a
2457 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2458 // where they indicate that a group has not yet matched anything.
2459 fStack->removeAllElements();
2460
2461 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2462 int32_t i;
2463 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2464 iFrame->fExtra[i] = -1;
2465 }
2466 return iFrame;
2467 }
2468
2469
2470
2471 //--------------------------------------------------------------------------------
2472 //
2473 // isWordBoundary
2474 // in perl, "xab..cd..", \b is true at positions 0,3,5,7
2475 // For us,
2476 // If the current char is a combining mark,
2477 // \b is FALSE.
2478 // Else Scan backwards to the first non-combining char.
2479 // We are at a boundary if the this char and the original chars are
2480 // opposite in membership in \w set
2481 //
2482 // parameters: pos - the current position in the input buffer
2483 //
2484 // TODO: double-check edge cases at region boundaries.
2485 //
2486 //--------------------------------------------------------------------------------
2487 UBool RegexMatcher::isWordBoundary(int64_t pos) {
2488 UBool isBoundary = FALSE;
2489 UBool cIsWord = FALSE;
2490
2491 if (pos >= fLookLimit) {
2492 fHitEnd = TRUE;
2493 } else {
2494 // Determine whether char c at current position is a member of the word set of chars.
2495 // If we're off the end of the string, behave as though we're not at a word char.
2496 UTEXT_SETNATIVEINDEX(fInputText, pos);
2497 UChar32 c = UTEXT_CURRENT32(fInputText);
2498 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2499 // Current char is a combining one. Not a boundary.
2500 return FALSE;
2501 }
2502 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2503 }
2504
2505 // Back up until we come to a non-combining char, determine whether
2506 // that char is a word char.
2507 UBool prevCIsWord = FALSE;
2508 for (;;) {
2509 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2510 break;
2511 }
2512 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2513 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2514 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2515 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2516 break;
2517 }
2518 }
2519 isBoundary = cIsWord ^ prevCIsWord;
2520 return isBoundary;
2521 }
2522
2523 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2524 UBool isBoundary = FALSE;
2525 UBool cIsWord = FALSE;
2526
2527 const UChar *inputBuf = fInputText->chunkContents;
2528
2529 if (pos >= fLookLimit) {
2530 fHitEnd = TRUE;
2531 } else {
2532 // Determine whether char c at current position is a member of the word set of chars.
2533 // If we're off the end of the string, behave as though we're not at a word char.
2534 UChar32 c;
2535 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2536 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2537 // Current char is a combining one. Not a boundary.
2538 return FALSE;
2539 }
2540 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2541 }
2542
2543 // Back up until we come to a non-combining char, determine whether
2544 // that char is a word char.
2545 UBool prevCIsWord = FALSE;
2546 for (;;) {
2547 if (pos <= fLookStart) {
2548 break;
2549 }
2550 UChar32 prevChar;
2551 U16_PREV(inputBuf, fLookStart, pos, prevChar);
2552 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2553 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2554 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2555 break;
2556 }
2557 }
2558 isBoundary = cIsWord ^ prevCIsWord;
2559 return isBoundary;
2560 }
2561
2562 //--------------------------------------------------------------------------------
2563 //
2564 // isUWordBoundary
2565 //
2566 // Test for a word boundary using RBBI word break.
2567 //
2568 // parameters: pos - the current position in the input buffer
2569 //
2570 //--------------------------------------------------------------------------------
2571 UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2572 UBool returnVal = FALSE;
2573 #if UCONFIG_NO_BREAK_ITERATION==0
2574
2575 // If we haven't yet created a break iterator for this matcher, do it now.
2576 if (fWordBreakItr == NULL) {
2577 fWordBreakItr =
2578 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
2579 if (U_FAILURE(fDeferredStatus)) {
2580 return FALSE;
2581 }
2582 fWordBreakItr->setText(fInputText, fDeferredStatus);
2583 }
2584
2585 if (pos >= fLookLimit) {
2586 fHitEnd = TRUE;
2587 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real"
2588 // words are not boundaries. All non-word chars stand by themselves,
2589 // with word boundaries on both sides.
2590 } else {
2591 if (!UTEXT_USES_U16(fInputText)) {
2592 // !!!: Would like a better way to do this!
2593 UErrorCode status = U_ZERO_ERROR;
2594 pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
2595 }
2596 returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2597 }
2598 #endif
2599 return returnVal;
2600 }
2601
2602 //--------------------------------------------------------------------------------
2603 //
2604 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state
2605 // saves. Increment the "time" counter, and call the
2606 // user callback function if there is one installed.
2607 //
2608 // If the match operation needs to be aborted, either for a time-out
2609 // or because the user callback asked for it, just set an error status.
2610 // The engine will pick that up and stop in its outer loop.
2611 //
2612 //--------------------------------------------------------------------------------
2613 void RegexMatcher::IncrementTime(UErrorCode &status) {
2614 fTickCounter = TIMER_INITIAL_VALUE;
2615 fTime++;
2616 if (fCallbackFn != NULL) {
2617 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2618 status = U_REGEX_STOPPED_BY_CALLER;
2619 return;
2620 }
2621 }
2622 if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2623 status = U_REGEX_TIME_OUT;
2624 }
2625 }
2626
2627 //--------------------------------------------------------------------------------
2628 //
2629 // ReportFindProgress This function is called once for each advance in the target
2630 // string from the find() function, and calls the user progress callback
2631 // function if there is one installed.
2632 //
2633 // NOTE:
2634 //
2635 // If the match operation needs to be aborted because the user
2636 // callback asked for it, just set an error status.
2637 // The engine will pick that up and stop in its outer loop.
2638 //
2639 //--------------------------------------------------------------------------------
2640 UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) {
2641 if (fFindProgressCallbackFn != NULL) {
2642 if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) {
2643 status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/;
2644 return FALSE;
2645 }
2646 }
2647 return TRUE;
2648 }
2649
2650 //--------------------------------------------------------------------------------
2651 //
2652 // StateSave
2653 // Make a new stack frame, initialized as a copy of the current stack frame.
2654 // Set the pattern index in the original stack frame from the operand value
2655 // in the opcode. Execution of the engine continues with the state in
2656 // the newly created stack frame
2657 //
2658 // Note that reserveBlock() may grow the stack, resulting in the
2659 // whole thing being relocated in memory.
2660 //
2661 // Parameters:
2662 // fp The top frame pointer when called. At return, a new
2663 // fame will be present
2664 // savePatIdx An index into the compiled pattern. Goes into the original
2665 // (not new) frame. If execution ever back-tracks out of the
2666 // new frame, this will be where we continue from in the pattern.
2667 // Return
2668 // The new frame pointer.
2669 //
2670 //--------------------------------------------------------------------------------
2671 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2672 // push storage for a new frame.
2673 int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2674 if (newFP == NULL) {
2675 // Failure on attempted stack expansion.
2676 // Stack function set some other error code, change it to a more
2677 // specific one for regular expressions.
2678 status = U_REGEX_STACK_OVERFLOW;
2679 // We need to return a writable stack frame, so just return the
2680 // previous frame. The match operation will stop quickly
2681 // because of the error status, after which the frame will never
2682 // be looked at again.
2683 return fp;
2684 }
2685 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack.
2686
2687 // New stack frame = copy of old top frame.
2688 int64_t *source = (int64_t *)fp;
2689 int64_t *dest = newFP;
2690 for (;;) {
2691 *dest++ = *source++;
2692 if (source == newFP) {
2693 break;
2694 }
2695 }
2696
2697 fTickCounter--;
2698 if (fTickCounter <= 0) {
2699 IncrementTime(status); // Re-initializes fTickCounter
2700 }
2701 fp->fPatIdx = savePatIdx;
2702 return (REStackFrame *)newFP;
2703 }
2704
2705
2706 //--------------------------------------------------------------------------------
2707 //
2708 // MatchAt This is the actual matching engine.
2709 //
2710 // startIdx: begin matching a this index.
2711 // toEnd: if true, match must extend to end of the input region
2712 //
2713 //--------------------------------------------------------------------------------
2714 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2715 UBool isMatch = FALSE; // True if the we have a match.
2716
2717 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2718
2719 int32_t op; // Operation from the compiled pattern, split into
2720 int32_t opType; // the opcode
2721 int32_t opValue; // and the operand value.
2722
2723 #ifdef REGEX_RUN_DEBUG
2724 if (fTraceDebug)
2725 {
2726 printf("MatchAt(startIdx=%ld)\n", startIdx);
2727 printf("Original Pattern: ");
2728 UChar32 c = utext_next32From(fPattern->fPattern, 0);
2729 while (c != U_SENTINEL) {
2730 if (c<32 || c>256) {
2731 c = '.';
2732 }
2733 printf("%c", c);
2734
2735 c = UTEXT_NEXT32(fPattern->fPattern);
2736 }
2737 printf("\n");
2738 printf("Input String: ");
2739 c = utext_next32From(fInputText, 0);
2740 while (c != U_SENTINEL) {
2741 if (c<32 || c>256) {
2742 c = '.';
2743 }
2744 printf("%c", c);
2745
2746 c = UTEXT_NEXT32(fInputText);
2747 }
2748 printf("\n");
2749 printf("\n");
2750 }
2751 #endif
2752
2753 if (U_FAILURE(status)) {
2754 return;
2755 }
2756
2757 // Cache frequently referenced items from the compiled pattern
2758 //
2759 int64_t *pat = fPattern->fCompiledPat->getBuffer();
2760
2761 const UChar *litText = fPattern->fLiteralText.getBuffer();
2762 UVector *sets = fPattern->fSets;
2763
2764 fFrameSize = fPattern->fFrameSize;
2765 REStackFrame *fp = resetStack();
2766
2767 fp->fPatIdx = 0;
2768 fp->fInputIdx = startIdx;
2769
2770 // Zero out the pattern's static data
2771 int32_t i;
2772 for (i = 0; i<fPattern->fDataSize; i++) {
2773 fData[i] = 0;
2774 }
2775
2776 //
2777 // Main loop for interpreting the compiled pattern.
2778 // One iteration of the loop per pattern operation performed.
2779 //
2780 for (;;) {
2781 op = (int32_t)pat[fp->fPatIdx];
2782 opType = URX_TYPE(op);
2783 opValue = URX_VAL(op);
2784 #ifdef REGEX_RUN_DEBUG
2785 if (fTraceDebug) {
2786 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2787 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
2788 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2789 fPattern->dumpOp(fp->fPatIdx);
2790 }
2791 #endif
2792 fp->fPatIdx++;
2793
2794 switch (opType) {
2795
2796
2797 case URX_NOP:
2798 break;
2799
2800
2801 case URX_BACKTRACK:
2802 // Force a backtrack. In some circumstances, the pattern compiler
2803 // will notice that the pattern can't possibly match anything, and will
2804 // emit one of these at that point.
2805 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2806 break;
2807
2808
2809 case URX_ONECHAR:
2810 if (fp->fInputIdx < fActiveLimit) {
2811 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2812 UChar32 c = UTEXT_NEXT32(fInputText);
2813 if (c == opValue) {
2814 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2815 break;
2816 }
2817 } else {
2818 fHitEnd = TRUE;
2819 }
2820 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2821 break;
2822
2823
2824 case URX_STRING:
2825 {
2826 // Test input against a literal string.
2827 // Strings require two slots in the compiled pattern, one for the
2828 // offset to the string text, and one for the length.
2829
2830 int32_t stringStartIdx = opValue;
2831 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
2832 fp->fPatIdx++;
2833 opType = URX_TYPE(op);
2834 int32_t stringLen = URX_VAL(op);
2835 U_ASSERT(opType == URX_STRING_LEN);
2836 U_ASSERT(stringLen >= 2);
2837
2838 const UChar *patternString = litText+stringStartIdx;
2839 int32_t patternStringIndex = 0;
2840 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2841 UChar32 inputChar;
2842 UChar32 patternChar;
2843 UBool success = TRUE;
2844 while (patternStringIndex < stringLen) {
2845 if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
2846 success = FALSE;
2847 fHitEnd = TRUE;
2848 break;
2849 }
2850 inputChar = UTEXT_NEXT32(fInputText);
2851 U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
2852 if (patternChar != inputChar) {
2853 success = FALSE;
2854 break;
2855 }
2856 }
2857
2858 if (success) {
2859 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2860 } else {
2861 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2862 }
2863 }
2864 break;
2865
2866
2867 case URX_STATE_SAVE:
2868 fp = StateSave(fp, opValue, status);
2869 break;
2870
2871
2872 case URX_END:
2873 // The match loop will exit via this path on a successful match,
2874 // when we reach the end of the pattern.
2875 if (toEnd && fp->fInputIdx != fActiveLimit) {
2876 // The pattern matched, but not to the end of input. Try some more.
2877 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2878 break;
2879 }
2880 isMatch = TRUE;
2881 goto breakFromLoop;
2882
2883 // Start and End Capture stack frame variables are laid out out like this:
2884 // fp->fExtra[opValue] - The start of a completed capture group
2885 // opValue+1 - The end of a completed capture group
2886 // opValue+2 - the start of a capture group whose end
2887 // has not yet been reached (and might not ever be).
2888 case URX_START_CAPTURE:
2889 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2890 fp->fExtra[opValue+2] = fp->fInputIdx;
2891 break;
2892
2893
2894 case URX_END_CAPTURE:
2895 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2896 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
2897 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
2898 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
2899 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2900 break;
2901
2902
2903 case URX_DOLLAR: // $, test for End of line
2904 // or for position before new line at end of input
2905 {
2906 if (fp->fInputIdx >= fAnchorLimit) {
2907 // We really are at the end of input. Success.
2908 fHitEnd = TRUE;
2909 fRequireEnd = TRUE;
2910 break;
2911 }
2912
2913 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2914
2915 // If we are positioned just before a new-line that is located at the
2916 // end of input, succeed.
2917 UChar32 c = UTEXT_NEXT32(fInputText);
2918 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2919 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
2920 // If not in the middle of a CR/LF sequence
2921 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
2922 // At new-line at end of input. Success
2923 fHitEnd = TRUE;
2924 fRequireEnd = TRUE;
2925
2926 break;
2927 }
2928 }
2929 } else {
2930 UChar32 nextC = UTEXT_NEXT32(fInputText);
2931 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2932 fHitEnd = TRUE;
2933 fRequireEnd = TRUE;
2934 break; // At CR/LF at end of input. Success
2935 }
2936 }
2937
2938 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2939 }
2940 break;
2941
2942
2943 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
2944 if (fp->fInputIdx >= fAnchorLimit) {
2945 // Off the end of input. Success.
2946 fHitEnd = TRUE;
2947 fRequireEnd = TRUE;
2948 break;
2949 } else {
2950 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2951 UChar32 c = UTEXT_NEXT32(fInputText);
2952 // Either at the last character of input, or off the end.
2953 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
2954 fHitEnd = TRUE;
2955 fRequireEnd = TRUE;
2956 break;
2957 }
2958 }
2959
2960 // Not at end of input. Back-track out.
2961 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2962 break;
2963
2964
2965 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
2966 {
2967 if (fp->fInputIdx >= fAnchorLimit) {
2968 // We really are at the end of input. Success.
2969 fHitEnd = TRUE;
2970 fRequireEnd = TRUE;
2971 break;
2972 }
2973 // If we are positioned just before a new-line, succeed.
2974 // It makes no difference where the new-line is within the input.
2975 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2976 UChar32 c = UTEXT_CURRENT32(fInputText);
2977 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
2978 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
2979 // In multi-line mode, hitting a new-line just before the end of input does not
2980 // set the hitEnd or requireEnd flags
2981 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
2982 break;
2983 }
2984 }
2985 // not at a new line. Fail.
2986 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2987 }
2988 break;
2989
2990
2991 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
2992 {
2993 if (fp->fInputIdx >= fAnchorLimit) {
2994 // We really are at the end of input. Success.
2995 fHitEnd = TRUE;
2996 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
2997 break; // adding a new-line would not lose the match.
2998 }
2999 // If we are not positioned just before a new-line, the test fails; backtrack out.
3000 // It makes no difference where the new-line is within the input.
3001 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3002 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3003 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3004 }
3005 }
3006 break;
3007
3008
3009 case URX_CARET: // ^, test for start of line
3010 if (fp->fInputIdx != fAnchorStart) {
3011 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3012 }
3013 break;
3014
3015
3016 case URX_CARET_M: // ^, test for start of line in mulit-line mode
3017 {
3018 if (fp->fInputIdx == fAnchorStart) {
3019 // We are at the start input. Success.
3020 break;
3021 }
3022 // Check whether character just before the current pos is a new-line
3023 // unless we are at the end of input
3024 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3025 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3026 if ((fp->fInputIdx < fAnchorLimit) &&
3027 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3028 // It's a new-line. ^ is true. Success.
3029 // TODO: what should be done with positions between a CR and LF?
3030 break;
3031 }
3032 // Not at the start of a line. Fail.
3033 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3034 }
3035 break;
3036
3037
3038 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
3039 {
3040 U_ASSERT(fp->fInputIdx >= fAnchorStart);
3041 if (fp->fInputIdx <= fAnchorStart) {
3042 // We are at the start input. Success.
3043 break;
3044 }
3045 // Check whether character just before the current pos is a new-line
3046 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3047 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3048 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3049 if (c != 0x0a) {
3050 // Not at the start of a line. Back-track out.
3051 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3052 }
3053 }
3054 break;
3055
3056 case URX_BACKSLASH_B: // Test for word boundaries
3057 {
3058 UBool success = isWordBoundary(fp->fInputIdx);
3059 success ^= (UBool)(opValue != 0); // flip sense for \B
3060 if (!success) {
3061 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3062 }
3063 }
3064 break;
3065
3066
3067 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
3068 {
3069 UBool success = isUWordBoundary(fp->fInputIdx);
3070 success ^= (UBool)(opValue != 0); // flip sense for \B
3071 if (!success) {
3072 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3073 }
3074 }
3075 break;
3076
3077
3078 case URX_BACKSLASH_D: // Test for decimal digit
3079 {
3080 if (fp->fInputIdx >= fActiveLimit) {
3081 fHitEnd = TRUE;
3082 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3083 break;
3084 }
3085
3086 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3087
3088 UChar32 c = UTEXT_NEXT32(fInputText);
3089 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
3090 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3091 success ^= (UBool)(opValue != 0); // flip sense for \D
3092 if (success) {
3093 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3094 } else {
3095 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3096 }
3097 }
3098 break;
3099
3100
3101 case URX_BACKSLASH_G: // Test for position at end of previous match
3102 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3103 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3104 }
3105 break;
3106
3107
3108 case URX_BACKSLASH_X:
3109 // Match a Grapheme, as defined by Unicode TR 29.
3110 // Differs slightly from Perl, which consumes combining marks independently
3111 // of context.
3112 {
3113
3114 // Fail if at end of input
3115 if (fp->fInputIdx >= fActiveLimit) {
3116 fHitEnd = TRUE;
3117 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3118 break;
3119 }
3120
3121 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3122
3123 // Examine (and consume) the current char.
3124 // Dispatch into a little state machine, based on the char.
3125 UChar32 c;
3126 c = UTEXT_NEXT32(fInputText);
3127 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3128 UnicodeSet **sets = fPattern->fStaticSets;
3129 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
3130 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
3131 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3132 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3133 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3134 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3135 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3136 goto GC_Extend;
3137
3138
3139
3140 GC_L:
3141 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3142 c = UTEXT_NEXT32(fInputText);
3143 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3144 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3145 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3146 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3147 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3148 (void)UTEXT_PREVIOUS32(fInputText);
3149 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3150 goto GC_Extend;
3151
3152 GC_V:
3153 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3154 c = UTEXT_NEXT32(fInputText);
3155 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3156 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3157 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3158 (void)UTEXT_PREVIOUS32(fInputText);
3159 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3160 goto GC_Extend;
3161
3162 GC_T:
3163 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3164 c = UTEXT_NEXT32(fInputText);
3165 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3166 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3167 (void)UTEXT_PREVIOUS32(fInputText);
3168 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3169 goto GC_Extend;
3170
3171 GC_Extend:
3172 // Combining characters are consumed here
3173 for (;;) {
3174 if (fp->fInputIdx >= fActiveLimit) {
3175 break;
3176 }
3177 c = UTEXT_CURRENT32(fInputText);
3178 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3179 break;
3180 }
3181 (void)UTEXT_NEXT32(fInputText);
3182 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3183 }
3184 goto GC_Done;
3185
3186 GC_Control:
3187 // Most control chars stand alone (don't combine with combining chars),
3188 // except for that CR/LF sequence is a single grapheme cluster.
3189 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
3190 c = UTEXT_NEXT32(fInputText);
3191 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3192 }
3193
3194 GC_Done:
3195 if (fp->fInputIdx >= fActiveLimit) {
3196 fHitEnd = TRUE;
3197 }
3198 break;
3199 }
3200
3201
3202
3203
3204 case URX_BACKSLASH_Z: // Test for end of Input
3205 if (fp->fInputIdx < fAnchorLimit) {
3206 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3207 } else {
3208 fHitEnd = TRUE;
3209 fRequireEnd = TRUE;
3210 }
3211 break;
3212
3213
3214
3215 case URX_STATIC_SETREF:
3216 {
3217 // Test input character against one of the predefined sets
3218 // (Word Characters, for example)
3219 // The high bit of the op value is a flag for the match polarity.
3220 // 0: success if input char is in set.
3221 // 1: success if input char is not in set.
3222 if (fp->fInputIdx >= fActiveLimit) {
3223 fHitEnd = TRUE;
3224 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3225 break;
3226 }
3227
3228 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3229 opValue &= ~URX_NEG_SET;
3230 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3231
3232 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3233 UChar32 c = UTEXT_NEXT32(fInputText);
3234 if (c < 256) {
3235 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3236 if (s8->contains(c)) {
3237 success = !success;
3238 }
3239 } else {
3240 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3241 if (s->contains(c)) {
3242 success = !success;
3243 }
3244 }
3245 if (success) {
3246 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3247 } else {
3248 // the character wasn't in the set.
3249 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3250 }
3251 }
3252 break;
3253
3254
3255 case URX_STAT_SETREF_N:
3256 {
3257 // Test input character for NOT being a member of one of
3258 // the predefined sets (Word Characters, for example)
3259 if (fp->fInputIdx >= fActiveLimit) {
3260 fHitEnd = TRUE;
3261 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3262 break;
3263 }
3264
3265 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3266
3267 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3268
3269 UChar32 c = UTEXT_NEXT32(fInputText);
3270 if (c < 256) {
3271 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3272 if (s8->contains(c) == FALSE) {
3273 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3274 break;
3275 }
3276 } else {
3277 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3278 if (s->contains(c) == FALSE) {
3279 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3280 break;
3281 }
3282 }
3283 // the character wasn't in the set.
3284 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3285 }
3286 break;
3287
3288
3289 case URX_SETREF:
3290 if (fp->fInputIdx >= fActiveLimit) {
3291 fHitEnd = TRUE;
3292 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3293 break;
3294 } else {
3295 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3296
3297 // There is input left. Pick up one char and test it for set membership.
3298 UChar32 c = UTEXT_NEXT32(fInputText);
3299 U_ASSERT(opValue > 0 && opValue < sets->size());
3300 if (c<256) {
3301 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3302 if (s8->contains(c)) {
3303 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3304 break;
3305 }
3306 } else {
3307 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3308 if (s->contains(c)) {
3309 // The character is in the set. A Match.
3310 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3311 break;
3312 }
3313 }
3314
3315 // the character wasn't in the set.
3316 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3317 }
3318 break;
3319
3320
3321 case URX_DOTANY:
3322 {
3323 // . matches anything, but stops at end-of-line.
3324 if (fp->fInputIdx >= fActiveLimit) {
3325 // At end of input. Match failed. Backtrack out.
3326 fHitEnd = TRUE;
3327 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3328 break;
3329 }
3330
3331 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3332
3333 // There is input left. Advance over one char, unless we've hit end-of-line
3334 UChar32 c = UTEXT_NEXT32(fInputText);
3335 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
3336 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3337 // End of line in normal mode. . does not match.
3338 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3339 break;
3340 }
3341 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3342 }
3343 break;
3344
3345
3346 case URX_DOTANY_ALL:
3347 {
3348 // ., in dot-matches-all (including new lines) mode
3349 if (fp->fInputIdx >= fActiveLimit) {
3350 // At end of input. Match failed. Backtrack out.
3351 fHitEnd = TRUE;
3352 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3353 break;
3354 }
3355
3356 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3357
3358 // There is input left. Advance over one char, except if we are
3359 // at a cr/lf, advance over both of them.
3360 UChar32 c;
3361 c = UTEXT_NEXT32(fInputText);
3362 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3363 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3364 // In the case of a CR/LF, we need to advance over both.
3365 UChar32 nextc = UTEXT_CURRENT32(fInputText);
3366 if (nextc == 0x0a) {
3367 (void)UTEXT_NEXT32(fInputText);
3368 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3369 }
3370 }
3371 }
3372 break;
3373
3374
3375 case URX_DOTANY_UNIX:
3376 {
3377 // '.' operator, matches all, but stops at end-of-line.
3378 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
3379 if (fp->fInputIdx >= fActiveLimit) {
3380 // At end of input. Match failed. Backtrack out.
3381 fHitEnd = TRUE;
3382 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3383 break;
3384 }
3385
3386 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3387
3388 // There is input left. Advance over one char, unless we've hit end-of-line
3389 UChar32 c = UTEXT_NEXT32(fInputText);
3390 if (c == 0x0a) {
3391 // End of line in normal mode. '.' does not match the \n
3392 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3393 } else {
3394 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3395 }
3396 }
3397 break;
3398
3399
3400 case URX_JMP:
3401 fp->fPatIdx = opValue;
3402 break;
3403
3404 case URX_FAIL:
3405 isMatch = FALSE;
3406 goto breakFromLoop;
3407
3408 case URX_JMP_SAV:
3409 U_ASSERT(opValue < fPattern->fCompiledPat->size());
3410 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3411 fp->fPatIdx = opValue; // Then JMP.
3412 break;
3413
3414 case URX_JMP_SAV_X:
3415 // This opcode is used with (x)+, when x can match a zero length string.
3416 // Same as JMP_SAV, except conditional on the match having made forward progress.
3417 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3418 // data address of the input position at the start of the loop.
3419 {
3420 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3421 int32_t stoOp = (int32_t)pat[opValue-1];
3422 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3423 int32_t frameLoc = URX_VAL(stoOp);
3424 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3425 int64_t prevInputIdx = fp->fExtra[frameLoc];
3426 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3427 if (prevInputIdx < fp->fInputIdx) {
3428 // The match did make progress. Repeat the loop.
3429 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3430 fp->fPatIdx = opValue;
3431 fp->fExtra[frameLoc] = fp->fInputIdx;
3432 }
3433 // If the input position did not advance, we do nothing here,
3434 // execution will fall out of the loop.
3435 }
3436 break;
3437
3438 case URX_CTR_INIT:
3439 {
3440 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3441 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3442
3443 // Pick up the three extra operands that CTR_INIT has, and
3444 // skip the pattern location counter past
3445 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3446 fp->fPatIdx += 3;
3447 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3448 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3449 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3450 U_ASSERT(minCount>=0);
3451 U_ASSERT(maxCount>=minCount || maxCount==-1);
3452 U_ASSERT(loopLoc>=fp->fPatIdx);
3453
3454 if (minCount == 0) {
3455 fp = StateSave(fp, loopLoc+1, status);
3456 }
3457 if (maxCount == -1) {
3458 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
3459 } else if (maxCount == 0) {
3460 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3461 }
3462 }
3463 break;
3464
3465 case URX_CTR_LOOP:
3466 {
3467 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3468 int32_t initOp = (int32_t)pat[opValue];
3469 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3470 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3471 int32_t minCount = (int32_t)pat[opValue+2];
3472 int32_t maxCount = (int32_t)pat[opValue+3];
3473 (*pCounter)++;
3474 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3475 U_ASSERT(*pCounter == maxCount);
3476 break;
3477 }
3478 if (*pCounter >= minCount) {
3479 if (maxCount == -1) {
3480 // Loop has no hard upper bound.
3481 // Check that it is progressing through the input, break if it is not.
3482 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3483 if (fp->fInputIdx == *pLastInputIdx) {
3484 break;
3485 } else {
3486 *pLastInputIdx = fp->fInputIdx;
3487 }
3488 }
3489 fp = StateSave(fp, fp->fPatIdx, status);
3490 }
3491 fp->fPatIdx = opValue + 4; // Loop back.
3492 }
3493 break;
3494
3495 case URX_CTR_INIT_NG:
3496 {
3497 // Initialize a non-greedy loop
3498 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3499 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3500
3501 // Pick up the three extra operands that CTR_INIT_NG has, and
3502 // skip the pattern location counter past
3503 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3504 fp->fPatIdx += 3;
3505 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3506 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3507 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3508 U_ASSERT(minCount>=0);
3509 U_ASSERT(maxCount>=minCount || maxCount==-1);
3510 U_ASSERT(loopLoc>fp->fPatIdx);
3511 if (maxCount == -1) {
3512 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
3513 }
3514
3515 if (minCount == 0) {
3516 if (maxCount != 0) {
3517 fp = StateSave(fp, fp->fPatIdx, status);
3518 }
3519 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
3520 }
3521 }
3522 break;
3523
3524 case URX_CTR_LOOP_NG:
3525 {
3526 // Non-greedy {min, max} loops
3527 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3528 int32_t initOp = (int32_t)pat[opValue];
3529 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3530 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3531 int32_t minCount = (int32_t)pat[opValue+2];
3532 int32_t maxCount = (int32_t)pat[opValue+3];
3533
3534 (*pCounter)++;
3535 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3536 // The loop has matched the maximum permitted number of times.
3537 // Break out of here with no action. Matching will
3538 // continue with the following pattern.
3539 U_ASSERT(*pCounter == maxCount);
3540 break;
3541 }
3542
3543 if (*pCounter < minCount) {
3544 // We haven't met the minimum number of matches yet.
3545 // Loop back for another one.
3546 fp->fPatIdx = opValue + 4; // Loop back.
3547 } else {
3548 // We do have the minimum number of matches.
3549
3550 // If there is no upper bound on the loop iterations, check that the input index
3551 // is progressing, and stop the loop if it is not.
3552 if (maxCount == -1) {
3553 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3554 if (fp->fInputIdx == *pLastInputIdx) {
3555 break;
3556 }
3557 *pLastInputIdx = fp->fInputIdx;
3558 }
3559
3560 // Loop Continuation: we will fall into the pattern following the loop
3561 // (non-greedy, don't execute loop body first), but first do
3562 // a state save to the top of the loop, so that a match failure
3563 // in the following pattern will try another iteration of the loop.
3564 fp = StateSave(fp, opValue + 4, status);
3565 }
3566 }
3567 break;
3568
3569 case URX_STO_SP:
3570 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3571 fData[opValue] = fStack->size();
3572 break;
3573
3574 case URX_LD_SP:
3575 {
3576 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3577 int32_t newStackSize = (int32_t)fData[opValue];
3578 U_ASSERT(newStackSize <= fStack->size());
3579 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3580 if (newFP == (int64_t *)fp) {
3581 break;
3582 }
3583 int32_t i;
3584 for (i=0; i<fFrameSize; i++) {
3585 newFP[i] = ((int64_t *)fp)[i];
3586 }
3587 fp = (REStackFrame *)newFP;
3588 fStack->setSize(newStackSize);
3589 }
3590 break;
3591
3592 case URX_BACKREF:
3593 {
3594 U_ASSERT(opValue < fFrameSize);
3595 int64_t groupStartIdx = fp->fExtra[opValue];
3596 int64_t groupEndIdx = fp->fExtra[opValue+1];
3597 U_ASSERT(groupStartIdx <= groupEndIdx);
3598 if (groupStartIdx < 0) {
3599 // This capture group has not participated in the match thus far,
3600 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3601 break;
3602 }
3603 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3604 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3605
3606 // Note: if the capture group match was of an empty string the backref
3607 // match succeeds. Verified by testing: Perl matches succeed
3608 // in this case, so we do too.
3609
3610 UBool success = TRUE;
3611 for (;;) {
3612 if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3613 success = TRUE;
3614 break;
3615 }
3616 if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
3617 success = FALSE;
3618 fHitEnd = TRUE;
3619 break;
3620 }
3621 UChar32 captureGroupChar = utext_next32(fAltInputText);
3622 UChar32 inputChar = utext_next32(fInputText);
3623 if (inputChar != captureGroupChar) {
3624 success = FALSE;
3625 break;
3626 }
3627 }
3628
3629 if (success) {
3630 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3631 } else {
3632 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3633 }
3634 }
3635 break;
3636
3637
3638
3639 case URX_BACKREF_I:
3640 {
3641 U_ASSERT(opValue < fFrameSize);
3642 int64_t groupStartIdx = fp->fExtra[opValue];
3643 int64_t groupEndIdx = fp->fExtra[opValue+1];
3644 U_ASSERT(groupStartIdx <= groupEndIdx);
3645 if (groupStartIdx < 0) {
3646 // This capture group has not participated in the match thus far,
3647 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3648 break;
3649 }
3650 utext_setNativeIndex(fAltInputText, groupStartIdx);
3651 utext_setNativeIndex(fInputText, fp->fInputIdx);
3652 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3653 CaseFoldingUTextIterator inputItr(*fInputText);
3654
3655 // Note: if the capture group match was of an empty string the backref
3656 // match succeeds. Verified by testing: Perl matches succeed
3657 // in this case, so we do too.
3658
3659 UBool success = TRUE;
3660 for (;;) {
3661 if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3662 success = TRUE;
3663 break;
3664 }
3665 if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
3666 success = FALSE;
3667 fHitEnd = TRUE;
3668 break;
3669 }
3670 UChar32 captureGroupChar = captureGroupItr.next();
3671 UChar32 inputChar = inputItr.next();
3672 if (inputChar != captureGroupChar) {
3673 success = FALSE;
3674 break;
3675 }
3676 }
3677
3678 if (success && inputItr.inExpansion()) {
3679 // We otained a match by consuming part of a string obtained from
3680 // case-folding a single code point of the input text.
3681 // This does not count as an overall match.
3682 success = FALSE;
3683 }
3684
3685 if (success) {
3686 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3687 } else {
3688 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3689 }
3690
3691 }
3692 break;
3693
3694 case URX_STO_INP_LOC:
3695 {
3696 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3697 fp->fExtra[opValue] = fp->fInputIdx;
3698 }
3699 break;
3700
3701 case URX_JMPX:
3702 {
3703 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3704 fp->fPatIdx += 1;
3705 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
3706 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3707 int64_t savedInputIdx = fp->fExtra[dataLoc];
3708 U_ASSERT(savedInputIdx <= fp->fInputIdx);
3709 if (savedInputIdx < fp->fInputIdx) {
3710 fp->fPatIdx = opValue; // JMP
3711 } else {
3712 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
3713 }
3714 }
3715 break;
3716
3717 case URX_LA_START:
3718 {
3719 // Entering a lookahead block.
3720 // Save Stack Ptr, Input Pos.
3721 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3722 fData[opValue] = fStack->size();
3723 fData[opValue+1] = fp->fInputIdx;
3724 fActiveStart = fLookStart; // Set the match region change for
3725 fActiveLimit = fLookLimit; // transparent bounds.
3726 }
3727 break;
3728
3729 case URX_LA_END:
3730 {
3731 // Leaving a look-ahead block.
3732 // restore Stack Ptr, Input Pos to positions they had on entry to block.
3733 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3734 int32_t stackSize = fStack->size();
3735 int32_t newStackSize =(int32_t)fData[opValue];
3736 U_ASSERT(stackSize >= newStackSize);
3737 if (stackSize > newStackSize) {
3738 // Copy the current top frame back to the new (cut back) top frame.
3739 // This makes the capture groups from within the look-ahead
3740 // expression available.
3741 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3742 int32_t i;
3743 for (i=0; i<fFrameSize; i++) {
3744 newFP[i] = ((int64_t *)fp)[i];
3745 }
3746 fp = (REStackFrame *)newFP;
3747 fStack->setSize(newStackSize);
3748 }
3749 fp->fInputIdx = fData[opValue+1];
3750
3751 // Restore the active region bounds in the input string; they may have
3752 // been changed because of transparent bounds on a Region.
3753 fActiveStart = fRegionStart;
3754 fActiveLimit = fRegionLimit;
3755 }
3756 break;
3757
3758 case URX_ONECHAR_I:
3759 // Case insensitive one char. The char from the pattern is already case folded.
3760 // Input text is not, but case folding the input can not reduce two or more code
3761 // points to one.
3762 if (fp->fInputIdx < fActiveLimit) {
3763 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3764
3765 UChar32 c = UTEXT_NEXT32(fInputText);
3766 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3767 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3768 break;
3769 }
3770 } else {
3771 fHitEnd = TRUE;
3772 }
3773
3774 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3775 break;
3776
3777 case URX_STRING_I:
3778 {
3779 // Case-insensitive test input against a literal string.
3780 // Strings require two slots in the compiled pattern, one for the
3781 // offset to the string text, and one for the length.
3782 // The compiled string has already been case folded.
3783 {
3784 const UChar *patternString = litText + opValue;
3785 int32_t patternStringIdx = 0;
3786
3787 op = (int32_t)pat[fp->fPatIdx];
3788 fp->fPatIdx++;
3789 opType = URX_TYPE(op);
3790 opValue = URX_VAL(op);
3791 U_ASSERT(opType == URX_STRING_LEN);
3792 int32_t patternStringLen = opValue; // Length of the string from the pattern.
3793
3794
3795 UChar32 cPattern;
3796 UChar32 cText;
3797 UBool success = TRUE;
3798
3799 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3800 CaseFoldingUTextIterator inputIterator(*fInputText);
3801 while (patternStringIdx < patternStringLen) {
3802 if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
3803 success = FALSE;
3804 fHitEnd = TRUE;
3805 break;
3806 }
3807 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
3808 cText = inputIterator.next();
3809 if (cText != cPattern) {
3810 success = FALSE;
3811 break;
3812 }
3813 }
3814 if (inputIterator.inExpansion()) {
3815 success = FALSE;
3816 }
3817
3818 if (success) {
3819 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3820 } else {
3821 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3822 }
3823 }
3824 }
3825 break;
3826
3827 case URX_LB_START:
3828 {
3829 // Entering a look-behind block.
3830 // Save Stack Ptr, Input Pos.
3831 // TODO: implement transparent bounds. Ticket #6067
3832 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3833 fData[opValue] = fStack->size();
3834 fData[opValue+1] = fp->fInputIdx;
3835 // Init the variable containing the start index for attempted matches.
3836 fData[opValue+2] = -1;
3837 // Save input string length, then reset to pin any matches to end at
3838 // the current position.
3839 fData[opValue+3] = fActiveLimit;
3840 fActiveLimit = fp->fInputIdx;
3841 }
3842 break;
3843
3844
3845 case URX_LB_CONT:
3846 {
3847 // Positive Look-Behind, at top of loop checking for matches of LB expression
3848 // at all possible input starting positions.
3849
3850 // Fetch the min and max possible match lengths. They are the operands
3851 // of this op in the pattern.
3852 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3853 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3854 U_ASSERT(minML <= maxML);
3855 U_ASSERT(minML >= 0);
3856
3857 // Fetch (from data) the last input index where a match was attempted.
3858 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3859 int64_t *lbStartIdx = &fData[opValue+2];
3860 if (*lbStartIdx < 0) {
3861 // First time through loop.
3862 *lbStartIdx = fp->fInputIdx - minML;
3863 } else {
3864 // 2nd through nth time through the loop.
3865 // Back up start position for match by one.
3866 if (*lbStartIdx == 0) {
3867 (*lbStartIdx)--;
3868 } else {
3869 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
3870 (void)UTEXT_PREVIOUS32(fInputText);
3871 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3872 }
3873 }
3874
3875 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
3876 // We have tried all potential match starting points without
3877 // getting a match. Backtrack out, and out of the
3878 // Look Behind altogether.
3879 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3880 int64_t restoreInputLen = fData[opValue+3];
3881 U_ASSERT(restoreInputLen >= fActiveLimit);
3882 U_ASSERT(restoreInputLen <= fInputLength);
3883 fActiveLimit = restoreInputLen;
3884 break;
3885 }
3886
3887 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3888 // (successful match will fall off the end of the loop.)
3889 fp = StateSave(fp, fp->fPatIdx-3, status);
3890 fp->fInputIdx = *lbStartIdx;
3891 }
3892 break;
3893
3894 case URX_LB_END:
3895 // End of a look-behind block, after a successful match.
3896 {
3897 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3898 if (fp->fInputIdx != fActiveLimit) {
3899 // The look-behind expression matched, but the match did not
3900 // extend all the way to the point that we are looking behind from.
3901 // FAIL out of here, which will take us back to the LB_CONT, which
3902 // will retry the match starting at another position or fail
3903 // the look-behind altogether, whichever is appropriate.
3904 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3905 break;
3906 }
3907
3908 // Look-behind match is good. Restore the orignal input string length,
3909 // which had been truncated to pin the end of the lookbehind match to the
3910 // position being looked-behind.
3911 int64_t originalInputLen = fData[opValue+3];
3912 U_ASSERT(originalInputLen >= fActiveLimit);
3913 U_ASSERT(originalInputLen <= fInputLength);
3914 fActiveLimit = originalInputLen;
3915 }
3916 break;
3917
3918
3919 case URX_LBN_CONT:
3920 {
3921 // Negative Look-Behind, at top of loop checking for matches of LB expression
3922 // at all possible input starting positions.
3923
3924 // Fetch the extra parameters of this op.
3925 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3926 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3927 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
3928 continueLoc = URX_VAL(continueLoc);
3929 U_ASSERT(minML <= maxML);
3930 U_ASSERT(minML >= 0);
3931 U_ASSERT(continueLoc > fp->fPatIdx);
3932
3933 // Fetch (from data) the last input index where a match was attempted.
3934 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3935 int64_t *lbStartIdx = &fData[opValue+2];
3936 if (*lbStartIdx < 0) {
3937 // First time through loop.
3938 *lbStartIdx = fp->fInputIdx - minML;
3939 } else {
3940 // 2nd through nth time through the loop.
3941 // Back up start position for match by one.
3942 if (*lbStartIdx == 0) {
3943 (*lbStartIdx)--;
3944 } else {
3945 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
3946 (void)UTEXT_PREVIOUS32(fInputText);
3947 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3948 }
3949 }
3950
3951 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
3952 // We have tried all potential match starting points without
3953 // getting a match, which means that the negative lookbehind as
3954 // a whole has succeeded. Jump forward to the continue location
3955 int64_t restoreInputLen = fData[opValue+3];
3956 U_ASSERT(restoreInputLen >= fActiveLimit);
3957 U_ASSERT(restoreInputLen <= fInputLength);
3958 fActiveLimit = restoreInputLen;
3959 fp->fPatIdx = continueLoc;
3960 break;
3961 }
3962
3963 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3964 // (successful match will cause a FAIL out of the loop altogether.)
3965 fp = StateSave(fp, fp->fPatIdx-4, status);
3966 fp->fInputIdx = *lbStartIdx;
3967 }
3968 break;
3969
3970 case URX_LBN_END:
3971 // End of a negative look-behind block, after a successful match.
3972 {
3973 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3974 if (fp->fInputIdx != fActiveLimit) {
3975 // The look-behind expression matched, but the match did not
3976 // extend all the way to the point that we are looking behind from.
3977 // FAIL out of here, which will take us back to the LB_CONT, which
3978 // will retry the match starting at another position or succeed
3979 // the look-behind altogether, whichever is appropriate.
3980 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3981 break;
3982 }
3983
3984 // Look-behind expression matched, which means look-behind test as
3985 // a whole Fails
3986
3987 // Restore the orignal input string length, which had been truncated
3988 // inorder to pin the end of the lookbehind match
3989 // to the position being looked-behind.
3990 int64_t originalInputLen = fData[opValue+3];
3991 U_ASSERT(originalInputLen >= fActiveLimit);
3992 U_ASSERT(originalInputLen <= fInputLength);
3993 fActiveLimit = originalInputLen;
3994
3995 // Restore original stack position, discarding any state saved
3996 // by the successful pattern match.
3997 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3998 int32_t newStackSize = (int32_t)fData[opValue];
3999 U_ASSERT(fStack->size() > newStackSize);
4000 fStack->setSize(newStackSize);
4001
4002 // FAIL, which will take control back to someplace
4003 // prior to entering the look-behind test.
4004 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4005 }
4006 break;
4007
4008
4009 case URX_LOOP_SR_I:
4010 // Loop Initialization for the optimized implementation of
4011 // [some character set]*
4012 // This op scans through all matching input.
4013 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4014 {
4015 U_ASSERT(opValue > 0 && opValue < sets->size());
4016 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4017 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4018
4019 // Loop through input, until either the input is exhausted or
4020 // we reach a character that is not a member of the set.
4021 int64_t ix = fp->fInputIdx;
4022 UTEXT_SETNATIVEINDEX(fInputText, ix);
4023 for (;;) {
4024 if (ix >= fActiveLimit) {
4025 fHitEnd = TRUE;
4026 break;
4027 }
4028 UChar32 c = UTEXT_NEXT32(fInputText);
4029 if (c<256) {
4030 if (s8->contains(c) == FALSE) {
4031 break;
4032 }
4033 } else {
4034 if (s->contains(c) == FALSE) {
4035 break;
4036 }
4037 }
4038 ix = UTEXT_GETNATIVEINDEX(fInputText);
4039 }
4040
4041 // If there were no matching characters, skip over the loop altogether.
4042 // The loop doesn't run at all, a * op always succeeds.
4043 if (ix == fp->fInputIdx) {
4044 fp->fPatIdx++; // skip the URX_LOOP_C op.
4045 break;
4046 }
4047
4048 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4049 // must follow. It's operand is the stack location
4050 // that holds the starting input index for the match of this [set]*
4051 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4052 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4053 int32_t stackLoc = URX_VAL(loopcOp);
4054 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4055 fp->fExtra[stackLoc] = fp->fInputIdx;
4056 fp->fInputIdx = ix;
4057
4058 // Save State to the URX_LOOP_C op that follows this one,
4059 // so that match failures in the following code will return to there.
4060 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4061 fp = StateSave(fp, fp->fPatIdx, status);
4062 fp->fPatIdx++;
4063 }
4064 break;
4065
4066
4067 case URX_LOOP_DOT_I:
4068 // Loop Initialization for the optimized implementation of .*
4069 // This op scans through all remaining input.
4070 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4071 {
4072 // Loop through input until the input is exhausted (we reach an end-of-line)
4073 // In DOTALL mode, we can just go straight to the end of the input.
4074 int64_t ix;
4075 if ((opValue & 1) == 1) {
4076 // Dot-matches-All mode. Jump straight to the end of the string.
4077 ix = fActiveLimit;
4078 fHitEnd = TRUE;
4079 } else {
4080 // NOT DOT ALL mode. Line endings do not match '.'
4081 // Scan forward until a line ending or end of input.
4082 ix = fp->fInputIdx;
4083 UTEXT_SETNATIVEINDEX(fInputText, ix);
4084 for (;;) {
4085 if (ix >= fActiveLimit) {
4086 fHitEnd = TRUE;
4087 break;
4088 }
4089 UChar32 c = UTEXT_NEXT32(fInputText);
4090 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
4091 if ((c == 0x0a) || // 0x0a is newline in both modes.
4092 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
4093 (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) {
4094 // char is a line ending. Exit the scanning loop.
4095 break;
4096 }
4097 }
4098 ix = UTEXT_GETNATIVEINDEX(fInputText);
4099 }
4100 }
4101
4102 // If there were no matching characters, skip over the loop altogether.
4103 // The loop doesn't run at all, a * op always succeeds.
4104 if (ix == fp->fInputIdx) {
4105 fp->fPatIdx++; // skip the URX_LOOP_C op.
4106 break;
4107 }
4108
4109 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4110 // must follow. It's operand is the stack location
4111 // that holds the starting input index for the match of this .*
4112 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4113 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4114 int32_t stackLoc = URX_VAL(loopcOp);
4115 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4116 fp->fExtra[stackLoc] = fp->fInputIdx;
4117 fp->fInputIdx = ix;
4118
4119 // Save State to the URX_LOOP_C op that follows this one,
4120 // so that match failures in the following code will return to there.
4121 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4122 fp = StateSave(fp, fp->fPatIdx, status);
4123 fp->fPatIdx++;
4124 }
4125 break;
4126
4127
4128 case URX_LOOP_C:
4129 {
4130 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4131 backSearchIndex = fp->fExtra[opValue];
4132 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4133 if (backSearchIndex == fp->fInputIdx) {
4134 // We've backed up the input idx to the point that the loop started.
4135 // The loop is done. Leave here without saving state.
4136 // Subsequent failures won't come back here.
4137 break;
4138 }
4139 // Set up for the next iteration of the loop, with input index
4140 // backed up by one from the last time through,
4141 // and a state save to this instruction in case the following code fails again.
4142 // (We're going backwards because this loop emulates stack unwinding, not
4143 // the initial scan forward.)
4144 U_ASSERT(fp->fInputIdx > 0);
4145 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4146 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4147 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4148
4149 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4150 if (prevC == 0x0a &&
4151 fp->fInputIdx > backSearchIndex &&
4152 twoPrevC == 0x0d) {
4153 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4154 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4155 // .*, stepping back over CRLF pair.
4156 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4157 }
4158 }
4159
4160
4161 fp = StateSave(fp, fp->fPatIdx-1, status);
4162 }
4163 break;
4164
4165
4166
4167 default:
4168 // Trouble. The compiled pattern contains an entry with an
4169 // unrecognized type tag.
4170 U_ASSERT(FALSE);
4171 }
4172
4173 if (U_FAILURE(status)) {
4174 isMatch = FALSE;
4175 break;
4176 }
4177 }
4178
4179 breakFromLoop:
4180 fMatch = isMatch;
4181 if (isMatch) {
4182 fLastMatchEnd = fMatchEnd;
4183 fMatchStart = startIdx;
4184 fMatchEnd = fp->fInputIdx;
4185 }
4186
4187 #ifdef REGEX_RUN_DEBUG
4188 if (fTraceDebug) {
4189 if (isMatch) {
4190 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
4191 } else {
4192 printf("No match\n\n");
4193 }
4194 }
4195 #endif
4196
4197 fFrame = fp; // The active stack frame when the engine stopped.
4198 // Contains the capture group results that we need to
4199 // access later.
4200 return;
4201 }
4202
4203
4204 //--------------------------------------------------------------------------------
4205 //
4206 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
4207 // assumption that the entire string is available in the UText's
4208 // chunk buffer. For now, that means we can use int32_t indexes,
4209 // except for anything that needs to be saved (like group starts
4210 // and ends).
4211 //
4212 // startIdx: begin matching a this index.
4213 // toEnd: if true, match must extend to end of the input region
4214 //
4215 //--------------------------------------------------------------------------------
4216 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4217 UBool isMatch = FALSE; // True if the we have a match.
4218
4219 int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4220
4221 int32_t op; // Operation from the compiled pattern, split into
4222 int32_t opType; // the opcode
4223 int32_t opValue; // and the operand value.
4224
4225 #ifdef REGEX_RUN_DEBUG
4226 if (fTraceDebug) {
4227 printf("MatchAt(startIdx=%d)\n", startIdx);
4228 printf("Original Pattern: ");
4229 UChar32 c = utext_next32From(fPattern->fPattern, 0);
4230 while (c != U_SENTINEL) {
4231 if (c<32 || c>256) {
4232 c = '.';
4233 }
4234 printf("%c", c);
4235
4236 c = UTEXT_NEXT32(fPattern->fPattern);
4237 }
4238 printf("\n");
4239 printf("Input String: ");
4240 c = utext_next32From(fInputText, 0);
4241 while (c != U_SENTINEL) {
4242 if (c<32 || c>256) {
4243 c = '.';
4244 }
4245 printf("%c", c);
4246
4247 c = UTEXT_NEXT32(fInputText);
4248 }
4249 printf("\n");
4250 printf("\n");
4251 }
4252 #endif
4253
4254 if (U_FAILURE(status)) {
4255 return;
4256 }
4257
4258 // Cache frequently referenced items from the compiled pattern
4259 //
4260 int64_t *pat = fPattern->fCompiledPat->getBuffer();
4261
4262 const UChar *litText = fPattern->fLiteralText.getBuffer();
4263 UVector *sets = fPattern->fSets;
4264
4265 const UChar *inputBuf = fInputText->chunkContents;
4266
4267 fFrameSize = fPattern->fFrameSize;
4268 REStackFrame *fp = resetStack();
4269
4270 fp->fPatIdx = 0;
4271 fp->fInputIdx = startIdx;
4272
4273 // Zero out the pattern's static data
4274 int32_t i;
4275 for (i = 0; i<fPattern->fDataSize; i++) {
4276 fData[i] = 0;
4277 }
4278
4279 //
4280 // Main loop for interpreting the compiled pattern.
4281 // One iteration of the loop per pattern operation performed.
4282 //
4283 for (;;) {
4284 op = (int32_t)pat[fp->fPatIdx];
4285 opType = URX_TYPE(op);
4286 opValue = URX_VAL(op);
4287 #ifdef REGEX_RUN_DEBUG
4288 if (fTraceDebug) {
4289 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4290 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
4291 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4292 fPattern->dumpOp(fp->fPatIdx);
4293 }
4294 #endif
4295 fp->fPatIdx++;
4296
4297 switch (opType) {
4298
4299
4300 case URX_NOP:
4301 break;
4302
4303
4304 case URX_BACKTRACK:
4305 // Force a backtrack. In some circumstances, the pattern compiler
4306 // will notice that the pattern can't possibly match anything, and will
4307 // emit one of these at that point.
4308 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4309 break;
4310
4311
4312 case URX_ONECHAR:
4313 if (fp->fInputIdx < fActiveLimit) {
4314 UChar32 c;
4315 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4316 if (c == opValue) {
4317 break;
4318 }
4319 } else {
4320 fHitEnd = TRUE;
4321 }
4322 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4323 break;
4324
4325
4326 case URX_STRING:
4327 {
4328 // Test input against a literal string.
4329 // Strings require two slots in the compiled pattern, one for the
4330 // offset to the string text, and one for the length.
4331 int32_t stringStartIdx = opValue;
4332 int32_t stringLen;
4333
4334 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
4335 fp->fPatIdx++;
4336 opType = URX_TYPE(op);
4337 stringLen = URX_VAL(op);
4338 U_ASSERT(opType == URX_STRING_LEN);
4339 U_ASSERT(stringLen >= 2);
4340
4341 const UChar * pInp = inputBuf + fp->fInputIdx;
4342 const UChar * pInpLimit = inputBuf + fActiveLimit;
4343 const UChar * pPat = litText+stringStartIdx;
4344 const UChar * pEnd = pInp + stringLen;
4345 UBool success = TRUE;
4346 while (pInp < pEnd) {
4347 if (pInp >= pInpLimit) {
4348 fHitEnd = TRUE;
4349 success = FALSE;
4350 break;
4351 }
4352 if (*pInp++ != *pPat++) {
4353 success = FALSE;
4354 break;
4355 }
4356 }
4357
4358 if (success) {
4359 fp->fInputIdx += stringLen;
4360 } else {
4361 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4362 }
4363 }
4364 break;
4365
4366
4367 case URX_STATE_SAVE:
4368 fp = StateSave(fp, opValue, status);
4369 break;
4370
4371
4372 case URX_END:
4373 // The match loop will exit via this path on a successful match,
4374 // when we reach the end of the pattern.
4375 if (toEnd && fp->fInputIdx != fActiveLimit) {
4376 // The pattern matched, but not to the end of input. Try some more.
4377 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4378 break;
4379 }
4380 isMatch = TRUE;
4381 goto breakFromLoop;
4382
4383 // Start and End Capture stack frame variables are laid out out like this:
4384 // fp->fExtra[opValue] - The start of a completed capture group
4385 // opValue+1 - The end of a completed capture group
4386 // opValue+2 - the start of a capture group whose end
4387 // has not yet been reached (and might not ever be).
4388 case URX_START_CAPTURE:
4389 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4390 fp->fExtra[opValue+2] = fp->fInputIdx;
4391 break;
4392
4393
4394 case URX_END_CAPTURE:
4395 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4396 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
4397 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
4398 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
4399 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4400 break;
4401
4402
4403 case URX_DOLLAR: // $, test for End of line
4404 // or for position before new line at end of input
4405 if (fp->fInputIdx < fAnchorLimit-2) {
4406 // We are no where near the end of input. Fail.
4407 // This is the common case. Keep it first.
4408 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4409 break;
4410 }
4411 if (fp->fInputIdx >= fAnchorLimit) {
4412 // We really are at the end of input. Success.
4413 fHitEnd = TRUE;
4414 fRequireEnd = TRUE;
4415 break;
4416 }
4417
4418 // If we are positioned just before a new-line that is located at the
4419 // end of input, succeed.
4420 if (fp->fInputIdx == fAnchorLimit-1) {
4421 UChar32 c;
4422 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4423
4424 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
4425 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4426 // At new-line at end of input. Success
4427 fHitEnd = TRUE;
4428 fRequireEnd = TRUE;
4429 break;
4430 }
4431 }
4432 } else if (fp->fInputIdx == fAnchorLimit-2 &&
4433 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4434 fHitEnd = TRUE;
4435 fRequireEnd = TRUE;
4436 break; // At CR/LF at end of input. Success
4437 }
4438
4439 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4440
4441 break;
4442
4443
4444 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
4445 if (fp->fInputIdx >= fAnchorLimit-1) {
4446 // Either at the last character of input, or off the end.
4447 if (fp->fInputIdx == fAnchorLimit-1) {
4448 // At last char of input. Success if it's a new line.
4449 if (inputBuf[fp->fInputIdx] == 0x0a) {
4450 fHitEnd = TRUE;
4451 fRequireEnd = TRUE;
4452 break;
4453 }
4454 } else {
4455 // Off the end of input. Success.
4456 fHitEnd = TRUE;
4457 fRequireEnd = TRUE;
4458 break;
4459 }
4460 }
4461
4462 // Not at end of input. Back-track out.
4463 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4464 break;
4465
4466
4467 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
4468 {
4469 if (fp->fInputIdx >= fAnchorLimit) {
4470 // We really are at the end of input. Success.
4471 fHitEnd = TRUE;
4472 fRequireEnd = TRUE;
4473 break;
4474 }
4475 // If we are positioned just before a new-line, succeed.
4476 // It makes no difference where the new-line is within the input.
4477 UChar32 c = inputBuf[fp->fInputIdx];
4478 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
4479 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4480 // In multi-line mode, hitting a new-line just before the end of input does not
4481 // set the hitEnd or requireEnd flags
4482 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4483 break;
4484 }
4485 }
4486 // not at a new line. Fail.
4487 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4488 }
4489 break;
4490
4491
4492 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
4493 {
4494 if (fp->fInputIdx >= fAnchorLimit) {
4495 // We really are at the end of input. Success.
4496 fHitEnd = TRUE;
4497 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
4498 break; // adding a new-line would not lose the match.
4499 }
4500 // If we are not positioned just before a new-line, the test fails; backtrack out.
4501 // It makes no difference where the new-line is within the input.
4502 if (inputBuf[fp->fInputIdx] != 0x0a) {
4503 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4504 }
4505 }
4506 break;
4507
4508
4509 case URX_CARET: // ^, test for start of line
4510 if (fp->fInputIdx != fAnchorStart) {
4511 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4512 }
4513 break;
4514
4515
4516 case URX_CARET_M: // ^, test for start of line in mulit-line mode
4517 {
4518 if (fp->fInputIdx == fAnchorStart) {
4519 // We are at the start input. Success.
4520 break;
4521 }
4522 // Check whether character just before the current pos is a new-line
4523 // unless we are at the end of input
4524 UChar c = inputBuf[fp->fInputIdx - 1];
4525 if ((fp->fInputIdx < fAnchorLimit) &&
4526 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
4527 // It's a new-line. ^ is true. Success.
4528 // TODO: what should be done with positions between a CR and LF?
4529 break;
4530 }
4531 // Not at the start of a line. Fail.
4532 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4533 }
4534 break;
4535
4536
4537 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
4538 {
4539 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4540 if (fp->fInputIdx <= fAnchorStart) {
4541 // We are at the start input. Success.
4542 break;
4543 }
4544 // Check whether character just before the current pos is a new-line
4545 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4546 UChar c = inputBuf[fp->fInputIdx - 1];
4547 if (c != 0x0a) {
4548 // Not at the start of a line. Back-track out.
4549 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4550 }
4551 }
4552 break;
4553
4554 case URX_BACKSLASH_B: // Test for word boundaries
4555 {
4556 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4557 success ^= (UBool)(opValue != 0); // flip sense for \B
4558 if (!success) {
4559 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4560 }
4561 }
4562 break;
4563
4564
4565 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
4566 {
4567 UBool success = isUWordBoundary(fp->fInputIdx);
4568 success ^= (UBool)(opValue != 0); // flip sense for \B
4569 if (!success) {
4570 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4571 }
4572 }
4573 break;
4574
4575
4576 case URX_BACKSLASH_D: // Test for decimal digit
4577 {
4578 if (fp->fInputIdx >= fActiveLimit) {
4579 fHitEnd = TRUE;
4580 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4581 break;
4582 }
4583
4584 UChar32 c;
4585 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4586 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
4587 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4588 success ^= (UBool)(opValue != 0); // flip sense for \D
4589 if (!success) {
4590 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4591 }
4592 }
4593 break;
4594
4595
4596 case URX_BACKSLASH_G: // Test for position at end of previous match
4597 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
4598 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4599 }
4600 break;
4601
4602
4603 case URX_BACKSLASH_X:
4604 // Match a Grapheme, as defined by Unicode TR 29.
4605 // Differs slightly from Perl, which consumes combining marks independently
4606 // of context.
4607 {
4608
4609 // Fail if at end of input
4610 if (fp->fInputIdx >= fActiveLimit) {
4611 fHitEnd = TRUE;
4612 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4613 break;
4614 }
4615
4616 // Examine (and consume) the current char.
4617 // Dispatch into a little state machine, based on the char.
4618 UChar32 c;
4619 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4620 UnicodeSet **sets = fPattern->fStaticSets;
4621 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
4622 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
4623 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4624 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4625 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4626 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4627 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4628 goto GC_Extend;
4629
4630
4631
4632 GC_L:
4633 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4634 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4635 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4636 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4637 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4638 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4639 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4640 goto GC_Extend;
4641
4642 GC_V:
4643 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4644 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4645 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4646 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4647 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4648 goto GC_Extend;
4649
4650 GC_T:
4651 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4652 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4653 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4654 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4655 goto GC_Extend;
4656
4657 GC_Extend:
4658 // Combining characters are consumed here
4659 for (;;) {
4660 if (fp->fInputIdx >= fActiveLimit) {
4661 break;
4662 }
4663 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4664 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4665 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4666 break;
4667 }
4668 }
4669 goto GC_Done;
4670
4671 GC_Control:
4672 // Most control chars stand alone (don't combine with combining chars),
4673 // except for that CR/LF sequence is a single grapheme cluster.
4674 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
4675 fp->fInputIdx++;
4676 }
4677
4678 GC_Done:
4679 if (fp->fInputIdx >= fActiveLimit) {
4680 fHitEnd = TRUE;
4681 }
4682 break;
4683 }
4684
4685
4686
4687
4688 case URX_BACKSLASH_Z: // Test for end of Input
4689 if (fp->fInputIdx < fAnchorLimit) {
4690 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4691 } else {
4692 fHitEnd = TRUE;
4693 fRequireEnd = TRUE;
4694 }
4695 break;
4696
4697
4698
4699 case URX_STATIC_SETREF:
4700 {
4701 // Test input character against one of the predefined sets
4702 // (Word Characters, for example)
4703 // The high bit of the op value is a flag for the match polarity.
4704 // 0: success if input char is in set.
4705 // 1: success if input char is not in set.
4706 if (fp->fInputIdx >= fActiveLimit) {
4707 fHitEnd = TRUE;
4708 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4709 break;
4710 }
4711
4712 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
4713 opValue &= ~URX_NEG_SET;
4714 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4715
4716 UChar32 c;
4717 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4718 if (c < 256) {
4719 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4720 if (s8->contains(c)) {
4721 success = !success;
4722 }
4723 } else {
4724 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4725 if (s->contains(c)) {
4726 success = !success;
4727 }
4728 }
4729 if (!success) {
4730 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4731 }
4732 }
4733 break;
4734
4735
4736 case URX_STAT_SETREF_N:
4737 {
4738 // Test input character for NOT being a member of one of
4739 // the predefined sets (Word Characters, for example)
4740 if (fp->fInputIdx >= fActiveLimit) {
4741 fHitEnd = TRUE;
4742 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4743 break;
4744 }
4745
4746 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4747
4748 UChar32 c;
4749 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4750 if (c < 256) {
4751 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4752 if (s8->contains(c) == FALSE) {
4753 break;
4754 }
4755 } else {
4756 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4757 if (s->contains(c) == FALSE) {
4758 break;
4759 }
4760 }
4761 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4762 }
4763 break;
4764
4765
4766 case URX_SETREF:
4767 {
4768 if (fp->fInputIdx >= fActiveLimit) {
4769 fHitEnd = TRUE;
4770 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4771 break;
4772 }
4773
4774 U_ASSERT(opValue > 0 && opValue < sets->size());
4775
4776 // There is input left. Pick up one char and test it for set membership.
4777 UChar32 c;
4778 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4779 if (c<256) {
4780 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4781 if (s8->contains(c)) {
4782 // The character is in the set. A Match.
4783 break;
4784 }
4785 } else {
4786 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4787 if (s->contains(c)) {
4788 // The character is in the set. A Match.
4789 break;
4790 }
4791 }
4792
4793 // the character wasn't in the set.
4794 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4795 }
4796 break;
4797
4798
4799 case URX_DOTANY:
4800 {
4801 // . matches anything, but stops at end-of-line.
4802 if (fp->fInputIdx >= fActiveLimit) {
4803 // At end of input. Match failed. Backtrack out.
4804 fHitEnd = TRUE;
4805 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4806 break;
4807 }
4808
4809 // There is input left. Advance over one char, unless we've hit end-of-line
4810 UChar32 c;
4811 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4812 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
4813 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
4814 // End of line in normal mode. . does not match.
4815 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4816 break;
4817 }
4818 }
4819 break;
4820
4821
4822 case URX_DOTANY_ALL:
4823 {
4824 // . in dot-matches-all (including new lines) mode
4825 if (fp->fInputIdx >= fActiveLimit) {
4826 // At end of input. Match failed. Backtrack out.
4827 fHitEnd = TRUE;
4828 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4829 break;
4830 }
4831
4832 // There is input left. Advance over one char, except if we are
4833 // at a cr/lf, advance over both of them.
4834 UChar32 c;
4835 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4836 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
4837 // In the case of a CR/LF, we need to advance over both.
4838 if (inputBuf[fp->fInputIdx] == 0x0a) {
4839 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
4840 }
4841 }
4842 }
4843 break;
4844
4845
4846 case URX_DOTANY_UNIX:
4847 {
4848 // '.' operator, matches all, but stops at end-of-line.
4849 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
4850 if (fp->fInputIdx >= fActiveLimit) {
4851 // At end of input. Match failed. Backtrack out.
4852 fHitEnd = TRUE;
4853 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4854 break;
4855 }
4856
4857 // There is input left. Advance over one char, unless we've hit end-of-line
4858 UChar32 c;
4859 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4860 if (c == 0x0a) {
4861 // End of line in normal mode. '.' does not match the \n
4862 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4863 }
4864 }
4865 break;
4866
4867
4868 case URX_JMP:
4869 fp->fPatIdx = opValue;
4870 break;
4871
4872 case URX_FAIL:
4873 isMatch = FALSE;
4874 goto breakFromLoop;
4875
4876 case URX_JMP_SAV:
4877 U_ASSERT(opValue < fPattern->fCompiledPat->size());
4878 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
4879 fp->fPatIdx = opValue; // Then JMP.
4880 break;
4881
4882 case URX_JMP_SAV_X:
4883 // This opcode is used with (x)+, when x can match a zero length string.
4884 // Same as JMP_SAV, except conditional on the match having made forward progress.
4885 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
4886 // data address of the input position at the start of the loop.
4887 {
4888 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
4889 int32_t stoOp = (int32_t)pat[opValue-1];
4890 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
4891 int32_t frameLoc = URX_VAL(stoOp);
4892 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
4893 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
4894 U_ASSERT(prevInputIdx <= fp->fInputIdx);
4895 if (prevInputIdx < fp->fInputIdx) {
4896 // The match did make progress. Repeat the loop.
4897 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
4898 fp->fPatIdx = opValue;
4899 fp->fExtra[frameLoc] = fp->fInputIdx;
4900 }
4901 // If the input position did not advance, we do nothing here,
4902 // execution will fall out of the loop.
4903 }
4904 break;
4905
4906 case URX_CTR_INIT:
4907 {
4908 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
4909 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
4910
4911 // Pick up the three extra operands that CTR_INIT has, and
4912 // skip the pattern location counter past
4913 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
4914 fp->fPatIdx += 3;
4915 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
4916 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
4917 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
4918 U_ASSERT(minCount>=0);
4919 U_ASSERT(maxCount>=minCount || maxCount==-1);
4920 U_ASSERT(loopLoc>=fp->fPatIdx);
4921
4922 if (minCount == 0) {
4923 fp = StateSave(fp, loopLoc+1, status);
4924 }
4925 if (maxCount == -1) {
4926 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
4927 } else if (maxCount == 0) {
4928 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4929 }
4930 }
4931 break;
4932
4933 case URX_CTR_LOOP:
4934 {
4935 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
4936 int32_t initOp = (int32_t)pat[opValue];
4937 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
4938 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
4939 int32_t minCount = (int32_t)pat[opValue+2];
4940 int32_t maxCount = (int32_t)pat[opValue+3];
4941 (*pCounter)++;
4942 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
4943 U_ASSERT(*pCounter == maxCount);
4944 break;
4945 }
4946 if (*pCounter >= minCount) {
4947 if (maxCount == -1) {
4948 // Loop has no hard upper bound.
4949 // Check that it is progressing through the input, break if it is not.
4950 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
4951 if (fp->fInputIdx == *pLastInputIdx) {
4952 break;
4953 } else {
4954 *pLastInputIdx = fp->fInputIdx;
4955 }
4956 }
4957 fp = StateSave(fp, fp->fPatIdx, status);
4958 }
4959 fp->fPatIdx = opValue + 4; // Loop back.
4960 }
4961 break;
4962
4963 case URX_CTR_INIT_NG:
4964 {
4965 // Initialize a non-greedy loop
4966 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
4967 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
4968
4969 // Pick up the three extra operands that CTR_INIT_NG has, and
4970 // skip the pattern location counter past
4971 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
4972 fp->fPatIdx += 3;
4973 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
4974 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
4975 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
4976 U_ASSERT(minCount>=0);
4977 U_ASSERT(maxCount>=minCount || maxCount==-1);
4978 U_ASSERT(loopLoc>fp->fPatIdx);
4979 if (maxCount == -1) {
4980 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
4981 }
4982
4983 if (minCount == 0) {
4984 if (maxCount != 0) {
4985 fp = StateSave(fp, fp->fPatIdx, status);
4986 }
4987 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
4988 }
4989 }
4990 break;
4991
4992 case URX_CTR_LOOP_NG:
4993 {
4994 // Non-greedy {min, max} loops
4995 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
4996 int32_t initOp = (int32_t)pat[opValue];
4997 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
4998 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
4999 int32_t minCount = (int32_t)pat[opValue+2];
5000 int32_t maxCount = (int32_t)pat[opValue+3];
5001
5002 (*pCounter)++;
5003 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5004 // The loop has matched the maximum permitted number of times.
5005 // Break out of here with no action. Matching will
5006 // continue with the following pattern.
5007 U_ASSERT(*pCounter == maxCount);
5008 break;
5009 }
5010
5011 if (*pCounter < minCount) {
5012 // We haven't met the minimum number of matches yet.
5013 // Loop back for another one.
5014 fp->fPatIdx = opValue + 4; // Loop back.
5015 } else {
5016 // We do have the minimum number of matches.
5017
5018 // If there is no upper bound on the loop iterations, check that the input index
5019 // is progressing, and stop the loop if it is not.
5020 if (maxCount == -1) {
5021 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5022 if (fp->fInputIdx == *pLastInputIdx) {
5023 break;
5024 }
5025 *pLastInputIdx = fp->fInputIdx;
5026 }
5027
5028 // Loop Continuation: we will fall into the pattern following the loop
5029 // (non-greedy, don't execute loop body first), but first do
5030 // a state save to the top of the loop, so that a match failure
5031 // in the following pattern will try another iteration of the loop.
5032 fp = StateSave(fp, opValue + 4, status);
5033 }
5034 }
5035 break;
5036
5037 case URX_STO_SP:
5038 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5039 fData[opValue] = fStack->size();
5040 break;
5041
5042 case URX_LD_SP:
5043 {
5044 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5045 int32_t newStackSize = (int32_t)fData[opValue];
5046 U_ASSERT(newStackSize <= fStack->size());
5047 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5048 if (newFP == (int64_t *)fp) {
5049 break;
5050 }
5051 int32_t i;
5052 for (i=0; i<fFrameSize; i++) {
5053 newFP[i] = ((int64_t *)fp)[i];
5054 }
5055 fp = (REStackFrame *)newFP;
5056 fStack->setSize(newStackSize);
5057 }
5058 break;
5059
5060 case URX_BACKREF:
5061 {
5062 U_ASSERT(opValue < fFrameSize);
5063 int64_t groupStartIdx = fp->fExtra[opValue];
5064 int64_t groupEndIdx = fp->fExtra[opValue+1];
5065 U_ASSERT(groupStartIdx <= groupEndIdx);
5066 int64_t inputIndex = fp->fInputIdx;
5067 if (groupStartIdx < 0) {
5068 // This capture group has not participated in the match thus far,
5069 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5070 break;
5071 }
5072 UBool success = TRUE;
5073 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
5074 if (inputIndex >= fActiveLimit) {
5075 success = FALSE;
5076 fHitEnd = TRUE;
5077 break;
5078 }
5079 if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
5080 success = FALSE;
5081 break;
5082 }
5083 }
5084 if (success) {
5085 fp->fInputIdx = inputIndex;
5086 } else {
5087 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5088 }
5089 }
5090 break;
5091
5092 case URX_BACKREF_I:
5093 {
5094 U_ASSERT(opValue < fFrameSize);
5095 int64_t groupStartIdx = fp->fExtra[opValue];
5096 int64_t groupEndIdx = fp->fExtra[opValue+1];
5097 U_ASSERT(groupStartIdx <= groupEndIdx);
5098 if (groupStartIdx < 0) {
5099 // This capture group has not participated in the match thus far,
5100 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5101 break;
5102 }
5103 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
5104 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
5105
5106 // Note: if the capture group match was of an empty string the backref
5107 // match succeeds. Verified by testing: Perl matches succeed
5108 // in this case, so we do too.
5109
5110 UBool success = TRUE;
5111 for (;;) {
5112 UChar32 captureGroupChar = captureGroupItr.next();
5113 if (captureGroupChar == U_SENTINEL) {
5114 success = TRUE;
5115 break;
5116 }
5117 UChar32 inputChar = inputItr.next();
5118 if (inputChar == U_SENTINEL) {
5119 success = FALSE;
5120 fHitEnd = TRUE;
5121 break;
5122 }
5123 if (inputChar != captureGroupChar) {
5124 success = FALSE;
5125 break;
5126 }
5127 }
5128
5129 if (success && inputItr.inExpansion()) {
5130 // We otained a match by consuming part of a string obtained from
5131 // case-folding a single code point of the input text.
5132 // This does not count as an overall match.
5133 success = FALSE;
5134 }
5135
5136 if (success) {
5137 fp->fInputIdx = inputItr.getIndex();
5138 } else {
5139 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5140 }
5141 }
5142 break;
5143
5144 case URX_STO_INP_LOC:
5145 {
5146 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5147 fp->fExtra[opValue] = fp->fInputIdx;
5148 }
5149 break;
5150
5151 case URX_JMPX:
5152 {
5153 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5154 fp->fPatIdx += 1;
5155 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
5156 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5157 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5158 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5159 if (savedInputIdx < fp->fInputIdx) {
5160 fp->fPatIdx = opValue; // JMP
5161 } else {
5162 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
5163 }
5164 }
5165 break;
5166
5167 case URX_LA_START:
5168 {
5169 // Entering a lookahead block.
5170 // Save Stack Ptr, Input Pos.
5171 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5172 fData[opValue] = fStack->size();
5173 fData[opValue+1] = fp->fInputIdx;
5174 fActiveStart = fLookStart; // Set the match region change for
5175 fActiveLimit = fLookLimit; // transparent bounds.
5176 }
5177 break;
5178
5179 case URX_LA_END:
5180 {
5181 // Leaving a look-ahead block.
5182 // restore Stack Ptr, Input Pos to positions they had on entry to block.
5183 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5184 int32_t stackSize = fStack->size();
5185 int32_t newStackSize = (int32_t)fData[opValue];
5186 U_ASSERT(stackSize >= newStackSize);
5187 if (stackSize > newStackSize) {
5188 // Copy the current top frame back to the new (cut back) top frame.
5189 // This makes the capture groups from within the look-ahead
5190 // expression available.
5191 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5192 int32_t i;
5193 for (i=0; i<fFrameSize; i++) {
5194 newFP[i] = ((int64_t *)fp)[i];
5195 }
5196 fp = (REStackFrame *)newFP;
5197 fStack->setSize(newStackSize);
5198 }
5199 fp->fInputIdx = fData[opValue+1];
5200
5201 // Restore the active region bounds in the input string; they may have
5202 // been changed because of transparent bounds on a Region.
5203 fActiveStart = fRegionStart;
5204 fActiveLimit = fRegionLimit;
5205 }
5206 break;
5207
5208 case URX_ONECHAR_I:
5209 if (fp->fInputIdx < fActiveLimit) {
5210 UChar32 c;
5211 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5212 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5213 break;
5214 }
5215 } else {
5216 fHitEnd = TRUE;
5217 }
5218 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5219 break;
5220
5221 case URX_STRING_I:
5222 // Case-insensitive test input against a literal string.
5223 // Strings require two slots in the compiled pattern, one for the
5224 // offset to the string text, and one for the length.
5225 // The compiled string has already been case folded.
5226 {
5227 const UChar *patternString = litText + opValue;
5228
5229 op = (int32_t)pat[fp->fPatIdx];
5230 fp->fPatIdx++;
5231 opType = URX_TYPE(op);
5232 opValue = URX_VAL(op);
5233 U_ASSERT(opType == URX_STRING_LEN);
5234 int32_t patternStringLen = opValue; // Length of the string from the pattern.
5235
5236 UChar32 cText;
5237 UChar32 cPattern;
5238 UBool success = TRUE;
5239 int32_t patternStringIdx = 0;
5240 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5241 while (patternStringIdx < patternStringLen) {
5242 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
5243 cText = inputIterator.next();
5244 if (cText != cPattern) {
5245 success = FALSE;
5246 if (cText == U_SENTINEL) {
5247 fHitEnd = TRUE;
5248 }
5249 break;
5250 }
5251 }
5252 if (inputIterator.inExpansion()) {
5253 success = FALSE;
5254 }
5255
5256 if (success) {
5257 fp->fInputIdx = inputIterator.getIndex();
5258 } else {
5259 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5260 }
5261 }
5262 break;
5263
5264 case URX_LB_START:
5265 {
5266 // Entering a look-behind block.
5267 // Save Stack Ptr, Input Pos.
5268 // TODO: implement transparent bounds. Ticket #6067
5269 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5270 fData[opValue] = fStack->size();
5271 fData[opValue+1] = fp->fInputIdx;
5272 // Init the variable containing the start index for attempted matches.
5273 fData[opValue+2] = -1;
5274 // Save input string length, then reset to pin any matches to end at
5275 // the current position.
5276 fData[opValue+3] = fActiveLimit;
5277 fActiveLimit = fp->fInputIdx;
5278 }
5279 break;
5280
5281
5282 case URX_LB_CONT:
5283 {
5284 // Positive Look-Behind, at top of loop checking for matches of LB expression
5285 // at all possible input starting positions.
5286
5287 // Fetch the min and max possible match lengths. They are the operands
5288 // of this op in the pattern.
5289 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5290 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5291 U_ASSERT(minML <= maxML);
5292 U_ASSERT(minML >= 0);
5293
5294 // Fetch (from data) the last input index where a match was attempted.
5295 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5296 int64_t *lbStartIdx = &fData[opValue+2];
5297 if (*lbStartIdx < 0) {
5298 // First time through loop.
5299 *lbStartIdx = fp->fInputIdx - minML;
5300 } else {
5301 // 2nd through nth time through the loop.
5302 // Back up start position for match by one.
5303 if (*lbStartIdx == 0) {
5304 (*lbStartIdx)--;
5305 } else {
5306 U16_BACK_1(inputBuf, 0, *lbStartIdx);
5307 }
5308 }
5309
5310 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5311 // We have tried all potential match starting points without
5312 // getting a match. Backtrack out, and out of the
5313 // Look Behind altogether.
5314 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5315 int64_t restoreInputLen = fData[opValue+3];
5316 U_ASSERT(restoreInputLen >= fActiveLimit);
5317 U_ASSERT(restoreInputLen <= fInputLength);
5318 fActiveLimit = restoreInputLen;
5319 break;
5320 }
5321
5322 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5323 // (successful match will fall off the end of the loop.)
5324 fp = StateSave(fp, fp->fPatIdx-3, status);
5325 fp->fInputIdx = *lbStartIdx;
5326 }
5327 break;
5328
5329 case URX_LB_END:
5330 // End of a look-behind block, after a successful match.
5331 {
5332 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5333 if (fp->fInputIdx != fActiveLimit) {
5334 // The look-behind expression matched, but the match did not
5335 // extend all the way to the point that we are looking behind from.
5336 // FAIL out of here, which will take us back to the LB_CONT, which
5337 // will retry the match starting at another position or fail
5338 // the look-behind altogether, whichever is appropriate.
5339 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5340 break;
5341 }
5342
5343 // Look-behind match is good. Restore the orignal input string length,
5344 // which had been truncated to pin the end of the lookbehind match to the
5345 // position being looked-behind.
5346 int64_t originalInputLen = fData[opValue+3];
5347 U_ASSERT(originalInputLen >= fActiveLimit);
5348 U_ASSERT(originalInputLen <= fInputLength);
5349 fActiveLimit = originalInputLen;
5350 }
5351 break;
5352
5353
5354 case URX_LBN_CONT:
5355 {
5356 // Negative Look-Behind, at top of loop checking for matches of LB expression
5357 // at all possible input starting positions.
5358
5359 // Fetch the extra parameters of this op.
5360 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5361 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5362 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5363 continueLoc = URX_VAL(continueLoc);
5364 U_ASSERT(minML <= maxML);
5365 U_ASSERT(minML >= 0);
5366 U_ASSERT(continueLoc > fp->fPatIdx);
5367
5368 // Fetch (from data) the last input index where a match was attempted.
5369 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5370 int64_t *lbStartIdx = &fData[opValue+2];
5371 if (*lbStartIdx < 0) {
5372 // First time through loop.
5373 *lbStartIdx = fp->fInputIdx - minML;
5374 } else {
5375 // 2nd through nth time through the loop.
5376 // Back up start position for match by one.
5377 if (*lbStartIdx == 0) {
5378 (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0.
5379 } else {
5380 U16_BACK_1(inputBuf, 0, *lbStartIdx);
5381 }
5382 }
5383
5384 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5385 // We have tried all potential match starting points without
5386 // getting a match, which means that the negative lookbehind as
5387 // a whole has succeeded. Jump forward to the continue location
5388 int64_t restoreInputLen = fData[opValue+3];
5389 U_ASSERT(restoreInputLen >= fActiveLimit);
5390 U_ASSERT(restoreInputLen <= fInputLength);
5391 fActiveLimit = restoreInputLen;
5392 fp->fPatIdx = continueLoc;
5393 break;
5394 }
5395
5396 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5397 // (successful match will cause a FAIL out of the loop altogether.)
5398 fp = StateSave(fp, fp->fPatIdx-4, status);
5399 fp->fInputIdx = *lbStartIdx;
5400 }
5401 break;
5402
5403 case URX_LBN_END:
5404 // End of a negative look-behind block, after a successful match.
5405 {
5406 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5407 if (fp->fInputIdx != fActiveLimit) {
5408 // The look-behind expression matched, but the match did not
5409 // extend all the way to the point that we are looking behind from.
5410 // FAIL out of here, which will take us back to the LB_CONT, which
5411 // will retry the match starting at another position or succeed
5412 // the look-behind altogether, whichever is appropriate.
5413 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5414 break;
5415 }
5416
5417 // Look-behind expression matched, which means look-behind test as
5418 // a whole Fails
5419
5420 // Restore the orignal input string length, which had been truncated
5421 // inorder to pin the end of the lookbehind match
5422 // to the position being looked-behind.
5423 int64_t originalInputLen = fData[opValue+3];
5424 U_ASSERT(originalInputLen >= fActiveLimit);
5425 U_ASSERT(originalInputLen <= fInputLength);
5426 fActiveLimit = originalInputLen;
5427
5428 // Restore original stack position, discarding any state saved
5429 // by the successful pattern match.
5430 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5431 int32_t newStackSize = (int32_t)fData[opValue];
5432 U_ASSERT(fStack->size() > newStackSize);
5433 fStack->setSize(newStackSize);
5434
5435 // FAIL, which will take control back to someplace
5436 // prior to entering the look-behind test.
5437 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5438 }
5439 break;
5440
5441
5442 case URX_LOOP_SR_I:
5443 // Loop Initialization for the optimized implementation of
5444 // [some character set]*
5445 // This op scans through all matching input.
5446 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5447 {
5448 U_ASSERT(opValue > 0 && opValue < sets->size());
5449 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5450 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
5451
5452 // Loop through input, until either the input is exhausted or
5453 // we reach a character that is not a member of the set.
5454 int32_t ix = (int32_t)fp->fInputIdx;
5455 for (;;) {
5456 if (ix >= fActiveLimit) {
5457 fHitEnd = TRUE;
5458 break;
5459 }
5460 UChar32 c;
5461 U16_NEXT(inputBuf, ix, fActiveLimit, c);
5462 if (c<256) {
5463 if (s8->contains(c) == FALSE) {
5464 U16_BACK_1(inputBuf, 0, ix);
5465 break;
5466 }
5467 } else {
5468 if (s->contains(c) == FALSE) {
5469 U16_BACK_1(inputBuf, 0, ix);
5470 break;
5471 }
5472 }
5473 }
5474
5475 // If there were no matching characters, skip over the loop altogether.
5476 // The loop doesn't run at all, a * op always succeeds.
5477 if (ix == fp->fInputIdx) {
5478 fp->fPatIdx++; // skip the URX_LOOP_C op.
5479 break;
5480 }
5481
5482 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5483 // must follow. It's operand is the stack location
5484 // that holds the starting input index for the match of this [set]*
5485 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5486 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5487 int32_t stackLoc = URX_VAL(loopcOp);
5488 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5489 fp->fExtra[stackLoc] = fp->fInputIdx;
5490 fp->fInputIdx = ix;
5491
5492 // Save State to the URX_LOOP_C op that follows this one,
5493 // so that match failures in the following code will return to there.
5494 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5495 fp = StateSave(fp, fp->fPatIdx, status);
5496 fp->fPatIdx++;
5497 }
5498 break;
5499
5500
5501 case URX_LOOP_DOT_I:
5502 // Loop Initialization for the optimized implementation of .*
5503 // This op scans through all remaining input.
5504 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5505 {
5506 // Loop through input until the input is exhausted (we reach an end-of-line)
5507 // In DOTALL mode, we can just go straight to the end of the input.
5508 int32_t ix;
5509 if ((opValue & 1) == 1) {
5510 // Dot-matches-All mode. Jump straight to the end of the string.
5511 ix = (int32_t)fActiveLimit;
5512 fHitEnd = TRUE;
5513 } else {
5514 // NOT DOT ALL mode. Line endings do not match '.'
5515 // Scan forward until a line ending or end of input.
5516 ix = (int32_t)fp->fInputIdx;
5517 for (;;) {
5518 if (ix >= fActiveLimit) {
5519 fHitEnd = TRUE;
5520 break;
5521 }
5522 UChar32 c;
5523 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++]
5524 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
5525 if ((c == 0x0a) || // 0x0a is newline in both modes.
5526 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
5527 ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
5528 // char is a line ending. Put the input pos back to the
5529 // line ending char, and exit the scanning loop.
5530 U16_BACK_1(inputBuf, 0, ix);
5531 break;
5532 }
5533 }
5534 }
5535 }
5536
5537 // If there were no matching characters, skip over the loop altogether.
5538 // The loop doesn't run at all, a * op always succeeds.
5539 if (ix == fp->fInputIdx) {
5540 fp->fPatIdx++; // skip the URX_LOOP_C op.
5541 break;
5542 }
5543
5544 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5545 // must follow. It's operand is the stack location
5546 // that holds the starting input index for the match of this .*
5547 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5548 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5549 int32_t stackLoc = URX_VAL(loopcOp);
5550 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5551 fp->fExtra[stackLoc] = fp->fInputIdx;
5552 fp->fInputIdx = ix;
5553
5554 // Save State to the URX_LOOP_C op that follows this one,
5555 // so that match failures in the following code will return to there.
5556 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5557 fp = StateSave(fp, fp->fPatIdx, status);
5558 fp->fPatIdx++;
5559 }
5560 break;
5561
5562
5563 case URX_LOOP_C:
5564 {
5565 U_ASSERT(opValue>=0 && opValue<fFrameSize);
5566 backSearchIndex = (int32_t)fp->fExtra[opValue];
5567 U_ASSERT(backSearchIndex <= fp->fInputIdx);
5568 if (backSearchIndex == fp->fInputIdx) {
5569 // We've backed up the input idx to the point that the loop started.
5570 // The loop is done. Leave here without saving state.
5571 // Subsequent failures won't come back here.
5572 break;
5573 }
5574 // Set up for the next iteration of the loop, with input index
5575 // backed up by one from the last time through,
5576 // and a state save to this instruction in case the following code fails again.
5577 // (We're going backwards because this loop emulates stack unwinding, not
5578 // the initial scan forward.)
5579 U_ASSERT(fp->fInputIdx > 0);
5580 UChar32 prevC;
5581 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
5582
5583 if (prevC == 0x0a &&
5584 fp->fInputIdx > backSearchIndex &&
5585 inputBuf[fp->fInputIdx-1] == 0x0d) {
5586 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
5587 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
5588 // .*, stepping back over CRLF pair.
5589 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
5590 }
5591 }
5592
5593
5594 fp = StateSave(fp, fp->fPatIdx-1, status);
5595 }
5596 break;
5597
5598
5599
5600 default:
5601 // Trouble. The compiled pattern contains an entry with an
5602 // unrecognized type tag.
5603 U_ASSERT(FALSE);
5604 }
5605
5606 if (U_FAILURE(status)) {
5607 isMatch = FALSE;
5608 break;
5609 }
5610 }
5611
5612 breakFromLoop:
5613 fMatch = isMatch;
5614 if (isMatch) {
5615 fLastMatchEnd = fMatchEnd;
5616 fMatchStart = startIdx;
5617 fMatchEnd = fp->fInputIdx;
5618 }
5619
5620 #ifdef REGEX_RUN_DEBUG
5621 if (fTraceDebug) {
5622 if (isMatch) {
5623 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
5624 } else {
5625 printf("No match\n\n");
5626 }
5627 }
5628 #endif
5629
5630 fFrame = fp; // The active stack frame when the engine stopped.
5631 // Contains the capture group results that we need to
5632 // access later.
5633
5634 return;
5635 }
5636
5637
5638 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
5639
5640 U_NAMESPACE_END
5641
5642 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS