]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/rematch.cpp
ICU-461.12.tar.gz
[apple/icu.git] / icuSources / i18n / rematch.cpp
1 /*
2 **************************************************************************
3 * Copyright (C) 2002-2010 International Business Machines Corporation *
4 * and others. All rights reserved. *
5 **************************************************************************
6 */
7 //
8 // file: rematch.cpp
9 //
10 // Contains the implementation of class RegexMatcher,
11 // which is one of the main API classes for the ICU regular expression package.
12 //
13
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
16
17 #include "unicode/regex.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ustring.h"
21 #include "unicode/rbbi.h"
22 #include "uassert.h"
23 #include "cmemory.h"
24 #include "uvector.h"
25 #include "uvectr32.h"
26 #include "uvectr64.h"
27 #include "regeximp.h"
28 #include "regexst.h"
29 #include "regextxt.h"
30 #include "ucase.h"
31
32 // #include <malloc.h> // Needed for heapcheck testing
33
34
35 // Find progress callback
36 // ----------------------
37 // Macro to inline test & call to ReportFindProgress(). Eliminates unnecessary function call.
38 //
39 #define REGEXFINDPROGRESS_INTERRUPT(pos, status) \
40 (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE)
41
42
43 // Smart Backtracking
44 // ------------------
45 // When a failure would go back to a LOOP_C instruction,
46 // strings, characters, and setrefs scan backwards for a valid start
47 // character themselves, pop the stack, and save state, emulating the
48 // LOOP_C's effect but assured that the next character of input is a
49 // possible matching character.
50 //
51 // Good idea in theory; unfortunately it only helps out a few specific
52 // cases and slows the engine down a little in the rest.
53
54 //#define REGEX_SMART_BACKTRACKING 1
55
56 U_NAMESPACE_BEGIN
57
58 // Default limit for the size of the back track stack, to avoid system
59 // failures causedby heap exhaustion. Units are in 32 bit words, not bytes.
60 // This value puts ICU's limits higher than most other regexp implementations,
61 // which use recursion rather than the heap, and take more storage per
62 // backtrack point.
63 //
64 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
65
66 // Time limit counter constant.
67 // Time limits for expression evaluation are in terms of quanta of work by
68 // the engine, each of which is 10,000 state saves.
69 // This constant determines that state saves per tick number.
70 static const int32_t TIMER_INITIAL_VALUE = 10000;
71
72 //-----------------------------------------------------------------------------
73 //
74 // Constructor and Destructor
75 //
76 //-----------------------------------------------------------------------------
77 RegexMatcher::RegexMatcher(const RegexPattern *pat) {
78 fDeferredStatus = U_ZERO_ERROR;
79 init(fDeferredStatus);
80 if (U_FAILURE(fDeferredStatus)) {
81 return;
82 }
83 if (pat==NULL) {
84 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
85 return;
86 }
87 fPattern = pat;
88 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
89 }
90
91
92
93 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
94 uint32_t flags, UErrorCode &status) {
95 init(status);
96 if (U_FAILURE(status)) {
97 return;
98 }
99 UParseError pe;
100 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
101 fPattern = fPatternOwned;
102
103 UText inputText = UTEXT_INITIALIZER;
104 utext_openConstUnicodeString(&inputText, &input, &status);
105 init2(&inputText, status);
106 utext_close(&inputText);
107
108 fInputUniStrMaybeMutable = TRUE;
109 }
110
111
112 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
113 uint32_t flags, UErrorCode &status) {
114 init(status);
115 if (U_FAILURE(status)) {
116 return;
117 }
118 UParseError pe;
119 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
120 if (U_FAILURE(status)) {
121 return;
122 }
123
124 fPattern = fPatternOwned;
125 init2(input, status);
126 }
127
128
129 RegexMatcher::RegexMatcher(const UnicodeString &regexp,
130 uint32_t flags, UErrorCode &status) {
131 init(status);
132 if (U_FAILURE(status)) {
133 return;
134 }
135 UParseError pe;
136 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
137 if (U_FAILURE(status)) {
138 return;
139 }
140 fPattern = fPatternOwned;
141 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
142 }
143
144 RegexMatcher::RegexMatcher(UText *regexp,
145 uint32_t flags, UErrorCode &status) {
146 init(status);
147 if (U_FAILURE(status)) {
148 return;
149 }
150 UParseError pe;
151 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
152 if (U_FAILURE(status)) {
153 return;
154 }
155
156 fPattern = fPatternOwned;
157 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
158 }
159
160
161
162
163 RegexMatcher::~RegexMatcher() {
164 delete fStack;
165 if (fData != fSmallData) {
166 uprv_free(fData);
167 fData = NULL;
168 }
169 if (fPatternOwned) {
170 delete fPatternOwned;
171 fPatternOwned = NULL;
172 fPattern = NULL;
173 }
174
175 if (fInput) {
176 delete fInput;
177 }
178 if (fInputText) {
179 utext_close(fInputText);
180 }
181 if (fAltInputText) {
182 utext_close(fAltInputText);
183 }
184
185 #if UCONFIG_NO_BREAK_ITERATION==0
186 delete fWordBreakItr;
187 #endif
188 }
189
190 //
191 // init() common initialization for use by all constructors.
192 // Initialize all fields, get the object into a consistent state.
193 // This must be done even when the initial status shows an error,
194 // so that the object is initialized sufficiently well for the destructor
195 // to run safely.
196 //
197 void RegexMatcher::init(UErrorCode &status) {
198 fPattern = NULL;
199 fPatternOwned = NULL;
200 fFrameSize = 0;
201 fRegionStart = 0;
202 fRegionLimit = 0;
203 fAnchorStart = 0;
204 fAnchorLimit = 0;
205 fLookStart = 0;
206 fLookLimit = 0;
207 fActiveStart = 0;
208 fActiveLimit = 0;
209 fTransparentBounds = FALSE;
210 fAnchoringBounds = TRUE;
211 fMatch = FALSE;
212 fMatchStart = 0;
213 fMatchEnd = 0;
214 fLastMatchEnd = -1;
215 fAppendPosition = 0;
216 fHitEnd = FALSE;
217 fRequireEnd = FALSE;
218 fStack = NULL;
219 fFrame = NULL;
220 fTimeLimit = 0;
221 fTime = 0;
222 fTickCounter = 0;
223 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY;
224 fCallbackFn = NULL;
225 fCallbackContext = NULL;
226 fFindProgressCallbackFn = NULL;
227 fFindProgressCallbackContext = NULL;
228 fTraceDebug = FALSE;
229 fDeferredStatus = status;
230 fData = fSmallData;
231 fWordBreakItr = NULL;
232
233 fStack = new UVector64(status);
234 fInputText = NULL;
235 fAltInputText = NULL;
236 fInput = NULL;
237 fInputLength = 0;
238 fInputUniStrMaybeMutable = FALSE;
239
240 if (U_FAILURE(status)) {
241 fDeferredStatus = status;
242 }
243 }
244
245 //
246 // init2() Common initialization for use by RegexMatcher constructors, part 2.
247 // This handles the common setup to be done after the Pattern is available.
248 //
249 void RegexMatcher::init2(UText *input, UErrorCode &status) {
250 if (U_FAILURE(status)) {
251 fDeferredStatus = status;
252 return;
253 }
254
255 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
256 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
257 if (fData == NULL) {
258 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
259 return;
260 }
261 }
262
263 reset(input);
264 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
265 if (U_FAILURE(status)) {
266 fDeferredStatus = status;
267 return;
268 }
269 }
270
271
272 static const UChar BACKSLASH = 0x5c;
273 static const UChar DOLLARSIGN = 0x24;
274 //--------------------------------------------------------------------------------
275 //
276 // appendReplacement
277 //
278 //--------------------------------------------------------------------------------
279 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
280 const UnicodeString &replacement,
281 UErrorCode &status) {
282 UText replacementText = UTEXT_INITIALIZER;
283
284 utext_openConstUnicodeString(&replacementText, &replacement, &status);
285 if (U_SUCCESS(status)) {
286 UText resultText = UTEXT_INITIALIZER;
287 utext_openUnicodeString(&resultText, &dest, &status);
288
289 if (U_SUCCESS(status)) {
290 appendReplacement(&resultText, &replacementText, status);
291 utext_close(&resultText);
292 }
293 utext_close(&replacementText);
294 }
295
296 return *this;
297 }
298
299 //
300 // appendReplacement, UText mode
301 //
302 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
303 UText *replacement,
304 UErrorCode &status) {
305 if (U_FAILURE(status)) {
306 return *this;
307 }
308 if (U_FAILURE(fDeferredStatus)) {
309 status = fDeferredStatus;
310 return *this;
311 }
312 if (fMatch == FALSE) {
313 status = U_REGEX_INVALID_STATE;
314 return *this;
315 }
316
317 // Copy input string from the end of previous match to start of current match
318 int64_t destLen = utext_nativeLength(dest);
319 if (fMatchStart > fAppendPosition) {
320 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
321 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
322 (int32_t)(fMatchStart-fAppendPosition), &status);
323 } else {
324 int32_t len16;
325 if (UTEXT_USES_U16(fInputText)) {
326 len16 = (int32_t)(fMatchStart-fAppendPosition);
327 } else {
328 UErrorCode lengthStatus = U_ZERO_ERROR;
329 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
330 }
331 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
332 if (inputChars == NULL) {
333 status = U_MEMORY_ALLOCATION_ERROR;
334 return *this;
335 }
336 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
337 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
338 uprv_free(inputChars);
339 }
340 }
341 fAppendPosition = fMatchEnd;
342
343
344 // scan the replacement text, looking for substitutions ($n) and \escapes.
345 // TODO: optimize this loop by efficiently scanning for '$' or '\',
346 // move entire ranges not containing substitutions.
347 UTEXT_SETNATIVEINDEX(replacement, 0);
348 UChar32 c = UTEXT_NEXT32(replacement);
349 while (c != U_SENTINEL) {
350 if (c == BACKSLASH) {
351 // Backslash Escape. Copy the following char out without further checks.
352 // Note: Surrogate pairs don't need any special handling
353 // The second half wont be a '$' or a '\', and
354 // will move to the dest normally on the next
355 // loop iteration.
356 c = UTEXT_CURRENT32(replacement);
357 if (c == U_SENTINEL) {
358 break;
359 }
360
361 if (c==0x55/*U*/ || c==0x75/*u*/) {
362 // We have a \udddd or \Udddddddd escape sequence.
363 int32_t offset = 0;
364 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
365 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
366 if (escapedChar != (UChar32)0xFFFFFFFF) {
367 if (U_IS_BMP(escapedChar)) {
368 UChar c16 = (UChar)escapedChar;
369 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
370 } else {
371 UChar surrogate[2];
372 surrogate[0] = U16_LEAD(escapedChar);
373 surrogate[1] = U16_TRAIL(escapedChar);
374 if (U_SUCCESS(status)) {
375 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
376 }
377 }
378 // TODO: Report errors for mal-formed \u escapes?
379 // As this is, the original sequence is output, which may be OK.
380 if (context.lastOffset == offset) {
381 UTEXT_PREVIOUS32(replacement);
382 } else if (context.lastOffset != offset-1) {
383 utext_moveIndex32(replacement, offset - context.lastOffset - 1);
384 }
385 }
386 } else {
387 UTEXT_NEXT32(replacement);
388 // Plain backslash escape. Just put out the escaped character.
389 if (U_IS_BMP(c)) {
390 UChar c16 = (UChar)c;
391 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
392 } else {
393 UChar surrogate[2];
394 surrogate[0] = U16_LEAD(c);
395 surrogate[1] = U16_TRAIL(c);
396 if (U_SUCCESS(status)) {
397 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
398 }
399 }
400 }
401 } else if (c != DOLLARSIGN) {
402 // Normal char, not a $. Copy it out without further checks.
403 if (U_IS_BMP(c)) {
404 UChar c16 = (UChar)c;
405 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
406 } else {
407 UChar surrogate[2];
408 surrogate[0] = U16_LEAD(c);
409 surrogate[1] = U16_TRAIL(c);
410 if (U_SUCCESS(status)) {
411 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
412 }
413 }
414 } else {
415 // We've got a $. Pick up a capture group number if one follows.
416 // Consume at most the number of digits necessary for the largest capture
417 // number that is valid for this pattern.
418
419 int32_t numDigits = 0;
420 int32_t groupNum = 0;
421 UChar32 digitC;
422 for (;;) {
423 digitC = UTEXT_CURRENT32(replacement);
424 if (digitC == U_SENTINEL) {
425 break;
426 }
427 if (u_isdigit(digitC) == FALSE) {
428 break;
429 }
430 UTEXT_NEXT32(replacement);
431 groupNum=groupNum*10 + u_charDigitValue(digitC);
432 numDigits++;
433 if (numDigits >= fPattern->fMaxCaptureDigits) {
434 break;
435 }
436 }
437
438
439 if (numDigits == 0) {
440 // The $ didn't introduce a group number at all.
441 // Treat it as just part of the substitution text.
442 UChar c16 = DOLLARSIGN;
443 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
444 } else {
445 // Finally, append the capture group data to the destination.
446 destLen += appendGroup(groupNum, dest, status);
447 if (U_FAILURE(status)) {
448 // Can fail if group number is out of range.
449 break;
450 }
451 }
452 }
453
454 if (U_FAILURE(status)) {
455 break;
456 } else {
457 c = UTEXT_NEXT32(replacement);
458 }
459 }
460
461 return *this;
462 }
463
464
465
466 //--------------------------------------------------------------------------------
467 //
468 // appendTail Intended to be used in conjunction with appendReplacement()
469 // To the destination string, append everything following
470 // the last match position from the input string.
471 //
472 // Note: Match ranges do not affect appendTail or appendReplacement
473 //
474 //--------------------------------------------------------------------------------
475 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
476 UErrorCode status = U_ZERO_ERROR;
477 UText resultText = UTEXT_INITIALIZER;
478 utext_openUnicodeString(&resultText, &dest, &status);
479
480 if (U_SUCCESS(status)) {
481 appendTail(&resultText, status);
482 utext_close(&resultText);
483 }
484
485 return dest;
486 }
487
488 //
489 // appendTail, UText mode
490 //
491 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
492 UBool bailOut = FALSE;
493 if (U_FAILURE(status)) {
494 bailOut = TRUE;
495 }
496 if (U_FAILURE(fDeferredStatus)) {
497 status = fDeferredStatus;
498 bailOut = TRUE;
499 }
500
501 if (bailOut) {
502 // dest must not be NULL
503 if (dest) {
504 utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(dest), NULL, 0, &status);
505 return dest;
506 }
507 }
508
509 if (fInputLength > fAppendPosition) {
510 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
511 int64_t destLen = utext_nativeLength(dest);
512 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
513 (int32_t)(fInputLength-fAppendPosition), &status);
514 } else {
515 int32_t len16;
516 if (UTEXT_USES_U16(fInputText)) {
517 len16 = (int32_t)(fInputLength-fAppendPosition);
518 } else {
519 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
520 status = U_ZERO_ERROR; // buffer overflow
521 }
522
523 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
524 if (inputChars == NULL) {
525 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
526 } else {
527 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
528 int64_t destLen = utext_nativeLength(dest);
529 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
530 uprv_free(inputChars);
531 }
532 }
533 }
534 return dest;
535 }
536
537
538
539 //--------------------------------------------------------------------------------
540 //
541 // end
542 //
543 //--------------------------------------------------------------------------------
544 int32_t RegexMatcher::end(UErrorCode &err) const {
545 return end(0, err);
546 }
547
548 int64_t RegexMatcher::end64(UErrorCode &err) const {
549 return end64(0, err);
550 }
551
552 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
553 if (U_FAILURE(err)) {
554 return -1;
555 }
556 if (fMatch == FALSE) {
557 err = U_REGEX_INVALID_STATE;
558 return -1;
559 }
560 if (group < 0 || group > fPattern->fGroupMap->size()) {
561 err = U_INDEX_OUTOFBOUNDS_ERROR;
562 return -1;
563 }
564 int64_t e = -1;
565 if (group == 0) {
566 e = fMatchEnd;
567 } else {
568 // Get the position within the stack frame of the variables for
569 // this capture group.
570 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
571 U_ASSERT(groupOffset < fPattern->fFrameSize);
572 U_ASSERT(groupOffset >= 0);
573 e = fFrame->fExtra[groupOffset + 1];
574 }
575
576 return e;
577 }
578
579 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
580 return (int32_t)end64(group, err);
581 }
582
583
584 //--------------------------------------------------------------------------------
585 //
586 // find()
587 //
588 //--------------------------------------------------------------------------------
589 UBool RegexMatcher::find() {
590 // Start at the position of the last match end. (Will be zero if the
591 // matcher has been reset.)
592 //
593 if (U_FAILURE(fDeferredStatus)) {
594 return FALSE;
595 }
596
597 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
598 return findUsingChunk();
599 }
600
601 int64_t startPos = fMatchEnd;
602 if (startPos==0) {
603 startPos = fActiveStart;
604 }
605
606 if (fMatch) {
607 // Save the position of any previous successful match.
608 fLastMatchEnd = fMatchEnd;
609
610 if (fMatchStart == fMatchEnd) {
611 // Previous match had zero length. Move start position up one position
612 // to avoid sending find() into a loop on zero-length matches.
613 if (startPos >= fActiveLimit) {
614 fMatch = FALSE;
615 fHitEnd = TRUE;
616 return FALSE;
617 }
618 UTEXT_SETNATIVEINDEX(fInputText, startPos);
619 UTEXT_NEXT32(fInputText);
620 startPos = UTEXT_GETNATIVEINDEX(fInputText);
621 }
622 } else {
623 if (fLastMatchEnd >= 0) {
624 // A previous find() failed to match. Don't try again.
625 // (without this test, a pattern with a zero-length match
626 // could match again at the end of an input string.)
627 fHitEnd = TRUE;
628 return FALSE;
629 }
630 }
631
632
633 // Compute the position in the input string beyond which a match can not begin, because
634 // the minimum length match would extend past the end of the input.
635 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
636 // Be aware of possible overflows if making changes here.
637 int64_t testStartLimit;
638 if (UTEXT_USES_U16(fInputText)) {
639 testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
640 if (startPos > testStartLimit) {
641 fMatch = FALSE;
642 fHitEnd = TRUE;
643 return FALSE;
644 }
645 } else {
646 // For now, let the matcher discover that it can't match on its own
647 // We don't know how long the match len is in native characters
648 testStartLimit = fActiveLimit;
649 }
650
651 UChar32 c;
652 U_ASSERT(startPos >= 0);
653
654 switch (fPattern->fStartType) {
655 case START_NO_INFO:
656 // No optimization was found.
657 // Try a match at each input position.
658 for (;;) {
659 MatchAt(startPos, FALSE, fDeferredStatus);
660 if (U_FAILURE(fDeferredStatus)) {
661 return FALSE;
662 }
663 if (fMatch) {
664 return TRUE;
665 }
666 if (startPos >= testStartLimit) {
667 fHitEnd = TRUE;
668 return FALSE;
669 }
670 UTEXT_SETNATIVEINDEX(fInputText, startPos);
671 UTEXT_NEXT32(fInputText);
672 startPos = UTEXT_GETNATIVEINDEX(fInputText);
673 // Note that it's perfectly OK for a pattern to have a zero-length
674 // match at the end of a string, so we must make sure that the loop
675 // runs with startPos == testStartLimit the last time through.
676 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
677 return FALSE;
678 }
679 U_ASSERT(FALSE);
680
681 case START_START:
682 // Matches are only possible at the start of the input string
683 // (pattern begins with ^ or \A)
684 if (startPos > fActiveStart) {
685 fMatch = FALSE;
686 return FALSE;
687 }
688 MatchAt(startPos, FALSE, fDeferredStatus);
689 if (U_FAILURE(fDeferredStatus)) {
690 return FALSE;
691 }
692 return fMatch;
693
694
695 case START_SET:
696 {
697 // Match may start on any char from a pre-computed set.
698 U_ASSERT(fPattern->fMinMatchLen > 0);
699 int64_t pos;
700 UTEXT_SETNATIVEINDEX(fInputText, startPos);
701 for (;;) {
702 c = UTEXT_NEXT32(fInputText);
703 pos = UTEXT_GETNATIVEINDEX(fInputText);
704 // c will be -1 (U_SENTINEL) at end of text, in which case we
705 // skip this next block (so we don't have a negative array index)
706 // and handle end of text in the following block.
707 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
708 (c>=256 && fPattern->fInitialChars->contains(c)))) {
709 MatchAt(startPos, FALSE, fDeferredStatus);
710 if (U_FAILURE(fDeferredStatus)) {
711 return FALSE;
712 }
713 if (fMatch) {
714 return TRUE;
715 }
716 UTEXT_SETNATIVEINDEX(fInputText, pos);
717 }
718 if (startPos >= testStartLimit) {
719 fMatch = FALSE;
720 fHitEnd = TRUE;
721 return FALSE;
722 }
723 startPos = pos;
724 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
725 return FALSE;
726 }
727 }
728 U_ASSERT(FALSE);
729
730 case START_STRING:
731 case START_CHAR:
732 {
733 // Match starts on exactly one char.
734 U_ASSERT(fPattern->fMinMatchLen > 0);
735 UChar32 theChar = fPattern->fInitialChar;
736 int64_t pos;
737 UTEXT_SETNATIVEINDEX(fInputText, startPos);
738 for (;;) {
739 c = UTEXT_NEXT32(fInputText);
740 pos = UTEXT_GETNATIVEINDEX(fInputText);
741 if (c == theChar) {
742 MatchAt(startPos, FALSE, fDeferredStatus);
743 if (U_FAILURE(fDeferredStatus)) {
744 return FALSE;
745 }
746 if (fMatch) {
747 return TRUE;
748 }
749 UTEXT_SETNATIVEINDEX(fInputText, pos);
750 }
751 if (startPos >= testStartLimit) {
752 fMatch = FALSE;
753 fHitEnd = TRUE;
754 return FALSE;
755 }
756 startPos = pos;
757 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
758 return FALSE;
759 }
760 }
761 U_ASSERT(FALSE);
762
763 case START_LINE:
764 {
765 UChar32 c;
766 if (startPos == fAnchorStart) {
767 MatchAt(startPos, FALSE, fDeferredStatus);
768 if (U_FAILURE(fDeferredStatus)) {
769 return FALSE;
770 }
771 if (fMatch) {
772 return TRUE;
773 }
774 UTEXT_SETNATIVEINDEX(fInputText, startPos);
775 c = UTEXT_NEXT32(fInputText);
776 startPos = UTEXT_GETNATIVEINDEX(fInputText);
777 } else {
778 UTEXT_SETNATIVEINDEX(fInputText, startPos);
779 c = UTEXT_PREVIOUS32(fInputText);
780 UTEXT_SETNATIVEINDEX(fInputText, startPos);
781 }
782
783 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
784 for (;;) {
785 if (c == 0x0a) {
786 MatchAt(startPos, FALSE, fDeferredStatus);
787 if (U_FAILURE(fDeferredStatus)) {
788 return FALSE;
789 }
790 if (fMatch) {
791 return TRUE;
792 }
793 UTEXT_SETNATIVEINDEX(fInputText, startPos);
794 }
795 if (startPos >= testStartLimit) {
796 fMatch = FALSE;
797 fHitEnd = TRUE;
798 return FALSE;
799 }
800 c = UTEXT_NEXT32(fInputText);
801 startPos = UTEXT_GETNATIVEINDEX(fInputText);
802 // Note that it's perfectly OK for a pattern to have a zero-length
803 // match at the end of a string, so we must make sure that the loop
804 // runs with startPos == testStartLimit the last time through.
805 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
806 return FALSE;
807 }
808 } else {
809 for (;;) {
810 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
811 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
812 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
813 UTEXT_NEXT32(fInputText);
814 startPos = UTEXT_GETNATIVEINDEX(fInputText);
815 }
816 MatchAt(startPos, FALSE, fDeferredStatus);
817 if (U_FAILURE(fDeferredStatus)) {
818 return FALSE;
819 }
820 if (fMatch) {
821 return TRUE;
822 }
823 UTEXT_SETNATIVEINDEX(fInputText, startPos);
824 }
825 if (startPos >= testStartLimit) {
826 fMatch = FALSE;
827 fHitEnd = TRUE;
828 return FALSE;
829 }
830 c = UTEXT_NEXT32(fInputText);
831 startPos = UTEXT_GETNATIVEINDEX(fInputText);
832 // Note that it's perfectly OK for a pattern to have a zero-length
833 // match at the end of a string, so we must make sure that the loop
834 // runs with startPos == testStartLimit the last time through.
835 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
836 return FALSE;
837 }
838 }
839 }
840
841 default:
842 U_ASSERT(FALSE);
843 }
844
845 U_ASSERT(FALSE);
846 return FALSE;
847 }
848
849
850
851 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
852 if (U_FAILURE(status)) {
853 return FALSE;
854 }
855 if (U_FAILURE(fDeferredStatus)) {
856 status = fDeferredStatus;
857 return FALSE;
858 }
859 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
860 // This will reset the region to be the full input length.
861 if (start < 0) {
862 status = U_INDEX_OUTOFBOUNDS_ERROR;
863 return FALSE;
864 }
865
866 int64_t nativeStart = start;
867 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
868 status = U_INDEX_OUTOFBOUNDS_ERROR;
869 return FALSE;
870 }
871 fMatchEnd = nativeStart;
872 return find();
873 }
874
875
876 //--------------------------------------------------------------------------------
877 //
878 // findUsingChunk() -- like find(), but with the advance knowledge that the
879 // entire string is available in the UText's chunk buffer.
880 //
881 //--------------------------------------------------------------------------------
882 UBool RegexMatcher::findUsingChunk() {
883 // Start at the position of the last match end. (Will be zero if the
884 // matcher has been reset.
885 //
886
887 int32_t startPos = (int32_t)fMatchEnd;
888 if (startPos==0) {
889 startPos = (int32_t)fActiveStart;
890 }
891
892 const UChar *inputBuf = fInputText->chunkContents;
893
894 if (fMatch) {
895 // Save the position of any previous successful match.
896 fLastMatchEnd = fMatchEnd;
897
898 if (fMatchStart == fMatchEnd) {
899 // Previous match had zero length. Move start position up one position
900 // to avoid sending find() into a loop on zero-length matches.
901 if (startPos >= fActiveLimit) {
902 fMatch = FALSE;
903 fHitEnd = TRUE;
904 return FALSE;
905 }
906 U16_FWD_1(inputBuf, startPos, fInputLength);
907 }
908 } else {
909 if (fLastMatchEnd >= 0) {
910 // A previous find() failed to match. Don't try again.
911 // (without this test, a pattern with a zero-length match
912 // could match again at the end of an input string.)
913 fHitEnd = TRUE;
914 return FALSE;
915 }
916 }
917
918
919 // Compute the position in the input string beyond which a match can not begin, because
920 // the minimum length match would extend past the end of the input.
921 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
922 // Be aware of possible overflows if making changes here.
923 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
924 if (startPos > testLen) {
925 fMatch = FALSE;
926 fHitEnd = TRUE;
927 return FALSE;
928 }
929
930 UChar32 c;
931 U_ASSERT(startPos >= 0);
932
933 switch (fPattern->fStartType) {
934 case START_NO_INFO:
935 // No optimization was found.
936 // Try a match at each input position.
937 for (;;) {
938 MatchChunkAt(startPos, FALSE, fDeferredStatus);
939 if (U_FAILURE(fDeferredStatus)) {
940 return FALSE;
941 }
942 if (fMatch) {
943 return TRUE;
944 }
945 if (startPos >= testLen) {
946 fHitEnd = TRUE;
947 return FALSE;
948 }
949 U16_FWD_1(inputBuf, startPos, fActiveLimit);
950 // Note that it's perfectly OK for a pattern to have a zero-length
951 // match at the end of a string, so we must make sure that the loop
952 // runs with startPos == testLen the last time through.
953 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
954 return FALSE;
955 }
956 U_ASSERT(FALSE);
957
958 case START_START:
959 // Matches are only possible at the start of the input string
960 // (pattern begins with ^ or \A)
961 if (startPos > fActiveStart) {
962 fMatch = FALSE;
963 return FALSE;
964 }
965 MatchChunkAt(startPos, FALSE, fDeferredStatus);
966 if (U_FAILURE(fDeferredStatus)) {
967 return FALSE;
968 }
969 return fMatch;
970
971
972 case START_SET:
973 {
974 // Match may start on any char from a pre-computed set.
975 U_ASSERT(fPattern->fMinMatchLen > 0);
976 for (;;) {
977 int32_t pos = startPos;
978 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
979 if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
980 (c>=256 && fPattern->fInitialChars->contains(c))) {
981 MatchChunkAt(pos, FALSE, fDeferredStatus);
982 if (U_FAILURE(fDeferredStatus)) {
983 return FALSE;
984 }
985 if (fMatch) {
986 return TRUE;
987 }
988 }
989 if (pos >= testLen) {
990 fMatch = FALSE;
991 fHitEnd = TRUE;
992 return FALSE;
993 }
994 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
995 return FALSE;
996 }
997 }
998 U_ASSERT(FALSE);
999
1000 case START_STRING:
1001 case START_CHAR:
1002 {
1003 // Match starts on exactly one char.
1004 U_ASSERT(fPattern->fMinMatchLen > 0);
1005 UChar32 theChar = fPattern->fInitialChar;
1006 for (;;) {
1007 int32_t pos = startPos;
1008 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1009 if (c == theChar) {
1010 MatchChunkAt(pos, FALSE, fDeferredStatus);
1011 if (U_FAILURE(fDeferredStatus)) {
1012 return FALSE;
1013 }
1014 if (fMatch) {
1015 return TRUE;
1016 }
1017 }
1018 if (pos >= testLen) {
1019 fMatch = FALSE;
1020 fHitEnd = TRUE;
1021 return FALSE;
1022 }
1023 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1024 return FALSE;
1025 }
1026 }
1027 U_ASSERT(FALSE);
1028
1029 case START_LINE:
1030 {
1031 UChar32 c;
1032 if (startPos == fAnchorStart) {
1033 MatchChunkAt(startPos, FALSE, fDeferredStatus);
1034 if (U_FAILURE(fDeferredStatus)) {
1035 return FALSE;
1036 }
1037 if (fMatch) {
1038 return TRUE;
1039 }
1040 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1041 }
1042
1043 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1044 for (;;) {
1045 c = inputBuf[startPos-1];
1046 if (c == 0x0a) {
1047 MatchChunkAt(startPos, FALSE, fDeferredStatus);
1048 if (U_FAILURE(fDeferredStatus)) {
1049 return FALSE;
1050 }
1051 if (fMatch) {
1052 return TRUE;
1053 }
1054 }
1055 if (startPos >= testLen) {
1056 fMatch = FALSE;
1057 fHitEnd = TRUE;
1058 return FALSE;
1059 }
1060 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1061 // Note that it's perfectly OK for a pattern to have a zero-length
1062 // match at the end of a string, so we must make sure that the loop
1063 // runs with startPos == testLen the last time through.
1064 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1065 return FALSE;
1066 }
1067 } else {
1068 for (;;) {
1069 c = inputBuf[startPos-1];
1070 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
1071 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
1072 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1073 startPos++;
1074 }
1075 MatchChunkAt(startPos, FALSE, fDeferredStatus);
1076 if (U_FAILURE(fDeferredStatus)) {
1077 return FALSE;
1078 }
1079 if (fMatch) {
1080 return TRUE;
1081 }
1082 }
1083 if (startPos >= testLen) {
1084 fMatch = FALSE;
1085 fHitEnd = TRUE;
1086 return FALSE;
1087 }
1088 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1089 // Note that it's perfectly OK for a pattern to have a zero-length
1090 // match at the end of a string, so we must make sure that the loop
1091 // runs with startPos == testLen the last time through.
1092 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1093 return FALSE;
1094 }
1095 }
1096 }
1097
1098 default:
1099 U_ASSERT(FALSE);
1100 }
1101
1102 U_ASSERT(FALSE);
1103 return FALSE;
1104 }
1105
1106
1107
1108 //--------------------------------------------------------------------------------
1109 //
1110 // group()
1111 //
1112 //--------------------------------------------------------------------------------
1113 UnicodeString RegexMatcher::group(UErrorCode &status) const {
1114 return group(0, status);
1115 }
1116
1117 // Return immutable shallow clone
1118 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1119 return group(0, dest, group_len, status);
1120 }
1121
1122 // Return immutable shallow clone
1123 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1124 group_len = 0;
1125 UBool bailOut = FALSE;
1126 if (U_FAILURE(status)) {
1127 return dest;
1128 }
1129 if (U_FAILURE(fDeferredStatus)) {
1130 status = fDeferredStatus;
1131 bailOut = TRUE;
1132 }
1133 if (fMatch == FALSE) {
1134 status = U_REGEX_INVALID_STATE;
1135 bailOut = TRUE;
1136 }
1137 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1138 status = U_INDEX_OUTOFBOUNDS_ERROR;
1139 bailOut = TRUE;
1140 }
1141
1142 if (bailOut) {
1143 return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status);
1144 }
1145
1146 int64_t s, e;
1147 if (groupNum == 0) {
1148 s = fMatchStart;
1149 e = fMatchEnd;
1150 } else {
1151 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1152 U_ASSERT(groupOffset < fPattern->fFrameSize);
1153 U_ASSERT(groupOffset >= 0);
1154 s = fFrame->fExtra[groupOffset];
1155 e = fFrame->fExtra[groupOffset+1];
1156 }
1157
1158 if (s < 0) {
1159 // A capture group wasn't part of the match
1160 return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1161 }
1162 U_ASSERT(s <= e);
1163 group_len = e - s;
1164
1165 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1166 if (dest)
1167 UTEXT_SETNATIVEINDEX(dest, s);
1168 return dest;
1169 }
1170
1171 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1172 UnicodeString result;
1173 if (U_FAILURE(status)) {
1174 return result;
1175 }
1176 UText resultText = UTEXT_INITIALIZER;
1177 utext_openUnicodeString(&resultText, &result, &status);
1178 group(groupNum, &resultText, status);
1179 utext_close(&resultText);
1180 return result;
1181 }
1182
1183
1184 // Return deep (mutable) clone
1185 // Technology Preview (as an API), but note that the UnicodeString API is implemented
1186 // using this function.
1187 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {
1188 UBool bailOut = FALSE;
1189 if (U_FAILURE(status)) {
1190 return dest;
1191 }
1192 if (U_FAILURE(fDeferredStatus)) {
1193 status = fDeferredStatus;
1194 bailOut = TRUE;
1195 }
1196
1197 if (fMatch == FALSE) {
1198 status = U_REGEX_INVALID_STATE;
1199 bailOut = TRUE;
1200 }
1201 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1202 status = U_INDEX_OUTOFBOUNDS_ERROR;
1203 bailOut = TRUE;
1204 }
1205
1206 if (bailOut) {
1207 if (dest) {
1208 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1209 return dest;
1210 } else {
1211 return utext_openUChars(NULL, NULL, 0, &status);
1212 }
1213 }
1214
1215 int64_t s, e;
1216 if (groupNum == 0) {
1217 s = fMatchStart;
1218 e = fMatchEnd;
1219 } else {
1220 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1221 U_ASSERT(groupOffset < fPattern->fFrameSize);
1222 U_ASSERT(groupOffset >= 0);
1223 s = fFrame->fExtra[groupOffset];
1224 e = fFrame->fExtra[groupOffset+1];
1225 }
1226
1227 if (s < 0) {
1228 // A capture group wasn't part of the match
1229 if (dest) {
1230 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1231 return dest;
1232 } else {
1233 return utext_openUChars(NULL, NULL, 0, &status);
1234 }
1235 }
1236 U_ASSERT(s <= e);
1237
1238 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1239 U_ASSERT(e <= fInputLength);
1240 if (dest) {
1241 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status);
1242 } else {
1243 UText groupText = UTEXT_INITIALIZER;
1244 utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status);
1245 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1246 utext_close(&groupText);
1247 }
1248 } else {
1249 int32_t len16;
1250 if (UTEXT_USES_U16(fInputText)) {
1251 len16 = (int32_t)(e-s);
1252 } else {
1253 UErrorCode lengthStatus = U_ZERO_ERROR;
1254 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1255 }
1256 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1257 if (groupChars == NULL) {
1258 status = U_MEMORY_ALLOCATION_ERROR;
1259 return dest;
1260 }
1261 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1262
1263 if (dest) {
1264 utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);
1265 } else {
1266 UText groupText = UTEXT_INITIALIZER;
1267 utext_openUChars(&groupText, groupChars, len16, &status);
1268 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1269 utext_close(&groupText);
1270 }
1271
1272 uprv_free(groupChars);
1273 }
1274 return dest;
1275 }
1276
1277 //--------------------------------------------------------------------------------
1278 //
1279 // appendGroup() -- currently internal only, appends a group to a UText rather
1280 // than replacing its contents
1281 //
1282 //--------------------------------------------------------------------------------
1283
1284 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1285 if (U_FAILURE(status)) {
1286 return 0;
1287 }
1288 if (U_FAILURE(fDeferredStatus)) {
1289 status = fDeferredStatus;
1290 return 0;
1291 }
1292 int64_t destLen = utext_nativeLength(dest);
1293
1294 if (fMatch == FALSE) {
1295 status = U_REGEX_INVALID_STATE;
1296 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1297 }
1298 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1299 status = U_INDEX_OUTOFBOUNDS_ERROR;
1300 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1301 }
1302
1303 int64_t s, e;
1304 if (groupNum == 0) {
1305 s = fMatchStart;
1306 e = fMatchEnd;
1307 } else {
1308 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1309 U_ASSERT(groupOffset < fPattern->fFrameSize);
1310 U_ASSERT(groupOffset >= 0);
1311 s = fFrame->fExtra[groupOffset];
1312 e = fFrame->fExtra[groupOffset+1];
1313 }
1314
1315 if (s < 0) {
1316 // A capture group wasn't part of the match
1317 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1318 }
1319 U_ASSERT(s <= e);
1320
1321 int64_t deltaLen;
1322 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1323 U_ASSERT(e <= fInputLength);
1324 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1325 } else {
1326 int32_t len16;
1327 if (UTEXT_USES_U16(fInputText)) {
1328 len16 = (int32_t)(e-s);
1329 } else {
1330 UErrorCode lengthStatus = U_ZERO_ERROR;
1331 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1332 }
1333 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1334 if (groupChars == NULL) {
1335 status = U_MEMORY_ALLOCATION_ERROR;
1336 return 0;
1337 }
1338 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1339
1340 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1341 uprv_free(groupChars);
1342 }
1343 return deltaLen;
1344 }
1345
1346
1347
1348 //--------------------------------------------------------------------------------
1349 //
1350 // groupCount()
1351 //
1352 //--------------------------------------------------------------------------------
1353 int32_t RegexMatcher::groupCount() const {
1354 return fPattern->fGroupMap->size();
1355 }
1356
1357
1358
1359 //--------------------------------------------------------------------------------
1360 //
1361 // hasAnchoringBounds()
1362 //
1363 //--------------------------------------------------------------------------------
1364 UBool RegexMatcher::hasAnchoringBounds() const {
1365 return fAnchoringBounds;
1366 }
1367
1368
1369 //--------------------------------------------------------------------------------
1370 //
1371 // hasTransparentBounds()
1372 //
1373 //--------------------------------------------------------------------------------
1374 UBool RegexMatcher::hasTransparentBounds() const {
1375 return fTransparentBounds;
1376 }
1377
1378
1379
1380 //--------------------------------------------------------------------------------
1381 //
1382 // hitEnd()
1383 //
1384 //--------------------------------------------------------------------------------
1385 UBool RegexMatcher::hitEnd() const {
1386 return fHitEnd;
1387 }
1388
1389
1390 //--------------------------------------------------------------------------------
1391 //
1392 // input()
1393 //
1394 //--------------------------------------------------------------------------------
1395 const UnicodeString &RegexMatcher::input() const {
1396 if (!fInput) {
1397 UErrorCode status = U_ZERO_ERROR;
1398 int32_t len16;
1399 if (UTEXT_USES_U16(fInputText)) {
1400 len16 = (int32_t)fInputLength;
1401 } else {
1402 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1403 status = U_ZERO_ERROR; // overflow, length status
1404 }
1405 UnicodeString *result = new UnicodeString(len16, 0, 0);
1406
1407 UChar *inputChars = result->getBuffer(len16);
1408 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1409 result->releaseBuffer(len16);
1410
1411 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1412 }
1413
1414 return *fInput;
1415 }
1416
1417 //--------------------------------------------------------------------------------
1418 //
1419 // inputText()
1420 //
1421 //--------------------------------------------------------------------------------
1422 UText *RegexMatcher::inputText() const {
1423 return fInputText;
1424 }
1425
1426
1427 //--------------------------------------------------------------------------------
1428 //
1429 // getInput() -- like inputText(), but makes a clone or copies into another UText
1430 //
1431 //--------------------------------------------------------------------------------
1432 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1433 UBool bailOut = FALSE;
1434 if (U_FAILURE(status)) {
1435 return dest;
1436 }
1437 if (U_FAILURE(fDeferredStatus)) {
1438 status = fDeferredStatus;
1439 bailOut = TRUE;
1440 }
1441
1442 if (bailOut) {
1443 if (dest) {
1444 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1445 return dest;
1446 } else {
1447 return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1448 }
1449 }
1450
1451 if (dest) {
1452 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1453 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1454 } else {
1455 int32_t input16Len;
1456 if (UTEXT_USES_U16(fInputText)) {
1457 input16Len = (int32_t)fInputLength;
1458 } else {
1459 UErrorCode lengthStatus = U_ZERO_ERROR;
1460 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1461 }
1462 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1463 if (inputChars == NULL) {
1464 return dest;
1465 }
1466
1467 status = U_ZERO_ERROR;
1468 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1469 status = U_ZERO_ERROR;
1470 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1471
1472 uprv_free(inputChars);
1473 }
1474 return dest;
1475 } else {
1476 return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1477 }
1478 }
1479
1480
1481 static UBool compat_SyncMutableUTextContents(UText *ut);
1482 static UBool compat_SyncMutableUTextContents(UText *ut) {
1483 UBool retVal = FALSE;
1484
1485 // In the following test, we're really only interested in whether the UText should switch
1486 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
1487 // will still point to the correct data.
1488 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1489 UnicodeString *us=(UnicodeString *)ut->context;
1490
1491 // Update to the latest length.
1492 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1493 int32_t newLength = us->length();
1494
1495 // Update the chunk description.
1496 // The buffer may have switched between stack- and heap-based.
1497 ut->chunkContents = us->getBuffer();
1498 ut->chunkLength = newLength;
1499 ut->chunkNativeLimit = newLength;
1500 ut->nativeIndexingLimit = newLength;
1501 retVal = TRUE;
1502 }
1503
1504 return retVal;
1505 }
1506
1507 //--------------------------------------------------------------------------------
1508 //
1509 // lookingAt()
1510 //
1511 //--------------------------------------------------------------------------------
1512 UBool RegexMatcher::lookingAt(UErrorCode &status) {
1513 if (U_FAILURE(status)) {
1514 return FALSE;
1515 }
1516 if (U_FAILURE(fDeferredStatus)) {
1517 status = fDeferredStatus;
1518 return FALSE;
1519 }
1520
1521 if (fInputUniStrMaybeMutable) {
1522 if (compat_SyncMutableUTextContents(fInputText)) {
1523 fInputLength = utext_nativeLength(fInputText);
1524 reset();
1525 }
1526 }
1527 else {
1528 resetPreserveRegion();
1529 }
1530 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1531 MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1532 } else {
1533 MatchAt(fActiveStart, FALSE, status);
1534 }
1535 return fMatch;
1536 }
1537
1538
1539 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1540 if (U_FAILURE(status)) {
1541 return FALSE;
1542 }
1543 if (U_FAILURE(fDeferredStatus)) {
1544 status = fDeferredStatus;
1545 return FALSE;
1546 }
1547 reset();
1548
1549 if (start < 0) {
1550 status = U_INDEX_OUTOFBOUNDS_ERROR;
1551 return FALSE;
1552 }
1553
1554 if (fInputUniStrMaybeMutable) {
1555 if (compat_SyncMutableUTextContents(fInputText)) {
1556 fInputLength = utext_nativeLength(fInputText);
1557 reset();
1558 }
1559 }
1560
1561 int64_t nativeStart;
1562 nativeStart = start;
1563 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1564 status = U_INDEX_OUTOFBOUNDS_ERROR;
1565 return FALSE;
1566 }
1567
1568 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1569 MatchChunkAt((int32_t)nativeStart, FALSE, status);
1570 } else {
1571 MatchAt(nativeStart, FALSE, status);
1572 }
1573 return fMatch;
1574 }
1575
1576
1577
1578 //--------------------------------------------------------------------------------
1579 //
1580 // matches()
1581 //
1582 //--------------------------------------------------------------------------------
1583 UBool RegexMatcher::matches(UErrorCode &status) {
1584 if (U_FAILURE(status)) {
1585 return FALSE;
1586 }
1587 if (U_FAILURE(fDeferredStatus)) {
1588 status = fDeferredStatus;
1589 return FALSE;
1590 }
1591
1592 if (fInputUniStrMaybeMutable) {
1593 if (compat_SyncMutableUTextContents(fInputText)) {
1594 fInputLength = utext_nativeLength(fInputText);
1595 reset();
1596 }
1597 }
1598 else {
1599 resetPreserveRegion();
1600 }
1601
1602 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1603 MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1604 } else {
1605 MatchAt(fActiveStart, TRUE, status);
1606 }
1607 return fMatch;
1608 }
1609
1610
1611 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1612 if (U_FAILURE(status)) {
1613 return FALSE;
1614 }
1615 if (U_FAILURE(fDeferredStatus)) {
1616 status = fDeferredStatus;
1617 return FALSE;
1618 }
1619 reset();
1620
1621 if (start < 0) {
1622 status = U_INDEX_OUTOFBOUNDS_ERROR;
1623 return FALSE;
1624 }
1625
1626 if (fInputUniStrMaybeMutable) {
1627 if (compat_SyncMutableUTextContents(fInputText)) {
1628 fInputLength = utext_nativeLength(fInputText);
1629 reset();
1630 }
1631 }
1632
1633 int64_t nativeStart;
1634 nativeStart = start;
1635 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1636 status = U_INDEX_OUTOFBOUNDS_ERROR;
1637 return FALSE;
1638 }
1639
1640 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1641 MatchChunkAt((int32_t)nativeStart, TRUE, status);
1642 } else {
1643 MatchAt(nativeStart, TRUE, status);
1644 }
1645 return fMatch;
1646 }
1647
1648
1649
1650 //--------------------------------------------------------------------------------
1651 //
1652 // pattern
1653 //
1654 //--------------------------------------------------------------------------------
1655 const RegexPattern &RegexMatcher::pattern() const {
1656 return *fPattern;
1657 }
1658
1659
1660
1661 //--------------------------------------------------------------------------------
1662 //
1663 // region
1664 //
1665 //--------------------------------------------------------------------------------
1666 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1667 if (U_FAILURE(status)) {
1668 return *this;
1669 }
1670
1671 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1672 status = U_ILLEGAL_ARGUMENT_ERROR;
1673 }
1674
1675 int64_t nativeStart = regionStart;
1676 int64_t nativeLimit = regionLimit;
1677 if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1678 status = U_ILLEGAL_ARGUMENT_ERROR;
1679 }
1680
1681 if (startIndex == -1)
1682 this->reset();
1683 else
1684 resetPreserveRegion();
1685
1686 fRegionStart = nativeStart;
1687 fRegionLimit = nativeLimit;
1688 fActiveStart = nativeStart;
1689 fActiveLimit = nativeLimit;
1690
1691 if (startIndex != -1) {
1692 if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1693 status = U_INDEX_OUTOFBOUNDS_ERROR;
1694 }
1695 fMatchEnd = startIndex;
1696 }
1697
1698 if (!fTransparentBounds) {
1699 fLookStart = nativeStart;
1700 fLookLimit = nativeLimit;
1701 }
1702 if (fAnchoringBounds) {
1703 fAnchorStart = nativeStart;
1704 fAnchorLimit = nativeLimit;
1705 }
1706 return *this;
1707 }
1708
1709 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1710 return region(start, limit, -1, status);
1711 }
1712
1713 //--------------------------------------------------------------------------------
1714 //
1715 // regionEnd
1716 //
1717 //--------------------------------------------------------------------------------
1718 int32_t RegexMatcher::regionEnd() const {
1719 return (int32_t)fRegionLimit;
1720 }
1721
1722 int64_t RegexMatcher::regionEnd64() const {
1723 return fRegionLimit;
1724 }
1725
1726 //--------------------------------------------------------------------------------
1727 //
1728 // regionStart
1729 //
1730 //--------------------------------------------------------------------------------
1731 int32_t RegexMatcher::regionStart() const {
1732 return (int32_t)fRegionStart;
1733 }
1734
1735 int64_t RegexMatcher::regionStart64() const {
1736 return fRegionStart;
1737 }
1738
1739
1740 //--------------------------------------------------------------------------------
1741 //
1742 // replaceAll
1743 //
1744 //--------------------------------------------------------------------------------
1745 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1746 UText replacementText = UTEXT_INITIALIZER;
1747 UText resultText = UTEXT_INITIALIZER;
1748 UnicodeString resultString;
1749 if (U_FAILURE(status)) {
1750 return resultString;
1751 }
1752
1753 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1754 utext_openUnicodeString(&resultText, &resultString, &status);
1755
1756 replaceAll(&replacementText, &resultText, status);
1757
1758 utext_close(&resultText);
1759 utext_close(&replacementText);
1760
1761 return resultString;
1762 }
1763
1764
1765 //
1766 // replaceAll, UText mode
1767 //
1768 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1769 if (U_FAILURE(status)) {
1770 return dest;
1771 }
1772 if (U_FAILURE(fDeferredStatus)) {
1773 status = fDeferredStatus;
1774 return dest;
1775 }
1776
1777 if (dest == NULL) {
1778 UnicodeString emptyString;
1779 UText empty = UTEXT_INITIALIZER;
1780
1781 utext_openUnicodeString(&empty, &emptyString, &status);
1782 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1783 utext_close(&empty);
1784 }
1785
1786 if (U_SUCCESS(status)) {
1787 reset();
1788 while (find()) {
1789 appendReplacement(dest, replacement, status);
1790 if (U_FAILURE(status)) {
1791 break;
1792 }
1793 }
1794 appendTail(dest, status);
1795 }
1796
1797 return dest;
1798 }
1799
1800
1801 //--------------------------------------------------------------------------------
1802 //
1803 // replaceFirst
1804 //
1805 //--------------------------------------------------------------------------------
1806 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1807 UText replacementText = UTEXT_INITIALIZER;
1808 UText resultText = UTEXT_INITIALIZER;
1809 UnicodeString resultString;
1810
1811 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1812 utext_openUnicodeString(&resultText, &resultString, &status);
1813
1814 replaceFirst(&replacementText, &resultText, status);
1815
1816 utext_close(&resultText);
1817 utext_close(&replacementText);
1818
1819 return resultString;
1820 }
1821
1822 //
1823 // replaceFirst, UText mode
1824 //
1825 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1826 if (U_FAILURE(status)) {
1827 return dest;
1828 }
1829 if (U_FAILURE(fDeferredStatus)) {
1830 status = fDeferredStatus;
1831 return dest;
1832 }
1833
1834 reset();
1835 if (!find()) {
1836 return getInput(dest, status);
1837 }
1838
1839 if (dest == NULL) {
1840 UnicodeString emptyString;
1841 UText empty = UTEXT_INITIALIZER;
1842
1843 utext_openUnicodeString(&empty, &emptyString, &status);
1844 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1845 utext_close(&empty);
1846 }
1847
1848 appendReplacement(dest, replacement, status);
1849 appendTail(dest, status);
1850
1851 return dest;
1852 }
1853
1854
1855 //--------------------------------------------------------------------------------
1856 //
1857 // requireEnd
1858 //
1859 //--------------------------------------------------------------------------------
1860 UBool RegexMatcher::requireEnd() const {
1861 return fRequireEnd;
1862 }
1863
1864
1865 //--------------------------------------------------------------------------------
1866 //
1867 // reset
1868 //
1869 //--------------------------------------------------------------------------------
1870 RegexMatcher &RegexMatcher::reset() {
1871 fRegionStart = 0;
1872 fRegionLimit = fInputLength;
1873 fActiveStart = 0;
1874 fActiveLimit = fInputLength;
1875 fAnchorStart = 0;
1876 fAnchorLimit = fInputLength;
1877 fLookStart = 0;
1878 fLookLimit = fInputLength;
1879 resetPreserveRegion();
1880 return *this;
1881 }
1882
1883
1884
1885 void RegexMatcher::resetPreserveRegion() {
1886 fMatchStart = 0;
1887 fMatchEnd = 0;
1888 fLastMatchEnd = -1;
1889 fAppendPosition = 0;
1890 fMatch = FALSE;
1891 fHitEnd = FALSE;
1892 fRequireEnd = FALSE;
1893 fTime = 0;
1894 fTickCounter = TIMER_INITIAL_VALUE;
1895 //resetStack(); // more expensive than it looks...
1896 }
1897
1898
1899 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1900 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1901 if (fPattern->fNeedsAltInput) {
1902 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1903 }
1904 fInputLength = utext_nativeLength(fInputText);
1905
1906 reset();
1907 delete fInput;
1908 fInput = NULL;
1909
1910 // Do the following for any UnicodeString.
1911 // This is for compatibility for those clients who modify the input string "live" during regex operations.
1912 fInputUniStrMaybeMutable = TRUE;
1913
1914 if (fWordBreakItr != NULL) {
1915 #if UCONFIG_NO_BREAK_ITERATION==0
1916 UErrorCode status = U_ZERO_ERROR;
1917 fWordBreakItr->setText(fInputText, status);
1918 #endif
1919 }
1920 return *this;
1921 }
1922
1923
1924 RegexMatcher &RegexMatcher::reset(UText *input) {
1925 if (fInputText != input) {
1926 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1927 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1928 fInputLength = utext_nativeLength(fInputText);
1929
1930 delete fInput;
1931 fInput = NULL;
1932
1933 if (fWordBreakItr != NULL) {
1934 #if UCONFIG_NO_BREAK_ITERATION==0
1935 UErrorCode status = U_ZERO_ERROR;
1936 fWordBreakItr->setText(input, status);
1937 #endif
1938 }
1939 }
1940 reset();
1941 fInputUniStrMaybeMutable = FALSE;
1942
1943 return *this;
1944 }
1945
1946 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1947 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1948 return *this;
1949 }*/
1950
1951 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1952 if (U_FAILURE(status)) {
1953 return *this;
1954 }
1955 reset(); // Reset also resets the region to be the entire string.
1956
1957 if (position < 0 || position > fActiveLimit) {
1958 status = U_INDEX_OUTOFBOUNDS_ERROR;
1959 return *this;
1960 }
1961 fMatchEnd = position;
1962 return *this;
1963 }
1964
1965
1966
1967
1968
1969 //--------------------------------------------------------------------------------
1970 //
1971 // setTrace
1972 //
1973 //--------------------------------------------------------------------------------
1974 void RegexMatcher::setTrace(UBool state) {
1975 fTraceDebug = state;
1976 }
1977
1978
1979
1980 //---------------------------------------------------------------------
1981 //
1982 // split
1983 //
1984 //---------------------------------------------------------------------
1985 int32_t RegexMatcher::split(const UnicodeString &input,
1986 UnicodeString dest[],
1987 int32_t destCapacity,
1988 UErrorCode &status)
1989 {
1990 UText inputText = UTEXT_INITIALIZER;
1991 utext_openConstUnicodeString(&inputText, &input, &status);
1992 if (U_FAILURE(status)) {
1993 return 0;
1994 }
1995
1996 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
1997 if (destText == NULL) {
1998 status = U_MEMORY_ALLOCATION_ERROR;
1999 return 0;
2000 }
2001 int32_t i;
2002 for (i = 0; i < destCapacity; i++) {
2003 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2004 }
2005
2006 int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2007
2008 for (i = 0; i < destCapacity; i++) {
2009 utext_close(destText[i]);
2010 }
2011
2012 uprv_free(destText);
2013 utext_close(&inputText);
2014 return fieldCount;
2015 }
2016
2017 //
2018 // split, UText mode
2019 //
2020 int32_t RegexMatcher::split(UText *input,
2021 UText *dest[],
2022 int32_t destCapacity,
2023 UErrorCode &status)
2024 {
2025 //
2026 // Check arguements for validity
2027 //
2028 if (U_FAILURE(status)) {
2029 return 0;
2030 };
2031
2032 if (destCapacity < 1) {
2033 status = U_ILLEGAL_ARGUMENT_ERROR;
2034 return 0;
2035 }
2036
2037 //
2038 // Reset for the input text
2039 //
2040 reset(input);
2041 int64_t nextOutputStringStart = 0;
2042 if (fActiveLimit == 0) {
2043 return 0;
2044 }
2045
2046 //
2047 // Loop through the input text, searching for the delimiter pattern
2048 //
2049 int32_t i;
2050 int32_t numCaptureGroups = fPattern->fGroupMap->size();
2051 for (i=0; ; i++) {
2052 if (i>=destCapacity-1) {
2053 // There is one or zero output string left.
2054 // Fill the last output string with whatever is left from the input, then exit the loop.
2055 // ( i will be == destCapacity if we filled the output array while processing
2056 // capture groups of the delimiter expression, in which case we will discard the
2057 // last capture group saved in favor of the unprocessed remainder of the
2058 // input string.)
2059 i = destCapacity-1;
2060 if (fActiveLimit > nextOutputStringStart) {
2061 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2062 if (dest[i]) {
2063 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2064 input->chunkContents+nextOutputStringStart,
2065 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2066 } else {
2067 UText remainingText = UTEXT_INITIALIZER;
2068 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2069 fActiveLimit-nextOutputStringStart, &status);
2070 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2071 utext_close(&remainingText);
2072 }
2073 } else {
2074 UErrorCode lengthStatus = U_ZERO_ERROR;
2075 int32_t remaining16Length =
2076 utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2077 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2078 if (remainingChars == NULL) {
2079 status = U_MEMORY_ALLOCATION_ERROR;
2080 break;
2081 }
2082
2083 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2084 if (dest[i]) {
2085 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2086 } else {
2087 UText remainingText = UTEXT_INITIALIZER;
2088 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2089 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2090 utext_close(&remainingText);
2091 }
2092
2093 uprv_free(remainingChars);
2094 }
2095 }
2096 break;
2097 }
2098 if (find()) {
2099 // We found another delimiter. Move everything from where we started looking
2100 // up until the start of the delimiter into the next output string.
2101 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2102 if (dest[i]) {
2103 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2104 input->chunkContents+nextOutputStringStart,
2105 (int32_t)(fMatchStart-nextOutputStringStart), &status);
2106 } else {
2107 UText remainingText = UTEXT_INITIALIZER;
2108 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2109 fMatchStart-nextOutputStringStart, &status);
2110 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2111 utext_close(&remainingText);
2112 }
2113 } else {
2114 UErrorCode lengthStatus = U_ZERO_ERROR;
2115 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2116 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2117 if (remainingChars == NULL) {
2118 status = U_MEMORY_ALLOCATION_ERROR;
2119 break;
2120 }
2121 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2122 if (dest[i]) {
2123 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2124 } else {
2125 UText remainingText = UTEXT_INITIALIZER;
2126 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2127 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2128 utext_close(&remainingText);
2129 }
2130
2131 uprv_free(remainingChars);
2132 }
2133 nextOutputStringStart = fMatchEnd;
2134
2135 // If the delimiter pattern has capturing parentheses, the captured
2136 // text goes out into the next n destination strings.
2137 int32_t groupNum;
2138 UBool lastGroupWasNullUText = FALSE;
2139 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2140 if (i==destCapacity-1) {
2141 break;
2142 }
2143 i++;
2144 lastGroupWasNullUText = (dest[i] == NULL ? TRUE : FALSE);
2145 dest[i] = group(groupNum, dest[i], status);
2146 }
2147
2148 if (nextOutputStringStart == fActiveLimit) {
2149 // The delimiter was at the end of the string. We're done.
2150 break;
2151 } else if (i == destCapacity-1) {
2152 // We're out of capture groups, and the rest of the string is more important
2153 if (lastGroupWasNullUText) {
2154 utext_close(dest[i]);
2155 dest[i] = NULL;
2156 }
2157 }
2158
2159 }
2160 else
2161 {
2162 // We ran off the end of the input while looking for the next delimiter.
2163 // All the remaining text goes into the current output string.
2164 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2165 if (dest[i]) {
2166 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2167 input->chunkContents+nextOutputStringStart,
2168 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2169 } else {
2170 UText remainingText = UTEXT_INITIALIZER;
2171 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2172 fActiveLimit-nextOutputStringStart, &status);
2173 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2174 utext_close(&remainingText);
2175 }
2176 } else {
2177 UErrorCode lengthStatus = U_ZERO_ERROR;
2178 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2179 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2180 if (remainingChars == NULL) {
2181 status = U_MEMORY_ALLOCATION_ERROR;
2182 break;
2183 }
2184
2185 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2186 if (dest[i]) {
2187 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2188 } else {
2189 UText remainingText = UTEXT_INITIALIZER;
2190 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2191 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2192 utext_close(&remainingText);
2193 }
2194
2195 uprv_free(remainingChars);
2196 }
2197 break;
2198 }
2199 if (U_FAILURE(status)) {
2200 break;
2201 }
2202 } // end of for loop
2203 return i+1;
2204 }
2205
2206
2207 //--------------------------------------------------------------------------------
2208 //
2209 // start
2210 //
2211 //--------------------------------------------------------------------------------
2212 int32_t RegexMatcher::start(UErrorCode &status) const {
2213 return start(0, status);
2214 }
2215
2216 int64_t RegexMatcher::start64(UErrorCode &status) const {
2217 return start64(0, status);
2218 }
2219
2220 //--------------------------------------------------------------------------------
2221 //
2222 // start(int32_t group, UErrorCode &status)
2223 //
2224 //--------------------------------------------------------------------------------
2225
2226 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2227 if (U_FAILURE(status)) {
2228 return -1;
2229 }
2230 if (U_FAILURE(fDeferredStatus)) {
2231 status = fDeferredStatus;
2232 return -1;
2233 }
2234 if (fMatch == FALSE) {
2235 status = U_REGEX_INVALID_STATE;
2236 return -1;
2237 }
2238 if (group < 0 || group > fPattern->fGroupMap->size()) {
2239 status = U_INDEX_OUTOFBOUNDS_ERROR;
2240 return -1;
2241 }
2242 int64_t s;
2243 if (group == 0) {
2244 s = fMatchStart;
2245 } else {
2246 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2247 U_ASSERT(groupOffset < fPattern->fFrameSize);
2248 U_ASSERT(groupOffset >= 0);
2249 s = fFrame->fExtra[groupOffset];
2250 }
2251
2252 return s;
2253 }
2254
2255
2256 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2257 return (int32_t)start64(group, status);
2258 }
2259
2260 //--------------------------------------------------------------------------------
2261 //
2262 // useAnchoringBounds
2263 //
2264 //--------------------------------------------------------------------------------
2265 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2266 fAnchoringBounds = b;
2267 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2268 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2269 return *this;
2270 }
2271
2272
2273 //--------------------------------------------------------------------------------
2274 //
2275 // useTransparentBounds
2276 //
2277 //--------------------------------------------------------------------------------
2278 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2279 fTransparentBounds = b;
2280 fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2281 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2282 return *this;
2283 }
2284
2285 //--------------------------------------------------------------------------------
2286 //
2287 // setTimeLimit
2288 //
2289 //--------------------------------------------------------------------------------
2290 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2291 if (U_FAILURE(status)) {
2292 return;
2293 }
2294 if (U_FAILURE(fDeferredStatus)) {
2295 status = fDeferredStatus;
2296 return;
2297 }
2298 if (limit < 0) {
2299 status = U_ILLEGAL_ARGUMENT_ERROR;
2300 return;
2301 }
2302 fTimeLimit = limit;
2303 }
2304
2305
2306 //--------------------------------------------------------------------------------
2307 //
2308 // getTimeLimit
2309 //
2310 //--------------------------------------------------------------------------------
2311 int32_t RegexMatcher::getTimeLimit() const {
2312 return fTimeLimit;
2313 }
2314
2315
2316 //--------------------------------------------------------------------------------
2317 //
2318 // setStackLimit
2319 //
2320 //--------------------------------------------------------------------------------
2321 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2322 if (U_FAILURE(status)) {
2323 return;
2324 }
2325 if (U_FAILURE(fDeferredStatus)) {
2326 status = fDeferredStatus;
2327 return;
2328 }
2329 if (limit < 0) {
2330 status = U_ILLEGAL_ARGUMENT_ERROR;
2331 return;
2332 }
2333
2334 // Reset the matcher. This is needed here in case there is a current match
2335 // whose final stack frame (containing the match results, pointed to by fFrame)
2336 // would be lost by resizing to a smaller stack size.
2337 reset();
2338
2339 if (limit == 0) {
2340 // Unlimited stack expansion
2341 fStack->setMaxCapacity(0);
2342 } else {
2343 // Change the units of the limit from bytes to ints, and bump the size up
2344 // to be big enough to hold at least one stack frame for the pattern,
2345 // if it isn't there already.
2346 int32_t adjustedLimit = limit / sizeof(int32_t);
2347 if (adjustedLimit < fPattern->fFrameSize) {
2348 adjustedLimit = fPattern->fFrameSize;
2349 }
2350 fStack->setMaxCapacity(adjustedLimit);
2351 }
2352 fStackLimit = limit;
2353 }
2354
2355
2356 //--------------------------------------------------------------------------------
2357 //
2358 // getStackLimit
2359 //
2360 //--------------------------------------------------------------------------------
2361 int32_t RegexMatcher::getStackLimit() const {
2362 return fStackLimit;
2363 }
2364
2365
2366 //--------------------------------------------------------------------------------
2367 //
2368 // setMatchCallback
2369 //
2370 //--------------------------------------------------------------------------------
2371 void RegexMatcher::setMatchCallback(URegexMatchCallback *callback,
2372 const void *context,
2373 UErrorCode &status) {
2374 if (U_FAILURE(status)) {
2375 return;
2376 }
2377 fCallbackFn = callback;
2378 fCallbackContext = context;
2379 }
2380
2381
2382 //--------------------------------------------------------------------------------
2383 //
2384 // getMatchCallback
2385 //
2386 //--------------------------------------------------------------------------------
2387 void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback,
2388 const void *&context,
2389 UErrorCode &status) {
2390 if (U_FAILURE(status)) {
2391 return;
2392 }
2393 callback = fCallbackFn;
2394 context = fCallbackContext;
2395 }
2396
2397
2398 //--------------------------------------------------------------------------------
2399 //
2400 // setMatchCallback
2401 //
2402 //--------------------------------------------------------------------------------
2403 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback,
2404 const void *context,
2405 UErrorCode &status) {
2406 if (U_FAILURE(status)) {
2407 return;
2408 }
2409 fFindProgressCallbackFn = callback;
2410 fFindProgressCallbackContext = context;
2411 }
2412
2413
2414 //--------------------------------------------------------------------------------
2415 //
2416 // getMatchCallback
2417 //
2418 //--------------------------------------------------------------------------------
2419 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback,
2420 const void *&context,
2421 UErrorCode &status) {
2422 if (U_FAILURE(status)) {
2423 return;
2424 }
2425 callback = fFindProgressCallbackFn;
2426 context = fFindProgressCallbackContext;
2427 }
2428
2429
2430 //================================================================================
2431 //
2432 // Code following this point in this file is the internal
2433 // Match Engine Implementation.
2434 //
2435 //================================================================================
2436
2437
2438 //--------------------------------------------------------------------------------
2439 //
2440 // resetStack
2441 // Discard any previous contents of the state save stack, and initialize a
2442 // new stack frame to all -1. The -1s are needed for capture group limits,
2443 // where they indicate that a group has not yet matched anything.
2444 //--------------------------------------------------------------------------------
2445 REStackFrame *RegexMatcher::resetStack() {
2446 // Discard any previous contents of the state save stack, and initialize a
2447 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2448 // where they indicate that a group has not yet matched anything.
2449 fStack->removeAllElements();
2450
2451 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2452 int32_t i;
2453 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2454 iFrame->fExtra[i] = -1;
2455 }
2456 return iFrame;
2457 }
2458
2459
2460
2461 //--------------------------------------------------------------------------------
2462 //
2463 // isWordBoundary
2464 // in perl, "xab..cd..", \b is true at positions 0,3,5,7
2465 // For us,
2466 // If the current char is a combining mark,
2467 // \b is FALSE.
2468 // Else Scan backwards to the first non-combining char.
2469 // We are at a boundary if the this char and the original chars are
2470 // opposite in membership in \w set
2471 //
2472 // parameters: pos - the current position in the input buffer
2473 //
2474 // TODO: double-check edge cases at region boundaries.
2475 //
2476 //--------------------------------------------------------------------------------
2477 UBool RegexMatcher::isWordBoundary(int64_t pos) {
2478 UBool isBoundary = FALSE;
2479 UBool cIsWord = FALSE;
2480
2481 if (pos >= fLookLimit) {
2482 fHitEnd = TRUE;
2483 } else {
2484 // Determine whether char c at current position is a member of the word set of chars.
2485 // If we're off the end of the string, behave as though we're not at a word char.
2486 UTEXT_SETNATIVEINDEX(fInputText, pos);
2487 UChar32 c = UTEXT_CURRENT32(fInputText);
2488 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2489 // Current char is a combining one. Not a boundary.
2490 return FALSE;
2491 }
2492 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2493 }
2494
2495 // Back up until we come to a non-combining char, determine whether
2496 // that char is a word char.
2497 UBool prevCIsWord = FALSE;
2498 for (;;) {
2499 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2500 break;
2501 }
2502 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2503 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2504 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2505 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2506 break;
2507 }
2508 }
2509 isBoundary = cIsWord ^ prevCIsWord;
2510 return isBoundary;
2511 }
2512
2513 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2514 UBool isBoundary = FALSE;
2515 UBool cIsWord = FALSE;
2516
2517 const UChar *inputBuf = fInputText->chunkContents;
2518
2519 if (pos >= fLookLimit) {
2520 fHitEnd = TRUE;
2521 } else {
2522 // Determine whether char c at current position is a member of the word set of chars.
2523 // If we're off the end of the string, behave as though we're not at a word char.
2524 UChar32 c;
2525 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2526 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2527 // Current char is a combining one. Not a boundary.
2528 return FALSE;
2529 }
2530 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2531 }
2532
2533 // Back up until we come to a non-combining char, determine whether
2534 // that char is a word char.
2535 UBool prevCIsWord = FALSE;
2536 for (;;) {
2537 if (pos <= fLookStart) {
2538 break;
2539 }
2540 UChar32 prevChar;
2541 U16_PREV(inputBuf, fLookStart, pos, prevChar);
2542 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2543 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2544 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2545 break;
2546 }
2547 }
2548 isBoundary = cIsWord ^ prevCIsWord;
2549 return isBoundary;
2550 }
2551
2552 //--------------------------------------------------------------------------------
2553 //
2554 // isUWordBoundary
2555 //
2556 // Test for a word boundary using RBBI word break.
2557 //
2558 // parameters: pos - the current position in the input buffer
2559 //
2560 //--------------------------------------------------------------------------------
2561 UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2562 UBool returnVal = FALSE;
2563 #if UCONFIG_NO_BREAK_ITERATION==0
2564
2565 // If we haven't yet created a break iterator for this matcher, do it now.
2566 if (fWordBreakItr == NULL) {
2567 fWordBreakItr =
2568 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
2569 if (U_FAILURE(fDeferredStatus)) {
2570 return FALSE;
2571 }
2572 fWordBreakItr->setText(fInputText, fDeferredStatus);
2573 }
2574
2575 if (pos >= fLookLimit) {
2576 fHitEnd = TRUE;
2577 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real"
2578 // words are not boundaries. All non-word chars stand by themselves,
2579 // with word boundaries on both sides.
2580 } else {
2581 if (!UTEXT_USES_U16(fInputText)) {
2582 // !!!: Would like a better way to do this!
2583 UErrorCode status = U_ZERO_ERROR;
2584 pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
2585 }
2586 returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2587 }
2588 #endif
2589 return returnVal;
2590 }
2591
2592 //--------------------------------------------------------------------------------
2593 //
2594 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state
2595 // saves. Increment the "time" counter, and call the
2596 // user callback function if there is one installed.
2597 //
2598 // If the match operation needs to be aborted, either for a time-out
2599 // or because the user callback asked for it, just set an error status.
2600 // The engine will pick that up and stop in its outer loop.
2601 //
2602 //--------------------------------------------------------------------------------
2603 void RegexMatcher::IncrementTime(UErrorCode &status) {
2604 fTickCounter = TIMER_INITIAL_VALUE;
2605 fTime++;
2606 if (fCallbackFn != NULL) {
2607 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2608 status = U_REGEX_STOPPED_BY_CALLER;
2609 return;
2610 }
2611 }
2612 if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2613 status = U_REGEX_TIME_OUT;
2614 }
2615 }
2616
2617 //--------------------------------------------------------------------------------
2618 //
2619 // ReportFindProgress This function is called once for each advance in the target
2620 // string from the find() function, and calls the user progress callback
2621 // function if there is one installed.
2622 //
2623 // NOTE:
2624 //
2625 // If the match operation needs to be aborted because the user
2626 // callback asked for it, just set an error status.
2627 // The engine will pick that up and stop in its outer loop.
2628 //
2629 //--------------------------------------------------------------------------------
2630 UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) {
2631 if (fFindProgressCallbackFn != NULL) {
2632 if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) {
2633 status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/;
2634 return FALSE;
2635 }
2636 }
2637 return TRUE;
2638 }
2639
2640 //--------------------------------------------------------------------------------
2641 //
2642 // StateSave
2643 // Make a new stack frame, initialized as a copy of the current stack frame.
2644 // Set the pattern index in the original stack frame from the operand value
2645 // in the opcode. Execution of the engine continues with the state in
2646 // the newly created stack frame
2647 //
2648 // Note that reserveBlock() may grow the stack, resulting in the
2649 // whole thing being relocated in memory.
2650 //
2651 // Parameters:
2652 // fp The top frame pointer when called. At return, a new
2653 // fame will be present
2654 // savePatIdx An index into the compiled pattern. Goes into the original
2655 // (not new) frame. If execution ever back-tracks out of the
2656 // new frame, this will be where we continue from in the pattern.
2657 // Return
2658 // The new frame pointer.
2659 //
2660 //--------------------------------------------------------------------------------
2661 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2662 // push storage for a new frame.
2663 int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2664 if (newFP == NULL) {
2665 // Failure on attempted stack expansion.
2666 // Stack function set some other error code, change it to a more
2667 // specific one for regular expressions.
2668 status = U_REGEX_STACK_OVERFLOW;
2669 // We need to return a writable stack frame, so just return the
2670 // previous frame. The match operation will stop quickly
2671 // because of the error status, after which the frame will never
2672 // be looked at again.
2673 return fp;
2674 }
2675 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack.
2676
2677 // New stack frame = copy of old top frame.
2678 int64_t *source = (int64_t *)fp;
2679 int64_t *dest = newFP;
2680 for (;;) {
2681 *dest++ = *source++;
2682 if (source == newFP) {
2683 break;
2684 }
2685 }
2686
2687 fTickCounter--;
2688 if (fTickCounter <= 0) {
2689 IncrementTime(status); // Re-initializes fTickCounter
2690 }
2691 fp->fPatIdx = savePatIdx;
2692 return (REStackFrame *)newFP;
2693 }
2694
2695
2696 //--------------------------------------------------------------------------------
2697 //
2698 // MatchAt This is the actual matching engine.
2699 //
2700 // startIdx: begin matching a this index.
2701 // toEnd: if true, match must extend to end of the input region
2702 //
2703 //--------------------------------------------------------------------------------
2704 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2705 UBool isMatch = FALSE; // True if the we have a match.
2706
2707 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2708
2709 int32_t op; // Operation from the compiled pattern, split into
2710 int32_t opType; // the opcode
2711 int32_t opValue; // and the operand value.
2712
2713 #ifdef REGEX_RUN_DEBUG
2714 if (fTraceDebug)
2715 {
2716 printf("MatchAt(startIdx=%ld)\n", startIdx);
2717 printf("Original Pattern: ");
2718 UChar32 c = utext_next32From(fPattern->fPattern, 0);
2719 while (c != U_SENTINEL) {
2720 if (c<32 || c>256) {
2721 c = '.';
2722 }
2723 REGEX_DUMP_DEBUG_PRINTF(("%c", c));
2724
2725 c = UTEXT_NEXT32(fPattern->fPattern);
2726 }
2727 printf("\n");
2728 printf("Input String: ");
2729 c = utext_next32From(fInputText, 0);
2730 while (c != U_SENTINEL) {
2731 if (c<32 || c>256) {
2732 c = '.';
2733 }
2734 printf("%c", c);
2735
2736 c = UTEXT_NEXT32(fInputText);
2737 }
2738 printf("\n");
2739 printf("\n");
2740 }
2741 #endif
2742
2743 if (U_FAILURE(status)) {
2744 return;
2745 }
2746
2747 // Cache frequently referenced items from the compiled pattern
2748 //
2749 int64_t *pat = fPattern->fCompiledPat->getBuffer();
2750
2751 const UChar *litText = fPattern->fLiteralText.getBuffer();
2752 UVector *sets = fPattern->fSets;
2753
2754 fFrameSize = fPattern->fFrameSize;
2755 REStackFrame *fp = resetStack();
2756
2757 fp->fPatIdx = 0;
2758 fp->fInputIdx = startIdx;
2759
2760 // Zero out the pattern's static data
2761 int32_t i;
2762 for (i = 0; i<fPattern->fDataSize; i++) {
2763 fData[i] = 0;
2764 }
2765
2766 //
2767 // Main loop for interpreting the compiled pattern.
2768 // One iteration of the loop per pattern operation performed.
2769 //
2770 for (;;) {
2771 #if 0
2772 if (_heapchk() != _HEAPOK) {
2773 fprintf(stderr, "Heap Trouble\n");
2774 }
2775 #endif
2776
2777 op = (int32_t)pat[fp->fPatIdx];
2778 opType = URX_TYPE(op);
2779 opValue = URX_VAL(op);
2780 #ifdef REGEX_RUN_DEBUG
2781 if (fTraceDebug) {
2782 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2783 printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx,
2784 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2785 fPattern->dumpOp(fp->fPatIdx);
2786 }
2787 #endif
2788 fp->fPatIdx++;
2789
2790 switch (opType) {
2791
2792
2793 case URX_NOP:
2794 break;
2795
2796
2797 case URX_BACKTRACK:
2798 // Force a backtrack. In some circumstances, the pattern compiler
2799 // will notice that the pattern can't possibly match anything, and will
2800 // emit one of these at that point.
2801 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2802 break;
2803
2804
2805 case URX_ONECHAR:
2806 if (fp->fInputIdx < fActiveLimit) {
2807 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2808 UChar32 c = UTEXT_NEXT32(fInputText);
2809 if (c == opValue) {
2810 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2811 break;
2812 }
2813 } else {
2814 fHitEnd = TRUE;
2815 }
2816
2817 #ifdef REGEX_SMART_BACKTRACKING
2818 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
2819 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
2820 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
2821 UBool success = FALSE;
2822 UChar32 c = UTEXT_PREVIOUS32(fInputText);
2823 while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) {
2824 if (c == opValue) {
2825 success = TRUE;
2826 break;
2827 } else if (c == U_SENTINEL) {
2828 break;
2829 }
2830 c = UTEXT_PREVIOUS32(fInputText);
2831 }
2832 if (success) {
2833 fHitEnd = FALSE;
2834 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2835 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2836 if (fp->fInputIdx > backSearchIndex) {
2837 fp = StateSave(fp, fp->fPatIdx, status);
2838 }
2839 fp->fPatIdx++; // Skip the LOOP_C, we just did that
2840 break;
2841 }
2842 }
2843 }
2844 #endif
2845
2846 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2847 break;
2848
2849
2850 case URX_STRING:
2851 {
2852 // Test input against a literal string.
2853 // Strings require two slots in the compiled pattern, one for the
2854 // offset to the string text, and one for the length.
2855 int32_t stringStartIdx = opValue;
2856 int32_t stringLen;
2857
2858 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
2859 fp->fPatIdx++;
2860 opType = URX_TYPE(op);
2861 stringLen = URX_VAL(op);
2862 U_ASSERT(opType == URX_STRING_LEN);
2863 U_ASSERT(stringLen >= 2);
2864
2865 const UChar *patternChars = litText+stringStartIdx;
2866 const UChar *patternEnd = patternChars+stringLen;
2867
2868 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2869 UChar32 c;
2870 UBool success = TRUE;
2871
2872 while (patternChars < patternEnd && success) {
2873 c = UTEXT_NEXT32(fInputText);
2874
2875 if (c != U_SENTINEL && UTEXT_GETNATIVEINDEX(fInputText) <= fActiveLimit) {
2876 if (U_IS_BMP(c)) {
2877 success = (*patternChars == c);
2878 patternChars += 1;
2879 } else if (patternChars+1 < patternEnd) {
2880 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
2881 patternChars += 2;
2882 }
2883 } else {
2884 success = FALSE;
2885 fHitEnd = TRUE; // TODO: See ticket 6074
2886 }
2887 }
2888
2889 if (success) {
2890 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2891 } else {
2892 #ifdef REGEX_SMART_BACKTRACKING
2893 if (fp->fInputIdx > backSearchIndex && fStack->size()) {
2894 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
2895 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
2896 // Reset to last start point
2897 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2898 patternChars = litText+stringStartIdx;
2899
2900 // Search backwards for a possible start
2901 do {
2902 c = UTEXT_PREVIOUS32(fInputText);
2903 if (c == U_SENTINEL) {
2904 break;
2905 } else if ((U_IS_BMP(c) && *patternChars == c) ||
2906 (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
2907 success = TRUE;
2908 break;
2909 }
2910 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
2911
2912 // And try again
2913 if (success) {
2914 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2915 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2916 if (fp->fInputIdx > backSearchIndex) {
2917 fp = StateSave(fp, fp->fPatIdx, status);
2918 }
2919 fp->fPatIdx++; // Skip the LOOP_C, we just did that
2920 break;
2921 }
2922 }
2923 }
2924 #endif
2925 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2926 }
2927 }
2928 break;
2929
2930
2931 case URX_STATE_SAVE:
2932 fp = StateSave(fp, opValue, status);
2933 break;
2934
2935
2936 case URX_END:
2937 // The match loop will exit via this path on a successful match,
2938 // when we reach the end of the pattern.
2939 if (toEnd && fp->fInputIdx != fActiveLimit) {
2940 // The pattern matched, but not to the end of input. Try some more.
2941 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2942 break;
2943 }
2944 isMatch = TRUE;
2945 goto breakFromLoop;
2946
2947 // Start and End Capture stack frame variables are laid out out like this:
2948 // fp->fExtra[opValue] - The start of a completed capture group
2949 // opValue+1 - The end of a completed capture group
2950 // opValue+2 - the start of a capture group whose end
2951 // has not yet been reached (and might not ever be).
2952 case URX_START_CAPTURE:
2953 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2954 fp->fExtra[opValue+2] = fp->fInputIdx;
2955 break;
2956
2957
2958 case URX_END_CAPTURE:
2959 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2960 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
2961 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
2962 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
2963 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2964 break;
2965
2966
2967 case URX_DOLLAR: // $, test for End of line
2968 // or for position before new line at end of input
2969 {
2970 if (fp->fInputIdx >= fAnchorLimit) {
2971 // We really are at the end of input. Success.
2972 fHitEnd = TRUE;
2973 fRequireEnd = TRUE;
2974 break;
2975 }
2976
2977 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2978
2979 // If we are positioned just before a new-line that is located at the
2980 // end of input, succeed.
2981 UChar32 c = UTEXT_NEXT32(fInputText);
2982 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2983 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
2984 // If not in the middle of a CR/LF sequence
2985 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && (UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
2986 // At new-line at end of input. Success
2987 fHitEnd = TRUE;
2988 fRequireEnd = TRUE;
2989
2990 break;
2991 }
2992 }
2993 } else {
2994 UChar32 nextC = UTEXT_NEXT32(fInputText);
2995 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2996 fHitEnd = TRUE;
2997 fRequireEnd = TRUE;
2998 break; // At CR/LF at end of input. Success
2999 }
3000 }
3001
3002 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3003 }
3004 break;
3005
3006
3007 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
3008 if (fp->fInputIdx >= fAnchorLimit) {
3009 // Off the end of input. Success.
3010 fHitEnd = TRUE;
3011 fRequireEnd = TRUE;
3012 break;
3013 } else {
3014 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3015 UChar32 c = UTEXT_NEXT32(fInputText);
3016 // Either at the last character of input, or off the end.
3017 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
3018 fHitEnd = TRUE;
3019 fRequireEnd = TRUE;
3020 break;
3021 }
3022 }
3023
3024 // Not at end of input. Back-track out.
3025 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3026 break;
3027
3028
3029 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
3030 {
3031 if (fp->fInputIdx >= fAnchorLimit) {
3032 // We really are at the end of input. Success.
3033 fHitEnd = TRUE;
3034 fRequireEnd = TRUE;
3035 break;
3036 }
3037 // If we are positioned just before a new-line, succeed.
3038 // It makes no difference where the new-line is within the input.
3039 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3040 UChar32 c = UTEXT_CURRENT32(fInputText);
3041 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
3042 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
3043 // In multi-line mode, hitting a new-line just before the end of input does not
3044 // set the hitEnd or requireEnd flags
3045 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3046 break;
3047 }
3048 }
3049 // not at a new line. Fail.
3050 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3051 }
3052 break;
3053
3054
3055 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
3056 {
3057 if (fp->fInputIdx >= fAnchorLimit) {
3058 // We really are at the end of input. Success.
3059 fHitEnd = TRUE;
3060 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
3061 break; // adding a new-line would not lose the match.
3062 }
3063 // If we are not positioned just before a new-line, the test fails; backtrack out.
3064 // It makes no difference where the new-line is within the input.
3065 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3066 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3067 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3068 }
3069 }
3070 break;
3071
3072
3073 case URX_CARET: // ^, test for start of line
3074 if (fp->fInputIdx != fAnchorStart) {
3075 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3076 }
3077 break;
3078
3079
3080 case URX_CARET_M: // ^, test for start of line in mulit-line mode
3081 {
3082 if (fp->fInputIdx == fAnchorStart) {
3083 // We are at the start input. Success.
3084 break;
3085 }
3086 // Check whether character just before the current pos is a new-line
3087 // unless we are at the end of input
3088 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3089 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3090 if ((fp->fInputIdx < fAnchorLimit) &&
3091 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3092 // It's a new-line. ^ is true. Success.
3093 // TODO: what should be done with positions between a CR and LF?
3094 break;
3095 }
3096 // Not at the start of a line. Fail.
3097 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3098 }
3099 break;
3100
3101
3102 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
3103 {
3104 U_ASSERT(fp->fInputIdx >= fAnchorStart);
3105 if (fp->fInputIdx <= fAnchorStart) {
3106 // We are at the start input. Success.
3107 break;
3108 }
3109 // Check whether character just before the current pos is a new-line
3110 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3111 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3112 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3113 if (c != 0x0a) {
3114 // Not at the start of a line. Back-track out.
3115 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3116 }
3117 }
3118 break;
3119
3120 case URX_BACKSLASH_B: // Test for word boundaries
3121 {
3122 UBool success = isWordBoundary(fp->fInputIdx);
3123 success ^= (opValue != 0); // flip sense for \B
3124 if (!success) {
3125 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3126 }
3127 }
3128 break;
3129
3130
3131 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
3132 {
3133 UBool success = isUWordBoundary(fp->fInputIdx);
3134 success ^= (opValue != 0); // flip sense for \B
3135 if (!success) {
3136 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3137 }
3138 }
3139 break;
3140
3141
3142 case URX_BACKSLASH_D: // Test for decimal digit
3143 {
3144 if (fp->fInputIdx >= fActiveLimit) {
3145 fHitEnd = TRUE;
3146 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3147 break;
3148 }
3149
3150 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3151
3152 UChar32 c = UTEXT_NEXT32(fInputText);
3153 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
3154 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3155 success ^= (opValue != 0); // flip sense for \D
3156 if (success) {
3157 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3158 } else {
3159 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3160 }
3161 }
3162 break;
3163
3164
3165 case URX_BACKSLASH_G: // Test for position at end of previous match
3166 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3167 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3168 }
3169 break;
3170
3171
3172 case URX_BACKSLASH_X:
3173 // Match a Grapheme, as defined by Unicode TR 29.
3174 // Differs slightly from Perl, which consumes combining marks independently
3175 // of context.
3176 {
3177
3178 // Fail if at end of input
3179 if (fp->fInputIdx >= fActiveLimit) {
3180 fHitEnd = TRUE;
3181 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3182 break;
3183 }
3184
3185 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3186
3187 // Examine (and consume) the current char.
3188 // Dispatch into a little state machine, based on the char.
3189 UChar32 c;
3190 c = UTEXT_NEXT32(fInputText);
3191 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3192 UnicodeSet **sets = fPattern->fStaticSets;
3193 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
3194 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
3195 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3196 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3197 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3198 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3199 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3200 goto GC_Extend;
3201
3202
3203
3204 GC_L:
3205 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3206 c = UTEXT_NEXT32(fInputText);
3207 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3208 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3209 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3210 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3211 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3212 UTEXT_PREVIOUS32(fInputText);
3213 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3214 goto GC_Extend;
3215
3216 GC_V:
3217 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3218 c = UTEXT_NEXT32(fInputText);
3219 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3220 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3221 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3222 UTEXT_PREVIOUS32(fInputText);
3223 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3224 goto GC_Extend;
3225
3226 GC_T:
3227 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3228 c = UTEXT_NEXT32(fInputText);
3229 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3230 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3231 UTEXT_PREVIOUS32(fInputText);
3232 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3233 goto GC_Extend;
3234
3235 GC_Extend:
3236 // Combining characters are consumed here
3237 for (;;) {
3238 if (fp->fInputIdx >= fActiveLimit) {
3239 break;
3240 }
3241 c = UTEXT_CURRENT32(fInputText);
3242 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3243 break;
3244 }
3245 UTEXT_NEXT32(fInputText);
3246 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3247 }
3248 goto GC_Done;
3249
3250 GC_Control:
3251 // Most control chars stand alone (don't combine with combining chars),
3252 // except for that CR/LF sequence is a single grapheme cluster.
3253 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
3254 c = UTEXT_NEXT32(fInputText);
3255 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3256 }
3257
3258 GC_Done:
3259 if (fp->fInputIdx >= fActiveLimit) {
3260 fHitEnd = TRUE;
3261 }
3262 break;
3263 }
3264
3265
3266
3267
3268 case URX_BACKSLASH_Z: // Test for end of Input
3269 if (fp->fInputIdx < fAnchorLimit) {
3270 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3271 } else {
3272 fHitEnd = TRUE;
3273 fRequireEnd = TRUE;
3274 }
3275 break;
3276
3277
3278
3279 case URX_STATIC_SETREF:
3280 {
3281 // Test input character against one of the predefined sets
3282 // (Word Characters, for example)
3283 // The high bit of the op value is a flag for the match polarity.
3284 // 0: success if input char is in set.
3285 // 1: success if input char is not in set.
3286 if (fp->fInputIdx >= fActiveLimit) {
3287 fHitEnd = TRUE;
3288 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3289 break;
3290 }
3291
3292 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3293 opValue &= ~URX_NEG_SET;
3294 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3295
3296 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3297 UChar32 c = UTEXT_NEXT32(fInputText);
3298 if (c < 256) {
3299 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3300 if (s8->contains(c)) {
3301 success = !success;
3302 }
3303 } else {
3304 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3305 if (s->contains(c)) {
3306 success = !success;
3307 }
3308 }
3309 if (success) {
3310 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3311 } else {
3312 // the character wasn't in the set.
3313 #ifdef REGEX_SMART_BACKTRACKING
3314 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3315 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3316 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3317 // Try to find it, backwards
3318 UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
3319 success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset
3320 do {
3321 c = UTEXT_PREVIOUS32(fInputText);
3322 if (c == U_SENTINEL) {
3323 break;
3324 } else if (c < 256) {
3325 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3326 if (s8->contains(c)) {
3327 success = !success;
3328 }
3329 } else {
3330 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3331 if (s->contains(c)) {
3332 success = !success;
3333 }
3334 }
3335 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex && !success);
3336
3337 if (success && c != U_SENTINEL) {
3338 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3339 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3340 if (fp->fInputIdx > backSearchIndex) {
3341 fp = StateSave(fp, fp->fPatIdx, status);
3342 }
3343 fp->fPatIdx++; // Skip the LOOP_C, we just did that
3344 break;
3345 }
3346 }
3347 }
3348 #endif
3349 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3350 }
3351 }
3352 break;
3353
3354
3355 case URX_STAT_SETREF_N:
3356 {
3357 // Test input character for NOT being a member of one of
3358 // the predefined sets (Word Characters, for example)
3359 if (fp->fInputIdx >= fActiveLimit) {
3360 fHitEnd = TRUE;
3361 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3362 break;
3363 }
3364
3365 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3366
3367 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3368
3369 UChar32 c = UTEXT_NEXT32(fInputText);
3370 if (c < 256) {
3371 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3372 if (s8->contains(c) == FALSE) {
3373 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3374 break;
3375 }
3376 } else {
3377 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3378 if (s->contains(c) == FALSE) {
3379 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3380 break;
3381 }
3382 }
3383 // the character wasn't in the set.
3384 #ifdef REGEX_SMART_BACKTRACKING
3385 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3386 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3387 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3388 // Try to find it, backwards
3389 UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
3390 UBool success = FALSE;
3391 do {
3392 c = UTEXT_PREVIOUS32(fInputText);
3393 if (c == U_SENTINEL) {
3394 break;
3395 } else if (c < 256) {
3396 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3397 if (s8->contains(c) == FALSE) {
3398 success = TRUE;
3399 break;
3400 }
3401 } else {
3402 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3403 if (s->contains(c) == FALSE) {
3404 success = TRUE;
3405 break;
3406 }
3407 }
3408 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
3409
3410 if (success) {
3411 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3412 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3413 if (fp->fInputIdx > backSearchIndex) {
3414 fp = StateSave(fp, fp->fPatIdx, status);
3415 }
3416 fp->fPatIdx++; // Skip the LOOP_C, we just did that
3417 break;
3418 }
3419 }
3420 }
3421 #endif
3422 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3423 }
3424 break;
3425
3426
3427 case URX_SETREF:
3428 if (fp->fInputIdx >= fActiveLimit) {
3429 fHitEnd = TRUE;
3430 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3431 break;
3432 } else {
3433 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3434
3435 // There is input left. Pick up one char and test it for set membership.
3436 UChar32 c = UTEXT_NEXT32(fInputText);
3437 U_ASSERT(opValue > 0 && opValue < sets->size());
3438 if (c<256) {
3439 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3440 if (s8->contains(c)) {
3441 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3442 break;
3443 }
3444 } else {
3445 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3446 if (s->contains(c)) {
3447 // The character is in the set. A Match.
3448 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3449 break;
3450 }
3451 }
3452
3453 // the character wasn't in the set.
3454 #ifdef REGEX_SMART_BACKTRACKING
3455 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3456 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3457 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3458 // Try to find it, backwards
3459 UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
3460 UBool success = FALSE;
3461 do {
3462 c = UTEXT_PREVIOUS32(fInputText);
3463 if (c == U_SENTINEL) {
3464 break;
3465 } else if (c < 256) {
3466 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3467 if (s8->contains(c)) {
3468 success = TRUE;
3469 break;
3470 }
3471 } else {
3472 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3473 if (s->contains(c)) {
3474 success = TRUE;
3475 break;
3476 }
3477 }
3478 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
3479
3480 if (success) {
3481 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3482 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3483 if (fp->fInputIdx > backSearchIndex) {
3484 fp = StateSave(fp, fp->fPatIdx, status);
3485 }
3486 fp->fPatIdx++; // Skip the LOOP_C, we just did that
3487 break;
3488 }
3489 }
3490 }
3491 #endif
3492 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3493 }
3494 break;
3495
3496
3497 case URX_DOTANY:
3498 {
3499 // . matches anything, but stops at end-of-line.
3500 if (fp->fInputIdx >= fActiveLimit) {
3501 // At end of input. Match failed. Backtrack out.
3502 fHitEnd = TRUE;
3503 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3504 break;
3505 }
3506
3507 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3508
3509 // There is input left. Advance over one char, unless we've hit end-of-line
3510 UChar32 c = UTEXT_NEXT32(fInputText);
3511 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
3512 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3513 // End of line in normal mode. . does not match.
3514 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3515 break;
3516 }
3517 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3518 }
3519 break;
3520
3521
3522 case URX_DOTANY_ALL:
3523 {
3524 // ., in dot-matches-all (including new lines) mode
3525 if (fp->fInputIdx >= fActiveLimit) {
3526 // At end of input. Match failed. Backtrack out.
3527 fHitEnd = TRUE;
3528 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3529 break;
3530 }
3531
3532 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3533
3534 // There is input left. Advance over one char, except if we are
3535 // at a cr/lf, advance over both of them.
3536 UChar32 c;
3537 c = UTEXT_NEXT32(fInputText);
3538 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3539 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3540 // In the case of a CR/LF, we need to advance over both.
3541 UChar32 nextc = UTEXT_CURRENT32(fInputText);
3542 if (nextc == 0x0a) {
3543 UTEXT_NEXT32(fInputText);
3544 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3545 }
3546 }
3547 }
3548 break;
3549
3550
3551 case URX_DOTANY_UNIX:
3552 {
3553 // '.' operator, matches all, but stops at end-of-line.
3554 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
3555 if (fp->fInputIdx >= fActiveLimit) {
3556 // At end of input. Match failed. Backtrack out.
3557 fHitEnd = TRUE;
3558 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3559 break;
3560 }
3561
3562 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3563
3564 // There is input left. Advance over one char, unless we've hit end-of-line
3565 UChar32 c = UTEXT_NEXT32(fInputText);
3566 if (c == 0x0a) {
3567 // End of line in normal mode. '.' does not match the \n
3568 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3569 } else {
3570 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3571 }
3572 }
3573 break;
3574
3575
3576 case URX_JMP:
3577 fp->fPatIdx = opValue;
3578 break;
3579
3580 case URX_FAIL:
3581 isMatch = FALSE;
3582 goto breakFromLoop;
3583
3584 case URX_JMP_SAV:
3585 U_ASSERT(opValue < fPattern->fCompiledPat->size());
3586 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3587 fp->fPatIdx = opValue; // Then JMP.
3588 break;
3589
3590 case URX_JMP_SAV_X:
3591 // This opcode is used with (x)+, when x can match a zero length string.
3592 // Same as JMP_SAV, except conditional on the match having made forward progress.
3593 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3594 // data address of the input position at the start of the loop.
3595 {
3596 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3597 int32_t stoOp = (int32_t)pat[opValue-1];
3598 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3599 int32_t frameLoc = URX_VAL(stoOp);
3600 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3601 int64_t prevInputIdx = fp->fExtra[frameLoc];
3602 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3603 if (prevInputIdx < fp->fInputIdx) {
3604 // The match did make progress. Repeat the loop.
3605 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3606 fp->fPatIdx = opValue;
3607 fp->fExtra[frameLoc] = fp->fInputIdx;
3608 }
3609 // If the input position did not advance, we do nothing here,
3610 // execution will fall out of the loop.
3611 }
3612 break;
3613
3614 case URX_CTR_INIT:
3615 {
3616 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3617 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3618
3619 // Pick up the three extra operands that CTR_INIT has, and
3620 // skip the pattern location counter past
3621 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3622 fp->fPatIdx += 3;
3623 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3624 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3625 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3626 U_ASSERT(minCount>=0);
3627 U_ASSERT(maxCount>=minCount || maxCount==-1);
3628 U_ASSERT(loopLoc>fp->fPatIdx);
3629
3630 if (minCount == 0) {
3631 fp = StateSave(fp, loopLoc+1, status);
3632 }
3633 if (maxCount == 0) {
3634 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3635 }
3636 }
3637 break;
3638
3639 case URX_CTR_LOOP:
3640 {
3641 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3642 int32_t initOp = (int32_t)pat[opValue];
3643 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3644 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3645 int32_t minCount = (int32_t)pat[opValue+2];
3646 int32_t maxCount = (int32_t)pat[opValue+3];
3647 // Increment the counter. Note: we DIDN'T worry about counter
3648 // overflow, since the data comes from UnicodeStrings, which
3649 // stores its length in an int32_t. Do we have to think about
3650 // this now that we're using UText? Probably not, since the length
3651 // in UChar32s is still an int32_t.
3652 (*pCounter)++;
3653 U_ASSERT(*pCounter > 0);
3654 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
3655 U_ASSERT(*pCounter == maxCount || maxCount == -1);
3656 break;
3657 }
3658 if (*pCounter >= minCount) {
3659 fp = StateSave(fp, fp->fPatIdx, status);
3660 }
3661 fp->fPatIdx = opValue + 4; // Loop back.
3662 }
3663 break;
3664
3665 case URX_CTR_INIT_NG:
3666 {
3667 // Initialize a non-greedy loop
3668 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3669 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3670
3671 // Pick up the three extra operands that CTR_INIT has, and
3672 // skip the pattern location counter past
3673 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3674 fp->fPatIdx += 3;
3675 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3676 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3677 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3678 U_ASSERT(minCount>=0);
3679 U_ASSERT(maxCount>=minCount || maxCount==-1);
3680 U_ASSERT(loopLoc>fp->fPatIdx);
3681
3682 if (minCount == 0) {
3683 if (maxCount != 0) {
3684 fp = StateSave(fp, fp->fPatIdx, status);
3685 }
3686 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
3687 }
3688 }
3689 break;
3690
3691 case URX_CTR_LOOP_NG:
3692 {
3693 // Non-greedy {min, max} loops
3694 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3695 int32_t initOp = (int32_t)pat[opValue];
3696 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3697 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3698 int32_t minCount = (int32_t)pat[opValue+2];
3699 int32_t maxCount = (int32_t)pat[opValue+3];
3700 // Increment the counter. Note: we DIDN'T worry about counter
3701 // overflow, since the data comes from UnicodeStrings, which
3702 // stores its length in an int32_t. Do we have to think about
3703 // this now that we're using UText? Probably not, since the length
3704 // in UChar32s is still an int32_t.
3705 (*pCounter)++;
3706 U_ASSERT(*pCounter > 0);
3707
3708 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
3709 // The loop has matched the maximum permitted number of times.
3710 // Break out of here with no action. Matching will
3711 // continue with the following pattern.
3712 U_ASSERT(*pCounter == maxCount || maxCount == -1);
3713 break;
3714 }
3715
3716 if (*pCounter < minCount) {
3717 // We haven't met the minimum number of matches yet.
3718 // Loop back for another one.
3719 fp->fPatIdx = opValue + 4; // Loop back.
3720 } else {
3721 // We do have the minimum number of matches.
3722 // Fall into the following pattern, but first do
3723 // a state save to the top of the loop, so that a failure
3724 // in the following pattern will try another iteration of the loop.
3725 fp = StateSave(fp, opValue + 4, status);
3726 }
3727 }
3728 break;
3729
3730 case URX_STO_SP:
3731 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3732 fData[opValue] = fStack->size();
3733 break;
3734
3735 case URX_LD_SP:
3736 {
3737 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3738 int32_t newStackSize = (int32_t)fData[opValue];
3739 U_ASSERT(newStackSize <= fStack->size());
3740 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3741 if (newFP == (int64_t *)fp) {
3742 break;
3743 }
3744 int32_t i;
3745 for (i=0; i<fFrameSize; i++) {
3746 newFP[i] = ((int64_t *)fp)[i];
3747 }
3748 fp = (REStackFrame *)newFP;
3749 fStack->setSize(newStackSize);
3750 }
3751 break;
3752
3753 case URX_BACKREF:
3754 case URX_BACKREF_I:
3755 {
3756 U_ASSERT(opValue < fFrameSize);
3757 int64_t groupStartIdx = fp->fExtra[opValue];
3758 int64_t groupEndIdx = fp->fExtra[opValue+1];
3759 U_ASSERT(groupStartIdx <= groupEndIdx);
3760 if (groupStartIdx < 0) {
3761 // This capture group has not participated in the match thus far,
3762 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3763 }
3764
3765 if (groupEndIdx == groupStartIdx) {
3766 // The capture group match was of an empty string.
3767 // Verified by testing: Perl matches succeed in this case, so
3768 // we do too.
3769 break;
3770 }
3771
3772 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3773 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3774
3775 UBool haveMatch = (opType == URX_BACKREF ?
3776 (0 == utext_compareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1)) :
3777 (0 == utext_caseCompareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1, U_FOLD_CASE_DEFAULT, &status)));
3778 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3779
3780 if (fp->fInputIdx > fActiveLimit) {
3781 fHitEnd = TRUE;
3782 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3783 } else if (!haveMatch) {
3784 if (fp->fInputIdx == fActiveLimit) {
3785 fHitEnd = TRUE;
3786 }
3787 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3788 }
3789 }
3790 break;
3791
3792 case URX_STO_INP_LOC:
3793 {
3794 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3795 fp->fExtra[opValue] = fp->fInputIdx;
3796 }
3797 break;
3798
3799 case URX_JMPX:
3800 {
3801 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3802 fp->fPatIdx += 1;
3803 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
3804 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3805 int64_t savedInputIdx = fp->fExtra[dataLoc];
3806 U_ASSERT(savedInputIdx <= fp->fInputIdx);
3807 if (savedInputIdx < fp->fInputIdx) {
3808 fp->fPatIdx = opValue; // JMP
3809 } else {
3810 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
3811 }
3812 }
3813 break;
3814
3815 case URX_LA_START:
3816 {
3817 // Entering a lookahead block.
3818 // Save Stack Ptr, Input Pos.
3819 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3820 fData[opValue] = fStack->size();
3821 fData[opValue+1] = fp->fInputIdx;
3822 fActiveStart = fLookStart; // Set the match region change for
3823 fActiveLimit = fLookLimit; // transparent bounds.
3824 }
3825 break;
3826
3827 case URX_LA_END:
3828 {
3829 // Leaving a look-ahead block.
3830 // restore Stack Ptr, Input Pos to positions they had on entry to block.
3831 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3832 int32_t stackSize = fStack->size();
3833 int32_t newStackSize =(int32_t)fData[opValue];
3834 U_ASSERT(stackSize >= newStackSize);
3835 if (stackSize > newStackSize) {
3836 // Copy the current top frame back to the new (cut back) top frame.
3837 // This makes the capture groups from within the look-ahead
3838 // expression available.
3839 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3840 int32_t i;
3841 for (i=0; i<fFrameSize; i++) {
3842 newFP[i] = ((int64_t *)fp)[i];
3843 }
3844 fp = (REStackFrame *)newFP;
3845 fStack->setSize(newStackSize);
3846 }
3847 fp->fInputIdx = fData[opValue+1];
3848
3849 // Restore the active region bounds in the input string; they may have
3850 // been changed because of transparent bounds on a Region.
3851 fActiveStart = fRegionStart;
3852 fActiveLimit = fRegionLimit;
3853 }
3854 break;
3855
3856 case URX_ONECHAR_I:
3857 if (fp->fInputIdx < fActiveLimit) {
3858 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3859
3860 UChar32 c = UTEXT_NEXT32(fInputText);
3861 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3862 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3863 break;
3864 }
3865 } else {
3866 fHitEnd = TRUE;
3867 }
3868
3869 #ifdef REGEX_SMART_BACKTRACKING
3870 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3871 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3872 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3873 UBool success = FALSE;
3874 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3875 while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) {
3876 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3877 success = TRUE;
3878 break;
3879 } else if (c == U_SENTINEL) {
3880 break;
3881 }
3882 c = UTEXT_PREVIOUS32(fInputText);
3883 }
3884 if (success) {
3885 fHitEnd = FALSE;
3886 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3887 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3888 if (fp->fInputIdx > backSearchIndex) {
3889 fp = StateSave(fp, fp->fPatIdx, status);
3890 }
3891 fp->fPatIdx++; // Skip the LOOP_C, we just did that
3892 break;
3893 }
3894 }
3895 }
3896 #endif
3897
3898 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3899 break;
3900
3901 case URX_STRING_I:
3902 {
3903 // Test input against a literal string.
3904 // Strings require two slots in the compiled pattern, one for the
3905 // offset to the string text, and one for the length.
3906 const UCaseProps *csp = ucase_getSingleton();
3907 {
3908 int32_t stringStartIdx, stringLen;
3909 stringStartIdx = opValue;
3910
3911 op = (int32_t)pat[fp->fPatIdx];
3912 fp->fPatIdx++;
3913 opType = URX_TYPE(op);
3914 opValue = URX_VAL(op);
3915 U_ASSERT(opType == URX_STRING_LEN);
3916 stringLen = opValue;
3917
3918 const UChar *patternChars = litText+stringStartIdx;
3919 const UChar *patternEnd = patternChars+stringLen;
3920
3921 const UChar *foldChars = NULL;
3922 int32_t foldOffset, foldLength;
3923 UChar32 c;
3924
3925 foldOffset = foldLength = 0;
3926 UBool success = TRUE;
3927
3928 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3929 while (patternChars < patternEnd && success) {
3930 if(foldOffset < foldLength) {
3931 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
3932 } else {
3933 c = UTEXT_NEXT32(fInputText);
3934 if (c != U_SENTINEL) {
3935 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
3936 if(foldLength >= 0) {
3937 if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings
3938 foldOffset = 0;
3939 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
3940 } else {
3941 c = foldLength;
3942 foldLength = foldOffset; // to avoid reading chars from the folding buffer
3943 }
3944 }
3945 }
3946
3947 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3948 }
3949
3950 success = FALSE;
3951 if (c != U_SENTINEL && (fp->fInputIdx <= fActiveLimit)) {
3952 if (U_IS_BMP(c)) {
3953 success = (*patternChars == c);
3954 patternChars += 1;
3955 } else if (patternChars+1 < patternEnd) {
3956 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
3957 patternChars += 2;
3958 }
3959 } else {
3960 fHitEnd = TRUE; // TODO: See ticket 6074
3961 }
3962 }
3963
3964 if (!success) {
3965 #ifdef REGEX_SMART_BACKTRACKING
3966 if (fp->fInputIdx > backSearchIndex && fStack->size()) {
3967 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3968 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3969 // Reset to last start point
3970 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3971 patternChars = litText+stringStartIdx;
3972
3973 // Search backwards for a possible start
3974 do {
3975 c = UTEXT_PREVIOUS32(fInputText);
3976 if (c == U_SENTINEL) {
3977 break;
3978 } else {
3979 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
3980 if(foldLength >= 0) {
3981 if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings
3982 foldOffset = 0;
3983 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
3984 } else {
3985 c = foldLength;
3986 foldLength = foldOffset; // to avoid reading chars from the folding buffer
3987 }
3988 }
3989
3990 if ((U_IS_BMP(c) && *patternChars == c) ||
3991 (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
3992 success = TRUE;
3993 break;
3994 }
3995 }
3996 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
3997
3998 // And try again
3999 if (success) {
4000 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4001 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4002 if (fp->fInputIdx > backSearchIndex) {
4003 fp = StateSave(fp, fp->fPatIdx, status);
4004 }
4005 fp->fPatIdx++; // Skip the LOOP_C, we just did that
4006 break;
4007 }
4008 }
4009 }
4010 #endif
4011 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4012 }
4013 }
4014 }
4015 break;
4016
4017 case URX_LB_START:
4018 {
4019 // Entering a look-behind block.
4020 // Save Stack Ptr, Input Pos.
4021 // TODO: implement transparent bounds. Ticket #6067
4022 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4023 fData[opValue] = fStack->size();
4024 fData[opValue+1] = fp->fInputIdx;
4025 // Init the variable containing the start index for attempted matches.
4026 fData[opValue+2] = -1;
4027 // Save input string length, then reset to pin any matches to end at
4028 // the current position.
4029 fData[opValue+3] = fActiveLimit;
4030 fActiveLimit = fp->fInputIdx;
4031 }
4032 break;
4033
4034
4035 case URX_LB_CONT:
4036 {
4037 // Positive Look-Behind, at top of loop checking for matches of LB expression
4038 // at all possible input starting positions.
4039
4040 // Fetch the min and max possible match lengths. They are the operands
4041 // of this op in the pattern.
4042 int32_t minML = (int32_t)pat[fp->fPatIdx++];
4043 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
4044 U_ASSERT(minML <= maxML);
4045 U_ASSERT(minML >= 0);
4046
4047 // Fetch (from data) the last input index where a match was attempted.
4048 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4049 int64_t *lbStartIdx = &fData[opValue+2];
4050 if (*lbStartIdx < 0) {
4051 // First time through loop.
4052 *lbStartIdx = fp->fInputIdx - minML;
4053 } else {
4054 // 2nd through nth time through the loop.
4055 // Back up start position for match by one.
4056 if (*lbStartIdx == 0) {
4057 (*lbStartIdx)--;
4058 } else {
4059 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
4060 UTEXT_PREVIOUS32(fInputText);
4061 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4062 }
4063 }
4064
4065 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
4066 // We have tried all potential match starting points without
4067 // getting a match. Backtrack out, and out of the
4068 // Look Behind altogether.
4069 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4070 int64_t restoreInputLen = fData[opValue+3];
4071 U_ASSERT(restoreInputLen >= fActiveLimit);
4072 U_ASSERT(restoreInputLen <= fInputLength);
4073 fActiveLimit = restoreInputLen;
4074 break;
4075 }
4076
4077 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4078 // (successful match will fall off the end of the loop.)
4079 fp = StateSave(fp, fp->fPatIdx-3, status);
4080 fp->fInputIdx = *lbStartIdx;
4081 }
4082 break;
4083
4084 case URX_LB_END:
4085 // End of a look-behind block, after a successful match.
4086 {
4087 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4088 if (fp->fInputIdx != fActiveLimit) {
4089 // The look-behind expression matched, but the match did not
4090 // extend all the way to the point that we are looking behind from.
4091 // FAIL out of here, which will take us back to the LB_CONT, which
4092 // will retry the match starting at another position or fail
4093 // the look-behind altogether, whichever is appropriate.
4094 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4095 break;
4096 }
4097
4098 // Look-behind match is good. Restore the orignal input string length,
4099 // which had been truncated to pin the end of the lookbehind match to the
4100 // position being looked-behind.
4101 int64_t originalInputLen = fData[opValue+3];
4102 U_ASSERT(originalInputLen >= fActiveLimit);
4103 U_ASSERT(originalInputLen <= fInputLength);
4104 fActiveLimit = originalInputLen;
4105 }
4106 break;
4107
4108
4109 case URX_LBN_CONT:
4110 {
4111 // Negative Look-Behind, at top of loop checking for matches of LB expression
4112 // at all possible input starting positions.
4113
4114 // Fetch the extra parameters of this op.
4115 int32_t minML = (int32_t)pat[fp->fPatIdx++];
4116 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
4117 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
4118 continueLoc = URX_VAL(continueLoc);
4119 U_ASSERT(minML <= maxML);
4120 U_ASSERT(minML >= 0);
4121 U_ASSERT(continueLoc > fp->fPatIdx);
4122
4123 // Fetch (from data) the last input index where a match was attempted.
4124 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4125 int64_t *lbStartIdx = &fData[opValue+2];
4126 if (*lbStartIdx < 0) {
4127 // First time through loop.
4128 *lbStartIdx = fp->fInputIdx - minML;
4129 } else {
4130 // 2nd through nth time through the loop.
4131 // Back up start position for match by one.
4132 if (*lbStartIdx == 0) {
4133 (*lbStartIdx)--;
4134 } else {
4135 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
4136 UTEXT_PREVIOUS32(fInputText);
4137 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4138 }
4139 }
4140
4141 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
4142 // We have tried all potential match starting points without
4143 // getting a match, which means that the negative lookbehind as
4144 // a whole has succeeded. Jump forward to the continue location
4145 int64_t restoreInputLen = fData[opValue+3];
4146 U_ASSERT(restoreInputLen >= fActiveLimit);
4147 U_ASSERT(restoreInputLen <= fInputLength);
4148 fActiveLimit = restoreInputLen;
4149 fp->fPatIdx = continueLoc;
4150 break;
4151 }
4152
4153 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4154 // (successful match will cause a FAIL out of the loop altogether.)
4155 fp = StateSave(fp, fp->fPatIdx-4, status);
4156 fp->fInputIdx = *lbStartIdx;
4157 }
4158 break;
4159
4160 case URX_LBN_END:
4161 // End of a negative look-behind block, after a successful match.
4162 {
4163 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4164 if (fp->fInputIdx != fActiveLimit) {
4165 // The look-behind expression matched, but the match did not
4166 // extend all the way to the point that we are looking behind from.
4167 // FAIL out of here, which will take us back to the LB_CONT, which
4168 // will retry the match starting at another position or succeed
4169 // the look-behind altogether, whichever is appropriate.
4170 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4171 break;
4172 }
4173
4174 // Look-behind expression matched, which means look-behind test as
4175 // a whole Fails
4176
4177 // Restore the orignal input string length, which had been truncated
4178 // inorder to pin the end of the lookbehind match
4179 // to the position being looked-behind.
4180 int64_t originalInputLen = fData[opValue+3];
4181 U_ASSERT(originalInputLen >= fActiveLimit);
4182 U_ASSERT(originalInputLen <= fInputLength);
4183 fActiveLimit = originalInputLen;
4184
4185 // Restore original stack position, discarding any state saved
4186 // by the successful pattern match.
4187 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4188 int32_t newStackSize = (int32_t)fData[opValue];
4189 U_ASSERT(fStack->size() > newStackSize);
4190 fStack->setSize(newStackSize);
4191
4192 // FAIL, which will take control back to someplace
4193 // prior to entering the look-behind test.
4194 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4195 }
4196 break;
4197
4198
4199 case URX_LOOP_SR_I:
4200 // Loop Initialization for the optimized implementation of
4201 // [some character set]*
4202 // This op scans through all matching input.
4203 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4204 {
4205 U_ASSERT(opValue > 0 && opValue < sets->size());
4206 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4207 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4208
4209 // Loop through input, until either the input is exhausted or
4210 // we reach a character that is not a member of the set.
4211 int64_t ix = fp->fInputIdx;
4212 UTEXT_SETNATIVEINDEX(fInputText, ix);
4213 for (;;) {
4214 if (ix >= fActiveLimit) {
4215 fHitEnd = TRUE;
4216 break;
4217 }
4218 UChar32 c = UTEXT_NEXT32(fInputText);
4219 if (c<256) {
4220 if (s8->contains(c) == FALSE) {
4221 break;
4222 }
4223 } else {
4224 if (s->contains(c) == FALSE) {
4225 break;
4226 }
4227 }
4228 ix = UTEXT_GETNATIVEINDEX(fInputText);
4229 }
4230
4231 // If there were no matching characters, skip over the loop altogether.
4232 // The loop doesn't run at all, a * op always succeeds.
4233 if (ix == fp->fInputIdx) {
4234 fp->fPatIdx++; // skip the URX_LOOP_C op.
4235 break;
4236 }
4237
4238 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4239 // must follow. It's operand is the stack location
4240 // that holds the starting input index for the match of this [set]*
4241 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4242 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4243 int32_t stackLoc = URX_VAL(loopcOp);
4244 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4245 fp->fExtra[stackLoc] = fp->fInputIdx;
4246 #ifdef REGEX_SMART_BACKTRACKING
4247 backSearchIndex = fp->fInputIdx;
4248 #endif
4249 fp->fInputIdx = ix;
4250
4251 // Save State to the URX_LOOP_C op that follows this one,
4252 // so that match failures in the following code will return to there.
4253 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4254 fp = StateSave(fp, fp->fPatIdx, status);
4255 fp->fPatIdx++;
4256 }
4257 break;
4258
4259
4260 case URX_LOOP_DOT_I:
4261 // Loop Initialization for the optimized implementation of .*
4262 // This op scans through all remaining input.
4263 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4264 {
4265 // Loop through input until the input is exhausted (we reach an end-of-line)
4266 // In DOTALL mode, we can just go straight to the end of the input.
4267 int64_t ix;
4268 if ((opValue & 1) == 1) {
4269 // Dot-matches-All mode. Jump straight to the end of the string.
4270 ix = fActiveLimit;
4271 fHitEnd = TRUE;
4272 } else {
4273 // NOT DOT ALL mode. Line endings do not match '.'
4274 // Scan forward until a line ending or end of input.
4275 ix = fp->fInputIdx;
4276 UTEXT_SETNATIVEINDEX(fInputText, ix);
4277 for (;;) {
4278 if (ix >= fActiveLimit) {
4279 fHitEnd = TRUE;
4280 break;
4281 }
4282 UChar32 c = UTEXT_NEXT32(fInputText);
4283 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
4284 if ((c == 0x0a) || // 0x0a is newline in both modes.
4285 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
4286 (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) {
4287 // char is a line ending. Exit the scanning loop.
4288 break;
4289 }
4290 }
4291 ix = UTEXT_GETNATIVEINDEX(fInputText);
4292 }
4293 }
4294
4295 // If there were no matching characters, skip over the loop altogether.
4296 // The loop doesn't run at all, a * op always succeeds.
4297 if (ix == fp->fInputIdx) {
4298 fp->fPatIdx++; // skip the URX_LOOP_C op.
4299 break;
4300 }
4301
4302 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4303 // must follow. It's operand is the stack location
4304 // that holds the starting input index for the match of this .*
4305 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4306 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4307 int32_t stackLoc = URX_VAL(loopcOp);
4308 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4309 fp->fExtra[stackLoc] = fp->fInputIdx;
4310 #ifdef REGEX_SMART_BACKTRACKING
4311 backSearchIndex = fp->fInputIdx;
4312 #endif
4313 fp->fInputIdx = ix;
4314
4315 // Save State to the URX_LOOP_C op that follows this one,
4316 // so that match failures in the following code will return to there.
4317 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4318 fp = StateSave(fp, fp->fPatIdx, status);
4319 fp->fPatIdx++;
4320 }
4321 break;
4322
4323
4324 case URX_LOOP_C:
4325 {
4326 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4327 backSearchIndex = fp->fExtra[opValue];
4328 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4329 if (backSearchIndex == fp->fInputIdx) {
4330 // We've backed up the input idx to the point that the loop started.
4331 // The loop is done. Leave here without saving state.
4332 // Subsequent failures won't come back here.
4333 break;
4334 }
4335 // Set up for the next iteration of the loop, with input index
4336 // backed up by one from the last time through,
4337 // and a state save to this instruction in case the following code fails again.
4338 // (We're going backwards because this loop emulates stack unwinding, not
4339 // the initial scan forward.)
4340 U_ASSERT(fp->fInputIdx > 0);
4341 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4342 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4343 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4344
4345 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4346 if (prevC == 0x0a &&
4347 fp->fInputIdx > backSearchIndex &&
4348 twoPrevC == 0x0d) {
4349 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4350 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4351 // .*, stepping back over CRLF pair.
4352 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4353 }
4354 }
4355
4356
4357 fp = StateSave(fp, fp->fPatIdx-1, status);
4358 }
4359 break;
4360
4361
4362
4363 default:
4364 // Trouble. The compiled pattern contains an entry with an
4365 // unrecognized type tag.
4366 U_ASSERT(FALSE);
4367 }
4368
4369 if (U_FAILURE(status)) {
4370 isMatch = FALSE;
4371 break;
4372 }
4373 }
4374
4375 breakFromLoop:
4376 fMatch = isMatch;
4377 if (isMatch) {
4378 fLastMatchEnd = fMatchEnd;
4379 fMatchStart = startIdx;
4380 fMatchEnd = fp->fInputIdx;
4381 if (fTraceDebug) {
4382 REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd));
4383 }
4384 }
4385 else
4386 {
4387 if (fTraceDebug) {
4388 REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
4389 }
4390 }
4391
4392 fFrame = fp; // The active stack frame when the engine stopped.
4393 // Contains the capture group results that we need to
4394 // access later.
4395 return;
4396 }
4397
4398
4399 //--------------------------------------------------------------------------------
4400 //
4401 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
4402 // assumption that the entire string is available in the UText's
4403 // chunk buffer. For now, that means we can use int32_t indexes,
4404 // except for anything that needs to be saved (like group starts
4405 // and ends).
4406 //
4407 // startIdx: begin matching a this index.
4408 // toEnd: if true, match must extend to end of the input region
4409 //
4410 //--------------------------------------------------------------------------------
4411 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4412 UBool isMatch = FALSE; // True if the we have a match.
4413
4414 int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4415
4416 int32_t op; // Operation from the compiled pattern, split into
4417 int32_t opType; // the opcode
4418 int32_t opValue; // and the operand value.
4419
4420 #ifdef REGEX_RUN_DEBUG
4421 if (fTraceDebug)
4422 {
4423 printf("MatchAt(startIdx=%ld)\n", startIdx);
4424 printf("Original Pattern: ");
4425 UChar32 c = utext_next32From(fPattern->fPattern, 0);
4426 while (c != U_SENTINEL) {
4427 if (c<32 || c>256) {
4428 c = '.';
4429 }
4430 REGEX_DUMP_DEBUG_PRINTF(("%c", c));
4431
4432 c = UTEXT_NEXT32(fPattern->fPattern);
4433 }
4434 printf("\n");
4435 printf("Input String: ");
4436 c = utext_next32From(fInputText, 0);
4437 while (c != U_SENTINEL) {
4438 if (c<32 || c>256) {
4439 c = '.';
4440 }
4441 printf("%c", c);
4442
4443 c = UTEXT_NEXT32(fInputText);
4444 }
4445 printf("\n");
4446 printf("\n");
4447 }
4448 #endif
4449
4450 if (U_FAILURE(status)) {
4451 return;
4452 }
4453
4454 // Cache frequently referenced items from the compiled pattern
4455 //
4456 int64_t *pat = fPattern->fCompiledPat->getBuffer();
4457
4458 const UChar *litText = fPattern->fLiteralText.getBuffer();
4459 UVector *sets = fPattern->fSets;
4460
4461 const UChar *inputBuf = fInputText->chunkContents;
4462
4463 fFrameSize = fPattern->fFrameSize;
4464 REStackFrame *fp = resetStack();
4465
4466 fp->fPatIdx = 0;
4467 fp->fInputIdx = startIdx;
4468
4469 // Zero out the pattern's static data
4470 int32_t i;
4471 for (i = 0; i<fPattern->fDataSize; i++) {
4472 fData[i] = 0;
4473 }
4474
4475 //
4476 // Main loop for interpreting the compiled pattern.
4477 // One iteration of the loop per pattern operation performed.
4478 //
4479 for (;;) {
4480 #if 0
4481 if (_heapchk() != _HEAPOK) {
4482 fprintf(stderr, "Heap Trouble\n");
4483 }
4484 #endif
4485
4486 op = (int32_t)pat[fp->fPatIdx];
4487 opType = URX_TYPE(op);
4488 opValue = URX_VAL(op);
4489 #ifdef REGEX_RUN_DEBUG
4490 if (fTraceDebug) {
4491 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4492 printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx,
4493 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4494 fPattern->dumpOp(fp->fPatIdx);
4495 }
4496 #endif
4497 fp->fPatIdx++;
4498
4499 switch (opType) {
4500
4501
4502 case URX_NOP:
4503 break;
4504
4505
4506 case URX_BACKTRACK:
4507 // Force a backtrack. In some circumstances, the pattern compiler
4508 // will notice that the pattern can't possibly match anything, and will
4509 // emit one of these at that point.
4510 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4511 break;
4512
4513
4514 case URX_ONECHAR:
4515 if (fp->fInputIdx < fActiveLimit) {
4516 UChar32 c;
4517 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4518 if (c == opValue) {
4519 break;
4520 }
4521 } else {
4522 fHitEnd = TRUE;
4523 }
4524
4525 #ifdef REGEX_SMART_BACKTRACKING
4526 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
4527 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
4528 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
4529 int64_t reverseIndex = fp->fInputIdx;
4530 UChar32 c;
4531 do {
4532 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
4533 if (c == opValue) {
4534 break;
4535 }
4536 } while (reverseIndex > backSearchIndex);
4537 if (c == opValue) {
4538 fHitEnd = FALSE;
4539 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4540 fp->fInputIdx = reverseIndex;
4541 if (fp->fInputIdx > backSearchIndex) {
4542 fp = StateSave(fp, fp->fPatIdx, status);
4543 }
4544 fp->fPatIdx++; // Skip the LOOP_C, we just did that
4545 break;
4546 }
4547 }
4548 }
4549 #endif
4550
4551 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4552 break;
4553
4554
4555 case URX_STRING:
4556 {
4557 // Test input against a literal string.
4558 // Strings require two slots in the compiled pattern, one for the
4559 // offset to the string text, and one for the length.
4560 int32_t stringStartIdx = opValue;
4561 int32_t stringLen;
4562
4563 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
4564 fp->fPatIdx++;
4565 opType = URX_TYPE(op);
4566 stringLen = URX_VAL(op);
4567 U_ASSERT(opType == URX_STRING_LEN);
4568 U_ASSERT(stringLen >= 2);
4569
4570 if (fp->fInputIdx + stringLen > fActiveLimit) {
4571 // No match. String is longer than the remaining input text.
4572 fHitEnd = TRUE; // TODO: See ticket 6074
4573 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4574 break;
4575 }
4576
4577 const UChar * pInp = inputBuf + fp->fInputIdx;
4578 const UChar * pPat = litText+stringStartIdx;
4579 const UChar * pEnd = pInp + stringLen;
4580 UBool success = FALSE;
4581 for(;;) {
4582 if (*pInp == *pPat) {
4583 pInp++;
4584 pPat++;
4585 if (pInp == pEnd) {
4586 // Successful Match.
4587 success = TRUE;
4588 break;
4589 }
4590 } else {
4591 // Match failed.
4592 break;
4593 }
4594 }
4595
4596 if (success) {
4597 fp->fInputIdx += stringLen;
4598 } else {
4599 #ifdef REGEX_SMART_BACKTRACKING
4600 if (fp->fInputIdx > backSearchIndex && fStack->size()) {
4601 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
4602 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
4603 // Reset to last start point
4604 int64_t reverseIndex = fp->fInputIdx;
4605 UChar32 c;
4606 pPat = litText+stringStartIdx;
4607
4608 // Search backwards for a possible start
4609 do {
4610 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
4611 if ((U_IS_BMP(c) && *pPat == c) ||
4612 (*pPat == U16_LEAD(c) && *(pPat+1) == U16_TRAIL(c))) {
4613 success = TRUE;
4614 break;
4615 }
4616 } while (reverseIndex > backSearchIndex);
4617
4618 // And try again
4619 if (success) {
4620 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4621 fp->fInputIdx = reverseIndex;
4622 if (fp->fInputIdx > backSearchIndex) {
4623 fp = StateSave(fp, fp->fPatIdx, status);
4624 }
4625 fp->fPatIdx++; // Skip the LOOP_C, we just did that
4626 break;
4627 }
4628 }
4629 }
4630 #endif
4631 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4632 }
4633 }
4634 break;
4635
4636
4637 case URX_STATE_SAVE:
4638 fp = StateSave(fp, opValue, status);
4639 break;
4640
4641
4642 case URX_END:
4643 // The match loop will exit via this path on a successful match,
4644 // when we reach the end of the pattern.
4645 if (toEnd && fp->fInputIdx != fActiveLimit) {
4646 // The pattern matched, but not to the end of input. Try some more.
4647 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4648 break;
4649 }
4650 isMatch = TRUE;
4651 goto breakFromLoop;
4652
4653 // Start and End Capture stack frame variables are laid out out like this:
4654 // fp->fExtra[opValue] - The start of a completed capture group
4655 // opValue+1 - The end of a completed capture group
4656 // opValue+2 - the start of a capture group whose end
4657 // has not yet been reached (and might not ever be).
4658 case URX_START_CAPTURE:
4659 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4660 fp->fExtra[opValue+2] = fp->fInputIdx;
4661 break;
4662
4663
4664 case URX_END_CAPTURE:
4665 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4666 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
4667 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
4668 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
4669 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4670 break;
4671
4672
4673 case URX_DOLLAR: // $, test for End of line
4674 // or for position before new line at end of input
4675 if (fp->fInputIdx < fAnchorLimit-2) {
4676 // We are no where near the end of input. Fail.
4677 // This is the common case. Keep it first.
4678 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4679 break;
4680 }
4681 if (fp->fInputIdx >= fAnchorLimit) {
4682 // We really are at the end of input. Success.
4683 fHitEnd = TRUE;
4684 fRequireEnd = TRUE;
4685 break;
4686 }
4687
4688 // If we are positioned just before a new-line that is located at the
4689 // end of input, succeed.
4690 if (fp->fInputIdx == fAnchorLimit-1) {
4691 UChar32 c;
4692 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4693
4694 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
4695 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4696 // At new-line at end of input. Success
4697 fHitEnd = TRUE;
4698 fRequireEnd = TRUE;
4699 break;
4700 }
4701 }
4702 } else if (fp->fInputIdx == fAnchorLimit-2 &&
4703 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4704 fHitEnd = TRUE;
4705 fRequireEnd = TRUE;
4706 break; // At CR/LF at end of input. Success
4707 }
4708
4709 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4710
4711 break;
4712
4713
4714 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
4715 if (fp->fInputIdx >= fAnchorLimit-1) {
4716 // Either at the last character of input, or off the end.
4717 if (fp->fInputIdx == fAnchorLimit-1) {
4718 // At last char of input. Success if it's a new line.
4719 if (inputBuf[fp->fInputIdx] == 0x0a) {
4720 fHitEnd = TRUE;
4721 fRequireEnd = TRUE;
4722 break;
4723 }
4724 } else {
4725 // Off the end of input. Success.
4726 fHitEnd = TRUE;
4727 fRequireEnd = TRUE;
4728 break;
4729 }
4730 }
4731
4732 // Not at end of input. Back-track out.
4733 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4734 break;
4735
4736
4737 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
4738 {
4739 if (fp->fInputIdx >= fAnchorLimit) {
4740 // We really are at the end of input. Success.
4741 fHitEnd = TRUE;
4742 fRequireEnd = TRUE;
4743 break;
4744 }
4745 // If we are positioned just before a new-line, succeed.
4746 // It makes no difference where the new-line is within the input.
4747 UChar32 c = inputBuf[fp->fInputIdx];
4748 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
4749 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4750 // In multi-line mode, hitting a new-line just before the end of input does not
4751 // set the hitEnd or requireEnd flags
4752 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4753 break;
4754 }
4755 }
4756 // not at a new line. Fail.
4757 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4758 }
4759 break;
4760
4761
4762 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
4763 {
4764 if (fp->fInputIdx >= fAnchorLimit) {
4765 // We really are at the end of input. Success.
4766 fHitEnd = TRUE;
4767 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
4768 break; // adding a new-line would not lose the match.
4769 }
4770 // If we are not positioned just before a new-line, the test fails; backtrack out.
4771 // It makes no difference where the new-line is within the input.
4772 if (inputBuf[fp->fInputIdx] != 0x0a) {
4773 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4774 }
4775 }
4776 break;
4777
4778
4779 case URX_CARET: // ^, test for start of line
4780 if (fp->fInputIdx != fAnchorStart) {
4781 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4782 }
4783 break;
4784
4785
4786 case URX_CARET_M: // ^, test for start of line in mulit-line mode
4787 {
4788 if (fp->fInputIdx == fAnchorStart) {
4789 // We are at the start input. Success.
4790 break;
4791 }
4792 // Check whether character just before the current pos is a new-line
4793 // unless we are at the end of input
4794 UChar c = inputBuf[fp->fInputIdx - 1];
4795 if ((fp->fInputIdx < fAnchorLimit) &&
4796 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
4797 // It's a new-line. ^ is true. Success.
4798 // TODO: what should be done with positions between a CR and LF?
4799 break;
4800 }
4801 // Not at the start of a line. Fail.
4802 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4803 }
4804 break;
4805
4806
4807 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
4808 {
4809 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4810 if (fp->fInputIdx <= fAnchorStart) {
4811 // We are at the start input. Success.
4812 break;
4813 }
4814 // Check whether character just before the current pos is a new-line
4815 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4816 UChar c = inputBuf[fp->fInputIdx - 1];
4817 if (c != 0x0a) {
4818 // Not at the start of a line. Back-track out.
4819 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4820 }
4821 }
4822 break;
4823
4824 case URX_BACKSLASH_B: // Test for word boundaries
4825 {
4826 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4827 success ^= (opValue != 0); // flip sense for \B
4828 if (!success) {
4829 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4830 }
4831 }
4832 break;
4833
4834
4835 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
4836 {
4837 UBool success = isUWordBoundary(fp->fInputIdx);
4838 success ^= (opValue != 0); // flip sense for \B
4839 if (!success) {
4840 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4841 }
4842 }
4843 break;
4844
4845
4846 case URX_BACKSLASH_D: // Test for decimal digit
4847 {
4848 if (fp->fInputIdx >= fActiveLimit) {
4849 fHitEnd = TRUE;
4850 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4851 break;
4852 }
4853
4854 UChar32 c;
4855 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4856 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
4857 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4858 success ^= (opValue != 0); // flip sense for \D
4859 if (!success) {
4860 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4861 }
4862 }
4863 break;
4864
4865
4866 case URX_BACKSLASH_G: // Test for position at end of previous match
4867 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
4868 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4869 }
4870 break;
4871
4872
4873 case URX_BACKSLASH_X:
4874 // Match a Grapheme, as defined by Unicode TR 29.
4875 // Differs slightly from Perl, which consumes combining marks independently
4876 // of context.
4877 {
4878
4879 // Fail if at end of input
4880 if (fp->fInputIdx >= fActiveLimit) {
4881 fHitEnd = TRUE;
4882 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4883 break;
4884 }
4885
4886 // Examine (and consume) the current char.
4887 // Dispatch into a little state machine, based on the char.
4888 UChar32 c;
4889 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4890 UnicodeSet **sets = fPattern->fStaticSets;
4891 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
4892 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
4893 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4894 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4895 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4896 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4897 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4898 goto GC_Extend;
4899
4900
4901
4902 GC_L:
4903 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4904 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4905 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4906 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4907 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4908 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4909 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4910 goto GC_Extend;
4911
4912 GC_V:
4913 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4914 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4915 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4916 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4917 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4918 goto GC_Extend;
4919
4920 GC_T:
4921 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4922 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4923 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4924 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4925 goto GC_Extend;
4926
4927 GC_Extend:
4928 // Combining characters are consumed here
4929 for (;;) {
4930 if (fp->fInputIdx >= fActiveLimit) {
4931 break;
4932 }
4933 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4934 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4935 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4936 break;
4937 }
4938 }
4939 goto GC_Done;
4940
4941 GC_Control:
4942 // Most control chars stand alone (don't combine with combining chars),
4943 // except for that CR/LF sequence is a single grapheme cluster.
4944 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
4945 fp->fInputIdx++;
4946 }
4947
4948 GC_Done:
4949 if (fp->fInputIdx >= fActiveLimit) {
4950 fHitEnd = TRUE;
4951 }
4952 break;
4953 }
4954
4955
4956
4957
4958 case URX_BACKSLASH_Z: // Test for end of Input
4959 if (fp->fInputIdx < fAnchorLimit) {
4960 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4961 } else {
4962 fHitEnd = TRUE;
4963 fRequireEnd = TRUE;
4964 }
4965 break;
4966
4967
4968
4969 case URX_STATIC_SETREF:
4970 {
4971 // Test input character against one of the predefined sets
4972 // (Word Characters, for example)
4973 // The high bit of the op value is a flag for the match polarity.
4974 // 0: success if input char is in set.
4975 // 1: success if input char is not in set.
4976 if (fp->fInputIdx >= fActiveLimit) {
4977 fHitEnd = TRUE;
4978 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4979 break;
4980 }
4981
4982 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
4983 opValue &= ~URX_NEG_SET;
4984 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4985
4986 UChar32 c;
4987 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4988 if (c < 256) {
4989 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4990 if (s8->contains(c)) {
4991 success = !success;
4992 }
4993 } else {
4994 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4995 if (s->contains(c)) {
4996 success = !success;
4997 }
4998 }
4999 if (!success) {
5000 #ifdef REGEX_SMART_BACKTRACKING
5001 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5002 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5003 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5004 // Try to find it, backwards
5005 int64_t reverseIndex = fp->fInputIdx;
5006 U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
5007 success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset
5008 do {
5009 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5010 if (c < 256) {
5011 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5012 if (s8->contains(c)) {
5013 success = !success;
5014 }
5015 } else {
5016 const UnicodeSet *s = fPattern->fStaticSets[opValue];
5017 if (s->contains(c)) {
5018 success = !success;
5019 }
5020 }
5021 } while (reverseIndex > backSearchIndex && !success);
5022
5023 if (success) {
5024 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5025 fp->fInputIdx = reverseIndex;
5026 if (fp->fInputIdx > backSearchIndex) {
5027 fp = StateSave(fp, fp->fPatIdx, status);
5028 }
5029 fp->fPatIdx++; // Skip the LOOP_C, we just did that
5030 break;
5031 }
5032 }
5033 }
5034 #endif
5035 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5036 }
5037 }
5038 break;
5039
5040
5041 case URX_STAT_SETREF_N:
5042 {
5043 // Test input character for NOT being a member of one of
5044 // the predefined sets (Word Characters, for example)
5045 if (fp->fInputIdx >= fActiveLimit) {
5046 fHitEnd = TRUE;
5047 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5048 break;
5049 }
5050
5051 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
5052
5053 UChar32 c;
5054 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5055 if (c < 256) {
5056 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5057 if (s8->contains(c) == FALSE) {
5058 break;
5059 }
5060 } else {
5061 const UnicodeSet *s = fPattern->fStaticSets[opValue];
5062 if (s->contains(c) == FALSE) {
5063 break;
5064 }
5065 }
5066
5067 #ifdef REGEX_SMART_BACKTRACKING
5068 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5069 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5070 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5071 // Try to find it, backwards
5072 int64_t reverseIndex = fp->fInputIdx;
5073 U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
5074 UBool success = FALSE;
5075 do {
5076 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5077 if (c < 256) {
5078 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5079 if (s8->contains(c) == FALSE) {
5080 success = TRUE;
5081 break;
5082 }
5083 } else {
5084 const UnicodeSet *s = fPattern->fStaticSets[opValue];
5085 if (s->contains(c) == FALSE) {
5086 success = TRUE;
5087 break;
5088 }
5089 }
5090 } while (reverseIndex > backSearchIndex);
5091
5092 if (success) {
5093 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5094 fp->fInputIdx = reverseIndex;
5095 if (fp->fInputIdx > backSearchIndex) {
5096 fp = StateSave(fp, fp->fPatIdx, status);
5097 }
5098 fp->fPatIdx++; // Skip the LOOP_C, we just did that
5099 break;
5100 }
5101 }
5102 }
5103 #endif
5104 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5105 }
5106 break;
5107
5108
5109 case URX_SETREF:
5110 {
5111 if (fp->fInputIdx >= fActiveLimit) {
5112 fHitEnd = TRUE;
5113 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5114 break;
5115 }
5116
5117 U_ASSERT(opValue > 0 && opValue < sets->size());
5118
5119 // There is input left. Pick up one char and test it for set membership.
5120 UChar32 c;
5121 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5122 if (c<256) {
5123 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5124 if (s8->contains(c)) {
5125 // The character is in the set. A Match.
5126 break;
5127 }
5128 } else {
5129 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
5130 if (s->contains(c)) {
5131 // The character is in the set. A Match.
5132 break;
5133 }
5134 }
5135
5136 // the character wasn't in the set.
5137 #ifdef REGEX_SMART_BACKTRACKING
5138 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5139 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5140 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5141 // Try to find it, backwards
5142 int64_t reverseIndex = fp->fInputIdx;
5143 U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
5144 UBool success = FALSE;
5145 do {
5146 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5147 if (c < 256) {
5148 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5149 if (s8->contains(c)) {
5150 success = TRUE;
5151 break;
5152 }
5153 } else {
5154 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
5155 if (s->contains(c)) {
5156 success = TRUE;
5157 break;
5158 }
5159 }
5160 } while (reverseIndex > backSearchIndex);
5161
5162 if (success) {
5163 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5164 fp->fInputIdx = reverseIndex;
5165 if (fp->fInputIdx > reverseIndex) {
5166 fp = StateSave(fp, fp->fPatIdx, status);
5167 }
5168 fp->fPatIdx++; // Skip the LOOP_C, we just did that
5169 break;
5170 }
5171 }
5172 }
5173 #endif
5174 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5175 }
5176 break;
5177
5178
5179 case URX_DOTANY:
5180 {
5181 // . matches anything, but stops at end-of-line.
5182 if (fp->fInputIdx >= fActiveLimit) {
5183 // At end of input. Match failed. Backtrack out.
5184 fHitEnd = TRUE;
5185 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5186 break;
5187 }
5188
5189 // There is input left. Advance over one char, unless we've hit end-of-line
5190 UChar32 c;
5191 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5192 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
5193 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
5194 // End of line in normal mode. . does not match.
5195 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5196 break;
5197 }
5198 }
5199 break;
5200
5201
5202 case URX_DOTANY_ALL:
5203 {
5204 // . in dot-matches-all (including new lines) mode
5205 if (fp->fInputIdx >= fActiveLimit) {
5206 // At end of input. Match failed. Backtrack out.
5207 fHitEnd = TRUE;
5208 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5209 break;
5210 }
5211
5212 // There is input left. Advance over one char, except if we are
5213 // at a cr/lf, advance over both of them.
5214 UChar32 c;
5215 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5216 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
5217 // In the case of a CR/LF, we need to advance over both.
5218 if (inputBuf[fp->fInputIdx] == 0x0a) {
5219 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
5220 }
5221 }
5222 }
5223 break;
5224
5225
5226 case URX_DOTANY_UNIX:
5227 {
5228 // '.' operator, matches all, but stops at end-of-line.
5229 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
5230 if (fp->fInputIdx >= fActiveLimit) {
5231 // At end of input. Match failed. Backtrack out.
5232 fHitEnd = TRUE;
5233 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5234 break;
5235 }
5236
5237 // There is input left. Advance over one char, unless we've hit end-of-line
5238 UChar32 c;
5239 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5240 if (c == 0x0a) {
5241 // End of line in normal mode. '.' does not match the \n
5242 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5243 }
5244 }
5245 break;
5246
5247
5248 case URX_JMP:
5249 fp->fPatIdx = opValue;
5250 break;
5251
5252 case URX_FAIL:
5253 isMatch = FALSE;
5254 goto breakFromLoop;
5255
5256 case URX_JMP_SAV:
5257 U_ASSERT(opValue < fPattern->fCompiledPat->size());
5258 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
5259 fp->fPatIdx = opValue; // Then JMP.
5260 break;
5261
5262 case URX_JMP_SAV_X:
5263 // This opcode is used with (x)+, when x can match a zero length string.
5264 // Same as JMP_SAV, except conditional on the match having made forward progress.
5265 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
5266 // data address of the input position at the start of the loop.
5267 {
5268 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
5269 int32_t stoOp = (int32_t)pat[opValue-1];
5270 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
5271 int32_t frameLoc = URX_VAL(stoOp);
5272 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
5273 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
5274 U_ASSERT(prevInputIdx <= fp->fInputIdx);
5275 if (prevInputIdx < fp->fInputIdx) {
5276 // The match did make progress. Repeat the loop.
5277 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
5278 fp->fPatIdx = opValue;
5279 fp->fExtra[frameLoc] = fp->fInputIdx;
5280 }
5281 // If the input position did not advance, we do nothing here,
5282 // execution will fall out of the loop.
5283 }
5284 break;
5285
5286 case URX_CTR_INIT:
5287 {
5288 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5289 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5290
5291 // Pick up the three extra operands that CTR_INIT has, and
5292 // skip the pattern location counter past
5293 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5294 fp->fPatIdx += 3;
5295 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
5296 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5297 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5298 U_ASSERT(minCount>=0);
5299 U_ASSERT(maxCount>=minCount || maxCount==-1);
5300 U_ASSERT(loopLoc>fp->fPatIdx);
5301
5302 if (minCount == 0) {
5303 fp = StateSave(fp, loopLoc+1, status);
5304 }
5305 if (maxCount == 0) {
5306 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5307 }
5308 }
5309 break;
5310
5311 case URX_CTR_LOOP:
5312 {
5313 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5314 int32_t initOp = (int32_t)pat[opValue];
5315 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
5316 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5317 int32_t minCount = (int32_t)pat[opValue+2];
5318 int32_t maxCount = (int32_t)pat[opValue+3];
5319 // Increment the counter. Note: we DIDN'T worry about counter
5320 // overflow, since the data comes from UnicodeStrings, which
5321 // stores its length in an int32_t. Do we have to think about
5322 // this now that we're using UText? Probably not, since the length
5323 // in UChar32s is still an int32_t.
5324 (*pCounter)++;
5325 U_ASSERT(*pCounter > 0);
5326 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
5327 U_ASSERT(*pCounter == maxCount || maxCount == -1);
5328 break;
5329 }
5330 if (*pCounter >= minCount) {
5331 fp = StateSave(fp, fp->fPatIdx, status);
5332 }
5333 fp->fPatIdx = opValue + 4; // Loop back.
5334 }
5335 break;
5336
5337 case URX_CTR_INIT_NG:
5338 {
5339 // Initialize a non-greedy loop
5340 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5341 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5342
5343 // Pick up the three extra operands that CTR_INIT has, and
5344 // skip the pattern location counter past
5345 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5346 fp->fPatIdx += 3;
5347 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
5348 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5349 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5350 U_ASSERT(minCount>=0);
5351 U_ASSERT(maxCount>=minCount || maxCount==-1);
5352 U_ASSERT(loopLoc>fp->fPatIdx);
5353
5354 if (minCount == 0) {
5355 if (maxCount != 0) {
5356 fp = StateSave(fp, fp->fPatIdx, status);
5357 }
5358 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
5359 }
5360 }
5361 break;
5362
5363 case URX_CTR_LOOP_NG:
5364 {
5365 // Non-greedy {min, max} loops
5366 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5367 int32_t initOp = (int32_t)pat[opValue];
5368 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5369 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5370 int32_t minCount = (int32_t)pat[opValue+2];
5371 int32_t maxCount = (int32_t)pat[opValue+3];
5372 // Increment the counter. Note: we DIDN'T worry about counter
5373 // overflow, since the data comes from UnicodeStrings, which
5374 // stores its length in an int32_t. Do we have to think about
5375 // this now that we're using UText? Probably not, since the length
5376 // in UChar32s is still an int32_t.
5377 (*pCounter)++;
5378 U_ASSERT(*pCounter > 0);
5379
5380 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
5381 // The loop has matched the maximum permitted number of times.
5382 // Break out of here with no action. Matching will
5383 // continue with the following pattern.
5384 U_ASSERT(*pCounter == maxCount || maxCount == -1);
5385 break;
5386 }
5387
5388 if (*pCounter < minCount) {
5389 // We haven't met the minimum number of matches yet.
5390 // Loop back for another one.
5391 fp->fPatIdx = opValue + 4; // Loop back.
5392 } else {
5393 // We do have the minimum number of matches.
5394 // Fall into the following pattern, but first do
5395 // a state save to the top of the loop, so that a failure
5396 // in the following pattern will try another iteration of the loop.
5397 fp = StateSave(fp, opValue + 4, status);
5398 }
5399 }
5400 break;
5401
5402 case URX_STO_SP:
5403 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5404 fData[opValue] = fStack->size();
5405 break;
5406
5407 case URX_LD_SP:
5408 {
5409 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5410 int32_t newStackSize = (int32_t)fData[opValue];
5411 U_ASSERT(newStackSize <= fStack->size());
5412 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5413 if (newFP == (int64_t *)fp) {
5414 break;
5415 }
5416 int32_t i;
5417 for (i=0; i<fFrameSize; i++) {
5418 newFP[i] = ((int64_t *)fp)[i];
5419 }
5420 fp = (REStackFrame *)newFP;
5421 fStack->setSize(newStackSize);
5422 }
5423 break;
5424
5425 case URX_BACKREF:
5426 case URX_BACKREF_I:
5427 {
5428 U_ASSERT(opValue < fFrameSize);
5429 int64_t groupStartIdx = fp->fExtra[opValue];
5430 int64_t groupEndIdx = fp->fExtra[opValue+1];
5431 U_ASSERT(groupStartIdx <= groupEndIdx);
5432 int64_t len = groupEndIdx-groupStartIdx;
5433 if (groupStartIdx < 0) {
5434 // This capture group has not participated in the match thus far,
5435 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5436 }
5437
5438 if (len == 0) {
5439 // The capture group match was of an empty string.
5440 // Verified by testing: Perl matches succeed in this case, so
5441 // we do too.
5442 break;
5443 }
5444
5445 UBool haveMatch = FALSE;
5446 if (fp->fInputIdx + len <= fActiveLimit) {
5447 if (opType == URX_BACKREF) {
5448 if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, (int32_t)len) == 0) {
5449 haveMatch = TRUE;
5450 }
5451 } else {
5452 if (u_strncasecmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx,
5453 (int32_t)len, U_FOLD_CASE_DEFAULT) == 0) {
5454 haveMatch = TRUE;
5455 }
5456 }
5457 } else {
5458 // TODO: probably need to do a partial string comparison, and only
5459 // set HitEnd if the available input matched. Ticket #6074
5460 fHitEnd = TRUE;
5461 }
5462 if (haveMatch) {
5463 fp->fInputIdx += len; // Match. Advance current input position.
5464 } else {
5465 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5466 }
5467 }
5468 break;
5469
5470 case URX_STO_INP_LOC:
5471 {
5472 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5473 fp->fExtra[opValue] = fp->fInputIdx;
5474 }
5475 break;
5476
5477 case URX_JMPX:
5478 {
5479 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5480 fp->fPatIdx += 1;
5481 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
5482 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5483 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5484 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5485 if (savedInputIdx < fp->fInputIdx) {
5486 fp->fPatIdx = opValue; // JMP
5487 } else {
5488 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
5489 }
5490 }
5491 break;
5492
5493 case URX_LA_START:
5494 {
5495 // Entering a lookahead block.
5496 // Save Stack Ptr, Input Pos.
5497 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5498 fData[opValue] = fStack->size();
5499 fData[opValue+1] = fp->fInputIdx;
5500 fActiveStart = fLookStart; // Set the match region change for
5501 fActiveLimit = fLookLimit; // transparent bounds.
5502 }
5503 break;
5504
5505 case URX_LA_END:
5506 {
5507 // Leaving a look-ahead block.
5508 // restore Stack Ptr, Input Pos to positions they had on entry to block.
5509 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5510 int32_t stackSize = fStack->size();
5511 int32_t newStackSize = (int32_t)fData[opValue];
5512 U_ASSERT(stackSize >= newStackSize);
5513 if (stackSize > newStackSize) {
5514 // Copy the current top frame back to the new (cut back) top frame.
5515 // This makes the capture groups from within the look-ahead
5516 // expression available.
5517 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5518 int32_t i;
5519 for (i=0; i<fFrameSize; i++) {
5520 newFP[i] = ((int64_t *)fp)[i];
5521 }
5522 fp = (REStackFrame *)newFP;
5523 fStack->setSize(newStackSize);
5524 }
5525 fp->fInputIdx = fData[opValue+1];
5526
5527 // Restore the active region bounds in the input string; they may have
5528 // been changed because of transparent bounds on a Region.
5529 fActiveStart = fRegionStart;
5530 fActiveLimit = fRegionLimit;
5531 }
5532 break;
5533
5534 case URX_ONECHAR_I:
5535 if (fp->fInputIdx < fActiveLimit) {
5536 UChar32 c;
5537 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5538 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5539 break;
5540 }
5541 } else {
5542 fHitEnd = TRUE;
5543 }
5544
5545 #ifdef REGEX_SMART_BACKTRACKING
5546 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5547 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5548 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5549 UBool success = FALSE;
5550 int64_t reverseIndex = fp->fInputIdx;
5551 UChar32 c;
5552 while (reverseIndex > backSearchIndex) {
5553 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5554 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5555 success = TRUE;
5556 break;
5557 } else if (c == U_SENTINEL) {
5558 break;
5559 }
5560 }
5561 if (success) {
5562 fHitEnd = FALSE;
5563 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5564 fp->fInputIdx = reverseIndex;
5565 if (fp->fInputIdx > backSearchIndex) {
5566 fp = StateSave(fp, fp->fPatIdx, status);
5567 }
5568 fp->fPatIdx++; // Skip the LOOP_C, we just did that
5569 break;
5570 }
5571 }
5572 }
5573 #endif
5574
5575 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5576 break;
5577
5578 case URX_STRING_I:
5579 {
5580 // Test input against a literal string.
5581 // Strings require two slots in the compiled pattern, one for the
5582 // offset to the string text, and one for the length.
5583 const UCaseProps *csp = ucase_getSingleton();
5584 {
5585 int32_t stringStartIdx, stringLen;
5586 stringStartIdx = opValue;
5587
5588 op = (int32_t)pat[fp->fPatIdx];
5589 fp->fPatIdx++;
5590 opType = URX_TYPE(op);
5591 opValue = URX_VAL(op);
5592 U_ASSERT(opType == URX_STRING_LEN);
5593 stringLen = opValue;
5594
5595 const UChar *patternChars = litText+stringStartIdx;
5596 const UChar *patternEnd = patternChars+stringLen;
5597
5598 const UChar *foldChars = NULL;
5599 int32_t foldOffset, foldLength;
5600 UChar32 c;
5601
5602 #ifdef REGEX_SMART_BACKTRACKING
5603 int32_t originalInputIdx = fp->fInputIdx;
5604 #endif
5605 UBool success = TRUE;
5606
5607 foldOffset = foldLength = 0;
5608
5609 while (patternChars < patternEnd && success) {
5610 if(foldOffset < foldLength) {
5611 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
5612 } else {
5613 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5614 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
5615 if(foldLength >= 0) {
5616 if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings
5617 foldOffset = 0;
5618 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
5619 } else {
5620 c = foldLength;
5621 foldLength = foldOffset; // to avoid reading chars from the folding buffer
5622 }
5623 }
5624 }
5625
5626 if (fp->fInputIdx <= fActiveLimit) {
5627 if (U_IS_BMP(c)) {
5628 success = (*patternChars == c);
5629 patternChars += 1;
5630 } else if (patternChars+1 < patternEnd) {
5631 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
5632 patternChars += 2;
5633 }
5634 } else {
5635 success = FALSE;
5636 fHitEnd = TRUE; // TODO: See ticket 6074
5637 }
5638 }
5639
5640 if (!success) {
5641 #ifdef REGEX_SMART_BACKTRACKING
5642 if (fp->fInputIdx > backSearchIndex && fStack->size()) {
5643 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5644 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5645 // Reset to last start point
5646 int64_t reverseIndex = originalInputIdx;
5647 patternChars = litText+stringStartIdx;
5648
5649 // Search backwards for a possible start
5650 do {
5651 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5652 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
5653 if(foldLength >= 0) {
5654 if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings
5655 foldOffset = 0;
5656 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
5657 } else {
5658 c = foldLength;
5659 foldLength = foldOffset; // to avoid reading chars from the folding buffer
5660 }
5661 }
5662
5663 if ((U_IS_BMP(c) && *patternChars == c) ||
5664 (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
5665 success = TRUE;
5666 break;
5667 }
5668 } while (reverseIndex > backSearchIndex);
5669
5670 // And try again
5671 if (success) {
5672 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5673 fp->fInputIdx = reverseIndex;
5674 if (fp->fInputIdx > backSearchIndex) {
5675 fp = StateSave(fp, fp->fPatIdx, status);
5676 }
5677 fp->fPatIdx++; // Skip the LOOP_C, we just did that
5678 break;
5679 }
5680 }
5681 }
5682 #endif
5683 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5684 }
5685 }
5686 }
5687 break;
5688
5689 case URX_LB_START:
5690 {
5691 // Entering a look-behind block.
5692 // Save Stack Ptr, Input Pos.
5693 // TODO: implement transparent bounds. Ticket #6067
5694 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5695 fData[opValue] = fStack->size();
5696 fData[opValue+1] = fp->fInputIdx;
5697 // Init the variable containing the start index for attempted matches.
5698 fData[opValue+2] = -1;
5699 // Save input string length, then reset to pin any matches to end at
5700 // the current position.
5701 fData[opValue+3] = fActiveLimit;
5702 fActiveLimit = fp->fInputIdx;
5703 }
5704 break;
5705
5706
5707 case URX_LB_CONT:
5708 {
5709 // Positive Look-Behind, at top of loop checking for matches of LB expression
5710 // at all possible input starting positions.
5711
5712 // Fetch the min and max possible match lengths. They are the operands
5713 // of this op in the pattern.
5714 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5715 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5716 U_ASSERT(minML <= maxML);
5717 U_ASSERT(minML >= 0);
5718
5719 // Fetch (from data) the last input index where a match was attempted.
5720 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5721 int64_t *lbStartIdx = &fData[opValue+2];
5722 if (*lbStartIdx < 0) {
5723 // First time through loop.
5724 *lbStartIdx = fp->fInputIdx - minML;
5725 } else {
5726 // 2nd through nth time through the loop.
5727 // Back up start position for match by one.
5728 if (*lbStartIdx == 0) {
5729 (*lbStartIdx)--;
5730 } else {
5731 U16_BACK_1(inputBuf, 0, *lbStartIdx);
5732 }
5733 }
5734
5735 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5736 // We have tried all potential match starting points without
5737 // getting a match. Backtrack out, and out of the
5738 // Look Behind altogether.
5739 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5740 int64_t restoreInputLen = fData[opValue+3];
5741 U_ASSERT(restoreInputLen >= fActiveLimit);
5742 U_ASSERT(restoreInputLen <= fInputLength);
5743 fActiveLimit = restoreInputLen;
5744 break;
5745 }
5746
5747 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5748 // (successful match will fall off the end of the loop.)
5749 fp = StateSave(fp, fp->fPatIdx-3, status);
5750 fp->fInputIdx = *lbStartIdx;
5751 }
5752 break;
5753
5754 case URX_LB_END:
5755 // End of a look-behind block, after a successful match.
5756 {
5757 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5758 if (fp->fInputIdx != fActiveLimit) {
5759 // The look-behind expression matched, but the match did not
5760 // extend all the way to the point that we are looking behind from.
5761 // FAIL out of here, which will take us back to the LB_CONT, which
5762 // will retry the match starting at another position or fail
5763 // the look-behind altogether, whichever is appropriate.
5764 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5765 break;
5766 }
5767
5768 // Look-behind match is good. Restore the orignal input string length,
5769 // which had been truncated to pin the end of the lookbehind match to the
5770 // position being looked-behind.
5771 int64_t originalInputLen = fData[opValue+3];
5772 U_ASSERT(originalInputLen >= fActiveLimit);
5773 U_ASSERT(originalInputLen <= fInputLength);
5774 fActiveLimit = originalInputLen;
5775 }
5776 break;
5777
5778
5779 case URX_LBN_CONT:
5780 {
5781 // Negative Look-Behind, at top of loop checking for matches of LB expression
5782 // at all possible input starting positions.
5783
5784 // Fetch the extra parameters of this op.
5785 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5786 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5787 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5788 continueLoc = URX_VAL(continueLoc);
5789 U_ASSERT(minML <= maxML);
5790 U_ASSERT(minML >= 0);
5791 U_ASSERT(continueLoc > fp->fPatIdx);
5792
5793 // Fetch (from data) the last input index where a match was attempted.
5794 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5795 int64_t *lbStartIdx = &fData[opValue+2];
5796 if (*lbStartIdx < 0) {
5797 // First time through loop.
5798 *lbStartIdx = fp->fInputIdx - minML;
5799 } else {
5800 // 2nd through nth time through the loop.
5801 // Back up start position for match by one.
5802 if (*lbStartIdx == 0) {
5803 (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0.
5804 } else {
5805 U16_BACK_1(inputBuf, 0, *lbStartIdx);
5806 }
5807 }
5808
5809 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5810 // We have tried all potential match starting points without
5811 // getting a match, which means that the negative lookbehind as
5812 // a whole has succeeded. Jump forward to the continue location
5813 int64_t restoreInputLen = fData[opValue+3];
5814 U_ASSERT(restoreInputLen >= fActiveLimit);
5815 U_ASSERT(restoreInputLen <= fInputLength);
5816 fActiveLimit = restoreInputLen;
5817 fp->fPatIdx = continueLoc;
5818 break;
5819 }
5820
5821 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5822 // (successful match will cause a FAIL out of the loop altogether.)
5823 fp = StateSave(fp, fp->fPatIdx-4, status);
5824 fp->fInputIdx = *lbStartIdx;
5825 }
5826 break;
5827
5828 case URX_LBN_END:
5829 // End of a negative look-behind block, after a successful match.
5830 {
5831 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5832 if (fp->fInputIdx != fActiveLimit) {
5833 // The look-behind expression matched, but the match did not
5834 // extend all the way to the point that we are looking behind from.
5835 // FAIL out of here, which will take us back to the LB_CONT, which
5836 // will retry the match starting at another position or succeed
5837 // the look-behind altogether, whichever is appropriate.
5838 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5839 break;
5840 }
5841
5842 // Look-behind expression matched, which means look-behind test as
5843 // a whole Fails
5844
5845 // Restore the orignal input string length, which had been truncated
5846 // inorder to pin the end of the lookbehind match
5847 // to the position being looked-behind.
5848 int64_t originalInputLen = fData[opValue+3];
5849 U_ASSERT(originalInputLen >= fActiveLimit);
5850 U_ASSERT(originalInputLen <= fInputLength);
5851 fActiveLimit = originalInputLen;
5852
5853 // Restore original stack position, discarding any state saved
5854 // by the successful pattern match.
5855 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5856 int32_t newStackSize = (int32_t)fData[opValue];
5857 U_ASSERT(fStack->size() > newStackSize);
5858 fStack->setSize(newStackSize);
5859
5860 // FAIL, which will take control back to someplace
5861 // prior to entering the look-behind test.
5862 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5863 }
5864 break;
5865
5866
5867 case URX_LOOP_SR_I:
5868 // Loop Initialization for the optimized implementation of
5869 // [some character set]*
5870 // This op scans through all matching input.
5871 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5872 {
5873 U_ASSERT(opValue > 0 && opValue < sets->size());
5874 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5875 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
5876
5877 // Loop through input, until either the input is exhausted or
5878 // we reach a character that is not a member of the set.
5879 int32_t ix = (int32_t)fp->fInputIdx;
5880 for (;;) {
5881 if (ix >= fActiveLimit) {
5882 fHitEnd = TRUE;
5883 break;
5884 }
5885 UChar32 c;
5886 U16_NEXT(inputBuf, ix, fActiveLimit, c);
5887 if (c<256) {
5888 if (s8->contains(c) == FALSE) {
5889 U16_BACK_1(inputBuf, 0, ix);
5890 break;
5891 }
5892 } else {
5893 if (s->contains(c) == FALSE) {
5894 U16_BACK_1(inputBuf, 0, ix);
5895 break;
5896 }
5897 }
5898 }
5899
5900 // If there were no matching characters, skip over the loop altogether.
5901 // The loop doesn't run at all, a * op always succeeds.
5902 if (ix == fp->fInputIdx) {
5903 fp->fPatIdx++; // skip the URX_LOOP_C op.
5904 break;
5905 }
5906
5907 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5908 // must follow. It's operand is the stack location
5909 // that holds the starting input index for the match of this [set]*
5910 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5911 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5912 int32_t stackLoc = URX_VAL(loopcOp);
5913 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5914 fp->fExtra[stackLoc] = fp->fInputIdx;
5915 #ifdef REGEX_SMART_BACKTRACKING
5916 backSearchIndex = fp->fInputIdx;
5917 #endif
5918 fp->fInputIdx = ix;
5919
5920 // Save State to the URX_LOOP_C op that follows this one,
5921 // so that match failures in the following code will return to there.
5922 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5923 fp = StateSave(fp, fp->fPatIdx, status);
5924 fp->fPatIdx++;
5925 }
5926 break;
5927
5928
5929 case URX_LOOP_DOT_I:
5930 // Loop Initialization for the optimized implementation of .*
5931 // This op scans through all remaining input.
5932 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5933 {
5934 // Loop through input until the input is exhausted (we reach an end-of-line)
5935 // In DOTALL mode, we can just go straight to the end of the input.
5936 int32_t ix;
5937 if ((opValue & 1) == 1) {
5938 // Dot-matches-All mode. Jump straight to the end of the string.
5939 ix = (int32_t)fActiveLimit;
5940 fHitEnd = TRUE;
5941 } else {
5942 // NOT DOT ALL mode. Line endings do not match '.'
5943 // Scan forward until a line ending or end of input.
5944 ix = (int32_t)fp->fInputIdx;
5945 for (;;) {
5946 if (ix >= fActiveLimit) {
5947 fHitEnd = TRUE;
5948 break;
5949 }
5950 UChar32 c;
5951 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++]
5952 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
5953 if ((c == 0x0a) || // 0x0a is newline in both modes.
5954 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
5955 ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
5956 // char is a line ending. Put the input pos back to the
5957 // line ending char, and exit the scanning loop.
5958 U16_BACK_1(inputBuf, 0, ix);
5959 break;
5960 }
5961 }
5962 }
5963 }
5964
5965 // If there were no matching characters, skip over the loop altogether.
5966 // The loop doesn't run at all, a * op always succeeds.
5967 if (ix == fp->fInputIdx) {
5968 fp->fPatIdx++; // skip the URX_LOOP_C op.
5969 break;
5970 }
5971
5972 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5973 // must follow. It's operand is the stack location
5974 // that holds the starting input index for the match of this .*
5975 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5976 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5977 int32_t stackLoc = URX_VAL(loopcOp);
5978 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5979 fp->fExtra[stackLoc] = fp->fInputIdx;
5980 #ifdef REGEX_SMART_BACKTRACKING
5981 backSearchIndex = fp->fInputIdx;
5982 #endif
5983 fp->fInputIdx = ix;
5984
5985 // Save State to the URX_LOOP_C op that follows this one,
5986 // so that match failures in the following code will return to there.
5987 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5988 fp = StateSave(fp, fp->fPatIdx, status);
5989 fp->fPatIdx++;
5990 }
5991 break;
5992
5993
5994 case URX_LOOP_C:
5995 {
5996 U_ASSERT(opValue>=0 && opValue<fFrameSize);
5997 backSearchIndex = (int32_t)fp->fExtra[opValue];
5998 U_ASSERT(backSearchIndex <= fp->fInputIdx);
5999 if (backSearchIndex == fp->fInputIdx) {
6000 // We've backed up the input idx to the point that the loop started.
6001 // The loop is done. Leave here without saving state.
6002 // Subsequent failures won't come back here.
6003 break;
6004 }
6005 // Set up for the next iteration of the loop, with input index
6006 // backed up by one from the last time through,
6007 // and a state save to this instruction in case the following code fails again.
6008 // (We're going backwards because this loop emulates stack unwinding, not
6009 // the initial scan forward.)
6010 U_ASSERT(fp->fInputIdx > 0);
6011 UChar32 prevC;
6012 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
6013
6014 if (prevC == 0x0a &&
6015 fp->fInputIdx > backSearchIndex &&
6016 inputBuf[fp->fInputIdx-1] == 0x0d) {
6017 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
6018 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
6019 // .*, stepping back over CRLF pair.
6020 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
6021 }
6022 }
6023
6024
6025 fp = StateSave(fp, fp->fPatIdx-1, status);
6026 }
6027 break;
6028
6029
6030
6031 default:
6032 // Trouble. The compiled pattern contains an entry with an
6033 // unrecognized type tag.
6034 U_ASSERT(FALSE);
6035 }
6036
6037 if (U_FAILURE(status)) {
6038 isMatch = FALSE;
6039 break;
6040 }
6041 }
6042
6043 breakFromLoop:
6044 fMatch = isMatch;
6045 if (isMatch) {
6046 fLastMatchEnd = fMatchEnd;
6047 fMatchStart = startIdx;
6048 fMatchEnd = fp->fInputIdx;
6049 if (fTraceDebug) {
6050 REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd));
6051 }
6052 }
6053 else
6054 {
6055 if (fTraceDebug) {
6056 REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
6057 }
6058 }
6059
6060 fFrame = fp; // The active stack frame when the engine stopped.
6061 // Contains the capture group results that we need to
6062 // access later.
6063
6064 return;
6065 }
6066
6067
6068 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
6069
6070 U_NAMESPACE_END
6071
6072 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
6073