]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/rematch.cpp
ICU-491.11.1.tar.gz
[apple/icu.git] / icuSources / i18n / rematch.cpp
1 /*
2 **************************************************************************
3 * Copyright (C) 2002-2012 International Business Machines Corporation *
4 * and others. All rights reserved. *
5 **************************************************************************
6 */
7 //
8 // file: rematch.cpp
9 //
10 // Contains the implementation of class RegexMatcher,
11 // which is one of the main API classes for the ICU regular expression package.
12 //
13
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
16
17 #include "unicode/regex.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ustring.h"
21 #include "unicode/rbbi.h"
22 #include "unicode/utf.h"
23 #include "unicode/utf16.h"
24 #include "uassert.h"
25 #include "cmemory.h"
26 #include "uvector.h"
27 #include "uvectr32.h"
28 #include "uvectr64.h"
29 #include "regeximp.h"
30 #include "regexst.h"
31 #include "regextxt.h"
32 #include "ucase.h"
33
34 // #include <malloc.h> // Needed for heapcheck testing
35
36
37 // Find progress callback
38 // ----------------------
39 // Macro to inline test & call to ReportFindProgress(). Eliminates unnecessary function call.
40 //
41 #define REGEXFINDPROGRESS_INTERRUPT(pos, status) \
42 (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE)
43
44
45 // Smart Backtracking
46 // ------------------
47 // When a failure would go back to a LOOP_C instruction,
48 // strings, characters, and setrefs scan backwards for a valid start
49 // character themselves, pop the stack, and save state, emulating the
50 // LOOP_C's effect but assured that the next character of input is a
51 // possible matching character.
52 //
53 // Good idea in theory; unfortunately it only helps out a few specific
54 // cases and slows the engine down a little in the rest.
55
56 U_NAMESPACE_BEGIN
57
58 // Default limit for the size of the back track stack, to avoid system
59 // failures causedby heap exhaustion. Units are in 32 bit words, not bytes.
60 // This value puts ICU's limits higher than most other regexp implementations,
61 // which use recursion rather than the heap, and take more storage per
62 // backtrack point.
63 //
64 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
65
66 // Time limit counter constant.
67 // Time limits for expression evaluation are in terms of quanta of work by
68 // the engine, each of which is 10,000 state saves.
69 // This constant determines that state saves per tick number.
70 static const int32_t TIMER_INITIAL_VALUE = 10000;
71
72 //-----------------------------------------------------------------------------
73 //
74 // Constructor and Destructor
75 //
76 //-----------------------------------------------------------------------------
77 RegexMatcher::RegexMatcher(const RegexPattern *pat) {
78 fDeferredStatus = U_ZERO_ERROR;
79 init(fDeferredStatus);
80 if (U_FAILURE(fDeferredStatus)) {
81 return;
82 }
83 if (pat==NULL) {
84 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
85 return;
86 }
87 fPattern = pat;
88 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
89 }
90
91
92
93 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
94 uint32_t flags, UErrorCode &status) {
95 init(status);
96 if (U_FAILURE(status)) {
97 return;
98 }
99 UParseError pe;
100 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
101 fPattern = fPatternOwned;
102
103 UText inputText = UTEXT_INITIALIZER;
104 utext_openConstUnicodeString(&inputText, &input, &status);
105 init2(&inputText, status);
106 utext_close(&inputText);
107
108 fInputUniStrMaybeMutable = TRUE;
109 }
110
111
112 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
113 uint32_t flags, UErrorCode &status) {
114 init(status);
115 if (U_FAILURE(status)) {
116 return;
117 }
118 UParseError pe;
119 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
120 if (U_FAILURE(status)) {
121 return;
122 }
123
124 fPattern = fPatternOwned;
125 init2(input, status);
126 }
127
128
129 RegexMatcher::RegexMatcher(const UnicodeString &regexp,
130 uint32_t flags, UErrorCode &status) {
131 init(status);
132 if (U_FAILURE(status)) {
133 return;
134 }
135 UParseError pe;
136 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
137 if (U_FAILURE(status)) {
138 return;
139 }
140 fPattern = fPatternOwned;
141 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
142 }
143
144 RegexMatcher::RegexMatcher(UText *regexp,
145 uint32_t flags, UErrorCode &status) {
146 init(status);
147 if (U_FAILURE(status)) {
148 return;
149 }
150 UParseError pe;
151 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
152 if (U_FAILURE(status)) {
153 return;
154 }
155
156 fPattern = fPatternOwned;
157 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
158 }
159
160
161
162
163 RegexMatcher::~RegexMatcher() {
164 delete fStack;
165 if (fData != fSmallData) {
166 uprv_free(fData);
167 fData = NULL;
168 }
169 if (fPatternOwned) {
170 delete fPatternOwned;
171 fPatternOwned = NULL;
172 fPattern = NULL;
173 }
174
175 if (fInput) {
176 delete fInput;
177 }
178 if (fInputText) {
179 utext_close(fInputText);
180 }
181 if (fAltInputText) {
182 utext_close(fAltInputText);
183 }
184
185 #if UCONFIG_NO_BREAK_ITERATION==0
186 delete fWordBreakItr;
187 #endif
188 }
189
190 //
191 // init() common initialization for use by all constructors.
192 // Initialize all fields, get the object into a consistent state.
193 // This must be done even when the initial status shows an error,
194 // so that the object is initialized sufficiently well for the destructor
195 // to run safely.
196 //
197 void RegexMatcher::init(UErrorCode &status) {
198 fPattern = NULL;
199 fPatternOwned = NULL;
200 fFrameSize = 0;
201 fRegionStart = 0;
202 fRegionLimit = 0;
203 fAnchorStart = 0;
204 fAnchorLimit = 0;
205 fLookStart = 0;
206 fLookLimit = 0;
207 fActiveStart = 0;
208 fActiveLimit = 0;
209 fTransparentBounds = FALSE;
210 fAnchoringBounds = TRUE;
211 fMatch = FALSE;
212 fMatchStart = 0;
213 fMatchEnd = 0;
214 fLastMatchEnd = -1;
215 fAppendPosition = 0;
216 fHitEnd = FALSE;
217 fRequireEnd = FALSE;
218 fStack = NULL;
219 fFrame = NULL;
220 fTimeLimit = 0;
221 fTime = 0;
222 fTickCounter = 0;
223 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY;
224 fCallbackFn = NULL;
225 fCallbackContext = NULL;
226 fFindProgressCallbackFn = NULL;
227 fFindProgressCallbackContext = NULL;
228 fTraceDebug = FALSE;
229 fDeferredStatus = status;
230 fData = fSmallData;
231 fWordBreakItr = NULL;
232
233 fStack = NULL;
234 fInputText = NULL;
235 fAltInputText = NULL;
236 fInput = NULL;
237 fInputLength = 0;
238 fInputUniStrMaybeMutable = FALSE;
239
240 if (U_FAILURE(status)) {
241 fDeferredStatus = status;
242 }
243 }
244
245 //
246 // init2() Common initialization for use by RegexMatcher constructors, part 2.
247 // This handles the common setup to be done after the Pattern is available.
248 //
249 void RegexMatcher::init2(UText *input, UErrorCode &status) {
250 if (U_FAILURE(status)) {
251 fDeferredStatus = status;
252 return;
253 }
254
255 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
256 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
257 if (fData == NULL) {
258 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
259 return;
260 }
261 }
262
263 fStack = new UVector64(status);
264 if (fStack == NULL) {
265 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
266 return;
267 }
268
269 reset(input);
270 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
271 if (U_FAILURE(status)) {
272 fDeferredStatus = status;
273 return;
274 }
275 }
276
277
278 static const UChar BACKSLASH = 0x5c;
279 static const UChar DOLLARSIGN = 0x24;
280 //--------------------------------------------------------------------------------
281 //
282 // appendReplacement
283 //
284 //--------------------------------------------------------------------------------
285 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
286 const UnicodeString &replacement,
287 UErrorCode &status) {
288 UText replacementText = UTEXT_INITIALIZER;
289
290 utext_openConstUnicodeString(&replacementText, &replacement, &status);
291 if (U_SUCCESS(status)) {
292 UText resultText = UTEXT_INITIALIZER;
293 utext_openUnicodeString(&resultText, &dest, &status);
294
295 if (U_SUCCESS(status)) {
296 appendReplacement(&resultText, &replacementText, status);
297 utext_close(&resultText);
298 }
299 utext_close(&replacementText);
300 }
301
302 return *this;
303 }
304
305 //
306 // appendReplacement, UText mode
307 //
308 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
309 UText *replacement,
310 UErrorCode &status) {
311 if (U_FAILURE(status)) {
312 return *this;
313 }
314 if (U_FAILURE(fDeferredStatus)) {
315 status = fDeferredStatus;
316 return *this;
317 }
318 if (fMatch == FALSE) {
319 status = U_REGEX_INVALID_STATE;
320 return *this;
321 }
322
323 // Copy input string from the end of previous match to start of current match
324 int64_t destLen = utext_nativeLength(dest);
325 if (fMatchStart > fAppendPosition) {
326 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
327 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
328 (int32_t)(fMatchStart-fAppendPosition), &status);
329 } else {
330 int32_t len16;
331 if (UTEXT_USES_U16(fInputText)) {
332 len16 = (int32_t)(fMatchStart-fAppendPosition);
333 } else {
334 UErrorCode lengthStatus = U_ZERO_ERROR;
335 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
336 }
337 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
338 if (inputChars == NULL) {
339 status = U_MEMORY_ALLOCATION_ERROR;
340 return *this;
341 }
342 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
343 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
344 uprv_free(inputChars);
345 }
346 }
347 fAppendPosition = fMatchEnd;
348
349
350 // scan the replacement text, looking for substitutions ($n) and \escapes.
351 // TODO: optimize this loop by efficiently scanning for '$' or '\',
352 // move entire ranges not containing substitutions.
353 UTEXT_SETNATIVEINDEX(replacement, 0);
354 UChar32 c = UTEXT_NEXT32(replacement);
355 while (c != U_SENTINEL) {
356 if (c == BACKSLASH) {
357 // Backslash Escape. Copy the following char out without further checks.
358 // Note: Surrogate pairs don't need any special handling
359 // The second half wont be a '$' or a '\', and
360 // will move to the dest normally on the next
361 // loop iteration.
362 c = UTEXT_CURRENT32(replacement);
363 if (c == U_SENTINEL) {
364 break;
365 }
366
367 if (c==0x55/*U*/ || c==0x75/*u*/) {
368 // We have a \udddd or \Udddddddd escape sequence.
369 int32_t offset = 0;
370 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
371 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
372 if (escapedChar != (UChar32)0xFFFFFFFF) {
373 if (U_IS_BMP(escapedChar)) {
374 UChar c16 = (UChar)escapedChar;
375 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
376 } else {
377 UChar surrogate[2];
378 surrogate[0] = U16_LEAD(escapedChar);
379 surrogate[1] = U16_TRAIL(escapedChar);
380 if (U_SUCCESS(status)) {
381 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
382 }
383 }
384 // TODO: Report errors for mal-formed \u escapes?
385 // As this is, the original sequence is output, which may be OK.
386 if (context.lastOffset == offset) {
387 (void)UTEXT_PREVIOUS32(replacement);
388 } else if (context.lastOffset != offset-1) {
389 utext_moveIndex32(replacement, offset - context.lastOffset - 1);
390 }
391 }
392 } else {
393 (void)UTEXT_NEXT32(replacement);
394 // Plain backslash escape. Just put out the escaped character.
395 if (U_IS_BMP(c)) {
396 UChar c16 = (UChar)c;
397 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
398 } else {
399 UChar surrogate[2];
400 surrogate[0] = U16_LEAD(c);
401 surrogate[1] = U16_TRAIL(c);
402 if (U_SUCCESS(status)) {
403 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
404 }
405 }
406 }
407 } else if (c != DOLLARSIGN) {
408 // Normal char, not a $. Copy it out without further checks.
409 if (U_IS_BMP(c)) {
410 UChar c16 = (UChar)c;
411 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
412 } else {
413 UChar surrogate[2];
414 surrogate[0] = U16_LEAD(c);
415 surrogate[1] = U16_TRAIL(c);
416 if (U_SUCCESS(status)) {
417 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
418 }
419 }
420 } else {
421 // We've got a $. Pick up a capture group number if one follows.
422 // Consume at most the number of digits necessary for the largest capture
423 // number that is valid for this pattern.
424
425 int32_t numDigits = 0;
426 int32_t groupNum = 0;
427 UChar32 digitC;
428 for (;;) {
429 digitC = UTEXT_CURRENT32(replacement);
430 if (digitC == U_SENTINEL) {
431 break;
432 }
433 if (u_isdigit(digitC) == FALSE) {
434 break;
435 }
436 (void)UTEXT_NEXT32(replacement);
437 groupNum=groupNum*10 + u_charDigitValue(digitC);
438 numDigits++;
439 if (numDigits >= fPattern->fMaxCaptureDigits) {
440 break;
441 }
442 }
443
444
445 if (numDigits == 0) {
446 // The $ didn't introduce a group number at all.
447 // Treat it as just part of the substitution text.
448 UChar c16 = DOLLARSIGN;
449 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
450 } else {
451 // Finally, append the capture group data to the destination.
452 destLen += appendGroup(groupNum, dest, status);
453 if (U_FAILURE(status)) {
454 // Can fail if group number is out of range.
455 break;
456 }
457 }
458 }
459
460 if (U_FAILURE(status)) {
461 break;
462 } else {
463 c = UTEXT_NEXT32(replacement);
464 }
465 }
466
467 return *this;
468 }
469
470
471
472 //--------------------------------------------------------------------------------
473 //
474 // appendTail Intended to be used in conjunction with appendReplacement()
475 // To the destination string, append everything following
476 // the last match position from the input string.
477 //
478 // Note: Match ranges do not affect appendTail or appendReplacement
479 //
480 //--------------------------------------------------------------------------------
481 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
482 UErrorCode status = U_ZERO_ERROR;
483 UText resultText = UTEXT_INITIALIZER;
484 utext_openUnicodeString(&resultText, &dest, &status);
485
486 if (U_SUCCESS(status)) {
487 appendTail(&resultText, status);
488 utext_close(&resultText);
489 }
490
491 return dest;
492 }
493
494 //
495 // appendTail, UText mode
496 //
497 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
498 UBool bailOut = FALSE;
499 if (U_FAILURE(status)) {
500 bailOut = TRUE;
501 }
502 if (U_FAILURE(fDeferredStatus)) {
503 status = fDeferredStatus;
504 bailOut = TRUE;
505 }
506
507 if (bailOut) {
508 // dest must not be NULL
509 if (dest) {
510 utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(dest), NULL, 0, &status);
511 return dest;
512 }
513 }
514
515 if (fInputLength > fAppendPosition) {
516 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
517 int64_t destLen = utext_nativeLength(dest);
518 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
519 (int32_t)(fInputLength-fAppendPosition), &status);
520 } else {
521 int32_t len16;
522 if (UTEXT_USES_U16(fInputText)) {
523 len16 = (int32_t)(fInputLength-fAppendPosition);
524 } else {
525 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
526 status = U_ZERO_ERROR; // buffer overflow
527 }
528
529 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
530 if (inputChars == NULL) {
531 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
532 } else {
533 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
534 int64_t destLen = utext_nativeLength(dest);
535 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
536 uprv_free(inputChars);
537 }
538 }
539 }
540 return dest;
541 }
542
543
544
545 //--------------------------------------------------------------------------------
546 //
547 // end
548 //
549 //--------------------------------------------------------------------------------
550 int32_t RegexMatcher::end(UErrorCode &err) const {
551 return end(0, err);
552 }
553
554 int64_t RegexMatcher::end64(UErrorCode &err) const {
555 return end64(0, err);
556 }
557
558 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
559 if (U_FAILURE(err)) {
560 return -1;
561 }
562 if (fMatch == FALSE) {
563 err = U_REGEX_INVALID_STATE;
564 return -1;
565 }
566 if (group < 0 || group > fPattern->fGroupMap->size()) {
567 err = U_INDEX_OUTOFBOUNDS_ERROR;
568 return -1;
569 }
570 int64_t e = -1;
571 if (group == 0) {
572 e = fMatchEnd;
573 } else {
574 // Get the position within the stack frame of the variables for
575 // this capture group.
576 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
577 U_ASSERT(groupOffset < fPattern->fFrameSize);
578 U_ASSERT(groupOffset >= 0);
579 e = fFrame->fExtra[groupOffset + 1];
580 }
581
582 return e;
583 }
584
585 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
586 return (int32_t)end64(group, err);
587 }
588
589
590 //--------------------------------------------------------------------------------
591 //
592 // find()
593 //
594 //--------------------------------------------------------------------------------
595 UBool RegexMatcher::find() {
596 // Start at the position of the last match end. (Will be zero if the
597 // matcher has been reset.)
598 //
599 if (U_FAILURE(fDeferredStatus)) {
600 return FALSE;
601 }
602
603 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
604 return findUsingChunk();
605 }
606
607 int64_t startPos = fMatchEnd;
608 if (startPos==0) {
609 startPos = fActiveStart;
610 }
611
612 if (fMatch) {
613 // Save the position of any previous successful match.
614 fLastMatchEnd = fMatchEnd;
615
616 if (fMatchStart == fMatchEnd) {
617 // Previous match had zero length. Move start position up one position
618 // to avoid sending find() into a loop on zero-length matches.
619 if (startPos >= fActiveLimit) {
620 fMatch = FALSE;
621 fHitEnd = TRUE;
622 return FALSE;
623 }
624 UTEXT_SETNATIVEINDEX(fInputText, startPos);
625 (void)UTEXT_NEXT32(fInputText);
626 startPos = UTEXT_GETNATIVEINDEX(fInputText);
627 }
628 } else {
629 if (fLastMatchEnd >= 0) {
630 // A previous find() failed to match. Don't try again.
631 // (without this test, a pattern with a zero-length match
632 // could match again at the end of an input string.)
633 fHitEnd = TRUE;
634 return FALSE;
635 }
636 }
637
638
639 // Compute the position in the input string beyond which a match can not begin, because
640 // the minimum length match would extend past the end of the input.
641 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
642 // Be aware of possible overflows if making changes here.
643 int64_t testStartLimit;
644 if (UTEXT_USES_U16(fInputText)) {
645 testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
646 if (startPos > testStartLimit) {
647 fMatch = FALSE;
648 fHitEnd = TRUE;
649 return FALSE;
650 }
651 } else {
652 // For now, let the matcher discover that it can't match on its own
653 // We don't know how long the match len is in native characters
654 testStartLimit = fActiveLimit;
655 }
656
657 UChar32 c;
658 U_ASSERT(startPos >= 0);
659
660 switch (fPattern->fStartType) {
661 case START_NO_INFO:
662 // No optimization was found.
663 // Try a match at each input position.
664 for (;;) {
665 MatchAt(startPos, FALSE, fDeferredStatus);
666 if (U_FAILURE(fDeferredStatus)) {
667 return FALSE;
668 }
669 if (fMatch) {
670 return TRUE;
671 }
672 if (startPos >= testStartLimit) {
673 fHitEnd = TRUE;
674 return FALSE;
675 }
676 UTEXT_SETNATIVEINDEX(fInputText, startPos);
677 (void)UTEXT_NEXT32(fInputText);
678 startPos = UTEXT_GETNATIVEINDEX(fInputText);
679 // Note that it's perfectly OK for a pattern to have a zero-length
680 // match at the end of a string, so we must make sure that the loop
681 // runs with startPos == testStartLimit the last time through.
682 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
683 return FALSE;
684 }
685 U_ASSERT(FALSE);
686
687 case START_START:
688 // Matches are only possible at the start of the input string
689 // (pattern begins with ^ or \A)
690 if (startPos > fActiveStart) {
691 fMatch = FALSE;
692 return FALSE;
693 }
694 MatchAt(startPos, FALSE, fDeferredStatus);
695 if (U_FAILURE(fDeferredStatus)) {
696 return FALSE;
697 }
698 return fMatch;
699
700
701 case START_SET:
702 {
703 // Match may start on any char from a pre-computed set.
704 U_ASSERT(fPattern->fMinMatchLen > 0);
705 int64_t pos;
706 UTEXT_SETNATIVEINDEX(fInputText, startPos);
707 for (;;) {
708 c = UTEXT_NEXT32(fInputText);
709 pos = UTEXT_GETNATIVEINDEX(fInputText);
710 // c will be -1 (U_SENTINEL) at end of text, in which case we
711 // skip this next block (so we don't have a negative array index)
712 // and handle end of text in the following block.
713 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
714 (c>=256 && fPattern->fInitialChars->contains(c)))) {
715 MatchAt(startPos, FALSE, fDeferredStatus);
716 if (U_FAILURE(fDeferredStatus)) {
717 return FALSE;
718 }
719 if (fMatch) {
720 return TRUE;
721 }
722 UTEXT_SETNATIVEINDEX(fInputText, pos);
723 }
724 if (startPos >= testStartLimit) {
725 fMatch = FALSE;
726 fHitEnd = TRUE;
727 return FALSE;
728 }
729 startPos = pos;
730 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
731 return FALSE;
732 }
733 }
734 U_ASSERT(FALSE);
735
736 case START_STRING:
737 case START_CHAR:
738 {
739 // Match starts on exactly one char.
740 U_ASSERT(fPattern->fMinMatchLen > 0);
741 UChar32 theChar = fPattern->fInitialChar;
742 int64_t pos;
743 UTEXT_SETNATIVEINDEX(fInputText, startPos);
744 for (;;) {
745 c = UTEXT_NEXT32(fInputText);
746 pos = UTEXT_GETNATIVEINDEX(fInputText);
747 if (c == theChar) {
748 MatchAt(startPos, FALSE, fDeferredStatus);
749 if (U_FAILURE(fDeferredStatus)) {
750 return FALSE;
751 }
752 if (fMatch) {
753 return TRUE;
754 }
755 UTEXT_SETNATIVEINDEX(fInputText, pos);
756 }
757 if (startPos >= testStartLimit) {
758 fMatch = FALSE;
759 fHitEnd = TRUE;
760 return FALSE;
761 }
762 startPos = pos;
763 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
764 return FALSE;
765 }
766 }
767 U_ASSERT(FALSE);
768
769 case START_LINE:
770 {
771 UChar32 c;
772 if (startPos == fAnchorStart) {
773 MatchAt(startPos, FALSE, fDeferredStatus);
774 if (U_FAILURE(fDeferredStatus)) {
775 return FALSE;
776 }
777 if (fMatch) {
778 return TRUE;
779 }
780 UTEXT_SETNATIVEINDEX(fInputText, startPos);
781 c = UTEXT_NEXT32(fInputText);
782 startPos = UTEXT_GETNATIVEINDEX(fInputText);
783 } else {
784 UTEXT_SETNATIVEINDEX(fInputText, startPos);
785 c = UTEXT_PREVIOUS32(fInputText);
786 UTEXT_SETNATIVEINDEX(fInputText, startPos);
787 }
788
789 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
790 for (;;) {
791 if (c == 0x0a) {
792 MatchAt(startPos, FALSE, fDeferredStatus);
793 if (U_FAILURE(fDeferredStatus)) {
794 return FALSE;
795 }
796 if (fMatch) {
797 return TRUE;
798 }
799 UTEXT_SETNATIVEINDEX(fInputText, startPos);
800 }
801 if (startPos >= testStartLimit) {
802 fMatch = FALSE;
803 fHitEnd = TRUE;
804 return FALSE;
805 }
806 c = UTEXT_NEXT32(fInputText);
807 startPos = UTEXT_GETNATIVEINDEX(fInputText);
808 // Note that it's perfectly OK for a pattern to have a zero-length
809 // match at the end of a string, so we must make sure that the loop
810 // runs with startPos == testStartLimit the last time through.
811 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
812 return FALSE;
813 }
814 } else {
815 for (;;) {
816 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
817 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
818 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
819 (void)UTEXT_NEXT32(fInputText);
820 startPos = UTEXT_GETNATIVEINDEX(fInputText);
821 }
822 MatchAt(startPos, FALSE, fDeferredStatus);
823 if (U_FAILURE(fDeferredStatus)) {
824 return FALSE;
825 }
826 if (fMatch) {
827 return TRUE;
828 }
829 UTEXT_SETNATIVEINDEX(fInputText, startPos);
830 }
831 if (startPos >= testStartLimit) {
832 fMatch = FALSE;
833 fHitEnd = TRUE;
834 return FALSE;
835 }
836 c = UTEXT_NEXT32(fInputText);
837 startPos = UTEXT_GETNATIVEINDEX(fInputText);
838 // Note that it's perfectly OK for a pattern to have a zero-length
839 // match at the end of a string, so we must make sure that the loop
840 // runs with startPos == testStartLimit the last time through.
841 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
842 return FALSE;
843 }
844 }
845 }
846
847 default:
848 U_ASSERT(FALSE);
849 }
850
851 U_ASSERT(FALSE);
852 return FALSE;
853 }
854
855
856
857 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
858 if (U_FAILURE(status)) {
859 return FALSE;
860 }
861 if (U_FAILURE(fDeferredStatus)) {
862 status = fDeferredStatus;
863 return FALSE;
864 }
865 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
866 // This will reset the region to be the full input length.
867 if (start < 0) {
868 status = U_INDEX_OUTOFBOUNDS_ERROR;
869 return FALSE;
870 }
871
872 int64_t nativeStart = start;
873 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
874 status = U_INDEX_OUTOFBOUNDS_ERROR;
875 return FALSE;
876 }
877 fMatchEnd = nativeStart;
878 return find();
879 }
880
881
882 //--------------------------------------------------------------------------------
883 //
884 // findUsingChunk() -- like find(), but with the advance knowledge that the
885 // entire string is available in the UText's chunk buffer.
886 //
887 //--------------------------------------------------------------------------------
888 UBool RegexMatcher::findUsingChunk() {
889 // Start at the position of the last match end. (Will be zero if the
890 // matcher has been reset.
891 //
892
893 int32_t startPos = (int32_t)fMatchEnd;
894 if (startPos==0) {
895 startPos = (int32_t)fActiveStart;
896 }
897
898 const UChar *inputBuf = fInputText->chunkContents;
899
900 if (fMatch) {
901 // Save the position of any previous successful match.
902 fLastMatchEnd = fMatchEnd;
903
904 if (fMatchStart == fMatchEnd) {
905 // Previous match had zero length. Move start position up one position
906 // to avoid sending find() into a loop on zero-length matches.
907 if (startPos >= fActiveLimit) {
908 fMatch = FALSE;
909 fHitEnd = TRUE;
910 return FALSE;
911 }
912 U16_FWD_1(inputBuf, startPos, fInputLength);
913 }
914 } else {
915 if (fLastMatchEnd >= 0) {
916 // A previous find() failed to match. Don't try again.
917 // (without this test, a pattern with a zero-length match
918 // could match again at the end of an input string.)
919 fHitEnd = TRUE;
920 return FALSE;
921 }
922 }
923
924
925 // Compute the position in the input string beyond which a match can not begin, because
926 // the minimum length match would extend past the end of the input.
927 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
928 // Be aware of possible overflows if making changes here.
929 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
930 if (startPos > testLen) {
931 fMatch = FALSE;
932 fHitEnd = TRUE;
933 return FALSE;
934 }
935
936 UChar32 c;
937 U_ASSERT(startPos >= 0);
938
939 switch (fPattern->fStartType) {
940 case START_NO_INFO:
941 // No optimization was found.
942 // Try a match at each input position.
943 for (;;) {
944 MatchChunkAt(startPos, FALSE, fDeferredStatus);
945 if (U_FAILURE(fDeferredStatus)) {
946 return FALSE;
947 }
948 if (fMatch) {
949 return TRUE;
950 }
951 if (startPos >= testLen) {
952 fHitEnd = TRUE;
953 return FALSE;
954 }
955 U16_FWD_1(inputBuf, startPos, fActiveLimit);
956 // Note that it's perfectly OK for a pattern to have a zero-length
957 // match at the end of a string, so we must make sure that the loop
958 // runs with startPos == testLen the last time through.
959 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
960 return FALSE;
961 }
962 U_ASSERT(FALSE);
963
964 case START_START:
965 // Matches are only possible at the start of the input string
966 // (pattern begins with ^ or \A)
967 if (startPos > fActiveStart) {
968 fMatch = FALSE;
969 return FALSE;
970 }
971 MatchChunkAt(startPos, FALSE, fDeferredStatus);
972 if (U_FAILURE(fDeferredStatus)) {
973 return FALSE;
974 }
975 return fMatch;
976
977
978 case START_SET:
979 {
980 // Match may start on any char from a pre-computed set.
981 U_ASSERT(fPattern->fMinMatchLen > 0);
982 for (;;) {
983 int32_t pos = startPos;
984 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
985 if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
986 (c>=256 && fPattern->fInitialChars->contains(c))) {
987 MatchChunkAt(pos, FALSE, fDeferredStatus);
988 if (U_FAILURE(fDeferredStatus)) {
989 return FALSE;
990 }
991 if (fMatch) {
992 return TRUE;
993 }
994 }
995 if (pos >= testLen) {
996 fMatch = FALSE;
997 fHitEnd = TRUE;
998 return FALSE;
999 }
1000 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1001 return FALSE;
1002 }
1003 }
1004 U_ASSERT(FALSE);
1005
1006 case START_STRING:
1007 case START_CHAR:
1008 {
1009 // Match starts on exactly one char.
1010 U_ASSERT(fPattern->fMinMatchLen > 0);
1011 UChar32 theChar = fPattern->fInitialChar;
1012 for (;;) {
1013 int32_t pos = startPos;
1014 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1015 if (c == theChar) {
1016 MatchChunkAt(pos, FALSE, fDeferredStatus);
1017 if (U_FAILURE(fDeferredStatus)) {
1018 return FALSE;
1019 }
1020 if (fMatch) {
1021 return TRUE;
1022 }
1023 }
1024 if (pos >= testLen) {
1025 fMatch = FALSE;
1026 fHitEnd = TRUE;
1027 return FALSE;
1028 }
1029 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1030 return FALSE;
1031 }
1032 }
1033 U_ASSERT(FALSE);
1034
1035 case START_LINE:
1036 {
1037 UChar32 c;
1038 if (startPos == fAnchorStart) {
1039 MatchChunkAt(startPos, FALSE, fDeferredStatus);
1040 if (U_FAILURE(fDeferredStatus)) {
1041 return FALSE;
1042 }
1043 if (fMatch) {
1044 return TRUE;
1045 }
1046 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1047 }
1048
1049 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1050 for (;;) {
1051 c = inputBuf[startPos-1];
1052 if (c == 0x0a) {
1053 MatchChunkAt(startPos, FALSE, fDeferredStatus);
1054 if (U_FAILURE(fDeferredStatus)) {
1055 return FALSE;
1056 }
1057 if (fMatch) {
1058 return TRUE;
1059 }
1060 }
1061 if (startPos >= testLen) {
1062 fMatch = FALSE;
1063 fHitEnd = TRUE;
1064 return FALSE;
1065 }
1066 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1067 // Note that it's perfectly OK for a pattern to have a zero-length
1068 // match at the end of a string, so we must make sure that the loop
1069 // runs with startPos == testLen the last time through.
1070 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1071 return FALSE;
1072 }
1073 } else {
1074 for (;;) {
1075 c = inputBuf[startPos-1];
1076 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
1077 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
1078 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1079 startPos++;
1080 }
1081 MatchChunkAt(startPos, FALSE, fDeferredStatus);
1082 if (U_FAILURE(fDeferredStatus)) {
1083 return FALSE;
1084 }
1085 if (fMatch) {
1086 return TRUE;
1087 }
1088 }
1089 if (startPos >= testLen) {
1090 fMatch = FALSE;
1091 fHitEnd = TRUE;
1092 return FALSE;
1093 }
1094 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1095 // Note that it's perfectly OK for a pattern to have a zero-length
1096 // match at the end of a string, so we must make sure that the loop
1097 // runs with startPos == testLen the last time through.
1098 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1099 return FALSE;
1100 }
1101 }
1102 }
1103
1104 default:
1105 U_ASSERT(FALSE);
1106 }
1107
1108 U_ASSERT(FALSE);
1109 return FALSE;
1110 }
1111
1112
1113
1114 //--------------------------------------------------------------------------------
1115 //
1116 // group()
1117 //
1118 //--------------------------------------------------------------------------------
1119 UnicodeString RegexMatcher::group(UErrorCode &status) const {
1120 return group(0, status);
1121 }
1122
1123 // Return immutable shallow clone
1124 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1125 return group(0, dest, group_len, status);
1126 }
1127
1128 // Return immutable shallow clone
1129 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1130 group_len = 0;
1131 UBool bailOut = FALSE;
1132 if (U_FAILURE(status)) {
1133 return dest;
1134 }
1135 if (U_FAILURE(fDeferredStatus)) {
1136 status = fDeferredStatus;
1137 bailOut = TRUE;
1138 }
1139 if (fMatch == FALSE) {
1140 status = U_REGEX_INVALID_STATE;
1141 bailOut = TRUE;
1142 }
1143 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1144 status = U_INDEX_OUTOFBOUNDS_ERROR;
1145 bailOut = TRUE;
1146 }
1147
1148 if (bailOut) {
1149 return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status);
1150 }
1151
1152 int64_t s, e;
1153 if (groupNum == 0) {
1154 s = fMatchStart;
1155 e = fMatchEnd;
1156 } else {
1157 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1158 U_ASSERT(groupOffset < fPattern->fFrameSize);
1159 U_ASSERT(groupOffset >= 0);
1160 s = fFrame->fExtra[groupOffset];
1161 e = fFrame->fExtra[groupOffset+1];
1162 }
1163
1164 if (s < 0) {
1165 // A capture group wasn't part of the match
1166 return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1167 }
1168 U_ASSERT(s <= e);
1169 group_len = e - s;
1170
1171 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1172 if (dest)
1173 UTEXT_SETNATIVEINDEX(dest, s);
1174 return dest;
1175 }
1176
1177 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1178 UnicodeString result;
1179 if (U_FAILURE(status)) {
1180 return result;
1181 }
1182 UText resultText = UTEXT_INITIALIZER;
1183 utext_openUnicodeString(&resultText, &result, &status);
1184 group(groupNum, &resultText, status);
1185 utext_close(&resultText);
1186 return result;
1187 }
1188
1189
1190 // Return deep (mutable) clone
1191 // Technology Preview (as an API), but note that the UnicodeString API is implemented
1192 // using this function.
1193 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {
1194 UBool bailOut = FALSE;
1195 if (U_FAILURE(status)) {
1196 return dest;
1197 }
1198 if (U_FAILURE(fDeferredStatus)) {
1199 status = fDeferredStatus;
1200 bailOut = TRUE;
1201 }
1202
1203 if (fMatch == FALSE) {
1204 status = U_REGEX_INVALID_STATE;
1205 bailOut = TRUE;
1206 }
1207 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1208 status = U_INDEX_OUTOFBOUNDS_ERROR;
1209 bailOut = TRUE;
1210 }
1211
1212 if (bailOut) {
1213 if (dest) {
1214 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1215 return dest;
1216 } else {
1217 return utext_openUChars(NULL, NULL, 0, &status);
1218 }
1219 }
1220
1221 int64_t s, e;
1222 if (groupNum == 0) {
1223 s = fMatchStart;
1224 e = fMatchEnd;
1225 } else {
1226 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1227 U_ASSERT(groupOffset < fPattern->fFrameSize);
1228 U_ASSERT(groupOffset >= 0);
1229 s = fFrame->fExtra[groupOffset];
1230 e = fFrame->fExtra[groupOffset+1];
1231 }
1232
1233 if (s < 0) {
1234 // A capture group wasn't part of the match
1235 if (dest) {
1236 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1237 return dest;
1238 } else {
1239 return utext_openUChars(NULL, NULL, 0, &status);
1240 }
1241 }
1242 U_ASSERT(s <= e);
1243
1244 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1245 U_ASSERT(e <= fInputLength);
1246 if (dest) {
1247 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status);
1248 } else {
1249 UText groupText = UTEXT_INITIALIZER;
1250 utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status);
1251 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1252 utext_close(&groupText);
1253 }
1254 } else {
1255 int32_t len16;
1256 if (UTEXT_USES_U16(fInputText)) {
1257 len16 = (int32_t)(e-s);
1258 } else {
1259 UErrorCode lengthStatus = U_ZERO_ERROR;
1260 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1261 }
1262 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1263 if (groupChars == NULL) {
1264 status = U_MEMORY_ALLOCATION_ERROR;
1265 return dest;
1266 }
1267 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1268
1269 if (dest) {
1270 utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);
1271 } else {
1272 UText groupText = UTEXT_INITIALIZER;
1273 utext_openUChars(&groupText, groupChars, len16, &status);
1274 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1275 utext_close(&groupText);
1276 }
1277
1278 uprv_free(groupChars);
1279 }
1280 return dest;
1281 }
1282
1283 //--------------------------------------------------------------------------------
1284 //
1285 // appendGroup() -- currently internal only, appends a group to a UText rather
1286 // than replacing its contents
1287 //
1288 //--------------------------------------------------------------------------------
1289
1290 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1291 if (U_FAILURE(status)) {
1292 return 0;
1293 }
1294 if (U_FAILURE(fDeferredStatus)) {
1295 status = fDeferredStatus;
1296 return 0;
1297 }
1298 int64_t destLen = utext_nativeLength(dest);
1299
1300 if (fMatch == FALSE) {
1301 status = U_REGEX_INVALID_STATE;
1302 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1303 }
1304 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1305 status = U_INDEX_OUTOFBOUNDS_ERROR;
1306 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1307 }
1308
1309 int64_t s, e;
1310 if (groupNum == 0) {
1311 s = fMatchStart;
1312 e = fMatchEnd;
1313 } else {
1314 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1315 U_ASSERT(groupOffset < fPattern->fFrameSize);
1316 U_ASSERT(groupOffset >= 0);
1317 s = fFrame->fExtra[groupOffset];
1318 e = fFrame->fExtra[groupOffset+1];
1319 }
1320
1321 if (s < 0) {
1322 // A capture group wasn't part of the match
1323 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1324 }
1325 U_ASSERT(s <= e);
1326
1327 int64_t deltaLen;
1328 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1329 U_ASSERT(e <= fInputLength);
1330 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1331 } else {
1332 int32_t len16;
1333 if (UTEXT_USES_U16(fInputText)) {
1334 len16 = (int32_t)(e-s);
1335 } else {
1336 UErrorCode lengthStatus = U_ZERO_ERROR;
1337 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1338 }
1339 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1340 if (groupChars == NULL) {
1341 status = U_MEMORY_ALLOCATION_ERROR;
1342 return 0;
1343 }
1344 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1345
1346 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1347 uprv_free(groupChars);
1348 }
1349 return deltaLen;
1350 }
1351
1352
1353
1354 //--------------------------------------------------------------------------------
1355 //
1356 // groupCount()
1357 //
1358 //--------------------------------------------------------------------------------
1359 int32_t RegexMatcher::groupCount() const {
1360 return fPattern->fGroupMap->size();
1361 }
1362
1363
1364
1365 //--------------------------------------------------------------------------------
1366 //
1367 // hasAnchoringBounds()
1368 //
1369 //--------------------------------------------------------------------------------
1370 UBool RegexMatcher::hasAnchoringBounds() const {
1371 return fAnchoringBounds;
1372 }
1373
1374
1375 //--------------------------------------------------------------------------------
1376 //
1377 // hasTransparentBounds()
1378 //
1379 //--------------------------------------------------------------------------------
1380 UBool RegexMatcher::hasTransparentBounds() const {
1381 return fTransparentBounds;
1382 }
1383
1384
1385
1386 //--------------------------------------------------------------------------------
1387 //
1388 // hitEnd()
1389 //
1390 //--------------------------------------------------------------------------------
1391 UBool RegexMatcher::hitEnd() const {
1392 return fHitEnd;
1393 }
1394
1395
1396 //--------------------------------------------------------------------------------
1397 //
1398 // input()
1399 //
1400 //--------------------------------------------------------------------------------
1401 const UnicodeString &RegexMatcher::input() const {
1402 if (!fInput) {
1403 UErrorCode status = U_ZERO_ERROR;
1404 int32_t len16;
1405 if (UTEXT_USES_U16(fInputText)) {
1406 len16 = (int32_t)fInputLength;
1407 } else {
1408 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1409 status = U_ZERO_ERROR; // overflow, length status
1410 }
1411 UnicodeString *result = new UnicodeString(len16, 0, 0);
1412
1413 UChar *inputChars = result->getBuffer(len16);
1414 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1415 result->releaseBuffer(len16);
1416
1417 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1418 }
1419
1420 return *fInput;
1421 }
1422
1423 //--------------------------------------------------------------------------------
1424 //
1425 // inputText()
1426 //
1427 //--------------------------------------------------------------------------------
1428 UText *RegexMatcher::inputText() const {
1429 return fInputText;
1430 }
1431
1432
1433 //--------------------------------------------------------------------------------
1434 //
1435 // getInput() -- like inputText(), but makes a clone or copies into another UText
1436 //
1437 //--------------------------------------------------------------------------------
1438 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1439 UBool bailOut = FALSE;
1440 if (U_FAILURE(status)) {
1441 return dest;
1442 }
1443 if (U_FAILURE(fDeferredStatus)) {
1444 status = fDeferredStatus;
1445 bailOut = TRUE;
1446 }
1447
1448 if (bailOut) {
1449 if (dest) {
1450 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1451 return dest;
1452 } else {
1453 return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1454 }
1455 }
1456
1457 if (dest) {
1458 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1459 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1460 } else {
1461 int32_t input16Len;
1462 if (UTEXT_USES_U16(fInputText)) {
1463 input16Len = (int32_t)fInputLength;
1464 } else {
1465 UErrorCode lengthStatus = U_ZERO_ERROR;
1466 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1467 }
1468 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1469 if (inputChars == NULL) {
1470 return dest;
1471 }
1472
1473 status = U_ZERO_ERROR;
1474 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1475 status = U_ZERO_ERROR;
1476 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1477
1478 uprv_free(inputChars);
1479 }
1480 return dest;
1481 } else {
1482 return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1483 }
1484 }
1485
1486
1487 static UBool compat_SyncMutableUTextContents(UText *ut);
1488 static UBool compat_SyncMutableUTextContents(UText *ut) {
1489 UBool retVal = FALSE;
1490
1491 // In the following test, we're really only interested in whether the UText should switch
1492 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
1493 // will still point to the correct data.
1494 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1495 UnicodeString *us=(UnicodeString *)ut->context;
1496
1497 // Update to the latest length.
1498 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1499 int32_t newLength = us->length();
1500
1501 // Update the chunk description.
1502 // The buffer may have switched between stack- and heap-based.
1503 ut->chunkContents = us->getBuffer();
1504 ut->chunkLength = newLength;
1505 ut->chunkNativeLimit = newLength;
1506 ut->nativeIndexingLimit = newLength;
1507 retVal = TRUE;
1508 }
1509
1510 return retVal;
1511 }
1512
1513 //--------------------------------------------------------------------------------
1514 //
1515 // lookingAt()
1516 //
1517 //--------------------------------------------------------------------------------
1518 UBool RegexMatcher::lookingAt(UErrorCode &status) {
1519 if (U_FAILURE(status)) {
1520 return FALSE;
1521 }
1522 if (U_FAILURE(fDeferredStatus)) {
1523 status = fDeferredStatus;
1524 return FALSE;
1525 }
1526
1527 if (fInputUniStrMaybeMutable) {
1528 if (compat_SyncMutableUTextContents(fInputText)) {
1529 fInputLength = utext_nativeLength(fInputText);
1530 reset();
1531 }
1532 }
1533 else {
1534 resetPreserveRegion();
1535 }
1536 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1537 MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1538 } else {
1539 MatchAt(fActiveStart, FALSE, status);
1540 }
1541 return fMatch;
1542 }
1543
1544
1545 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1546 if (U_FAILURE(status)) {
1547 return FALSE;
1548 }
1549 if (U_FAILURE(fDeferredStatus)) {
1550 status = fDeferredStatus;
1551 return FALSE;
1552 }
1553 reset();
1554
1555 if (start < 0) {
1556 status = U_INDEX_OUTOFBOUNDS_ERROR;
1557 return FALSE;
1558 }
1559
1560 if (fInputUniStrMaybeMutable) {
1561 if (compat_SyncMutableUTextContents(fInputText)) {
1562 fInputLength = utext_nativeLength(fInputText);
1563 reset();
1564 }
1565 }
1566
1567 int64_t nativeStart;
1568 nativeStart = start;
1569 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1570 status = U_INDEX_OUTOFBOUNDS_ERROR;
1571 return FALSE;
1572 }
1573
1574 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1575 MatchChunkAt((int32_t)nativeStart, FALSE, status);
1576 } else {
1577 MatchAt(nativeStart, FALSE, status);
1578 }
1579 return fMatch;
1580 }
1581
1582
1583
1584 //--------------------------------------------------------------------------------
1585 //
1586 // matches()
1587 //
1588 //--------------------------------------------------------------------------------
1589 UBool RegexMatcher::matches(UErrorCode &status) {
1590 if (U_FAILURE(status)) {
1591 return FALSE;
1592 }
1593 if (U_FAILURE(fDeferredStatus)) {
1594 status = fDeferredStatus;
1595 return FALSE;
1596 }
1597
1598 if (fInputUniStrMaybeMutable) {
1599 if (compat_SyncMutableUTextContents(fInputText)) {
1600 fInputLength = utext_nativeLength(fInputText);
1601 reset();
1602 }
1603 }
1604 else {
1605 resetPreserveRegion();
1606 }
1607
1608 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1609 MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1610 } else {
1611 MatchAt(fActiveStart, TRUE, status);
1612 }
1613 return fMatch;
1614 }
1615
1616
1617 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1618 if (U_FAILURE(status)) {
1619 return FALSE;
1620 }
1621 if (U_FAILURE(fDeferredStatus)) {
1622 status = fDeferredStatus;
1623 return FALSE;
1624 }
1625 reset();
1626
1627 if (start < 0) {
1628 status = U_INDEX_OUTOFBOUNDS_ERROR;
1629 return FALSE;
1630 }
1631
1632 if (fInputUniStrMaybeMutable) {
1633 if (compat_SyncMutableUTextContents(fInputText)) {
1634 fInputLength = utext_nativeLength(fInputText);
1635 reset();
1636 }
1637 }
1638
1639 int64_t nativeStart;
1640 nativeStart = start;
1641 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1642 status = U_INDEX_OUTOFBOUNDS_ERROR;
1643 return FALSE;
1644 }
1645
1646 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1647 MatchChunkAt((int32_t)nativeStart, TRUE, status);
1648 } else {
1649 MatchAt(nativeStart, TRUE, status);
1650 }
1651 return fMatch;
1652 }
1653
1654
1655
1656 //--------------------------------------------------------------------------------
1657 //
1658 // pattern
1659 //
1660 //--------------------------------------------------------------------------------
1661 const RegexPattern &RegexMatcher::pattern() const {
1662 return *fPattern;
1663 }
1664
1665
1666
1667 //--------------------------------------------------------------------------------
1668 //
1669 // region
1670 //
1671 //--------------------------------------------------------------------------------
1672 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1673 if (U_FAILURE(status)) {
1674 return *this;
1675 }
1676
1677 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1678 status = U_ILLEGAL_ARGUMENT_ERROR;
1679 }
1680
1681 int64_t nativeStart = regionStart;
1682 int64_t nativeLimit = regionLimit;
1683 if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1684 status = U_ILLEGAL_ARGUMENT_ERROR;
1685 }
1686
1687 if (startIndex == -1)
1688 this->reset();
1689 else
1690 resetPreserveRegion();
1691
1692 fRegionStart = nativeStart;
1693 fRegionLimit = nativeLimit;
1694 fActiveStart = nativeStart;
1695 fActiveLimit = nativeLimit;
1696
1697 if (startIndex != -1) {
1698 if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1699 status = U_INDEX_OUTOFBOUNDS_ERROR;
1700 }
1701 fMatchEnd = startIndex;
1702 }
1703
1704 if (!fTransparentBounds) {
1705 fLookStart = nativeStart;
1706 fLookLimit = nativeLimit;
1707 }
1708 if (fAnchoringBounds) {
1709 fAnchorStart = nativeStart;
1710 fAnchorLimit = nativeLimit;
1711 }
1712 return *this;
1713 }
1714
1715 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1716 return region(start, limit, -1, status);
1717 }
1718
1719 //--------------------------------------------------------------------------------
1720 //
1721 // regionEnd
1722 //
1723 //--------------------------------------------------------------------------------
1724 int32_t RegexMatcher::regionEnd() const {
1725 return (int32_t)fRegionLimit;
1726 }
1727
1728 int64_t RegexMatcher::regionEnd64() const {
1729 return fRegionLimit;
1730 }
1731
1732 //--------------------------------------------------------------------------------
1733 //
1734 // regionStart
1735 //
1736 //--------------------------------------------------------------------------------
1737 int32_t RegexMatcher::regionStart() const {
1738 return (int32_t)fRegionStart;
1739 }
1740
1741 int64_t RegexMatcher::regionStart64() const {
1742 return fRegionStart;
1743 }
1744
1745
1746 //--------------------------------------------------------------------------------
1747 //
1748 // replaceAll
1749 //
1750 //--------------------------------------------------------------------------------
1751 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1752 UText replacementText = UTEXT_INITIALIZER;
1753 UText resultText = UTEXT_INITIALIZER;
1754 UnicodeString resultString;
1755 if (U_FAILURE(status)) {
1756 return resultString;
1757 }
1758
1759 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1760 utext_openUnicodeString(&resultText, &resultString, &status);
1761
1762 replaceAll(&replacementText, &resultText, status);
1763
1764 utext_close(&resultText);
1765 utext_close(&replacementText);
1766
1767 return resultString;
1768 }
1769
1770
1771 //
1772 // replaceAll, UText mode
1773 //
1774 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1775 if (U_FAILURE(status)) {
1776 return dest;
1777 }
1778 if (U_FAILURE(fDeferredStatus)) {
1779 status = fDeferredStatus;
1780 return dest;
1781 }
1782
1783 if (dest == NULL) {
1784 UnicodeString emptyString;
1785 UText empty = UTEXT_INITIALIZER;
1786
1787 utext_openUnicodeString(&empty, &emptyString, &status);
1788 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1789 utext_close(&empty);
1790 }
1791
1792 if (U_SUCCESS(status)) {
1793 reset();
1794 while (find()) {
1795 appendReplacement(dest, replacement, status);
1796 if (U_FAILURE(status)) {
1797 break;
1798 }
1799 }
1800 appendTail(dest, status);
1801 }
1802
1803 return dest;
1804 }
1805
1806
1807 //--------------------------------------------------------------------------------
1808 //
1809 // replaceFirst
1810 //
1811 //--------------------------------------------------------------------------------
1812 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1813 UText replacementText = UTEXT_INITIALIZER;
1814 UText resultText = UTEXT_INITIALIZER;
1815 UnicodeString resultString;
1816
1817 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1818 utext_openUnicodeString(&resultText, &resultString, &status);
1819
1820 replaceFirst(&replacementText, &resultText, status);
1821
1822 utext_close(&resultText);
1823 utext_close(&replacementText);
1824
1825 return resultString;
1826 }
1827
1828 //
1829 // replaceFirst, UText mode
1830 //
1831 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1832 if (U_FAILURE(status)) {
1833 return dest;
1834 }
1835 if (U_FAILURE(fDeferredStatus)) {
1836 status = fDeferredStatus;
1837 return dest;
1838 }
1839
1840 reset();
1841 if (!find()) {
1842 return getInput(dest, status);
1843 }
1844
1845 if (dest == NULL) {
1846 UnicodeString emptyString;
1847 UText empty = UTEXT_INITIALIZER;
1848
1849 utext_openUnicodeString(&empty, &emptyString, &status);
1850 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1851 utext_close(&empty);
1852 }
1853
1854 appendReplacement(dest, replacement, status);
1855 appendTail(dest, status);
1856
1857 return dest;
1858 }
1859
1860
1861 //--------------------------------------------------------------------------------
1862 //
1863 // requireEnd
1864 //
1865 //--------------------------------------------------------------------------------
1866 UBool RegexMatcher::requireEnd() const {
1867 return fRequireEnd;
1868 }
1869
1870
1871 //--------------------------------------------------------------------------------
1872 //
1873 // reset
1874 //
1875 //--------------------------------------------------------------------------------
1876 RegexMatcher &RegexMatcher::reset() {
1877 fRegionStart = 0;
1878 fRegionLimit = fInputLength;
1879 fActiveStart = 0;
1880 fActiveLimit = fInputLength;
1881 fAnchorStart = 0;
1882 fAnchorLimit = fInputLength;
1883 fLookStart = 0;
1884 fLookLimit = fInputLength;
1885 resetPreserveRegion();
1886 return *this;
1887 }
1888
1889
1890
1891 void RegexMatcher::resetPreserveRegion() {
1892 fMatchStart = 0;
1893 fMatchEnd = 0;
1894 fLastMatchEnd = -1;
1895 fAppendPosition = 0;
1896 fMatch = FALSE;
1897 fHitEnd = FALSE;
1898 fRequireEnd = FALSE;
1899 fTime = 0;
1900 fTickCounter = TIMER_INITIAL_VALUE;
1901 //resetStack(); // more expensive than it looks...
1902 }
1903
1904
1905 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1906 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1907 if (fPattern->fNeedsAltInput) {
1908 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1909 }
1910 fInputLength = utext_nativeLength(fInputText);
1911
1912 reset();
1913 delete fInput;
1914 fInput = NULL;
1915
1916 // Do the following for any UnicodeString.
1917 // This is for compatibility for those clients who modify the input string "live" during regex operations.
1918 fInputUniStrMaybeMutable = TRUE;
1919
1920 if (fWordBreakItr != NULL) {
1921 #if UCONFIG_NO_BREAK_ITERATION==0
1922 UErrorCode status = U_ZERO_ERROR;
1923 fWordBreakItr->setText(fInputText, status);
1924 #endif
1925 }
1926 return *this;
1927 }
1928
1929
1930 RegexMatcher &RegexMatcher::reset(UText *input) {
1931 if (fInputText != input) {
1932 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1933 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1934 fInputLength = utext_nativeLength(fInputText);
1935
1936 delete fInput;
1937 fInput = NULL;
1938
1939 if (fWordBreakItr != NULL) {
1940 #if UCONFIG_NO_BREAK_ITERATION==0
1941 UErrorCode status = U_ZERO_ERROR;
1942 fWordBreakItr->setText(input, status);
1943 #endif
1944 }
1945 }
1946 reset();
1947 fInputUniStrMaybeMutable = FALSE;
1948
1949 return *this;
1950 }
1951
1952 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1953 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1954 return *this;
1955 }*/
1956
1957 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1958 if (U_FAILURE(status)) {
1959 return *this;
1960 }
1961 reset(); // Reset also resets the region to be the entire string.
1962
1963 if (position < 0 || position > fActiveLimit) {
1964 status = U_INDEX_OUTOFBOUNDS_ERROR;
1965 return *this;
1966 }
1967 fMatchEnd = position;
1968 return *this;
1969 }
1970
1971
1972 //--------------------------------------------------------------------------------
1973 //
1974 // refresh
1975 //
1976 //--------------------------------------------------------------------------------
1977 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1978 if (U_FAILURE(status)) {
1979 return *this;
1980 }
1981 if (input == NULL) {
1982 status = U_ILLEGAL_ARGUMENT_ERROR;
1983 return *this;
1984 }
1985 if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1986 status = U_ILLEGAL_ARGUMENT_ERROR;
1987 return *this;
1988 }
1989 int64_t pos = utext_getNativeIndex(fInputText);
1990 // Shallow read-only clone of the new UText into the existing input UText
1991 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1992 if (U_FAILURE(status)) {
1993 return *this;
1994 }
1995 utext_setNativeIndex(fInputText, pos);
1996
1997 if (fAltInputText != NULL) {
1998 pos = utext_getNativeIndex(fAltInputText);
1999 fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
2000 if (U_FAILURE(status)) {
2001 return *this;
2002 }
2003 utext_setNativeIndex(fAltInputText, pos);
2004 }
2005 return *this;
2006 }
2007
2008
2009
2010 //--------------------------------------------------------------------------------
2011 //
2012 // setTrace
2013 //
2014 //--------------------------------------------------------------------------------
2015 void RegexMatcher::setTrace(UBool state) {
2016 fTraceDebug = state;
2017 }
2018
2019
2020
2021 //---------------------------------------------------------------------
2022 //
2023 // split
2024 //
2025 //---------------------------------------------------------------------
2026 int32_t RegexMatcher::split(const UnicodeString &input,
2027 UnicodeString dest[],
2028 int32_t destCapacity,
2029 UErrorCode &status)
2030 {
2031 UText inputText = UTEXT_INITIALIZER;
2032 utext_openConstUnicodeString(&inputText, &input, &status);
2033 if (U_FAILURE(status)) {
2034 return 0;
2035 }
2036
2037 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2038 if (destText == NULL) {
2039 status = U_MEMORY_ALLOCATION_ERROR;
2040 return 0;
2041 }
2042 int32_t i;
2043 for (i = 0; i < destCapacity; i++) {
2044 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2045 }
2046
2047 int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2048
2049 for (i = 0; i < destCapacity; i++) {
2050 utext_close(destText[i]);
2051 }
2052
2053 uprv_free(destText);
2054 utext_close(&inputText);
2055 return fieldCount;
2056 }
2057
2058 //
2059 // split, UText mode
2060 //
2061 int32_t RegexMatcher::split(UText *input,
2062 UText *dest[],
2063 int32_t destCapacity,
2064 UErrorCode &status)
2065 {
2066 //
2067 // Check arguements for validity
2068 //
2069 if (U_FAILURE(status)) {
2070 return 0;
2071 };
2072
2073 if (destCapacity < 1) {
2074 status = U_ILLEGAL_ARGUMENT_ERROR;
2075 return 0;
2076 }
2077
2078 //
2079 // Reset for the input text
2080 //
2081 reset(input);
2082 int64_t nextOutputStringStart = 0;
2083 if (fActiveLimit == 0) {
2084 return 0;
2085 }
2086
2087 //
2088 // Loop through the input text, searching for the delimiter pattern
2089 //
2090 int32_t i;
2091 int32_t numCaptureGroups = fPattern->fGroupMap->size();
2092 for (i=0; ; i++) {
2093 if (i>=destCapacity-1) {
2094 // There is one or zero output string left.
2095 // Fill the last output string with whatever is left from the input, then exit the loop.
2096 // ( i will be == destCapacity if we filled the output array while processing
2097 // capture groups of the delimiter expression, in which case we will discard the
2098 // last capture group saved in favor of the unprocessed remainder of the
2099 // input string.)
2100 i = destCapacity-1;
2101 if (fActiveLimit > nextOutputStringStart) {
2102 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2103 if (dest[i]) {
2104 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2105 input->chunkContents+nextOutputStringStart,
2106 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2107 } else {
2108 UText remainingText = UTEXT_INITIALIZER;
2109 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2110 fActiveLimit-nextOutputStringStart, &status);
2111 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2112 utext_close(&remainingText);
2113 }
2114 } else {
2115 UErrorCode lengthStatus = U_ZERO_ERROR;
2116 int32_t remaining16Length =
2117 utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2118 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2119 if (remainingChars == NULL) {
2120 status = U_MEMORY_ALLOCATION_ERROR;
2121 break;
2122 }
2123
2124 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2125 if (dest[i]) {
2126 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2127 } else {
2128 UText remainingText = UTEXT_INITIALIZER;
2129 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2130 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2131 utext_close(&remainingText);
2132 }
2133
2134 uprv_free(remainingChars);
2135 }
2136 }
2137 break;
2138 }
2139 if (find()) {
2140 // We found another delimiter. Move everything from where we started looking
2141 // up until the start of the delimiter into the next output string.
2142 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2143 if (dest[i]) {
2144 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2145 input->chunkContents+nextOutputStringStart,
2146 (int32_t)(fMatchStart-nextOutputStringStart), &status);
2147 } else {
2148 UText remainingText = UTEXT_INITIALIZER;
2149 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2150 fMatchStart-nextOutputStringStart, &status);
2151 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2152 utext_close(&remainingText);
2153 }
2154 } else {
2155 UErrorCode lengthStatus = U_ZERO_ERROR;
2156 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2157 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2158 if (remainingChars == NULL) {
2159 status = U_MEMORY_ALLOCATION_ERROR;
2160 break;
2161 }
2162 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2163 if (dest[i]) {
2164 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2165 } else {
2166 UText remainingText = UTEXT_INITIALIZER;
2167 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2168 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2169 utext_close(&remainingText);
2170 }
2171
2172 uprv_free(remainingChars);
2173 }
2174 nextOutputStringStart = fMatchEnd;
2175
2176 // If the delimiter pattern has capturing parentheses, the captured
2177 // text goes out into the next n destination strings.
2178 int32_t groupNum;
2179 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2180 if (i >= destCapacity-2) {
2181 // Never fill the last available output string with capture group text.
2182 // It will filled with the last field, the remainder of the
2183 // unsplit input text.
2184 break;
2185 }
2186 i++;
2187 dest[i] = group(groupNum, dest[i], status);
2188 }
2189
2190 if (nextOutputStringStart == fActiveLimit) {
2191 // The delimiter was at the end of the string. We're done, but first
2192 // we output one last empty string, for the empty field following
2193 // the delimiter at the end of input.
2194 if (i+1 < destCapacity) {
2195 ++i;
2196 if (dest[i] == NULL) {
2197 dest[i] = utext_openUChars(NULL, NULL, 0, &status);
2198 } else {
2199 static UChar emptyString[] = {(UChar)0};
2200 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2201 }
2202 }
2203 break;
2204
2205 }
2206 }
2207 else
2208 {
2209 // We ran off the end of the input while looking for the next delimiter.
2210 // All the remaining text goes into the current output string.
2211 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2212 if (dest[i]) {
2213 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2214 input->chunkContents+nextOutputStringStart,
2215 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2216 } else {
2217 UText remainingText = UTEXT_INITIALIZER;
2218 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2219 fActiveLimit-nextOutputStringStart, &status);
2220 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2221 utext_close(&remainingText);
2222 }
2223 } else {
2224 UErrorCode lengthStatus = U_ZERO_ERROR;
2225 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2226 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2227 if (remainingChars == NULL) {
2228 status = U_MEMORY_ALLOCATION_ERROR;
2229 break;
2230 }
2231
2232 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2233 if (dest[i]) {
2234 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2235 } else {
2236 UText remainingText = UTEXT_INITIALIZER;
2237 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2238 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2239 utext_close(&remainingText);
2240 }
2241
2242 uprv_free(remainingChars);
2243 }
2244 break;
2245 }
2246 if (U_FAILURE(status)) {
2247 break;
2248 }
2249 } // end of for loop
2250 return i+1;
2251 }
2252
2253
2254 //--------------------------------------------------------------------------------
2255 //
2256 // start
2257 //
2258 //--------------------------------------------------------------------------------
2259 int32_t RegexMatcher::start(UErrorCode &status) const {
2260 return start(0, status);
2261 }
2262
2263 int64_t RegexMatcher::start64(UErrorCode &status) const {
2264 return start64(0, status);
2265 }
2266
2267 //--------------------------------------------------------------------------------
2268 //
2269 // start(int32_t group, UErrorCode &status)
2270 //
2271 //--------------------------------------------------------------------------------
2272
2273 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2274 if (U_FAILURE(status)) {
2275 return -1;
2276 }
2277 if (U_FAILURE(fDeferredStatus)) {
2278 status = fDeferredStatus;
2279 return -1;
2280 }
2281 if (fMatch == FALSE) {
2282 status = U_REGEX_INVALID_STATE;
2283 return -1;
2284 }
2285 if (group < 0 || group > fPattern->fGroupMap->size()) {
2286 status = U_INDEX_OUTOFBOUNDS_ERROR;
2287 return -1;
2288 }
2289 int64_t s;
2290 if (group == 0) {
2291 s = fMatchStart;
2292 } else {
2293 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2294 U_ASSERT(groupOffset < fPattern->fFrameSize);
2295 U_ASSERT(groupOffset >= 0);
2296 s = fFrame->fExtra[groupOffset];
2297 }
2298
2299 return s;
2300 }
2301
2302
2303 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2304 return (int32_t)start64(group, status);
2305 }
2306
2307 //--------------------------------------------------------------------------------
2308 //
2309 // useAnchoringBounds
2310 //
2311 //--------------------------------------------------------------------------------
2312 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2313 fAnchoringBounds = b;
2314 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2315 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2316 return *this;
2317 }
2318
2319
2320 //--------------------------------------------------------------------------------
2321 //
2322 // useTransparentBounds
2323 //
2324 //--------------------------------------------------------------------------------
2325 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2326 fTransparentBounds = b;
2327 fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2328 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2329 return *this;
2330 }
2331
2332 //--------------------------------------------------------------------------------
2333 //
2334 // setTimeLimit
2335 //
2336 //--------------------------------------------------------------------------------
2337 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2338 if (U_FAILURE(status)) {
2339 return;
2340 }
2341 if (U_FAILURE(fDeferredStatus)) {
2342 status = fDeferredStatus;
2343 return;
2344 }
2345 if (limit < 0) {
2346 status = U_ILLEGAL_ARGUMENT_ERROR;
2347 return;
2348 }
2349 fTimeLimit = limit;
2350 }
2351
2352
2353 //--------------------------------------------------------------------------------
2354 //
2355 // getTimeLimit
2356 //
2357 //--------------------------------------------------------------------------------
2358 int32_t RegexMatcher::getTimeLimit() const {
2359 return fTimeLimit;
2360 }
2361
2362
2363 //--------------------------------------------------------------------------------
2364 //
2365 // setStackLimit
2366 //
2367 //--------------------------------------------------------------------------------
2368 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2369 if (U_FAILURE(status)) {
2370 return;
2371 }
2372 if (U_FAILURE(fDeferredStatus)) {
2373 status = fDeferredStatus;
2374 return;
2375 }
2376 if (limit < 0) {
2377 status = U_ILLEGAL_ARGUMENT_ERROR;
2378 return;
2379 }
2380
2381 // Reset the matcher. This is needed here in case there is a current match
2382 // whose final stack frame (containing the match results, pointed to by fFrame)
2383 // would be lost by resizing to a smaller stack size.
2384 reset();
2385
2386 if (limit == 0) {
2387 // Unlimited stack expansion
2388 fStack->setMaxCapacity(0);
2389 } else {
2390 // Change the units of the limit from bytes to ints, and bump the size up
2391 // to be big enough to hold at least one stack frame for the pattern,
2392 // if it isn't there already.
2393 int32_t adjustedLimit = limit / sizeof(int32_t);
2394 if (adjustedLimit < fPattern->fFrameSize) {
2395 adjustedLimit = fPattern->fFrameSize;
2396 }
2397 fStack->setMaxCapacity(adjustedLimit);
2398 }
2399 fStackLimit = limit;
2400 }
2401
2402
2403 //--------------------------------------------------------------------------------
2404 //
2405 // getStackLimit
2406 //
2407 //--------------------------------------------------------------------------------
2408 int32_t RegexMatcher::getStackLimit() const {
2409 return fStackLimit;
2410 }
2411
2412
2413 //--------------------------------------------------------------------------------
2414 //
2415 // setMatchCallback
2416 //
2417 //--------------------------------------------------------------------------------
2418 void RegexMatcher::setMatchCallback(URegexMatchCallback *callback,
2419 const void *context,
2420 UErrorCode &status) {
2421 if (U_FAILURE(status)) {
2422 return;
2423 }
2424 fCallbackFn = callback;
2425 fCallbackContext = context;
2426 }
2427
2428
2429 //--------------------------------------------------------------------------------
2430 //
2431 // getMatchCallback
2432 //
2433 //--------------------------------------------------------------------------------
2434 void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback,
2435 const void *&context,
2436 UErrorCode &status) {
2437 if (U_FAILURE(status)) {
2438 return;
2439 }
2440 callback = fCallbackFn;
2441 context = fCallbackContext;
2442 }
2443
2444
2445 //--------------------------------------------------------------------------------
2446 //
2447 // setMatchCallback
2448 //
2449 //--------------------------------------------------------------------------------
2450 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback,
2451 const void *context,
2452 UErrorCode &status) {
2453 if (U_FAILURE(status)) {
2454 return;
2455 }
2456 fFindProgressCallbackFn = callback;
2457 fFindProgressCallbackContext = context;
2458 }
2459
2460
2461 //--------------------------------------------------------------------------------
2462 //
2463 // getMatchCallback
2464 //
2465 //--------------------------------------------------------------------------------
2466 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback,
2467 const void *&context,
2468 UErrorCode &status) {
2469 if (U_FAILURE(status)) {
2470 return;
2471 }
2472 callback = fFindProgressCallbackFn;
2473 context = fFindProgressCallbackContext;
2474 }
2475
2476
2477 //================================================================================
2478 //
2479 // Code following this point in this file is the internal
2480 // Match Engine Implementation.
2481 //
2482 //================================================================================
2483
2484
2485 //--------------------------------------------------------------------------------
2486 //
2487 // resetStack
2488 // Discard any previous contents of the state save stack, and initialize a
2489 // new stack frame to all -1. The -1s are needed for capture group limits,
2490 // where they indicate that a group has not yet matched anything.
2491 //--------------------------------------------------------------------------------
2492 REStackFrame *RegexMatcher::resetStack() {
2493 // Discard any previous contents of the state save stack, and initialize a
2494 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2495 // where they indicate that a group has not yet matched anything.
2496 fStack->removeAllElements();
2497
2498 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2499 int32_t i;
2500 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2501 iFrame->fExtra[i] = -1;
2502 }
2503 return iFrame;
2504 }
2505
2506
2507
2508 //--------------------------------------------------------------------------------
2509 //
2510 // isWordBoundary
2511 // in perl, "xab..cd..", \b is true at positions 0,3,5,7
2512 // For us,
2513 // If the current char is a combining mark,
2514 // \b is FALSE.
2515 // Else Scan backwards to the first non-combining char.
2516 // We are at a boundary if the this char and the original chars are
2517 // opposite in membership in \w set
2518 //
2519 // parameters: pos - the current position in the input buffer
2520 //
2521 // TODO: double-check edge cases at region boundaries.
2522 //
2523 //--------------------------------------------------------------------------------
2524 UBool RegexMatcher::isWordBoundary(int64_t pos) {
2525 UBool isBoundary = FALSE;
2526 UBool cIsWord = FALSE;
2527
2528 if (pos >= fLookLimit) {
2529 fHitEnd = TRUE;
2530 } else {
2531 // Determine whether char c at current position is a member of the word set of chars.
2532 // If we're off the end of the string, behave as though we're not at a word char.
2533 UTEXT_SETNATIVEINDEX(fInputText, pos);
2534 UChar32 c = UTEXT_CURRENT32(fInputText);
2535 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2536 // Current char is a combining one. Not a boundary.
2537 return FALSE;
2538 }
2539 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2540 }
2541
2542 // Back up until we come to a non-combining char, determine whether
2543 // that char is a word char.
2544 UBool prevCIsWord = FALSE;
2545 for (;;) {
2546 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2547 break;
2548 }
2549 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2550 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2551 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2552 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2553 break;
2554 }
2555 }
2556 isBoundary = cIsWord ^ prevCIsWord;
2557 return isBoundary;
2558 }
2559
2560 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2561 UBool isBoundary = FALSE;
2562 UBool cIsWord = FALSE;
2563
2564 const UChar *inputBuf = fInputText->chunkContents;
2565
2566 if (pos >= fLookLimit) {
2567 fHitEnd = TRUE;
2568 } else {
2569 // Determine whether char c at current position is a member of the word set of chars.
2570 // If we're off the end of the string, behave as though we're not at a word char.
2571 UChar32 c;
2572 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2573 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2574 // Current char is a combining one. Not a boundary.
2575 return FALSE;
2576 }
2577 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2578 }
2579
2580 // Back up until we come to a non-combining char, determine whether
2581 // that char is a word char.
2582 UBool prevCIsWord = FALSE;
2583 for (;;) {
2584 if (pos <= fLookStart) {
2585 break;
2586 }
2587 UChar32 prevChar;
2588 U16_PREV(inputBuf, fLookStart, pos, prevChar);
2589 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2590 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2591 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2592 break;
2593 }
2594 }
2595 isBoundary = cIsWord ^ prevCIsWord;
2596 return isBoundary;
2597 }
2598
2599 //--------------------------------------------------------------------------------
2600 //
2601 // isUWordBoundary
2602 //
2603 // Test for a word boundary using RBBI word break.
2604 //
2605 // parameters: pos - the current position in the input buffer
2606 //
2607 //--------------------------------------------------------------------------------
2608 UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2609 UBool returnVal = FALSE;
2610 #if UCONFIG_NO_BREAK_ITERATION==0
2611
2612 // If we haven't yet created a break iterator for this matcher, do it now.
2613 if (fWordBreakItr == NULL) {
2614 fWordBreakItr =
2615 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
2616 if (U_FAILURE(fDeferredStatus)) {
2617 return FALSE;
2618 }
2619 fWordBreakItr->setText(fInputText, fDeferredStatus);
2620 }
2621
2622 if (pos >= fLookLimit) {
2623 fHitEnd = TRUE;
2624 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real"
2625 // words are not boundaries. All non-word chars stand by themselves,
2626 // with word boundaries on both sides.
2627 } else {
2628 if (!UTEXT_USES_U16(fInputText)) {
2629 // !!!: Would like a better way to do this!
2630 UErrorCode status = U_ZERO_ERROR;
2631 pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
2632 }
2633 returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2634 }
2635 #endif
2636 return returnVal;
2637 }
2638
2639 //--------------------------------------------------------------------------------
2640 //
2641 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state
2642 // saves. Increment the "time" counter, and call the
2643 // user callback function if there is one installed.
2644 //
2645 // If the match operation needs to be aborted, either for a time-out
2646 // or because the user callback asked for it, just set an error status.
2647 // The engine will pick that up and stop in its outer loop.
2648 //
2649 //--------------------------------------------------------------------------------
2650 void RegexMatcher::IncrementTime(UErrorCode &status) {
2651 fTickCounter = TIMER_INITIAL_VALUE;
2652 fTime++;
2653 if (fCallbackFn != NULL) {
2654 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2655 status = U_REGEX_STOPPED_BY_CALLER;
2656 return;
2657 }
2658 }
2659 if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2660 status = U_REGEX_TIME_OUT;
2661 }
2662 }
2663
2664 //--------------------------------------------------------------------------------
2665 //
2666 // ReportFindProgress This function is called once for each advance in the target
2667 // string from the find() function, and calls the user progress callback
2668 // function if there is one installed.
2669 //
2670 // NOTE:
2671 //
2672 // If the match operation needs to be aborted because the user
2673 // callback asked for it, just set an error status.
2674 // The engine will pick that up and stop in its outer loop.
2675 //
2676 //--------------------------------------------------------------------------------
2677 UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) {
2678 if (fFindProgressCallbackFn != NULL) {
2679 if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) {
2680 status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/;
2681 return FALSE;
2682 }
2683 }
2684 return TRUE;
2685 }
2686
2687 //--------------------------------------------------------------------------------
2688 //
2689 // StateSave
2690 // Make a new stack frame, initialized as a copy of the current stack frame.
2691 // Set the pattern index in the original stack frame from the operand value
2692 // in the opcode. Execution of the engine continues with the state in
2693 // the newly created stack frame
2694 //
2695 // Note that reserveBlock() may grow the stack, resulting in the
2696 // whole thing being relocated in memory.
2697 //
2698 // Parameters:
2699 // fp The top frame pointer when called. At return, a new
2700 // fame will be present
2701 // savePatIdx An index into the compiled pattern. Goes into the original
2702 // (not new) frame. If execution ever back-tracks out of the
2703 // new frame, this will be where we continue from in the pattern.
2704 // Return
2705 // The new frame pointer.
2706 //
2707 //--------------------------------------------------------------------------------
2708 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2709 // push storage for a new frame.
2710 int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2711 if (newFP == NULL) {
2712 // Failure on attempted stack expansion.
2713 // Stack function set some other error code, change it to a more
2714 // specific one for regular expressions.
2715 status = U_REGEX_STACK_OVERFLOW;
2716 // We need to return a writable stack frame, so just return the
2717 // previous frame. The match operation will stop quickly
2718 // because of the error status, after which the frame will never
2719 // be looked at again.
2720 return fp;
2721 }
2722 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack.
2723
2724 // New stack frame = copy of old top frame.
2725 int64_t *source = (int64_t *)fp;
2726 int64_t *dest = newFP;
2727 for (;;) {
2728 *dest++ = *source++;
2729 if (source == newFP) {
2730 break;
2731 }
2732 }
2733
2734 fTickCounter--;
2735 if (fTickCounter <= 0) {
2736 IncrementTime(status); // Re-initializes fTickCounter
2737 }
2738 fp->fPatIdx = savePatIdx;
2739 return (REStackFrame *)newFP;
2740 }
2741
2742
2743 //--------------------------------------------------------------------------------
2744 //
2745 // MatchAt This is the actual matching engine.
2746 //
2747 // startIdx: begin matching a this index.
2748 // toEnd: if true, match must extend to end of the input region
2749 //
2750 //--------------------------------------------------------------------------------
2751 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2752 UBool isMatch = FALSE; // True if the we have a match.
2753
2754 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2755
2756 int32_t op; // Operation from the compiled pattern, split into
2757 int32_t opType; // the opcode
2758 int32_t opValue; // and the operand value.
2759
2760 #ifdef REGEX_RUN_DEBUG
2761 if (fTraceDebug)
2762 {
2763 printf("MatchAt(startIdx=%ld)\n", startIdx);
2764 printf("Original Pattern: ");
2765 UChar32 c = utext_next32From(fPattern->fPattern, 0);
2766 while (c != U_SENTINEL) {
2767 if (c<32 || c>256) {
2768 c = '.';
2769 }
2770 REGEX_DUMP_DEBUG_PRINTF(("%c", c));
2771
2772 c = UTEXT_NEXT32(fPattern->fPattern);
2773 }
2774 printf("\n");
2775 printf("Input String: ");
2776 c = utext_next32From(fInputText, 0);
2777 while (c != U_SENTINEL) {
2778 if (c<32 || c>256) {
2779 c = '.';
2780 }
2781 printf("%c", c);
2782
2783 c = UTEXT_NEXT32(fInputText);
2784 }
2785 printf("\n");
2786 printf("\n");
2787 }
2788 #endif
2789
2790 if (U_FAILURE(status)) {
2791 return;
2792 }
2793
2794 // Cache frequently referenced items from the compiled pattern
2795 //
2796 int64_t *pat = fPattern->fCompiledPat->getBuffer();
2797
2798 const UChar *litText = fPattern->fLiteralText.getBuffer();
2799 UVector *sets = fPattern->fSets;
2800
2801 fFrameSize = fPattern->fFrameSize;
2802 REStackFrame *fp = resetStack();
2803
2804 fp->fPatIdx = 0;
2805 fp->fInputIdx = startIdx;
2806
2807 // Zero out the pattern's static data
2808 int32_t i;
2809 for (i = 0; i<fPattern->fDataSize; i++) {
2810 fData[i] = 0;
2811 }
2812
2813 //
2814 // Main loop for interpreting the compiled pattern.
2815 // One iteration of the loop per pattern operation performed.
2816 //
2817 for (;;) {
2818 #if 0
2819 if (_heapchk() != _HEAPOK) {
2820 fprintf(stderr, "Heap Trouble\n");
2821 }
2822 #endif
2823
2824 op = (int32_t)pat[fp->fPatIdx];
2825 opType = URX_TYPE(op);
2826 opValue = URX_VAL(op);
2827 #ifdef REGEX_RUN_DEBUG
2828 if (fTraceDebug) {
2829 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2830 printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx,
2831 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2832 fPattern->dumpOp(fp->fPatIdx);
2833 }
2834 #endif
2835 fp->fPatIdx++;
2836
2837 switch (opType) {
2838
2839
2840 case URX_NOP:
2841 break;
2842
2843
2844 case URX_BACKTRACK:
2845 // Force a backtrack. In some circumstances, the pattern compiler
2846 // will notice that the pattern can't possibly match anything, and will
2847 // emit one of these at that point.
2848 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2849 break;
2850
2851
2852 case URX_ONECHAR:
2853 if (fp->fInputIdx < fActiveLimit) {
2854 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2855 UChar32 c = UTEXT_NEXT32(fInputText);
2856 if (c == opValue) {
2857 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2858 break;
2859 }
2860 } else {
2861 fHitEnd = TRUE;
2862 }
2863 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2864 break;
2865
2866
2867 case URX_STRING:
2868 {
2869 // Test input against a literal string.
2870 // Strings require two slots in the compiled pattern, one for the
2871 // offset to the string text, and one for the length.
2872
2873 int32_t stringStartIdx = opValue;
2874 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
2875 fp->fPatIdx++;
2876 opType = URX_TYPE(op);
2877 int32_t stringLen = URX_VAL(op);
2878 U_ASSERT(opType == URX_STRING_LEN);
2879 U_ASSERT(stringLen >= 2);
2880
2881 const UChar *patternString = litText+stringStartIdx;
2882 int32_t patternStringIndex = 0;
2883 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2884 UChar32 inputChar;
2885 UChar32 patternChar;
2886 UBool success = TRUE;
2887 while (patternStringIndex < stringLen) {
2888 if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
2889 success = FALSE;
2890 fHitEnd = TRUE;
2891 break;
2892 }
2893 inputChar = UTEXT_NEXT32(fInputText);
2894 U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
2895 if (patternChar != inputChar) {
2896 success = FALSE;
2897 break;
2898 }
2899 }
2900
2901 if (success) {
2902 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2903 } else {
2904 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2905 }
2906 }
2907 break;
2908
2909
2910 case URX_STATE_SAVE:
2911 fp = StateSave(fp, opValue, status);
2912 break;
2913
2914
2915 case URX_END:
2916 // The match loop will exit via this path on a successful match,
2917 // when we reach the end of the pattern.
2918 if (toEnd && fp->fInputIdx != fActiveLimit) {
2919 // The pattern matched, but not to the end of input. Try some more.
2920 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2921 break;
2922 }
2923 isMatch = TRUE;
2924 goto breakFromLoop;
2925
2926 // Start and End Capture stack frame variables are laid out out like this:
2927 // fp->fExtra[opValue] - The start of a completed capture group
2928 // opValue+1 - The end of a completed capture group
2929 // opValue+2 - the start of a capture group whose end
2930 // has not yet been reached (and might not ever be).
2931 case URX_START_CAPTURE:
2932 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2933 fp->fExtra[opValue+2] = fp->fInputIdx;
2934 break;
2935
2936
2937 case URX_END_CAPTURE:
2938 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2939 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
2940 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
2941 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
2942 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2943 break;
2944
2945
2946 case URX_DOLLAR: // $, test for End of line
2947 // or for position before new line at end of input
2948 {
2949 if (fp->fInputIdx >= fAnchorLimit) {
2950 // We really are at the end of input. Success.
2951 fHitEnd = TRUE;
2952 fRequireEnd = TRUE;
2953 break;
2954 }
2955
2956 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2957
2958 // If we are positioned just before a new-line that is located at the
2959 // end of input, succeed.
2960 UChar32 c = UTEXT_NEXT32(fInputText);
2961 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2962 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
2963 // If not in the middle of a CR/LF sequence
2964 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
2965 // At new-line at end of input. Success
2966 fHitEnd = TRUE;
2967 fRequireEnd = TRUE;
2968
2969 break;
2970 }
2971 }
2972 } else {
2973 UChar32 nextC = UTEXT_NEXT32(fInputText);
2974 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2975 fHitEnd = TRUE;
2976 fRequireEnd = TRUE;
2977 break; // At CR/LF at end of input. Success
2978 }
2979 }
2980
2981 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2982 }
2983 break;
2984
2985
2986 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
2987 if (fp->fInputIdx >= fAnchorLimit) {
2988 // Off the end of input. Success.
2989 fHitEnd = TRUE;
2990 fRequireEnd = TRUE;
2991 break;
2992 } else {
2993 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2994 UChar32 c = UTEXT_NEXT32(fInputText);
2995 // Either at the last character of input, or off the end.
2996 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
2997 fHitEnd = TRUE;
2998 fRequireEnd = TRUE;
2999 break;
3000 }
3001 }
3002
3003 // Not at end of input. Back-track out.
3004 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3005 break;
3006
3007
3008 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
3009 {
3010 if (fp->fInputIdx >= fAnchorLimit) {
3011 // We really are at the end of input. Success.
3012 fHitEnd = TRUE;
3013 fRequireEnd = TRUE;
3014 break;
3015 }
3016 // If we are positioned just before a new-line, succeed.
3017 // It makes no difference where the new-line is within the input.
3018 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3019 UChar32 c = UTEXT_CURRENT32(fInputText);
3020 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
3021 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
3022 // In multi-line mode, hitting a new-line just before the end of input does not
3023 // set the hitEnd or requireEnd flags
3024 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3025 break;
3026 }
3027 }
3028 // not at a new line. Fail.
3029 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3030 }
3031 break;
3032
3033
3034 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
3035 {
3036 if (fp->fInputIdx >= fAnchorLimit) {
3037 // We really are at the end of input. Success.
3038 fHitEnd = TRUE;
3039 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
3040 break; // adding a new-line would not lose the match.
3041 }
3042 // If we are not positioned just before a new-line, the test fails; backtrack out.
3043 // It makes no difference where the new-line is within the input.
3044 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3045 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3046 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3047 }
3048 }
3049 break;
3050
3051
3052 case URX_CARET: // ^, test for start of line
3053 if (fp->fInputIdx != fAnchorStart) {
3054 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3055 }
3056 break;
3057
3058
3059 case URX_CARET_M: // ^, test for start of line in mulit-line mode
3060 {
3061 if (fp->fInputIdx == fAnchorStart) {
3062 // We are at the start input. Success.
3063 break;
3064 }
3065 // Check whether character just before the current pos is a new-line
3066 // unless we are at the end of input
3067 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3068 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3069 if ((fp->fInputIdx < fAnchorLimit) &&
3070 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3071 // It's a new-line. ^ is true. Success.
3072 // TODO: what should be done with positions between a CR and LF?
3073 break;
3074 }
3075 // Not at the start of a line. Fail.
3076 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3077 }
3078 break;
3079
3080
3081 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
3082 {
3083 U_ASSERT(fp->fInputIdx >= fAnchorStart);
3084 if (fp->fInputIdx <= fAnchorStart) {
3085 // We are at the start input. Success.
3086 break;
3087 }
3088 // Check whether character just before the current pos is a new-line
3089 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3090 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3091 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3092 if (c != 0x0a) {
3093 // Not at the start of a line. Back-track out.
3094 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3095 }
3096 }
3097 break;
3098
3099 case URX_BACKSLASH_B: // Test for word boundaries
3100 {
3101 UBool success = isWordBoundary(fp->fInputIdx);
3102 success ^= (opValue != 0); // flip sense for \B
3103 if (!success) {
3104 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3105 }
3106 }
3107 break;
3108
3109
3110 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
3111 {
3112 UBool success = isUWordBoundary(fp->fInputIdx);
3113 success ^= (opValue != 0); // flip sense for \B
3114 if (!success) {
3115 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3116 }
3117 }
3118 break;
3119
3120
3121 case URX_BACKSLASH_D: // Test for decimal digit
3122 {
3123 if (fp->fInputIdx >= fActiveLimit) {
3124 fHitEnd = TRUE;
3125 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3126 break;
3127 }
3128
3129 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3130
3131 UChar32 c = UTEXT_NEXT32(fInputText);
3132 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
3133 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3134 success ^= (opValue != 0); // flip sense for \D
3135 if (success) {
3136 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3137 } else {
3138 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3139 }
3140 }
3141 break;
3142
3143
3144 case URX_BACKSLASH_G: // Test for position at end of previous match
3145 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3146 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3147 }
3148 break;
3149
3150
3151 case URX_BACKSLASH_X:
3152 // Match a Grapheme, as defined by Unicode TR 29.
3153 // Differs slightly from Perl, which consumes combining marks independently
3154 // of context.
3155 {
3156
3157 // Fail if at end of input
3158 if (fp->fInputIdx >= fActiveLimit) {
3159 fHitEnd = TRUE;
3160 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3161 break;
3162 }
3163
3164 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3165
3166 // Examine (and consume) the current char.
3167 // Dispatch into a little state machine, based on the char.
3168 UChar32 c;
3169 c = UTEXT_NEXT32(fInputText);
3170 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3171 UnicodeSet **sets = fPattern->fStaticSets;
3172 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
3173 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
3174 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3175 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3176 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3177 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3178 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3179 goto GC_Extend;
3180
3181
3182
3183 GC_L:
3184 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3185 c = UTEXT_NEXT32(fInputText);
3186 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3187 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3188 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3189 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3190 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3191 (void)UTEXT_PREVIOUS32(fInputText);
3192 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3193 goto GC_Extend;
3194
3195 GC_V:
3196 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3197 c = UTEXT_NEXT32(fInputText);
3198 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3199 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3200 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3201 (void)UTEXT_PREVIOUS32(fInputText);
3202 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3203 goto GC_Extend;
3204
3205 GC_T:
3206 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3207 c = UTEXT_NEXT32(fInputText);
3208 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3209 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3210 (void)UTEXT_PREVIOUS32(fInputText);
3211 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3212 goto GC_Extend;
3213
3214 GC_Extend:
3215 // Combining characters are consumed here
3216 for (;;) {
3217 if (fp->fInputIdx >= fActiveLimit) {
3218 break;
3219 }
3220 c = UTEXT_CURRENT32(fInputText);
3221 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3222 break;
3223 }
3224 (void)UTEXT_NEXT32(fInputText);
3225 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3226 }
3227 goto GC_Done;
3228
3229 GC_Control:
3230 // Most control chars stand alone (don't combine with combining chars),
3231 // except for that CR/LF sequence is a single grapheme cluster.
3232 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
3233 c = UTEXT_NEXT32(fInputText);
3234 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3235 }
3236
3237 GC_Done:
3238 if (fp->fInputIdx >= fActiveLimit) {
3239 fHitEnd = TRUE;
3240 }
3241 break;
3242 }
3243
3244
3245
3246
3247 case URX_BACKSLASH_Z: // Test for end of Input
3248 if (fp->fInputIdx < fAnchorLimit) {
3249 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3250 } else {
3251 fHitEnd = TRUE;
3252 fRequireEnd = TRUE;
3253 }
3254 break;
3255
3256
3257
3258 case URX_STATIC_SETREF:
3259 {
3260 // Test input character against one of the predefined sets
3261 // (Word Characters, for example)
3262 // The high bit of the op value is a flag for the match polarity.
3263 // 0: success if input char is in set.
3264 // 1: success if input char is not in set.
3265 if (fp->fInputIdx >= fActiveLimit) {
3266 fHitEnd = TRUE;
3267 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3268 break;
3269 }
3270
3271 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3272 opValue &= ~URX_NEG_SET;
3273 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3274
3275 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3276 UChar32 c = UTEXT_NEXT32(fInputText);
3277 if (c < 256) {
3278 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3279 if (s8->contains(c)) {
3280 success = !success;
3281 }
3282 } else {
3283 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3284 if (s->contains(c)) {
3285 success = !success;
3286 }
3287 }
3288 if (success) {
3289 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3290 } else {
3291 // the character wasn't in the set.
3292 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3293 }
3294 }
3295 break;
3296
3297
3298 case URX_STAT_SETREF_N:
3299 {
3300 // Test input character for NOT being a member of one of
3301 // the predefined sets (Word Characters, for example)
3302 if (fp->fInputIdx >= fActiveLimit) {
3303 fHitEnd = TRUE;
3304 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3305 break;
3306 }
3307
3308 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3309
3310 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3311
3312 UChar32 c = UTEXT_NEXT32(fInputText);
3313 if (c < 256) {
3314 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3315 if (s8->contains(c) == FALSE) {
3316 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3317 break;
3318 }
3319 } else {
3320 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3321 if (s->contains(c) == FALSE) {
3322 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3323 break;
3324 }
3325 }
3326 // the character wasn't in the set.
3327 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3328 }
3329 break;
3330
3331
3332 case URX_SETREF:
3333 if (fp->fInputIdx >= fActiveLimit) {
3334 fHitEnd = TRUE;
3335 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3336 break;
3337 } else {
3338 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3339
3340 // There is input left. Pick up one char and test it for set membership.
3341 UChar32 c = UTEXT_NEXT32(fInputText);
3342 U_ASSERT(opValue > 0 && opValue < sets->size());
3343 if (c<256) {
3344 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3345 if (s8->contains(c)) {
3346 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3347 break;
3348 }
3349 } else {
3350 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3351 if (s->contains(c)) {
3352 // The character is in the set. A Match.
3353 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3354 break;
3355 }
3356 }
3357
3358 // the character wasn't in the set.
3359 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3360 }
3361 break;
3362
3363
3364 case URX_DOTANY:
3365 {
3366 // . matches anything, but stops at end-of-line.
3367 if (fp->fInputIdx >= fActiveLimit) {
3368 // At end of input. Match failed. Backtrack out.
3369 fHitEnd = TRUE;
3370 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3371 break;
3372 }
3373
3374 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3375
3376 // There is input left. Advance over one char, unless we've hit end-of-line
3377 UChar32 c = UTEXT_NEXT32(fInputText);
3378 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
3379 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3380 // End of line in normal mode. . does not match.
3381 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3382 break;
3383 }
3384 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3385 }
3386 break;
3387
3388
3389 case URX_DOTANY_ALL:
3390 {
3391 // ., in dot-matches-all (including new lines) mode
3392 if (fp->fInputIdx >= fActiveLimit) {
3393 // At end of input. Match failed. Backtrack out.
3394 fHitEnd = TRUE;
3395 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3396 break;
3397 }
3398
3399 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3400
3401 // There is input left. Advance over one char, except if we are
3402 // at a cr/lf, advance over both of them.
3403 UChar32 c;
3404 c = UTEXT_NEXT32(fInputText);
3405 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3406 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3407 // In the case of a CR/LF, we need to advance over both.
3408 UChar32 nextc = UTEXT_CURRENT32(fInputText);
3409 if (nextc == 0x0a) {
3410 (void)UTEXT_NEXT32(fInputText);
3411 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3412 }
3413 }
3414 }
3415 break;
3416
3417
3418 case URX_DOTANY_UNIX:
3419 {
3420 // '.' operator, matches all, but stops at end-of-line.
3421 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
3422 if (fp->fInputIdx >= fActiveLimit) {
3423 // At end of input. Match failed. Backtrack out.
3424 fHitEnd = TRUE;
3425 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3426 break;
3427 }
3428
3429 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3430
3431 // There is input left. Advance over one char, unless we've hit end-of-line
3432 UChar32 c = UTEXT_NEXT32(fInputText);
3433 if (c == 0x0a) {
3434 // End of line in normal mode. '.' does not match the \n
3435 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3436 } else {
3437 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3438 }
3439 }
3440 break;
3441
3442
3443 case URX_JMP:
3444 fp->fPatIdx = opValue;
3445 break;
3446
3447 case URX_FAIL:
3448 isMatch = FALSE;
3449 goto breakFromLoop;
3450
3451 case URX_JMP_SAV:
3452 U_ASSERT(opValue < fPattern->fCompiledPat->size());
3453 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3454 fp->fPatIdx = opValue; // Then JMP.
3455 break;
3456
3457 case URX_JMP_SAV_X:
3458 // This opcode is used with (x)+, when x can match a zero length string.
3459 // Same as JMP_SAV, except conditional on the match having made forward progress.
3460 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3461 // data address of the input position at the start of the loop.
3462 {
3463 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3464 int32_t stoOp = (int32_t)pat[opValue-1];
3465 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3466 int32_t frameLoc = URX_VAL(stoOp);
3467 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3468 int64_t prevInputIdx = fp->fExtra[frameLoc];
3469 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3470 if (prevInputIdx < fp->fInputIdx) {
3471 // The match did make progress. Repeat the loop.
3472 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3473 fp->fPatIdx = opValue;
3474 fp->fExtra[frameLoc] = fp->fInputIdx;
3475 }
3476 // If the input position did not advance, we do nothing here,
3477 // execution will fall out of the loop.
3478 }
3479 break;
3480
3481 case URX_CTR_INIT:
3482 {
3483 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3484 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3485
3486 // Pick up the three extra operands that CTR_INIT has, and
3487 // skip the pattern location counter past
3488 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3489 fp->fPatIdx += 3;
3490 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3491 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3492 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3493 U_ASSERT(minCount>=0);
3494 U_ASSERT(maxCount>=minCount || maxCount==-1);
3495 U_ASSERT(loopLoc>fp->fPatIdx);
3496
3497 if (minCount == 0) {
3498 fp = StateSave(fp, loopLoc+1, status);
3499 }
3500 if (maxCount == 0) {
3501 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3502 }
3503 }
3504 break;
3505
3506 case URX_CTR_LOOP:
3507 {
3508 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3509 int32_t initOp = (int32_t)pat[opValue];
3510 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3511 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3512 int32_t minCount = (int32_t)pat[opValue+2];
3513 int32_t maxCount = (int32_t)pat[opValue+3];
3514 // Increment the counter. Note: we DIDN'T worry about counter
3515 // overflow, since the data comes from UnicodeStrings, which
3516 // stores its length in an int32_t. Do we have to think about
3517 // this now that we're using UText? Probably not, since the length
3518 // in UChar32s is still an int32_t.
3519 (*pCounter)++;
3520 U_ASSERT(*pCounter > 0);
3521 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
3522 U_ASSERT(*pCounter == maxCount || maxCount == -1);
3523 break;
3524 }
3525 if (*pCounter >= minCount) {
3526 fp = StateSave(fp, fp->fPatIdx, status);
3527 }
3528 fp->fPatIdx = opValue + 4; // Loop back.
3529 }
3530 break;
3531
3532 case URX_CTR_INIT_NG:
3533 {
3534 // Initialize a non-greedy loop
3535 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3536 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3537
3538 // Pick up the three extra operands that CTR_INIT has, and
3539 // skip the pattern location counter past
3540 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3541 fp->fPatIdx += 3;
3542 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3543 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3544 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3545 U_ASSERT(minCount>=0);
3546 U_ASSERT(maxCount>=minCount || maxCount==-1);
3547 U_ASSERT(loopLoc>fp->fPatIdx);
3548
3549 if (minCount == 0) {
3550 if (maxCount != 0) {
3551 fp = StateSave(fp, fp->fPatIdx, status);
3552 }
3553 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
3554 }
3555 }
3556 break;
3557
3558 case URX_CTR_LOOP_NG:
3559 {
3560 // Non-greedy {min, max} loops
3561 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3562 int32_t initOp = (int32_t)pat[opValue];
3563 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3564 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3565 int32_t minCount = (int32_t)pat[opValue+2];
3566 int32_t maxCount = (int32_t)pat[opValue+3];
3567 // Increment the counter. Note: we DIDN'T worry about counter
3568 // overflow, since the data comes from UnicodeStrings, which
3569 // stores its length in an int32_t. Do we have to think about
3570 // this now that we're using UText? Probably not, since the length
3571 // in UChar32s is still an int32_t.
3572 (*pCounter)++;
3573 U_ASSERT(*pCounter > 0);
3574
3575 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
3576 // The loop has matched the maximum permitted number of times.
3577 // Break out of here with no action. Matching will
3578 // continue with the following pattern.
3579 U_ASSERT(*pCounter == maxCount || maxCount == -1);
3580 break;
3581 }
3582
3583 if (*pCounter < minCount) {
3584 // We haven't met the minimum number of matches yet.
3585 // Loop back for another one.
3586 fp->fPatIdx = opValue + 4; // Loop back.
3587 } else {
3588 // We do have the minimum number of matches.
3589 // Fall into the following pattern, but first do
3590 // a state save to the top of the loop, so that a failure
3591 // in the following pattern will try another iteration of the loop.
3592 fp = StateSave(fp, opValue + 4, status);
3593 }
3594 }
3595 break;
3596
3597 case URX_STO_SP:
3598 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3599 fData[opValue] = fStack->size();
3600 break;
3601
3602 case URX_LD_SP:
3603 {
3604 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3605 int32_t newStackSize = (int32_t)fData[opValue];
3606 U_ASSERT(newStackSize <= fStack->size());
3607 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3608 if (newFP == (int64_t *)fp) {
3609 break;
3610 }
3611 int32_t i;
3612 for (i=0; i<fFrameSize; i++) {
3613 newFP[i] = ((int64_t *)fp)[i];
3614 }
3615 fp = (REStackFrame *)newFP;
3616 fStack->setSize(newStackSize);
3617 }
3618 break;
3619
3620 case URX_BACKREF:
3621 {
3622 U_ASSERT(opValue < fFrameSize);
3623 int64_t groupStartIdx = fp->fExtra[opValue];
3624 int64_t groupEndIdx = fp->fExtra[opValue+1];
3625 U_ASSERT(groupStartIdx <= groupEndIdx);
3626 if (groupStartIdx < 0) {
3627 // This capture group has not participated in the match thus far,
3628 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3629 break;
3630 }
3631 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3632 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3633
3634 // Note: if the capture group match was of an empty string the backref
3635 // match succeeds. Verified by testing: Perl matches succeed
3636 // in this case, so we do too.
3637
3638 UBool success = TRUE;
3639 for (;;) {
3640 if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3641 success = TRUE;
3642 break;
3643 }
3644 if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
3645 success = FALSE;
3646 fHitEnd = TRUE;
3647 break;
3648 }
3649 UChar32 captureGroupChar = utext_next32(fAltInputText);
3650 UChar32 inputChar = utext_next32(fInputText);
3651 if (inputChar != captureGroupChar) {
3652 success = FALSE;
3653 break;
3654 }
3655 }
3656
3657 if (success) {
3658 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3659 } else {
3660 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3661 }
3662 }
3663 break;
3664
3665
3666
3667 case URX_BACKREF_I:
3668 {
3669 U_ASSERT(opValue < fFrameSize);
3670 int64_t groupStartIdx = fp->fExtra[opValue];
3671 int64_t groupEndIdx = fp->fExtra[opValue+1];
3672 U_ASSERT(groupStartIdx <= groupEndIdx);
3673 if (groupStartIdx < 0) {
3674 // This capture group has not participated in the match thus far,
3675 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3676 break;
3677 }
3678 utext_setNativeIndex(fAltInputText, groupStartIdx);
3679 utext_setNativeIndex(fInputText, fp->fInputIdx);
3680 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3681 CaseFoldingUTextIterator inputItr(*fInputText);
3682
3683 // Note: if the capture group match was of an empty string the backref
3684 // match succeeds. Verified by testing: Perl matches succeed
3685 // in this case, so we do too.
3686
3687 UBool success = TRUE;
3688 for (;;) {
3689 if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3690 success = TRUE;
3691 break;
3692 }
3693 if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
3694 success = FALSE;
3695 fHitEnd = TRUE;
3696 break;
3697 }
3698 UChar32 captureGroupChar = captureGroupItr.next();
3699 UChar32 inputChar = inputItr.next();
3700 if (inputChar != captureGroupChar) {
3701 success = FALSE;
3702 break;
3703 }
3704 }
3705
3706 if (success && inputItr.inExpansion()) {
3707 // We otained a match by consuming part of a string obtained from
3708 // case-folding a single code point of the input text.
3709 // This does not count as an overall match.
3710 success = FALSE;
3711 }
3712
3713 if (success) {
3714 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3715 } else {
3716 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3717 }
3718
3719 }
3720 break;
3721
3722 case URX_STO_INP_LOC:
3723 {
3724 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3725 fp->fExtra[opValue] = fp->fInputIdx;
3726 }
3727 break;
3728
3729 case URX_JMPX:
3730 {
3731 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3732 fp->fPatIdx += 1;
3733 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
3734 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3735 int64_t savedInputIdx = fp->fExtra[dataLoc];
3736 U_ASSERT(savedInputIdx <= fp->fInputIdx);
3737 if (savedInputIdx < fp->fInputIdx) {
3738 fp->fPatIdx = opValue; // JMP
3739 } else {
3740 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
3741 }
3742 }
3743 break;
3744
3745 case URX_LA_START:
3746 {
3747 // Entering a lookahead block.
3748 // Save Stack Ptr, Input Pos.
3749 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3750 fData[opValue] = fStack->size();
3751 fData[opValue+1] = fp->fInputIdx;
3752 fActiveStart = fLookStart; // Set the match region change for
3753 fActiveLimit = fLookLimit; // transparent bounds.
3754 }
3755 break;
3756
3757 case URX_LA_END:
3758 {
3759 // Leaving a look-ahead block.
3760 // restore Stack Ptr, Input Pos to positions they had on entry to block.
3761 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3762 int32_t stackSize = fStack->size();
3763 int32_t newStackSize =(int32_t)fData[opValue];
3764 U_ASSERT(stackSize >= newStackSize);
3765 if (stackSize > newStackSize) {
3766 // Copy the current top frame back to the new (cut back) top frame.
3767 // This makes the capture groups from within the look-ahead
3768 // expression available.
3769 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3770 int32_t i;
3771 for (i=0; i<fFrameSize; i++) {
3772 newFP[i] = ((int64_t *)fp)[i];
3773 }
3774 fp = (REStackFrame *)newFP;
3775 fStack->setSize(newStackSize);
3776 }
3777 fp->fInputIdx = fData[opValue+1];
3778
3779 // Restore the active region bounds in the input string; they may have
3780 // been changed because of transparent bounds on a Region.
3781 fActiveStart = fRegionStart;
3782 fActiveLimit = fRegionLimit;
3783 }
3784 break;
3785
3786 case URX_ONECHAR_I:
3787 // Case insensitive one char. The char from the pattern is already case folded.
3788 // Input text is not, but case folding the input can not reduce two or more code
3789 // points to one.
3790 if (fp->fInputIdx < fActiveLimit) {
3791 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3792
3793 UChar32 c = UTEXT_NEXT32(fInputText);
3794 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3795 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3796 break;
3797 }
3798 } else {
3799 fHitEnd = TRUE;
3800 }
3801
3802 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3803 break;
3804
3805 case URX_STRING_I:
3806 {
3807 // Case-insensitive test input against a literal string.
3808 // Strings require two slots in the compiled pattern, one for the
3809 // offset to the string text, and one for the length.
3810 // The compiled string has already been case folded.
3811 {
3812 const UChar *patternString = litText + opValue;
3813 int32_t patternStringIdx = 0;
3814
3815 op = (int32_t)pat[fp->fPatIdx];
3816 fp->fPatIdx++;
3817 opType = URX_TYPE(op);
3818 opValue = URX_VAL(op);
3819 U_ASSERT(opType == URX_STRING_LEN);
3820 int32_t patternStringLen = opValue; // Length of the string from the pattern.
3821
3822
3823 UChar32 cPattern;
3824 UChar32 cText;
3825 UBool success = TRUE;
3826
3827 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3828 CaseFoldingUTextIterator inputIterator(*fInputText);
3829 while (patternStringIdx < patternStringLen) {
3830 if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
3831 success = FALSE;
3832 fHitEnd = TRUE;
3833 break;
3834 }
3835 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
3836 cText = inputIterator.next();
3837 if (cText != cPattern) {
3838 success = FALSE;
3839 break;
3840 }
3841 }
3842 if (inputIterator.inExpansion()) {
3843 success = FALSE;
3844 }
3845
3846 if (success) {
3847 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3848 } else {
3849 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3850 }
3851 }
3852 }
3853 break;
3854
3855 case URX_LB_START:
3856 {
3857 // Entering a look-behind block.
3858 // Save Stack Ptr, Input Pos.
3859 // TODO: implement transparent bounds. Ticket #6067
3860 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3861 fData[opValue] = fStack->size();
3862 fData[opValue+1] = fp->fInputIdx;
3863 // Init the variable containing the start index for attempted matches.
3864 fData[opValue+2] = -1;
3865 // Save input string length, then reset to pin any matches to end at
3866 // the current position.
3867 fData[opValue+3] = fActiveLimit;
3868 fActiveLimit = fp->fInputIdx;
3869 }
3870 break;
3871
3872
3873 case URX_LB_CONT:
3874 {
3875 // Positive Look-Behind, at top of loop checking for matches of LB expression
3876 // at all possible input starting positions.
3877
3878 // Fetch the min and max possible match lengths. They are the operands
3879 // of this op in the pattern.
3880 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3881 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3882 U_ASSERT(minML <= maxML);
3883 U_ASSERT(minML >= 0);
3884
3885 // Fetch (from data) the last input index where a match was attempted.
3886 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3887 int64_t *lbStartIdx = &fData[opValue+2];
3888 if (*lbStartIdx < 0) {
3889 // First time through loop.
3890 *lbStartIdx = fp->fInputIdx - minML;
3891 } else {
3892 // 2nd through nth time through the loop.
3893 // Back up start position for match by one.
3894 if (*lbStartIdx == 0) {
3895 (*lbStartIdx)--;
3896 } else {
3897 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
3898 (void)UTEXT_PREVIOUS32(fInputText);
3899 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3900 }
3901 }
3902
3903 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
3904 // We have tried all potential match starting points without
3905 // getting a match. Backtrack out, and out of the
3906 // Look Behind altogether.
3907 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3908 int64_t restoreInputLen = fData[opValue+3];
3909 U_ASSERT(restoreInputLen >= fActiveLimit);
3910 U_ASSERT(restoreInputLen <= fInputLength);
3911 fActiveLimit = restoreInputLen;
3912 break;
3913 }
3914
3915 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3916 // (successful match will fall off the end of the loop.)
3917 fp = StateSave(fp, fp->fPatIdx-3, status);
3918 fp->fInputIdx = *lbStartIdx;
3919 }
3920 break;
3921
3922 case URX_LB_END:
3923 // End of a look-behind block, after a successful match.
3924 {
3925 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3926 if (fp->fInputIdx != fActiveLimit) {
3927 // The look-behind expression matched, but the match did not
3928 // extend all the way to the point that we are looking behind from.
3929 // FAIL out of here, which will take us back to the LB_CONT, which
3930 // will retry the match starting at another position or fail
3931 // the look-behind altogether, whichever is appropriate.
3932 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3933 break;
3934 }
3935
3936 // Look-behind match is good. Restore the orignal input string length,
3937 // which had been truncated to pin the end of the lookbehind match to the
3938 // position being looked-behind.
3939 int64_t originalInputLen = fData[opValue+3];
3940 U_ASSERT(originalInputLen >= fActiveLimit);
3941 U_ASSERT(originalInputLen <= fInputLength);
3942 fActiveLimit = originalInputLen;
3943 }
3944 break;
3945
3946
3947 case URX_LBN_CONT:
3948 {
3949 // Negative Look-Behind, at top of loop checking for matches of LB expression
3950 // at all possible input starting positions.
3951
3952 // Fetch the extra parameters of this op.
3953 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3954 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3955 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
3956 continueLoc = URX_VAL(continueLoc);
3957 U_ASSERT(minML <= maxML);
3958 U_ASSERT(minML >= 0);
3959 U_ASSERT(continueLoc > fp->fPatIdx);
3960
3961 // Fetch (from data) the last input index where a match was attempted.
3962 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3963 int64_t *lbStartIdx = &fData[opValue+2];
3964 if (*lbStartIdx < 0) {
3965 // First time through loop.
3966 *lbStartIdx = fp->fInputIdx - minML;
3967 } else {
3968 // 2nd through nth time through the loop.
3969 // Back up start position for match by one.
3970 if (*lbStartIdx == 0) {
3971 (*lbStartIdx)--;
3972 } else {
3973 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
3974 (void)UTEXT_PREVIOUS32(fInputText);
3975 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3976 }
3977 }
3978
3979 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
3980 // We have tried all potential match starting points without
3981 // getting a match, which means that the negative lookbehind as
3982 // a whole has succeeded. Jump forward to the continue location
3983 int64_t restoreInputLen = fData[opValue+3];
3984 U_ASSERT(restoreInputLen >= fActiveLimit);
3985 U_ASSERT(restoreInputLen <= fInputLength);
3986 fActiveLimit = restoreInputLen;
3987 fp->fPatIdx = continueLoc;
3988 break;
3989 }
3990
3991 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3992 // (successful match will cause a FAIL out of the loop altogether.)
3993 fp = StateSave(fp, fp->fPatIdx-4, status);
3994 fp->fInputIdx = *lbStartIdx;
3995 }
3996 break;
3997
3998 case URX_LBN_END:
3999 // End of a negative look-behind block, after a successful match.
4000 {
4001 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4002 if (fp->fInputIdx != fActiveLimit) {
4003 // The look-behind expression matched, but the match did not
4004 // extend all the way to the point that we are looking behind from.
4005 // FAIL out of here, which will take us back to the LB_CONT, which
4006 // will retry the match starting at another position or succeed
4007 // the look-behind altogether, whichever is appropriate.
4008 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4009 break;
4010 }
4011
4012 // Look-behind expression matched, which means look-behind test as
4013 // a whole Fails
4014
4015 // Restore the orignal input string length, which had been truncated
4016 // inorder to pin the end of the lookbehind match
4017 // to the position being looked-behind.
4018 int64_t originalInputLen = fData[opValue+3];
4019 U_ASSERT(originalInputLen >= fActiveLimit);
4020 U_ASSERT(originalInputLen <= fInputLength);
4021 fActiveLimit = originalInputLen;
4022
4023 // Restore original stack position, discarding any state saved
4024 // by the successful pattern match.
4025 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4026 int32_t newStackSize = (int32_t)fData[opValue];
4027 U_ASSERT(fStack->size() > newStackSize);
4028 fStack->setSize(newStackSize);
4029
4030 // FAIL, which will take control back to someplace
4031 // prior to entering the look-behind test.
4032 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4033 }
4034 break;
4035
4036
4037 case URX_LOOP_SR_I:
4038 // Loop Initialization for the optimized implementation of
4039 // [some character set]*
4040 // This op scans through all matching input.
4041 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4042 {
4043 U_ASSERT(opValue > 0 && opValue < sets->size());
4044 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4045 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4046
4047 // Loop through input, until either the input is exhausted or
4048 // we reach a character that is not a member of the set.
4049 int64_t ix = fp->fInputIdx;
4050 UTEXT_SETNATIVEINDEX(fInputText, ix);
4051 for (;;) {
4052 if (ix >= fActiveLimit) {
4053 fHitEnd = TRUE;
4054 break;
4055 }
4056 UChar32 c = UTEXT_NEXT32(fInputText);
4057 if (c<256) {
4058 if (s8->contains(c) == FALSE) {
4059 break;
4060 }
4061 } else {
4062 if (s->contains(c) == FALSE) {
4063 break;
4064 }
4065 }
4066 ix = UTEXT_GETNATIVEINDEX(fInputText);
4067 }
4068
4069 // If there were no matching characters, skip over the loop altogether.
4070 // The loop doesn't run at all, a * op always succeeds.
4071 if (ix == fp->fInputIdx) {
4072 fp->fPatIdx++; // skip the URX_LOOP_C op.
4073 break;
4074 }
4075
4076 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4077 // must follow. It's operand is the stack location
4078 // that holds the starting input index for the match of this [set]*
4079 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4080 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4081 int32_t stackLoc = URX_VAL(loopcOp);
4082 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4083 fp->fExtra[stackLoc] = fp->fInputIdx;
4084 fp->fInputIdx = ix;
4085
4086 // Save State to the URX_LOOP_C op that follows this one,
4087 // so that match failures in the following code will return to there.
4088 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4089 fp = StateSave(fp, fp->fPatIdx, status);
4090 fp->fPatIdx++;
4091 }
4092 break;
4093
4094
4095 case URX_LOOP_DOT_I:
4096 // Loop Initialization for the optimized implementation of .*
4097 // This op scans through all remaining input.
4098 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4099 {
4100 // Loop through input until the input is exhausted (we reach an end-of-line)
4101 // In DOTALL mode, we can just go straight to the end of the input.
4102 int64_t ix;
4103 if ((opValue & 1) == 1) {
4104 // Dot-matches-All mode. Jump straight to the end of the string.
4105 ix = fActiveLimit;
4106 fHitEnd = TRUE;
4107 } else {
4108 // NOT DOT ALL mode. Line endings do not match '.'
4109 // Scan forward until a line ending or end of input.
4110 ix = fp->fInputIdx;
4111 UTEXT_SETNATIVEINDEX(fInputText, ix);
4112 for (;;) {
4113 if (ix >= fActiveLimit) {
4114 fHitEnd = TRUE;
4115 break;
4116 }
4117 UChar32 c = UTEXT_NEXT32(fInputText);
4118 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
4119 if ((c == 0x0a) || // 0x0a is newline in both modes.
4120 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
4121 (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) {
4122 // char is a line ending. Exit the scanning loop.
4123 break;
4124 }
4125 }
4126 ix = UTEXT_GETNATIVEINDEX(fInputText);
4127 }
4128 }
4129
4130 // If there were no matching characters, skip over the loop altogether.
4131 // The loop doesn't run at all, a * op always succeeds.
4132 if (ix == fp->fInputIdx) {
4133 fp->fPatIdx++; // skip the URX_LOOP_C op.
4134 break;
4135 }
4136
4137 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4138 // must follow. It's operand is the stack location
4139 // that holds the starting input index for the match of this .*
4140 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4141 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4142 int32_t stackLoc = URX_VAL(loopcOp);
4143 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4144 fp->fExtra[stackLoc] = fp->fInputIdx;
4145 fp->fInputIdx = ix;
4146
4147 // Save State to the URX_LOOP_C op that follows this one,
4148 // so that match failures in the following code will return to there.
4149 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4150 fp = StateSave(fp, fp->fPatIdx, status);
4151 fp->fPatIdx++;
4152 }
4153 break;
4154
4155
4156 case URX_LOOP_C:
4157 {
4158 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4159 backSearchIndex = fp->fExtra[opValue];
4160 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4161 if (backSearchIndex == fp->fInputIdx) {
4162 // We've backed up the input idx to the point that the loop started.
4163 // The loop is done. Leave here without saving state.
4164 // Subsequent failures won't come back here.
4165 break;
4166 }
4167 // Set up for the next iteration of the loop, with input index
4168 // backed up by one from the last time through,
4169 // and a state save to this instruction in case the following code fails again.
4170 // (We're going backwards because this loop emulates stack unwinding, not
4171 // the initial scan forward.)
4172 U_ASSERT(fp->fInputIdx > 0);
4173 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4174 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4175 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4176
4177 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4178 if (prevC == 0x0a &&
4179 fp->fInputIdx > backSearchIndex &&
4180 twoPrevC == 0x0d) {
4181 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4182 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4183 // .*, stepping back over CRLF pair.
4184 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4185 }
4186 }
4187
4188
4189 fp = StateSave(fp, fp->fPatIdx-1, status);
4190 }
4191 break;
4192
4193
4194
4195 default:
4196 // Trouble. The compiled pattern contains an entry with an
4197 // unrecognized type tag.
4198 U_ASSERT(FALSE);
4199 }
4200
4201 if (U_FAILURE(status)) {
4202 isMatch = FALSE;
4203 break;
4204 }
4205 }
4206
4207 breakFromLoop:
4208 fMatch = isMatch;
4209 if (isMatch) {
4210 fLastMatchEnd = fMatchEnd;
4211 fMatchStart = startIdx;
4212 fMatchEnd = fp->fInputIdx;
4213 if (fTraceDebug) {
4214 REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd));
4215 }
4216 }
4217 else
4218 {
4219 if (fTraceDebug) {
4220 REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
4221 }
4222 }
4223
4224 fFrame = fp; // The active stack frame when the engine stopped.
4225 // Contains the capture group results that we need to
4226 // access later.
4227 return;
4228 }
4229
4230
4231 //--------------------------------------------------------------------------------
4232 //
4233 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
4234 // assumption that the entire string is available in the UText's
4235 // chunk buffer. For now, that means we can use int32_t indexes,
4236 // except for anything that needs to be saved (like group starts
4237 // and ends).
4238 //
4239 // startIdx: begin matching a this index.
4240 // toEnd: if true, match must extend to end of the input region
4241 //
4242 //--------------------------------------------------------------------------------
4243 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4244 UBool isMatch = FALSE; // True if the we have a match.
4245
4246 int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4247
4248 int32_t op; // Operation from the compiled pattern, split into
4249 int32_t opType; // the opcode
4250 int32_t opValue; // and the operand value.
4251
4252 #ifdef REGEX_RUN_DEBUG
4253 if (fTraceDebug)
4254 {
4255 printf("MatchAt(startIdx=%ld)\n", startIdx);
4256 printf("Original Pattern: ");
4257 UChar32 c = utext_next32From(fPattern->fPattern, 0);
4258 while (c != U_SENTINEL) {
4259 if (c<32 || c>256) {
4260 c = '.';
4261 }
4262 REGEX_DUMP_DEBUG_PRINTF(("%c", c));
4263
4264 c = UTEXT_NEXT32(fPattern->fPattern);
4265 }
4266 printf("\n");
4267 printf("Input String: ");
4268 c = utext_next32From(fInputText, 0);
4269 while (c != U_SENTINEL) {
4270 if (c<32 || c>256) {
4271 c = '.';
4272 }
4273 printf("%c", c);
4274
4275 c = UTEXT_NEXT32(fInputText);
4276 }
4277 printf("\n");
4278 printf("\n");
4279 }
4280 #endif
4281
4282 if (U_FAILURE(status)) {
4283 return;
4284 }
4285
4286 // Cache frequently referenced items from the compiled pattern
4287 //
4288 int64_t *pat = fPattern->fCompiledPat->getBuffer();
4289
4290 const UChar *litText = fPattern->fLiteralText.getBuffer();
4291 UVector *sets = fPattern->fSets;
4292
4293 const UChar *inputBuf = fInputText->chunkContents;
4294
4295 fFrameSize = fPattern->fFrameSize;
4296 REStackFrame *fp = resetStack();
4297
4298 fp->fPatIdx = 0;
4299 fp->fInputIdx = startIdx;
4300
4301 // Zero out the pattern's static data
4302 int32_t i;
4303 for (i = 0; i<fPattern->fDataSize; i++) {
4304 fData[i] = 0;
4305 }
4306
4307 //
4308 // Main loop for interpreting the compiled pattern.
4309 // One iteration of the loop per pattern operation performed.
4310 //
4311 for (;;) {
4312 #if 0
4313 if (_heapchk() != _HEAPOK) {
4314 fprintf(stderr, "Heap Trouble\n");
4315 }
4316 #endif
4317
4318 op = (int32_t)pat[fp->fPatIdx];
4319 opType = URX_TYPE(op);
4320 opValue = URX_VAL(op);
4321 #ifdef REGEX_RUN_DEBUG
4322 if (fTraceDebug) {
4323 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4324 printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx,
4325 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4326 fPattern->dumpOp(fp->fPatIdx);
4327 }
4328 #endif
4329 fp->fPatIdx++;
4330
4331 switch (opType) {
4332
4333
4334 case URX_NOP:
4335 break;
4336
4337
4338 case URX_BACKTRACK:
4339 // Force a backtrack. In some circumstances, the pattern compiler
4340 // will notice that the pattern can't possibly match anything, and will
4341 // emit one of these at that point.
4342 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4343 break;
4344
4345
4346 case URX_ONECHAR:
4347 if (fp->fInputIdx < fActiveLimit) {
4348 UChar32 c;
4349 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4350 if (c == opValue) {
4351 break;
4352 }
4353 } else {
4354 fHitEnd = TRUE;
4355 }
4356 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4357 break;
4358
4359
4360 case URX_STRING:
4361 {
4362 // Test input against a literal string.
4363 // Strings require two slots in the compiled pattern, one for the
4364 // offset to the string text, and one for the length.
4365 int32_t stringStartIdx = opValue;
4366 int32_t stringLen;
4367
4368 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
4369 fp->fPatIdx++;
4370 opType = URX_TYPE(op);
4371 stringLen = URX_VAL(op);
4372 U_ASSERT(opType == URX_STRING_LEN);
4373 U_ASSERT(stringLen >= 2);
4374
4375 const UChar * pInp = inputBuf + fp->fInputIdx;
4376 const UChar * pInpLimit = inputBuf + fActiveLimit;
4377 const UChar * pPat = litText+stringStartIdx;
4378 const UChar * pEnd = pInp + stringLen;
4379 UBool success = TRUE;
4380 while (pInp < pEnd) {
4381 if (pInp >= pInpLimit) {
4382 fHitEnd = TRUE;
4383 success = FALSE;
4384 break;
4385 }
4386 if (*pInp++ != *pPat++) {
4387 success = FALSE;
4388 break;
4389 }
4390 }
4391
4392 if (success) {
4393 fp->fInputIdx += stringLen;
4394 } else {
4395 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4396 }
4397 }
4398 break;
4399
4400
4401 case URX_STATE_SAVE:
4402 fp = StateSave(fp, opValue, status);
4403 break;
4404
4405
4406 case URX_END:
4407 // The match loop will exit via this path on a successful match,
4408 // when we reach the end of the pattern.
4409 if (toEnd && fp->fInputIdx != fActiveLimit) {
4410 // The pattern matched, but not to the end of input. Try some more.
4411 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4412 break;
4413 }
4414 isMatch = TRUE;
4415 goto breakFromLoop;
4416
4417 // Start and End Capture stack frame variables are laid out out like this:
4418 // fp->fExtra[opValue] - The start of a completed capture group
4419 // opValue+1 - The end of a completed capture group
4420 // opValue+2 - the start of a capture group whose end
4421 // has not yet been reached (and might not ever be).
4422 case URX_START_CAPTURE:
4423 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4424 fp->fExtra[opValue+2] = fp->fInputIdx;
4425 break;
4426
4427
4428 case URX_END_CAPTURE:
4429 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4430 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
4431 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
4432 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
4433 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4434 break;
4435
4436
4437 case URX_DOLLAR: // $, test for End of line
4438 // or for position before new line at end of input
4439 if (fp->fInputIdx < fAnchorLimit-2) {
4440 // We are no where near the end of input. Fail.
4441 // This is the common case. Keep it first.
4442 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4443 break;
4444 }
4445 if (fp->fInputIdx >= fAnchorLimit) {
4446 // We really are at the end of input. Success.
4447 fHitEnd = TRUE;
4448 fRequireEnd = TRUE;
4449 break;
4450 }
4451
4452 // If we are positioned just before a new-line that is located at the
4453 // end of input, succeed.
4454 if (fp->fInputIdx == fAnchorLimit-1) {
4455 UChar32 c;
4456 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4457
4458 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
4459 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4460 // At new-line at end of input. Success
4461 fHitEnd = TRUE;
4462 fRequireEnd = TRUE;
4463 break;
4464 }
4465 }
4466 } else if (fp->fInputIdx == fAnchorLimit-2 &&
4467 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4468 fHitEnd = TRUE;
4469 fRequireEnd = TRUE;
4470 break; // At CR/LF at end of input. Success
4471 }
4472
4473 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4474
4475 break;
4476
4477
4478 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
4479 if (fp->fInputIdx >= fAnchorLimit-1) {
4480 // Either at the last character of input, or off the end.
4481 if (fp->fInputIdx == fAnchorLimit-1) {
4482 // At last char of input. Success if it's a new line.
4483 if (inputBuf[fp->fInputIdx] == 0x0a) {
4484 fHitEnd = TRUE;
4485 fRequireEnd = TRUE;
4486 break;
4487 }
4488 } else {
4489 // Off the end of input. Success.
4490 fHitEnd = TRUE;
4491 fRequireEnd = TRUE;
4492 break;
4493 }
4494 }
4495
4496 // Not at end of input. Back-track out.
4497 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4498 break;
4499
4500
4501 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
4502 {
4503 if (fp->fInputIdx >= fAnchorLimit) {
4504 // We really are at the end of input. Success.
4505 fHitEnd = TRUE;
4506 fRequireEnd = TRUE;
4507 break;
4508 }
4509 // If we are positioned just before a new-line, succeed.
4510 // It makes no difference where the new-line is within the input.
4511 UChar32 c = inputBuf[fp->fInputIdx];
4512 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
4513 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4514 // In multi-line mode, hitting a new-line just before the end of input does not
4515 // set the hitEnd or requireEnd flags
4516 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4517 break;
4518 }
4519 }
4520 // not at a new line. Fail.
4521 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4522 }
4523 break;
4524
4525
4526 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
4527 {
4528 if (fp->fInputIdx >= fAnchorLimit) {
4529 // We really are at the end of input. Success.
4530 fHitEnd = TRUE;
4531 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
4532 break; // adding a new-line would not lose the match.
4533 }
4534 // If we are not positioned just before a new-line, the test fails; backtrack out.
4535 // It makes no difference where the new-line is within the input.
4536 if (inputBuf[fp->fInputIdx] != 0x0a) {
4537 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4538 }
4539 }
4540 break;
4541
4542
4543 case URX_CARET: // ^, test for start of line
4544 if (fp->fInputIdx != fAnchorStart) {
4545 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4546 }
4547 break;
4548
4549
4550 case URX_CARET_M: // ^, test for start of line in mulit-line mode
4551 {
4552 if (fp->fInputIdx == fAnchorStart) {
4553 // We are at the start input. Success.
4554 break;
4555 }
4556 // Check whether character just before the current pos is a new-line
4557 // unless we are at the end of input
4558 UChar c = inputBuf[fp->fInputIdx - 1];
4559 if ((fp->fInputIdx < fAnchorLimit) &&
4560 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
4561 // It's a new-line. ^ is true. Success.
4562 // TODO: what should be done with positions between a CR and LF?
4563 break;
4564 }
4565 // Not at the start of a line. Fail.
4566 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4567 }
4568 break;
4569
4570
4571 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
4572 {
4573 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4574 if (fp->fInputIdx <= fAnchorStart) {
4575 // We are at the start input. Success.
4576 break;
4577 }
4578 // Check whether character just before the current pos is a new-line
4579 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4580 UChar c = inputBuf[fp->fInputIdx - 1];
4581 if (c != 0x0a) {
4582 // Not at the start of a line. Back-track out.
4583 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4584 }
4585 }
4586 break;
4587
4588 case URX_BACKSLASH_B: // Test for word boundaries
4589 {
4590 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4591 success ^= (opValue != 0); // flip sense for \B
4592 if (!success) {
4593 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4594 }
4595 }
4596 break;
4597
4598
4599 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
4600 {
4601 UBool success = isUWordBoundary(fp->fInputIdx);
4602 success ^= (opValue != 0); // flip sense for \B
4603 if (!success) {
4604 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4605 }
4606 }
4607 break;
4608
4609
4610 case URX_BACKSLASH_D: // Test for decimal digit
4611 {
4612 if (fp->fInputIdx >= fActiveLimit) {
4613 fHitEnd = TRUE;
4614 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4615 break;
4616 }
4617
4618 UChar32 c;
4619 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4620 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
4621 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4622 success ^= (opValue != 0); // flip sense for \D
4623 if (!success) {
4624 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4625 }
4626 }
4627 break;
4628
4629
4630 case URX_BACKSLASH_G: // Test for position at end of previous match
4631 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
4632 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4633 }
4634 break;
4635
4636
4637 case URX_BACKSLASH_X:
4638 // Match a Grapheme, as defined by Unicode TR 29.
4639 // Differs slightly from Perl, which consumes combining marks independently
4640 // of context.
4641 {
4642
4643 // Fail if at end of input
4644 if (fp->fInputIdx >= fActiveLimit) {
4645 fHitEnd = TRUE;
4646 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4647 break;
4648 }
4649
4650 // Examine (and consume) the current char.
4651 // Dispatch into a little state machine, based on the char.
4652 UChar32 c;
4653 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4654 UnicodeSet **sets = fPattern->fStaticSets;
4655 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
4656 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
4657 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4658 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4659 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4660 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4661 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4662 goto GC_Extend;
4663
4664
4665
4666 GC_L:
4667 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4668 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4669 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4670 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4671 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4672 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4673 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4674 goto GC_Extend;
4675
4676 GC_V:
4677 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4678 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4679 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4680 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4681 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4682 goto GC_Extend;
4683
4684 GC_T:
4685 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4686 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4687 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4688 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4689 goto GC_Extend;
4690
4691 GC_Extend:
4692 // Combining characters are consumed here
4693 for (;;) {
4694 if (fp->fInputIdx >= fActiveLimit) {
4695 break;
4696 }
4697 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4698 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4699 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4700 break;
4701 }
4702 }
4703 goto GC_Done;
4704
4705 GC_Control:
4706 // Most control chars stand alone (don't combine with combining chars),
4707 // except for that CR/LF sequence is a single grapheme cluster.
4708 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
4709 fp->fInputIdx++;
4710 }
4711
4712 GC_Done:
4713 if (fp->fInputIdx >= fActiveLimit) {
4714 fHitEnd = TRUE;
4715 }
4716 break;
4717 }
4718
4719
4720
4721
4722 case URX_BACKSLASH_Z: // Test for end of Input
4723 if (fp->fInputIdx < fAnchorLimit) {
4724 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4725 } else {
4726 fHitEnd = TRUE;
4727 fRequireEnd = TRUE;
4728 }
4729 break;
4730
4731
4732
4733 case URX_STATIC_SETREF:
4734 {
4735 // Test input character against one of the predefined sets
4736 // (Word Characters, for example)
4737 // The high bit of the op value is a flag for the match polarity.
4738 // 0: success if input char is in set.
4739 // 1: success if input char is not in set.
4740 if (fp->fInputIdx >= fActiveLimit) {
4741 fHitEnd = TRUE;
4742 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4743 break;
4744 }
4745
4746 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
4747 opValue &= ~URX_NEG_SET;
4748 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4749
4750 UChar32 c;
4751 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4752 if (c < 256) {
4753 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4754 if (s8->contains(c)) {
4755 success = !success;
4756 }
4757 } else {
4758 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4759 if (s->contains(c)) {
4760 success = !success;
4761 }
4762 }
4763 if (!success) {
4764 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4765 }
4766 }
4767 break;
4768
4769
4770 case URX_STAT_SETREF_N:
4771 {
4772 // Test input character for NOT being a member of one of
4773 // the predefined sets (Word Characters, for example)
4774 if (fp->fInputIdx >= fActiveLimit) {
4775 fHitEnd = TRUE;
4776 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4777 break;
4778 }
4779
4780 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4781
4782 UChar32 c;
4783 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4784 if (c < 256) {
4785 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4786 if (s8->contains(c) == FALSE) {
4787 break;
4788 }
4789 } else {
4790 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4791 if (s->contains(c) == FALSE) {
4792 break;
4793 }
4794 }
4795 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4796 }
4797 break;
4798
4799
4800 case URX_SETREF:
4801 {
4802 if (fp->fInputIdx >= fActiveLimit) {
4803 fHitEnd = TRUE;
4804 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4805 break;
4806 }
4807
4808 U_ASSERT(opValue > 0 && opValue < sets->size());
4809
4810 // There is input left. Pick up one char and test it for set membership.
4811 UChar32 c;
4812 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4813 if (c<256) {
4814 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4815 if (s8->contains(c)) {
4816 // The character is in the set. A Match.
4817 break;
4818 }
4819 } else {
4820 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4821 if (s->contains(c)) {
4822 // The character is in the set. A Match.
4823 break;
4824 }
4825 }
4826
4827 // the character wasn't in the set.
4828 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4829 }
4830 break;
4831
4832
4833 case URX_DOTANY:
4834 {
4835 // . matches anything, but stops at end-of-line.
4836 if (fp->fInputIdx >= fActiveLimit) {
4837 // At end of input. Match failed. Backtrack out.
4838 fHitEnd = TRUE;
4839 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4840 break;
4841 }
4842
4843 // There is input left. Advance over one char, unless we've hit end-of-line
4844 UChar32 c;
4845 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4846 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
4847 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
4848 // End of line in normal mode. . does not match.
4849 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4850 break;
4851 }
4852 }
4853 break;
4854
4855
4856 case URX_DOTANY_ALL:
4857 {
4858 // . in dot-matches-all (including new lines) mode
4859 if (fp->fInputIdx >= fActiveLimit) {
4860 // At end of input. Match failed. Backtrack out.
4861 fHitEnd = TRUE;
4862 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4863 break;
4864 }
4865
4866 // There is input left. Advance over one char, except if we are
4867 // at a cr/lf, advance over both of them.
4868 UChar32 c;
4869 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4870 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
4871 // In the case of a CR/LF, we need to advance over both.
4872 if (inputBuf[fp->fInputIdx] == 0x0a) {
4873 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
4874 }
4875 }
4876 }
4877 break;
4878
4879
4880 case URX_DOTANY_UNIX:
4881 {
4882 // '.' operator, matches all, but stops at end-of-line.
4883 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
4884 if (fp->fInputIdx >= fActiveLimit) {
4885 // At end of input. Match failed. Backtrack out.
4886 fHitEnd = TRUE;
4887 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4888 break;
4889 }
4890
4891 // There is input left. Advance over one char, unless we've hit end-of-line
4892 UChar32 c;
4893 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4894 if (c == 0x0a) {
4895 // End of line in normal mode. '.' does not match the \n
4896 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4897 }
4898 }
4899 break;
4900
4901
4902 case URX_JMP:
4903 fp->fPatIdx = opValue;
4904 break;
4905
4906 case URX_FAIL:
4907 isMatch = FALSE;
4908 goto breakFromLoop;
4909
4910 case URX_JMP_SAV:
4911 U_ASSERT(opValue < fPattern->fCompiledPat->size());
4912 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
4913 fp->fPatIdx = opValue; // Then JMP.
4914 break;
4915
4916 case URX_JMP_SAV_X:
4917 // This opcode is used with (x)+, when x can match a zero length string.
4918 // Same as JMP_SAV, except conditional on the match having made forward progress.
4919 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
4920 // data address of the input position at the start of the loop.
4921 {
4922 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
4923 int32_t stoOp = (int32_t)pat[opValue-1];
4924 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
4925 int32_t frameLoc = URX_VAL(stoOp);
4926 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
4927 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
4928 U_ASSERT(prevInputIdx <= fp->fInputIdx);
4929 if (prevInputIdx < fp->fInputIdx) {
4930 // The match did make progress. Repeat the loop.
4931 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
4932 fp->fPatIdx = opValue;
4933 fp->fExtra[frameLoc] = fp->fInputIdx;
4934 }
4935 // If the input position did not advance, we do nothing here,
4936 // execution will fall out of the loop.
4937 }
4938 break;
4939
4940 case URX_CTR_INIT:
4941 {
4942 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
4943 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
4944
4945 // Pick up the three extra operands that CTR_INIT has, and
4946 // skip the pattern location counter past
4947 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
4948 fp->fPatIdx += 3;
4949 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
4950 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
4951 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
4952 U_ASSERT(minCount>=0);
4953 U_ASSERT(maxCount>=minCount || maxCount==-1);
4954 U_ASSERT(loopLoc>fp->fPatIdx);
4955
4956 if (minCount == 0) {
4957 fp = StateSave(fp, loopLoc+1, status);
4958 }
4959 if (maxCount == 0) {
4960 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4961 }
4962 }
4963 break;
4964
4965 case URX_CTR_LOOP:
4966 {
4967 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
4968 int32_t initOp = (int32_t)pat[opValue];
4969 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
4970 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
4971 int32_t minCount = (int32_t)pat[opValue+2];
4972 int32_t maxCount = (int32_t)pat[opValue+3];
4973 // Increment the counter. Note: we DIDN'T worry about counter
4974 // overflow, since the data comes from UnicodeStrings, which
4975 // stores its length in an int32_t. Do we have to think about
4976 // this now that we're using UText? Probably not, since the length
4977 // in UChar32s is still an int32_t.
4978 (*pCounter)++;
4979 U_ASSERT(*pCounter > 0);
4980 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
4981 U_ASSERT(*pCounter == maxCount || maxCount == -1);
4982 break;
4983 }
4984 if (*pCounter >= minCount) {
4985 fp = StateSave(fp, fp->fPatIdx, status);
4986 }
4987 fp->fPatIdx = opValue + 4; // Loop back.
4988 }
4989 break;
4990
4991 case URX_CTR_INIT_NG:
4992 {
4993 // Initialize a non-greedy loop
4994 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
4995 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
4996
4997 // Pick up the three extra operands that CTR_INIT has, and
4998 // skip the pattern location counter past
4999 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5000 fp->fPatIdx += 3;
5001 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
5002 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5003 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5004 U_ASSERT(minCount>=0);
5005 U_ASSERT(maxCount>=minCount || maxCount==-1);
5006 U_ASSERT(loopLoc>fp->fPatIdx);
5007
5008 if (minCount == 0) {
5009 if (maxCount != 0) {
5010 fp = StateSave(fp, fp->fPatIdx, status);
5011 }
5012 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
5013 }
5014 }
5015 break;
5016
5017 case URX_CTR_LOOP_NG:
5018 {
5019 // Non-greedy {min, max} loops
5020 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5021 int32_t initOp = (int32_t)pat[opValue];
5022 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5023 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5024 int32_t minCount = (int32_t)pat[opValue+2];
5025 int32_t maxCount = (int32_t)pat[opValue+3];
5026 // Increment the counter. Note: we DIDN'T worry about counter
5027 // overflow, since the data comes from UnicodeStrings, which
5028 // stores its length in an int32_t. Do we have to think about
5029 // this now that we're using UText? Probably not, since the length
5030 // in UChar32s is still an int32_t.
5031 (*pCounter)++;
5032 U_ASSERT(*pCounter > 0);
5033
5034 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
5035 // The loop has matched the maximum permitted number of times.
5036 // Break out of here with no action. Matching will
5037 // continue with the following pattern.
5038 U_ASSERT(*pCounter == maxCount || maxCount == -1);
5039 break;
5040 }
5041
5042 if (*pCounter < minCount) {
5043 // We haven't met the minimum number of matches yet.
5044 // Loop back for another one.
5045 fp->fPatIdx = opValue + 4; // Loop back.
5046 } else {
5047 // We do have the minimum number of matches.
5048 // Fall into the following pattern, but first do
5049 // a state save to the top of the loop, so that a failure
5050 // in the following pattern will try another iteration of the loop.
5051 fp = StateSave(fp, opValue + 4, status);
5052 }
5053 }
5054 break;
5055
5056 case URX_STO_SP:
5057 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5058 fData[opValue] = fStack->size();
5059 break;
5060
5061 case URX_LD_SP:
5062 {
5063 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5064 int32_t newStackSize = (int32_t)fData[opValue];
5065 U_ASSERT(newStackSize <= fStack->size());
5066 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5067 if (newFP == (int64_t *)fp) {
5068 break;
5069 }
5070 int32_t i;
5071 for (i=0; i<fFrameSize; i++) {
5072 newFP[i] = ((int64_t *)fp)[i];
5073 }
5074 fp = (REStackFrame *)newFP;
5075 fStack->setSize(newStackSize);
5076 }
5077 break;
5078
5079 case URX_BACKREF:
5080 {
5081 U_ASSERT(opValue < fFrameSize);
5082 int64_t groupStartIdx = fp->fExtra[opValue];
5083 int64_t groupEndIdx = fp->fExtra[opValue+1];
5084 U_ASSERT(groupStartIdx <= groupEndIdx);
5085 int64_t inputIndex = fp->fInputIdx;
5086 if (groupStartIdx < 0) {
5087 // This capture group has not participated in the match thus far,
5088 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5089 break;
5090 }
5091 UBool success = TRUE;
5092 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
5093 if (inputIndex >= fActiveLimit) {
5094 success = FALSE;
5095 fHitEnd = TRUE;
5096 break;
5097 }
5098 if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
5099 success = FALSE;
5100 break;
5101 }
5102 }
5103 if (success) {
5104 fp->fInputIdx = inputIndex;
5105 } else {
5106 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5107 }
5108 }
5109 break;
5110
5111 case URX_BACKREF_I:
5112 {
5113 U_ASSERT(opValue < fFrameSize);
5114 int64_t groupStartIdx = fp->fExtra[opValue];
5115 int64_t groupEndIdx = fp->fExtra[opValue+1];
5116 U_ASSERT(groupStartIdx <= groupEndIdx);
5117 if (groupStartIdx < 0) {
5118 // This capture group has not participated in the match thus far,
5119 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5120 break;
5121 }
5122 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
5123 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
5124
5125 // Note: if the capture group match was of an empty string the backref
5126 // match succeeds. Verified by testing: Perl matches succeed
5127 // in this case, so we do too.
5128
5129 UBool success = TRUE;
5130 for (;;) {
5131 UChar32 captureGroupChar = captureGroupItr.next();
5132 if (captureGroupChar == U_SENTINEL) {
5133 success = TRUE;
5134 break;
5135 }
5136 UChar32 inputChar = inputItr.next();
5137 if (inputChar == U_SENTINEL) {
5138 success = FALSE;
5139 fHitEnd = TRUE;
5140 break;
5141 }
5142 if (inputChar != captureGroupChar) {
5143 success = FALSE;
5144 break;
5145 }
5146 }
5147
5148 if (success && inputItr.inExpansion()) {
5149 // We otained a match by consuming part of a string obtained from
5150 // case-folding a single code point of the input text.
5151 // This does not count as an overall match.
5152 success = FALSE;
5153 }
5154
5155 if (success) {
5156 fp->fInputIdx = inputItr.getIndex();
5157 } else {
5158 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5159 }
5160 }
5161 break;
5162
5163 case URX_STO_INP_LOC:
5164 {
5165 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5166 fp->fExtra[opValue] = fp->fInputIdx;
5167 }
5168 break;
5169
5170 case URX_JMPX:
5171 {
5172 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5173 fp->fPatIdx += 1;
5174 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
5175 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5176 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5177 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5178 if (savedInputIdx < fp->fInputIdx) {
5179 fp->fPatIdx = opValue; // JMP
5180 } else {
5181 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
5182 }
5183 }
5184 break;
5185
5186 case URX_LA_START:
5187 {
5188 // Entering a lookahead block.
5189 // Save Stack Ptr, Input Pos.
5190 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5191 fData[opValue] = fStack->size();
5192 fData[opValue+1] = fp->fInputIdx;
5193 fActiveStart = fLookStart; // Set the match region change for
5194 fActiveLimit = fLookLimit; // transparent bounds.
5195 }
5196 break;
5197
5198 case URX_LA_END:
5199 {
5200 // Leaving a look-ahead block.
5201 // restore Stack Ptr, Input Pos to positions they had on entry to block.
5202 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5203 int32_t stackSize = fStack->size();
5204 int32_t newStackSize = (int32_t)fData[opValue];
5205 U_ASSERT(stackSize >= newStackSize);
5206 if (stackSize > newStackSize) {
5207 // Copy the current top frame back to the new (cut back) top frame.
5208 // This makes the capture groups from within the look-ahead
5209 // expression available.
5210 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5211 int32_t i;
5212 for (i=0; i<fFrameSize; i++) {
5213 newFP[i] = ((int64_t *)fp)[i];
5214 }
5215 fp = (REStackFrame *)newFP;
5216 fStack->setSize(newStackSize);
5217 }
5218 fp->fInputIdx = fData[opValue+1];
5219
5220 // Restore the active region bounds in the input string; they may have
5221 // been changed because of transparent bounds on a Region.
5222 fActiveStart = fRegionStart;
5223 fActiveLimit = fRegionLimit;
5224 }
5225 break;
5226
5227 case URX_ONECHAR_I:
5228 if (fp->fInputIdx < fActiveLimit) {
5229 UChar32 c;
5230 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5231 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5232 break;
5233 }
5234 } else {
5235 fHitEnd = TRUE;
5236 }
5237 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5238 break;
5239
5240 case URX_STRING_I:
5241 // Case-insensitive test input against a literal string.
5242 // Strings require two slots in the compiled pattern, one for the
5243 // offset to the string text, and one for the length.
5244 // The compiled string has already been case folded.
5245 {
5246 const UChar *patternString = litText + opValue;
5247
5248 op = (int32_t)pat[fp->fPatIdx];
5249 fp->fPatIdx++;
5250 opType = URX_TYPE(op);
5251 opValue = URX_VAL(op);
5252 U_ASSERT(opType == URX_STRING_LEN);
5253 int32_t patternStringLen = opValue; // Length of the string from the pattern.
5254
5255 UChar32 cText;
5256 UChar32 cPattern;
5257 UBool success = TRUE;
5258 int32_t patternStringIdx = 0;
5259 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5260 while (patternStringIdx < patternStringLen) {
5261 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
5262 cText = inputIterator.next();
5263 if (cText != cPattern) {
5264 success = FALSE;
5265 if (cText == U_SENTINEL) {
5266 fHitEnd = TRUE;
5267 }
5268 break;
5269 }
5270 }
5271 if (inputIterator.inExpansion()) {
5272 success = FALSE;
5273 }
5274
5275 if (success) {
5276 fp->fInputIdx = inputIterator.getIndex();
5277 } else {
5278 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5279 }
5280 }
5281 break;
5282
5283 case URX_LB_START:
5284 {
5285 // Entering a look-behind block.
5286 // Save Stack Ptr, Input Pos.
5287 // TODO: implement transparent bounds. Ticket #6067
5288 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5289 fData[opValue] = fStack->size();
5290 fData[opValue+1] = fp->fInputIdx;
5291 // Init the variable containing the start index for attempted matches.
5292 fData[opValue+2] = -1;
5293 // Save input string length, then reset to pin any matches to end at
5294 // the current position.
5295 fData[opValue+3] = fActiveLimit;
5296 fActiveLimit = fp->fInputIdx;
5297 }
5298 break;
5299
5300
5301 case URX_LB_CONT:
5302 {
5303 // Positive Look-Behind, at top of loop checking for matches of LB expression
5304 // at all possible input starting positions.
5305
5306 // Fetch the min and max possible match lengths. They are the operands
5307 // of this op in the pattern.
5308 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5309 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5310 U_ASSERT(minML <= maxML);
5311 U_ASSERT(minML >= 0);
5312
5313 // Fetch (from data) the last input index where a match was attempted.
5314 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5315 int64_t *lbStartIdx = &fData[opValue+2];
5316 if (*lbStartIdx < 0) {
5317 // First time through loop.
5318 *lbStartIdx = fp->fInputIdx - minML;
5319 } else {
5320 // 2nd through nth time through the loop.
5321 // Back up start position for match by one.
5322 if (*lbStartIdx == 0) {
5323 (*lbStartIdx)--;
5324 } else {
5325 U16_BACK_1(inputBuf, 0, *lbStartIdx);
5326 }
5327 }
5328
5329 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5330 // We have tried all potential match starting points without
5331 // getting a match. Backtrack out, and out of the
5332 // Look Behind altogether.
5333 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5334 int64_t restoreInputLen = fData[opValue+3];
5335 U_ASSERT(restoreInputLen >= fActiveLimit);
5336 U_ASSERT(restoreInputLen <= fInputLength);
5337 fActiveLimit = restoreInputLen;
5338 break;
5339 }
5340
5341 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5342 // (successful match will fall off the end of the loop.)
5343 fp = StateSave(fp, fp->fPatIdx-3, status);
5344 fp->fInputIdx = *lbStartIdx;
5345 }
5346 break;
5347
5348 case URX_LB_END:
5349 // End of a look-behind block, after a successful match.
5350 {
5351 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5352 if (fp->fInputIdx != fActiveLimit) {
5353 // The look-behind expression matched, but the match did not
5354 // extend all the way to the point that we are looking behind from.
5355 // FAIL out of here, which will take us back to the LB_CONT, which
5356 // will retry the match starting at another position or fail
5357 // the look-behind altogether, whichever is appropriate.
5358 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5359 break;
5360 }
5361
5362 // Look-behind match is good. Restore the orignal input string length,
5363 // which had been truncated to pin the end of the lookbehind match to the
5364 // position being looked-behind.
5365 int64_t originalInputLen = fData[opValue+3];
5366 U_ASSERT(originalInputLen >= fActiveLimit);
5367 U_ASSERT(originalInputLen <= fInputLength);
5368 fActiveLimit = originalInputLen;
5369 }
5370 break;
5371
5372
5373 case URX_LBN_CONT:
5374 {
5375 // Negative Look-Behind, at top of loop checking for matches of LB expression
5376 // at all possible input starting positions.
5377
5378 // Fetch the extra parameters of this op.
5379 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5380 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5381 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5382 continueLoc = URX_VAL(continueLoc);
5383 U_ASSERT(minML <= maxML);
5384 U_ASSERT(minML >= 0);
5385 U_ASSERT(continueLoc > fp->fPatIdx);
5386
5387 // Fetch (from data) the last input index where a match was attempted.
5388 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5389 int64_t *lbStartIdx = &fData[opValue+2];
5390 if (*lbStartIdx < 0) {
5391 // First time through loop.
5392 *lbStartIdx = fp->fInputIdx - minML;
5393 } else {
5394 // 2nd through nth time through the loop.
5395 // Back up start position for match by one.
5396 if (*lbStartIdx == 0) {
5397 (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0.
5398 } else {
5399 U16_BACK_1(inputBuf, 0, *lbStartIdx);
5400 }
5401 }
5402
5403 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5404 // We have tried all potential match starting points without
5405 // getting a match, which means that the negative lookbehind as
5406 // a whole has succeeded. Jump forward to the continue location
5407 int64_t restoreInputLen = fData[opValue+3];
5408 U_ASSERT(restoreInputLen >= fActiveLimit);
5409 U_ASSERT(restoreInputLen <= fInputLength);
5410 fActiveLimit = restoreInputLen;
5411 fp->fPatIdx = continueLoc;
5412 break;
5413 }
5414
5415 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5416 // (successful match will cause a FAIL out of the loop altogether.)
5417 fp = StateSave(fp, fp->fPatIdx-4, status);
5418 fp->fInputIdx = *lbStartIdx;
5419 }
5420 break;
5421
5422 case URX_LBN_END:
5423 // End of a negative look-behind block, after a successful match.
5424 {
5425 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5426 if (fp->fInputIdx != fActiveLimit) {
5427 // The look-behind expression matched, but the match did not
5428 // extend all the way to the point that we are looking behind from.
5429 // FAIL out of here, which will take us back to the LB_CONT, which
5430 // will retry the match starting at another position or succeed
5431 // the look-behind altogether, whichever is appropriate.
5432 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5433 break;
5434 }
5435
5436 // Look-behind expression matched, which means look-behind test as
5437 // a whole Fails
5438
5439 // Restore the orignal input string length, which had been truncated
5440 // inorder to pin the end of the lookbehind match
5441 // to the position being looked-behind.
5442 int64_t originalInputLen = fData[opValue+3];
5443 U_ASSERT(originalInputLen >= fActiveLimit);
5444 U_ASSERT(originalInputLen <= fInputLength);
5445 fActiveLimit = originalInputLen;
5446
5447 // Restore original stack position, discarding any state saved
5448 // by the successful pattern match.
5449 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5450 int32_t newStackSize = (int32_t)fData[opValue];
5451 U_ASSERT(fStack->size() > newStackSize);
5452 fStack->setSize(newStackSize);
5453
5454 // FAIL, which will take control back to someplace
5455 // prior to entering the look-behind test.
5456 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5457 }
5458 break;
5459
5460
5461 case URX_LOOP_SR_I:
5462 // Loop Initialization for the optimized implementation of
5463 // [some character set]*
5464 // This op scans through all matching input.
5465 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5466 {
5467 U_ASSERT(opValue > 0 && opValue < sets->size());
5468 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5469 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
5470
5471 // Loop through input, until either the input is exhausted or
5472 // we reach a character that is not a member of the set.
5473 int32_t ix = (int32_t)fp->fInputIdx;
5474 for (;;) {
5475 if (ix >= fActiveLimit) {
5476 fHitEnd = TRUE;
5477 break;
5478 }
5479 UChar32 c;
5480 U16_NEXT(inputBuf, ix, fActiveLimit, c);
5481 if (c<256) {
5482 if (s8->contains(c) == FALSE) {
5483 U16_BACK_1(inputBuf, 0, ix);
5484 break;
5485 }
5486 } else {
5487 if (s->contains(c) == FALSE) {
5488 U16_BACK_1(inputBuf, 0, ix);
5489 break;
5490 }
5491 }
5492 }
5493
5494 // If there were no matching characters, skip over the loop altogether.
5495 // The loop doesn't run at all, a * op always succeeds.
5496 if (ix == fp->fInputIdx) {
5497 fp->fPatIdx++; // skip the URX_LOOP_C op.
5498 break;
5499 }
5500
5501 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5502 // must follow. It's operand is the stack location
5503 // that holds the starting input index for the match of this [set]*
5504 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5505 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5506 int32_t stackLoc = URX_VAL(loopcOp);
5507 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5508 fp->fExtra[stackLoc] = fp->fInputIdx;
5509 fp->fInputIdx = ix;
5510
5511 // Save State to the URX_LOOP_C op that follows this one,
5512 // so that match failures in the following code will return to there.
5513 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5514 fp = StateSave(fp, fp->fPatIdx, status);
5515 fp->fPatIdx++;
5516 }
5517 break;
5518
5519
5520 case URX_LOOP_DOT_I:
5521 // Loop Initialization for the optimized implementation of .*
5522 // This op scans through all remaining input.
5523 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5524 {
5525 // Loop through input until the input is exhausted (we reach an end-of-line)
5526 // In DOTALL mode, we can just go straight to the end of the input.
5527 int32_t ix;
5528 if ((opValue & 1) == 1) {
5529 // Dot-matches-All mode. Jump straight to the end of the string.
5530 ix = (int32_t)fActiveLimit;
5531 fHitEnd = TRUE;
5532 } else {
5533 // NOT DOT ALL mode. Line endings do not match '.'
5534 // Scan forward until a line ending or end of input.
5535 ix = (int32_t)fp->fInputIdx;
5536 for (;;) {
5537 if (ix >= fActiveLimit) {
5538 fHitEnd = TRUE;
5539 break;
5540 }
5541 UChar32 c;
5542 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++]
5543 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
5544 if ((c == 0x0a) || // 0x0a is newline in both modes.
5545 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
5546 ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
5547 // char is a line ending. Put the input pos back to the
5548 // line ending char, and exit the scanning loop.
5549 U16_BACK_1(inputBuf, 0, ix);
5550 break;
5551 }
5552 }
5553 }
5554 }
5555
5556 // If there were no matching characters, skip over the loop altogether.
5557 // The loop doesn't run at all, a * op always succeeds.
5558 if (ix == fp->fInputIdx) {
5559 fp->fPatIdx++; // skip the URX_LOOP_C op.
5560 break;
5561 }
5562
5563 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5564 // must follow. It's operand is the stack location
5565 // that holds the starting input index for the match of this .*
5566 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5567 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5568 int32_t stackLoc = URX_VAL(loopcOp);
5569 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5570 fp->fExtra[stackLoc] = fp->fInputIdx;
5571 fp->fInputIdx = ix;
5572
5573 // Save State to the URX_LOOP_C op that follows this one,
5574 // so that match failures in the following code will return to there.
5575 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5576 fp = StateSave(fp, fp->fPatIdx, status);
5577 fp->fPatIdx++;
5578 }
5579 break;
5580
5581
5582 case URX_LOOP_C:
5583 {
5584 U_ASSERT(opValue>=0 && opValue<fFrameSize);
5585 backSearchIndex = (int32_t)fp->fExtra[opValue];
5586 U_ASSERT(backSearchIndex <= fp->fInputIdx);
5587 if (backSearchIndex == fp->fInputIdx) {
5588 // We've backed up the input idx to the point that the loop started.
5589 // The loop is done. Leave here without saving state.
5590 // Subsequent failures won't come back here.
5591 break;
5592 }
5593 // Set up for the next iteration of the loop, with input index
5594 // backed up by one from the last time through,
5595 // and a state save to this instruction in case the following code fails again.
5596 // (We're going backwards because this loop emulates stack unwinding, not
5597 // the initial scan forward.)
5598 U_ASSERT(fp->fInputIdx > 0);
5599 UChar32 prevC;
5600 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
5601
5602 if (prevC == 0x0a &&
5603 fp->fInputIdx > backSearchIndex &&
5604 inputBuf[fp->fInputIdx-1] == 0x0d) {
5605 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
5606 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
5607 // .*, stepping back over CRLF pair.
5608 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
5609 }
5610 }
5611
5612
5613 fp = StateSave(fp, fp->fPatIdx-1, status);
5614 }
5615 break;
5616
5617
5618
5619 default:
5620 // Trouble. The compiled pattern contains an entry with an
5621 // unrecognized type tag.
5622 U_ASSERT(FALSE);
5623 }
5624
5625 if (U_FAILURE(status)) {
5626 isMatch = FALSE;
5627 break;
5628 }
5629 }
5630
5631 breakFromLoop:
5632 fMatch = isMatch;
5633 if (isMatch) {
5634 fLastMatchEnd = fMatchEnd;
5635 fMatchStart = startIdx;
5636 fMatchEnd = fp->fInputIdx;
5637 if (fTraceDebug) {
5638 REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd));
5639 }
5640 }
5641 else
5642 {
5643 if (fTraceDebug) {
5644 REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
5645 }
5646 }
5647
5648 fFrame = fp; // The active stack frame when the engine stopped.
5649 // Contains the capture group results that we need to
5650 // access later.
5651
5652 return;
5653 }
5654
5655
5656 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
5657
5658 U_NAMESPACE_END
5659
5660 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS