]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/rematch.cpp
ICU-62141.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / rematch.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f
A
3/*
4**************************************************************************
2ca993e8
A
5* Copyright (C) 2002-2016 International Business Machines Corporation
6* and others. All rights reserved.
b75a7d8f
A
7**************************************************************************
8*/
46f4442e
A
9//
10// file: rematch.cpp
11//
12// Contains the implementation of class RegexMatcher,
13// which is one of the main API classes for the ICU regular expression package.
14//
b75a7d8f
A
15
16#include "unicode/utypes.h"
17#if !UCONFIG_NO_REGULAR_EXPRESSIONS
18
19#include "unicode/regex.h"
20#include "unicode/uniset.h"
21#include "unicode/uchar.h"
22#include "unicode/ustring.h"
374ca955 23#include "unicode/rbbi.h"
4388f060
A
24#include "unicode/utf.h"
25#include "unicode/utf16.h"
b75a7d8f
A
26#include "uassert.h"
27#include "cmemory.h"
2ca993e8 28#include "cstr.h"
b75a7d8f
A
29#include "uvector.h"
30#include "uvectr32.h"
729e4ab9 31#include "uvectr64.h"
b75a7d8f
A
32#include "regeximp.h"
33#include "regexst.h"
729e4ab9
A
34#include "regextxt.h"
35#include "ucase.h"
b75a7d8f
A
36
37// #include <malloc.h> // Needed for heapcheck testing
38
2ca993e8 39
b75a7d8f
A
40U_NAMESPACE_BEGIN
41
46f4442e
A
42// Default limit for the size of the back track stack, to avoid system
43// failures caused by heap exhaustion. Units are in 32 bit words, not bytes.
44// This value puts ICU's limits higher than most other regexp implementations,
45// which use recursion rather than the heap, and take more storage per
46// backtrack point.
47//
static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8 * 1000 * 1000;   // 8,000,000
49
50// Time limit counter constant.
51// Time limits for expression evaluation are in terms of quanta of work by
52// the engine, each of which is 10,000 state saves.
53// This constant determines the number of state saves per tick.
static const int32_t TIMER_INITIAL_VALUE = 10 * 1000;   // 10,000 state saves per tick
55
b331163b
A
56
57// Test for any of the Unicode line terminating characters.
58static inline UBool isLineTerminator(UChar32 c) {
59 if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
60 return false;
61 }
62 return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029;
63}
64
b75a7d8f
A
65//-----------------------------------------------------------------------------
66//
67// Constructor and Destructor
68//
69//-----------------------------------------------------------------------------
57a6839d 70RegexMatcher::RegexMatcher(const RegexPattern *pat) {
46f4442e
A
71 fDeferredStatus = U_ZERO_ERROR;
72 init(fDeferredStatus);
73 if (U_FAILURE(fDeferredStatus)) {
74 return;
75 }
b75a7d8f
A
76 if (pat==NULL) {
77 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
78 return;
79 }
46f4442e 80 fPattern = pat;
729e4ab9 81 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
b75a7d8f
A
82}
83
84
85
86RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
87 uint32_t flags, UErrorCode &status) {
46f4442e 88 init(status);
b75a7d8f
A
89 if (U_FAILURE(status)) {
90 return;
91 }
46f4442e
A
92 UParseError pe;
93 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
729e4ab9 94 fPattern = fPatternOwned;
57a6839d 95
729e4ab9
A
96 UText inputText = UTEXT_INITIALIZER;
97 utext_openConstUnicodeString(&inputText, &input, &status);
98 init2(&inputText, status);
99 utext_close(&inputText);
100
57a6839d 101 fInputUniStrMaybeMutable = TRUE;
729e4ab9
A
102}
103
104
105RegexMatcher::RegexMatcher(UText *regexp, UText *input,
106 uint32_t flags, UErrorCode &status) {
107 init(status);
108 if (U_FAILURE(status)) {
109 return;
110 }
111 UParseError pe;
112 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
113 if (U_FAILURE(status)) {
114 return;
115 }
116
46f4442e
A
117 fPattern = fPatternOwned;
118 init2(input, status);
b75a7d8f
A
119}
120
121
57a6839d 122RegexMatcher::RegexMatcher(const UnicodeString &regexp,
b75a7d8f 123 uint32_t flags, UErrorCode &status) {
46f4442e 124 init(status);
b75a7d8f
A
125 if (U_FAILURE(status)) {
126 return;
127 }
46f4442e
A
128 UParseError pe;
129 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
729e4ab9
A
130 if (U_FAILURE(status)) {
131 return;
132 }
133 fPattern = fPatternOwned;
134 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
135}
136
57a6839d 137RegexMatcher::RegexMatcher(UText *regexp,
729e4ab9
A
138 uint32_t flags, UErrorCode &status) {
139 init(status);
140 if (U_FAILURE(status)) {
141 return;
142 }
143 UParseError pe;
144 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
145 if (U_FAILURE(status)) {
146 return;
147 }
148
46f4442e 149 fPattern = fPatternOwned;
729e4ab9 150 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
b75a7d8f
A
151}
152
153
154
46f4442e 155
b75a7d8f
A
156RegexMatcher::~RegexMatcher() {
157 delete fStack;
158 if (fData != fSmallData) {
374ca955 159 uprv_free(fData);
b75a7d8f
A
160 fData = NULL;
161 }
162 if (fPatternOwned) {
163 delete fPatternOwned;
164 fPatternOwned = NULL;
165 fPattern = NULL;
166 }
57a6839d 167
729e4ab9
A
168 if (fInput) {
169 delete fInput;
170 }
171 if (fInputText) {
172 utext_close(fInputText);
173 }
174 if (fAltInputText) {
175 utext_close(fAltInputText);
176 }
57a6839d 177
374ca955
A
178 #if UCONFIG_NO_BREAK_ITERATION==0
179 delete fWordBreakItr;
180 #endif
b75a7d8f
A
181}
182
46f4442e
A
183//
184// init() common initialization for use by all constructors.
185// Initialize all fields, get the object into a consistent state.
186// This must be done even when the initial status shows an error,
187// so that the object is initialized sufficiently well for the destructor
188// to run safely.
189//
190void RegexMatcher::init(UErrorCode &status) {
191 fPattern = NULL;
192 fPatternOwned = NULL;
46f4442e
A
193 fFrameSize = 0;
194 fRegionStart = 0;
195 fRegionLimit = 0;
196 fAnchorStart = 0;
197 fAnchorLimit = 0;
198 fLookStart = 0;
199 fLookLimit = 0;
200 fActiveStart = 0;
201 fActiveLimit = 0;
202 fTransparentBounds = FALSE;
203 fAnchoringBounds = TRUE;
204 fMatch = FALSE;
205 fMatchStart = 0;
206 fMatchEnd = 0;
207 fLastMatchEnd = -1;
208 fAppendPosition = 0;
209 fHitEnd = FALSE;
210 fRequireEnd = FALSE;
211 fStack = NULL;
212 fFrame = NULL;
213 fTimeLimit = 0;
214 fTime = 0;
215 fTickCounter = 0;
216 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY;
217 fCallbackFn = NULL;
218 fCallbackContext = NULL;
729e4ab9
A
219 fFindProgressCallbackFn = NULL;
220 fFindProgressCallbackContext = NULL;
46f4442e
A
221 fTraceDebug = FALSE;
222 fDeferredStatus = status;
223 fData = fSmallData;
224 fWordBreakItr = NULL;
57a6839d 225
4388f060 226 fStack = NULL;
729e4ab9
A
227 fInputText = NULL;
228 fAltInputText = NULL;
229 fInput = NULL;
230 fInputLength = 0;
231 fInputUniStrMaybeMutable = FALSE;
46f4442e
A
232}
233
234//
235// init2() Common initialization for use by RegexMatcher constructors, part 2.
236// This handles the common setup to be done after the Pattern is available.
237//
729e4ab9 238void RegexMatcher::init2(UText *input, UErrorCode &status) {
46f4442e
A
239 if (U_FAILURE(status)) {
240 fDeferredStatus = status;
241 return;
242 }
243
2ca993e8 244 if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) {
57a6839d 245 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
46f4442e
A
246 if (fData == NULL) {
247 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
248 return;
249 }
250 }
251
4388f060
A
252 fStack = new UVector64(status);
253 if (fStack == NULL) {
254 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
255 return;
256 }
257
46f4442e
A
258 reset(input);
259 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
260 if (U_FAILURE(status)) {
261 fDeferredStatus = status;
262 return;
263 }
264}
b75a7d8f
A
265
266
267static const UChar BACKSLASH = 0x5c;
268static const UChar DOLLARSIGN = 0x24;
b331163b
A
269static const UChar LEFTBRACKET = 0x7b;
270static const UChar RIGHTBRACKET = 0x7d;
271
b75a7d8f
A
272//--------------------------------------------------------------------------------
273//
274// appendReplacement
275//
276//--------------------------------------------------------------------------------
277RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
278 const UnicodeString &replacement,
279 UErrorCode &status) {
729e4ab9 280 UText replacementText = UTEXT_INITIALIZER;
57a6839d 281
729e4ab9 282 utext_openConstUnicodeString(&replacementText, &replacement, &status);
57a6839d 283 if (U_SUCCESS(status)) {
729e4ab9
A
284 UText resultText = UTEXT_INITIALIZER;
285 utext_openUnicodeString(&resultText, &dest, &status);
57a6839d 286
729e4ab9
A
287 if (U_SUCCESS(status)) {
288 appendReplacement(&resultText, &replacementText, status);
289 utext_close(&resultText);
290 }
291 utext_close(&replacementText);
292 }
57a6839d 293
729e4ab9
A
294 return *this;
295}
296
297//
298// appendReplacement, UText mode
299//
300RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
301 UText *replacement,
302 UErrorCode &status) {
b75a7d8f
A
303 if (U_FAILURE(status)) {
304 return *this;
305 }
306 if (U_FAILURE(fDeferredStatus)) {
307 status = fDeferredStatus;
308 return *this;
309 }
310 if (fMatch == FALSE) {
311 status = U_REGEX_INVALID_STATE;
312 return *this;
313 }
57a6839d 314
b75a7d8f 315 // Copy input string from the end of previous match to start of current match
729e4ab9
A
316 int64_t destLen = utext_nativeLength(dest);
317 if (fMatchStart > fAppendPosition) {
318 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
57a6839d 319 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
729e4ab9
A
320 (int32_t)(fMatchStart-fAppendPosition), &status);
321 } else {
322 int32_t len16;
323 if (UTEXT_USES_U16(fInputText)) {
324 len16 = (int32_t)(fMatchStart-fAppendPosition);
325 } else {
326 UErrorCode lengthStatus = U_ZERO_ERROR;
327 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
328 }
329 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
330 if (inputChars == NULL) {
331 status = U_MEMORY_ALLOCATION_ERROR;
332 return *this;
333 }
334 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
335 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
336 uprv_free(inputChars);
337 }
b75a7d8f 338 }
46f4442e 339 fAppendPosition = fMatchEnd;
57a6839d
A
340
341
b75a7d8f
A
342 // scan the replacement text, looking for substitutions ($n) and \escapes.
343 // TODO: optimize this loop by efficiently scanning for '$' or '\',
344 // move entire ranges not containing substitutions.
729e4ab9 345 UTEXT_SETNATIVEINDEX(replacement, 0);
b331163b 346 for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL; c = UTEXT_NEXT32(replacement)) {
b75a7d8f
A
347 if (c == BACKSLASH) {
348 // Backslash Escape. Copy the following char out without further checks.
349 // Note: Surrogate pairs don't need any special handling
350 // The second half wont be a '$' or a '\', and
351 // will move to the dest normally on the next
352 // loop iteration.
729e4ab9
A
353 c = UTEXT_CURRENT32(replacement);
354 if (c == U_SENTINEL) {
b75a7d8f
A
355 break;
356 }
57a6839d 357
b75a7d8f
A
358 if (c==0x55/*U*/ || c==0x75/*u*/) {
359 // We have a \udddd or \Udddddddd escape sequence.
729e4ab9
A
360 int32_t offset = 0;
361 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
362 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
b75a7d8f 363 if (escapedChar != (UChar32)0xFFFFFFFF) {
729e4ab9
A
364 if (U_IS_BMP(escapedChar)) {
365 UChar c16 = (UChar)escapedChar;
366 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
367 } else {
368 UChar surrogate[2];
369 surrogate[0] = U16_LEAD(escapedChar);
370 surrogate[1] = U16_TRAIL(escapedChar);
371 if (U_SUCCESS(status)) {
372 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
373 }
374 }
b75a7d8f
A
375 // TODO: Report errors for mal-formed \u escapes?
376 // As this is, the original sequence is output, which may be OK.
729e4ab9 377 if (context.lastOffset == offset) {
4388f060 378 (void)UTEXT_PREVIOUS32(replacement);
729e4ab9
A
379 } else if (context.lastOffset != offset-1) {
380 utext_moveIndex32(replacement, offset - context.lastOffset - 1);
381 }
382 }
383 } else {
4388f060 384 (void)UTEXT_NEXT32(replacement);
729e4ab9
A
385 // Plain backslash escape. Just put out the escaped character.
386 if (U_IS_BMP(c)) {
387 UChar c16 = (UChar)c;
388 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
389 } else {
390 UChar surrogate[2];
391 surrogate[0] = U16_LEAD(c);
392 surrogate[1] = U16_TRAIL(c);
393 if (U_SUCCESS(status)) {
394 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
395 }
b75a7d8f
A
396 }
397 }
729e4ab9 398 } else if (c != DOLLARSIGN) {
b75a7d8f 399 // Normal char, not a $. Copy it out without further checks.
729e4ab9
A
400 if (U_IS_BMP(c)) {
401 UChar c16 = (UChar)c;
402 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
403 } else {
404 UChar surrogate[2];
405 surrogate[0] = U16_LEAD(c);
406 surrogate[1] = U16_TRAIL(c);
407 if (U_SUCCESS(status)) {
408 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
409 }
b75a7d8f 410 }
729e4ab9 411 } else {
b331163b
A
412 // We've got a $. Pick up a capture group name or number if one follows.
413 // Consume digits so long as the resulting group number <= the number of
414 // number of capture groups in the pattern.
57a6839d 415
729e4ab9 416 int32_t groupNum = 0;
b331163b
A
417 int32_t numDigits = 0;
418 UChar32 nextChar = utext_current32(replacement);
419 if (nextChar == LEFTBRACKET) {
420 // Scan for a Named Capture Group, ${name}.
421 UnicodeString groupName;
422 utext_next32(replacement);
423 while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
424 nextChar = utext_next32(replacement);
425 if (nextChar == U_SENTINEL) {
426 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
427 } else if ((nextChar >= 0x41 && nextChar <= 0x5a) || // A..Z
428 (nextChar >= 0x61 && nextChar <= 0x7a) || // a..z
429 (nextChar >= 0x31 && nextChar <= 0x39)) { // 0..9
430 groupName.append(nextChar);
431 } else if (nextChar == RIGHTBRACKET) {
432 groupNum = uhash_geti(fPattern->fNamedCaptureMap, &groupName);
433 if (groupNum == 0) {
434 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
435 }
436 } else {
437 // Character was something other than a name char or a closing '}'
438 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
439 }
729e4ab9 440 }
0f5d89e8 441
b331163b
A
442 } else if (u_isdigit(nextChar)) {
443 // $n Scan for a capture group number
444 int32_t numCaptureGroups = fPattern->fGroupMap->size();
445 for (;;) {
446 nextChar = UTEXT_CURRENT32(replacement);
447 if (nextChar == U_SENTINEL) {
448 break;
449 }
450 if (u_isdigit(nextChar) == FALSE) {
451 break;
452 }
453 int32_t nextDigitVal = u_charDigitValue(nextChar);
454 if (groupNum*10 + nextDigitVal > numCaptureGroups) {
455 // Don't consume the next digit if it makes the capture group number too big.
456 if (numDigits == 0) {
457 status = U_INDEX_OUTOFBOUNDS_ERROR;
458 }
459 break;
460 }
461 (void)UTEXT_NEXT32(replacement);
0f5d89e8 462 groupNum=groupNum*10 + nextDigitVal;
b331163b 463 ++numDigits;
729e4ab9 464 }
b331163b
A
465 } else {
466 // $ not followed by capture group name or number.
467 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
b75a7d8f 468 }
57a6839d 469
b331163b 470 if (U_SUCCESS(status)) {
729e4ab9 471 destLen += appendGroup(groupNum, dest, status);
b75a7d8f 472 }
b331163b
A
473 } // End of $ capture group handling
474 } // End of per-character loop through the replacement string.
57a6839d 475
b75a7d8f
A
476 return *this;
477}
478
479
480
481//--------------------------------------------------------------------------------
482//
483// appendTail Intended to be used in conjunction with appendReplacement()
484// To the destination string, append everything following
485// the last match position from the input string.
486//
46f4442e
A
487// Note: Match ranges do not affect appendTail or appendReplacement
488//
b75a7d8f
A
489//--------------------------------------------------------------------------------
490UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
729e4ab9
A
491 UErrorCode status = U_ZERO_ERROR;
492 UText resultText = UTEXT_INITIALIZER;
493 utext_openUnicodeString(&resultText, &dest, &status);
57a6839d 494
729e4ab9
A
495 if (U_SUCCESS(status)) {
496 appendTail(&resultText, status);
497 utext_close(&resultText);
498 }
57a6839d 499
729e4ab9
A
500 return dest;
501}
502
503//
504// appendTail, UText mode
505//
506UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
729e4ab9 507 if (U_FAILURE(status)) {
57a6839d 508 return dest;
729e4ab9
A
509 }
510 if (U_FAILURE(fDeferredStatus)) {
511 status = fDeferredStatus;
57a6839d 512 return dest;
729e4ab9 513 }
57a6839d 514
729e4ab9
A
515 if (fInputLength > fAppendPosition) {
516 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
517 int64_t destLen = utext_nativeLength(dest);
57a6839d 518 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
729e4ab9
A
519 (int32_t)(fInputLength-fAppendPosition), &status);
520 } else {
521 int32_t len16;
522 if (UTEXT_USES_U16(fInputText)) {
523 len16 = (int32_t)(fInputLength-fAppendPosition);
524 } else {
525 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
526 status = U_ZERO_ERROR; // buffer overflow
527 }
57a6839d 528
729e4ab9
A
529 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
530 if (inputChars == NULL) {
531 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
532 } else {
57a6839d 533 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
729e4ab9
A
534 int64_t destLen = utext_nativeLength(dest);
535 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
536 uprv_free(inputChars);
537 }
538 }
b75a7d8f
A
539 }
540 return dest;
541}
542
543
544
545//--------------------------------------------------------------------------------
546//
547// end
548//
549//--------------------------------------------------------------------------------
550int32_t RegexMatcher::end(UErrorCode &err) const {
551 return end(0, err);
552}
553
729e4ab9
A
554int64_t RegexMatcher::end64(UErrorCode &err) const {
555 return end64(0, err);
556}
b75a7d8f 557
729e4ab9 558int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
b75a7d8f
A
559 if (U_FAILURE(err)) {
560 return -1;
561 }
562 if (fMatch == FALSE) {
563 err = U_REGEX_INVALID_STATE;
564 return -1;
565 }
566 if (group < 0 || group > fPattern->fGroupMap->size()) {
567 err = U_INDEX_OUTOFBOUNDS_ERROR;
568 return -1;
569 }
729e4ab9 570 int64_t e = -1;
b75a7d8f 571 if (group == 0) {
57a6839d 572 e = fMatchEnd;
b75a7d8f
A
573 } else {
574 // Get the position within the stack frame of the variables for
575 // this capture group.
576 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
577 U_ASSERT(groupOffset < fPattern->fFrameSize);
578 U_ASSERT(groupOffset >= 0);
579 e = fFrame->fExtra[groupOffset + 1];
580 }
57a6839d 581
729e4ab9 582 return e;
b75a7d8f
A
583}
584
729e4ab9
A
585int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
586 return (int32_t)end64(group, err);
587}
b75a7d8f 588
b331163b
A
589//--------------------------------------------------------------------------------
590//
591// findProgressInterrupt This function is called once for each advance in the target
592// string from the find() function, and calls the user progress callback
593// function if there is one installed.
594//
595// Return: TRUE if the find operation is to be terminated.
596// FALSE if the find operation is to continue running.
597//
598//--------------------------------------------------------------------------------
599UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) {
600 if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) {
601 status = U_REGEX_STOPPED_BY_CALLER;
602 return TRUE;
603 }
604 return FALSE;
605}
b75a7d8f
A
606
607//--------------------------------------------------------------------------------
608//
609// find()
610//
611//--------------------------------------------------------------------------------
612UBool RegexMatcher::find() {
b331163b
A
613 if (U_FAILURE(fDeferredStatus)) {
614 return FALSE;
615 }
616 UErrorCode status = U_ZERO_ERROR;
617 UBool result = find(status);
618 return result;
619}
620
621//--------------------------------------------------------------------------------
622//
623// find()
624//
625//--------------------------------------------------------------------------------
626UBool RegexMatcher::find(UErrorCode &status) {
b75a7d8f 627 // Start at the position of the last match end. (Will be zero if the
729e4ab9 628 // matcher has been reset.)
b75a7d8f 629 //
b331163b
A
630 if (U_FAILURE(status)) {
631 return FALSE;
632 }
b75a7d8f 633 if (U_FAILURE(fDeferredStatus)) {
b331163b 634 status = fDeferredStatus;
b75a7d8f
A
635 return FALSE;
636 }
57a6839d 637
729e4ab9 638 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
b331163b 639 return findUsingChunk(status);
729e4ab9 640 }
b75a7d8f 641
729e4ab9 642 int64_t startPos = fMatchEnd;
46f4442e
A
643 if (startPos==0) {
644 startPos = fActiveStart;
645 }
374ca955
A
646
647 if (fMatch) {
648 // Save the position of any previous successful match.
649 fLastMatchEnd = fMatchEnd;
650
651 if (fMatchStart == fMatchEnd) {
652 // Previous match had zero length. Move start position up one position
653 // to avoid sending find() into a loop on zero-length matches.
46f4442e 654 if (startPos >= fActiveLimit) {
374ca955 655 fMatch = FALSE;
46f4442e 656 fHitEnd = TRUE;
374ca955
A
657 return FALSE;
658 }
729e4ab9 659 UTEXT_SETNATIVEINDEX(fInputText, startPos);
4388f060 660 (void)UTEXT_NEXT32(fInputText);
729e4ab9 661 startPos = UTEXT_GETNATIVEINDEX(fInputText);
374ca955
A
662 }
663 } else {
664 if (fLastMatchEnd >= 0) {
665 // A previous find() failed to match. Don't try again.
666 // (without this test, a pattern with a zero-length match
667 // could match again at the end of an input string.)
46f4442e 668 fHitEnd = TRUE;
374ca955
A
669 return FALSE;
670 }
671 }
672
374ca955
A
673
674 // Compute the position in the input string beyond which a match can not begin, because
675 // the minimum length match would extend past the end of the input.
46f4442e
A
676 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
677 // Be aware of possible overflows if making changes here.
729e4ab9
A
678 int64_t testStartLimit;
679 if (UTEXT_USES_U16(fInputText)) {
680 testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
681 if (startPos > testStartLimit) {
682 fMatch = FALSE;
683 fHitEnd = TRUE;
684 return FALSE;
685 }
686 } else {
b331163b
A
687 // We don't know exactly how long the minimum match length is in native characters.
688 // Treat anything > 0 as 1.
689 testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
b75a7d8f
A
690 }
691
b75a7d8f
A
692 UChar32 c;
693 U_ASSERT(startPos >= 0);
694
695 switch (fPattern->fStartType) {
696 case START_NO_INFO:
57a6839d 697 // No optimization was found.
b75a7d8f
A
698 // Try a match at each input position.
699 for (;;) {
b331163b
A
700 MatchAt(startPos, FALSE, status);
701 if (U_FAILURE(status)) {
b75a7d8f
A
702 return FALSE;
703 }
704 if (fMatch) {
705 return TRUE;
706 }
729e4ab9 707 if (startPos >= testStartLimit) {
46f4442e 708 fHitEnd = TRUE;
b75a7d8f
A
709 return FALSE;
710 }
729e4ab9 711 UTEXT_SETNATIVEINDEX(fInputText, startPos);
4388f060 712 (void)UTEXT_NEXT32(fInputText);
729e4ab9 713 startPos = UTEXT_GETNATIVEINDEX(fInputText);
b75a7d8f
A
714 // Note that it's perfectly OK for a pattern to have a zero-length
715 // match at the end of a string, so we must make sure that the loop
729e4ab9 716 // runs with startPos == testStartLimit the last time through.
b331163b 717 if (findProgressInterrupt(startPos, status))
729e4ab9 718 return FALSE;
b75a7d8f
A
719 }
720 U_ASSERT(FALSE);
721
722 case START_START:
723 // Matches are only possible at the start of the input string
724 // (pattern begins with ^ or \A)
46f4442e 725 if (startPos > fActiveStart) {
374ca955 726 fMatch = FALSE;
b75a7d8f
A
727 return FALSE;
728 }
b331163b
A
729 MatchAt(startPos, FALSE, status);
730 if (U_FAILURE(status)) {
b75a7d8f
A
731 return FALSE;
732 }
733 return fMatch;
734
735
736 case START_SET:
737 {
738 // Match may start on any char from a pre-computed set.
739 U_ASSERT(fPattern->fMinMatchLen > 0);
729e4ab9 740 UTEXT_SETNATIVEINDEX(fInputText, startPos);
b75a7d8f 741 for (;;) {
b331163b 742 int64_t pos = startPos;
729e4ab9 743 c = UTEXT_NEXT32(fInputText);
b331163b 744 startPos = UTEXT_GETNATIVEINDEX(fInputText);
729e4ab9
A
745 // c will be -1 (U_SENTINEL) at end of text, in which case we
746 // skip this next block (so we don't have a negative array index)
747 // and handle end of text in the following block.
748 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
749 (c>=256 && fPattern->fInitialChars->contains(c)))) {
b331163b
A
750 MatchAt(pos, FALSE, status);
751 if (U_FAILURE(status)) {
b75a7d8f
A
752 return FALSE;
753 }
754 if (fMatch) {
755 return TRUE;
756 }
729e4ab9 757 UTEXT_SETNATIVEINDEX(fInputText, pos);
b75a7d8f 758 }
b331163b 759 if (startPos > testStartLimit) {
374ca955 760 fMatch = FALSE;
46f4442e 761 fHitEnd = TRUE;
b75a7d8f
A
762 return FALSE;
763 }
b331163b 764 if (findProgressInterrupt(startPos, status))
729e4ab9 765 return FALSE;
b75a7d8f
A
766 }
767 }
768 U_ASSERT(FALSE);
769
770 case START_STRING:
771 case START_CHAR:
772 {
773 // Match starts on exactly one char.
774 U_ASSERT(fPattern->fMinMatchLen > 0);
775 UChar32 theChar = fPattern->fInitialChar;
729e4ab9 776 UTEXT_SETNATIVEINDEX(fInputText, startPos);
b75a7d8f 777 for (;;) {
b331163b 778 int64_t pos = startPos;
729e4ab9 779 c = UTEXT_NEXT32(fInputText);
b331163b 780 startPos = UTEXT_GETNATIVEINDEX(fInputText);
b75a7d8f 781 if (c == theChar) {
b331163b
A
782 MatchAt(pos, FALSE, status);
783 if (U_FAILURE(status)) {
b75a7d8f
A
784 return FALSE;
785 }
786 if (fMatch) {
787 return TRUE;
788 }
2ca993e8 789 UTEXT_SETNATIVEINDEX(fInputText, startPos);
b75a7d8f 790 }
b331163b 791 if (startPos > testStartLimit) {
374ca955 792 fMatch = FALSE;
46f4442e 793 fHitEnd = TRUE;
b75a7d8f
A
794 return FALSE;
795 }
b331163b 796 if (findProgressInterrupt(startPos, status))
729e4ab9
A
797 return FALSE;
798 }
b75a7d8f
A
799 }
800 U_ASSERT(FALSE);
801
802 case START_LINE:
803 {
804 UChar32 c;
46f4442e 805 if (startPos == fAnchorStart) {
b331163b
A
806 MatchAt(startPos, FALSE, status);
807 if (U_FAILURE(status)) {
b75a7d8f
A
808 return FALSE;
809 }
810 if (fMatch) {
811 return TRUE;
812 }
729e4ab9
A
813 UTEXT_SETNATIVEINDEX(fInputText, startPos);
814 c = UTEXT_NEXT32(fInputText);
815 startPos = UTEXT_GETNATIVEINDEX(fInputText);
816 } else {
817 UTEXT_SETNATIVEINDEX(fInputText, startPos);
818 c = UTEXT_PREVIOUS32(fInputText);
819 UTEXT_SETNATIVEINDEX(fInputText, startPos);
b75a7d8f
A
820 }
821
46f4442e 822 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
729e4ab9 823 for (;;) {
46f4442e 824 if (c == 0x0a) {
b331163b
A
825 MatchAt(startPos, FALSE, status);
826 if (U_FAILURE(status)) {
46f4442e
A
827 return FALSE;
828 }
829 if (fMatch) {
830 return TRUE;
831 }
729e4ab9 832 UTEXT_SETNATIVEINDEX(fInputText, startPos);
46f4442e 833 }
729e4ab9 834 if (startPos >= testStartLimit) {
46f4442e
A
835 fMatch = FALSE;
836 fHitEnd = TRUE;
837 return FALSE;
838 }
729e4ab9
A
839 c = UTEXT_NEXT32(fInputText);
840 startPos = UTEXT_GETNATIVEINDEX(fInputText);
46f4442e
A
841 // Note that it's perfectly OK for a pattern to have a zero-length
842 // match at the end of a string, so we must make sure that the loop
729e4ab9 843 // runs with startPos == testStartLimit the last time through.
b331163b 844 if (findProgressInterrupt(startPos, status))
729e4ab9 845 return FALSE;
b75a7d8f 846 }
46f4442e
A
847 } else {
848 for (;;) {
b331163b
A
849 if (isLineTerminator(c)) {
850 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
851 (void)UTEXT_NEXT32(fInputText);
852 startPos = UTEXT_GETNATIVEINDEX(fInputText);
853 }
854 MatchAt(startPos, FALSE, status);
855 if (U_FAILURE(status)) {
856 return FALSE;
857 }
858 if (fMatch) {
859 return TRUE;
860 }
861 UTEXT_SETNATIVEINDEX(fInputText, startPos);
46f4442e 862 }
729e4ab9 863 if (startPos >= testStartLimit) {
46f4442e
A
864 fMatch = FALSE;
865 fHitEnd = TRUE;
866 return FALSE;
867 }
729e4ab9
A
868 c = UTEXT_NEXT32(fInputText);
869 startPos = UTEXT_GETNATIVEINDEX(fInputText);
46f4442e
A
870 // Note that it's perfectly OK for a pattern to have a zero-length
871 // match at the end of a string, so we must make sure that the loop
729e4ab9 872 // runs with startPos == testStartLimit the last time through.
b331163b 873 if (findProgressInterrupt(startPos, status))
729e4ab9 874 return FALSE;
b75a7d8f 875 }
b75a7d8f
A
876 }
877 }
878
879 default:
880 U_ASSERT(FALSE);
881 }
882
883 U_ASSERT(FALSE);
884 return FALSE;
885}
886
887
888
729e4ab9 889UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
b75a7d8f
A
890 if (U_FAILURE(status)) {
891 return FALSE;
892 }
893 if (U_FAILURE(fDeferredStatus)) {
894 status = fDeferredStatus;
895 return FALSE;
896 }
46f4442e
A
897 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
898 // This will reset the region to be the full input length.
729e4ab9
A
899 if (start < 0) {
900 status = U_INDEX_OUTOFBOUNDS_ERROR;
901 return FALSE;
902 }
57a6839d 903
729e4ab9
A
904 int64_t nativeStart = start;
905 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
b75a7d8f
A
906 status = U_INDEX_OUTOFBOUNDS_ERROR;
907 return FALSE;
908 }
57a6839d 909 fMatchEnd = nativeStart;
b331163b 910 return find(status);
b75a7d8f
A
911}
912
913
b75a7d8f
A
914//--------------------------------------------------------------------------------
915//
729e4ab9
A
916// findUsingChunk() -- like find(), but with the advance knowledge that the
917// entire string is available in the UText's chunk buffer.
b75a7d8f
A
918//
919//--------------------------------------------------------------------------------
// Searches for the next match using 32-bit indexes directly into
// fInputText->chunkContents.
//
// The scan begins at fMatchEnd (or at fActiveStart when fMatchEnd is 0) and
// calls MatchChunkAt() at each candidate position permitted by
// fPattern->fStartType. Returns TRUE as soon as fMatch is set.
// Returns FALSE when no candidate position remains, when status reports a
// failure after MatchChunkAt(), or when findProgressInterrupt() returns true.
//
// @param status  receives any failure reported by MatchChunkAt() or
//                findProgressInterrupt().
b331163b 920UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
729e4ab9
A
921 // Start at the position of the last match end. (Will be zero if the
922 // matcher has been reset.)
923 //
b75a7d8f 924
729e4ab9
A
925 int32_t startPos = (int32_t)fMatchEnd;
926 if (startPos==0) {
927 startPos = (int32_t)fActiveStart;
b75a7d8f 928 }
57a6839d 929
729e4ab9 930 const UChar *inputBuf = fInputText->chunkContents;
b75a7d8f 931
729e4ab9
A
932 if (fMatch) {
933 // Save the position of any previous successful match.
934 fLastMatchEnd = fMatchEnd;
57a6839d 935
729e4ab9
A
936 if (fMatchStart == fMatchEnd) {
937 // Previous match had zero length. Move start position up one position
938 // to avoid sending find() into a loop on zero-length matches.
939 if (startPos >= fActiveLimit) {
940 fMatch = FALSE;
941 fHitEnd = TRUE;
942 return FALSE;
943 }
944 U16_FWD_1(inputBuf, startPos, fInputLength);
945 }
946 } else {
947 if (fLastMatchEnd >= 0) {
948 // A previous find() failed to match. Don't try again.
949 // (without this test, a pattern with a zero-length match
950 // could match again at the end of an input string.)
951 fHitEnd = TRUE;
952 return FALSE;
953 }
b75a7d8f 954 }
57a6839d
A
955
956
729e4ab9
A
957 // Compute the position in the input string beyond which a match can not begin, because
958 // the minimum length match would extend past the end of the input.
959 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
960 // Be aware of possible overflows if making changes here.
b331163b 961 // Note: a match can begin at inputBuf + testLen; it is an inclusive limit.
729e4ab9
A
962 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
963 if (startPos > testLen) {
964 fMatch = FALSE;
965 fHitEnd = TRUE;
b75a7d8f
A
966 return FALSE;
967 }
57a6839d 968
729e4ab9
A
969 UChar32 c;
970 U_ASSERT(startPos >= 0);
57a6839d 971
// Each case below returns from inside its own loop or straight-line code;
// the U_ASSERT(FALSE) statements after the loops mark unreachable code.
729e4ab9
A
972 switch (fPattern->fStartType) {
973 case START_NO_INFO:
57a6839d 974 // No optimization was found.
729e4ab9
A
975 // Try a match at each input position.
976 for (;;) {
b331163b
A
977 MatchChunkAt(startPos, FALSE, status);
978 if (U_FAILURE(status)) {
729e4ab9
A
979 return FALSE;
980 }
981 if (fMatch) {
982 return TRUE;
983 }
984 if (startPos >= testLen) {
985 fHitEnd = TRUE;
986 return FALSE;
987 }
988 U16_FWD_1(inputBuf, startPos, fActiveLimit);
989 // Note that it's perfectly OK for a pattern to have a zero-length
990 // match at the end of a string, so we must make sure that the loop
991 // runs with startPos == testLen the last time through.
b331163b 992 if (findProgressInterrupt(startPos, status))
729e4ab9
A
993 return FALSE;
994 }
995 U_ASSERT(FALSE);
57a6839d 996
729e4ab9
A
997 case START_START:
998 // Matches are only possible at the start of the input string
999 // (pattern begins with ^ or \A)
1000 if (startPos > fActiveStart) {
1001 fMatch = FALSE;
1002 return FALSE;
1003 }
b331163b
A
1004 MatchChunkAt(startPos, FALSE, status);
1005 if (U_FAILURE(status)) {
729e4ab9
A
1006 return FALSE;
1007 }
1008 return fMatch;
57a6839d
A
1009
1010
729e4ab9
A
1011 case START_SET:
1012 {
1013 // Match may start on any char from a pre-computed set.
1014 U_ASSERT(fPattern->fMinMatchLen > 0);
1015 for (;;) {
// pos remembers where this code point starts; U16_NEXT advances startPos past it.
1016 int32_t pos = startPos;
1017 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
// Code points below 256 are tested against fInitialChars8, all others against fInitialChars.
1018 if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
1019 (c>=256 && fPattern->fInitialChars->contains(c))) {
b331163b
A
1020 MatchChunkAt(pos, FALSE, status);
1021 if (U_FAILURE(status)) {
729e4ab9
A
1022 return FALSE;
1023 }
1024 if (fMatch) {
1025 return TRUE;
1026 }
1027 }
b331163b 1028 if (startPos > testLen) {
729e4ab9
A
1029 fMatch = FALSE;
1030 fHitEnd = TRUE;
1031 return FALSE;
1032 }
b331163b 1033 if (findProgressInterrupt(startPos, status))
729e4ab9
A
1034 return FALSE;
1035 }
b75a7d8f 1036 }
729e4ab9 1037 U_ASSERT(FALSE);
57a6839d 1038
729e4ab9
A
1039 case START_STRING:
1040 case START_CHAR:
1041 {
1042 // Match starts on exactly one char.
1043 U_ASSERT(fPattern->fMinMatchLen > 0);
1044 UChar32 theChar = fPattern->fInitialChar;
1045 for (;;) {
1046 int32_t pos = startPos;
1047 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1048 if (c == theChar) {
b331163b
A
1049 MatchChunkAt(pos, FALSE, status);
1050 if (U_FAILURE(status)) {
729e4ab9
A
1051 return FALSE;
1052 }
1053 if (fMatch) {
1054 return TRUE;
1055 }
1056 }
b331163b 1057 if (startPos > testLen) {
729e4ab9
A
1058 fMatch = FALSE;
1059 fHitEnd = TRUE;
1060 return FALSE;
1061 }
b331163b 1062 if (findProgressInterrupt(startPos, status))
729e4ab9
A
1063 return FALSE;
1064 }
1065 }
b331163b 1066 U_ASSERT(FALSE);
57a6839d 1067
729e4ab9
A
1068 case START_LINE:
1069 {
1070 UChar32 c;
// A match is first attempted at fAnchorStart itself; after that, only at
// positions that follow a line terminator.
1071 if (startPos == fAnchorStart) {
b331163b
A
1072 MatchChunkAt(startPos, FALSE, status);
1073 if (U_FAILURE(status)) {
729e4ab9
A
1074 return FALSE;
1075 }
1076 if (fMatch) {
1077 return TRUE;
1078 }
f3c0d7a5
A
1079 // In bug 31063104 which has a zero-length text buffer we get here with
1080 // inputBuf=NULL, startPos=fActiveLimit=0 (and fMatch F) which violates the
1081 // requirement for U16_FWD_1 (utf16.h) that startPos < fActiveLimit. Having
1082 // inputBuf=NULL (chunkContents NULL) is probably due to an error in the
1083 // CFStringUText functions. Nevertheless, to be defensive, add test below.
1084 if (startPos >= testLen) {
1085 fHitEnd = TRUE;
1086 return FALSE;
1087 }
729e4ab9
A
1088 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1089 }
57a6839d 1090
729e4ab9
A
1091 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1092 for (;;) {
// Examine the code unit just before startPos; with UREGEX_UNIX_LINES only 0x0a starts a new line.
1093 c = inputBuf[startPos-1];
1094 if (c == 0x0a) {
b331163b
A
1095 MatchChunkAt(startPos, FALSE, status);
1096 if (U_FAILURE(status)) {
729e4ab9
A
1097 return FALSE;
1098 }
1099 if (fMatch) {
1100 return TRUE;
1101 }
1102 }
1103 if (startPos >= testLen) {
1104 fMatch = FALSE;
1105 fHitEnd = TRUE;
1106 return FALSE;
1107 }
1108 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1109 // Note that it's perfectly OK for a pattern to have a zero-length
1110 // match at the end of a string, so we must make sure that the loop
1111 // runs with startPos == testLen the last time through.
b331163b 1112 if (findProgressInterrupt(startPos, status))
729e4ab9
A
1113 return FALSE;
1114 }
1115 } else {
1116 for (;;) {
1117 c = inputBuf[startPos-1];
b331163b 1118 if (isLineTerminator(c)) {
729e4ab9
A
// Step over the 0x0a of a 0x0d 0x0a pair so the match attempt starts after both units.
1119 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1120 startPos++;
1121 }
b331163b
A
1122 MatchChunkAt(startPos, FALSE, status);
1123 if (U_FAILURE(status)) {
729e4ab9
A
1124 return FALSE;
1125 }
1126 if (fMatch) {
1127 return TRUE;
1128 }
1129 }
1130 if (startPos >= testLen) {
1131 fMatch = FALSE;
1132 fHitEnd = TRUE;
1133 return FALSE;
1134 }
1135 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1136 // Note that it's perfectly OK for a pattern to have a zero-length
1137 // match at the end of a string, so we must make sure that the loop
1138 // runs with startPos == testLen the last time through.
b331163b 1139 if (findProgressInterrupt(startPos, status))
729e4ab9
A
1140 return FALSE;
1141 }
1142 }
1143 }
57a6839d 1144
729e4ab9
A
1145 default:
1146 U_ASSERT(FALSE);
1147 }
57a6839d 1148
729e4ab9
A
1149 U_ASSERT(FALSE);
1150 return FALSE;
1151}
1152
1153
1154
1155//--------------------------------------------------------------------------------
1156//
1157// group()
1158//
1159//--------------------------------------------------------------------------------
1160UnicodeString RegexMatcher::group(UErrorCode &status) const {
1161 return group(0, status);
b75a7d8f
A
1162}
1163
729e4ab9
A
1164// Return immutable shallow clone
1165UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1166 return group(0, dest, group_len, status);
1167}
b75a7d8f 1168
729e4ab9
A
1169// Return immutable shallow clone
1170UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1171 group_len = 0;
374ca955 1172 if (U_FAILURE(status)) {
729e4ab9 1173 return dest;
374ca955
A
1174 }
1175 if (U_FAILURE(fDeferredStatus)) {
1176 status = fDeferredStatus;
57a6839d 1177 } else if (fMatch == FALSE) {
729e4ab9 1178 status = U_REGEX_INVALID_STATE;
57a6839d 1179 } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
374ca955 1180 status = U_INDEX_OUTOFBOUNDS_ERROR;
374ca955 1181 }
57a6839d
A
1182
1183 if (U_FAILURE(status)) {
1184 return dest;
729e4ab9 1185 }
57a6839d 1186
729e4ab9
A
1187 int64_t s, e;
1188 if (groupNum == 0) {
1189 s = fMatchStart;
1190 e = fMatchEnd;
1191 } else {
1192 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1193 U_ASSERT(groupOffset < fPattern->fFrameSize);
1194 U_ASSERT(groupOffset >= 0);
1195 s = fFrame->fExtra[groupOffset];
1196 e = fFrame->fExtra[groupOffset+1];
1197 }
1198
1199 if (s < 0) {
1200 // A capture group wasn't part of the match
1201 return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1202 }
1203 U_ASSERT(s <= e);
1204 group_len = e - s;
57a6839d 1205
729e4ab9
A
1206 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1207 if (dest)
1208 UTEXT_SETNATIVEINDEX(dest, s);
1209 return dest;
374ca955
A
1210}
1211
729e4ab9
A
1212UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1213 UnicodeString result;
b331163b
A
1214 int64_t groupStart = start64(groupNum, status);
1215 int64_t groupEnd = end64(groupNum, status);
1216 if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) {
729e4ab9
A
1217 return result;
1218 }
57a6839d 1219
b331163b
A
1220 // Get the group length using a utext_extract preflight.
1221 // UText is actually pretty efficient at this when underlying encoding is UTF-16.
1222 int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &status);
1223 if (status != U_BUFFER_OVERFLOW_ERROR) {
1224 return result;
729e4ab9 1225 }
57a6839d 1226
b331163b
A
1227 status = U_ZERO_ERROR;
1228 UChar *buf = result.getBuffer(length);
1229 if (buf == NULL) {
1230 status = U_MEMORY_ALLOCATION_ERROR;
729e4ab9 1231 } else {
b331163b
A
1232 int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status);
1233 result.releaseBuffer(extractLength);
1234 U_ASSERT(length == extractLength);
729e4ab9 1235 }
b331163b 1236 return result;
b75a7d8f
A
1237}
1238
b331163b 1239
729e4ab9
A
1240//--------------------------------------------------------------------------------
1241//
1242// appendGroup() -- currently internal only, appends a group to a UText rather
1243// than replacing its contents
1244//
1245//--------------------------------------------------------------------------------
b75a7d8f 1246
729e4ab9 1247int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
374ca955 1248 if (U_FAILURE(status)) {
729e4ab9 1249 return 0;
374ca955
A
1250 }
1251 if (U_FAILURE(fDeferredStatus)) {
1252 status = fDeferredStatus;
729e4ab9 1253 return 0;
374ca955 1254 }
729e4ab9 1255 int64_t destLen = utext_nativeLength(dest);
57a6839d 1256
729e4ab9
A
1257 if (fMatch == FALSE) {
1258 status = U_REGEX_INVALID_STATE;
1259 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1260 }
1261 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
374ca955 1262 status = U_INDEX_OUTOFBOUNDS_ERROR;
729e4ab9 1263 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
374ca955 1264 }
57a6839d 1265
729e4ab9
A
1266 int64_t s, e;
1267 if (groupNum == 0) {
1268 s = fMatchStart;
1269 e = fMatchEnd;
1270 } else {
1271 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1272 U_ASSERT(groupOffset < fPattern->fFrameSize);
1273 U_ASSERT(groupOffset >= 0);
1274 s = fFrame->fExtra[groupOffset];
1275 e = fFrame->fExtra[groupOffset+1];
1276 }
57a6839d 1277
729e4ab9 1278 if (s < 0) {
57a6839d 1279 // A capture group wasn't part of the match
729e4ab9
A
1280 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1281 }
1282 U_ASSERT(s <= e);
57a6839d 1283
729e4ab9
A
1284 int64_t deltaLen;
1285 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1286 U_ASSERT(e <= fInputLength);
1287 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1288 } else {
1289 int32_t len16;
1290 if (UTEXT_USES_U16(fInputText)) {
1291 len16 = (int32_t)(e-s);
1292 } else {
1293 UErrorCode lengthStatus = U_ZERO_ERROR;
1294 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1295 }
1296 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1297 if (groupChars == NULL) {
1298 status = U_MEMORY_ALLOCATION_ERROR;
1299 return 0;
1300 }
1301 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
57a6839d 1302
729e4ab9
A
1303 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1304 uprv_free(groupChars);
1305 }
1306 return deltaLen;
374ca955
A
1307}
1308
b75a7d8f
A
1309
1310
46f4442e
A
1311//--------------------------------------------------------------------------------
1312//
729e4ab9 1313// groupCount()
46f4442e
A
1314//
1315//--------------------------------------------------------------------------------
729e4ab9
A
1316int32_t RegexMatcher::groupCount() const {
1317 return fPattern->fGroupMap->size();
b75a7d8f
A
1318}
1319
46f4442e
A
1320//--------------------------------------------------------------------------------
1321//
729e4ab9
A
1322// hasAnchoringBounds()
1323//
1324//--------------------------------------------------------------------------------
1325UBool RegexMatcher::hasAnchoringBounds() const {
1326 return fAnchoringBounds;
1327}
1328
1329
1330//--------------------------------------------------------------------------------
1331//
1332// hasTransparentBounds()
1333//
1334//--------------------------------------------------------------------------------
1335UBool RegexMatcher::hasTransparentBounds() const {
1336 return fTransparentBounds;
1337}
1338
1339
1340
1341//--------------------------------------------------------------------------------
1342//
1343// hitEnd()
1344//
1345//--------------------------------------------------------------------------------
1346UBool RegexMatcher::hitEnd() const {
1347 return fHitEnd;
1348}
1349
1350
1351//--------------------------------------------------------------------------------
1352//
1353// input()
1354//
1355//--------------------------------------------------------------------------------
1356const UnicodeString &RegexMatcher::input() const {
1357 if (!fInput) {
1358 UErrorCode status = U_ZERO_ERROR;
1359 int32_t len16;
1360 if (UTEXT_USES_U16(fInputText)) {
1361 len16 = (int32_t)fInputLength;
1362 } else {
1363 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1364 status = U_ZERO_ERROR; // overflow, length status
1365 }
1366 UnicodeString *result = new UnicodeString(len16, 0, 0);
57a6839d 1367
729e4ab9
A
1368 UChar *inputChars = result->getBuffer(len16);
1369 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1370 result->releaseBuffer(len16);
57a6839d 1371
729e4ab9
A
1372 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1373 }
57a6839d 1374
729e4ab9
A
1375 return *fInput;
1376}
1377
1378//--------------------------------------------------------------------------------
1379//
1380// inputText()
1381//
1382//--------------------------------------------------------------------------------
1383UText *RegexMatcher::inputText() const {
1384 return fInputText;
1385}
1386
1387
1388//--------------------------------------------------------------------------------
1389//
1390// getInput() -- like inputText(), but makes a clone or copies into another UText
1391//
1392//--------------------------------------------------------------------------------
1393UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
729e4ab9
A
1394 if (U_FAILURE(status)) {
1395 return dest;
1396 }
1397 if (U_FAILURE(fDeferredStatus)) {
1398 status = fDeferredStatus;
57a6839d 1399 return dest;
729e4ab9 1400 }
57a6839d 1401
729e4ab9
A
1402 if (dest) {
1403 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1404 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1405 } else {
1406 int32_t input16Len;
1407 if (UTEXT_USES_U16(fInputText)) {
1408 input16Len = (int32_t)fInputLength;
1409 } else {
1410 UErrorCode lengthStatus = U_ZERO_ERROR;
1411 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1412 }
1413 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1414 if (inputChars == NULL) {
1415 return dest;
1416 }
57a6839d 1417
729e4ab9
A
1418 status = U_ZERO_ERROR;
1419 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1420 status = U_ZERO_ERROR;
1421 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
57a6839d 1422
729e4ab9
A
1423 uprv_free(inputChars);
1424 }
1425 return dest;
1426 } else {
1427 return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1428 }
1429}
1430
1431
1432static UBool compat_SyncMutableUTextContents(UText *ut);
1433static UBool compat_SyncMutableUTextContents(UText *ut) {
1434 UBool retVal = FALSE;
57a6839d 1435
729e4ab9
A
1436 // In the following test, we're really only interested in whether the UText should switch
1437 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
1438 // will still point to the correct data.
1439 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1440 UnicodeString *us=(UnicodeString *)ut->context;
57a6839d 1441
729e4ab9
A
1442 // Update to the latest length.
1443 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1444 int32_t newLength = us->length();
57a6839d 1445
729e4ab9
A
1446 // Update the chunk description.
1447 // The buffer may have switched between stack- and heap-based.
1448 ut->chunkContents = us->getBuffer();
1449 ut->chunkLength = newLength;
1450 ut->chunkNativeLimit = newLength;
1451 ut->nativeIndexingLimit = newLength;
1452 retVal = TRUE;
1453 }
1454
1455 return retVal;
1456}
1457
1458//--------------------------------------------------------------------------------
1459//
1460// lookingAt()
1461//
1462//--------------------------------------------------------------------------------
1463UBool RegexMatcher::lookingAt(UErrorCode &status) {
1464 if (U_FAILURE(status)) {
1465 return FALSE;
1466 }
1467 if (U_FAILURE(fDeferredStatus)) {
1468 status = fDeferredStatus;
1469 return FALSE;
1470 }
57a6839d 1471
729e4ab9
A
1472 if (fInputUniStrMaybeMutable) {
1473 if (compat_SyncMutableUTextContents(fInputText)) {
1474 fInputLength = utext_nativeLength(fInputText);
1475 reset();
1476 }
1477 }
1478 else {
1479 resetPreserveRegion();
1480 }
1481 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1482 MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1483 } else {
1484 MatchAt(fActiveStart, FALSE, status);
1485 }
1486 return fMatch;
1487}
1488
1489
1490UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1491 if (U_FAILURE(status)) {
1492 return FALSE;
1493 }
1494 if (U_FAILURE(fDeferredStatus)) {
1495 status = fDeferredStatus;
1496 return FALSE;
1497 }
1498 reset();
57a6839d 1499
729e4ab9
A
1500 if (start < 0) {
1501 status = U_INDEX_OUTOFBOUNDS_ERROR;
1502 return FALSE;
1503 }
57a6839d 1504
729e4ab9
A
1505 if (fInputUniStrMaybeMutable) {
1506 if (compat_SyncMutableUTextContents(fInputText)) {
1507 fInputLength = utext_nativeLength(fInputText);
1508 reset();
1509 }
1510 }
1511
1512 int64_t nativeStart;
1513 nativeStart = start;
1514 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1515 status = U_INDEX_OUTOFBOUNDS_ERROR;
1516 return FALSE;
1517 }
57a6839d 1518
729e4ab9
A
1519 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1520 MatchChunkAt((int32_t)nativeStart, FALSE, status);
1521 } else {
1522 MatchAt(nativeStart, FALSE, status);
1523 }
1524 return fMatch;
1525}
1526
1527
1528
1529//--------------------------------------------------------------------------------
1530//
1531// matches()
1532//
1533//--------------------------------------------------------------------------------
1534UBool RegexMatcher::matches(UErrorCode &status) {
1535 if (U_FAILURE(status)) {
1536 return FALSE;
1537 }
1538 if (U_FAILURE(fDeferredStatus)) {
1539 status = fDeferredStatus;
1540 return FALSE;
1541 }
1542
1543 if (fInputUniStrMaybeMutable) {
1544 if (compat_SyncMutableUTextContents(fInputText)) {
1545 fInputLength = utext_nativeLength(fInputText);
1546 reset();
1547 }
1548 }
1549 else {
1550 resetPreserveRegion();
1551 }
1552
1553 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1554 MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1555 } else {
1556 MatchAt(fActiveStart, TRUE, status);
1557 }
1558 return fMatch;
1559}
1560
1561
1562UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1563 if (U_FAILURE(status)) {
1564 return FALSE;
1565 }
1566 if (U_FAILURE(fDeferredStatus)) {
1567 status = fDeferredStatus;
1568 return FALSE;
1569 }
1570 reset();
57a6839d 1571
729e4ab9
A
1572 if (start < 0) {
1573 status = U_INDEX_OUTOFBOUNDS_ERROR;
1574 return FALSE;
1575 }
1576
1577 if (fInputUniStrMaybeMutable) {
1578 if (compat_SyncMutableUTextContents(fInputText)) {
1579 fInputLength = utext_nativeLength(fInputText);
1580 reset();
1581 }
1582 }
1583
1584 int64_t nativeStart;
1585 nativeStart = start;
1586 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1587 status = U_INDEX_OUTOFBOUNDS_ERROR;
1588 return FALSE;
1589 }
1590
1591 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1592 MatchChunkAt((int32_t)nativeStart, TRUE, status);
1593 } else {
1594 MatchAt(nativeStart, TRUE, status);
1595 }
1596 return fMatch;
1597}
1598
1599
1600
1601//--------------------------------------------------------------------------------
1602//
1603// pattern
1604//
1605//--------------------------------------------------------------------------------
1606const RegexPattern &RegexMatcher::pattern() const {
1607 return *fPattern;
1608}
1609
1610
1611
1612//--------------------------------------------------------------------------------
1613//
1614// region
46f4442e
A
1615//
1616//--------------------------------------------------------------------------------
729e4ab9 1617RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
46f4442e
A
1618 if (U_FAILURE(status)) {
1619 return *this;
1620 }
57a6839d 1621
729e4ab9 1622 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
46f4442e
A
1623 status = U_ILLEGAL_ARGUMENT_ERROR;
1624 }
57a6839d 1625
729e4ab9
A
1626 int64_t nativeStart = regionStart;
1627 int64_t nativeLimit = regionLimit;
1628 if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1629 status = U_ILLEGAL_ARGUMENT_ERROR;
1630 }
1631
1632 if (startIndex == -1)
1633 this->reset();
1634 else
57a6839d
A
1635 resetPreserveRegion();
1636
729e4ab9
A
1637 fRegionStart = nativeStart;
1638 fRegionLimit = nativeLimit;
1639 fActiveStart = nativeStart;
1640 fActiveLimit = nativeLimit;
1641
1642 if (startIndex != -1) {
1643 if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1644 status = U_INDEX_OUTOFBOUNDS_ERROR;
1645 }
57a6839d 1646 fMatchEnd = startIndex;
729e4ab9
A
1647 }
1648
46f4442e 1649 if (!fTransparentBounds) {
729e4ab9
A
1650 fLookStart = nativeStart;
1651 fLookLimit = nativeLimit;
46f4442e
A
1652 }
1653 if (fAnchoringBounds) {
729e4ab9
A
1654 fAnchorStart = nativeStart;
1655 fAnchorLimit = nativeLimit;
46f4442e
A
1656 }
1657 return *this;
1658}
1659
729e4ab9
A
1660RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1661 return region(start, limit, -1, status);
1662}
46f4442e
A
1663
1664//--------------------------------------------------------------------------------
1665//
1666// regionEnd
1667//
1668//--------------------------------------------------------------------------------
1669int32_t RegexMatcher::regionEnd() const {
729e4ab9 1670 return (int32_t)fRegionLimit;
46f4442e
A
1671}
1672
729e4ab9
A
1673int64_t RegexMatcher::regionEnd64() const {
1674 return fRegionLimit;
1675}
46f4442e
A
1676
1677//--------------------------------------------------------------------------------
1678//
1679// regionStart
1680//
1681//--------------------------------------------------------------------------------
1682int32_t RegexMatcher::regionStart() const {
729e4ab9
A
1683 return (int32_t)fRegionStart;
1684}
1685
1686int64_t RegexMatcher::regionStart64() const {
46f4442e
A
1687 return fRegionStart;
1688}
1689
1690
b75a7d8f
A
1691//--------------------------------------------------------------------------------
1692//
1693// replaceAll
1694//
1695//--------------------------------------------------------------------------------
1696UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
729e4ab9
A
1697 UText replacementText = UTEXT_INITIALIZER;
1698 UText resultText = UTEXT_INITIALIZER;
1699 UnicodeString resultString;
1700 if (U_FAILURE(status)) {
1701 return resultString;
1702 }
57a6839d 1703
729e4ab9
A
1704 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1705 utext_openUnicodeString(&resultText, &resultString, &status);
57a6839d 1706
729e4ab9
A
1707 replaceAll(&replacementText, &resultText, status);
1708
1709 utext_close(&resultText);
1710 utext_close(&replacementText);
57a6839d 1711
729e4ab9
A
1712 return resultString;
1713}
1714
1715
1716//
1717// replaceAll, UText mode
1718//
1719UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
b75a7d8f 1720 if (U_FAILURE(status)) {
729e4ab9 1721 return dest;
b75a7d8f
A
1722 }
1723 if (U_FAILURE(fDeferredStatus)) {
1724 status = fDeferredStatus;
729e4ab9 1725 return dest;
b75a7d8f 1726 }
57a6839d 1727
729e4ab9
A
1728 if (dest == NULL) {
1729 UnicodeString emptyString;
1730 UText empty = UTEXT_INITIALIZER;
57a6839d 1731
729e4ab9
A
1732 utext_openUnicodeString(&empty, &emptyString, &status);
1733 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1734 utext_close(&empty);
1735 }
1736
1737 if (U_SUCCESS(status)) {
1738 reset();
1739 while (find()) {
1740 appendReplacement(dest, replacement, status);
1741 if (U_FAILURE(status)) {
1742 break;
1743 }
b75a7d8f 1744 }
729e4ab9 1745 appendTail(dest, status);
b75a7d8f 1746 }
57a6839d 1747
729e4ab9 1748 return dest;
b75a7d8f
A
1749}
1750
1751
b75a7d8f
A
1752//--------------------------------------------------------------------------------
1753//
1754// replaceFirst
1755//
1756//--------------------------------------------------------------------------------
1757UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
729e4ab9
A
1758 UText replacementText = UTEXT_INITIALIZER;
1759 UText resultText = UTEXT_INITIALIZER;
1760 UnicodeString resultString;
57a6839d 1761
729e4ab9
A
1762 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1763 utext_openUnicodeString(&resultText, &resultString, &status);
57a6839d 1764
729e4ab9 1765 replaceFirst(&replacementText, &resultText, status);
57a6839d 1766
729e4ab9
A
1767 utext_close(&resultText);
1768 utext_close(&replacementText);
57a6839d 1769
729e4ab9
A
1770 return resultString;
1771}
1772
1773//
1774// replaceFirst, UText mode
1775//
1776UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
b75a7d8f 1777 if (U_FAILURE(status)) {
729e4ab9 1778 return dest;
b75a7d8f
A
1779 }
1780 if (U_FAILURE(fDeferredStatus)) {
1781 status = fDeferredStatus;
729e4ab9 1782 return dest;
b75a7d8f
A
1783 }
1784
1785 reset();
1786 if (!find()) {
729e4ab9 1787 return getInput(dest, status);
b75a7d8f 1788 }
57a6839d 1789
729e4ab9
A
1790 if (dest == NULL) {
1791 UnicodeString emptyString;
1792 UText empty = UTEXT_INITIALIZER;
57a6839d 1793
729e4ab9
A
1794 utext_openUnicodeString(&empty, &emptyString, &status);
1795 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1796 utext_close(&empty);
1797 }
57a6839d 1798
729e4ab9
A
1799 appendReplacement(dest, replacement, status);
1800 appendTail(dest, status);
57a6839d 1801
729e4ab9 1802 return dest;
b75a7d8f
A
1803}
1804
1805
46f4442e
A
1806//--------------------------------------------------------------------------------
1807//
1808// requireEnd
1809//
1810//--------------------------------------------------------------------------------
1811UBool RegexMatcher::requireEnd() const {
1812 return fRequireEnd;
1813}
1814
b75a7d8f
A
1815
1816//--------------------------------------------------------------------------------
1817//
1818// reset
1819//
1820//--------------------------------------------------------------------------------
1821RegexMatcher &RegexMatcher::reset() {
46f4442e 1822 fRegionStart = 0;
729e4ab9 1823 fRegionLimit = fInputLength;
46f4442e 1824 fActiveStart = 0;
729e4ab9 1825 fActiveLimit = fInputLength;
46f4442e 1826 fAnchorStart = 0;
729e4ab9 1827 fAnchorLimit = fInputLength;
46f4442e 1828 fLookStart = 0;
729e4ab9 1829 fLookLimit = fInputLength;
46f4442e
A
1830 resetPreserveRegion();
1831 return *this;
1832}
1833
1834
1835
1836void RegexMatcher::resetPreserveRegion() {
374ca955
A
1837 fMatchStart = 0;
1838 fMatchEnd = 0;
1839 fLastMatchEnd = -1;
46f4442e 1840 fAppendPosition = 0;
374ca955 1841 fMatch = FALSE;
46f4442e
A
1842 fHitEnd = FALSE;
1843 fRequireEnd = FALSE;
1844 fTime = 0;
1845 fTickCounter = TIMER_INITIAL_VALUE;
729e4ab9 1846 //resetStack(); // more expensive than it looks...
b75a7d8f
A
1847}
1848
1849
b75a7d8f 1850RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
729e4ab9
A
1851 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1852 if (fPattern->fNeedsAltInput) {
1853 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1854 }
b331163b
A
1855 if (U_FAILURE(fDeferredStatus)) {
1856 return *this;
1857 }
729e4ab9 1858 fInputLength = utext_nativeLength(fInputText);
57a6839d 1859
b75a7d8f 1860 reset();
729e4ab9
A
1861 delete fInput;
1862 fInput = NULL;
1863
1864 // Do the following for any UnicodeString.
1865 // This is for compatibility for those clients who modify the input string "live" during regex operations.
57a6839d
A
1866 fInputUniStrMaybeMutable = TRUE;
1867
374ca955 1868 if (fWordBreakItr != NULL) {
729e4ab9
A
1869#if UCONFIG_NO_BREAK_ITERATION==0
1870 UErrorCode status = U_ZERO_ERROR;
1871 fWordBreakItr->setText(fInputText, status);
1872#endif
374ca955 1873 }
b75a7d8f
A
1874 return *this;
1875}
1876
b75a7d8f 1877
729e4ab9
A
1878RegexMatcher &RegexMatcher::reset(UText *input) {
1879 if (fInputText != input) {
1880 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1881 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
b331163b
A
1882 if (U_FAILURE(fDeferredStatus)) {
1883 return *this;
1884 }
729e4ab9 1885 fInputLength = utext_nativeLength(fInputText);
57a6839d 1886
729e4ab9
A
1887 delete fInput;
1888 fInput = NULL;
57a6839d 1889
729e4ab9
A
1890 if (fWordBreakItr != NULL) {
1891#if UCONFIG_NO_BREAK_ITERATION==0
1892 UErrorCode status = U_ZERO_ERROR;
1893 fWordBreakItr->setText(input, status);
1894#endif
1895 }
1896 }
1897 reset();
1898 fInputUniStrMaybeMutable = FALSE;
1899
1900 return *this;
1901}
1902
1903/*RegexMatcher &RegexMatcher::reset(const UChar *) {
1904 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1905 return *this;
1906}*/
1907
1908RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1909 if (U_FAILURE(status)) {
374ca955 1910 return *this;
b75a7d8f 1911 }
46f4442e 1912 reset(); // Reset also resets the region to be the entire string.
57a6839d 1913
729e4ab9 1914 if (position < 0 || position > fActiveLimit) {
374ca955
A
1915 status = U_INDEX_OUTOFBOUNDS_ERROR;
1916 return *this;
1917 }
1918 fMatchEnd = position;
1919 return *this;
b75a7d8f
A
1920}
1921
1922
4388f060
A
1923//--------------------------------------------------------------------------------
1924//
1925// refresh
1926//
1927//--------------------------------------------------------------------------------
1928RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1929 if (U_FAILURE(status)) {
1930 return *this;
1931 }
1932 if (input == NULL) {
1933 status = U_ILLEGAL_ARGUMENT_ERROR;
1934 return *this;
1935 }
1936 if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1937 status = U_ILLEGAL_ARGUMENT_ERROR;
1938 return *this;
1939 }
1940 int64_t pos = utext_getNativeIndex(fInputText);
1941 // Shallow read-only clone of the new UText into the existing input UText
1942 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1943 if (U_FAILURE(status)) {
1944 return *this;
1945 }
1946 utext_setNativeIndex(fInputText, pos);
1947
1948 if (fAltInputText != NULL) {
1949 pos = utext_getNativeIndex(fAltInputText);
1950 fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
1951 if (U_FAILURE(status)) {
1952 return *this;
1953 }
1954 utext_setNativeIndex(fAltInputText, pos);
1955 }
1956 return *this;
1957}
b75a7d8f 1958
374ca955
A
1959
1960
b75a7d8f
A
1961//--------------------------------------------------------------------------------
1962//
1963// setTrace
1964//
1965//--------------------------------------------------------------------------------
1966void RegexMatcher::setTrace(UBool state) {
1967 fTraceDebug = state;
1968}
1969
1970
1971
b331163b
A
1972/**
1973 * UText, replace entire contents of the destination UText with a substring of the source UText.
1974 *
1975 * @param src The source UText
1976 * @param dest The destination UText. Must be writable.
1977 * May be NULL, in which case a new UText will be allocated.
1978 * @param start Start index of source substring.
1979 * @param limit Limit index of source substring.
1980 * @param status An error code.
1981 */
1982static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) {
1983 if (U_FAILURE(*status)) {
1984 return dest;
1985 }
1986 if (start == limit) {
1987 if (dest) {
1988 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status);
1989 return dest;
1990 } else {
1991 return utext_openUChars(NULL, NULL, 0, status);
1992 }
1993 }
1994 int32_t length = utext_extract(src, start, limit, NULL, 0, status);
1995 if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
1996 return dest;
1997 }
1998 *status = U_ZERO_ERROR;
1999 MaybeStackArray<UChar, 40> buffer;
2000 if (length >= buffer.getCapacity()) {
2001 UChar *newBuf = buffer.resize(length+1); // Leave space for terminating Nul.
2002 if (newBuf == NULL) {
2003 *status = U_MEMORY_ALLOCATION_ERROR;
2004 }
2005 }
2006 utext_extract(src, start, limit, buffer.getAlias(), length+1, status);
2007 if (dest) {
2008 utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status);
2009 return dest;
2010 }
2011
2012 // Caller did not provide a prexisting UText.
2013 // Open a new one, and have it adopt the text buffer storage.
2014 if (U_FAILURE(*status)) {
2015 return NULL;
2016 }
2017 int32_t ownedLength = 0;
2018 UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);
2019 if (ownedBuf == NULL) {
2020 *status = U_MEMORY_ALLOCATION_ERROR;
2021 return NULL;
2022 }
2023 UText *result = utext_openUChars(NULL, ownedBuf, length, status);
2024 if (U_FAILURE(*status)) {
2025 uprv_free(ownedBuf);
2026 return NULL;
2027 }
2028 result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT);
2029 return result;
2030}
2031
2032
b75a7d8f
A
2033//---------------------------------------------------------------------
2034//
2035// split
2036//
2037//---------------------------------------------------------------------
2038int32_t RegexMatcher::split(const UnicodeString &input,
2039 UnicodeString dest[],
2040 int32_t destCapacity,
729e4ab9
A
2041 UErrorCode &status)
2042{
2043 UText inputText = UTEXT_INITIALIZER;
2044 utext_openConstUnicodeString(&inputText, &input, &status);
2045 if (U_FAILURE(status)) {
2046 return 0;
2047 }
2048
2049 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2050 if (destText == NULL) {
2051 status = U_MEMORY_ALLOCATION_ERROR;
2052 return 0;
2053 }
2054 int32_t i;
2055 for (i = 0; i < destCapacity; i++) {
2056 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2057 }
57a6839d 2058
729e4ab9 2059 int32_t fieldCount = split(&inputText, destText, destCapacity, status);
57a6839d 2060
729e4ab9
A
2061 for (i = 0; i < destCapacity; i++) {
2062 utext_close(destText[i]);
2063 }
2064
2065 uprv_free(destText);
2066 utext_close(&inputText);
2067 return fieldCount;
2068}
2069
2070//
2071// split, UText mode
2072//
2073int32_t RegexMatcher::split(UText *input,
2074 UText *dest[],
2075 int32_t destCapacity,
2076 UErrorCode &status)
b75a7d8f
A
2077{
2078 //
2079 // Check arguements for validity
2080 //
2081 if (U_FAILURE(status)) {
2082 return 0;
2083 };
2084
2085 if (destCapacity < 1) {
2086 status = U_ILLEGAL_ARGUMENT_ERROR;
2087 return 0;
2088 }
2089
b75a7d8f
A
2090 //
2091 // Reset for the input text
2092 //
2093 reset(input);
729e4ab9 2094 int64_t nextOutputStringStart = 0;
46f4442e 2095 if (fActiveLimit == 0) {
b75a7d8f
A
2096 return 0;
2097 }
2098
b75a7d8f
A
2099 //
2100 // Loop through the input text, searching for the delimiter pattern
2101 //
73c04bcf 2102 int32_t i;
b75a7d8f
A
2103 int32_t numCaptureGroups = fPattern->fGroupMap->size();
2104 for (i=0; ; i++) {
2105 if (i>=destCapacity-1) {
2106 // There is one or zero output string left.
2107 // Fill the last output string with whatever is left from the input, then exit the loop.
729e4ab9 2108 // ( i will be == destCapacity if we filled the output array while processing
b75a7d8f
A
2109 // capture groups of the delimiter expression, in which case we will discard the
2110 // last capture group saved in favor of the unprocessed remainder of the
2111 // input string.)
2112 i = destCapacity-1;
729e4ab9
A
2113 if (fActiveLimit > nextOutputStringStart) {
2114 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2115 if (dest[i]) {
57a6839d
A
2116 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2117 input->chunkContents+nextOutputStringStart,
729e4ab9
A
2118 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2119 } else {
2120 UText remainingText = UTEXT_INITIALIZER;
57a6839d 2121 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
729e4ab9
A
2122 fActiveLimit-nextOutputStringStart, &status);
2123 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2124 utext_close(&remainingText);
2125 }
2126 } else {
2127 UErrorCode lengthStatus = U_ZERO_ERROR;
57a6839d 2128 int32_t remaining16Length =
729e4ab9
A
2129 utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2130 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2131 if (remainingChars == NULL) {
2132 status = U_MEMORY_ALLOCATION_ERROR;
2133 break;
2134 }
2135
2136 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2137 if (dest[i]) {
2138 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2139 } else {
2140 UText remainingText = UTEXT_INITIALIZER;
2141 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2142 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2143 utext_close(&remainingText);
2144 }
57a6839d 2145
729e4ab9
A
2146 uprv_free(remainingChars);
2147 }
b75a7d8f
A
2148 }
2149 break;
2150 }
2151 if (find()) {
2152 // We found another delimiter. Move everything from where we started looking
2153 // up until the start of the delimiter into the next output string.
729e4ab9
A
2154 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2155 if (dest[i]) {
57a6839d
A
2156 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2157 input->chunkContents+nextOutputStringStart,
729e4ab9
A
2158 (int32_t)(fMatchStart-nextOutputStringStart), &status);
2159 } else {
2160 UText remainingText = UTEXT_INITIALIZER;
57a6839d 2161 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
729e4ab9
A
2162 fMatchStart-nextOutputStringStart, &status);
2163 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2164 utext_close(&remainingText);
2165 }
2166 } else {
2167 UErrorCode lengthStatus = U_ZERO_ERROR;
2168 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2169 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2170 if (remainingChars == NULL) {
2171 status = U_MEMORY_ALLOCATION_ERROR;
2172 break;
2173 }
2174 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2175 if (dest[i]) {
2176 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2177 } else {
2178 UText remainingText = UTEXT_INITIALIZER;
2179 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2180 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2181 utext_close(&remainingText);
2182 }
57a6839d 2183
729e4ab9
A
2184 uprv_free(remainingChars);
2185 }
b75a7d8f
A
2186 nextOutputStringStart = fMatchEnd;
2187
2188 // If the delimiter pattern has capturing parentheses, the captured
2189 // text goes out into the next n destination strings.
2190 int32_t groupNum;
2191 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
4388f060
A
2192 if (i >= destCapacity-2) {
2193 // Never fill the last available output string with capture group text.
2194 // It will filled with the last field, the remainder of the
2195 // unsplit input text.
b75a7d8f
A
2196 break;
2197 }
2198 i++;
0f5d89e8 2199 dest[i] = utext_extract_replace(fInputText, dest[i],
b331163b 2200 start64(groupNum, status), end64(groupNum, status), &status);
b75a7d8f
A
2201 }
2202
46f4442e 2203 if (nextOutputStringStart == fActiveLimit) {
4388f060
A
2204 // The delimiter was at the end of the string. We're done, but first
2205 // we output one last empty string, for the empty field following
2206 // the delimiter at the end of input.
2207 if (i+1 < destCapacity) {
2208 ++i;
2209 if (dest[i] == NULL) {
2210 dest[i] = utext_openUChars(NULL, NULL, 0, &status);
2211 } else {
0f5d89e8 2212 static const UChar emptyString[] = {(UChar)0};
4388f060
A
2213 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2214 }
729e4ab9 2215 }
4388f060 2216 break;
57a6839d
A
2217
2218 }
b75a7d8f
A
2219 }
2220 else
2221 {
2222 // We ran off the end of the input while looking for the next delimiter.
2223 // All the remaining text goes into the current output string.
729e4ab9
A
2224 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2225 if (dest[i]) {
57a6839d
A
2226 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2227 input->chunkContents+nextOutputStringStart,
729e4ab9
A
2228 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2229 } else {
2230 UText remainingText = UTEXT_INITIALIZER;
57a6839d 2231 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
729e4ab9
A
2232 fActiveLimit-nextOutputStringStart, &status);
2233 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2234 utext_close(&remainingText);
2235 }
2236 } else {
2237 UErrorCode lengthStatus = U_ZERO_ERROR;
2238 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2239 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2240 if (remainingChars == NULL) {
2241 status = U_MEMORY_ALLOCATION_ERROR;
2242 break;
2243 }
57a6839d 2244
729e4ab9
A
2245 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2246 if (dest[i]) {
2247 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2248 } else {
2249 UText remainingText = UTEXT_INITIALIZER;
2250 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2251 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2252 utext_close(&remainingText);
2253 }
57a6839d 2254
729e4ab9
A
2255 uprv_free(remainingChars);
2256 }
b75a7d8f
A
2257 break;
2258 }
729e4ab9
A
2259 if (U_FAILURE(status)) {
2260 break;
2261 }
2262 } // end of for loop
b75a7d8f
A
2263 return i+1;
2264}
2265
2266
b75a7d8f
A
2267//--------------------------------------------------------------------------------
2268//
2269// start
2270//
2271//--------------------------------------------------------------------------------
2272int32_t RegexMatcher::start(UErrorCode &status) const {
2273 return start(0, status);
2274}
2275
729e4ab9
A
2276int64_t RegexMatcher::start64(UErrorCode &status) const {
2277 return start64(0, status);
2278}
b75a7d8f 2279
46f4442e
A
2280//--------------------------------------------------------------------------------
2281//
2282// start(int32_t group, UErrorCode &status)
2283//
2284//--------------------------------------------------------------------------------
729e4ab9
A
2285
2286int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
b75a7d8f
A
2287 if (U_FAILURE(status)) {
2288 return -1;
2289 }
2290 if (U_FAILURE(fDeferredStatus)) {
2291 status = fDeferredStatus;
2292 return -1;
2293 }
2294 if (fMatch == FALSE) {
2295 status = U_REGEX_INVALID_STATE;
2296 return -1;
2297 }
2298 if (group < 0 || group > fPattern->fGroupMap->size()) {
2299 status = U_INDEX_OUTOFBOUNDS_ERROR;
2300 return -1;
2301 }
729e4ab9 2302 int64_t s;
b75a7d8f 2303 if (group == 0) {
57a6839d 2304 s = fMatchStart;
b75a7d8f
A
2305 } else {
2306 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2307 U_ASSERT(groupOffset < fPattern->fFrameSize);
2308 U_ASSERT(groupOffset >= 0);
2309 s = fFrame->fExtra[groupOffset];
2310 }
57a6839d 2311
b75a7d8f
A
2312 return s;
2313}
2314
2315
729e4ab9
A
2316int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2317 return (int32_t)start64(group, status);
2318}
b75a7d8f 2319
46f4442e
A
2320//--------------------------------------------------------------------------------
2321//
2322// useAnchoringBounds
2323//
2324//--------------------------------------------------------------------------------
2325RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2326 fAnchoringBounds = b;
729e4ab9
A
2327 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2328 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
46f4442e
A
2329 return *this;
2330}
2331
2332
2333//--------------------------------------------------------------------------------
2334//
2335// useTransparentBounds
2336//
2337//--------------------------------------------------------------------------------
2338RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2339 fTransparentBounds = b;
729e4ab9
A
2340 fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2341 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
46f4442e
A
2342 return *this;
2343}
2344
2345//--------------------------------------------------------------------------------
2346//
2347// setTimeLimit
2348//
2349//--------------------------------------------------------------------------------
2350void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2351 if (U_FAILURE(status)) {
2352 return;
2353 }
2354 if (U_FAILURE(fDeferredStatus)) {
2355 status = fDeferredStatus;
2356 return;
2357 }
2358 if (limit < 0) {
2359 status = U_ILLEGAL_ARGUMENT_ERROR;
2360 return;
2361 }
2362 fTimeLimit = limit;
2363}
2364
2365
2366//--------------------------------------------------------------------------------
2367//
2368// getTimeLimit
2369//
2370//--------------------------------------------------------------------------------
2371int32_t RegexMatcher::getTimeLimit() const {
2372 return fTimeLimit;
2373}
2374
2375
2376//--------------------------------------------------------------------------------
2377//
2378// setStackLimit
2379//
2380//--------------------------------------------------------------------------------
2381void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2382 if (U_FAILURE(status)) {
2383 return;
2384 }
2385 if (U_FAILURE(fDeferredStatus)) {
2386 status = fDeferredStatus;
2387 return;
2388 }
2389 if (limit < 0) {
2390 status = U_ILLEGAL_ARGUMENT_ERROR;
2391 return;
2392 }
57a6839d 2393
46f4442e 2394 // Reset the matcher. This is needed here in case there is a current match
57a6839d 2395 // whose final stack frame (containing the match results, pointed to by fFrame)
46f4442e
A
2396 // would be lost by resizing to a smaller stack size.
2397 reset();
57a6839d 2398
46f4442e
A
2399 if (limit == 0) {
2400 // Unlimited stack expansion
2401 fStack->setMaxCapacity(0);
2402 } else {
2403 // Change the units of the limit from bytes to ints, and bump the size up
57a6839d 2404 // to be big enough to hold at least one stack frame for the pattern,
46f4442e
A
2405 // if it isn't there already.
2406 int32_t adjustedLimit = limit / sizeof(int32_t);
2407 if (adjustedLimit < fPattern->fFrameSize) {
2408 adjustedLimit = fPattern->fFrameSize;
2409 }
2410 fStack->setMaxCapacity(adjustedLimit);
2411 }
2412 fStackLimit = limit;
2413}
2414
2415
2416//--------------------------------------------------------------------------------
2417//
2418// getStackLimit
2419//
2420//--------------------------------------------------------------------------------
2421int32_t RegexMatcher::getStackLimit() const {
2422 return fStackLimit;
2423}
2424
2425
2426//--------------------------------------------------------------------------------
2427//
2428// setMatchCallback
2429//
2430//--------------------------------------------------------------------------------
2431void RegexMatcher::setMatchCallback(URegexMatchCallback *callback,
2432 const void *context,
2433 UErrorCode &status) {
729e4ab9
A
2434 if (U_FAILURE(status)) {
2435 return;
2436 }
2437 fCallbackFn = callback;
2438 fCallbackContext = context;
46f4442e
A
2439}
2440
2441
2442//--------------------------------------------------------------------------------
2443//
2444// getMatchCallback
2445//
2446//--------------------------------------------------------------------------------
2447void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback,
2448 const void *&context,
2449 UErrorCode &status) {
2450 if (U_FAILURE(status)) {
2451 return;
2452 }
2453 callback = fCallbackFn;
2454 context = fCallbackContext;
2455}
2456
2457
729e4ab9
A
2458//--------------------------------------------------------------------------------
2459//
2460// setMatchCallback
2461//
2462//--------------------------------------------------------------------------------
2463void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback,
2464 const void *context,
2465 UErrorCode &status) {
2466 if (U_FAILURE(status)) {
2467 return;
2468 }
2469 fFindProgressCallbackFn = callback;
2470 fFindProgressCallbackContext = context;
2471}
2472
2473
2474//--------------------------------------------------------------------------------
2475//
2476// getMatchCallback
2477//
2478//--------------------------------------------------------------------------------
2479void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback,
2480 const void *&context,
2481 UErrorCode &status) {
2482 if (U_FAILURE(status)) {
2483 return;
2484 }
2485 callback = fFindProgressCallbackFn;
2486 context = fFindProgressCallbackContext;
2487}
2488
2489
374ca955
A
2490//================================================================================
2491//
2492// Code following this point in this file is the internal
2493// Match Engine Implementation.
2494//
2495//================================================================================
2496
2497
2498//--------------------------------------------------------------------------------
2499//
2500// resetStack
2501// Discard any previous contents of the state save stack, and initialize a
57a6839d 2502// new stack frame to all -1. The -1s are needed for capture group limits,
374ca955
A
2503// where they indicate that a group has not yet matched anything.
2504//--------------------------------------------------------------------------------
2505REStackFrame *RegexMatcher::resetStack() {
2506 // Discard any previous contents of the state save stack, and initialize a
729e4ab9
A
2507 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2508 // where they indicate that a group has not yet matched anything.
374ca955
A
2509 fStack->removeAllElements();
2510
729e4ab9 2511 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2ca993e8
A
2512 if(U_FAILURE(fDeferredStatus)) {
2513 return NULL;
2514 }
2515
729e4ab9
A
2516 int32_t i;
2517 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2518 iFrame->fExtra[i] = -1;
2519 }
2520 return iFrame;
2521}
2522
2523
2524
2525//--------------------------------------------------------------------------------
2526//
57a6839d 2527// isWordBoundary
729e4ab9
A
2528// in perl, "xab..cd..", \b is true at positions 0,3,5,7
2529// For us,
2530// If the current char is a combining mark,
2531// \b is FALSE.
2532// Else Scan backwards to the first non-combining char.
2533// We are at a boundary if the this char and the original chars are
2534// opposite in membership in \w set
2535//
2536// parameters: pos - the current position in the input buffer
2537//
2538// TODO: double-check edge cases at region boundaries.
2539//
2540//--------------------------------------------------------------------------------
2541UBool RegexMatcher::isWordBoundary(int64_t pos) {
2542 UBool isBoundary = FALSE;
2543 UBool cIsWord = FALSE;
57a6839d 2544
729e4ab9
A
2545 if (pos >= fLookLimit) {
2546 fHitEnd = TRUE;
2547 } else {
2548 // Determine whether char c at current position is a member of the word set of chars.
2549 // If we're off the end of the string, behave as though we're not at a word char.
2550 UTEXT_SETNATIVEINDEX(fInputText, pos);
2551 UChar32 c = UTEXT_CURRENT32(fInputText);
2552 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2553 // Current char is a combining one. Not a boundary.
2554 return FALSE;
2555 }
2556 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2557 }
57a6839d 2558
729e4ab9
A
2559 // Back up until we come to a non-combining char, determine whether
2560 // that char is a word char.
2561 UBool prevCIsWord = FALSE;
2562 for (;;) {
2563 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2564 break;
2565 }
2566 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2567 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2568 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2569 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2570 break;
2571 }
2572 }
2573 isBoundary = cIsWord ^ prevCIsWord;
2574 return isBoundary;
2575}
2576
2577UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2578 UBool isBoundary = FALSE;
2579 UBool cIsWord = FALSE;
57a6839d 2580
729e4ab9 2581 const UChar *inputBuf = fInputText->chunkContents;
57a6839d 2582
729e4ab9
A
2583 if (pos >= fLookLimit) {
2584 fHitEnd = TRUE;
2585 } else {
2586 // Determine whether char c at current position is a member of the word set of chars.
2587 // If we're off the end of the string, behave as though we're not at a word char.
2588 UChar32 c;
2589 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2590 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2591 // Current char is a combining one. Not a boundary.
2592 return FALSE;
2593 }
2594 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2595 }
57a6839d 2596
729e4ab9
A
2597 // Back up until we come to a non-combining char, determine whether
2598 // that char is a word char.
2599 UBool prevCIsWord = FALSE;
2600 for (;;) {
2601 if (pos <= fLookStart) {
2602 break;
2603 }
2604 UChar32 prevChar;
2605 U16_PREV(inputBuf, fLookStart, pos, prevChar);
2606 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2607 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2608 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2609 break;
2610 }
2611 }
2612 isBoundary = cIsWord ^ prevCIsWord;
2613 return isBoundary;
2614}
2615
2616//--------------------------------------------------------------------------------
2617//
57a6839d 2618// isUWordBoundary
729e4ab9
A
2619//
2620// Test for a word boundary using RBBI word break.
2621//
2622// parameters: pos - the current position in the input buffer
2623//
2624//--------------------------------------------------------------------------------
2625UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2626 UBool returnVal = FALSE;
2627#if UCONFIG_NO_BREAK_ITERATION==0
57a6839d 2628
729e4ab9
A
2629 // If we haven't yet created a break iterator for this matcher, do it now.
2630 if (fWordBreakItr == NULL) {
57a6839d 2631 fWordBreakItr =
729e4ab9
A
2632 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
2633 if (U_FAILURE(fDeferredStatus)) {
2634 return FALSE;
2635 }
2636 fWordBreakItr->setText(fInputText, fDeferredStatus);
2637 }
2638
2639 if (pos >= fLookLimit) {
2640 fHitEnd = TRUE;
2641 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real"
2642 // words are not boundaries. All non-word chars stand by themselves,
2643 // with word boundaries on both sides.
2644 } else {
2645 if (!UTEXT_USES_U16(fInputText)) {
2646 // !!!: Would like a better way to do this!
2647 UErrorCode status = U_ZERO_ERROR;
2648 pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
2649 }
2650 returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2651 }
2652#endif
2653 return returnVal;
2654}
2655
2656//--------------------------------------------------------------------------------
2657//
2658// IncrementTime This function is called once each TIMER_INITIAL_VALUE state
2659// saves. Increment the "time" counter, and call the
2660// user callback function if there is one installed.
2661//
2662// If the match operation needs to be aborted, either for a time-out
2663// or because the user callback asked for it, just set an error status.
2664// The engine will pick that up and stop in its outer loop.
2665//
2666//--------------------------------------------------------------------------------
2667void RegexMatcher::IncrementTime(UErrorCode &status) {
2668 fTickCounter = TIMER_INITIAL_VALUE;
2669 fTime++;
2670 if (fCallbackFn != NULL) {
2671 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2672 status = U_REGEX_STOPPED_BY_CALLER;
2673 return;
2674 }
2675 }
2676 if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2677 status = U_REGEX_TIME_OUT;
2678 }
2679}
2680
729e4ab9
A
2681//--------------------------------------------------------------------------------
2682//
2683// StateSave
2684// Make a new stack frame, initialized as a copy of the current stack frame.
2685// Set the pattern index in the original stack frame from the operand value
2686// in the opcode. Execution of the engine continues with the state in
2687// the newly created stack frame
2688//
2689// Note that reserveBlock() may grow the stack, resulting in the
2690// whole thing being relocated in memory.
2691//
2692// Parameters:
57a6839d 2693// fp The top frame pointer when called. At return, a new
729e4ab9
A
2694// fame will be present
2695// savePatIdx An index into the compiled pattern. Goes into the original
2696// (not new) frame. If execution ever back-tracks out of the
2697// new frame, this will be where we continue from in the pattern.
2698// Return
2699// The new frame pointer.
2700//
2701//--------------------------------------------------------------------------------
2702inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2ca993e8
A
2703 if (U_FAILURE(status)) {
2704 return fp;
2705 }
57a6839d 2706 // push storage for a new frame.
729e4ab9 2707 int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2ca993e8 2708 if (U_FAILURE(status)) {
729e4ab9
A
2709 // Failure on attempted stack expansion.
2710 // Stack function set some other error code, change it to a more
2711 // specific one for regular expressions.
2712 status = U_REGEX_STACK_OVERFLOW;
2713 // We need to return a writable stack frame, so just return the
2714 // previous frame. The match operation will stop quickly
2715 // because of the error status, after which the frame will never
2716 // be looked at again.
2717 return fp;
2718 }
2719 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack.
57a6839d 2720
729e4ab9
A
2721 // New stack frame = copy of old top frame.
2722 int64_t *source = (int64_t *)fp;
2723 int64_t *dest = newFP;
2724 for (;;) {
2725 *dest++ = *source++;
2726 if (source == newFP) {
2727 break;
2728 }
2729 }
57a6839d 2730
729e4ab9
A
2731 fTickCounter--;
2732 if (fTickCounter <= 0) {
2733 IncrementTime(status); // Re-initializes fTickCounter
2734 }
2735 fp->fPatIdx = savePatIdx;
2736 return (REStackFrame *)newFP;
2737}
2738
2ca993e8
A
2739#if defined(REGEX_DEBUG)
2740namespace {
2741UnicodeString StringFromUText(UText *ut) {
2742 UnicodeString result;
2743 for (UChar32 c = utext_next32From(ut, 0); c != U_SENTINEL; c = UTEXT_NEXT32(ut)) {
2744 result.append(c);
2745 }
2746 return result;
2747}
2748}
2749#endif // REGEX_DEBUG
2750
729e4ab9
A
2751
2752//--------------------------------------------------------------------------------
2753//
2754// MatchAt This is the actual matching engine.
2755//
2756// startIdx: begin matching at this index.
2757// toEnd: if true, match must extend to end of the input region
2758//
2759//--------------------------------------------------------------------------------
2760void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2761 UBool isMatch = FALSE; // True if the we have a match.
57a6839d 2762
729e4ab9
A
2763 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2764
2765 int32_t op; // Operation from the compiled pattern, split into
2766 int32_t opType; // the opcode
2767 int32_t opValue; // and the operand value.
57a6839d
A
2768
2769#ifdef REGEX_RUN_DEBUG
2ca993e8 2770 if (fTraceDebug) {
729e4ab9 2771 printf("MatchAt(startIdx=%ld)\n", startIdx);
2ca993e8
A
2772 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
2773 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
729e4ab9 2774 }
57a6839d 2775#endif
729e4ab9
A
2776
2777 if (U_FAILURE(status)) {
2778 return;
2779 }
2780
2781 // Cache frequently referenced items from the compiled pattern
2782 //
2783 int64_t *pat = fPattern->fCompiledPat->getBuffer();
2784
2785 const UChar *litText = fPattern->fLiteralText.getBuffer();
2786 UVector *sets = fPattern->fSets;
2787
2788 fFrameSize = fPattern->fFrameSize;
2789 REStackFrame *fp = resetStack();
2ca993e8
A
2790 if (U_FAILURE(fDeferredStatus)) {
2791 status = fDeferredStatus;
2792 return;
2793 }
729e4ab9
A
2794
2795 fp->fPatIdx = 0;
2796 fp->fInputIdx = startIdx;
2797
2798 // Zero out the pattern's static data
2799 int32_t i;
2800 for (i = 0; i<fPattern->fDataSize; i++) {
2801 fData[i] = 0;
2802 }
2803
2804 //
2805 // Main loop for interpreting the compiled pattern.
2806 // One iteration of the loop per pattern operation performed.
2807 //
2808 for (;;) {
729e4ab9
A
2809 op = (int32_t)pat[fp->fPatIdx];
2810 opType = URX_TYPE(op);
2811 opValue = URX_VAL(op);
57a6839d 2812#ifdef REGEX_RUN_DEBUG
729e4ab9
A
2813 if (fTraceDebug) {
2814 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 2815 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
729e4ab9
A
2816 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2817 fPattern->dumpOp(fp->fPatIdx);
2818 }
57a6839d 2819#endif
729e4ab9 2820 fp->fPatIdx++;
57a6839d 2821
729e4ab9
A
2822 switch (opType) {
2823
2824
2825 case URX_NOP:
2826 break;
2827
2828
2829 case URX_BACKTRACK:
2830 // Force a backtrack. In some circumstances, the pattern compiler
2831 // will notice that the pattern can't possibly match anything, and will
2832 // emit one of these at that point.
2833 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2834 break;
2835
2836
2837 case URX_ONECHAR:
2838 if (fp->fInputIdx < fActiveLimit) {
2839 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2840 UChar32 c = UTEXT_NEXT32(fInputText);
2841 if (c == opValue) {
2842 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2843 break;
2844 }
2845 } else {
2846 fHitEnd = TRUE;
2847 }
729e4ab9
A
2848 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2849 break;
2850
2851
2852 case URX_STRING:
2853 {
2854 // Test input against a literal string.
2855 // Strings require two slots in the compiled pattern, one for the
2856 // offset to the string text, and one for the length.
729e4ab9 2857
4388f060 2858 int32_t stringStartIdx = opValue;
729e4ab9
A
2859 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
2860 fp->fPatIdx++;
2861 opType = URX_TYPE(op);
4388f060 2862 int32_t stringLen = URX_VAL(op);
729e4ab9
A
2863 U_ASSERT(opType == URX_STRING_LEN);
2864 U_ASSERT(stringLen >= 2);
57a6839d 2865
4388f060
A
2866 const UChar *patternString = litText+stringStartIdx;
2867 int32_t patternStringIndex = 0;
729e4ab9 2868 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4388f060
A
2869 UChar32 inputChar;
2870 UChar32 patternChar;
729e4ab9 2871 UBool success = TRUE;
4388f060
A
2872 while (patternStringIndex < stringLen) {
2873 if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
729e4ab9 2874 success = FALSE;
4388f060
A
2875 fHitEnd = TRUE;
2876 break;
2877 }
2878 inputChar = UTEXT_NEXT32(fInputText);
2879 U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
2880 if (patternChar != inputChar) {
2881 success = FALSE;
2882 break;
729e4ab9
A
2883 }
2884 }
57a6839d 2885
729e4ab9
A
2886 if (success) {
2887 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2888 } else {
729e4ab9
A
2889 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2890 }
2891 }
2892 break;
2893
2894
2895 case URX_STATE_SAVE:
2896 fp = StateSave(fp, opValue, status);
2897 break;
2898
2899
2900 case URX_END:
2901 // The match loop will exit via this path on a successful match,
2902 // when we reach the end of the pattern.
2903 if (toEnd && fp->fInputIdx != fActiveLimit) {
2904 // The pattern matched, but not to the end of input. Try some more.
2905 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2906 break;
2907 }
2908 isMatch = TRUE;
2909 goto breakFromLoop;
2910
2911 // Start and End Capture stack frame variables are laid out like this:
2912 // fp->fExtra[opValue] - The start of a completed capture group
2913 // opValue+1 - The end of a completed capture group
2914 // opValue+2 - the start of a capture group whose end
2915 // has not yet been reached (and might not ever be).
2916 case URX_START_CAPTURE:
2917 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2918 fp->fExtra[opValue+2] = fp->fInputIdx;
2919 break;
2920
2921
2922 case URX_END_CAPTURE:
2923 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2924 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
2925 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
2926 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
2927 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2928 break;
2929
2930
2931 case URX_DOLLAR: // $, test for End of line
2932 // or for position before new line at end of input
2933 {
2934 if (fp->fInputIdx >= fAnchorLimit) {
2935 // We really are at the end of input. Success.
2936 fHitEnd = TRUE;
2937 fRequireEnd = TRUE;
2938 break;
2939 }
57a6839d 2940
729e4ab9 2941 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 2942
729e4ab9
A
2943 // If we are positioned just before a new-line that is located at the
2944 // end of input, succeed.
2945 UChar32 c = UTEXT_NEXT32(fInputText);
2946 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
b331163b 2947 if (isLineTerminator(c)) {
729e4ab9 2948 // If not in the middle of a CR/LF sequence
b331163b 2949 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
729e4ab9
A
2950 // At new-line at end of input. Success
2951 fHitEnd = TRUE;
2952 fRequireEnd = TRUE;
57a6839d 2953
729e4ab9
A
2954 break;
2955 }
2956 }
2957 } else {
2958 UChar32 nextC = UTEXT_NEXT32(fInputText);
2959 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2960 fHitEnd = TRUE;
2961 fRequireEnd = TRUE;
2962 break; // At CR/LF at end of input. Success
2963 }
2964 }
2965
2966 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2967 }
2968 break;
2969
2970
2971 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
2972 if (fp->fInputIdx >= fAnchorLimit) {
2973 // Off the end of input. Success.
2974 fHitEnd = TRUE;
2975 fRequireEnd = TRUE;
2976 break;
2977 } else {
2978 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2979 UChar32 c = UTEXT_NEXT32(fInputText);
2980 // Either at the last character of input, or off the end.
2981 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
2982 fHitEnd = TRUE;
2983 fRequireEnd = TRUE;
2984 break;
2985 }
2986 }
2987
2988 // Not at end of input. Back-track out.
2989 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2990 break;
2991
2992
2993 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
2994 {
2995 if (fp->fInputIdx >= fAnchorLimit) {
2996 // We really are at the end of input. Success.
2997 fHitEnd = TRUE;
2998 fRequireEnd = TRUE;
2999 break;
3000 }
3001 // If we are positioned just before a new-line, succeed.
3002 // It makes no difference where the new-line is within the input.
3003 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3004 UChar32 c = UTEXT_CURRENT32(fInputText);
b331163b 3005 if (isLineTerminator(c)) {
729e4ab9
A
3006 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
3007 // In multi-line mode, hitting a new-line just before the end of input does not
3008 // set the hitEnd or requireEnd flags
3009 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3010 break;
3011 }
3012 }
3013 // not at a new line. Fail.
3014 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3015 }
3016 break;
3017
3018
3019 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
3020 {
3021 if (fp->fInputIdx >= fAnchorLimit) {
3022 // We really are at the end of input. Success.
3023 fHitEnd = TRUE;
3024 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
3025 break; // adding a new-line would not lose the match.
3026 }
3027 // If we are not positioned just before a new-line, the test fails; backtrack out.
3028 // It makes no difference where the new-line is within the input.
3029 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3030 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3031 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3032 }
3033 }
3034 break;
3035
3036
3037 case URX_CARET: // ^, test for start of line
3038 if (fp->fInputIdx != fAnchorStart) {
3039 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3040 }
3041 break;
3042
3043
3044 case URX_CARET_M: // ^, test for start of line in mulit-line mode
3045 {
3046 if (fp->fInputIdx == fAnchorStart) {
3047 // We are at the start of input. Success.
3048 break;
3049 }
3050 // Check whether character just before the current pos is a new-line
3051 // unless we are at the end of input
3052 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 3053 UChar32 c = UTEXT_PREVIOUS32(fInputText);
b331163b 3054 if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
729e4ab9
A
3055 // It's a new-line. ^ is true. Success.
3056 // TODO: what should be done with positions between a CR and LF?
3057 break;
3058 }
3059 // Not at the start of a line. Fail.
3060 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3061 }
3062 break;
3063
3064
3065 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
3066 {
3067 U_ASSERT(fp->fInputIdx >= fAnchorStart);
3068 if (fp->fInputIdx <= fAnchorStart) {
3069 // We are at the start of input. Success.
3070 break;
3071 }
3072 // Check whether character just before the current pos is a new-line
3073 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3074 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3075 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3076 if (c != 0x0a) {
3077 // Not at the start of a line. Back-track out.
3078 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3079 }
3080 }
3081 break;
3082
3083 case URX_BACKSLASH_B: // Test for word boundaries
3084 {
3085 UBool success = isWordBoundary(fp->fInputIdx);
51004dcb 3086 success ^= (UBool)(opValue != 0); // flip sense for \B
729e4ab9
A
3087 if (!success) {
3088 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3089 }
3090 }
3091 break;
3092
3093
3094 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
3095 {
3096 UBool success = isUWordBoundary(fp->fInputIdx);
51004dcb 3097 success ^= (UBool)(opValue != 0); // flip sense for \B
729e4ab9
A
3098 if (!success) {
3099 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3100 }
3101 }
3102 break;
3103
3104
3105 case URX_BACKSLASH_D: // Test for decimal digit
3106 {
3107 if (fp->fInputIdx >= fActiveLimit) {
3108 fHitEnd = TRUE;
3109 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3110 break;
3111 }
3112
3113 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3114
3115 UChar32 c = UTEXT_NEXT32(fInputText);
3116 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
3117 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
51004dcb 3118 success ^= (UBool)(opValue != 0); // flip sense for \D
729e4ab9
A
3119 if (success) {
3120 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3121 } else {
3122 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3123 }
3124 }
3125 break;
3126
3127
3128 case URX_BACKSLASH_G: // Test for position at end of previous match
3129 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3130 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3131 }
3132 break;
3133
3134
b331163b
A
3135 case URX_BACKSLASH_H: // Test for \h, horizontal white space.
3136 {
3137 if (fp->fInputIdx >= fActiveLimit) {
3138 fHitEnd = TRUE;
3139 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3140 break;
3141 }
3142 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3143 UChar32 c = UTEXT_NEXT32(fInputText);
3144 int8_t ctype = u_charType(c);
3145 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
3146 success ^= (UBool)(opValue != 0); // flip sense for \H
3147 if (success) {
3148 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3149 } else {
3150 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3151 }
3152 }
3153 break;
3154
3155
3156 case URX_BACKSLASH_R: // Test for \R, any line break sequence.
3157 {
3158 if (fp->fInputIdx >= fActiveLimit) {
3159 fHitEnd = TRUE;
3160 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3161 break;
3162 }
3163 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3164 UChar32 c = UTEXT_NEXT32(fInputText);
3165 if (isLineTerminator(c)) {
3166 if (c == 0x0d && utext_current32(fInputText) == 0x0a) {
3167 utext_next32(fInputText);
3168 }
3169 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3170 } else {
3171 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3172 }
3173 }
3174 break;
3175
3176
3177 case URX_BACKSLASH_V: // \v, any single line ending character.
3178 {
3179 if (fp->fInputIdx >= fActiveLimit) {
3180 fHitEnd = TRUE;
3181 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3182 break;
3183 }
3184 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3185 UChar32 c = UTEXT_NEXT32(fInputText);
3186 UBool success = isLineTerminator(c);
3187 success ^= (UBool)(opValue != 0); // flip sense for \V
3188 if (success) {
3189 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3190 } else {
3191 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3192 }
3193 }
3194 break;
3195
3196
57a6839d 3197 case URX_BACKSLASH_X:
729e4ab9
A
3198 // Match a Grapheme, as defined by Unicode TR 29.
3199 // Differs slightly from Perl, which consumes combining marks independently
3200 // of context.
3201 {
3202
3203 // Fail if at end of input
3204 if (fp->fInputIdx >= fActiveLimit) {
3205 fHitEnd = TRUE;
3206 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3207 break;
3208 }
57a6839d 3209
729e4ab9
A
3210 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3211
3212 // Examine (and consume) the current char.
3213 // Dispatch into a little state machine, based on the char.
3214 UChar32 c;
3215 c = UTEXT_NEXT32(fInputText);
3216 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3217 UnicodeSet **sets = fPattern->fStaticSets;
3218 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
3219 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
3220 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3221 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3222 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3223 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3224 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3225 goto GC_Extend;
3226
3227
3228
3229GC_L:
3230 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3231 c = UTEXT_NEXT32(fInputText);
3232 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3233 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3234 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3235 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3236 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4388f060 3237 (void)UTEXT_PREVIOUS32(fInputText);
729e4ab9
A
3238 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3239 goto GC_Extend;
3240
3241GC_V:
3242 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3243 c = UTEXT_NEXT32(fInputText);
3244 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3245 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3246 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4388f060 3247 (void)UTEXT_PREVIOUS32(fInputText);
729e4ab9
A
3248 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3249 goto GC_Extend;
3250
3251GC_T:
3252 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3253 c = UTEXT_NEXT32(fInputText);
3254 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3255 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4388f060 3256 (void)UTEXT_PREVIOUS32(fInputText);
729e4ab9
A
3257 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3258 goto GC_Extend;
3259
3260GC_Extend:
3261 // Combining characters are consumed here
3262 for (;;) {
3263 if (fp->fInputIdx >= fActiveLimit) {
3264 break;
3265 }
3266 c = UTEXT_CURRENT32(fInputText);
3267 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3268 break;
3269 }
4388f060 3270 (void)UTEXT_NEXT32(fInputText);
729e4ab9
A
3271 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3272 }
3273 goto GC_Done;
3274
3275GC_Control:
57a6839d 3276 // Most control chars stand alone (don't combine with combining chars),
729e4ab9
A
3277 // except that a CR/LF sequence is a single grapheme cluster.
3278 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
3279 c = UTEXT_NEXT32(fInputText);
3280 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3281 }
3282
3283GC_Done:
3284 if (fp->fInputIdx >= fActiveLimit) {
3285 fHitEnd = TRUE;
3286 }
3287 break;
3288 }
57a6839d 3289
729e4ab9
A
3290
3291
3292
3293 case URX_BACKSLASH_Z: // Test for end of Input
3294 if (fp->fInputIdx < fAnchorLimit) {
3295 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3296 } else {
3297 fHitEnd = TRUE;
3298 fRequireEnd = TRUE;
3299 }
3300 break;
3301
3302
3303
3304 case URX_STATIC_SETREF:
3305 {
3306 // Test input character against one of the predefined sets
3307 // (Word Characters, for example)
3308 // The high bit of the op value is a flag for the match polarity.
3309 // 0: success if input char is in set.
3310 // 1: success if input char is not in set.
3311 if (fp->fInputIdx >= fActiveLimit) {
3312 fHitEnd = TRUE;
3313 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3314 break;
3315 }
3316
57a6839d 3317 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
729e4ab9
A
3318 opValue &= ~URX_NEG_SET;
3319 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3320
3321 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3322 UChar32 c = UTEXT_NEXT32(fInputText);
3323 if (c < 256) {
3324 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3325 if (s8->contains(c)) {
3326 success = !success;
3327 }
3328 } else {
3329 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3330 if (s->contains(c)) {
3331 success = !success;
3332 }
3333 }
3334 if (success) {
3335 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3336 } else {
3337 // the character wasn't in the set.
729e4ab9
A
3338 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3339 }
3340 }
3341 break;
57a6839d 3342
729e4ab9
A
3343
3344 case URX_STAT_SETREF_N:
3345 {
57a6839d 3346 // Test input character for NOT being a member of one of
729e4ab9
A
3347 // the predefined sets (Word Characters, for example)
3348 if (fp->fInputIdx >= fActiveLimit) {
3349 fHitEnd = TRUE;
3350 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3351 break;
3352 }
3353
3354 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3355
3356 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 3357
729e4ab9
A
3358 UChar32 c = UTEXT_NEXT32(fInputText);
3359 if (c < 256) {
3360 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3361 if (s8->contains(c) == FALSE) {
3362 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3363 break;
3364 }
3365 } else {
3366 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3367 if (s->contains(c) == FALSE) {
3368 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3369 break;
3370 }
3371 }
3372 // the character was in the set, so the negated test fails.
729e4ab9
A
3373 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3374 }
3375 break;
57a6839d 3376
729e4ab9
A
3377
3378 case URX_SETREF:
3379 if (fp->fInputIdx >= fActiveLimit) {
3380 fHitEnd = TRUE;
3381 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3382 break;
3383 } else {
3384 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 3385
729e4ab9
A
3386 // There is input left. Pick up one char and test it for set membership.
3387 UChar32 c = UTEXT_NEXT32(fInputText);
3388 U_ASSERT(opValue > 0 && opValue < sets->size());
3389 if (c<256) {
3390 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3391 if (s8->contains(c)) {
3392 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3393 break;
3394 }
3395 } else {
3396 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3397 if (s->contains(c)) {
3398 // The character is in the set. A Match.
3399 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3400 break;
3401 }
3402 }
57a6839d 3403
729e4ab9 3404 // the character wasn't in the set.
729e4ab9
A
3405 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3406 }
3407 break;
3408
3409
3410 case URX_DOTANY:
3411 {
3412 // . matches anything, but stops at end-of-line.
3413 if (fp->fInputIdx >= fActiveLimit) {
3414 // At end of input. Match failed. Backtrack out.
3415 fHitEnd = TRUE;
3416 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3417 break;
3418 }
57a6839d 3419
729e4ab9 3420 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 3421
729e4ab9
A
3422 // There is input left. Advance over one char, unless we've hit end-of-line
3423 UChar32 c = UTEXT_NEXT32(fInputText);
b331163b 3424 if (isLineTerminator(c)) {
729e4ab9
A
3425 // End of line in normal mode. . does not match.
3426 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3427 break;
3428 }
3429 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3430 }
3431 break;
3432
3433
3434 case URX_DOTANY_ALL:
3435 {
3436 // ., in dot-matches-all (including new lines) mode
3437 if (fp->fInputIdx >= fActiveLimit) {
3438 // At end of input. Match failed. Backtrack out.
3439 fHitEnd = TRUE;
3440 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3441 break;
3442 }
57a6839d 3443
729e4ab9 3444 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 3445
729e4ab9
A
3446 // There is input left. Advance over one char, except if we are
3447 // at a cr/lf, advance over both of them.
57a6839d 3448 UChar32 c;
729e4ab9
A
3449 c = UTEXT_NEXT32(fInputText);
3450 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3451 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3452 // In the case of a CR/LF, we need to advance over both.
3453 UChar32 nextc = UTEXT_CURRENT32(fInputText);
3454 if (nextc == 0x0a) {
4388f060 3455 (void)UTEXT_NEXT32(fInputText);
729e4ab9
A
3456 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3457 }
3458 }
3459 }
3460 break;
3461
3462
3463 case URX_DOTANY_UNIX:
3464 {
3465 // '.' operator, matches all, but stops at end-of-line.
3466 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
3467 if (fp->fInputIdx >= fActiveLimit) {
3468 // At end of input. Match failed. Backtrack out.
3469 fHitEnd = TRUE;
3470 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3471 break;
3472 }
3473
3474 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 3475
729e4ab9
A
3476 // There is input left. Advance over one char, unless we've hit end-of-line
3477 UChar32 c = UTEXT_NEXT32(fInputText);
3478 if (c == 0x0a) {
3479 // End of line in UNIX_LINES mode. '.' does not match the \n
3480 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3481 } else {
3482 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3483 }
3484 }
3485 break;
3486
3487
3488 case URX_JMP:
3489 fp->fPatIdx = opValue;
3490 break;
3491
3492 case URX_FAIL:
3493 isMatch = FALSE;
3494 goto breakFromLoop;
3495
3496 case URX_JMP_SAV:
3497 U_ASSERT(opValue < fPattern->fCompiledPat->size());
3498 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3499 fp->fPatIdx = opValue; // Then JMP.
3500 break;
3501
3502 case URX_JMP_SAV_X:
3503 // This opcode is used with (x)+, when x can match a zero length string.
3504 // Same as JMP_SAV, except conditional on the match having made forward progress.
3505 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3506 // data address of the input position at the start of the loop.
3507 {
3508 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3509 int32_t stoOp = (int32_t)pat[opValue-1];
3510 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3511 int32_t frameLoc = URX_VAL(stoOp);
3512 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3513 int64_t prevInputIdx = fp->fExtra[frameLoc];
3514 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3515 if (prevInputIdx < fp->fInputIdx) {
3516 // The match did make progress. Repeat the loop.
3517 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3518 fp->fPatIdx = opValue;
3519 fp->fExtra[frameLoc] = fp->fInputIdx;
57a6839d 3520 }
729e4ab9
A
3521 // If the input position did not advance, we do nothing here,
3522 // execution will fall out of the loop.
3523 }
3524 break;
3525
3526 case URX_CTR_INIT:
3527 {
3528 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
57a6839d 3529 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
729e4ab9
A
3530
3531 // Pick up the three extra operands that CTR_INIT has, and
57a6839d 3532 // skip the pattern location counter past them.
729e4ab9
A
3533 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3534 fp->fPatIdx += 3;
3535 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3536 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3537 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3538 U_ASSERT(minCount>=0);
3539 U_ASSERT(maxCount>=minCount || maxCount==-1);
57a6839d 3540 U_ASSERT(loopLoc>=fp->fPatIdx);
729e4ab9
A
3541
3542 if (minCount == 0) {
3543 fp = StateSave(fp, loopLoc+1, status);
3544 }
57a6839d
A
3545 if (maxCount == -1) {
3546 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
3547 } else if (maxCount == 0) {
729e4ab9
A
3548 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3549 }
3550 }
3551 break;
3552
3553 case URX_CTR_LOOP:
3554 {
3555 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3556 int32_t initOp = (int32_t)pat[opValue];
3557 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3558 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3559 int32_t minCount = (int32_t)pat[opValue+2];
3560 int32_t maxCount = (int32_t)pat[opValue+3];
729e4ab9 3561 (*pCounter)++;
57a6839d
A
3562 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3563 U_ASSERT(*pCounter == maxCount);
729e4ab9
A
3564 break;
3565 }
3566 if (*pCounter >= minCount) {
57a6839d
A
3567 if (maxCount == -1) {
3568 // Loop has no hard upper bound.
3569 // Check that it is progressing through the input, break if it is not.
3570 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3571 if (fp->fInputIdx == *pLastInputIdx) {
3572 break;
3573 } else {
3574 *pLastInputIdx = fp->fInputIdx;
3575 }
3576 }
729e4ab9 3577 fp = StateSave(fp, fp->fPatIdx, status);
f3c0d7a5
A
3578 } else {
3579 // Increment time-out counter. (StateSave() does it if count >= minCount)
3580 fTickCounter--;
3581 if (fTickCounter <= 0) {
3582 IncrementTime(status); // Re-initializes fTickCounter
3583 }
729e4ab9 3584 }
f3c0d7a5 3585
729e4ab9
A
3586 fp->fPatIdx = opValue + 4; // Loop back.
3587 }
3588 break;
3589
3590 case URX_CTR_INIT_NG:
3591 {
3592 // Initialize a non-greedy loop
3593 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
57a6839d 3594 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
729e4ab9 3595
57a6839d
A
3596 // Pick up the three extra operands that CTR_INIT_NG has, and
3597 // skip the pattern location counter past them.
729e4ab9
A
3598 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3599 fp->fPatIdx += 3;
3600 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3601 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3602 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3603 U_ASSERT(minCount>=0);
3604 U_ASSERT(maxCount>=minCount || maxCount==-1);
3605 U_ASSERT(loopLoc>fp->fPatIdx);
57a6839d
A
3606 if (maxCount == -1) {
3607 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
3608 }
729e4ab9
A
3609
3610 if (minCount == 0) {
3611 if (maxCount != 0) {
3612 fp = StateSave(fp, fp->fPatIdx, status);
3613 }
3614 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
57a6839d 3615 }
729e4ab9
A
3616 }
3617 break;
3618
3619 case URX_CTR_LOOP_NG:
3620 {
3621 // Non-greedy {min, max} loops
3622 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3623 int32_t initOp = (int32_t)pat[opValue];
3624 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3625 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3626 int32_t minCount = (int32_t)pat[opValue+2];
3627 int32_t maxCount = (int32_t)pat[opValue+3];
729e4ab9 3628
57a6839d
A
3629 (*pCounter)++;
3630 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
729e4ab9
A
3631 // The loop has matched the maximum permitted number of times.
3632 // Break out of here with no action. Matching will
3633 // continue with the following pattern.
57a6839d 3634 U_ASSERT(*pCounter == maxCount);
729e4ab9
A
3635 break;
3636 }
3637
3638 if (*pCounter < minCount) {
3639 // We haven't met the minimum number of matches yet.
3640 // Loop back for another one.
3641 fp->fPatIdx = opValue + 4; // Loop back.
f3c0d7a5
A
3642 // Increment time-out counter. (StateSave() does it if count >= minCount)
3643 fTickCounter--;
3644 if (fTickCounter <= 0) {
3645 IncrementTime(status); // Re-initializes fTickCounter
3646 }
729e4ab9
A
3647 } else {
3648 // We do have the minimum number of matches.
57a6839d
A
3649
3650 // If there is no upper bound on the loop iterations, check that the input index
3651 // is progressing, and stop the loop if it is not.
3652 if (maxCount == -1) {
3653 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3654 if (fp->fInputIdx == *pLastInputIdx) {
3655 break;
3656 }
3657 *pLastInputIdx = fp->fInputIdx;
3658 }
3659
3660 // Loop Continuation: we will fall into the pattern following the loop
3661 // (non-greedy, don't execute loop body first), but first do
3662 // a state save to the top of the loop, so that a match failure
729e4ab9
A
3663 // in the following pattern will try another iteration of the loop.
3664 fp = StateSave(fp, opValue + 4, status);
3665 }
3666 }
3667 break;
3668
3669 case URX_STO_SP:
3670 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3671 fData[opValue] = fStack->size();
3672 break;
3673
3674 case URX_LD_SP:
3675 {
3676 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3677 int32_t newStackSize = (int32_t)fData[opValue];
3678 U_ASSERT(newStackSize <= fStack->size());
3679 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3680 if (newFP == (int64_t *)fp) {
3681 break;
3682 }
3683 int32_t i;
3684 for (i=0; i<fFrameSize; i++) {
3685 newFP[i] = ((int64_t *)fp)[i];
3686 }
3687 fp = (REStackFrame *)newFP;
3688 fStack->setSize(newStackSize);
3689 }
3690 break;
3691
3692 case URX_BACKREF:
729e4ab9
A
3693 {
3694 U_ASSERT(opValue < fFrameSize);
3695 int64_t groupStartIdx = fp->fExtra[opValue];
3696 int64_t groupEndIdx = fp->fExtra[opValue+1];
3697 U_ASSERT(groupStartIdx <= groupEndIdx);
3698 if (groupStartIdx < 0) {
3699 // This capture group has not participated in the match thus far,
3700 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
729e4ab9
A
3701 break;
3702 }
729e4ab9
A
3703 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3704 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4388f060
A
3705
3706 // Note: if the capture group match was of an empty string the backref
57a6839d 3707 // match succeeds. Verified by testing: Perl matches succeed
4388f060 3708 // in this case, so we do too.
57a6839d 3709
4388f060
A
3710 UBool success = TRUE;
3711 for (;;) {
3712 if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3713 success = TRUE;
3714 break;
3715 }
3716 if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
3717 success = FALSE;
729e4ab9 3718 fHitEnd = TRUE;
4388f060
A
3719 break;
3720 }
3721 UChar32 captureGroupChar = utext_next32(fAltInputText);
3722 UChar32 inputChar = utext_next32(fInputText);
3723 if (inputChar != captureGroupChar) {
3724 success = FALSE;
3725 break;
729e4ab9 3726 }
4388f060
A
3727 }
3728
3729 if (success) {
3730 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3731 } else {
3732 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3733 }
3734 }
3735 break;
3736
3737
3738
3739 case URX_BACKREF_I:
3740 {
3741 U_ASSERT(opValue < fFrameSize);
3742 int64_t groupStartIdx = fp->fExtra[opValue];
3743 int64_t groupEndIdx = fp->fExtra[opValue+1];
3744 U_ASSERT(groupStartIdx <= groupEndIdx);
3745 if (groupStartIdx < 0) {
3746 // This capture group has not participated in the match thus far,
729e4ab9 3747 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
4388f060
A
3748 break;
3749 }
3750 utext_setNativeIndex(fAltInputText, groupStartIdx);
3751 utext_setNativeIndex(fInputText, fp->fInputIdx);
3752 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3753 CaseFoldingUTextIterator inputItr(*fInputText);
3754
3755 // Note: if the capture group match was of an empty string the backref
57a6839d 3756 // match succeeds. Verified by testing: Perl matches succeed
4388f060 3757 // in this case, so we do too.
57a6839d 3758
4388f060
A
3759 UBool success = TRUE;
3760 for (;;) {
3761 if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3762 success = TRUE;
3763 break;
3764 }
3765 if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
3766 success = FALSE;
3767 fHitEnd = TRUE;
3768 break;
3769 }
3770 UChar32 captureGroupChar = captureGroupItr.next();
3771 UChar32 inputChar = inputItr.next();
3772 if (inputChar != captureGroupChar) {
3773 success = FALSE;
3774 break;
3775 }
3776 }
3777
3778 if (success && inputItr.inExpansion()) {
57a6839d
A
3779 // We obtained a match by consuming part of a string obtained from
3780 // case-folding a single code point of the input text.
4388f060
A
3781 // This does not count as an overall match.
3782 success = FALSE;
3783 }
3784
3785 if (success) {
3786 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3787 } else {
3788 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
729e4ab9 3789 }
57a6839d 3790
729e4ab9
A
3791 }
3792 break;
57a6839d 3793
729e4ab9
A
3794 case URX_STO_INP_LOC:
3795 {
3796 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3797 fp->fExtra[opValue] = fp->fInputIdx;
3798 }
3799 break;
3800
3801 case URX_JMPX:
3802 {
3803 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3804 fp->fPatIdx += 1;
3805 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
3806 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3807 int64_t savedInputIdx = fp->fExtra[dataLoc];
3808 U_ASSERT(savedInputIdx <= fp->fInputIdx);
3809 if (savedInputIdx < fp->fInputIdx) {
3810 fp->fPatIdx = opValue; // JMP
3811 } else {
3812 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
3813 }
3814 }
3815 break;
3816
3817 case URX_LA_START:
3818 {
3819 // Entering a lookahead block.
3820 // Save Stack Ptr, Input Pos.
3821 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3822 fData[opValue] = fStack->size();
3823 fData[opValue+1] = fp->fInputIdx;
3824 fActiveStart = fLookStart; // Set the match region change for
3825 fActiveLimit = fLookLimit; // transparent bounds.
3826 }
3827 break;
3828
3829 case URX_LA_END:
3830 {
3831 // Leaving a look-ahead block.
3832 // restore Stack Ptr, Input Pos to positions they had on entry to block.
3833 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3834 int32_t stackSize = fStack->size();
3835 int32_t newStackSize =(int32_t)fData[opValue];
3836 U_ASSERT(stackSize >= newStackSize);
3837 if (stackSize > newStackSize) {
3838 // Copy the current top frame back to the new (cut back) top frame.
3839 // This makes the capture groups from within the look-ahead
3840 // expression available.
3841 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3842 int32_t i;
3843 for (i=0; i<fFrameSize; i++) {
3844 newFP[i] = ((int64_t *)fp)[i];
3845 }
3846 fp = (REStackFrame *)newFP;
3847 fStack->setSize(newStackSize);
3848 }
3849 fp->fInputIdx = fData[opValue+1];
3850
3851 // Restore the active region bounds in the input string; they may have
3852 // been changed because of transparent bounds on a Region.
3853 fActiveStart = fRegionStart;
3854 fActiveLimit = fRegionLimit;
3855 }
3856 break;
3857
3858 case URX_ONECHAR_I:
4388f060
A
3859 // Case insensitive one char. The char from the pattern is already case folded.
3860 // Input text is not, but case folding the input can not reduce two or more code
3861 // points to one.
729e4ab9
A
3862 if (fp->fInputIdx < fActiveLimit) {
3863 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3864
3865 UChar32 c = UTEXT_NEXT32(fInputText);
3866 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3867 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3868 break;
3869 }
3870 } else {
3871 fHitEnd = TRUE;
3872 }
57a6839d 3873
729e4ab9
A
3874 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3875 break;
3876
3877 case URX_STRING_I:
3878 {
4388f060 3879 // Case-insensitive test input against a literal string.
729e4ab9
A
3880 // Strings require two slots in the compiled pattern, one for the
3881 // offset to the string text, and one for the length.
4388f060 3882 // The compiled string has already been case folded.
729e4ab9 3883 {
4388f060
A
3884 const UChar *patternString = litText + opValue;
3885 int32_t patternStringIdx = 0;
729e4ab9
A
3886
3887 op = (int32_t)pat[fp->fPatIdx];
3888 fp->fPatIdx++;
3889 opType = URX_TYPE(op);
3890 opValue = URX_VAL(op);
3891 U_ASSERT(opType == URX_STRING_LEN);
4388f060 3892 int32_t patternStringLen = opValue; // Length of the string from the pattern.
57a6839d
A
3893
3894
4388f060
A
3895 UChar32 cPattern;
3896 UChar32 cText;
3897 UBool success = TRUE;
3898
729e4ab9 3899 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4388f060
A
3900 CaseFoldingUTextIterator inputIterator(*fInputText);
3901 while (patternStringIdx < patternStringLen) {
3902 if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
3903 success = FALSE;
3904 fHitEnd = TRUE;
3905 break;
729e4ab9 3906 }
4388f060
A
3907 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
3908 cText = inputIterator.next();
3909 if (cText != cPattern) {
3910 success = FALSE;
3911 break;
729e4ab9
A
3912 }
3913 }
4388f060
A
3914 if (inputIterator.inExpansion()) {
3915 success = FALSE;
3916 }
3917
3918 if (success) {
3919 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3920 } else {
729e4ab9
A
3921 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3922 }
3923 }
3924 }
3925 break;
3926
3927 case URX_LB_START:
3928 {
3929 // Entering a look-behind block.
3930 // Save Stack Ptr, Input Pos.
3931 // TODO: implement transparent bounds. Ticket #6067
3932 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3933 fData[opValue] = fStack->size();
3934 fData[opValue+1] = fp->fInputIdx;
3935 // Init the variable containing the start index for attempted matches.
3936 fData[opValue+2] = -1;
3937 // Save input string length, then reset to pin any matches to end at
3938 // the current position.
3939 fData[opValue+3] = fActiveLimit;
3940 fActiveLimit = fp->fInputIdx;
3941 }
3942 break;
3943
3944
3945 case URX_LB_CONT:
3946 {
3947 // Positive Look-Behind, at top of loop checking for matches of LB expression
3948 // at all possible input starting positions.
3949
3950 // Fetch the min and max possible match lengths. They are the operands
3951 // of this op in the pattern.
3952 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3953 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
2ca993e8
A
3954 if (!UTEXT_USES_U16(fInputText)) {
3955 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3956 // The max length need not be exact; it just needs to be >= actual maximum.
3957 maxML *= 3;
3958 }
729e4ab9
A
3959 U_ASSERT(minML <= maxML);
3960 U_ASSERT(minML >= 0);
3961
3962 // Fetch (from data) the last input index where a match was attempted.
3963 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2ca993e8
A
3964 int64_t &lbStartIdx = fData[opValue+2];
3965 if (lbStartIdx < 0) {
729e4ab9 3966 // First time through loop.
2ca993e8
A
3967 lbStartIdx = fp->fInputIdx - minML;
3968 if (lbStartIdx > 0) {
3969 // move index to a code point boudary, if it's not on one already.
3970 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3971 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3972 }
729e4ab9
A
3973 } else {
3974 // 2nd through nth time through the loop.
3975 // Back up start position for match by one.
2ca993e8
A
3976 if (lbStartIdx == 0) {
3977 (lbStartIdx)--;
729e4ab9 3978 } else {
2ca993e8 3979 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4388f060 3980 (void)UTEXT_PREVIOUS32(fInputText);
2ca993e8 3981 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
729e4ab9
A
3982 }
3983 }
3984
2ca993e8 3985 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
729e4ab9
A
3986 // We have tried all potential match starting points without
3987 // getting a match. Backtrack out, and out of the
3988 // Look Behind altogether.
3989 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3990 int64_t restoreInputLen = fData[opValue+3];
3991 U_ASSERT(restoreInputLen >= fActiveLimit);
3992 U_ASSERT(restoreInputLen <= fInputLength);
3993 fActiveLimit = restoreInputLen;
3994 break;
3995 }
3996
3997 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3998 // (successful match will fall off the end of the loop.)
3999 fp = StateSave(fp, fp->fPatIdx-3, status);
2ca993e8 4000 fp->fInputIdx = lbStartIdx;
729e4ab9
A
4001 }
4002 break;
4003
4004 case URX_LB_END:
4005 // End of a look-behind block, after a successful match.
4006 {
4007 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4008 if (fp->fInputIdx != fActiveLimit) {
4009 // The look-behind expression matched, but the match did not
4010 // extend all the way to the point that we are looking behind from.
4011 // FAIL out of here, which will take us back to the LB_CONT, which
4012 // will retry the match starting at another position or fail
4013 // the look-behind altogether, whichever is appropriate.
4014 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4015 break;
4016 }
4017
4018 // Look-behind match is good. Restore the orignal input string length,
57a6839d 4019 // which had been truncated to pin the end of the lookbehind match to the
729e4ab9
A
4020 // position being looked-behind.
4021 int64_t originalInputLen = fData[opValue+3];
4022 U_ASSERT(originalInputLen >= fActiveLimit);
4023 U_ASSERT(originalInputLen <= fInputLength);
4024 fActiveLimit = originalInputLen;
4025 }
4026 break;
4027
4028
4029 case URX_LBN_CONT:
4030 {
4031 // Negative Look-Behind, at top of loop checking for matches of LB expression
4032 // at all possible input starting positions.
4033
4034 // Fetch the extra parameters of this op.
4035 int32_t minML = (int32_t)pat[fp->fPatIdx++];
4036 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
2ca993e8
A
4037 if (!UTEXT_USES_U16(fInputText)) {
4038 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
4039 // The max length need not be exact; it just needs to be >= actual maximum.
4040 maxML *= 3;
4041 }
729e4ab9
A
4042 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
4043 continueLoc = URX_VAL(continueLoc);
4044 U_ASSERT(minML <= maxML);
4045 U_ASSERT(minML >= 0);
4046 U_ASSERT(continueLoc > fp->fPatIdx);
4047
4048 // Fetch (from data) the last input index where a match was attempted.
4049 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2ca993e8
A
4050 int64_t &lbStartIdx = fData[opValue+2];
4051 if (lbStartIdx < 0) {
729e4ab9 4052 // First time through loop.
2ca993e8
A
4053 lbStartIdx = fp->fInputIdx - minML;
4054 if (lbStartIdx > 0) {
4055 // move index to a code point boudary, if it's not on one already.
4056 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4057 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4058 }
729e4ab9
A
4059 } else {
4060 // 2nd through nth time through the loop.
4061 // Back up start position for match by one.
2ca993e8
A
4062 if (lbStartIdx == 0) {
4063 (lbStartIdx)--;
729e4ab9 4064 } else {
2ca993e8 4065 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4388f060 4066 (void)UTEXT_PREVIOUS32(fInputText);
2ca993e8 4067 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
729e4ab9
A
4068 }
4069 }
4070
2ca993e8 4071 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
729e4ab9
A
4072 // We have tried all potential match starting points without
4073 // getting a match, which means that the negative lookbehind as
4074 // a whole has succeeded. Jump forward to the continue location
4075 int64_t restoreInputLen = fData[opValue+3];
4076 U_ASSERT(restoreInputLen >= fActiveLimit);
4077 U_ASSERT(restoreInputLen <= fInputLength);
4078 fActiveLimit = restoreInputLen;
4079 fp->fPatIdx = continueLoc;
4080 break;
4081 }
4082
4083 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4084 // (successful match will cause a FAIL out of the loop altogether.)
4085 fp = StateSave(fp, fp->fPatIdx-4, status);
2ca993e8 4086 fp->fInputIdx = lbStartIdx;
729e4ab9
A
4087 }
4088 break;
4089
4090 case URX_LBN_END:
4091 // End of a negative look-behind block, after a successful match.
4092 {
4093 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4094 if (fp->fInputIdx != fActiveLimit) {
4095 // The look-behind expression matched, but the match did not
4096 // extend all the way to the point that we are looking behind from.
4097 // FAIL out of here, which will take us back to the LB_CONT, which
4098 // will retry the match starting at another position or succeed
4099 // the look-behind altogether, whichever is appropriate.
4100 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4101 break;
4102 }
4103
4104 // Look-behind expression matched, which means look-behind test as
4105 // a whole Fails
57a6839d
A
4106
4107 // Restore the orignal input string length, which had been truncated
4108 // inorder to pin the end of the lookbehind match
729e4ab9
A
4109 // to the position being looked-behind.
4110 int64_t originalInputLen = fData[opValue+3];
4111 U_ASSERT(originalInputLen >= fActiveLimit);
4112 U_ASSERT(originalInputLen <= fInputLength);
4113 fActiveLimit = originalInputLen;
4114
4115 // Restore original stack position, discarding any state saved
4116 // by the successful pattern match.
4117 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4118 int32_t newStackSize = (int32_t)fData[opValue];
4119 U_ASSERT(fStack->size() > newStackSize);
4120 fStack->setSize(newStackSize);
57a6839d
A
4121
4122 // FAIL, which will take control back to someplace
729e4ab9
A
4123 // prior to entering the look-behind test.
4124 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4125 }
4126 break;
4127
4128
4129 case URX_LOOP_SR_I:
4130 // Loop Initialization for the optimized implementation of
4131 // [some character set]*
4132 // This op scans through all matching input.
4133 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4134 {
4135 U_ASSERT(opValue > 0 && opValue < sets->size());
4136 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4137 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4138
4139 // Loop through input, until either the input is exhausted or
4140 // we reach a character that is not a member of the set.
4141 int64_t ix = fp->fInputIdx;
4142 UTEXT_SETNATIVEINDEX(fInputText, ix);
4143 for (;;) {
4144 if (ix >= fActiveLimit) {
4145 fHitEnd = TRUE;
4146 break;
4147 }
4148 UChar32 c = UTEXT_NEXT32(fInputText);
4149 if (c<256) {
4150 if (s8->contains(c) == FALSE) {
4151 break;
4152 }
4153 } else {
4154 if (s->contains(c) == FALSE) {
4155 break;
4156 }
4157 }
4158 ix = UTEXT_GETNATIVEINDEX(fInputText);
4159 }
4160
4161 // If there were no matching characters, skip over the loop altogether.
4162 // The loop doesn't run at all, a * op always succeeds.
4163 if (ix == fp->fInputIdx) {
4164 fp->fPatIdx++; // skip the URX_LOOP_C op.
4165 break;
4166 }
4167
4168 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4169 // must follow. It's operand is the stack location
4170 // that holds the starting input index for the match of this [set]*
4171 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4172 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4173 int32_t stackLoc = URX_VAL(loopcOp);
4174 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4175 fp->fExtra[stackLoc] = fp->fInputIdx;
729e4ab9
A
4176 fp->fInputIdx = ix;
4177
4178 // Save State to the URX_LOOP_C op that follows this one,
4179 // so that match failures in the following code will return to there.
4180 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4181 fp = StateSave(fp, fp->fPatIdx, status);
4182 fp->fPatIdx++;
4183 }
4184 break;
4185
4186
4187 case URX_LOOP_DOT_I:
4188 // Loop Initialization for the optimized implementation of .*
4189 // This op scans through all remaining input.
4190 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4191 {
4192 // Loop through input until the input is exhausted (we reach an end-of-line)
4193 // In DOTALL mode, we can just go straight to the end of the input.
4194 int64_t ix;
4195 if ((opValue & 1) == 1) {
4196 // Dot-matches-All mode. Jump straight to the end of the string.
4197 ix = fActiveLimit;
4198 fHitEnd = TRUE;
4199 } else {
4200 // NOT DOT ALL mode. Line endings do not match '.'
4201 // Scan forward until a line ending or end of input.
4202 ix = fp->fInputIdx;
4203 UTEXT_SETNATIVEINDEX(fInputText, ix);
4204 for (;;) {
4205 if (ix >= fActiveLimit) {
4206 fHitEnd = TRUE;
4207 break;
4208 }
4209 UChar32 c = UTEXT_NEXT32(fInputText);
4210 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
4211 if ((c == 0x0a) || // 0x0a is newline in both modes.
4212 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
b331163b 4213 isLineTerminator(c))) {
729e4ab9
A
4214 // char is a line ending. Exit the scanning loop.
4215 break;
4216 }
4217 }
4218 ix = UTEXT_GETNATIVEINDEX(fInputText);
4219 }
4220 }
4221
4222 // If there were no matching characters, skip over the loop altogether.
4223 // The loop doesn't run at all, a * op always succeeds.
4224 if (ix == fp->fInputIdx) {
4225 fp->fPatIdx++; // skip the URX_LOOP_C op.
4226 break;
4227 }
4228
4229 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4230 // must follow. It's operand is the stack location
4231 // that holds the starting input index for the match of this .*
4232 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4233 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4234 int32_t stackLoc = URX_VAL(loopcOp);
4235 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4236 fp->fExtra[stackLoc] = fp->fInputIdx;
729e4ab9
A
4237 fp->fInputIdx = ix;
4238
4239 // Save State to the URX_LOOP_C op that follows this one,
4240 // so that match failures in the following code will return to there.
4241 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4242 fp = StateSave(fp, fp->fPatIdx, status);
4243 fp->fPatIdx++;
4244 }
4245 break;
4246
4247
4248 case URX_LOOP_C:
4249 {
4250 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4251 backSearchIndex = fp->fExtra[opValue];
4252 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4253 if (backSearchIndex == fp->fInputIdx) {
4254 // We've backed up the input idx to the point that the loop started.
57a6839d 4255 // The loop is done. Leave here without saving state.
729e4ab9
A
4256 // Subsequent failures won't come back here.
4257 break;
4258 }
4259 // Set up for the next iteration of the loop, with input index
4260 // backed up by one from the last time through,
4261 // and a state save to this instruction in case the following code fails again.
4262 // (We're going backwards because this loop emulates stack unwinding, not
4263 // the initial scan forward.)
4264 U_ASSERT(fp->fInputIdx > 0);
4265 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4266 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4267 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
57a6839d 4268
729e4ab9 4269 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
57a6839d 4270 if (prevC == 0x0a &&
729e4ab9
A
4271 fp->fInputIdx > backSearchIndex &&
4272 twoPrevC == 0x0d) {
4273 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4274 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4275 // .*, stepping back over CRLF pair.
4276 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4277 }
4278 }
4279
374ca955 4280
729e4ab9
A
4281 fp = StateSave(fp, fp->fPatIdx-1, status);
4282 }
4283 break;
374ca955
A
4284
4285
729e4ab9
A
4286
4287 default:
4288 // Trouble. The compiled pattern contains an entry with an
4289 // unrecognized type tag.
4290 U_ASSERT(FALSE);
b75a7d8f 4291 }
729e4ab9
A
4292
4293 if (U_FAILURE(status)) {
4294 isMatch = FALSE;
b75a7d8f
A
4295 break;
4296 }
4297 }
57a6839d 4298
729e4ab9
A
4299breakFromLoop:
4300 fMatch = isMatch;
4301 if (isMatch) {
4302 fLastMatchEnd = fMatchEnd;
4303 fMatchStart = startIdx;
4304 fMatchEnd = fp->fInputIdx;
46f4442e 4305 }
57a6839d
A
4306
4307#ifdef REGEX_RUN_DEBUG
4308 if (fTraceDebug) {
4309 if (isMatch) {
4310 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
4311 } else {
4312 printf("No match\n\n");
46f4442e
A
4313 }
4314 }
57a6839d 4315#endif
46f4442e 4316
729e4ab9
A
4317 fFrame = fp; // The active stack frame when the engine stopped.
4318 // Contains the capture group results that we need to
4319 // access later.
4320 return;
b75a7d8f 4321}
46f4442e
A
4322
4323
b75a7d8f
A
4324//--------------------------------------------------------------------------------
4325//
729e4ab9
A
4326// MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
4327// assumption that the entire string is available in the UText's
4328// chunk buffer. For now, that means we can use int32_t indexes,
4329// except for anything that needs to be saved (like group starts
4330// and ends).
b75a7d8f 4331//
46f4442e
A
4332// startIdx: begin matching at this index.
4333// toEnd: if true, match must extend to end of the input region
4334//
b75a7d8f 4335//--------------------------------------------------------------------------------
729e4ab9 4336void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
b75a7d8f 4337 UBool isMatch = FALSE; // True if the we have a match.
57a6839d 4338
729e4ab9 4339 int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
b75a7d8f
A
4340
4341 int32_t op; // Operation from the compiled pattern, split into
4342 int32_t opType; // the opcode
4343 int32_t opValue; // and the operand value.
57a6839d 4344
729e4ab9 4345#ifdef REGEX_RUN_DEBUG
57a6839d
A
4346 if (fTraceDebug) {
4347 printf("MatchAt(startIdx=%d)\n", startIdx);
2ca993e8
A
4348 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
4349 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
b75a7d8f 4350 }
729e4ab9 4351#endif
57a6839d 4352
b75a7d8f
A
4353 if (U_FAILURE(status)) {
4354 return;
4355 }
57a6839d 4356
b75a7d8f 4357 // Cache frequently referenced items from the compiled pattern
b75a7d8f 4358 //
729e4ab9 4359 int64_t *pat = fPattern->fCompiledPat->getBuffer();
57a6839d 4360
b75a7d8f
A
4361 const UChar *litText = fPattern->fLiteralText.getBuffer();
4362 UVector *sets = fPattern->fSets;
57a6839d 4363
729e4ab9 4364 const UChar *inputBuf = fInputText->chunkContents;
57a6839d 4365
46f4442e 4366 fFrameSize = fPattern->fFrameSize;
b75a7d8f 4367 REStackFrame *fp = resetStack();
2ca993e8
A
4368 if (U_FAILURE(fDeferredStatus)) {
4369 status = fDeferredStatus;
4370 return;
4371 }
57a6839d 4372
b75a7d8f
A
4373 fp->fPatIdx = 0;
4374 fp->fInputIdx = startIdx;
57a6839d 4375
b75a7d8f
A
4376 // Zero out the pattern's static data
4377 int32_t i;
4378 for (i = 0; i<fPattern->fDataSize; i++) {
4379 fData[i] = 0;
4380 }
57a6839d 4381
b75a7d8f
A
4382 //
4383 // Main loop for interpreting the compiled pattern.
4384 // One iteration of the loop per pattern operation performed.
4385 //
4386 for (;;) {
729e4ab9 4387 op = (int32_t)pat[fp->fPatIdx];
b75a7d8f
A
4388 opType = URX_TYPE(op);
4389 opValue = URX_VAL(op);
729e4ab9 4390#ifdef REGEX_RUN_DEBUG
b75a7d8f 4391 if (fTraceDebug) {
729e4ab9 4392 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 4393 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
729e4ab9 4394 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
b75a7d8f
A
4395 fPattern->dumpOp(fp->fPatIdx);
4396 }
729e4ab9 4397#endif
b75a7d8f 4398 fp->fPatIdx++;
57a6839d 4399
b75a7d8f 4400 switch (opType) {
57a6839d
A
4401
4402
b75a7d8f
A
4403 case URX_NOP:
4404 break;
57a6839d
A
4405
4406
b75a7d8f
A
4407 case URX_BACKTRACK:
4408 // Force a backtrack. In some circumstances, the pattern compiler
4409 // will notice that the pattern can't possibly match anything, and will
4410 // emit one of these at that point.
46f4442e 4411 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f 4412 break;
57a6839d
A
4413
4414
b75a7d8f 4415 case URX_ONECHAR:
46f4442e 4416 if (fp->fInputIdx < fActiveLimit) {
729e4ab9 4417 UChar32 c;
46f4442e
A
4418 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4419 if (c == opValue) {
b75a7d8f
A
4420 break;
4421 }
46f4442e
A
4422 } else {
4423 fHitEnd = TRUE;
b75a7d8f 4424 }
729e4ab9
A
4425 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4426 break;
57a6839d
A
4427
4428
b75a7d8f
A
4429 case URX_STRING:
4430 {
4431 // Test input against a literal string.
4432 // Strings require two slots in the compiled pattern, one for the
4433 // offset to the string text, and one for the length.
4434 int32_t stringStartIdx = opValue;
4435 int32_t stringLen;
57a6839d 4436
729e4ab9 4437 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
b75a7d8f
A
4438 fp->fPatIdx++;
4439 opType = URX_TYPE(op);
4440 stringLen = URX_VAL(op);
4441 U_ASSERT(opType == URX_STRING_LEN);
4442 U_ASSERT(stringLen >= 2);
57a6839d 4443
b75a7d8f 4444 const UChar * pInp = inputBuf + fp->fInputIdx;
4388f060 4445 const UChar * pInpLimit = inputBuf + fActiveLimit;
b75a7d8f
A
4446 const UChar * pPat = litText+stringStartIdx;
4447 const UChar * pEnd = pInp + stringLen;
4388f060
A
4448 UBool success = TRUE;
4449 while (pInp < pEnd) {
4450 if (pInp >= pInpLimit) {
4451 fHitEnd = TRUE;
4452 success = FALSE;
4453 break;
4454 }
4455 if (*pInp++ != *pPat++) {
4456 success = FALSE;
b75a7d8f
A
4457 break;
4458 }
4459 }
57a6839d 4460
729e4ab9
A
4461 if (success) {
4462 fp->fInputIdx += stringLen;
4463 } else {
729e4ab9
A
4464 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4465 }
b75a7d8f 4466 }
729e4ab9 4467 break;
57a6839d
A
4468
4469
b75a7d8f 4470 case URX_STATE_SAVE:
46f4442e 4471 fp = StateSave(fp, opValue, status);
b75a7d8f 4472 break;
57a6839d
A
4473
4474
b75a7d8f
A
4475 case URX_END:
4476 // The match loop will exit via this path on a successful match,
4477 // when we reach the end of the pattern.
46f4442e
A
4478 if (toEnd && fp->fInputIdx != fActiveLimit) {
4479 // The pattern matched, but not to the end of input. Try some more.
4480 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4481 break;
4482 }
b75a7d8f
A
4483 isMatch = TRUE;
4484 goto breakFromLoop;
57a6839d 4485
729e4ab9 4486 // Start and End Capture stack frame variables are laid out out like this:
b75a7d8f
A
4487 // fp->fExtra[opValue] - The start of a completed capture group
4488 // opValue+1 - The end of a completed capture group
4489 // opValue+2 - the start of a capture group whose end
4490 // has not yet been reached (and might not ever be).
4491 case URX_START_CAPTURE:
46f4442e 4492 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
b75a7d8f
A
4493 fp->fExtra[opValue+2] = fp->fInputIdx;
4494 break;
57a6839d
A
4495
4496
b75a7d8f 4497 case URX_END_CAPTURE:
46f4442e 4498 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
b75a7d8f
A
4499 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
4500 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
4501 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
4502 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4503 break;
57a6839d
A
4504
4505
b75a7d8f 4506 case URX_DOLLAR: // $, test for End of line
729e4ab9 4507 // or for position before new line at end of input
46f4442e 4508 if (fp->fInputIdx < fAnchorLimit-2) {
b75a7d8f 4509 // We are no where near the end of input. Fail.
46f4442e
A
4510 // This is the common case. Keep it first.
4511 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4512 break;
4513 }
46f4442e 4514 if (fp->fInputIdx >= fAnchorLimit) {
b75a7d8f 4515 // We really are at the end of input. Success.
46f4442e
A
4516 fHitEnd = TRUE;
4517 fRequireEnd = TRUE;
b75a7d8f
A
4518 break;
4519 }
57a6839d 4520
b75a7d8f
A
4521 // If we are positioned just before a new-line that is located at the
4522 // end of input, succeed.
46f4442e 4523 if (fp->fInputIdx == fAnchorLimit-1) {
729e4ab9
A
4524 UChar32 c;
4525 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
57a6839d 4526
b331163b 4527 if (isLineTerminator(c)) {
46f4442e 4528 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
374ca955 4529 // At new-line at end of input. Success
46f4442e
A
4530 fHitEnd = TRUE;
4531 fRequireEnd = TRUE;
4532 break;
374ca955 4533 }
b75a7d8f 4534 }
729e4ab9
A
4535 } else if (fp->fInputIdx == fAnchorLimit-2 &&
4536 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
46f4442e
A
4537 fHitEnd = TRUE;
4538 fRequireEnd = TRUE;
b75a7d8f 4539 break; // At CR/LF at end of input. Success
b75a7d8f 4540 }
57a6839d 4541
46f4442e 4542 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
57a6839d 4543
46f4442e 4544 break;
57a6839d
A
4545
4546
729e4ab9 4547 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
46f4442e
A
4548 if (fp->fInputIdx >= fAnchorLimit-1) {
4549 // Either at the last character of input, or off the end.
4550 if (fp->fInputIdx == fAnchorLimit-1) {
4551 // At last char of input. Success if it's a new line.
729e4ab9 4552 if (inputBuf[fp->fInputIdx] == 0x0a) {
46f4442e
A
4553 fHitEnd = TRUE;
4554 fRequireEnd = TRUE;
4555 break;
4556 }
4557 } else {
4558 // Off the end of input. Success.
4559 fHitEnd = TRUE;
4560 fRequireEnd = TRUE;
4561 break;
4562 }
4563 }
57a6839d 4564
46f4442e
A
4565 // Not at end of input. Back-track out.
4566 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f 4567 break;
57a6839d
A
4568
4569
729e4ab9
A
4570 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
4571 {
4572 if (fp->fInputIdx >= fAnchorLimit) {
4573 // We really are at the end of input. Success.
4574 fHitEnd = TRUE;
4575 fRequireEnd = TRUE;
4576 break;
4577 }
4578 // If we are positioned just before a new-line, succeed.
4579 // It makes no difference where the new-line is within the input.
4580 UChar32 c = inputBuf[fp->fInputIdx];
b331163b 4581 if (isLineTerminator(c)) {
729e4ab9
A
4582 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4583 // In multi-line mode, hitting a new-line just before the end of input does not
4584 // set the hitEnd or requireEnd flags
4585 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
46f4442e 4586 break;
729e4ab9
A
4587 }
4588 }
4589 // not at a new line. Fail.
4590 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4591 }
4592 break;
57a6839d
A
4593
4594
729e4ab9
A
4595 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
4596 {
4597 if (fp->fInputIdx >= fAnchorLimit) {
4598 // We really are at the end of input. Success.
4599 fHitEnd = TRUE;
4600 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
4601 break; // adding a new-line would not lose the match.
4602 }
4603 // If we are not positioned just before a new-line, the test fails; backtrack out.
4604 // It makes no difference where the new-line is within the input.
4605 if (inputBuf[fp->fInputIdx] != 0x0a) {
4606 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4607 }
4608 }
4609 break;
57a6839d
A
4610
4611
729e4ab9 4612 case URX_CARET: // ^, test for start of line
46f4442e
A
4613 if (fp->fInputIdx != fAnchorStart) {
4614 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4615 }
b75a7d8f 4616 break;
57a6839d
A
4617
4618
729e4ab9
A
4619 case URX_CARET_M: // ^, test for start of line in mulit-line mode
4620 {
4621 if (fp->fInputIdx == fAnchorStart) {
4622 // We are at the start input. Success.
4623 break;
4624 }
4625 // Check whether character just before the current pos is a new-line
4626 // unless we are at the end of input
57a6839d
A
4627 UChar c = inputBuf[fp->fInputIdx - 1];
4628 if ((fp->fInputIdx < fAnchorLimit) &&
b331163b 4629 isLineTerminator(c)) {
729e4ab9
A
4630 // It's a new-line. ^ is true. Success.
4631 // TODO: what should be done with positions between a CR and LF?
4632 break;
4633 }
4634 // Not at the start of a line. Fail.
4635 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4636 }
4637 break;
57a6839d
A
4638
4639
729e4ab9
A
4640 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
4641 {
4642 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4643 if (fp->fInputIdx <= fAnchorStart) {
4644 // We are at the start input. Success.
4645 break;
4646 }
4647 // Check whether character just before the current pos is a new-line
4648 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
57a6839d 4649 UChar c = inputBuf[fp->fInputIdx - 1];
729e4ab9
A
4650 if (c != 0x0a) {
4651 // Not at the start of a line. Back-track out.
4652 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4653 }
4654 }
4655 break;
57a6839d 4656
b75a7d8f
A
4657 case URX_BACKSLASH_B: // Test for word boundaries
4658 {
729e4ab9 4659 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
51004dcb 4660 success ^= (UBool)(opValue != 0); // flip sense for \B
b75a7d8f 4661 if (!success) {
46f4442e 4662 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4663 }
4664 }
4665 break;
57a6839d
A
4666
4667
374ca955
A
4668 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
4669 {
4670 UBool success = isUWordBoundary(fp->fInputIdx);
51004dcb 4671 success ^= (UBool)(opValue != 0); // flip sense for \B
374ca955 4672 if (!success) {
46f4442e 4673 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
374ca955
A
4674 }
4675 }
4676 break;
57a6839d
A
4677
4678
b75a7d8f
A
4679 case URX_BACKSLASH_D: // Test for decimal digit
4680 {
46f4442e
A
4681 if (fp->fInputIdx >= fActiveLimit) {
4682 fHitEnd = TRUE;
4683 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4684 break;
4685 }
57a6839d 4686
729e4ab9
A
4687 UChar32 c;
4688 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
46f4442e 4689 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
b75a7d8f 4690 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
51004dcb 4691 success ^= (UBool)(opValue != 0); // flip sense for \D
729e4ab9 4692 if (!success) {
46f4442e 4693 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4694 }
4695 }
4696 break;
57a6839d
A
4697
4698
b75a7d8f 4699 case URX_BACKSLASH_G: // Test for position at end of previous match
729e4ab9 4700 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
46f4442e 4701 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4702 }
4703 break;
57a6839d
A
4704
4705
b331163b
A
4706 case URX_BACKSLASH_H: // Test for \h, horizontal white space.
4707 {
4708 if (fp->fInputIdx >= fActiveLimit) {
4709 fHitEnd = TRUE;
4710 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4711 break;
4712 }
4713 UChar32 c;
4714 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4715 int8_t ctype = u_charType(c);
4716 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
4717 success ^= (UBool)(opValue != 0); // flip sense for \H
4718 if (!success) {
4719 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4720 }
4721 }
4722 break;
4723
4724
4725 case URX_BACKSLASH_R: // Test for \R, any line break sequence.
4726 {
4727 if (fp->fInputIdx >= fActiveLimit) {
4728 fHitEnd = TRUE;
4729 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4730 break;
4731 }
4732 UChar32 c;
4733 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4734 if (isLineTerminator(c)) {
4735 if (c == 0x0d && fp->fInputIdx < fActiveLimit) {
4736 // Check for CR/LF sequence. Consume both together when found.
4737 UChar c2;
4738 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2);
4739 if (c2 != 0x0a) {
4740 U16_PREV(inputBuf, 0, fp->fInputIdx, c2);
4741 }
4742 }
4743 } else {
4744 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4745 }
4746 }
4747 break;
4748
4749
4750 case URX_BACKSLASH_V: // Any single code point line ending.
4751 {
4752 if (fp->fInputIdx >= fActiveLimit) {
4753 fHitEnd = TRUE;
4754 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4755 break;
4756 }
4757 UChar32 c;
4758 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4759 UBool success = isLineTerminator(c);
4760 success ^= (UBool)(opValue != 0); // flip sense for \V
4761 if (!success) {
4762 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4763 }
4764 }
4765 break;
4766
4767
4768
57a6839d 4769 case URX_BACKSLASH_X:
729e4ab9
A
4770 // Match a Grapheme, as defined by Unicode TR 29.
4771 // Differs slightly from Perl, which consumes combining marks independently
4772 // of context.
4773 {
b75a7d8f 4774
729e4ab9
A
4775 // Fail if at end of input
4776 if (fp->fInputIdx >= fActiveLimit) {
4777 fHitEnd = TRUE;
4778 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4779 break;
4780 }
b75a7d8f 4781
729e4ab9
A
4782 // Examine (and consume) the current char.
4783 // Dispatch into a little state machine, based on the char.
4784 UChar32 c;
4785 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4786 UnicodeSet **sets = fPattern->fStaticSets;
4787 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
4788 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
4789 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4790 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4791 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4792 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4793 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4794 goto GC_Extend;
b75a7d8f
A
4795
4796
4797
4798GC_L:
729e4ab9
A
4799 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4800 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4801 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4802 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4803 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4804 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4805 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4806 goto GC_Extend;
b75a7d8f
A
4807
4808GC_V:
729e4ab9
A
4809 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4810 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4811 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4812 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4813 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4814 goto GC_Extend;
b75a7d8f
A
4815
4816GC_T:
729e4ab9
A
4817 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4818 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4819 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4820 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4821 goto GC_Extend;
b75a7d8f
A
4822
4823GC_Extend:
729e4ab9
A
4824 // Combining characters are consumed here
4825 for (;;) {
4826 if (fp->fInputIdx >= fActiveLimit) {
4827 break;
b75a7d8f 4828 }
729e4ab9
A
4829 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4830 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4831 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4832 break;
4833 }
4834 }
4835 goto GC_Done;
b75a7d8f
A
4836
4837GC_Control:
57a6839d 4838 // Most control chars stand alone (don't combine with combining chars),
729e4ab9
A
4839 // except for that CR/LF sequence is a single grapheme cluster.
4840 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
4841 fp->fInputIdx++;
4842 }
b75a7d8f
A
4843
4844GC_Done:
729e4ab9
A
4845 if (fp->fInputIdx >= fActiveLimit) {
4846 fHitEnd = TRUE;
b75a7d8f 4847 }
729e4ab9
A
4848 break;
4849 }
57a6839d
A
4850
4851
4852
4853
46f4442e
A
4854 case URX_BACKSLASH_Z: // Test for end of Input
4855 if (fp->fInputIdx < fAnchorLimit) {
4856 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4857 } else {
4858 fHitEnd = TRUE;
4859 fRequireEnd = TRUE;
b75a7d8f
A
4860 }
4861 break;
57a6839d
A
4862
4863
4864
b75a7d8f
A
4865 case URX_STATIC_SETREF:
4866 {
4867 // Test input character against one of the predefined sets
4868 // (Word Characters, for example)
4869 // The high bit of the op value is a flag for the match polarity.
4870 // 0: success if input char is in set.
4871 // 1: success if input char is not in set.
46f4442e
A
4872 if (fp->fInputIdx >= fActiveLimit) {
4873 fHitEnd = TRUE;
4874 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4875 break;
4876 }
57a6839d
A
4877
4878 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
b75a7d8f
A
4879 opValue &= ~URX_NEG_SET;
4880 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
57a6839d 4881
729e4ab9 4882 UChar32 c;
46f4442e 4883 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
b75a7d8f
A
4884 if (c < 256) {
4885 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4886 if (s8->contains(c)) {
4887 success = !success;
4888 }
4889 } else {
4890 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4891 if (s->contains(c)) {
4892 success = !success;
4893 }
4894 }
4895 if (!success) {
46f4442e 4896 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4897 }
4898 }
4899 break;
57a6839d
A
4900
4901
b75a7d8f
A
4902 case URX_STAT_SETREF_N:
4903 {
57a6839d 4904 // Test input character for NOT being a member of one of
b75a7d8f 4905 // the predefined sets (Word Characters, for example)
46f4442e
A
4906 if (fp->fInputIdx >= fActiveLimit) {
4907 fHitEnd = TRUE;
4908 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4909 break;
4910 }
57a6839d 4911
b75a7d8f 4912 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
57a6839d 4913
b75a7d8f 4914 UChar32 c;
46f4442e 4915 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
b75a7d8f
A
4916 if (c < 256) {
4917 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4918 if (s8->contains(c) == FALSE) {
4919 break;
4920 }
4921 } else {
4922 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4923 if (s->contains(c) == FALSE) {
4924 break;
4925 }
4926 }
46f4442e 4927 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4928 }
4929 break;
57a6839d
A
4930
4931
b75a7d8f 4932 case URX_SETREF:
729e4ab9
A
4933 {
4934 if (fp->fInputIdx >= fActiveLimit) {
4935 fHitEnd = TRUE;
4936 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
46f4442e
A
4937 break;
4938 }
57a6839d 4939
729e4ab9
A
4940 U_ASSERT(opValue > 0 && opValue < sets->size());
4941
4942 // There is input left. Pick up one char and test it for set membership.
4943 UChar32 c;
4944 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4945 if (c<256) {
4946 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4947 if (s8->contains(c)) {
4948 // The character is in the set. A Match.
4949 break;
4950 }
4951 } else {
4952 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4953 if (s->contains(c)) {
4954 // The character is in the set. A Match.
4955 break;
4956 }
4957 }
57a6839d 4958
729e4ab9 4959 // the character wasn't in the set.
729e4ab9 4960 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
46f4442e 4961 }
b75a7d8f 4962 break;
57a6839d
A
4963
4964
b75a7d8f
A
4965 case URX_DOTANY:
4966 {
4967 // . matches anything, but stops at end-of-line.
46f4442e 4968 if (fp->fInputIdx >= fActiveLimit) {
b75a7d8f 4969 // At end of input. Match failed. Backtrack out.
46f4442e
A
4970 fHitEnd = TRUE;
4971 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4972 break;
4973 }
57a6839d 4974
b75a7d8f 4975 // There is input left. Advance over one char, unless we've hit end-of-line
729e4ab9 4976 UChar32 c;
46f4442e 4977 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
b331163b 4978 if (isLineTerminator(c)) {
b75a7d8f 4979 // End of line in normal mode. . does not match.
729e4ab9 4980 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4981 break;
4982 }
4983 }
4984 break;
57a6839d
A
4985
4986
b75a7d8f
A
4987 case URX_DOTANY_ALL:
4988 {
729e4ab9 4989 // . in dot-matches-all (including new lines) mode
46f4442e 4990 if (fp->fInputIdx >= fActiveLimit) {
b75a7d8f 4991 // At end of input. Match failed. Backtrack out.
46f4442e
A
4992 fHitEnd = TRUE;
4993 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4994 break;
4995 }
57a6839d 4996
b75a7d8f
A
4997 // There is input left. Advance over one char, except if we are
4998 // at a cr/lf, advance over both of them.
57a6839d 4999 UChar32 c;
46f4442e
A
5000 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5001 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
b75a7d8f 5002 // In the case of a CR/LF, we need to advance over both.
729e4ab9
A
5003 if (inputBuf[fp->fInputIdx] == 0x0a) {
5004 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
b75a7d8f
A
5005 }
5006 }
5007 }
5008 break;
57a6839d
A
5009
5010
46f4442e 5011 case URX_DOTANY_UNIX:
b75a7d8f 5012 {
46f4442e
A
5013 // '.' operator, matches all, but stops at end-of-line.
5014 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
5015 if (fp->fInputIdx >= fActiveLimit) {
5016 // At end of input. Match failed. Backtrack out.
5017 fHitEnd = TRUE;
5018 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
5019 break;
5020 }
57a6839d 5021
46f4442e 5022 // There is input left. Advance over one char, unless we've hit end-of-line
57a6839d 5023 UChar32 c;
46f4442e
A
5024 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5025 if (c == 0x0a) {
5026 // End of line in UNIX_LINES mode. '.' does not match the \n
5027 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
5028 }
5029 }
5030 break;
57a6839d
A
5031
5032
b75a7d8f
A
5033 case URX_JMP:
5034 fp->fPatIdx = opValue;
5035 break;
57a6839d 5036
b75a7d8f
A
5037 case URX_FAIL:
5038 isMatch = FALSE;
5039 goto breakFromLoop;
57a6839d 5040
b75a7d8f
A
5041 case URX_JMP_SAV:
5042 U_ASSERT(opValue < fPattern->fCompiledPat->size());
46f4442e
A
5043 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
5044 fp->fPatIdx = opValue; // Then JMP.
b75a7d8f 5045 break;
57a6839d 5046
b75a7d8f
A
5047 case URX_JMP_SAV_X:
5048 // This opcode is used with (x)+, when x can match a zero length string.
5049 // Same as JMP_SAV, except conditional on the match having made forward progress.
5050 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
5051 // data address of the input position at the start of the loop.
5052 {
5053 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
729e4ab9 5054 int32_t stoOp = (int32_t)pat[opValue-1];
b75a7d8f
A
5055 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
5056 int32_t frameLoc = URX_VAL(stoOp);
46f4442e 5057 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
729e4ab9 5058 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
b75a7d8f
A
5059 U_ASSERT(prevInputIdx <= fp->fInputIdx);
5060 if (prevInputIdx < fp->fInputIdx) {
5061 // The match did make progress. Repeat the loop.
46f4442e 5062 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
b75a7d8f
A
5063 fp->fPatIdx = opValue;
5064 fp->fExtra[frameLoc] = fp->fInputIdx;
57a6839d 5065 }
b75a7d8f
A
5066 // If the input position did not advance, we do nothing here,
5067 // execution will fall out of the loop.
5068 }
5069 break;
57a6839d 5070
b75a7d8f
A
5071 case URX_CTR_INIT:
5072 {
46f4442e 5073 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
57a6839d
A
5074 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5075
b75a7d8f 5076 // Pick up the three extra operands that CTR_INIT has, and
57a6839d 5077 // skip the pattern location counter past
729e4ab9 5078 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
b75a7d8f
A
5079 fp->fPatIdx += 3;
5080 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
729e4ab9
A
5081 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5082 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
b75a7d8f
A
5083 U_ASSERT(minCount>=0);
5084 U_ASSERT(maxCount>=minCount || maxCount==-1);
57a6839d
A
5085 U_ASSERT(loopLoc>=fp->fPatIdx);
5086
b75a7d8f 5087 if (minCount == 0) {
46f4442e 5088 fp = StateSave(fp, loopLoc+1, status);
b75a7d8f 5089 }
57a6839d
A
5090 if (maxCount == -1) {
5091 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
5092 } else if (maxCount == 0) {
46f4442e 5093 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
5094 }
5095 }
5096 break;
57a6839d 5097
b75a7d8f
A
5098 case URX_CTR_LOOP:
5099 {
5100 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
729e4ab9 5101 int32_t initOp = (int32_t)pat[opValue];
b75a7d8f 5102 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
729e4ab9
A
5103 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5104 int32_t minCount = (int32_t)pat[opValue+2];
5105 int32_t maxCount = (int32_t)pat[opValue+3];
b75a7d8f 5106 (*pCounter)++;
57a6839d
A
5107 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5108 U_ASSERT(*pCounter == maxCount);
b75a7d8f
A
5109 break;
5110 }
5111 if (*pCounter >= minCount) {
57a6839d
A
5112 if (maxCount == -1) {
5113 // Loop has no hard upper bound.
5114 // Check that it is progressing through the input, break if it is not.
5115 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5116 if (fp->fInputIdx == *pLastInputIdx) {
5117 break;
5118 } else {
5119 *pLastInputIdx = fp->fInputIdx;
5120 }
5121 }
46f4442e 5122 fp = StateSave(fp, fp->fPatIdx, status);
f3c0d7a5
A
5123 } else {
5124 // Increment time-out counter. (StateSave() does it if count >= minCount)
5125 fTickCounter--;
5126 if (fTickCounter <= 0) {
5127 IncrementTime(status); // Re-initializes fTickCounter
5128 }
b75a7d8f
A
5129 }
5130 fp->fPatIdx = opValue + 4; // Loop back.
5131 }
5132 break;
57a6839d 5133
b75a7d8f
A
5134 case URX_CTR_INIT_NG:
5135 {
46f4442e
A
5136 // Initialize a non-greedy loop
5137 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
57a6839d
A
5138 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5139
5140 // Pick up the three extra operands that CTR_INIT_NG has, and
5141 // skip the pattern location counter past
729e4ab9 5142 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
b75a7d8f
A
5143 fp->fPatIdx += 3;
5144 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
729e4ab9
A
5145 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5146 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
b75a7d8f
A
5147 U_ASSERT(minCount>=0);
5148 U_ASSERT(maxCount>=minCount || maxCount==-1);
5149 U_ASSERT(loopLoc>fp->fPatIdx);
57a6839d
A
5150 if (maxCount == -1) {
5151 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
5152 }
5153
b75a7d8f
A
5154 if (minCount == 0) {
5155 if (maxCount != 0) {
46f4442e 5156 fp = StateSave(fp, fp->fPatIdx, status);
b75a7d8f
A
5157 }
5158 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
57a6839d 5159 }
b75a7d8f
A
5160 }
5161 break;
57a6839d 5162
b75a7d8f
A
5163 case URX_CTR_LOOP_NG:
5164 {
46f4442e 5165 // Non-greedy {min, max} loops
b75a7d8f 5166 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
729e4ab9 5167 int32_t initOp = (int32_t)pat[opValue];
b75a7d8f 5168 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
729e4ab9
A
5169 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5170 int32_t minCount = (int32_t)pat[opValue+2];
5171 int32_t maxCount = (int32_t)pat[opValue+3];
57a6839d 5172
b75a7d8f 5173 (*pCounter)++;
57a6839d 5174 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
b75a7d8f
A
5175 // The loop has matched the maximum permitted number of times.
5176 // Break out of here with no action. Matching will
5177 // continue with the following pattern.
57a6839d 5178 U_ASSERT(*pCounter == maxCount);
b75a7d8f
A
5179 break;
5180 }
57a6839d 5181
b75a7d8f
A
5182 if (*pCounter < minCount) {
5183 // We haven't met the minimum number of matches yet.
5184 // Loop back for another one.
5185 fp->fPatIdx = opValue + 4; // Loop back.
f3c0d7a5
A
5186 fTickCounter--;
5187 if (fTickCounter <= 0) {
5188 IncrementTime(status); // Re-initializes fTickCounter
5189 }
b75a7d8f
A
5190 } else {
5191 // We do have the minimum number of matches.
57a6839d
A
5192
5193 // If there is no upper bound on the loop iterations, check that the input index
5194 // is progressing, and stop the loop if it is not.
5195 if (maxCount == -1) {
5196 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5197 if (fp->fInputIdx == *pLastInputIdx) {
5198 break;
5199 }
5200 *pLastInputIdx = fp->fInputIdx;
5201 }
5202
5203 // Loop Continuation: we will fall into the pattern following the loop
5204 // (non-greedy, don't execute loop body first), but first do
5205 // a state save to the top of the loop, so that a match failure
b75a7d8f 5206 // in the following pattern will try another iteration of the loop.
46f4442e 5207 fp = StateSave(fp, opValue + 4, status);
b75a7d8f
A
5208 }
5209 }
5210 break;
57a6839d 5211
b75a7d8f
A
5212 case URX_STO_SP:
5213 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5214 fData[opValue] = fStack->size();
5215 break;
57a6839d 5216
b75a7d8f
A
5217 case URX_LD_SP:
5218 {
5219 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
729e4ab9 5220 int32_t newStackSize = (int32_t)fData[opValue];
b75a7d8f 5221 U_ASSERT(newStackSize <= fStack->size());
729e4ab9
A
5222 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5223 if (newFP == (int64_t *)fp) {
b75a7d8f
A
5224 break;
5225 }
5226 int32_t i;
46f4442e 5227 for (i=0; i<fFrameSize; i++) {
729e4ab9 5228 newFP[i] = ((int64_t *)fp)[i];
b75a7d8f
A
5229 }
5230 fp = (REStackFrame *)newFP;
5231 fStack->setSize(newStackSize);
5232 }
5233 break;
57a6839d 5234
b75a7d8f 5235 case URX_BACKREF:
4388f060
A
5236 {
5237 U_ASSERT(opValue < fFrameSize);
5238 int64_t groupStartIdx = fp->fExtra[opValue];
5239 int64_t groupEndIdx = fp->fExtra[opValue+1];
5240 U_ASSERT(groupStartIdx <= groupEndIdx);
5241 int64_t inputIndex = fp->fInputIdx;
5242 if (groupStartIdx < 0) {
5243 // This capture group has not participated in the match thus far,
5244 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5245 break;
5246 }
5247 UBool success = TRUE;
5248 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
5249 if (inputIndex >= fActiveLimit) {
5250 success = FALSE;
5251 fHitEnd = TRUE;
5252 break;
5253 }
5254 if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
5255 success = FALSE;
5256 break;
5257 }
5258 }
2ca993e8
A
5259 if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) &&
5260 inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) {
5261 // Capture group ended with an unpaired lead surrogate.
5262 // Back reference is not permitted to match lead only of a surrogate pair.
5263 success = FALSE;
5264 }
4388f060
A
5265 if (success) {
5266 fp->fInputIdx = inputIndex;
5267 } else {
5268 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5269 }
5270 }
5271 break;
57a6839d 5272
b75a7d8f
A
5273 case URX_BACKREF_I:
5274 {
46f4442e 5275 U_ASSERT(opValue < fFrameSize);
729e4ab9
A
5276 int64_t groupStartIdx = fp->fExtra[opValue];
5277 int64_t groupEndIdx = fp->fExtra[opValue+1];
b75a7d8f 5278 U_ASSERT(groupStartIdx <= groupEndIdx);
b75a7d8f
A
5279 if (groupStartIdx < 0) {
5280 // This capture group has not participated in the match thus far,
46f4442e 5281 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
4388f060 5282 break;
b75a7d8f 5283 }
4388f060
A
5284 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
5285 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
b75a7d8f 5286
4388f060 5287 // Note: if the capture group match was of an empty string the backref
57a6839d 5288 // match succeeds. Verified by testing: Perl matches succeed
4388f060 5289 // in this case, so we do too.
57a6839d 5290
4388f060
A
5291 UBool success = TRUE;
5292 for (;;) {
5293 UChar32 captureGroupChar = captureGroupItr.next();
5294 if (captureGroupChar == U_SENTINEL) {
5295 success = TRUE;
b75a7d8f
A
5296 break;
5297 }
4388f060
A
5298 UChar32 inputChar = inputItr.next();
5299 if (inputChar == U_SENTINEL) {
5300 success = FALSE;
5301 fHitEnd = TRUE;
5302 break;
b75a7d8f 5303 }
4388f060
A
5304 if (inputChar != captureGroupChar) {
5305 success = FALSE;
5306 break;
5307 }
5308 }
5309
5310 if (success && inputItr.inExpansion()) {
57a6839d
A
5311 // We obtained a match by consuming part of a string obtained from
5312 // case-folding a single code point of the input text.
4388f060
A
5313 // This does not count as an overall match.
5314 success = FALSE;
b75a7d8f 5315 }
4388f060
A
5316
5317 if (success) {
5318 fp->fInputIdx = inputItr.getIndex();
b75a7d8f 5319 } else {
4388f060 5320 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
5321 }
5322 }
5323 break;
4388f060 5324
b75a7d8f
A
5325 case URX_STO_INP_LOC:
5326 {
46f4442e 5327 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
b75a7d8f
A
5328 fp->fExtra[opValue] = fp->fInputIdx;
5329 }
5330 break;
57a6839d 5331
b75a7d8f
A
5332 case URX_JMPX:
5333 {
729e4ab9 5334 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
b75a7d8f
A
5335 fp->fPatIdx += 1;
5336 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
46f4442e 5337 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
729e4ab9 5338 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
b75a7d8f
A
5339 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5340 if (savedInputIdx < fp->fInputIdx) {
5341 fp->fPatIdx = opValue; // JMP
5342 } else {
729e4ab9 5343 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
b75a7d8f
A
5344 }
5345 }
5346 break;
57a6839d 5347
b75a7d8f
A
5348 case URX_LA_START:
5349 {
5350 // Entering a lookahead block.
5351 // Save Stack Ptr, Input Pos.
5352 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5353 fData[opValue] = fStack->size();
5354 fData[opValue+1] = fp->fInputIdx;
46f4442e
A
5355 fActiveStart = fLookStart; // Set the match region change for
5356 fActiveLimit = fLookLimit; // transparent bounds.
b75a7d8f
A
5357 }
5358 break;
57a6839d 5359
b75a7d8f
A
5360 case URX_LA_END:
5361 {
5362 // Leaving a look-ahead block.
5363 // restore Stack Ptr, Input Pos to positions they had on entry to block.
5364 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5365 int32_t stackSize = fStack->size();
729e4ab9 5366 int32_t newStackSize = (int32_t)fData[opValue];
b75a7d8f
A
5367 U_ASSERT(stackSize >= newStackSize);
5368 if (stackSize > newStackSize) {
46f4442e
A
5369 // Copy the current top frame back to the new (cut back) top frame.
5370 // This makes the capture groups from within the look-ahead
5371 // expression available.
729e4ab9 5372 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
b75a7d8f 5373 int32_t i;
46f4442e 5374 for (i=0; i<fFrameSize; i++) {
729e4ab9 5375 newFP[i] = ((int64_t *)fp)[i];
b75a7d8f
A
5376 }
5377 fp = (REStackFrame *)newFP;
5378 fStack->setSize(newStackSize);
5379 }
5380 fp->fInputIdx = fData[opValue+1];
57a6839d 5381
46f4442e
A
5382 // Restore the active region bounds in the input string; they may have
5383 // been changed because of transparent bounds on a Region.
5384 fActiveStart = fRegionStart;
5385 fActiveLimit = fRegionLimit;
b75a7d8f
A
5386 }
5387 break;
57a6839d 5388
b75a7d8f 5389 case URX_ONECHAR_I:
46f4442e 5390 if (fp->fInputIdx < fActiveLimit) {
57a6839d 5391 UChar32 c;
46f4442e
A
5392 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5393 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
b75a7d8f
A
5394 break;
5395 }
46f4442e
A
5396 } else {
5397 fHitEnd = TRUE;
5398 }
5399 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f 5400 break;
57a6839d 5401
b75a7d8f 5402 case URX_STRING_I:
4388f060
A
5403 // Case-insensitive test input against a literal string.
5404 // Strings require two slots in the compiled pattern, one for the
5405 // offset to the string text, and one for the length.
5406 // The compiled string has already been case folded.
b75a7d8f 5407 {
4388f060
A
5408 const UChar *patternString = litText + opValue;
5409
5410 op = (int32_t)pat[fp->fPatIdx];
5411 fp->fPatIdx++;
5412 opType = URX_TYPE(op);
5413 opValue = URX_VAL(op);
5414 U_ASSERT(opType == URX_STRING_LEN);
5415 int32_t patternStringLen = opValue; // Length of the string from the pattern.
57a6839d 5416
4388f060
A
5417 UChar32 cText;
5418 UChar32 cPattern;
5419 UBool success = TRUE;
5420 int32_t patternStringIdx = 0;
5421 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5422 while (patternStringIdx < patternStringLen) {
5423 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
5424 cText = inputIterator.next();
5425 if (cText != cPattern) {
5426 success = FALSE;
5427 if (cText == U_SENTINEL) {
5428 fHitEnd = TRUE;
729e4ab9 5429 }
4388f060 5430 break;
374ca955 5431 }
46f4442e 5432 }
4388f060
A
5433 if (inputIterator.inExpansion()) {
5434 success = FALSE;
5435 }
5436
5437 if (success) {
5438 fp->fInputIdx = inputIterator.getIndex();
5439 } else {
5440 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5441 }
b75a7d8f
A
5442 }
5443 break;
4388f060 5444
b75a7d8f
A
5445 case URX_LB_START:
5446 {
5447 // Entering a look-behind block.
5448 // Save Stack Ptr, Input Pos.
46f4442e 5449 // TODO: implement transparent bounds. Ticket #6067
b75a7d8f
A
5450 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5451 fData[opValue] = fStack->size();
5452 fData[opValue+1] = fp->fInputIdx;
5453 // Init the variable containing the start index for attempted matches.
5454 fData[opValue+2] = -1;
5455 // Save input string length, then reset to pin any matches to end at
5456 // the current position.
46f4442e
A
5457 fData[opValue+3] = fActiveLimit;
5458 fActiveLimit = fp->fInputIdx;
b75a7d8f
A
5459 }
5460 break;
57a6839d
A
5461
5462
b75a7d8f
A
5463 case URX_LB_CONT:
5464 {
5465 // Positive Look-Behind, at top of loop checking for matches of LB expression
5466 // at all possible input starting positions.
57a6839d 5467
b75a7d8f
A
5468 // Fetch the min and max possible match lengths. They are the operands
5469 // of this op in the pattern.
729e4ab9
A
5470 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5471 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
b75a7d8f
A
5472 U_ASSERT(minML <= maxML);
5473 U_ASSERT(minML >= 0);
57a6839d 5474
b75a7d8f
A
5475 // Fetch (from data) the last input index where a match was attempted.
5476 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2ca993e8
A
5477 int64_t &lbStartIdx = fData[opValue+2];
5478 if (lbStartIdx < 0) {
b75a7d8f 5479 // First time through loop.
2ca993e8 5480 lbStartIdx = fp->fInputIdx - minML;
0f5d89e8 5481 if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
2ca993e8
A
5482 U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5483 }
b75a7d8f
A
5484 } else {
5485 // 2nd through nth time through the loop.
5486 // Back up start position for match by one.
2ca993e8
A
5487 if (lbStartIdx == 0) {
5488 lbStartIdx--;
b75a7d8f 5489 } else {
2ca993e8 5490 U16_BACK_1(inputBuf, 0, lbStartIdx);
b75a7d8f
A
5491 }
5492 }
57a6839d 5493
2ca993e8 5494 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
b75a7d8f
A
5495 // We have tried all potential match starting points without
5496 // getting a match. Backtrack out, and out of the
5497 // Look Behind altogether.
46f4442e 5498 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
729e4ab9 5499 int64_t restoreInputLen = fData[opValue+3];
46f4442e 5500 U_ASSERT(restoreInputLen >= fActiveLimit);
729e4ab9 5501 U_ASSERT(restoreInputLen <= fInputLength);
46f4442e 5502 fActiveLimit = restoreInputLen;
b75a7d8f
A
5503 break;
5504 }
57a6839d 5505
b75a7d8f
A
5506 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5507 // (successful match will fall off the end of the loop.)
46f4442e 5508 fp = StateSave(fp, fp->fPatIdx-3, status);
2ca993e8 5509 fp->fInputIdx = lbStartIdx;
b75a7d8f
A
5510 }
5511 break;
57a6839d 5512
b75a7d8f
A
5513 case URX_LB_END:
5514 // End of a look-behind block, after a successful match.
5515 {
5516 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
46f4442e 5517 if (fp->fInputIdx != fActiveLimit) {
b75a7d8f
A
5518 // The look-behind expression matched, but the match did not
5519 // extend all the way to the point that we are looking behind from.
5520 // FAIL out of here, which will take us back to the LB_CONT, which
5521 // will retry the match starting at another position or fail
5522 // the look-behind altogether, whichever is appropriate.
46f4442e 5523 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
5524 break;
5525 }
57a6839d 5526
b75a7d8f 5527 // Look-behind match is good. Restore the original input string length,
57a6839d 5528 // which had been truncated to pin the end of the lookbehind match to the
b75a7d8f 5529 // position being looked-behind.
729e4ab9 5530 int64_t originalInputLen = fData[opValue+3];
46f4442e 5531 U_ASSERT(originalInputLen >= fActiveLimit);
729e4ab9 5532 U_ASSERT(originalInputLen <= fInputLength);
46f4442e 5533 fActiveLimit = originalInputLen;
b75a7d8f
A
5534 }
5535 break;
57a6839d
A
5536
5537
b75a7d8f
A
5538 case URX_LBN_CONT:
5539 {
5540 // Negative Look-Behind, at top of loop checking for matches of LB expression
5541 // at all possible input starting positions.
57a6839d 5542
b75a7d8f 5543 // Fetch the extra parameters of this op.
729e4ab9
A
5544 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5545 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5546 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5547 continueLoc = URX_VAL(continueLoc);
b75a7d8f
A
5548 U_ASSERT(minML <= maxML);
5549 U_ASSERT(minML >= 0);
5550 U_ASSERT(continueLoc > fp->fPatIdx);
57a6839d 5551
b75a7d8f
A
5552 // Fetch (from data) the last input index where a match was attempted.
5553 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2ca993e8
A
5554 int64_t &lbStartIdx = fData[opValue+2];
5555 if (lbStartIdx < 0) {
b75a7d8f 5556 // First time through loop.
2ca993e8 5557 lbStartIdx = fp->fInputIdx - minML;
0f5d89e8 5558 if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
2ca993e8
A
5559 U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5560 }
b75a7d8f
A
5561 } else {
5562 // 2nd through nth time through the loop.
5563 // Back up start position for match by one.
2ca993e8
A
5564 if (lbStartIdx == 0) {
5565 lbStartIdx--; // Because U16_BACK is unsafe starting at 0.
b75a7d8f 5566 } else {
2ca993e8 5567 U16_BACK_1(inputBuf, 0, lbStartIdx);
b75a7d8f
A
5568 }
5569 }
57a6839d 5570
2ca993e8 5571 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
b75a7d8f
A
5572 // We have tried all potential match starting points without
5573 // getting a match, which means that the negative lookbehind as
5574 // a whole has succeeded. Jump forward to the continue location
729e4ab9 5575 int64_t restoreInputLen = fData[opValue+3];
46f4442e 5576 U_ASSERT(restoreInputLen >= fActiveLimit);
729e4ab9 5577 U_ASSERT(restoreInputLen <= fInputLength);
46f4442e 5578 fActiveLimit = restoreInputLen;
b75a7d8f
A
5579 fp->fPatIdx = continueLoc;
5580 break;
5581 }
57a6839d 5582
b75a7d8f
A
5583 // Save state to this URX_LBN_CONT op, so failure to match will repeat the loop.
5584 // (successful match will cause a FAIL out of the loop altogether.)
46f4442e 5585 fp = StateSave(fp, fp->fPatIdx-4, status);
2ca993e8 5586 fp->fInputIdx = lbStartIdx;
b75a7d8f
A
5587 }
5588 break;
57a6839d 5589
b75a7d8f
A
5590 case URX_LBN_END:
5591 // End of a negative look-behind block, after a successful match.
5592 {
5593 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
46f4442e 5594 if (fp->fInputIdx != fActiveLimit) {
b75a7d8f
A
5595 // The look-behind expression matched, but the match did not
5596 // extend all the way to the point that we are looking behind from.
5597 // FAIL out of here, which will take us back to the LB_CONT, which
5598 // will retry the match starting at another position or succeed
5599 // the look-behind altogether, whichever is appropriate.
46f4442e 5600 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
5601 break;
5602 }
57a6839d 5603
b75a7d8f
A
5604 // Look-behind expression matched, which means look-behind test as
5605 // a whole Fails
57a6839d
A
5606
5607 // Restore the original input string length, which had been truncated
5608 // in order to pin the end of the lookbehind match
b75a7d8f 5609 // to the position being looked-behind.
729e4ab9 5610 int64_t originalInputLen = fData[opValue+3];
46f4442e 5611 U_ASSERT(originalInputLen >= fActiveLimit);
729e4ab9 5612 U_ASSERT(originalInputLen <= fInputLength);
46f4442e 5613 fActiveLimit = originalInputLen;
57a6839d 5614
b75a7d8f
A
5615 // Restore original stack position, discarding any state saved
5616 // by the successful pattern match.
5617 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
729e4ab9 5618 int32_t newStackSize = (int32_t)fData[opValue];
b75a7d8f
A
5619 U_ASSERT(fStack->size() > newStackSize);
5620 fStack->setSize(newStackSize);
57a6839d
A
5621
5622 // FAIL, which will take control back to someplace
b75a7d8f 5623 // prior to entering the look-behind test.
46f4442e 5624 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
5625 }
5626 break;
57a6839d
A
5627
5628
b75a7d8f
A
5629 case URX_LOOP_SR_I:
5630 // Loop Initialization for the optimized implementation of
5631 // [some character set]*
5632 // This op scans through all matching input.
5633 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5634 {
5635 U_ASSERT(opValue > 0 && opValue < sets->size());
5636 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5637 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
57a6839d 5638
b75a7d8f
A
5639 // Loop through input, until either the input is exhausted or
5640 // we reach a character that is not a member of the set.
729e4ab9 5641 int32_t ix = (int32_t)fp->fInputIdx;
b75a7d8f 5642 for (;;) {
46f4442e
A
5643 if (ix >= fActiveLimit) {
5644 fHitEnd = TRUE;
b75a7d8f
A
5645 break;
5646 }
5647 UChar32 c;
46f4442e 5648 U16_NEXT(inputBuf, ix, fActiveLimit, c);
b75a7d8f
A
5649 if (c<256) {
5650 if (s8->contains(c) == FALSE) {
5651 U16_BACK_1(inputBuf, 0, ix);
5652 break;
5653 }
5654 } else {
5655 if (s->contains(c) == FALSE) {
5656 U16_BACK_1(inputBuf, 0, ix);
5657 break;
5658 }
5659 }
5660 }
57a6839d 5661
b75a7d8f
A
5662 // If there were no matching characters, skip over the loop altogether.
5663 // The loop doesn't run at all, a * op always succeeds.
5664 if (ix == fp->fInputIdx) {
5665 fp->fPatIdx++; // skip the URX_LOOP_C op.
5666 break;
5667 }
57a6839d 5668
b75a7d8f
A
5669 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5670 // must follow. Its operand is the stack location
5671 // that holds the starting input index for the match of this [set]*
729e4ab9 5672 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
b75a7d8f
A
5673 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5674 int32_t stackLoc = URX_VAL(loopcOp);
46f4442e 5675 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
b75a7d8f
A
5676 fp->fExtra[stackLoc] = fp->fInputIdx;
5677 fp->fInputIdx = ix;
57a6839d 5678
b75a7d8f
A
5679 // Save State to the URX_LOOP_C op that follows this one,
5680 // so that match failures in the following code will return to there.
5681 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
46f4442e 5682 fp = StateSave(fp, fp->fPatIdx, status);
b75a7d8f
A
5683 fp->fPatIdx++;
5684 }
5685 break;
57a6839d
A
5686
5687
b75a7d8f
A
5688 case URX_LOOP_DOT_I:
5689 // Loop Initialization for the optimized implementation of .*
5690 // This op scans through all remaining input.
5691 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5692 {
5693 // Loop through input until the input is exhausted (we reach an end-of-line)
46f4442e 5694 // In DOTALL mode, we can just go straight to the end of the input.
374ca955 5695 int32_t ix;
46f4442e
A
5696 if ((opValue & 1) == 1) {
5697 // Dot-matches-All mode. Jump straight to the end of the string.
729e4ab9 5698 ix = (int32_t)fActiveLimit;
46f4442e 5699 fHitEnd = TRUE;
374ca955 5700 } else {
46f4442e 5701 // NOT DOT ALL mode. Line endings do not match '.'
b75a7d8f 5702 // Scan forward until a line ending or end of input.
729e4ab9 5703 ix = (int32_t)fp->fInputIdx;
b75a7d8f 5704 for (;;) {
46f4442e
A
5705 if (ix >= fActiveLimit) {
5706 fHitEnd = TRUE;
b75a7d8f
A
5707 break;
5708 }
5709 UChar32 c;
46f4442e 5710 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++]
729e4ab9
A
5711 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
5712 if ((c == 0x0a) || // 0x0a is newline in both modes.
5713 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
b331163b 5714 isLineTerminator(c))) {
46f4442e
A
5715 // char is a line ending. Put the input pos back to the
5716 // line ending char, and exit the scanning loop.
5717 U16_BACK_1(inputBuf, 0, ix);
5718 break;
5719 }
b75a7d8f
A
5720 }
5721 }
5722 }
57a6839d 5723
b75a7d8f
A
5724 // If there were no matching characters, skip over the loop altogether.
5725 // The loop doesn't run at all, a * op always succeeds.
5726 if (ix == fp->fInputIdx) {
5727 fp->fPatIdx++; // skip the URX_LOOP_C op.
5728 break;
5729 }
57a6839d 5730
b75a7d8f
A
5731 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5732 // must follow. Its operand is the stack location
46f4442e 5733 // that holds the starting input index for the match of this .*
729e4ab9 5734 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
b75a7d8f
A
5735 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5736 int32_t stackLoc = URX_VAL(loopcOp);
46f4442e 5737 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
b75a7d8f
A
5738 fp->fExtra[stackLoc] = fp->fInputIdx;
5739 fp->fInputIdx = ix;
57a6839d 5740
b75a7d8f
A
5741 // Save State to the URX_LOOP_C op that follows this one,
5742 // so that match failures in the following code will return to there.
5743 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
46f4442e 5744 fp = StateSave(fp, fp->fPatIdx, status);
b75a7d8f
A
5745 fp->fPatIdx++;
5746 }
5747 break;
57a6839d
A
5748
5749
b75a7d8f
A
5750 case URX_LOOP_C:
5751 {
46f4442e 5752 U_ASSERT(opValue>=0 && opValue<fFrameSize);
729e4ab9
A
5753 backSearchIndex = (int32_t)fp->fExtra[opValue];
5754 U_ASSERT(backSearchIndex <= fp->fInputIdx);
5755 if (backSearchIndex == fp->fInputIdx) {
b75a7d8f 5756 // We've backed up the input idx to the point that the loop started.
57a6839d 5757 // The loop is done. Leave here without saving state.
b75a7d8f
A
5758 // Subsequent failures won't come back here.
5759 break;
5760 }
5761 // Set up for the next iteration of the loop, with input index
5762 // backed up by one from the last time through,
5763 // and a state save to this instruction in case the following code fails again.
5764 // (We're going backwards because this loop emulates stack unwinding, not
5765 // the initial scan forward.)
5766 U_ASSERT(fp->fInputIdx > 0);
729e4ab9
A
5767 UChar32 prevC;
5768 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
57a6839d
A
5769
5770 if (prevC == 0x0a &&
729e4ab9 5771 fp->fInputIdx > backSearchIndex &&
b75a7d8f 5772 inputBuf[fp->fInputIdx-1] == 0x0d) {
729e4ab9 5773 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
b75a7d8f
A
5774 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
5775 // .*, stepping back over CRLF pair.
729e4ab9 5776 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
b75a7d8f
A
5777 }
5778 }
57a6839d
A
5779
5780
46f4442e 5781 fp = StateSave(fp, fp->fPatIdx-1, status);
b75a7d8f
A
5782 }
5783 break;
57a6839d
A
5784
5785
5786
b75a7d8f
A
5787 default:
5788 // Trouble. The compiled pattern contains an entry with an
5789 // unrecognized type tag.
5790 U_ASSERT(FALSE);
5791 }
57a6839d 5792
b75a7d8f 5793 if (U_FAILURE(status)) {
46f4442e 5794 isMatch = FALSE;
b75a7d8f
A
5795 break;
5796 }
5797 }
57a6839d 5798
b75a7d8f
A
5799breakFromLoop:
5800 fMatch = isMatch;
5801 if (isMatch) {
5802 fLastMatchEnd = fMatchEnd;
5803 fMatchStart = startIdx;
5804 fMatchEnd = fp->fInputIdx;
b75a7d8f 5805 }
57a6839d
A
5806
5807#ifdef REGEX_RUN_DEBUG
5808 if (fTraceDebug) {
5809 if (isMatch) {
5810 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
5811 } else {
5812 printf("No match\n\n");
b75a7d8f
A
5813 }
5814 }
57a6839d
A
5815#endif
5816
b75a7d8f 5817 fFrame = fp; // The active stack frame when the engine stopped.
57a6839d
A
5818 // Contains the capture group results that we need to
5819 // access later.
b75a7d8f
A
5820
5821 return;
5822}
5823
5824
374ca955 5825UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
b75a7d8f
A
5826
5827U_NAMESPACE_END
5828
5829#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
0f5d89e8 5830