]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/rematch.cpp
ICU-66108.tar.gz
[apple/icu.git] / icuSources / i18n / rematch.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f
A
3/*
4**************************************************************************
2ca993e8
A
5* Copyright (C) 2002-2016 International Business Machines Corporation
6* and others. All rights reserved.
b75a7d8f
A
7**************************************************************************
8*/
46f4442e
A
9//
10// file: rematch.cpp
11//
12// Contains the implementation of class RegexMatcher,
13// which is one of the main API classes for the ICU regular expression package.
14//
b75a7d8f
A
15
16#include "unicode/utypes.h"
17#if !UCONFIG_NO_REGULAR_EXPRESSIONS
18
19#include "unicode/regex.h"
20#include "unicode/uniset.h"
21#include "unicode/uchar.h"
22#include "unicode/ustring.h"
374ca955 23#include "unicode/rbbi.h"
4388f060
A
24#include "unicode/utf.h"
25#include "unicode/utf16.h"
b75a7d8f
A
26#include "uassert.h"
27#include "cmemory.h"
2ca993e8 28#include "cstr.h"
b75a7d8f
A
29#include "uvector.h"
30#include "uvectr32.h"
729e4ab9 31#include "uvectr64.h"
b75a7d8f
A
32#include "regeximp.h"
33#include "regexst.h"
729e4ab9
A
34#include "regextxt.h"
35#include "ucase.h"
b75a7d8f
A
36
37// #include <malloc.h> // Needed for heapcheck testing
38
2ca993e8 39
b75a7d8f
A
40U_NAMESPACE_BEGIN
41
46f4442e
A
42// Default limit for the size of the back track stack, to avoid system
43// failures caused by heap exhaustion. Units are in 32 bit words, not bytes.
44// This value puts ICU's limits higher than most other regexp implementations,
45// which use recursion rather than the heap, and take more storage per
46// backtrack point.
47//
48static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
49
50// Time limit counter constant.
51// Time limits for expression evaluation are in terms of quanta of work by
52// the engine, each of which is 10,000 state saves.
53// This constant determines the number of state saves per tick.
54static const int32_t TIMER_INITIAL_VALUE = 10000;
55
b331163b
A
56
57// Test for any of the Unicode line terminating characters.
58static inline UBool isLineTerminator(UChar32 c) {
59 if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
60 return false;
61 }
62 return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029;
63}
64
b75a7d8f
A
65//-----------------------------------------------------------------------------
66//
67// Constructor and Destructor
68//
69//-----------------------------------------------------------------------------
57a6839d 70RegexMatcher::RegexMatcher(const RegexPattern *pat) {
46f4442e
A
71 fDeferredStatus = U_ZERO_ERROR;
72 init(fDeferredStatus);
73 if (U_FAILURE(fDeferredStatus)) {
74 return;
75 }
b75a7d8f
A
76 if (pat==NULL) {
77 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
78 return;
79 }
46f4442e 80 fPattern = pat;
729e4ab9 81 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
b75a7d8f
A
82}
83
84
85
86RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
87 uint32_t flags, UErrorCode &status) {
46f4442e 88 init(status);
b75a7d8f
A
89 if (U_FAILURE(status)) {
90 return;
91 }
46f4442e
A
92 UParseError pe;
93 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
729e4ab9 94 fPattern = fPatternOwned;
57a6839d 95
729e4ab9
A
96 UText inputText = UTEXT_INITIALIZER;
97 utext_openConstUnicodeString(&inputText, &input, &status);
98 init2(&inputText, status);
99 utext_close(&inputText);
100
57a6839d 101 fInputUniStrMaybeMutable = TRUE;
729e4ab9
A
102}
103
104
105RegexMatcher::RegexMatcher(UText *regexp, UText *input,
106 uint32_t flags, UErrorCode &status) {
107 init(status);
108 if (U_FAILURE(status)) {
109 return;
110 }
111 UParseError pe;
112 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
113 if (U_FAILURE(status)) {
114 return;
115 }
116
46f4442e
A
117 fPattern = fPatternOwned;
118 init2(input, status);
b75a7d8f
A
119}
120
121
57a6839d 122RegexMatcher::RegexMatcher(const UnicodeString &regexp,
b75a7d8f 123 uint32_t flags, UErrorCode &status) {
46f4442e 124 init(status);
b75a7d8f
A
125 if (U_FAILURE(status)) {
126 return;
127 }
46f4442e
A
128 UParseError pe;
129 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
729e4ab9
A
130 if (U_FAILURE(status)) {
131 return;
132 }
133 fPattern = fPatternOwned;
134 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
135}
136
57a6839d 137RegexMatcher::RegexMatcher(UText *regexp,
729e4ab9
A
138 uint32_t flags, UErrorCode &status) {
139 init(status);
140 if (U_FAILURE(status)) {
141 return;
142 }
143 UParseError pe;
144 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
145 if (U_FAILURE(status)) {
146 return;
147 }
148
46f4442e 149 fPattern = fPatternOwned;
729e4ab9 150 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
b75a7d8f
A
151}
152
153
154
46f4442e 155
b75a7d8f
A
156RegexMatcher::~RegexMatcher() {
157 delete fStack;
158 if (fData != fSmallData) {
374ca955 159 uprv_free(fData);
b75a7d8f
A
160 fData = NULL;
161 }
162 if (fPatternOwned) {
163 delete fPatternOwned;
164 fPatternOwned = NULL;
165 fPattern = NULL;
166 }
57a6839d 167
729e4ab9
A
168 if (fInput) {
169 delete fInput;
170 }
171 if (fInputText) {
172 utext_close(fInputText);
173 }
174 if (fAltInputText) {
175 utext_close(fAltInputText);
176 }
57a6839d 177
374ca955
A
178 #if UCONFIG_NO_BREAK_ITERATION==0
179 delete fWordBreakItr;
180 #endif
b75a7d8f
A
181}
182
46f4442e
A
183//
184// init() common initialization for use by all constructors.
185// Initialize all fields, get the object into a consistent state.
186// This must be done even when the initial status shows an error,
187// so that the object is initialized sufficiently well for the destructor
188// to run safely.
189//
190void RegexMatcher::init(UErrorCode &status) {
191 fPattern = NULL;
192 fPatternOwned = NULL;
46f4442e
A
193 fFrameSize = 0;
194 fRegionStart = 0;
195 fRegionLimit = 0;
196 fAnchorStart = 0;
197 fAnchorLimit = 0;
198 fLookStart = 0;
199 fLookLimit = 0;
200 fActiveStart = 0;
201 fActiveLimit = 0;
202 fTransparentBounds = FALSE;
203 fAnchoringBounds = TRUE;
204 fMatch = FALSE;
205 fMatchStart = 0;
206 fMatchEnd = 0;
207 fLastMatchEnd = -1;
208 fAppendPosition = 0;
209 fHitEnd = FALSE;
210 fRequireEnd = FALSE;
211 fStack = NULL;
212 fFrame = NULL;
213 fTimeLimit = 0;
214 fTime = 0;
215 fTickCounter = 0;
216 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY;
217 fCallbackFn = NULL;
218 fCallbackContext = NULL;
729e4ab9
A
219 fFindProgressCallbackFn = NULL;
220 fFindProgressCallbackContext = NULL;
46f4442e
A
221 fTraceDebug = FALSE;
222 fDeferredStatus = status;
223 fData = fSmallData;
224 fWordBreakItr = NULL;
57a6839d 225
4388f060 226 fStack = NULL;
729e4ab9
A
227 fInputText = NULL;
228 fAltInputText = NULL;
229 fInput = NULL;
230 fInputLength = 0;
231 fInputUniStrMaybeMutable = FALSE;
46f4442e
A
232}
233
234//
235// init2() Common initialization for use by RegexMatcher constructors, part 2.
236// This handles the common setup to be done after the Pattern is available.
237//
729e4ab9 238void RegexMatcher::init2(UText *input, UErrorCode &status) {
46f4442e
A
239 if (U_FAILURE(status)) {
240 fDeferredStatus = status;
241 return;
242 }
243
2ca993e8 244 if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) {
57a6839d 245 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
46f4442e
A
246 if (fData == NULL) {
247 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
248 return;
249 }
250 }
251
4388f060
A
252 fStack = new UVector64(status);
253 if (fStack == NULL) {
254 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
255 return;
256 }
257
46f4442e
A
258 reset(input);
259 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
260 if (U_FAILURE(status)) {
261 fDeferredStatus = status;
262 return;
263 }
264}
b75a7d8f
A
265
266
267static const UChar BACKSLASH = 0x5c;
268static const UChar DOLLARSIGN = 0x24;
b331163b
A
269static const UChar LEFTBRACKET = 0x7b;
270static const UChar RIGHTBRACKET = 0x7d;
271
b75a7d8f
A
272//--------------------------------------------------------------------------------
273//
274// appendReplacement
275//
276//--------------------------------------------------------------------------------
277RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
278 const UnicodeString &replacement,
279 UErrorCode &status) {
729e4ab9 280 UText replacementText = UTEXT_INITIALIZER;
57a6839d 281
729e4ab9 282 utext_openConstUnicodeString(&replacementText, &replacement, &status);
57a6839d 283 if (U_SUCCESS(status)) {
729e4ab9
A
284 UText resultText = UTEXT_INITIALIZER;
285 utext_openUnicodeString(&resultText, &dest, &status);
57a6839d 286
729e4ab9
A
287 if (U_SUCCESS(status)) {
288 appendReplacement(&resultText, &replacementText, status);
289 utext_close(&resultText);
290 }
291 utext_close(&replacementText);
292 }
57a6839d 293
729e4ab9
A
294 return *this;
295}
296
297//
298// appendReplacement, UText mode
299//
300RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
301 UText *replacement,
302 UErrorCode &status) {
b75a7d8f
A
303 if (U_FAILURE(status)) {
304 return *this;
305 }
306 if (U_FAILURE(fDeferredStatus)) {
307 status = fDeferredStatus;
308 return *this;
309 }
310 if (fMatch == FALSE) {
311 status = U_REGEX_INVALID_STATE;
312 return *this;
313 }
57a6839d 314
b75a7d8f 315 // Copy input string from the end of previous match to start of current match
729e4ab9
A
316 int64_t destLen = utext_nativeLength(dest);
317 if (fMatchStart > fAppendPosition) {
318 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
57a6839d 319 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
729e4ab9
A
320 (int32_t)(fMatchStart-fAppendPosition), &status);
321 } else {
322 int32_t len16;
323 if (UTEXT_USES_U16(fInputText)) {
324 len16 = (int32_t)(fMatchStart-fAppendPosition);
325 } else {
326 UErrorCode lengthStatus = U_ZERO_ERROR;
327 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
328 }
329 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
330 if (inputChars == NULL) {
331 status = U_MEMORY_ALLOCATION_ERROR;
332 return *this;
333 }
334 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
335 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
336 uprv_free(inputChars);
337 }
b75a7d8f 338 }
46f4442e 339 fAppendPosition = fMatchEnd;
57a6839d
A
340
341
b75a7d8f
A
342 // scan the replacement text, looking for substitutions ($n) and \escapes.
343 // TODO: optimize this loop by efficiently scanning for '$' or '\',
344 // move entire ranges not containing substitutions.
729e4ab9 345 UTEXT_SETNATIVEINDEX(replacement, 0);
b331163b 346 for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL; c = UTEXT_NEXT32(replacement)) {
b75a7d8f
A
347 if (c == BACKSLASH) {
348 // Backslash Escape. Copy the following char out without further checks.
349 // Note: Surrogate pairs don't need any special handling
350 // The second half wont be a '$' or a '\', and
351 // will move to the dest normally on the next
352 // loop iteration.
729e4ab9
A
353 c = UTEXT_CURRENT32(replacement);
354 if (c == U_SENTINEL) {
b75a7d8f
A
355 break;
356 }
57a6839d 357
b75a7d8f
A
358 if (c==0x55/*U*/ || c==0x75/*u*/) {
359 // We have a \udddd or \Udddddddd escape sequence.
729e4ab9
A
360 int32_t offset = 0;
361 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
362 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
b75a7d8f 363 if (escapedChar != (UChar32)0xFFFFFFFF) {
729e4ab9
A
364 if (U_IS_BMP(escapedChar)) {
365 UChar c16 = (UChar)escapedChar;
366 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
367 } else {
368 UChar surrogate[2];
369 surrogate[0] = U16_LEAD(escapedChar);
370 surrogate[1] = U16_TRAIL(escapedChar);
371 if (U_SUCCESS(status)) {
372 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
373 }
374 }
b75a7d8f
A
375 // TODO: Report errors for mal-formed \u escapes?
376 // As this is, the original sequence is output, which may be OK.
729e4ab9 377 if (context.lastOffset == offset) {
4388f060 378 (void)UTEXT_PREVIOUS32(replacement);
729e4ab9
A
379 } else if (context.lastOffset != offset-1) {
380 utext_moveIndex32(replacement, offset - context.lastOffset - 1);
381 }
382 }
383 } else {
4388f060 384 (void)UTEXT_NEXT32(replacement);
729e4ab9
A
385 // Plain backslash escape. Just put out the escaped character.
386 if (U_IS_BMP(c)) {
387 UChar c16 = (UChar)c;
388 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
389 } else {
390 UChar surrogate[2];
391 surrogate[0] = U16_LEAD(c);
392 surrogate[1] = U16_TRAIL(c);
393 if (U_SUCCESS(status)) {
394 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
395 }
b75a7d8f
A
396 }
397 }
729e4ab9 398 } else if (c != DOLLARSIGN) {
b75a7d8f 399 // Normal char, not a $. Copy it out without further checks.
729e4ab9
A
400 if (U_IS_BMP(c)) {
401 UChar c16 = (UChar)c;
402 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
403 } else {
404 UChar surrogate[2];
405 surrogate[0] = U16_LEAD(c);
406 surrogate[1] = U16_TRAIL(c);
407 if (U_SUCCESS(status)) {
408 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
409 }
b75a7d8f 410 }
729e4ab9 411 } else {
b331163b
A
412 // We've got a $. Pick up a capture group name or number if one follows.
413 // Consume digits so long as the resulting group number <= the number of
414 // number of capture groups in the pattern.
57a6839d 415
729e4ab9 416 int32_t groupNum = 0;
b331163b
A
417 int32_t numDigits = 0;
418 UChar32 nextChar = utext_current32(replacement);
419 if (nextChar == LEFTBRACKET) {
420 // Scan for a Named Capture Group, ${name}.
421 UnicodeString groupName;
422 utext_next32(replacement);
423 while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
424 nextChar = utext_next32(replacement);
425 if (nextChar == U_SENTINEL) {
426 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
427 } else if ((nextChar >= 0x41 && nextChar <= 0x5a) || // A..Z
428 (nextChar >= 0x61 && nextChar <= 0x7a) || // a..z
429 (nextChar >= 0x31 && nextChar <= 0x39)) { // 0..9
430 groupName.append(nextChar);
431 } else if (nextChar == RIGHTBRACKET) {
340931cb 432 groupNum = fPattern->fNamedCaptureMap ? uhash_geti(fPattern->fNamedCaptureMap, &groupName) : 0;
b331163b
A
433 if (groupNum == 0) {
434 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
435 }
436 } else {
437 // Character was something other than a name char or a closing '}'
438 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
439 }
729e4ab9 440 }
0f5d89e8 441
b331163b
A
442 } else if (u_isdigit(nextChar)) {
443 // $n Scan for a capture group number
444 int32_t numCaptureGroups = fPattern->fGroupMap->size();
445 for (;;) {
446 nextChar = UTEXT_CURRENT32(replacement);
447 if (nextChar == U_SENTINEL) {
448 break;
449 }
450 if (u_isdigit(nextChar) == FALSE) {
451 break;
452 }
453 int32_t nextDigitVal = u_charDigitValue(nextChar);
454 if (groupNum*10 + nextDigitVal > numCaptureGroups) {
455 // Don't consume the next digit if it makes the capture group number too big.
456 if (numDigits == 0) {
457 status = U_INDEX_OUTOFBOUNDS_ERROR;
458 }
459 break;
460 }
461 (void)UTEXT_NEXT32(replacement);
0f5d89e8 462 groupNum=groupNum*10 + nextDigitVal;
b331163b 463 ++numDigits;
729e4ab9 464 }
b331163b
A
465 } else {
466 // $ not followed by capture group name or number.
467 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
b75a7d8f 468 }
57a6839d 469
b331163b 470 if (U_SUCCESS(status)) {
729e4ab9 471 destLen += appendGroup(groupNum, dest, status);
b75a7d8f 472 }
b331163b
A
473 } // End of $ capture group handling
474 } // End of per-character loop through the replacement string.
57a6839d 475
b75a7d8f
A
476 return *this;
477}
478
479
480
481//--------------------------------------------------------------------------------
482//
483// appendTail Intended to be used in conjunction with appendReplacement()
484// To the destination string, append everything following
485// the last match position from the input string.
486//
46f4442e
A
487// Note: Match ranges do not affect appendTail or appendReplacement
488//
b75a7d8f
A
489//--------------------------------------------------------------------------------
490UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
729e4ab9
A
491 UErrorCode status = U_ZERO_ERROR;
492 UText resultText = UTEXT_INITIALIZER;
493 utext_openUnicodeString(&resultText, &dest, &status);
57a6839d 494
729e4ab9
A
495 if (U_SUCCESS(status)) {
496 appendTail(&resultText, status);
497 utext_close(&resultText);
498 }
57a6839d 499
729e4ab9
A
500 return dest;
501}
502
503//
504// appendTail, UText mode
505//
506UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
729e4ab9 507 if (U_FAILURE(status)) {
57a6839d 508 return dest;
729e4ab9
A
509 }
510 if (U_FAILURE(fDeferredStatus)) {
511 status = fDeferredStatus;
57a6839d 512 return dest;
729e4ab9 513 }
57a6839d 514
729e4ab9
A
515 if (fInputLength > fAppendPosition) {
516 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
517 int64_t destLen = utext_nativeLength(dest);
57a6839d 518 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
729e4ab9
A
519 (int32_t)(fInputLength-fAppendPosition), &status);
520 } else {
521 int32_t len16;
522 if (UTEXT_USES_U16(fInputText)) {
523 len16 = (int32_t)(fInputLength-fAppendPosition);
524 } else {
525 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
526 status = U_ZERO_ERROR; // buffer overflow
527 }
57a6839d 528
729e4ab9
A
529 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
530 if (inputChars == NULL) {
531 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
532 } else {
57a6839d 533 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
729e4ab9
A
534 int64_t destLen = utext_nativeLength(dest);
535 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
536 uprv_free(inputChars);
537 }
538 }
b75a7d8f
A
539 }
540 return dest;
541}
542
543
544
545//--------------------------------------------------------------------------------
546//
547// end
548//
549//--------------------------------------------------------------------------------
550int32_t RegexMatcher::end(UErrorCode &err) const {
551 return end(0, err);
552}
553
729e4ab9
A
554int64_t RegexMatcher::end64(UErrorCode &err) const {
555 return end64(0, err);
556}
b75a7d8f 557
729e4ab9 558int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
b75a7d8f
A
559 if (U_FAILURE(err)) {
560 return -1;
561 }
562 if (fMatch == FALSE) {
563 err = U_REGEX_INVALID_STATE;
564 return -1;
565 }
566 if (group < 0 || group > fPattern->fGroupMap->size()) {
567 err = U_INDEX_OUTOFBOUNDS_ERROR;
568 return -1;
569 }
729e4ab9 570 int64_t e = -1;
b75a7d8f 571 if (group == 0) {
57a6839d 572 e = fMatchEnd;
b75a7d8f
A
573 } else {
574 // Get the position within the stack frame of the variables for
575 // this capture group.
576 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
577 U_ASSERT(groupOffset < fPattern->fFrameSize);
578 U_ASSERT(groupOffset >= 0);
579 e = fFrame->fExtra[groupOffset + 1];
580 }
57a6839d 581
729e4ab9 582 return e;
b75a7d8f
A
583}
584
729e4ab9
A
585int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
586 return (int32_t)end64(group, err);
587}
b75a7d8f 588
b331163b
A
589//--------------------------------------------------------------------------------
590//
591// findProgressInterrupt This function is called once for each advance in the target
592// string from the find() function, and calls the user progress callback
593// function if there is one installed.
594//
595// Return: TRUE if the find operation is to be terminated.
596// FALSE if the find operation is to continue running.
597//
598//--------------------------------------------------------------------------------
599UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) {
600 if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) {
601 status = U_REGEX_STOPPED_BY_CALLER;
602 return TRUE;
603 }
604 return FALSE;
605}
b75a7d8f
A
606
607//--------------------------------------------------------------------------------
608//
609// find()
610//
611//--------------------------------------------------------------------------------
612UBool RegexMatcher::find() {
b331163b
A
613 if (U_FAILURE(fDeferredStatus)) {
614 return FALSE;
615 }
616 UErrorCode status = U_ZERO_ERROR;
617 UBool result = find(status);
618 return result;
619}
620
621//--------------------------------------------------------------------------------
622//
623// find()
624//
625//--------------------------------------------------------------------------------
626UBool RegexMatcher::find(UErrorCode &status) {
b75a7d8f 627 // Start at the position of the last match end. (Will be zero if the
729e4ab9 628 // matcher has been reset.)
b75a7d8f 629 //
b331163b
A
630 if (U_FAILURE(status)) {
631 return FALSE;
632 }
b75a7d8f 633 if (U_FAILURE(fDeferredStatus)) {
b331163b 634 status = fDeferredStatus;
b75a7d8f
A
635 return FALSE;
636 }
57a6839d 637
729e4ab9 638 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
b331163b 639 return findUsingChunk(status);
729e4ab9 640 }
b75a7d8f 641
729e4ab9 642 int64_t startPos = fMatchEnd;
46f4442e
A
643 if (startPos==0) {
644 startPos = fActiveStart;
645 }
374ca955
A
646
647 if (fMatch) {
648 // Save the position of any previous successful match.
649 fLastMatchEnd = fMatchEnd;
650
651 if (fMatchStart == fMatchEnd) {
652 // Previous match had zero length. Move start position up one position
653 // to avoid sending find() into a loop on zero-length matches.
46f4442e 654 if (startPos >= fActiveLimit) {
374ca955 655 fMatch = FALSE;
46f4442e 656 fHitEnd = TRUE;
374ca955
A
657 return FALSE;
658 }
729e4ab9 659 UTEXT_SETNATIVEINDEX(fInputText, startPos);
4388f060 660 (void)UTEXT_NEXT32(fInputText);
729e4ab9 661 startPos = UTEXT_GETNATIVEINDEX(fInputText);
374ca955
A
662 }
663 } else {
664 if (fLastMatchEnd >= 0) {
665 // A previous find() failed to match. Don't try again.
666 // (without this test, a pattern with a zero-length match
667 // could match again at the end of an input string.)
46f4442e 668 fHitEnd = TRUE;
374ca955
A
669 return FALSE;
670 }
671 }
672
374ca955
A
673
674 // Compute the position in the input string beyond which a match can not begin, because
675 // the minimum length match would extend past the end of the input.
46f4442e
A
676 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
677 // Be aware of possible overflows if making changes here.
729e4ab9
A
678 int64_t testStartLimit;
679 if (UTEXT_USES_U16(fInputText)) {
680 testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
681 if (startPos > testStartLimit) {
682 fMatch = FALSE;
683 fHitEnd = TRUE;
684 return FALSE;
685 }
686 } else {
b331163b
A
687 // We don't know exactly how long the minimum match length is in native characters.
688 // Treat anything > 0 as 1.
689 testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
b75a7d8f
A
690 }
691
b75a7d8f
A
692 UChar32 c;
693 U_ASSERT(startPos >= 0);
694
695 switch (fPattern->fStartType) {
696 case START_NO_INFO:
57a6839d 697 // No optimization was found.
b75a7d8f
A
698 // Try a match at each input position.
699 for (;;) {
b331163b
A
700 MatchAt(startPos, FALSE, status);
701 if (U_FAILURE(status)) {
b75a7d8f
A
702 return FALSE;
703 }
704 if (fMatch) {
705 return TRUE;
706 }
729e4ab9 707 if (startPos >= testStartLimit) {
46f4442e 708 fHitEnd = TRUE;
b75a7d8f
A
709 return FALSE;
710 }
729e4ab9 711 UTEXT_SETNATIVEINDEX(fInputText, startPos);
4388f060 712 (void)UTEXT_NEXT32(fInputText);
729e4ab9 713 startPos = UTEXT_GETNATIVEINDEX(fInputText);
b75a7d8f
A
714 // Note that it's perfectly OK for a pattern to have a zero-length
715 // match at the end of a string, so we must make sure that the loop
729e4ab9 716 // runs with startPos == testStartLimit the last time through.
b331163b 717 if (findProgressInterrupt(startPos, status))
729e4ab9 718 return FALSE;
b75a7d8f 719 }
3d1f044b 720 UPRV_UNREACHABLE;
b75a7d8f
A
721
722 case START_START:
723 // Matches are only possible at the start of the input string
724 // (pattern begins with ^ or \A)
46f4442e 725 if (startPos > fActiveStart) {
374ca955 726 fMatch = FALSE;
b75a7d8f
A
727 return FALSE;
728 }
b331163b
A
729 MatchAt(startPos, FALSE, status);
730 if (U_FAILURE(status)) {
b75a7d8f
A
731 return FALSE;
732 }
733 return fMatch;
734
735
736 case START_SET:
737 {
738 // Match may start on any char from a pre-computed set.
739 U_ASSERT(fPattern->fMinMatchLen > 0);
729e4ab9 740 UTEXT_SETNATIVEINDEX(fInputText, startPos);
b75a7d8f 741 for (;;) {
b331163b 742 int64_t pos = startPos;
729e4ab9 743 c = UTEXT_NEXT32(fInputText);
b331163b 744 startPos = UTEXT_GETNATIVEINDEX(fInputText);
729e4ab9
A
745 // c will be -1 (U_SENTINEL) at end of text, in which case we
746 // skip this next block (so we don't have a negative array index)
747 // and handle end of text in the following block.
748 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
749 (c>=256 && fPattern->fInitialChars->contains(c)))) {
b331163b
A
750 MatchAt(pos, FALSE, status);
751 if (U_FAILURE(status)) {
b75a7d8f
A
752 return FALSE;
753 }
754 if (fMatch) {
755 return TRUE;
756 }
729e4ab9 757 UTEXT_SETNATIVEINDEX(fInputText, pos);
b75a7d8f 758 }
b331163b 759 if (startPos > testStartLimit) {
374ca955 760 fMatch = FALSE;
46f4442e 761 fHitEnd = TRUE;
b75a7d8f
A
762 return FALSE;
763 }
b331163b 764 if (findProgressInterrupt(startPos, status))
729e4ab9 765 return FALSE;
b75a7d8f
A
766 }
767 }
3d1f044b 768 UPRV_UNREACHABLE;
b75a7d8f
A
769
770 case START_STRING:
771 case START_CHAR:
772 {
773 // Match starts on exactly one char.
774 U_ASSERT(fPattern->fMinMatchLen > 0);
775 UChar32 theChar = fPattern->fInitialChar;
729e4ab9 776 UTEXT_SETNATIVEINDEX(fInputText, startPos);
b75a7d8f 777 for (;;) {
b331163b 778 int64_t pos = startPos;
729e4ab9 779 c = UTEXT_NEXT32(fInputText);
b331163b 780 startPos = UTEXT_GETNATIVEINDEX(fInputText);
b75a7d8f 781 if (c == theChar) {
b331163b
A
782 MatchAt(pos, FALSE, status);
783 if (U_FAILURE(status)) {
b75a7d8f
A
784 return FALSE;
785 }
786 if (fMatch) {
787 return TRUE;
788 }
2ca993e8 789 UTEXT_SETNATIVEINDEX(fInputText, startPos);
b75a7d8f 790 }
b331163b 791 if (startPos > testStartLimit) {
374ca955 792 fMatch = FALSE;
46f4442e 793 fHitEnd = TRUE;
b75a7d8f
A
794 return FALSE;
795 }
b331163b 796 if (findProgressInterrupt(startPos, status))
729e4ab9
A
797 return FALSE;
798 }
b75a7d8f 799 }
3d1f044b 800 UPRV_UNREACHABLE;
b75a7d8f
A
801
802 case START_LINE:
803 {
3d1f044b 804 UChar32 ch;
46f4442e 805 if (startPos == fAnchorStart) {
b331163b
A
806 MatchAt(startPos, FALSE, status);
807 if (U_FAILURE(status)) {
b75a7d8f
A
808 return FALSE;
809 }
810 if (fMatch) {
811 return TRUE;
812 }
729e4ab9 813 UTEXT_SETNATIVEINDEX(fInputText, startPos);
3d1f044b 814 ch = UTEXT_NEXT32(fInputText);
729e4ab9
A
815 startPos = UTEXT_GETNATIVEINDEX(fInputText);
816 } else {
817 UTEXT_SETNATIVEINDEX(fInputText, startPos);
3d1f044b 818 ch = UTEXT_PREVIOUS32(fInputText);
729e4ab9 819 UTEXT_SETNATIVEINDEX(fInputText, startPos);
b75a7d8f
A
820 }
821
46f4442e 822 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
729e4ab9 823 for (;;) {
3d1f044b 824 if (ch == 0x0a) {
b331163b
A
825 MatchAt(startPos, FALSE, status);
826 if (U_FAILURE(status)) {
46f4442e
A
827 return FALSE;
828 }
829 if (fMatch) {
830 return TRUE;
831 }
729e4ab9 832 UTEXT_SETNATIVEINDEX(fInputText, startPos);
46f4442e 833 }
729e4ab9 834 if (startPos >= testStartLimit) {
46f4442e
A
835 fMatch = FALSE;
836 fHitEnd = TRUE;
837 return FALSE;
838 }
3d1f044b 839 ch = UTEXT_NEXT32(fInputText);
729e4ab9 840 startPos = UTEXT_GETNATIVEINDEX(fInputText);
46f4442e
A
841 // Note that it's perfectly OK for a pattern to have a zero-length
842 // match at the end of a string, so we must make sure that the loop
729e4ab9 843 // runs with startPos == testStartLimit the last time through.
b331163b 844 if (findProgressInterrupt(startPos, status))
729e4ab9 845 return FALSE;
b75a7d8f 846 }
46f4442e
A
847 } else {
848 for (;;) {
3d1f044b
A
849 if (isLineTerminator(ch)) {
850 if (ch == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
b331163b
A
851 (void)UTEXT_NEXT32(fInputText);
852 startPos = UTEXT_GETNATIVEINDEX(fInputText);
853 }
854 MatchAt(startPos, FALSE, status);
855 if (U_FAILURE(status)) {
856 return FALSE;
857 }
858 if (fMatch) {
859 return TRUE;
860 }
861 UTEXT_SETNATIVEINDEX(fInputText, startPos);
46f4442e 862 }
729e4ab9 863 if (startPos >= testStartLimit) {
46f4442e
A
864 fMatch = FALSE;
865 fHitEnd = TRUE;
866 return FALSE;
867 }
3d1f044b 868 ch = UTEXT_NEXT32(fInputText);
729e4ab9 869 startPos = UTEXT_GETNATIVEINDEX(fInputText);
46f4442e
A
870 // Note that it's perfectly OK for a pattern to have a zero-length
871 // match at the end of a string, so we must make sure that the loop
729e4ab9 872 // runs with startPos == testStartLimit the last time through.
b331163b 873 if (findProgressInterrupt(startPos, status))
729e4ab9 874 return FALSE;
b75a7d8f 875 }
b75a7d8f
A
876 }
877 }
878
879 default:
3d1f044b 880 UPRV_UNREACHABLE;
b75a7d8f
A
881 }
882
3d1f044b 883 UPRV_UNREACHABLE;
b75a7d8f
A
884}
885
886
887
729e4ab9 888UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
b75a7d8f
A
889 if (U_FAILURE(status)) {
890 return FALSE;
891 }
892 if (U_FAILURE(fDeferredStatus)) {
893 status = fDeferredStatus;
894 return FALSE;
895 }
46f4442e
A
896 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
897 // This will reset the region to be the full input length.
729e4ab9
A
898 if (start < 0) {
899 status = U_INDEX_OUTOFBOUNDS_ERROR;
900 return FALSE;
901 }
57a6839d 902
729e4ab9
A
903 int64_t nativeStart = start;
904 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
b75a7d8f
A
905 status = U_INDEX_OUTOFBOUNDS_ERROR;
906 return FALSE;
907 }
57a6839d 908 fMatchEnd = nativeStart;
b331163b 909 return find(status);
b75a7d8f
A
910}
911
912
b75a7d8f
A
913//--------------------------------------------------------------------------------
914//
729e4ab9
A
915// findUsingChunk() -- like find(), but with the advance knowledge that the
916// entire string is available in the UText's chunk buffer.
b75a7d8f
A
917//
918//--------------------------------------------------------------------------------
b331163b 919UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
729e4ab9
A
920 // Start at the position of the last match end. (Will be zero if the
921 // matcher has been reset.
922 //
b75a7d8f 923
729e4ab9
A
924 int32_t startPos = (int32_t)fMatchEnd;
925 if (startPos==0) {
926 startPos = (int32_t)fActiveStart;
b75a7d8f 927 }
57a6839d 928
729e4ab9 929 const UChar *inputBuf = fInputText->chunkContents;
b75a7d8f 930
729e4ab9
A
931 if (fMatch) {
932 // Save the position of any previous successful match.
933 fLastMatchEnd = fMatchEnd;
57a6839d 934
729e4ab9
A
935 if (fMatchStart == fMatchEnd) {
936 // Previous match had zero length. Move start position up one position
937 // to avoid sending find() into a loop on zero-length matches.
938 if (startPos >= fActiveLimit) {
939 fMatch = FALSE;
940 fHitEnd = TRUE;
941 return FALSE;
942 }
943 U16_FWD_1(inputBuf, startPos, fInputLength);
944 }
945 } else {
946 if (fLastMatchEnd >= 0) {
947 // A previous find() failed to match. Don't try again.
948 // (without this test, a pattern with a zero-length match
949 // could match again at the end of an input string.)
950 fHitEnd = TRUE;
951 return FALSE;
952 }
b75a7d8f 953 }
57a6839d
A
954
955
729e4ab9
A
956 // Compute the position in the input string beyond which a match can not begin, because
957 // the minimum length match would extend past the end of the input.
958 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
959 // Be aware of possible overflows if making changes here.
b331163b 960 // Note: a match can begin at inputBuf + testLen; it is an inclusive limit.
729e4ab9
A
961 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
962 if (startPos > testLen) {
963 fMatch = FALSE;
964 fHitEnd = TRUE;
b75a7d8f
A
965 return FALSE;
966 }
57a6839d 967
729e4ab9
A
968 UChar32 c;
969 U_ASSERT(startPos >= 0);
57a6839d 970
729e4ab9
A
971 switch (fPattern->fStartType) {
972 case START_NO_INFO:
57a6839d 973 // No optimization was found.
729e4ab9
A
974 // Try a match at each input position.
975 for (;;) {
b331163b
A
976 MatchChunkAt(startPos, FALSE, status);
977 if (U_FAILURE(status)) {
729e4ab9
A
978 return FALSE;
979 }
980 if (fMatch) {
981 return TRUE;
982 }
983 if (startPos >= testLen) {
984 fHitEnd = TRUE;
985 return FALSE;
986 }
987 U16_FWD_1(inputBuf, startPos, fActiveLimit);
988 // Note that it's perfectly OK for a pattern to have a zero-length
989 // match at the end of a string, so we must make sure that the loop
990 // runs with startPos == testLen the last time through.
b331163b 991 if (findProgressInterrupt(startPos, status))
729e4ab9
A
992 return FALSE;
993 }
3d1f044b 994 UPRV_UNREACHABLE;
57a6839d 995
729e4ab9
A
996 case START_START:
997 // Matches are only possible at the start of the input string
998 // (pattern begins with ^ or \A)
999 if (startPos > fActiveStart) {
1000 fMatch = FALSE;
1001 return FALSE;
1002 }
b331163b
A
1003 MatchChunkAt(startPos, FALSE, status);
1004 if (U_FAILURE(status)) {
729e4ab9
A
1005 return FALSE;
1006 }
1007 return fMatch;
57a6839d
A
1008
1009
729e4ab9
A
1010 case START_SET:
1011 {
1012 // Match may start on any char from a pre-computed set.
1013 U_ASSERT(fPattern->fMinMatchLen > 0);
1014 for (;;) {
1015 int32_t pos = startPos;
1016 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1017 if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
1018 (c>=256 && fPattern->fInitialChars->contains(c))) {
b331163b
A
1019 MatchChunkAt(pos, FALSE, status);
1020 if (U_FAILURE(status)) {
729e4ab9
A
1021 return FALSE;
1022 }
1023 if (fMatch) {
1024 return TRUE;
1025 }
1026 }
b331163b 1027 if (startPos > testLen) {
729e4ab9
A
1028 fMatch = FALSE;
1029 fHitEnd = TRUE;
1030 return FALSE;
1031 }
b331163b 1032 if (findProgressInterrupt(startPos, status))
729e4ab9
A
1033 return FALSE;
1034 }
b75a7d8f 1035 }
3d1f044b 1036 UPRV_UNREACHABLE;
57a6839d 1037
729e4ab9
A
1038 case START_STRING:
1039 case START_CHAR:
1040 {
1041 // Match starts on exactly one char.
1042 U_ASSERT(fPattern->fMinMatchLen > 0);
1043 UChar32 theChar = fPattern->fInitialChar;
1044 for (;;) {
1045 int32_t pos = startPos;
1046 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1047 if (c == theChar) {
b331163b
A
1048 MatchChunkAt(pos, FALSE, status);
1049 if (U_FAILURE(status)) {
729e4ab9
A
1050 return FALSE;
1051 }
1052 if (fMatch) {
1053 return TRUE;
1054 }
1055 }
b331163b 1056 if (startPos > testLen) {
729e4ab9
A
1057 fMatch = FALSE;
1058 fHitEnd = TRUE;
1059 return FALSE;
1060 }
b331163b 1061 if (findProgressInterrupt(startPos, status))
729e4ab9
A
1062 return FALSE;
1063 }
1064 }
3d1f044b 1065 UPRV_UNREACHABLE;
57a6839d 1066
729e4ab9
A
1067 case START_LINE:
1068 {
3d1f044b 1069 UChar32 ch;
729e4ab9 1070 if (startPos == fAnchorStart) {
b331163b
A
1071 MatchChunkAt(startPos, FALSE, status);
1072 if (U_FAILURE(status)) {
729e4ab9
A
1073 return FALSE;
1074 }
1075 if (fMatch) {
1076 return TRUE;
1077 }
f3c0d7a5
A
1078 // In bug 31063104 which has a zero-length text buffer we get here with
1079 // inputBuf=NULL, startPos=fActiveLimit=0 (and fMatch F) which violates the
1080 // requirement for U16_FWD_1 (utf16.h) that startPos < fActiveLimit. Having
1081 // inputBuf=NULL (chunkContexts NULL) is probably due to an error in the
1082 // CFStringUText functions. Nevertheless, to be defensive, add test below.
1083 if (startPos >= testLen) {
1084 fHitEnd = TRUE;
1085 return FALSE;
1086 }
729e4ab9
A
1087 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1088 }
57a6839d 1089
729e4ab9
A
1090 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1091 for (;;) {
3d1f044b
A
1092 ch = inputBuf[startPos-1];
1093 if (ch == 0x0a) {
b331163b
A
1094 MatchChunkAt(startPos, FALSE, status);
1095 if (U_FAILURE(status)) {
729e4ab9
A
1096 return FALSE;
1097 }
1098 if (fMatch) {
1099 return TRUE;
1100 }
1101 }
1102 if (startPos >= testLen) {
1103 fMatch = FALSE;
1104 fHitEnd = TRUE;
1105 return FALSE;
1106 }
1107 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1108 // Note that it's perfectly OK for a pattern to have a zero-length
1109 // match at the end of a string, so we must make sure that the loop
1110 // runs with startPos == testLen the last time through.
b331163b 1111 if (findProgressInterrupt(startPos, status))
729e4ab9
A
1112 return FALSE;
1113 }
1114 } else {
1115 for (;;) {
3d1f044b
A
1116 ch = inputBuf[startPos-1];
1117 if (isLineTerminator(ch)) {
1118 if (ch == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
729e4ab9
A
1119 startPos++;
1120 }
b331163b
A
1121 MatchChunkAt(startPos, FALSE, status);
1122 if (U_FAILURE(status)) {
729e4ab9
A
1123 return FALSE;
1124 }
1125 if (fMatch) {
1126 return TRUE;
1127 }
1128 }
1129 if (startPos >= testLen) {
1130 fMatch = FALSE;
1131 fHitEnd = TRUE;
1132 return FALSE;
1133 }
1134 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1135 // Note that it's perfectly OK for a pattern to have a zero-length
1136 // match at the end of a string, so we must make sure that the loop
1137 // runs with startPos == testLen the last time through.
b331163b 1138 if (findProgressInterrupt(startPos, status))
729e4ab9
A
1139 return FALSE;
1140 }
1141 }
1142 }
57a6839d 1143
729e4ab9 1144 default:
3d1f044b 1145 UPRV_UNREACHABLE;
729e4ab9 1146 }
57a6839d 1147
3d1f044b 1148 UPRV_UNREACHABLE;
729e4ab9
A
1149}
1150
1151
1152
1153//--------------------------------------------------------------------------------
1154//
1155// group()
1156//
1157//--------------------------------------------------------------------------------
1158UnicodeString RegexMatcher::group(UErrorCode &status) const {
1159 return group(0, status);
b75a7d8f
A
1160}
1161
729e4ab9
A
1162// Return immutable shallow clone
1163UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1164 return group(0, dest, group_len, status);
1165}
b75a7d8f 1166
729e4ab9
A
1167// Return immutable shallow clone
1168UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1169 group_len = 0;
374ca955 1170 if (U_FAILURE(status)) {
729e4ab9 1171 return dest;
374ca955
A
1172 }
1173 if (U_FAILURE(fDeferredStatus)) {
1174 status = fDeferredStatus;
57a6839d 1175 } else if (fMatch == FALSE) {
729e4ab9 1176 status = U_REGEX_INVALID_STATE;
57a6839d 1177 } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
374ca955 1178 status = U_INDEX_OUTOFBOUNDS_ERROR;
374ca955 1179 }
57a6839d
A
1180
1181 if (U_FAILURE(status)) {
1182 return dest;
729e4ab9 1183 }
57a6839d 1184
729e4ab9
A
1185 int64_t s, e;
1186 if (groupNum == 0) {
1187 s = fMatchStart;
1188 e = fMatchEnd;
1189 } else {
1190 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1191 U_ASSERT(groupOffset < fPattern->fFrameSize);
1192 U_ASSERT(groupOffset >= 0);
1193 s = fFrame->fExtra[groupOffset];
1194 e = fFrame->fExtra[groupOffset+1];
1195 }
1196
1197 if (s < 0) {
1198 // A capture group wasn't part of the match
1199 return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1200 }
1201 U_ASSERT(s <= e);
1202 group_len = e - s;
57a6839d 1203
729e4ab9
A
1204 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1205 if (dest)
1206 UTEXT_SETNATIVEINDEX(dest, s);
1207 return dest;
374ca955
A
1208}
1209
729e4ab9
A
1210UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1211 UnicodeString result;
b331163b
A
1212 int64_t groupStart = start64(groupNum, status);
1213 int64_t groupEnd = end64(groupNum, status);
1214 if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) {
729e4ab9
A
1215 return result;
1216 }
57a6839d 1217
b331163b
A
1218 // Get the group length using a utext_extract preflight.
1219 // UText is actually pretty efficient at this when underlying encoding is UTF-16.
1220 int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &status);
1221 if (status != U_BUFFER_OVERFLOW_ERROR) {
1222 return result;
729e4ab9 1223 }
57a6839d 1224
b331163b
A
1225 status = U_ZERO_ERROR;
1226 UChar *buf = result.getBuffer(length);
1227 if (buf == NULL) {
1228 status = U_MEMORY_ALLOCATION_ERROR;
729e4ab9 1229 } else {
b331163b
A
1230 int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status);
1231 result.releaseBuffer(extractLength);
1232 U_ASSERT(length == extractLength);
729e4ab9 1233 }
b331163b 1234 return result;
b75a7d8f
A
1235}
1236
b331163b 1237
729e4ab9
A
1238//--------------------------------------------------------------------------------
1239//
1240// appendGroup() -- currently internal only, appends a group to a UText rather
1241// than replacing its contents
1242//
1243//--------------------------------------------------------------------------------
b75a7d8f 1244
729e4ab9 1245int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
374ca955 1246 if (U_FAILURE(status)) {
729e4ab9 1247 return 0;
374ca955
A
1248 }
1249 if (U_FAILURE(fDeferredStatus)) {
1250 status = fDeferredStatus;
729e4ab9 1251 return 0;
374ca955 1252 }
729e4ab9 1253 int64_t destLen = utext_nativeLength(dest);
57a6839d 1254
729e4ab9
A
1255 if (fMatch == FALSE) {
1256 status = U_REGEX_INVALID_STATE;
1257 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1258 }
1259 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
374ca955 1260 status = U_INDEX_OUTOFBOUNDS_ERROR;
729e4ab9 1261 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
374ca955 1262 }
57a6839d 1263
729e4ab9
A
1264 int64_t s, e;
1265 if (groupNum == 0) {
1266 s = fMatchStart;
1267 e = fMatchEnd;
1268 } else {
1269 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1270 U_ASSERT(groupOffset < fPattern->fFrameSize);
1271 U_ASSERT(groupOffset >= 0);
1272 s = fFrame->fExtra[groupOffset];
1273 e = fFrame->fExtra[groupOffset+1];
1274 }
57a6839d 1275
729e4ab9 1276 if (s < 0) {
57a6839d 1277 // A capture group wasn't part of the match
729e4ab9
A
1278 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1279 }
1280 U_ASSERT(s <= e);
57a6839d 1281
729e4ab9
A
1282 int64_t deltaLen;
1283 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1284 U_ASSERT(e <= fInputLength);
1285 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1286 } else {
1287 int32_t len16;
1288 if (UTEXT_USES_U16(fInputText)) {
1289 len16 = (int32_t)(e-s);
1290 } else {
1291 UErrorCode lengthStatus = U_ZERO_ERROR;
1292 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1293 }
1294 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1295 if (groupChars == NULL) {
1296 status = U_MEMORY_ALLOCATION_ERROR;
1297 return 0;
1298 }
1299 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
57a6839d 1300
729e4ab9
A
1301 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1302 uprv_free(groupChars);
1303 }
1304 return deltaLen;
374ca955
A
1305}
1306
b75a7d8f
A
1307
1308
46f4442e
A
1309//--------------------------------------------------------------------------------
1310//
729e4ab9 1311// groupCount()
46f4442e
A
1312//
1313//--------------------------------------------------------------------------------
729e4ab9
A
1314int32_t RegexMatcher::groupCount() const {
1315 return fPattern->fGroupMap->size();
b75a7d8f
A
1316}
1317
46f4442e
A
1318//--------------------------------------------------------------------------------
1319//
729e4ab9
A
1320// hasAnchoringBounds()
1321//
1322//--------------------------------------------------------------------------------
1323UBool RegexMatcher::hasAnchoringBounds() const {
1324 return fAnchoringBounds;
1325}
1326
1327
1328//--------------------------------------------------------------------------------
1329//
1330// hasTransparentBounds()
1331//
1332//--------------------------------------------------------------------------------
1333UBool RegexMatcher::hasTransparentBounds() const {
1334 return fTransparentBounds;
1335}
1336
1337
1338
1339//--------------------------------------------------------------------------------
1340//
1341// hitEnd()
1342//
1343//--------------------------------------------------------------------------------
1344UBool RegexMatcher::hitEnd() const {
1345 return fHitEnd;
1346}
1347
1348
1349//--------------------------------------------------------------------------------
1350//
1351// input()
1352//
1353//--------------------------------------------------------------------------------
1354const UnicodeString &RegexMatcher::input() const {
1355 if (!fInput) {
1356 UErrorCode status = U_ZERO_ERROR;
1357 int32_t len16;
1358 if (UTEXT_USES_U16(fInputText)) {
1359 len16 = (int32_t)fInputLength;
1360 } else {
1361 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1362 status = U_ZERO_ERROR; // overflow, length status
1363 }
1364 UnicodeString *result = new UnicodeString(len16, 0, 0);
57a6839d 1365
729e4ab9
A
1366 UChar *inputChars = result->getBuffer(len16);
1367 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1368 result->releaseBuffer(len16);
57a6839d 1369
729e4ab9
A
1370 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1371 }
57a6839d 1372
729e4ab9
A
1373 return *fInput;
1374}
1375
1376//--------------------------------------------------------------------------------
1377//
1378// inputText()
1379//
1380//--------------------------------------------------------------------------------
1381UText *RegexMatcher::inputText() const {
1382 return fInputText;
1383}
1384
1385
1386//--------------------------------------------------------------------------------
1387//
1388// getInput() -- like inputText(), but makes a clone or copies into another UText
1389//
1390//--------------------------------------------------------------------------------
1391UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
729e4ab9
A
1392 if (U_FAILURE(status)) {
1393 return dest;
1394 }
1395 if (U_FAILURE(fDeferredStatus)) {
1396 status = fDeferredStatus;
57a6839d 1397 return dest;
729e4ab9 1398 }
57a6839d 1399
729e4ab9
A
1400 if (dest) {
1401 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1402 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1403 } else {
1404 int32_t input16Len;
1405 if (UTEXT_USES_U16(fInputText)) {
1406 input16Len = (int32_t)fInputLength;
1407 } else {
1408 UErrorCode lengthStatus = U_ZERO_ERROR;
1409 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1410 }
1411 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1412 if (inputChars == NULL) {
1413 return dest;
1414 }
57a6839d 1415
729e4ab9
A
1416 status = U_ZERO_ERROR;
1417 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1418 status = U_ZERO_ERROR;
1419 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
57a6839d 1420
729e4ab9
A
1421 uprv_free(inputChars);
1422 }
1423 return dest;
1424 } else {
1425 return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1426 }
1427}
1428
1429
1430static UBool compat_SyncMutableUTextContents(UText *ut);
1431static UBool compat_SyncMutableUTextContents(UText *ut) {
1432 UBool retVal = FALSE;
57a6839d 1433
729e4ab9
A
1434 // In the following test, we're really only interested in whether the UText should switch
1435 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
1436 // will still point to the correct data.
1437 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1438 UnicodeString *us=(UnicodeString *)ut->context;
57a6839d 1439
729e4ab9
A
1440 // Update to the latest length.
1441 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1442 int32_t newLength = us->length();
57a6839d 1443
729e4ab9
A
1444 // Update the chunk description.
1445 // The buffer may have switched between stack- and heap-based.
1446 ut->chunkContents = us->getBuffer();
1447 ut->chunkLength = newLength;
1448 ut->chunkNativeLimit = newLength;
1449 ut->nativeIndexingLimit = newLength;
1450 retVal = TRUE;
1451 }
1452
1453 return retVal;
1454}
1455
1456//--------------------------------------------------------------------------------
1457//
1458// lookingAt()
1459//
1460//--------------------------------------------------------------------------------
1461UBool RegexMatcher::lookingAt(UErrorCode &status) {
1462 if (U_FAILURE(status)) {
1463 return FALSE;
1464 }
1465 if (U_FAILURE(fDeferredStatus)) {
1466 status = fDeferredStatus;
1467 return FALSE;
1468 }
57a6839d 1469
729e4ab9
A
1470 if (fInputUniStrMaybeMutable) {
1471 if (compat_SyncMutableUTextContents(fInputText)) {
1472 fInputLength = utext_nativeLength(fInputText);
1473 reset();
1474 }
1475 }
1476 else {
1477 resetPreserveRegion();
1478 }
1479 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1480 MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1481 } else {
1482 MatchAt(fActiveStart, FALSE, status);
1483 }
1484 return fMatch;
1485}
1486
1487
1488UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1489 if (U_FAILURE(status)) {
1490 return FALSE;
1491 }
1492 if (U_FAILURE(fDeferredStatus)) {
1493 status = fDeferredStatus;
1494 return FALSE;
1495 }
1496 reset();
57a6839d 1497
729e4ab9
A
1498 if (start < 0) {
1499 status = U_INDEX_OUTOFBOUNDS_ERROR;
1500 return FALSE;
1501 }
57a6839d 1502
729e4ab9
A
1503 if (fInputUniStrMaybeMutable) {
1504 if (compat_SyncMutableUTextContents(fInputText)) {
1505 fInputLength = utext_nativeLength(fInputText);
1506 reset();
1507 }
1508 }
1509
1510 int64_t nativeStart;
1511 nativeStart = start;
1512 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1513 status = U_INDEX_OUTOFBOUNDS_ERROR;
1514 return FALSE;
1515 }
57a6839d 1516
729e4ab9
A
1517 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1518 MatchChunkAt((int32_t)nativeStart, FALSE, status);
1519 } else {
1520 MatchAt(nativeStart, FALSE, status);
1521 }
1522 return fMatch;
1523}
1524
1525
1526
1527//--------------------------------------------------------------------------------
1528//
1529// matches()
1530//
1531//--------------------------------------------------------------------------------
1532UBool RegexMatcher::matches(UErrorCode &status) {
1533 if (U_FAILURE(status)) {
1534 return FALSE;
1535 }
1536 if (U_FAILURE(fDeferredStatus)) {
1537 status = fDeferredStatus;
1538 return FALSE;
1539 }
1540
1541 if (fInputUniStrMaybeMutable) {
1542 if (compat_SyncMutableUTextContents(fInputText)) {
1543 fInputLength = utext_nativeLength(fInputText);
1544 reset();
1545 }
1546 }
1547 else {
1548 resetPreserveRegion();
1549 }
1550
1551 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1552 MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1553 } else {
1554 MatchAt(fActiveStart, TRUE, status);
1555 }
1556 return fMatch;
1557}
1558
1559
1560UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1561 if (U_FAILURE(status)) {
1562 return FALSE;
1563 }
1564 if (U_FAILURE(fDeferredStatus)) {
1565 status = fDeferredStatus;
1566 return FALSE;
1567 }
1568 reset();
57a6839d 1569
729e4ab9
A
1570 if (start < 0) {
1571 status = U_INDEX_OUTOFBOUNDS_ERROR;
1572 return FALSE;
1573 }
1574
1575 if (fInputUniStrMaybeMutable) {
1576 if (compat_SyncMutableUTextContents(fInputText)) {
1577 fInputLength = utext_nativeLength(fInputText);
1578 reset();
1579 }
1580 }
1581
1582 int64_t nativeStart;
1583 nativeStart = start;
1584 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1585 status = U_INDEX_OUTOFBOUNDS_ERROR;
1586 return FALSE;
1587 }
1588
1589 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1590 MatchChunkAt((int32_t)nativeStart, TRUE, status);
1591 } else {
1592 MatchAt(nativeStart, TRUE, status);
1593 }
1594 return fMatch;
1595}
1596
1597
1598
1599//--------------------------------------------------------------------------------
1600//
1601// pattern
1602//
1603//--------------------------------------------------------------------------------
1604const RegexPattern &RegexMatcher::pattern() const {
1605 return *fPattern;
1606}
1607
1608
1609
1610//--------------------------------------------------------------------------------
1611//
1612// region
46f4442e
A
1613//
1614//--------------------------------------------------------------------------------
729e4ab9 1615RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
46f4442e
A
1616 if (U_FAILURE(status)) {
1617 return *this;
1618 }
57a6839d 1619
729e4ab9 1620 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
46f4442e
A
1621 status = U_ILLEGAL_ARGUMENT_ERROR;
1622 }
57a6839d 1623
729e4ab9
A
1624 int64_t nativeStart = regionStart;
1625 int64_t nativeLimit = regionLimit;
1626 if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1627 status = U_ILLEGAL_ARGUMENT_ERROR;
1628 }
1629
1630 if (startIndex == -1)
1631 this->reset();
1632 else
57a6839d
A
1633 resetPreserveRegion();
1634
729e4ab9
A
1635 fRegionStart = nativeStart;
1636 fRegionLimit = nativeLimit;
1637 fActiveStart = nativeStart;
1638 fActiveLimit = nativeLimit;
1639
1640 if (startIndex != -1) {
1641 if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1642 status = U_INDEX_OUTOFBOUNDS_ERROR;
1643 }
57a6839d 1644 fMatchEnd = startIndex;
729e4ab9
A
1645 }
1646
46f4442e 1647 if (!fTransparentBounds) {
729e4ab9
A
1648 fLookStart = nativeStart;
1649 fLookLimit = nativeLimit;
46f4442e
A
1650 }
1651 if (fAnchoringBounds) {
729e4ab9
A
1652 fAnchorStart = nativeStart;
1653 fAnchorLimit = nativeLimit;
46f4442e
A
1654 }
1655 return *this;
1656}
1657
729e4ab9
A
1658RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1659 return region(start, limit, -1, status);
1660}
46f4442e
A
1661
1662//--------------------------------------------------------------------------------
1663//
1664// regionEnd
1665//
1666//--------------------------------------------------------------------------------
1667int32_t RegexMatcher::regionEnd() const {
729e4ab9 1668 return (int32_t)fRegionLimit;
46f4442e
A
1669}
1670
729e4ab9
A
1671int64_t RegexMatcher::regionEnd64() const {
1672 return fRegionLimit;
1673}
46f4442e
A
1674
1675//--------------------------------------------------------------------------------
1676//
1677// regionStart
1678//
1679//--------------------------------------------------------------------------------
1680int32_t RegexMatcher::regionStart() const {
729e4ab9
A
1681 return (int32_t)fRegionStart;
1682}
1683
1684int64_t RegexMatcher::regionStart64() const {
46f4442e
A
1685 return fRegionStart;
1686}
1687
1688
b75a7d8f
A
1689//--------------------------------------------------------------------------------
1690//
1691// replaceAll
1692//
1693//--------------------------------------------------------------------------------
1694UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
729e4ab9
A
1695 UText replacementText = UTEXT_INITIALIZER;
1696 UText resultText = UTEXT_INITIALIZER;
1697 UnicodeString resultString;
1698 if (U_FAILURE(status)) {
1699 return resultString;
1700 }
57a6839d 1701
729e4ab9
A
1702 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1703 utext_openUnicodeString(&resultText, &resultString, &status);
57a6839d 1704
729e4ab9
A
1705 replaceAll(&replacementText, &resultText, status);
1706
1707 utext_close(&resultText);
1708 utext_close(&replacementText);
57a6839d 1709
729e4ab9
A
1710 return resultString;
1711}
1712
1713
1714//
1715// replaceAll, UText mode
1716//
1717UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
b75a7d8f 1718 if (U_FAILURE(status)) {
729e4ab9 1719 return dest;
b75a7d8f
A
1720 }
1721 if (U_FAILURE(fDeferredStatus)) {
1722 status = fDeferredStatus;
729e4ab9 1723 return dest;
b75a7d8f 1724 }
57a6839d 1725
729e4ab9
A
1726 if (dest == NULL) {
1727 UnicodeString emptyString;
1728 UText empty = UTEXT_INITIALIZER;
57a6839d 1729
729e4ab9
A
1730 utext_openUnicodeString(&empty, &emptyString, &status);
1731 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1732 utext_close(&empty);
1733 }
1734
1735 if (U_SUCCESS(status)) {
1736 reset();
1737 while (find()) {
1738 appendReplacement(dest, replacement, status);
1739 if (U_FAILURE(status)) {
1740 break;
1741 }
b75a7d8f 1742 }
729e4ab9 1743 appendTail(dest, status);
b75a7d8f 1744 }
57a6839d 1745
729e4ab9 1746 return dest;
b75a7d8f
A
1747}
1748
1749
b75a7d8f
A
1750//--------------------------------------------------------------------------------
1751//
1752// replaceFirst
1753//
1754//--------------------------------------------------------------------------------
1755UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
729e4ab9
A
1756 UText replacementText = UTEXT_INITIALIZER;
1757 UText resultText = UTEXT_INITIALIZER;
1758 UnicodeString resultString;
57a6839d 1759
729e4ab9
A
1760 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1761 utext_openUnicodeString(&resultText, &resultString, &status);
57a6839d 1762
729e4ab9 1763 replaceFirst(&replacementText, &resultText, status);
57a6839d 1764
729e4ab9
A
1765 utext_close(&resultText);
1766 utext_close(&replacementText);
57a6839d 1767
729e4ab9
A
1768 return resultString;
1769}
1770
1771//
1772// replaceFirst, UText mode
1773//
1774UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
b75a7d8f 1775 if (U_FAILURE(status)) {
729e4ab9 1776 return dest;
b75a7d8f
A
1777 }
1778 if (U_FAILURE(fDeferredStatus)) {
1779 status = fDeferredStatus;
729e4ab9 1780 return dest;
b75a7d8f
A
1781 }
1782
1783 reset();
1784 if (!find()) {
729e4ab9 1785 return getInput(dest, status);
b75a7d8f 1786 }
57a6839d 1787
729e4ab9
A
1788 if (dest == NULL) {
1789 UnicodeString emptyString;
1790 UText empty = UTEXT_INITIALIZER;
57a6839d 1791
729e4ab9
A
1792 utext_openUnicodeString(&empty, &emptyString, &status);
1793 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1794 utext_close(&empty);
1795 }
57a6839d 1796
729e4ab9
A
1797 appendReplacement(dest, replacement, status);
1798 appendTail(dest, status);
57a6839d 1799
729e4ab9 1800 return dest;
b75a7d8f
A
1801}
1802
1803
46f4442e
A
1804//--------------------------------------------------------------------------------
1805//
1806// requireEnd
1807//
1808//--------------------------------------------------------------------------------
1809UBool RegexMatcher::requireEnd() const {
1810 return fRequireEnd;
1811}
1812
b75a7d8f
A
1813
1814//--------------------------------------------------------------------------------
1815//
1816// reset
1817//
1818//--------------------------------------------------------------------------------
1819RegexMatcher &RegexMatcher::reset() {
46f4442e 1820 fRegionStart = 0;
729e4ab9 1821 fRegionLimit = fInputLength;
46f4442e 1822 fActiveStart = 0;
729e4ab9 1823 fActiveLimit = fInputLength;
46f4442e 1824 fAnchorStart = 0;
729e4ab9 1825 fAnchorLimit = fInputLength;
46f4442e 1826 fLookStart = 0;
729e4ab9 1827 fLookLimit = fInputLength;
46f4442e
A
1828 resetPreserveRegion();
1829 return *this;
1830}
1831
1832
1833
1834void RegexMatcher::resetPreserveRegion() {
374ca955
A
1835 fMatchStart = 0;
1836 fMatchEnd = 0;
1837 fLastMatchEnd = -1;
46f4442e 1838 fAppendPosition = 0;
374ca955 1839 fMatch = FALSE;
46f4442e
A
1840 fHitEnd = FALSE;
1841 fRequireEnd = FALSE;
1842 fTime = 0;
1843 fTickCounter = TIMER_INITIAL_VALUE;
729e4ab9 1844 //resetStack(); // more expensive than it looks...
b75a7d8f
A
1845}
1846
1847
b75a7d8f 1848RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
729e4ab9
A
1849 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1850 if (fPattern->fNeedsAltInput) {
1851 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1852 }
b331163b
A
1853 if (U_FAILURE(fDeferredStatus)) {
1854 return *this;
1855 }
729e4ab9 1856 fInputLength = utext_nativeLength(fInputText);
57a6839d 1857
b75a7d8f 1858 reset();
729e4ab9
A
1859 delete fInput;
1860 fInput = NULL;
1861
1862 // Do the following for any UnicodeString.
1863 // This is for compatibility for those clients who modify the input string "live" during regex operations.
57a6839d
A
1864 fInputUniStrMaybeMutable = TRUE;
1865
374ca955 1866 if (fWordBreakItr != NULL) {
729e4ab9
A
1867#if UCONFIG_NO_BREAK_ITERATION==0
1868 UErrorCode status = U_ZERO_ERROR;
1869 fWordBreakItr->setText(fInputText, status);
1870#endif
374ca955 1871 }
b75a7d8f
A
1872 return *this;
1873}
1874
b75a7d8f 1875
729e4ab9
A
1876RegexMatcher &RegexMatcher::reset(UText *input) {
1877 if (fInputText != input) {
1878 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1879 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
b331163b
A
1880 if (U_FAILURE(fDeferredStatus)) {
1881 return *this;
1882 }
729e4ab9 1883 fInputLength = utext_nativeLength(fInputText);
57a6839d 1884
729e4ab9
A
1885 delete fInput;
1886 fInput = NULL;
57a6839d 1887
729e4ab9
A
1888 if (fWordBreakItr != NULL) {
1889#if UCONFIG_NO_BREAK_ITERATION==0
1890 UErrorCode status = U_ZERO_ERROR;
1891 fWordBreakItr->setText(input, status);
1892#endif
1893 }
1894 }
1895 reset();
1896 fInputUniStrMaybeMutable = FALSE;
1897
1898 return *this;
1899}
1900
1901/*RegexMatcher &RegexMatcher::reset(const UChar *) {
1902 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1903 return *this;
1904}*/
1905
1906RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1907 if (U_FAILURE(status)) {
374ca955 1908 return *this;
b75a7d8f 1909 }
46f4442e 1910 reset(); // Reset also resets the region to be the entire string.
57a6839d 1911
729e4ab9 1912 if (position < 0 || position > fActiveLimit) {
374ca955
A
1913 status = U_INDEX_OUTOFBOUNDS_ERROR;
1914 return *this;
1915 }
1916 fMatchEnd = position;
1917 return *this;
b75a7d8f
A
1918}
1919
1920
4388f060
A
1921//--------------------------------------------------------------------------------
1922//
1923// refresh
1924//
1925//--------------------------------------------------------------------------------
1926RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1927 if (U_FAILURE(status)) {
1928 return *this;
1929 }
1930 if (input == NULL) {
1931 status = U_ILLEGAL_ARGUMENT_ERROR;
1932 return *this;
1933 }
1934 if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1935 status = U_ILLEGAL_ARGUMENT_ERROR;
1936 return *this;
1937 }
1938 int64_t pos = utext_getNativeIndex(fInputText);
1939 // Shallow read-only clone of the new UText into the existing input UText
1940 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1941 if (U_FAILURE(status)) {
1942 return *this;
1943 }
1944 utext_setNativeIndex(fInputText, pos);
1945
1946 if (fAltInputText != NULL) {
1947 pos = utext_getNativeIndex(fAltInputText);
1948 fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
1949 if (U_FAILURE(status)) {
1950 return *this;
1951 }
1952 utext_setNativeIndex(fAltInputText, pos);
1953 }
1954 return *this;
1955}
b75a7d8f 1956
374ca955
A
1957
1958
b75a7d8f
A
1959//--------------------------------------------------------------------------------
1960//
1961// setTrace
1962//
1963//--------------------------------------------------------------------------------
1964void RegexMatcher::setTrace(UBool state) {
1965 fTraceDebug = state;
1966}
1967
1968
1969
b331163b
A
1970/**
1971 * UText, replace entire contents of the destination UText with a substring of the source UText.
1972 *
1973 * @param src The source UText
1974 * @param dest The destination UText. Must be writable.
1975 * May be NULL, in which case a new UText will be allocated.
1976 * @param start Start index of source substring.
1977 * @param limit Limit index of source substring.
1978 * @param status An error code.
1979 */
1980static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) {
1981 if (U_FAILURE(*status)) {
1982 return dest;
1983 }
1984 if (start == limit) {
1985 if (dest) {
1986 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status);
1987 return dest;
1988 } else {
1989 return utext_openUChars(NULL, NULL, 0, status);
1990 }
1991 }
1992 int32_t length = utext_extract(src, start, limit, NULL, 0, status);
1993 if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
1994 return dest;
1995 }
1996 *status = U_ZERO_ERROR;
1997 MaybeStackArray<UChar, 40> buffer;
1998 if (length >= buffer.getCapacity()) {
1999 UChar *newBuf = buffer.resize(length+1); // Leave space for terminating Nul.
2000 if (newBuf == NULL) {
2001 *status = U_MEMORY_ALLOCATION_ERROR;
2002 }
2003 }
2004 utext_extract(src, start, limit, buffer.getAlias(), length+1, status);
2005 if (dest) {
2006 utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status);
2007 return dest;
2008 }
2009
2010 // Caller did not provide a prexisting UText.
2011 // Open a new one, and have it adopt the text buffer storage.
2012 if (U_FAILURE(*status)) {
2013 return NULL;
2014 }
2015 int32_t ownedLength = 0;
2016 UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);
2017 if (ownedBuf == NULL) {
2018 *status = U_MEMORY_ALLOCATION_ERROR;
2019 return NULL;
2020 }
2021 UText *result = utext_openUChars(NULL, ownedBuf, length, status);
2022 if (U_FAILURE(*status)) {
2023 uprv_free(ownedBuf);
2024 return NULL;
2025 }
2026 result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT);
2027 return result;
2028}
2029
2030
b75a7d8f
A
2031//---------------------------------------------------------------------
2032//
2033// split
2034//
2035//---------------------------------------------------------------------
2036int32_t RegexMatcher::split(const UnicodeString &input,
2037 UnicodeString dest[],
2038 int32_t destCapacity,
729e4ab9
A
2039 UErrorCode &status)
2040{
2041 UText inputText = UTEXT_INITIALIZER;
2042 utext_openConstUnicodeString(&inputText, &input, &status);
2043 if (U_FAILURE(status)) {
2044 return 0;
2045 }
2046
2047 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2048 if (destText == NULL) {
2049 status = U_MEMORY_ALLOCATION_ERROR;
2050 return 0;
2051 }
2052 int32_t i;
2053 for (i = 0; i < destCapacity; i++) {
2054 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2055 }
57a6839d 2056
729e4ab9 2057 int32_t fieldCount = split(&inputText, destText, destCapacity, status);
57a6839d 2058
729e4ab9
A
2059 for (i = 0; i < destCapacity; i++) {
2060 utext_close(destText[i]);
2061 }
2062
2063 uprv_free(destText);
2064 utext_close(&inputText);
2065 return fieldCount;
2066}
2067
2068//
2069// split, UText mode
2070//
2071int32_t RegexMatcher::split(UText *input,
2072 UText *dest[],
2073 int32_t destCapacity,
2074 UErrorCode &status)
b75a7d8f
A
2075{
2076 //
2077 // Check arguements for validity
2078 //
2079 if (U_FAILURE(status)) {
2080 return 0;
340931cb 2081 }
b75a7d8f
A
2082
2083 if (destCapacity < 1) {
2084 status = U_ILLEGAL_ARGUMENT_ERROR;
2085 return 0;
2086 }
2087
b75a7d8f
A
2088 //
2089 // Reset for the input text
2090 //
2091 reset(input);
729e4ab9 2092 int64_t nextOutputStringStart = 0;
46f4442e 2093 if (fActiveLimit == 0) {
b75a7d8f
A
2094 return 0;
2095 }
2096
b75a7d8f
A
2097 //
2098 // Loop through the input text, searching for the delimiter pattern
2099 //
73c04bcf 2100 int32_t i;
b75a7d8f
A
2101 int32_t numCaptureGroups = fPattern->fGroupMap->size();
2102 for (i=0; ; i++) {
2103 if (i>=destCapacity-1) {
2104 // There is one or zero output string left.
2105 // Fill the last output string with whatever is left from the input, then exit the loop.
729e4ab9 2106 // ( i will be == destCapacity if we filled the output array while processing
b75a7d8f
A
2107 // capture groups of the delimiter expression, in which case we will discard the
2108 // last capture group saved in favor of the unprocessed remainder of the
2109 // input string.)
2110 i = destCapacity-1;
729e4ab9
A
2111 if (fActiveLimit > nextOutputStringStart) {
2112 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2113 if (dest[i]) {
57a6839d
A
2114 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2115 input->chunkContents+nextOutputStringStart,
729e4ab9
A
2116 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2117 } else {
2118 UText remainingText = UTEXT_INITIALIZER;
57a6839d 2119 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
729e4ab9
A
2120 fActiveLimit-nextOutputStringStart, &status);
2121 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2122 utext_close(&remainingText);
2123 }
2124 } else {
2125 UErrorCode lengthStatus = U_ZERO_ERROR;
57a6839d 2126 int32_t remaining16Length =
729e4ab9
A
2127 utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2128 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2129 if (remainingChars == NULL) {
2130 status = U_MEMORY_ALLOCATION_ERROR;
2131 break;
2132 }
2133
2134 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2135 if (dest[i]) {
2136 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2137 } else {
2138 UText remainingText = UTEXT_INITIALIZER;
2139 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2140 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2141 utext_close(&remainingText);
2142 }
57a6839d 2143
729e4ab9
A
2144 uprv_free(remainingChars);
2145 }
b75a7d8f
A
2146 }
2147 break;
2148 }
2149 if (find()) {
2150 // We found another delimiter. Move everything from where we started looking
2151 // up until the start of the delimiter into the next output string.
729e4ab9
A
2152 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2153 if (dest[i]) {
57a6839d
A
2154 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2155 input->chunkContents+nextOutputStringStart,
729e4ab9
A
2156 (int32_t)(fMatchStart-nextOutputStringStart), &status);
2157 } else {
2158 UText remainingText = UTEXT_INITIALIZER;
57a6839d 2159 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
729e4ab9
A
2160 fMatchStart-nextOutputStringStart, &status);
2161 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2162 utext_close(&remainingText);
2163 }
2164 } else {
2165 UErrorCode lengthStatus = U_ZERO_ERROR;
2166 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2167 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2168 if (remainingChars == NULL) {
2169 status = U_MEMORY_ALLOCATION_ERROR;
2170 break;
2171 }
2172 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2173 if (dest[i]) {
2174 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2175 } else {
2176 UText remainingText = UTEXT_INITIALIZER;
2177 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2178 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2179 utext_close(&remainingText);
2180 }
57a6839d 2181
729e4ab9
A
2182 uprv_free(remainingChars);
2183 }
b75a7d8f
A
2184 nextOutputStringStart = fMatchEnd;
2185
2186 // If the delimiter pattern has capturing parentheses, the captured
2187 // text goes out into the next n destination strings.
2188 int32_t groupNum;
2189 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
4388f060
A
2190 if (i >= destCapacity-2) {
2191 // Never fill the last available output string with capture group text.
2192 // It will filled with the last field, the remainder of the
2193 // unsplit input text.
b75a7d8f
A
2194 break;
2195 }
2196 i++;
0f5d89e8 2197 dest[i] = utext_extract_replace(fInputText, dest[i],
b331163b 2198 start64(groupNum, status), end64(groupNum, status), &status);
b75a7d8f
A
2199 }
2200
46f4442e 2201 if (nextOutputStringStart == fActiveLimit) {
4388f060
A
2202 // The delimiter was at the end of the string. We're done, but first
2203 // we output one last empty string, for the empty field following
2204 // the delimiter at the end of input.
2205 if (i+1 < destCapacity) {
2206 ++i;
2207 if (dest[i] == NULL) {
2208 dest[i] = utext_openUChars(NULL, NULL, 0, &status);
2209 } else {
0f5d89e8 2210 static const UChar emptyString[] = {(UChar)0};
4388f060
A
2211 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2212 }
729e4ab9 2213 }
4388f060 2214 break;
57a6839d
A
2215
2216 }
b75a7d8f
A
2217 }
2218 else
2219 {
2220 // We ran off the end of the input while looking for the next delimiter.
2221 // All the remaining text goes into the current output string.
729e4ab9
A
2222 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2223 if (dest[i]) {
57a6839d
A
2224 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2225 input->chunkContents+nextOutputStringStart,
729e4ab9
A
2226 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2227 } else {
2228 UText remainingText = UTEXT_INITIALIZER;
57a6839d 2229 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
729e4ab9
A
2230 fActiveLimit-nextOutputStringStart, &status);
2231 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2232 utext_close(&remainingText);
2233 }
2234 } else {
2235 UErrorCode lengthStatus = U_ZERO_ERROR;
2236 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2237 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2238 if (remainingChars == NULL) {
2239 status = U_MEMORY_ALLOCATION_ERROR;
2240 break;
2241 }
57a6839d 2242
729e4ab9
A
2243 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2244 if (dest[i]) {
2245 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2246 } else {
2247 UText remainingText = UTEXT_INITIALIZER;
2248 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2249 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2250 utext_close(&remainingText);
2251 }
57a6839d 2252
729e4ab9
A
2253 uprv_free(remainingChars);
2254 }
b75a7d8f
A
2255 break;
2256 }
729e4ab9
A
2257 if (U_FAILURE(status)) {
2258 break;
2259 }
2260 } // end of for loop
b75a7d8f
A
2261 return i+1;
2262}
2263
2264
b75a7d8f
A
2265//--------------------------------------------------------------------------------
2266//
2267// start
2268//
2269//--------------------------------------------------------------------------------
2270int32_t RegexMatcher::start(UErrorCode &status) const {
2271 return start(0, status);
2272}
2273
729e4ab9
A
2274int64_t RegexMatcher::start64(UErrorCode &status) const {
2275 return start64(0, status);
2276}
b75a7d8f 2277
46f4442e
A
2278//--------------------------------------------------------------------------------
2279//
2280// start(int32_t group, UErrorCode &status)
2281//
2282//--------------------------------------------------------------------------------
729e4ab9
A
2283
2284int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
b75a7d8f
A
2285 if (U_FAILURE(status)) {
2286 return -1;
2287 }
2288 if (U_FAILURE(fDeferredStatus)) {
2289 status = fDeferredStatus;
2290 return -1;
2291 }
2292 if (fMatch == FALSE) {
2293 status = U_REGEX_INVALID_STATE;
2294 return -1;
2295 }
2296 if (group < 0 || group > fPattern->fGroupMap->size()) {
2297 status = U_INDEX_OUTOFBOUNDS_ERROR;
2298 return -1;
2299 }
729e4ab9 2300 int64_t s;
b75a7d8f 2301 if (group == 0) {
57a6839d 2302 s = fMatchStart;
b75a7d8f
A
2303 } else {
2304 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2305 U_ASSERT(groupOffset < fPattern->fFrameSize);
2306 U_ASSERT(groupOffset >= 0);
2307 s = fFrame->fExtra[groupOffset];
2308 }
57a6839d 2309
b75a7d8f
A
2310 return s;
2311}
2312
2313
729e4ab9
A
2314int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2315 return (int32_t)start64(group, status);
2316}
b75a7d8f 2317
46f4442e
A
2318//--------------------------------------------------------------------------------
2319//
2320// useAnchoringBounds
2321//
2322//--------------------------------------------------------------------------------
2323RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2324 fAnchoringBounds = b;
729e4ab9
A
2325 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2326 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
46f4442e
A
2327 return *this;
2328}
2329
2330
2331//--------------------------------------------------------------------------------
2332//
2333// useTransparentBounds
2334//
2335//--------------------------------------------------------------------------------
2336RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2337 fTransparentBounds = b;
729e4ab9
A
2338 fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2339 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
46f4442e
A
2340 return *this;
2341}
2342
2343//--------------------------------------------------------------------------------
2344//
2345// setTimeLimit
2346//
2347//--------------------------------------------------------------------------------
2348void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2349 if (U_FAILURE(status)) {
2350 return;
2351 }
2352 if (U_FAILURE(fDeferredStatus)) {
2353 status = fDeferredStatus;
2354 return;
2355 }
2356 if (limit < 0) {
2357 status = U_ILLEGAL_ARGUMENT_ERROR;
2358 return;
2359 }
2360 fTimeLimit = limit;
2361}
2362
2363
2364//--------------------------------------------------------------------------------
2365//
2366// getTimeLimit
2367//
2368//--------------------------------------------------------------------------------
2369int32_t RegexMatcher::getTimeLimit() const {
2370 return fTimeLimit;
2371}
2372
2373
2374//--------------------------------------------------------------------------------
2375//
2376// setStackLimit
2377//
2378//--------------------------------------------------------------------------------
2379void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2380 if (U_FAILURE(status)) {
2381 return;
2382 }
2383 if (U_FAILURE(fDeferredStatus)) {
2384 status = fDeferredStatus;
2385 return;
2386 }
2387 if (limit < 0) {
2388 status = U_ILLEGAL_ARGUMENT_ERROR;
2389 return;
2390 }
57a6839d 2391
46f4442e 2392 // Reset the matcher. This is needed here in case there is a current match
57a6839d 2393 // whose final stack frame (containing the match results, pointed to by fFrame)
46f4442e
A
2394 // would be lost by resizing to a smaller stack size.
2395 reset();
57a6839d 2396
46f4442e
A
2397 if (limit == 0) {
2398 // Unlimited stack expansion
2399 fStack->setMaxCapacity(0);
2400 } else {
2401 // Change the units of the limit from bytes to ints, and bump the size up
57a6839d 2402 // to be big enough to hold at least one stack frame for the pattern,
46f4442e
A
2403 // if it isn't there already.
2404 int32_t adjustedLimit = limit / sizeof(int32_t);
2405 if (adjustedLimit < fPattern->fFrameSize) {
2406 adjustedLimit = fPattern->fFrameSize;
2407 }
2408 fStack->setMaxCapacity(adjustedLimit);
2409 }
2410 fStackLimit = limit;
2411}
2412
2413
2414//--------------------------------------------------------------------------------
2415//
2416// getStackLimit
2417//
2418//--------------------------------------------------------------------------------
2419int32_t RegexMatcher::getStackLimit() const {
2420 return fStackLimit;
2421}
2422
2423
2424//--------------------------------------------------------------------------------
2425//
2426// setMatchCallback
2427//
2428//--------------------------------------------------------------------------------
2429void RegexMatcher::setMatchCallback(URegexMatchCallback *callback,
2430 const void *context,
2431 UErrorCode &status) {
729e4ab9
A
2432 if (U_FAILURE(status)) {
2433 return;
2434 }
2435 fCallbackFn = callback;
2436 fCallbackContext = context;
46f4442e
A
2437}
2438
2439
2440//--------------------------------------------------------------------------------
2441//
2442// getMatchCallback
2443//
2444//--------------------------------------------------------------------------------
2445void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback,
2446 const void *&context,
2447 UErrorCode &status) {
2448 if (U_FAILURE(status)) {
2449 return;
2450 }
2451 callback = fCallbackFn;
2452 context = fCallbackContext;
2453}
2454
2455
729e4ab9
A
2456//--------------------------------------------------------------------------------
2457//
2458// setFindProgressCallback
2459//
2460//--------------------------------------------------------------------------------
2461void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback,
2462 const void *context,
2463 UErrorCode &status) {
2464 if (U_FAILURE(status)) {
2465 return;
2466 }
2467 fFindProgressCallbackFn = callback;
2468 fFindProgressCallbackContext = context;
2469}
2470
2471
2472//--------------------------------------------------------------------------------
2473//
2474// getFindProgressCallback
2475//
2476//--------------------------------------------------------------------------------
2477void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback,
2478 const void *&context,
2479 UErrorCode &status) {
2480 if (U_FAILURE(status)) {
2481 return;
2482 }
2483 callback = fFindProgressCallbackFn;
2484 context = fFindProgressCallbackContext;
2485}
2486
2487
374ca955
A
2488//================================================================================
2489//
2490// Code following this point in this file is the internal
2491// Match Engine Implementation.
2492//
2493//================================================================================
2494
2495
2496//--------------------------------------------------------------------------------
2497//
2498// resetStack
2499// Discard any previous contents of the state save stack, and initialize a
57a6839d 2500// new stack frame to all -1. The -1s are needed for capture group limits,
374ca955
A
2501// where they indicate that a group has not yet matched anything.
2502//--------------------------------------------------------------------------------
2503REStackFrame *RegexMatcher::resetStack() {
2504 // Discard any previous contents of the state save stack, and initialize a
729e4ab9
A
2505 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2506 // where they indicate that a group has not yet matched anything.
374ca955
A
2507 fStack->removeAllElements();
2508
729e4ab9 2509 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2ca993e8
A
2510 if(U_FAILURE(fDeferredStatus)) {
2511 return NULL;
2512 }
2513
729e4ab9
A
2514 int32_t i;
2515 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2516 iFrame->fExtra[i] = -1;
2517 }
2518 return iFrame;
2519}
2520
2521
2522
2523//--------------------------------------------------------------------------------
2524//
57a6839d 2525// isWordBoundary
729e4ab9
A
2526// in perl, "xab..cd..", \b is true at positions 0,3,5,7
2527// For us,
2528// If the current char is a combining mark,
2529// \b is FALSE.
2530// Else Scan backwards to the first non-combining char.
2531// We are at a boundary if the this char and the original chars are
2532// opposite in membership in \w set
2533//
2534// parameters: pos - the current position in the input buffer
2535//
2536// TODO: double-check edge cases at region boundaries.
2537//
2538//--------------------------------------------------------------------------------
2539UBool RegexMatcher::isWordBoundary(int64_t pos) {
2540 UBool isBoundary = FALSE;
2541 UBool cIsWord = FALSE;
57a6839d 2542
729e4ab9
A
2543 if (pos >= fLookLimit) {
2544 fHitEnd = TRUE;
2545 } else {
2546 // Determine whether char c at current position is a member of the word set of chars.
2547 // If we're off the end of the string, behave as though we're not at a word char.
2548 UTEXT_SETNATIVEINDEX(fInputText, pos);
2549 UChar32 c = UTEXT_CURRENT32(fInputText);
2550 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2551 // Current char is a combining one. Not a boundary.
2552 return FALSE;
2553 }
2554 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2555 }
57a6839d 2556
729e4ab9
A
2557 // Back up until we come to a non-combining char, determine whether
2558 // that char is a word char.
2559 UBool prevCIsWord = FALSE;
2560 for (;;) {
2561 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2562 break;
2563 }
2564 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2565 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2566 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2567 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2568 break;
2569 }
2570 }
2571 isBoundary = cIsWord ^ prevCIsWord;
2572 return isBoundary;
2573}
2574
2575UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2576 UBool isBoundary = FALSE;
2577 UBool cIsWord = FALSE;
57a6839d 2578
729e4ab9 2579 const UChar *inputBuf = fInputText->chunkContents;
57a6839d 2580
729e4ab9
A
2581 if (pos >= fLookLimit) {
2582 fHitEnd = TRUE;
2583 } else {
2584 // Determine whether char c at current position is a member of the word set of chars.
2585 // If we're off the end of the string, behave as though we're not at a word char.
2586 UChar32 c;
2587 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2588 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2589 // Current char is a combining one. Not a boundary.
2590 return FALSE;
2591 }
2592 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2593 }
57a6839d 2594
729e4ab9
A
2595 // Back up until we come to a non-combining char, determine whether
2596 // that char is a word char.
2597 UBool prevCIsWord = FALSE;
2598 for (;;) {
2599 if (pos <= fLookStart) {
2600 break;
2601 }
2602 UChar32 prevChar;
2603 U16_PREV(inputBuf, fLookStart, pos, prevChar);
2604 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2605 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2606 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2607 break;
2608 }
2609 }
2610 isBoundary = cIsWord ^ prevCIsWord;
2611 return isBoundary;
2612}
2613
2614//--------------------------------------------------------------------------------
2615//
57a6839d 2616// isUWordBoundary
729e4ab9
A
2617//
2618// Test for a word boundary using RBBI word break.
2619//
2620// parameters: pos - the current position in the input buffer
2621//
2622//--------------------------------------------------------------------------------
2623UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2624 UBool returnVal = FALSE;
2625#if UCONFIG_NO_BREAK_ITERATION==0
57a6839d 2626
729e4ab9
A
2627 // If we haven't yet created a break iterator for this matcher, do it now.
2628 if (fWordBreakItr == NULL) {
57a6839d 2629 fWordBreakItr =
729e4ab9
A
2630 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
2631 if (U_FAILURE(fDeferredStatus)) {
2632 return FALSE;
2633 }
2634 fWordBreakItr->setText(fInputText, fDeferredStatus);
2635 }
2636
2637 if (pos >= fLookLimit) {
2638 fHitEnd = TRUE;
2639 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real"
2640 // words are not boundaries. All non-word chars stand by themselves,
2641 // with word boundaries on both sides.
2642 } else {
2643 if (!UTEXT_USES_U16(fInputText)) {
2644 // !!!: Would like a better way to do this!
2645 UErrorCode status = U_ZERO_ERROR;
2646 pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
2647 }
2648 returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2649 }
2650#endif
2651 return returnVal;
2652}
2653
2654//--------------------------------------------------------------------------------
2655//
2656// IncrementTime This function is called once each TIMER_INITIAL_VALUE state
2657// saves. Increment the "time" counter, and call the
2658// user callback function if there is one installed.
2659//
2660// If the match operation needs to be aborted, either for a time-out
2661// or because the user callback asked for it, just set an error status.
2662// The engine will pick that up and stop in its outer loop.
2663//
2664//--------------------------------------------------------------------------------
2665void RegexMatcher::IncrementTime(UErrorCode &status) {
2666 fTickCounter = TIMER_INITIAL_VALUE;
2667 fTime++;
2668 if (fCallbackFn != NULL) {
2669 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2670 status = U_REGEX_STOPPED_BY_CALLER;
2671 return;
2672 }
2673 }
2674 if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2675 status = U_REGEX_TIME_OUT;
2676 }
2677}
2678
729e4ab9
A
2679//--------------------------------------------------------------------------------
2680//
2681// StateSave
2682// Make a new stack frame, initialized as a copy of the current stack frame.
2683// Set the pattern index in the original stack frame from the operand value
2684// in the opcode. Execution of the engine continues with the state in
2685// the newly created stack frame
2686//
2687// Note that reserveBlock() may grow the stack, resulting in the
2688// whole thing being relocated in memory.
2689//
2690// Parameters:
57a6839d 2691// fp The top frame pointer when called. At return, a new
729e4ab9
A
2692// frame will be present
2693// savePatIdx An index into the compiled pattern. Goes into the original
2694// (not new) frame. If execution ever back-tracks out of the
2695// new frame, this will be where we continue from in the pattern.
2696// Return
2697// The new frame pointer.
2698//
2699//--------------------------------------------------------------------------------
2700inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2ca993e8
A
2701 if (U_FAILURE(status)) {
2702 return fp;
2703 }
57a6839d 2704 // push storage for a new frame.
729e4ab9 2705 int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2ca993e8 2706 if (U_FAILURE(status)) {
729e4ab9
A
2707 // Failure on attempted stack expansion.
2708 // Stack function set some other error code, change it to a more
2709 // specific one for regular expressions.
2710 status = U_REGEX_STACK_OVERFLOW;
2711 // We need to return a writable stack frame, so just return the
2712 // previous frame. The match operation will stop quickly
2713 // because of the error status, after which the frame will never
2714 // be looked at again.
2715 return fp;
2716 }
2717 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack.
57a6839d 2718
729e4ab9
A
2719 // New stack frame = copy of old top frame.
2720 int64_t *source = (int64_t *)fp;
2721 int64_t *dest = newFP;
2722 for (;;) {
2723 *dest++ = *source++;
2724 if (source == newFP) {
2725 break;
2726 }
2727 }
57a6839d 2728
729e4ab9
A
2729 fTickCounter--;
2730 if (fTickCounter <= 0) {
2731 IncrementTime(status); // Re-initializes fTickCounter
2732 }
2733 fp->fPatIdx = savePatIdx;
2734 return (REStackFrame *)newFP;
2735}
2736
2ca993e8
A
2737#if defined(REGEX_DEBUG)
2738namespace {
2739UnicodeString StringFromUText(UText *ut) {
2740 UnicodeString result;
2741 for (UChar32 c = utext_next32From(ut, 0); c != U_SENTINEL; c = UTEXT_NEXT32(ut)) {
2742 result.append(c);
2743 }
2744 return result;
2745}
2746}
2747#endif // REGEX_DEBUG
2748
729e4ab9
A
2749
2750//--------------------------------------------------------------------------------
2751//
2752// MatchAt This is the actual matching engine.
2753//
2754// startIdx: begin matching a this index.
2755// toEnd: if true, match must extend to end of the input region
2756//
2757//--------------------------------------------------------------------------------
2758void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2759 UBool isMatch = FALSE; // True if the we have a match.
57a6839d 2760
729e4ab9
A
2761 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2762
2763 int32_t op; // Operation from the compiled pattern, split into
2764 int32_t opType; // the opcode
2765 int32_t opValue; // and the operand value.
57a6839d
A
2766
2767#ifdef REGEX_RUN_DEBUG
2ca993e8 2768 if (fTraceDebug) {
729e4ab9 2769 printf("MatchAt(startIdx=%ld)\n", startIdx);
2ca993e8
A
2770 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
2771 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
729e4ab9 2772 }
57a6839d 2773#endif
729e4ab9
A
2774
2775 if (U_FAILURE(status)) {
2776 return;
2777 }
2778
2779 // Cache frequently referenced items from the compiled pattern
2780 //
2781 int64_t *pat = fPattern->fCompiledPat->getBuffer();
2782
2783 const UChar *litText = fPattern->fLiteralText.getBuffer();
3d1f044b 2784 UVector *fSets = fPattern->fSets;
729e4ab9
A
2785
2786 fFrameSize = fPattern->fFrameSize;
2787 REStackFrame *fp = resetStack();
2ca993e8
A
2788 if (U_FAILURE(fDeferredStatus)) {
2789 status = fDeferredStatus;
2790 return;
2791 }
729e4ab9
A
2792
2793 fp->fPatIdx = 0;
2794 fp->fInputIdx = startIdx;
2795
2796 // Zero out the pattern's static data
2797 int32_t i;
2798 for (i = 0; i<fPattern->fDataSize; i++) {
2799 fData[i] = 0;
2800 }
2801
2802 //
2803 // Main loop for interpreting the compiled pattern.
2804 // One iteration of the loop per pattern operation performed.
2805 //
2806 for (;;) {
729e4ab9
A
2807 op = (int32_t)pat[fp->fPatIdx];
2808 opType = URX_TYPE(op);
2809 opValue = URX_VAL(op);
57a6839d 2810#ifdef REGEX_RUN_DEBUG
729e4ab9
A
2811 if (fTraceDebug) {
2812 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 2813 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
729e4ab9
A
2814 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2815 fPattern->dumpOp(fp->fPatIdx);
2816 }
57a6839d 2817#endif
729e4ab9 2818 fp->fPatIdx++;
57a6839d 2819
729e4ab9
A
2820 switch (opType) {
2821
2822
2823 case URX_NOP:
2824 break;
2825
2826
2827 case URX_BACKTRACK:
2828 // Force a backtrack. In some circumstances, the pattern compiler
2829 // will notice that the pattern can't possibly match anything, and will
2830 // emit one of these at that point.
2831 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2832 break;
2833
2834
2835 case URX_ONECHAR:
2836 if (fp->fInputIdx < fActiveLimit) {
2837 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2838 UChar32 c = UTEXT_NEXT32(fInputText);
2839 if (c == opValue) {
2840 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2841 break;
2842 }
2843 } else {
2844 fHitEnd = TRUE;
2845 }
729e4ab9
A
2846 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2847 break;
2848
2849
2850 case URX_STRING:
2851 {
2852 // Test input against a literal string.
2853 // Strings require two slots in the compiled pattern, one for the
2854 // offset to the string text, and one for the length.
729e4ab9 2855
4388f060 2856 int32_t stringStartIdx = opValue;
729e4ab9
A
2857 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
2858 fp->fPatIdx++;
2859 opType = URX_TYPE(op);
4388f060 2860 int32_t stringLen = URX_VAL(op);
729e4ab9
A
2861 U_ASSERT(opType == URX_STRING_LEN);
2862 U_ASSERT(stringLen >= 2);
57a6839d 2863
4388f060
A
2864 const UChar *patternString = litText+stringStartIdx;
2865 int32_t patternStringIndex = 0;
729e4ab9 2866 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4388f060
A
2867 UChar32 inputChar;
2868 UChar32 patternChar;
729e4ab9 2869 UBool success = TRUE;
4388f060
A
2870 while (patternStringIndex < stringLen) {
2871 if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
729e4ab9 2872 success = FALSE;
4388f060
A
2873 fHitEnd = TRUE;
2874 break;
2875 }
2876 inputChar = UTEXT_NEXT32(fInputText);
2877 U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
2878 if (patternChar != inputChar) {
2879 success = FALSE;
2880 break;
729e4ab9
A
2881 }
2882 }
57a6839d 2883
729e4ab9
A
2884 if (success) {
2885 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2886 } else {
729e4ab9
A
2887 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2888 }
2889 }
2890 break;
2891
2892
2893 case URX_STATE_SAVE:
2894 fp = StateSave(fp, opValue, status);
2895 break;
2896
2897
2898 case URX_END:
2899 // The match loop will exit via this path on a successful match,
2900 // when we reach the end of the pattern.
2901 if (toEnd && fp->fInputIdx != fActiveLimit) {
2902 // The pattern matched, but not to the end of input. Try some more.
2903 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2904 break;
2905 }
2906 isMatch = TRUE;
2907 goto breakFromLoop;
2908
2909 // Start and End Capture stack frame variables are laid out like this:
2910 // fp->fExtra[opValue] - The start of a completed capture group
2911 // opValue+1 - The end of a completed capture group
2912 // opValue+2 - the start of a capture group whose end
2913 // has not yet been reached (and might not ever be).
2914 case URX_START_CAPTURE:
2915 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2916 fp->fExtra[opValue+2] = fp->fInputIdx;
2917 break;
2918
2919
2920 case URX_END_CAPTURE:
2921 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2922 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
2923 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
2924 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
2925 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2926 break;
2927
2928
2929 case URX_DOLLAR: // $, test for End of line
2930 // or for position before new line at end of input
2931 {
2932 if (fp->fInputIdx >= fAnchorLimit) {
2933 // We really are at the end of input. Success.
2934 fHitEnd = TRUE;
2935 fRequireEnd = TRUE;
2936 break;
2937 }
57a6839d 2938
729e4ab9 2939 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 2940
729e4ab9
A
2941 // If we are positioned just before a new-line that is located at the
2942 // end of input, succeed.
2943 UChar32 c = UTEXT_NEXT32(fInputText);
2944 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
b331163b 2945 if (isLineTerminator(c)) {
729e4ab9 2946 // If not in the middle of a CR/LF sequence
b331163b 2947 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
729e4ab9
A
2948 // At new-line at end of input. Success
2949 fHitEnd = TRUE;
2950 fRequireEnd = TRUE;
57a6839d 2951
729e4ab9
A
2952 break;
2953 }
2954 }
2955 } else {
2956 UChar32 nextC = UTEXT_NEXT32(fInputText);
2957 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2958 fHitEnd = TRUE;
2959 fRequireEnd = TRUE;
2960 break; // At CR/LF at end of input. Success
2961 }
2962 }
2963
2964 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2965 }
2966 break;
2967
2968
2969 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
2970 if (fp->fInputIdx >= fAnchorLimit) {
2971 // Off the end of input. Success.
2972 fHitEnd = TRUE;
2973 fRequireEnd = TRUE;
2974 break;
2975 } else {
2976 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2977 UChar32 c = UTEXT_NEXT32(fInputText);
2978 // Either at the last character of input, or off the end.
2979 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
2980 fHitEnd = TRUE;
2981 fRequireEnd = TRUE;
2982 break;
2983 }
2984 }
2985
2986 // Not at end of input. Back-track out.
2987 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2988 break;
2989
2990
2991 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
2992 {
2993 if (fp->fInputIdx >= fAnchorLimit) {
2994 // We really are at the end of input. Success.
2995 fHitEnd = TRUE;
2996 fRequireEnd = TRUE;
2997 break;
2998 }
2999 // If we are positioned just before a new-line, succeed.
3000 // It makes no difference where the new-line is within the input.
3001 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3002 UChar32 c = UTEXT_CURRENT32(fInputText);
b331163b 3003 if (isLineTerminator(c)) {
729e4ab9
A
3004 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
3005 // In multi-line mode, hitting a new-line just before the end of input does not
3006 // set the hitEnd or requireEnd flags
3007 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3008 break;
3009 }
3010 }
3011 // not at a new line. Fail.
3012 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3013 }
3014 break;
3015
3016
3017 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
3018 {
3019 if (fp->fInputIdx >= fAnchorLimit) {
3020 // We really are at the end of input. Success.
3021 fHitEnd = TRUE;
3022 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
3023 break; // adding a new-line would not lose the match.
3024 }
3025 // If we are not positioned just before a new-line, the test fails; backtrack out.
3026 // It makes no difference where the new-line is within the input.
3027 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3028 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3029 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3030 }
3031 }
3032 break;
3033
3034
3035 case URX_CARET: // ^, test for start of line
3036 if (fp->fInputIdx != fAnchorStart) {
3037 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3038 }
3039 break;
3040
3041
3042 case URX_CARET_M: // ^, test for start of line in mulit-line mode
3043 {
3044 if (fp->fInputIdx == fAnchorStart) {
3045 // We are at the start of input. Success.
3046 break;
3047 }
3048 // Check whether character just before the current pos is a new-line
3049 // unless we are at the end of input
3050 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 3051 UChar32 c = UTEXT_PREVIOUS32(fInputText);
b331163b 3052 if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
729e4ab9
A
3053 // It's a new-line. ^ is true. Success.
3054 // TODO: what should be done with positions between a CR and LF?
3055 break;
3056 }
3057 // Not at the start of a line. Fail.
3058 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3059 }
3060 break;
3061
3062
3063 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
3064 {
3065 U_ASSERT(fp->fInputIdx >= fAnchorStart);
3066 if (fp->fInputIdx <= fAnchorStart) {
3067 // We are at the start of input. Success.
3068 break;
3069 }
3070 // Check whether character just before the current pos is a new-line
3071 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3072 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3073 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3074 if (c != 0x0a) {
3075 // Not at the start of a line. Back-track out.
3076 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3077 }
3078 }
3079 break;
3080
3081 case URX_BACKSLASH_B: // Test for word boundaries
3082 {
3083 UBool success = isWordBoundary(fp->fInputIdx);
51004dcb 3084 success ^= (UBool)(opValue != 0); // flip sense for \B
729e4ab9
A
3085 if (!success) {
3086 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3087 }
3088 }
3089 break;
3090
3091
3092 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
3093 {
3094 UBool success = isUWordBoundary(fp->fInputIdx);
51004dcb 3095 success ^= (UBool)(opValue != 0); // flip sense for \B
729e4ab9
A
3096 if (!success) {
3097 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3098 }
3099 }
3100 break;
3101
3102
3103 case URX_BACKSLASH_D: // Test for decimal digit
3104 {
3105 if (fp->fInputIdx >= fActiveLimit) {
3106 fHitEnd = TRUE;
3107 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3108 break;
3109 }
3110
3111 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3112
3113 UChar32 c = UTEXT_NEXT32(fInputText);
3114 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
3115 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
51004dcb 3116 success ^= (UBool)(opValue != 0); // flip sense for \D
729e4ab9
A
3117 if (success) {
3118 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3119 } else {
3120 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3121 }
3122 }
3123 break;
3124
3125
3126 case URX_BACKSLASH_G: // Test for position at end of previous match
3127 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3128 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3129 }
3130 break;
3131
3132
b331163b
A
3133 case URX_BACKSLASH_H: // Test for \h, horizontal white space.
3134 {
3135 if (fp->fInputIdx >= fActiveLimit) {
3136 fHitEnd = TRUE;
3137 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3138 break;
3139 }
3140 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3141 UChar32 c = UTEXT_NEXT32(fInputText);
3142 int8_t ctype = u_charType(c);
3143 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
3144 success ^= (UBool)(opValue != 0); // flip sense for \H
3145 if (success) {
3146 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3147 } else {
3148 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3149 }
3150 }
3151 break;
3152
3153
3154 case URX_BACKSLASH_R: // Test for \R, any line break sequence.
3155 {
3156 if (fp->fInputIdx >= fActiveLimit) {
3157 fHitEnd = TRUE;
3158 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3159 break;
3160 }
3161 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3162 UChar32 c = UTEXT_NEXT32(fInputText);
3163 if (isLineTerminator(c)) {
3164 if (c == 0x0d && utext_current32(fInputText) == 0x0a) {
3165 utext_next32(fInputText);
3166 }
3167 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3168 } else {
3169 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3170 }
3171 }
3172 break;
3173
3174
3175 case URX_BACKSLASH_V: // \v, any single line ending character.
3176 {
3177 if (fp->fInputIdx >= fActiveLimit) {
3178 fHitEnd = TRUE;
3179 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3180 break;
3181 }
3182 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3183 UChar32 c = UTEXT_NEXT32(fInputText);
3184 UBool success = isLineTerminator(c);
3185 success ^= (UBool)(opValue != 0); // flip sense for \V
3186 if (success) {
3187 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3188 } else {
3189 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3190 }
3191 }
3192 break;
3193
3194
57a6839d 3195 case URX_BACKSLASH_X:
729e4ab9
A
3196 // Match a Grapheme, as defined by Unicode TR 29.
3197 // Differs slightly from Perl, which consumes combining marks independently
3198 // of context.
3199 {
3200
3201 // Fail if at end of input
3202 if (fp->fInputIdx >= fActiveLimit) {
3203 fHitEnd = TRUE;
3204 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3205 break;
3206 }
57a6839d 3207
729e4ab9
A
3208 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3209
3210 // Examine (and consume) the current char.
3211 // Dispatch into a little state machine, based on the char.
3212 UChar32 c;
3213 c = UTEXT_NEXT32(fInputText);
3214 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3215 UnicodeSet **sets = fPattern->fStaticSets;
3216 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
3217 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
3218 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3219 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3220 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3221 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3222 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3223 goto GC_Extend;
3224
3225
3226
3227GC_L:
3228 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3229 c = UTEXT_NEXT32(fInputText);
3230 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3231 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3232 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3233 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3234 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4388f060 3235 (void)UTEXT_PREVIOUS32(fInputText);
729e4ab9
A
3236 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3237 goto GC_Extend;
3238
3239GC_V:
3240 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3241 c = UTEXT_NEXT32(fInputText);
3242 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3243 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3244 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4388f060 3245 (void)UTEXT_PREVIOUS32(fInputText);
729e4ab9
A
3246 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3247 goto GC_Extend;
3248
3249GC_T:
3250 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3251 c = UTEXT_NEXT32(fInputText);
3252 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3253 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4388f060 3254 (void)UTEXT_PREVIOUS32(fInputText);
729e4ab9
A
3255 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3256 goto GC_Extend;
3257
3258GC_Extend:
3259 // Combining characters are consumed here
3260 for (;;) {
3261 if (fp->fInputIdx >= fActiveLimit) {
3262 break;
3263 }
3264 c = UTEXT_CURRENT32(fInputText);
3265 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3266 break;
3267 }
4388f060 3268 (void)UTEXT_NEXT32(fInputText);
729e4ab9
A
3269 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3270 }
3271 goto GC_Done;
3272
3273GC_Control:
57a6839d 3274 // Most control chars stand alone (don't combine with combining chars),
729e4ab9
A
3275 // except for that CR/LF sequence is a single grapheme cluster.
3276 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
3277 c = UTEXT_NEXT32(fInputText);
3278 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3279 }
3280
3281GC_Done:
3282 if (fp->fInputIdx >= fActiveLimit) {
3283 fHitEnd = TRUE;
3284 }
3285 break;
3286 }
57a6839d 3287
729e4ab9
A
3288
3289
3290
3291 case URX_BACKSLASH_Z: // Test for end of Input
3292 if (fp->fInputIdx < fAnchorLimit) {
3293 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3294 } else {
3295 fHitEnd = TRUE;
3296 fRequireEnd = TRUE;
3297 }
3298 break;
3299
3300
3301
3302 case URX_STATIC_SETREF:
3303 {
3304 // Test input character against one of the predefined sets
3305 // (Word Characters, for example)
3306 // The high bit of the op value is a flag for the match polarity.
3307 // 0: success if input char is in set.
3308 // 1: success if input char is not in set.
3309 if (fp->fInputIdx >= fActiveLimit) {
3310 fHitEnd = TRUE;
3311 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3312 break;
3313 }
3314
57a6839d 3315 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
729e4ab9
A
3316 opValue &= ~URX_NEG_SET;
3317 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3318
3319 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3320 UChar32 c = UTEXT_NEXT32(fInputText);
3321 if (c < 256) {
3322 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3323 if (s8->contains(c)) {
3324 success = !success;
3325 }
3326 } else {
3327 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3328 if (s->contains(c)) {
3329 success = !success;
3330 }
3331 }
3332 if (success) {
3333 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3334 } else {
3335 // the character wasn't in the set.
729e4ab9
A
3336 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3337 }
3338 }
3339 break;
57a6839d 3340
729e4ab9
A
3341
3342 case URX_STAT_SETREF_N:
3343 {
57a6839d 3344 // Test input character for NOT being a member of one of
729e4ab9
A
3345 // the predefined sets (Word Characters, for example)
3346 if (fp->fInputIdx >= fActiveLimit) {
3347 fHitEnd = TRUE;
3348 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3349 break;
3350 }
3351
3352 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3353
3354 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 3355
729e4ab9
A
3356 UChar32 c = UTEXT_NEXT32(fInputText);
3357 if (c < 256) {
3358 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3359 if (s8->contains(c) == FALSE) {
3360 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3361 break;
3362 }
3363 } else {
3364 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3365 if (s->contains(c) == FALSE) {
3366 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3367 break;
3368 }
3369 }
3370 // the character wasn't in the set.
729e4ab9
A
3371 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3372 }
3373 break;
57a6839d 3374
729e4ab9
A
3375
3376 case URX_SETREF:
3377 if (fp->fInputIdx >= fActiveLimit) {
3378 fHitEnd = TRUE;
3379 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3380 break;
3381 } else {
3382 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 3383
729e4ab9
A
3384 // There is input left. Pick up one char and test it for set membership.
3385 UChar32 c = UTEXT_NEXT32(fInputText);
3d1f044b 3386 U_ASSERT(opValue > 0 && opValue < fSets->size());
729e4ab9
A
3387 if (c<256) {
3388 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3389 if (s8->contains(c)) {
3390 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3391 break;
3392 }
3393 } else {
3d1f044b 3394 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
729e4ab9
A
3395 if (s->contains(c)) {
3396 // The character is in the set. A Match.
3397 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3398 break;
3399 }
3400 }
57a6839d 3401
729e4ab9 3402 // the character wasn't in the set.
729e4ab9
A
3403 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3404 }
3405 break;
3406
3407
3408 case URX_DOTANY:
3409 {
3410 // . matches anything, but stops at end-of-line.
3411 if (fp->fInputIdx >= fActiveLimit) {
3412 // At end of input. Match failed. Backtrack out.
3413 fHitEnd = TRUE;
3414 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3415 break;
3416 }
57a6839d 3417
729e4ab9 3418 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 3419
729e4ab9
A
3420 // There is input left. Advance over one char, unless we've hit end-of-line
3421 UChar32 c = UTEXT_NEXT32(fInputText);
b331163b 3422 if (isLineTerminator(c)) {
729e4ab9
A
3423 // End of line in normal mode. . does not match.
3424 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3425 break;
3426 }
3427 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3428 }
3429 break;
3430
3431
3432 case URX_DOTANY_ALL:
3433 {
3434 // ., in dot-matches-all (including new lines) mode
3435 if (fp->fInputIdx >= fActiveLimit) {
3436 // At end of input. Match failed. Backtrack out.
3437 fHitEnd = TRUE;
3438 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3439 break;
3440 }
57a6839d 3441
729e4ab9 3442 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 3443
729e4ab9
A
3444 // There is input left. Advance over one char, except if we are
3445 // at a cr/lf, advance over both of them.
57a6839d 3446 UChar32 c;
729e4ab9
A
3447 c = UTEXT_NEXT32(fInputText);
3448 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3449 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3450 // In the case of a CR/LF, we need to advance over both.
3451 UChar32 nextc = UTEXT_CURRENT32(fInputText);
3452 if (nextc == 0x0a) {
4388f060 3453 (void)UTEXT_NEXT32(fInputText);
729e4ab9
A
3454 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3455 }
3456 }
3457 }
3458 break;
3459
3460
3461 case URX_DOTANY_UNIX:
3462 {
3463 // '.' operator, matches all, but stops at end-of-line.
3464 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
3465 if (fp->fInputIdx >= fActiveLimit) {
3466 // At end of input. Match failed. Backtrack out.
3467 fHitEnd = TRUE;
3468 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3469 break;
3470 }
3471
3472 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 3473
729e4ab9
A
3474 // There is input left. Advance over one char, unless we've hit end-of-line
3475 UChar32 c = UTEXT_NEXT32(fInputText);
3476 if (c == 0x0a) {
3477 // End of line in normal mode. '.' does not match the \n
3478 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3479 } else {
3480 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3481 }
3482 }
3483 break;
3484
3485
3486 case URX_JMP:
3487 fp->fPatIdx = opValue;
3488 break;
3489
3490 case URX_FAIL:
3491 isMatch = FALSE;
3492 goto breakFromLoop;
3493
3494 case URX_JMP_SAV:
3495 U_ASSERT(opValue < fPattern->fCompiledPat->size());
3496 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3497 fp->fPatIdx = opValue; // Then JMP.
3498 break;
3499
3500 case URX_JMP_SAV_X:
3501 // This opcode is used with (x)+, when x can match a zero length string.
3502 // Same as JMP_SAV, except conditional on the match having made forward progress.
3503 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3504 // data address of the input position at the start of the loop.
3505 {
3506 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3507 int32_t stoOp = (int32_t)pat[opValue-1];
3508 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3509 int32_t frameLoc = URX_VAL(stoOp);
3510 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3511 int64_t prevInputIdx = fp->fExtra[frameLoc];
3512 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3513 if (prevInputIdx < fp->fInputIdx) {
3514 // The match did make progress. Repeat the loop.
3515 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3516 fp->fPatIdx = opValue;
3517 fp->fExtra[frameLoc] = fp->fInputIdx;
57a6839d 3518 }
729e4ab9
A
3519 // If the input position did not advance, we do nothing here,
3520 // execution will fall out of the loop.
3521 }
3522 break;
3523
3524 case URX_CTR_INIT:
3525 {
3526 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
57a6839d 3527 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
729e4ab9
A
3528
3529 // Pick up the three extra operands that CTR_INIT has, and
57a6839d 3530 // skip the pattern location counter past
729e4ab9
A
3531 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3532 fp->fPatIdx += 3;
3533 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3534 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3535 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3536 U_ASSERT(minCount>=0);
3537 U_ASSERT(maxCount>=minCount || maxCount==-1);
57a6839d 3538 U_ASSERT(loopLoc>=fp->fPatIdx);
729e4ab9
A
3539
3540 if (minCount == 0) {
3541 fp = StateSave(fp, loopLoc+1, status);
3542 }
57a6839d
A
3543 if (maxCount == -1) {
3544 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
3545 } else if (maxCount == 0) {
729e4ab9
A
3546 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3547 }
3548 }
3549 break;
3550
3551 case URX_CTR_LOOP:
3552 {
3553 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3554 int32_t initOp = (int32_t)pat[opValue];
3555 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3556 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3557 int32_t minCount = (int32_t)pat[opValue+2];
3558 int32_t maxCount = (int32_t)pat[opValue+3];
729e4ab9 3559 (*pCounter)++;
57a6839d
A
3560 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3561 U_ASSERT(*pCounter == maxCount);
729e4ab9
A
3562 break;
3563 }
3564 if (*pCounter >= minCount) {
57a6839d
A
3565 if (maxCount == -1) {
3566 // Loop has no hard upper bound.
3567 // Check that it is progressing through the input, break if it is not.
3568 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3569 if (fp->fInputIdx == *pLastInputIdx) {
3570 break;
3571 } else {
3572 *pLastInputIdx = fp->fInputIdx;
3573 }
3574 }
729e4ab9 3575 fp = StateSave(fp, fp->fPatIdx, status);
f3c0d7a5
A
3576 } else {
3577 // Increment time-out counter. (StateSave() does it if count >= minCount)
3578 fTickCounter--;
3579 if (fTickCounter <= 0) {
3580 IncrementTime(status); // Re-initializes fTickCounter
3581 }
729e4ab9 3582 }
f3c0d7a5 3583
729e4ab9
A
3584 fp->fPatIdx = opValue + 4; // Loop back.
3585 }
3586 break;
3587
3588 case URX_CTR_INIT_NG:
3589 {
3590 // Initialize a non-greedy loop
3591 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
57a6839d 3592 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
729e4ab9 3593
57a6839d
A
3594 // Pick up the three extra operands that CTR_INIT_NG has, and
3595 // skip the pattern location counter past
729e4ab9
A
3596 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3597 fp->fPatIdx += 3;
3598 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3599 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3600 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3601 U_ASSERT(minCount>=0);
3602 U_ASSERT(maxCount>=minCount || maxCount==-1);
3603 U_ASSERT(loopLoc>fp->fPatIdx);
57a6839d
A
3604 if (maxCount == -1) {
3605 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
3606 }
729e4ab9
A
3607
3608 if (minCount == 0) {
3609 if (maxCount != 0) {
3610 fp = StateSave(fp, fp->fPatIdx, status);
3611 }
3612 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
57a6839d 3613 }
729e4ab9
A
3614 }
3615 break;
3616
3617 case URX_CTR_LOOP_NG:
3618 {
3619 // Non-greedy {min, max} loops
3620 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3621 int32_t initOp = (int32_t)pat[opValue];
3622 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3623 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3624 int32_t minCount = (int32_t)pat[opValue+2];
3625 int32_t maxCount = (int32_t)pat[opValue+3];
729e4ab9 3626
57a6839d
A
3627 (*pCounter)++;
3628 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
729e4ab9
A
3629 // The loop has matched the maximum permitted number of times.
3630 // Break out of here with no action. Matching will
3631 // continue with the following pattern.
57a6839d 3632 U_ASSERT(*pCounter == maxCount);
729e4ab9
A
3633 break;
3634 }
3635
3636 if (*pCounter < minCount) {
3637 // We haven't met the minimum number of matches yet.
3638 // Loop back for another one.
3639 fp->fPatIdx = opValue + 4; // Loop back.
f3c0d7a5
A
3640 // Increment time-out counter. (StateSave() does it if count >= minCount)
3641 fTickCounter--;
3642 if (fTickCounter <= 0) {
3643 IncrementTime(status); // Re-initializes fTickCounter
3644 }
729e4ab9
A
3645 } else {
3646 // We do have the minimum number of matches.
57a6839d
A
3647
3648 // If there is no upper bound on the loop iterations, check that the input index
3649 // is progressing, and stop the loop if it is not.
3650 if (maxCount == -1) {
3651 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3652 if (fp->fInputIdx == *pLastInputIdx) {
3653 break;
3654 }
3655 *pLastInputIdx = fp->fInputIdx;
3656 }
3657
3658 // Loop Continuation: we will fall into the pattern following the loop
3659 // (non-greedy, don't execute loop body first), but first do
3660 // a state save to the top of the loop, so that a match failure
729e4ab9
A
3661 // in the following pattern will try another iteration of the loop.
3662 fp = StateSave(fp, opValue + 4, status);
3663 }
3664 }
3665 break;
3666
3667 case URX_STO_SP:
3668 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3669 fData[opValue] = fStack->size();
3670 break;
3671
3672 case URX_LD_SP:
3673 {
3674 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3675 int32_t newStackSize = (int32_t)fData[opValue];
3676 U_ASSERT(newStackSize <= fStack->size());
3677 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3678 if (newFP == (int64_t *)fp) {
3679 break;
3680 }
3d1f044b
A
3681 int32_t j;
3682 for (j=0; j<fFrameSize; j++) {
3683 newFP[j] = ((int64_t *)fp)[j];
729e4ab9
A
3684 }
3685 fp = (REStackFrame *)newFP;
3686 fStack->setSize(newStackSize);
3687 }
3688 break;
3689
3690 case URX_BACKREF:
729e4ab9
A
3691 {
3692 U_ASSERT(opValue < fFrameSize);
3693 int64_t groupStartIdx = fp->fExtra[opValue];
3694 int64_t groupEndIdx = fp->fExtra[opValue+1];
3695 U_ASSERT(groupStartIdx <= groupEndIdx);
3696 if (groupStartIdx < 0) {
3697 // This capture group has not participated in the match thus far,
3698 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
729e4ab9
A
3699 break;
3700 }
729e4ab9
A
3701 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3702 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4388f060
A
3703
3704 // Note: if the capture group match was of an empty string the backref
57a6839d 3705 // match succeeds. Verified by testing: Perl matches succeed
4388f060 3706 // in this case, so we do too.
57a6839d 3707
4388f060
A
3708 UBool success = TRUE;
3709 for (;;) {
3710 if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3711 success = TRUE;
3712 break;
3713 }
3714 if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
3715 success = FALSE;
729e4ab9 3716 fHitEnd = TRUE;
4388f060
A
3717 break;
3718 }
3719 UChar32 captureGroupChar = utext_next32(fAltInputText);
3720 UChar32 inputChar = utext_next32(fInputText);
3721 if (inputChar != captureGroupChar) {
3722 success = FALSE;
3723 break;
729e4ab9 3724 }
4388f060
A
3725 }
3726
3727 if (success) {
3728 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3729 } else {
3730 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3731 }
3732 }
3733 break;
3734
3735
3736
3737 case URX_BACKREF_I:
3738 {
3739 U_ASSERT(opValue < fFrameSize);
3740 int64_t groupStartIdx = fp->fExtra[opValue];
3741 int64_t groupEndIdx = fp->fExtra[opValue+1];
3742 U_ASSERT(groupStartIdx <= groupEndIdx);
3743 if (groupStartIdx < 0) {
3744 // This capture group has not participated in the match thus far,
729e4ab9 3745 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
4388f060
A
3746 break;
3747 }
3748 utext_setNativeIndex(fAltInputText, groupStartIdx);
3749 utext_setNativeIndex(fInputText, fp->fInputIdx);
3750 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3751 CaseFoldingUTextIterator inputItr(*fInputText);
3752
3753 // Note: if the capture group match was of an empty string the backref
57a6839d 3754 // match succeeds. Verified by testing: Perl matches succeed
4388f060 3755 // in this case, so we do too.
57a6839d 3756
4388f060
A
3757 UBool success = TRUE;
3758 for (;;) {
3759 if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3760 success = TRUE;
3761 break;
3762 }
3763 if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
3764 success = FALSE;
3765 fHitEnd = TRUE;
3766 break;
3767 }
3768 UChar32 captureGroupChar = captureGroupItr.next();
3769 UChar32 inputChar = inputItr.next();
3770 if (inputChar != captureGroupChar) {
3771 success = FALSE;
3772 break;
3773 }
3774 }
3775
3776 if (success && inputItr.inExpansion()) {
57a6839d
A
3777 // We obtained a match by consuming part of a string obtained from
3778 // case-folding a single code point of the input text.
4388f060
A
3779 // This does not count as an overall match.
3780 success = FALSE;
3781 }
3782
3783 if (success) {
3784 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3785 } else {
3786 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
729e4ab9 3787 }
57a6839d 3788
729e4ab9
A
3789 }
3790 break;
57a6839d 3791
729e4ab9
A
3792 case URX_STO_INP_LOC:
3793 {
3794 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3795 fp->fExtra[opValue] = fp->fInputIdx;
3796 }
3797 break;
3798
3799 case URX_JMPX:
3800 {
3801 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3802 fp->fPatIdx += 1;
3803 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
3804 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3805 int64_t savedInputIdx = fp->fExtra[dataLoc];
3806 U_ASSERT(savedInputIdx <= fp->fInputIdx);
3807 if (savedInputIdx < fp->fInputIdx) {
3808 fp->fPatIdx = opValue; // JMP
3809 } else {
3810 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
3811 }
3812 }
3813 break;
3814
3815 case URX_LA_START:
3816 {
340931cb 3817 // Entering a look around block.
729e4ab9 3818 // Save Stack Ptr, Input Pos.
340931cb 3819 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
729e4ab9
A
3820 fData[opValue] = fStack->size();
3821 fData[opValue+1] = fp->fInputIdx;
340931cb
A
3822 fData[opValue+2] = fActiveStart;
3823 fData[opValue+3] = fActiveLimit;
729e4ab9
A
3824 fActiveStart = fLookStart; // Set the match region change for
3825 fActiveLimit = fLookLimit; // transparent bounds.
3826 }
3827 break;
3828
3829 case URX_LA_END:
3830 {
3831 // Leaving a look-ahead block.
3832 // restore Stack Ptr, Input Pos to positions they had on entry to block.
340931cb 3833 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
729e4ab9
A
3834 int32_t stackSize = fStack->size();
3835 int32_t newStackSize =(int32_t)fData[opValue];
3836 U_ASSERT(stackSize >= newStackSize);
3837 if (stackSize > newStackSize) {
3838 // Copy the current top frame back to the new (cut back) top frame.
3839 // This makes the capture groups from within the look-ahead
3840 // expression available.
3841 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3d1f044b
A
3842 int32_t j;
3843 for (j=0; j<fFrameSize; j++) {
3844 newFP[j] = ((int64_t *)fp)[j];
729e4ab9
A
3845 }
3846 fp = (REStackFrame *)newFP;
3847 fStack->setSize(newStackSize);
3848 }
3849 fp->fInputIdx = fData[opValue+1];
3850
3851 // Restore the active region bounds in the input string; they may have
3852 // been changed because of transparent bounds on a Region.
340931cb
A
3853 fActiveStart = fData[opValue+2];
3854 fActiveLimit = fData[opValue+3];
3855 U_ASSERT(fActiveStart >= 0);
3856 U_ASSERT(fActiveLimit <= fInputLength);
729e4ab9
A
3857 }
3858 break;
3859
3860 case URX_ONECHAR_I:
4388f060
A
3861 // Case insensitive one char. The char from the pattern is already case folded.
3862 // Input text is not, but case folding the input can not reduce two or more code
3863 // points to one.
729e4ab9
A
3864 if (fp->fInputIdx < fActiveLimit) {
3865 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3866
3867 UChar32 c = UTEXT_NEXT32(fInputText);
3868 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3869 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3870 break;
3871 }
3872 } else {
3873 fHitEnd = TRUE;
3874 }
57a6839d 3875
729e4ab9
A
3876 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3877 break;
3878
3879 case URX_STRING_I:
3880 {
4388f060 3881 // Case-insensitive test input against a literal string.
729e4ab9
A
3882 // Strings require two slots in the compiled pattern, one for the
3883 // offset to the string text, and one for the length.
4388f060 3884 // The compiled string has already been case folded.
729e4ab9 3885 {
4388f060
A
3886 const UChar *patternString = litText + opValue;
3887 int32_t patternStringIdx = 0;
729e4ab9
A
3888
3889 op = (int32_t)pat[fp->fPatIdx];
3890 fp->fPatIdx++;
3891 opType = URX_TYPE(op);
3892 opValue = URX_VAL(op);
3893 U_ASSERT(opType == URX_STRING_LEN);
4388f060 3894 int32_t patternStringLen = opValue; // Length of the string from the pattern.
57a6839d
A
3895
3896
4388f060
A
3897 UChar32 cPattern;
3898 UChar32 cText;
3899 UBool success = TRUE;
3900
729e4ab9 3901 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4388f060
A
3902 CaseFoldingUTextIterator inputIterator(*fInputText);
3903 while (patternStringIdx < patternStringLen) {
3904 if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
3905 success = FALSE;
3906 fHitEnd = TRUE;
3907 break;
729e4ab9 3908 }
4388f060
A
3909 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
3910 cText = inputIterator.next();
3911 if (cText != cPattern) {
3912 success = FALSE;
3913 break;
729e4ab9
A
3914 }
3915 }
4388f060
A
3916 if (inputIterator.inExpansion()) {
3917 success = FALSE;
3918 }
3919
3920 if (success) {
3921 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3922 } else {
729e4ab9
A
3923 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3924 }
3925 }
3926 }
3927 break;
3928
3929 case URX_LB_START:
3930 {
3931 // Entering a look-behind block.
340931cb 3932 // Save Stack Ptr, Input Pos and active input region.
729e4ab9 3933 // TODO: implement transparent bounds. Ticket #6067
340931cb 3934 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
729e4ab9
A
3935 fData[opValue] = fStack->size();
3936 fData[opValue+1] = fp->fInputIdx;
729e4ab9
A
3937 // Save input string length, then reset to pin any matches to end at
3938 // the current position.
340931cb 3939 fData[opValue+2] = fActiveStart;
729e4ab9 3940 fData[opValue+3] = fActiveLimit;
340931cb 3941 fActiveStart = fRegionStart;
729e4ab9 3942 fActiveLimit = fp->fInputIdx;
340931cb
A
3943 // Init the variable containing the start index for attempted matches.
3944 fData[opValue+4] = -1;
729e4ab9
A
3945 }
3946 break;
3947
3948
3949 case URX_LB_CONT:
3950 {
3951 // Positive Look-Behind, at top of loop checking for matches of LB expression
3952 // at all possible input starting positions.
3953
3954 // Fetch the min and max possible match lengths. They are the operands
3955 // of this op in the pattern.
3956 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3957 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
2ca993e8
A
3958 if (!UTEXT_USES_U16(fInputText)) {
3959 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3960 // The max length need not be exact; it just needs to be >= actual maximum.
3961 maxML *= 3;
3962 }
729e4ab9
A
3963 U_ASSERT(minML <= maxML);
3964 U_ASSERT(minML >= 0);
3965
3966 // Fetch (from data) the last input index where a match was attempted.
340931cb
A
3967 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3968 int64_t &lbStartIdx = fData[opValue+4];
2ca993e8 3969 if (lbStartIdx < 0) {
729e4ab9 3970 // First time through loop.
2ca993e8
A
3971 lbStartIdx = fp->fInputIdx - minML;
3972 if (lbStartIdx > 0) {
3973 // move index to a code point boundary, if it's not on one already.
3974 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3975 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3976 }
729e4ab9
A
3977 } else {
3978 // 2nd through nth time through the loop.
3979 // Back up start position for match by one.
2ca993e8
A
3980 if (lbStartIdx == 0) {
3981 (lbStartIdx)--;
729e4ab9 3982 } else {
2ca993e8 3983 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4388f060 3984 (void)UTEXT_PREVIOUS32(fInputText);
2ca993e8 3985 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
729e4ab9
A
3986 }
3987 }
3988
2ca993e8 3989 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
729e4ab9
A
3990 // We have tried all potential match starting points without
3991 // getting a match. Backtrack out, and out of the
3992 // Look Behind altogether.
3993 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
340931cb
A
3994 fActiveStart = fData[opValue+2];
3995 fActiveLimit = fData[opValue+3];
3996 U_ASSERT(fActiveStart >= 0);
3997 U_ASSERT(fActiveLimit <= fInputLength);
729e4ab9
A
3998 break;
3999 }
4000
4001 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4002 // (successful match will fall off the end of the loop.)
4003 fp = StateSave(fp, fp->fPatIdx-3, status);
2ca993e8 4004 fp->fInputIdx = lbStartIdx;
729e4ab9
A
4005 }
4006 break;
4007
4008 case URX_LB_END:
4009 // End of a look-behind block, after a successful match.
4010 {
340931cb 4011 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
729e4ab9
A
4012 if (fp->fInputIdx != fActiveLimit) {
4013 // The look-behind expression matched, but the match did not
4014 // extend all the way to the point that we are looking behind from.
4015 // FAIL out of here, which will take us back to the LB_CONT, which
4016 // will retry the match starting at another position or fail
4017 // the look-behind altogether, whichever is appropriate.
4018 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4019 break;
4020 }
4021
340931cb 4022 // Look-behind match is good. Restore the original input string region,
57a6839d 4023 // which had been truncated to pin the end of the lookbehind match to the
729e4ab9 4024 // position being looked-behind.
340931cb
A
4025 fActiveStart = fData[opValue+2];
4026 fActiveLimit = fData[opValue+3];
4027 U_ASSERT(fActiveStart >= 0);
4028 U_ASSERT(fActiveLimit <= fInputLength);
729e4ab9
A
4029 }
4030 break;
4031
4032
4033 case URX_LBN_CONT:
4034 {
4035 // Negative Look-Behind, at top of loop checking for matches of LB expression
4036 // at all possible input starting positions.
4037
4038 // Fetch the extra parameters of this op.
4039 int32_t minML = (int32_t)pat[fp->fPatIdx++];
4040 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
2ca993e8
A
4041 if (!UTEXT_USES_U16(fInputText)) {
4042 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
4043 // The max length need not be exact; it just needs to be >= actual maximum.
4044 maxML *= 3;
4045 }
729e4ab9
A
4046 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
4047 continueLoc = URX_VAL(continueLoc);
4048 U_ASSERT(minML <= maxML);
4049 U_ASSERT(minML >= 0);
4050 U_ASSERT(continueLoc > fp->fPatIdx);
4051
4052 // Fetch (from data) the last input index where a match was attempted.
340931cb
A
4053 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
4054 int64_t &lbStartIdx = fData[opValue+4];
2ca993e8 4055 if (lbStartIdx < 0) {
729e4ab9 4056 // First time through loop.
2ca993e8
A
4057 lbStartIdx = fp->fInputIdx - minML;
4058 if (lbStartIdx > 0) {
4059 // move index to a code point boundary, if it's not on one already.
4060 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4061 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4062 }
729e4ab9
A
4063 } else {
4064 // 2nd through nth time through the loop.
4065 // Back up start position for match by one.
2ca993e8
A
4066 if (lbStartIdx == 0) {
4067 (lbStartIdx)--;
729e4ab9 4068 } else {
2ca993e8 4069 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4388f060 4070 (void)UTEXT_PREVIOUS32(fInputText);
2ca993e8 4071 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
729e4ab9
A
4072 }
4073 }
4074
2ca993e8 4075 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
729e4ab9
A
4076 // We have tried all potential match starting points without
4077 // getting a match, which means that the negative lookbehind as
4078 // a whole has succeeded. Jump forward to the continue location
340931cb
A
4079 fActiveStart = fData[opValue+2];
4080 fActiveLimit = fData[opValue+3];
4081 U_ASSERT(fActiveStart >= 0);
4082 U_ASSERT(fActiveLimit <= fInputLength);
729e4ab9
A
4083 fp->fPatIdx = continueLoc;
4084 break;
4085 }
4086
4087 // Save state to this URX_LBN_CONT op, so failure to match will repeat the loop.
4088 // (successful match will cause a FAIL out of the loop altogether.)
4089 fp = StateSave(fp, fp->fPatIdx-4, status);
2ca993e8 4090 fp->fInputIdx = lbStartIdx;
729e4ab9
A
4091 }
4092 break;
4093
4094 case URX_LBN_END:
4095 // End of a negative look-behind block, after a successful match.
4096 {
340931cb 4097 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
729e4ab9
A
4098 if (fp->fInputIdx != fActiveLimit) {
4099 // The look-behind expression matched, but the match did not
4100 // extend all the way to the point that we are looking behind from.
4101 // FAIL out of here, which will take us back to the LB_CONT, which
4102 // will retry the match starting at another position or succeed
4103 // the look-behind altogether, whichever is appropriate.
4104 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4105 break;
4106 }
4107
4108 // Look-behind expression matched, which means look-behind test as
4109 // a whole Fails
57a6839d
A
4110
4111 // Restore the original input string length, which had been truncated
4112 // in order to pin the end of the lookbehind match
729e4ab9 4113 // to the position being looked-behind.
340931cb
A
4114 fActiveStart = fData[opValue+2];
4115 fActiveLimit = fData[opValue+3];
4116 U_ASSERT(fActiveStart >= 0);
4117 U_ASSERT(fActiveLimit <= fInputLength);
729e4ab9
A
4118
4119 // Restore original stack position, discarding any state saved
4120 // by the successful pattern match.
4121 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4122 int32_t newStackSize = (int32_t)fData[opValue];
4123 U_ASSERT(fStack->size() > newStackSize);
4124 fStack->setSize(newStackSize);
57a6839d
A
4125
4126 // FAIL, which will take control back to someplace
729e4ab9
A
4127 // prior to entering the look-behind test.
4128 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4129 }
4130 break;
4131
4132
4133 case URX_LOOP_SR_I:
4134 // Loop Initialization for the optimized implementation of
4135 // [some character set]*
4136 // This op scans through all matching input.
4137 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4138 {
3d1f044b 4139 U_ASSERT(opValue > 0 && opValue < fSets->size());
729e4ab9 4140 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3d1f044b 4141 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
729e4ab9
A
4142
4143 // Loop through input, until either the input is exhausted or
4144 // we reach a character that is not a member of the set.
4145 int64_t ix = fp->fInputIdx;
4146 UTEXT_SETNATIVEINDEX(fInputText, ix);
4147 for (;;) {
4148 if (ix >= fActiveLimit) {
4149 fHitEnd = TRUE;
4150 break;
4151 }
4152 UChar32 c = UTEXT_NEXT32(fInputText);
4153 if (c<256) {
4154 if (s8->contains(c) == FALSE) {
4155 break;
4156 }
4157 } else {
4158 if (s->contains(c) == FALSE) {
4159 break;
4160 }
4161 }
4162 ix = UTEXT_GETNATIVEINDEX(fInputText);
4163 }
4164
4165 // If there were no matching characters, skip over the loop altogether.
4166 // The loop doesn't run at all, a * op always succeeds.
4167 if (ix == fp->fInputIdx) {
4168 fp->fPatIdx++; // skip the URX_LOOP_C op.
4169 break;
4170 }
4171
4172 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4173 // must follow. Its operand is the stack location
4174 // that holds the starting input index for the match of this [set]*
4175 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4176 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4177 int32_t stackLoc = URX_VAL(loopcOp);
4178 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4179 fp->fExtra[stackLoc] = fp->fInputIdx;
729e4ab9
A
4180 fp->fInputIdx = ix;
4181
4182 // Save State to the URX_LOOP_C op that follows this one,
4183 // so that match failures in the following code will return to there.
4184 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4185 fp = StateSave(fp, fp->fPatIdx, status);
4186 fp->fPatIdx++;
4187 }
4188 break;
4189
4190
4191 case URX_LOOP_DOT_I:
4192 // Loop Initialization for the optimized implementation of .*
4193 // This op scans through all remaining input.
4194 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4195 {
4196 // Loop through input until the input is exhausted (we reach an end-of-line)
4197 // In DOTALL mode, we can just go straight to the end of the input.
4198 int64_t ix;
4199 if ((opValue & 1) == 1) {
4200 // Dot-matches-All mode. Jump straight to the end of the string.
4201 ix = fActiveLimit;
4202 fHitEnd = TRUE;
4203 } else {
4204 // NOT DOT ALL mode. Line endings do not match '.'
4205 // Scan forward until a line ending or end of input.
4206 ix = fp->fInputIdx;
4207 UTEXT_SETNATIVEINDEX(fInputText, ix);
4208 for (;;) {
4209 if (ix >= fActiveLimit) {
4210 fHitEnd = TRUE;
4211 break;
4212 }
4213 UChar32 c = UTEXT_NEXT32(fInputText);
4214 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
4215 if ((c == 0x0a) || // 0x0a is newline in both modes.
4216 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
b331163b 4217 isLineTerminator(c))) {
729e4ab9
A
4218 // char is a line ending. Exit the scanning loop.
4219 break;
4220 }
4221 }
4222 ix = UTEXT_GETNATIVEINDEX(fInputText);
4223 }
4224 }
4225
4226 // If there were no matching characters, skip over the loop altogether.
4227 // The loop doesn't run at all, a * op always succeeds.
4228 if (ix == fp->fInputIdx) {
4229 fp->fPatIdx++; // skip the URX_LOOP_C op.
4230 break;
4231 }
4232
4233 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4234 // must follow. Its operand is the stack location
4235 // that holds the starting input index for the match of this .*
4236 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4237 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4238 int32_t stackLoc = URX_VAL(loopcOp);
4239 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4240 fp->fExtra[stackLoc] = fp->fInputIdx;
729e4ab9
A
4241 fp->fInputIdx = ix;
4242
4243 // Save State to the URX_LOOP_C op that follows this one,
4244 // so that match failures in the following code will return to there.
4245 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4246 fp = StateSave(fp, fp->fPatIdx, status);
4247 fp->fPatIdx++;
4248 }
4249 break;
4250
4251
4252 case URX_LOOP_C:
4253 {
4254 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4255 backSearchIndex = fp->fExtra[opValue];
4256 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4257 if (backSearchIndex == fp->fInputIdx) {
4258 // We've backed up the input idx to the point that the loop started.
57a6839d 4259 // The loop is done. Leave here without saving state.
729e4ab9
A
4260 // Subsequent failures won't come back here.
4261 break;
4262 }
4263 // Set up for the next iteration of the loop, with input index
4264 // backed up by one from the last time through,
4265 // and a state save to this instruction in case the following code fails again.
4266 // (We're going backwards because this loop emulates stack unwinding, not
4267 // the initial scan forward.)
4268 U_ASSERT(fp->fInputIdx > 0);
4269 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4270 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4271 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
57a6839d 4272
729e4ab9 4273 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
57a6839d 4274 if (prevC == 0x0a &&
729e4ab9
A
4275 fp->fInputIdx > backSearchIndex &&
4276 twoPrevC == 0x0d) {
4277 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4278 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4279 // .*, stepping back over CRLF pair.
4280 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4281 }
4282 }
4283
374ca955 4284
729e4ab9
A
4285 fp = StateSave(fp, fp->fPatIdx-1, status);
4286 }
4287 break;
374ca955
A
4288
4289
729e4ab9
A
4290
4291 default:
4292 // Trouble. The compiled pattern contains an entry with an
4293 // unrecognized type tag.
3d1f044b 4294 UPRV_UNREACHABLE;
b75a7d8f 4295 }
729e4ab9
A
4296
4297 if (U_FAILURE(status)) {
4298 isMatch = FALSE;
b75a7d8f
A
4299 break;
4300 }
4301 }
57a6839d 4302
729e4ab9
A
4303breakFromLoop:
4304 fMatch = isMatch;
4305 if (isMatch) {
4306 fLastMatchEnd = fMatchEnd;
4307 fMatchStart = startIdx;
4308 fMatchEnd = fp->fInputIdx;
46f4442e 4309 }
57a6839d
A
4310
4311#ifdef REGEX_RUN_DEBUG
4312 if (fTraceDebug) {
4313 if (isMatch) {
4314 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
4315 } else {
4316 printf("No match\n\n");
46f4442e
A
4317 }
4318 }
57a6839d 4319#endif
46f4442e 4320
729e4ab9
A
4321 fFrame = fp; // The active stack frame when the engine stopped.
4322 // Contains the capture group results that we need to
4323 // access later.
4324 return;
b75a7d8f 4325}
46f4442e
A
4326
4327
b75a7d8f
A
4328//--------------------------------------------------------------------------------
4329//
729e4ab9
A
4330// MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
4331// assumption that the entire string is available in the UText's
4332// chunk buffer. For now, that means we can use int32_t indexes,
4333// except for anything that needs to be saved (like group starts
4334// and ends).
b75a7d8f 4335//
46f4442e
A
4336// startIdx: begin matching at this index.
4337// toEnd: if true, match must extend to end of the input region
4338//
b75a7d8f 4339//--------------------------------------------------------------------------------
729e4ab9 4340void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
b75a7d8f 4341 UBool isMatch = FALSE; // True if the we have a match.
57a6839d 4342
729e4ab9 4343 int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
b75a7d8f
A
4344
4345 int32_t op; // Operation from the compiled pattern, split into
4346 int32_t opType; // the opcode
4347 int32_t opValue; // and the operand value.
57a6839d 4348
729e4ab9 4349#ifdef REGEX_RUN_DEBUG
57a6839d
A
4350 if (fTraceDebug) {
4351 printf("MatchAt(startIdx=%d)\n", startIdx);
2ca993e8
A
4352 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
4353 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
b75a7d8f 4354 }
729e4ab9 4355#endif
57a6839d 4356
b75a7d8f
A
4357 if (U_FAILURE(status)) {
4358 return;
4359 }
57a6839d 4360
b75a7d8f 4361 // Cache frequently referenced items from the compiled pattern
b75a7d8f 4362 //
729e4ab9 4363 int64_t *pat = fPattern->fCompiledPat->getBuffer();
57a6839d 4364
b75a7d8f 4365 const UChar *litText = fPattern->fLiteralText.getBuffer();
3d1f044b 4366 UVector *fSets = fPattern->fSets;
57a6839d 4367
729e4ab9 4368 const UChar *inputBuf = fInputText->chunkContents;
57a6839d 4369
46f4442e 4370 fFrameSize = fPattern->fFrameSize;
b75a7d8f 4371 REStackFrame *fp = resetStack();
2ca993e8
A
4372 if (U_FAILURE(fDeferredStatus)) {
4373 status = fDeferredStatus;
4374 return;
4375 }
57a6839d 4376
b75a7d8f
A
4377 fp->fPatIdx = 0;
4378 fp->fInputIdx = startIdx;
57a6839d 4379
b75a7d8f
A
4380 // Zero out the pattern's static data
4381 int32_t i;
4382 for (i = 0; i<fPattern->fDataSize; i++) {
4383 fData[i] = 0;
4384 }
57a6839d 4385
b75a7d8f
A
4386 //
4387 // Main loop for interpreting the compiled pattern.
4388 // One iteration of the loop per pattern operation performed.
4389 //
4390 for (;;) {
729e4ab9 4391 op = (int32_t)pat[fp->fPatIdx];
b75a7d8f
A
4392 opType = URX_TYPE(op);
4393 opValue = URX_VAL(op);
729e4ab9 4394#ifdef REGEX_RUN_DEBUG
b75a7d8f 4395 if (fTraceDebug) {
729e4ab9 4396 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 4397 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
729e4ab9 4398 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
b75a7d8f
A
4399 fPattern->dumpOp(fp->fPatIdx);
4400 }
729e4ab9 4401#endif
b75a7d8f 4402 fp->fPatIdx++;
57a6839d 4403
b75a7d8f 4404 switch (opType) {
57a6839d
A
4405
4406
b75a7d8f
A
4407 case URX_NOP:
4408 break;
57a6839d
A
4409
4410
b75a7d8f
A
4411 case URX_BACKTRACK:
4412 // Force a backtrack. In some circumstances, the pattern compiler
4413 // will notice that the pattern can't possibly match anything, and will
4414 // emit one of these at that point.
46f4442e 4415 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f 4416 break;
57a6839d
A
4417
4418
b75a7d8f 4419 case URX_ONECHAR:
46f4442e 4420 if (fp->fInputIdx < fActiveLimit) {
729e4ab9 4421 UChar32 c;
46f4442e
A
4422 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4423 if (c == opValue) {
b75a7d8f
A
4424 break;
4425 }
46f4442e
A
4426 } else {
4427 fHitEnd = TRUE;
b75a7d8f 4428 }
729e4ab9
A
4429 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4430 break;
57a6839d
A
4431
4432
b75a7d8f
A
4433 case URX_STRING:
4434 {
4435 // Test input against a literal string.
4436 // Strings require two slots in the compiled pattern, one for the
4437 // offset to the string text, and one for the length.
4438 int32_t stringStartIdx = opValue;
4439 int32_t stringLen;
57a6839d 4440
729e4ab9 4441 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
b75a7d8f
A
4442 fp->fPatIdx++;
4443 opType = URX_TYPE(op);
4444 stringLen = URX_VAL(op);
4445 U_ASSERT(opType == URX_STRING_LEN);
4446 U_ASSERT(stringLen >= 2);
57a6839d 4447
b75a7d8f 4448 const UChar * pInp = inputBuf + fp->fInputIdx;
4388f060 4449 const UChar * pInpLimit = inputBuf + fActiveLimit;
b75a7d8f
A
4450 const UChar * pPat = litText+stringStartIdx;
4451 const UChar * pEnd = pInp + stringLen;
4388f060
A
4452 UBool success = TRUE;
4453 while (pInp < pEnd) {
4454 if (pInp >= pInpLimit) {
4455 fHitEnd = TRUE;
4456 success = FALSE;
4457 break;
4458 }
4459 if (*pInp++ != *pPat++) {
4460 success = FALSE;
b75a7d8f
A
4461 break;
4462 }
4463 }
57a6839d 4464
729e4ab9
A
4465 if (success) {
4466 fp->fInputIdx += stringLen;
4467 } else {
729e4ab9
A
4468 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4469 }
b75a7d8f 4470 }
729e4ab9 4471 break;
57a6839d
A
4472
4473
b75a7d8f 4474 case URX_STATE_SAVE:
46f4442e 4475 fp = StateSave(fp, opValue, status);
b75a7d8f 4476 break;
57a6839d
A
4477
4478
b75a7d8f
A
4479 case URX_END:
4480 // The match loop will exit via this path on a successful match,
4481 // when we reach the end of the pattern.
46f4442e
A
4482 if (toEnd && fp->fInputIdx != fActiveLimit) {
4483 // The pattern matched, but not to the end of input. Try some more.
4484 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4485 break;
4486 }
b75a7d8f
A
4487 isMatch = TRUE;
4488 goto breakFromLoop;
57a6839d 4489
729e4ab9 4490 // Start and End Capture stack frame variables are laid out like this:
b75a7d8f
A
4491 // fp->fExtra[opValue] - The start of a completed capture group
4492 // opValue+1 - The end of a completed capture group
4493 // opValue+2 - the start of a capture group whose end
4494 // has not yet been reached (and might not ever be).
4495 case URX_START_CAPTURE:
46f4442e 4496 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
b75a7d8f
A
4497 fp->fExtra[opValue+2] = fp->fInputIdx;
4498 break;
57a6839d
A
4499
4500
b75a7d8f 4501 case URX_END_CAPTURE:
46f4442e 4502 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
b75a7d8f
A
4503 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
4504 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
4505 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
4506 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4507 break;
57a6839d
A
4508
4509
b75a7d8f 4510 case URX_DOLLAR: // $, test for End of line
729e4ab9 4511 // or for position before new line at end of input
46f4442e 4512 if (fp->fInputIdx < fAnchorLimit-2) {
b75a7d8f 4513 // We are no where near the end of input. Fail.
46f4442e
A
4514 // This is the common case. Keep it first.
4515 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4516 break;
4517 }
46f4442e 4518 if (fp->fInputIdx >= fAnchorLimit) {
b75a7d8f 4519 // We really are at the end of input. Success.
46f4442e
A
4520 fHitEnd = TRUE;
4521 fRequireEnd = TRUE;
b75a7d8f
A
4522 break;
4523 }
57a6839d 4524
b75a7d8f
A
4525 // If we are positioned just before a new-line that is located at the
4526 // end of input, succeed.
46f4442e 4527 if (fp->fInputIdx == fAnchorLimit-1) {
729e4ab9
A
4528 UChar32 c;
4529 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
57a6839d 4530
b331163b 4531 if (isLineTerminator(c)) {
46f4442e 4532 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
374ca955 4533 // At new-line at end of input. Success
46f4442e
A
4534 fHitEnd = TRUE;
4535 fRequireEnd = TRUE;
4536 break;
374ca955 4537 }
b75a7d8f 4538 }
729e4ab9
A
4539 } else if (fp->fInputIdx == fAnchorLimit-2 &&
4540 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
46f4442e
A
4541 fHitEnd = TRUE;
4542 fRequireEnd = TRUE;
b75a7d8f 4543 break; // At CR/LF at end of input. Success
b75a7d8f 4544 }
57a6839d 4545
46f4442e 4546 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
57a6839d 4547
46f4442e 4548 break;
57a6839d
A
4549
4550
729e4ab9 4551 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
46f4442e
A
4552 if (fp->fInputIdx >= fAnchorLimit-1) {
4553 // Either at the last character of input, or off the end.
4554 if (fp->fInputIdx == fAnchorLimit-1) {
4555 // At last char of input. Success if it's a new line.
729e4ab9 4556 if (inputBuf[fp->fInputIdx] == 0x0a) {
46f4442e
A
4557 fHitEnd = TRUE;
4558 fRequireEnd = TRUE;
4559 break;
4560 }
4561 } else {
4562 // Off the end of input. Success.
4563 fHitEnd = TRUE;
4564 fRequireEnd = TRUE;
4565 break;
4566 }
4567 }
57a6839d 4568
46f4442e
A
4569 // Not at end of input. Back-track out.
4570 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f 4571 break;
57a6839d
A
4572
4573
729e4ab9
A
4574 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
4575 {
4576 if (fp->fInputIdx >= fAnchorLimit) {
4577 // We really are at the end of input. Success.
4578 fHitEnd = TRUE;
4579 fRequireEnd = TRUE;
4580 break;
4581 }
4582 // If we are positioned just before a new-line, succeed.
4583 // It makes no difference where the new-line is within the input.
4584 UChar32 c = inputBuf[fp->fInputIdx];
b331163b 4585 if (isLineTerminator(c)) {
729e4ab9
A
4586 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4587 // In multi-line mode, hitting a new-line just before the end of input does not
4588 // set the hitEnd or requireEnd flags
4589 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
46f4442e 4590 break;
729e4ab9
A
4591 }
4592 }
4593 // not at a new line. Fail.
4594 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4595 }
4596 break;
57a6839d
A
4597
4598
729e4ab9
A
4599 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
4600 {
4601 if (fp->fInputIdx >= fAnchorLimit) {
4602 // We really are at the end of input. Success.
4603 fHitEnd = TRUE;
4604 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
4605 break; // adding a new-line would not lose the match.
4606 }
4607 // If we are not positioned just before a new-line, the test fails; backtrack out.
4608 // It makes no difference where the new-line is within the input.
4609 if (inputBuf[fp->fInputIdx] != 0x0a) {
4610 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4611 }
4612 }
4613 break;
57a6839d
A
4614
4615
729e4ab9 4616 case URX_CARET: // ^, test for start of line
46f4442e
A
4617 if (fp->fInputIdx != fAnchorStart) {
4618 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4619 }
b75a7d8f 4620 break;
57a6839d
A
4621
4622
729e4ab9
A
4623 case URX_CARET_M: // ^, test for start of line in mulit-line mode
4624 {
4625 if (fp->fInputIdx == fAnchorStart) {
4626 // We are at the start input. Success.
4627 break;
4628 }
4629 // Check whether character just before the current pos is a new-line
4630 // unless we are at the end of input
57a6839d
A
4631 UChar c = inputBuf[fp->fInputIdx - 1];
4632 if ((fp->fInputIdx < fAnchorLimit) &&
b331163b 4633 isLineTerminator(c)) {
729e4ab9
A
4634 // It's a new-line. ^ is true. Success.
4635 // TODO: what should be done with positions between a CR and LF?
4636 break;
4637 }
4638 // Not at the start of a line. Fail.
4639 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4640 }
4641 break;
57a6839d
A
4642
4643
729e4ab9
A
4644 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
4645 {
4646 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4647 if (fp->fInputIdx <= fAnchorStart) {
4648 // We are at the start input. Success.
4649 break;
4650 }
4651 // Check whether character just before the current pos is a new-line
4652 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
57a6839d 4653 UChar c = inputBuf[fp->fInputIdx - 1];
729e4ab9
A
4654 if (c != 0x0a) {
4655 // Not at the start of a line. Back-track out.
4656 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4657 }
4658 }
4659 break;
57a6839d 4660
b75a7d8f
A
4661 case URX_BACKSLASH_B: // Test for word boundaries
4662 {
729e4ab9 4663 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
51004dcb 4664 success ^= (UBool)(opValue != 0); // flip sense for \B
b75a7d8f 4665 if (!success) {
46f4442e 4666 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4667 }
4668 }
4669 break;
57a6839d
A
4670
4671
374ca955
A
4672 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
4673 {
4674 UBool success = isUWordBoundary(fp->fInputIdx);
51004dcb 4675 success ^= (UBool)(opValue != 0); // flip sense for \B
374ca955 4676 if (!success) {
46f4442e 4677 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
374ca955
A
4678 }
4679 }
4680 break;
57a6839d
A
4681
4682
b75a7d8f
A
4683 case URX_BACKSLASH_D: // Test for decimal digit
4684 {
46f4442e
A
4685 if (fp->fInputIdx >= fActiveLimit) {
4686 fHitEnd = TRUE;
4687 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4688 break;
4689 }
57a6839d 4690
729e4ab9
A
4691 UChar32 c;
4692 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
46f4442e 4693 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
b75a7d8f 4694 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
51004dcb 4695 success ^= (UBool)(opValue != 0); // flip sense for \D
729e4ab9 4696 if (!success) {
46f4442e 4697 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4698 }
4699 }
4700 break;
57a6839d
A
4701
4702
b75a7d8f 4703 case URX_BACKSLASH_G: // Test for position at end of previous match
729e4ab9 4704 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
46f4442e 4705 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4706 }
4707 break;
57a6839d
A
4708
4709
b331163b
A
4710 case URX_BACKSLASH_H: // Test for \h, horizontal white space.
4711 {
4712 if (fp->fInputIdx >= fActiveLimit) {
4713 fHitEnd = TRUE;
4714 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4715 break;
4716 }
4717 UChar32 c;
4718 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4719 int8_t ctype = u_charType(c);
4720 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
4721 success ^= (UBool)(opValue != 0); // flip sense for \H
4722 if (!success) {
4723 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4724 }
4725 }
4726 break;
4727
4728
4729 case URX_BACKSLASH_R: // Test for \R, any line break sequence.
4730 {
4731 if (fp->fInputIdx >= fActiveLimit) {
4732 fHitEnd = TRUE;
4733 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4734 break;
4735 }
4736 UChar32 c;
4737 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4738 if (isLineTerminator(c)) {
4739 if (c == 0x0d && fp->fInputIdx < fActiveLimit) {
4740 // Check for CR/LF sequence. Consume both together when found.
4741 UChar c2;
4742 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2);
4743 if (c2 != 0x0a) {
4744 U16_PREV(inputBuf, 0, fp->fInputIdx, c2);
4745 }
4746 }
4747 } else {
4748 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4749 }
4750 }
4751 break;
4752
4753
4754 case URX_BACKSLASH_V: // Any single code point line ending.
4755 {
4756 if (fp->fInputIdx >= fActiveLimit) {
4757 fHitEnd = TRUE;
4758 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4759 break;
4760 }
4761 UChar32 c;
4762 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4763 UBool success = isLineTerminator(c);
4764 success ^= (UBool)(opValue != 0); // flip sense for \V
4765 if (!success) {
4766 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4767 }
4768 }
4769 break;
4770
4771
4772
57a6839d 4773 case URX_BACKSLASH_X:
729e4ab9
A
4774 // Match a Grapheme, as defined by Unicode TR 29.
4775 // Differs slightly from Perl, which consumes combining marks independently
4776 // of context.
4777 {
b75a7d8f 4778
729e4ab9
A
4779 // Fail if at end of input
4780 if (fp->fInputIdx >= fActiveLimit) {
4781 fHitEnd = TRUE;
4782 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4783 break;
4784 }
b75a7d8f 4785
729e4ab9
A
4786 // Examine (and consume) the current char.
4787 // Dispatch into a little state machine, based on the char.
4788 UChar32 c;
4789 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4790 UnicodeSet **sets = fPattern->fStaticSets;
4791 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
4792 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
4793 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4794 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4795 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4796 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4797 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4798 goto GC_Extend;
b75a7d8f
A
4799
4800
4801
4802GC_L:
729e4ab9
A
4803 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4804 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4805 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4806 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4807 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4808 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4809 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4810 goto GC_Extend;
b75a7d8f
A
4811
4812GC_V:
729e4ab9
A
4813 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4814 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4815 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4816 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4817 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4818 goto GC_Extend;
b75a7d8f
A
4819
4820GC_T:
729e4ab9
A
4821 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4822 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4823 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4824 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4825 goto GC_Extend;
b75a7d8f
A
4826
4827GC_Extend:
729e4ab9
A
4828 // Combining characters are consumed here
4829 for (;;) {
4830 if (fp->fInputIdx >= fActiveLimit) {
4831 break;
b75a7d8f 4832 }
729e4ab9
A
4833 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4834 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4835 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4836 break;
4837 }
4838 }
4839 goto GC_Done;
b75a7d8f
A
4840
4841GC_Control:
57a6839d 4842 // Most control chars stand alone (don't combine with combining chars),
729e4ab9
A
4843 // except for that CR/LF sequence is a single grapheme cluster.
4844 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
4845 fp->fInputIdx++;
4846 }
b75a7d8f
A
4847
4848GC_Done:
729e4ab9
A
4849 if (fp->fInputIdx >= fActiveLimit) {
4850 fHitEnd = TRUE;
b75a7d8f 4851 }
729e4ab9
A
4852 break;
4853 }
57a6839d
A
4854
4855
4856
4857
46f4442e
A
4858 case URX_BACKSLASH_Z: // Test for end of Input
4859 if (fp->fInputIdx < fAnchorLimit) {
4860 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4861 } else {
4862 fHitEnd = TRUE;
4863 fRequireEnd = TRUE;
b75a7d8f
A
4864 }
4865 break;
57a6839d
A
4866
4867
4868
b75a7d8f
A
4869 case URX_STATIC_SETREF:
4870 {
4871 // Test input character against one of the predefined sets
4872 // (Word Characters, for example)
4873 // The high bit of the op value is a flag for the match polarity.
4874 // 0: success if input char is in set.
4875 // 1: success if input char is not in set.
46f4442e
A
4876 if (fp->fInputIdx >= fActiveLimit) {
4877 fHitEnd = TRUE;
4878 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4879 break;
4880 }
57a6839d
A
4881
4882 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
b75a7d8f
A
4883 opValue &= ~URX_NEG_SET;
4884 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
57a6839d 4885
729e4ab9 4886 UChar32 c;
46f4442e 4887 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
b75a7d8f
A
4888 if (c < 256) {
4889 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4890 if (s8->contains(c)) {
4891 success = !success;
4892 }
4893 } else {
4894 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4895 if (s->contains(c)) {
4896 success = !success;
4897 }
4898 }
4899 if (!success) {
46f4442e 4900 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4901 }
4902 }
4903 break;
57a6839d
A
4904
4905
b75a7d8f
A
4906 case URX_STAT_SETREF_N:
4907 {
57a6839d 4908 // Test input character for NOT being a member of one of
b75a7d8f 4909 // the predefined sets (Word Characters, for example)
46f4442e
A
4910 if (fp->fInputIdx >= fActiveLimit) {
4911 fHitEnd = TRUE;
4912 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4913 break;
4914 }
57a6839d 4915
b75a7d8f 4916 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
57a6839d 4917
b75a7d8f 4918 UChar32 c;
46f4442e 4919 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
b75a7d8f
A
4920 if (c < 256) {
4921 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4922 if (s8->contains(c) == FALSE) {
4923 break;
4924 }
4925 } else {
4926 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4927 if (s->contains(c) == FALSE) {
4928 break;
4929 }
4930 }
46f4442e 4931 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4932 }
4933 break;
57a6839d
A
4934
4935
b75a7d8f 4936 case URX_SETREF:
729e4ab9
A
4937 {
4938 if (fp->fInputIdx >= fActiveLimit) {
4939 fHitEnd = TRUE;
4940 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
46f4442e
A
4941 break;
4942 }
57a6839d 4943
3d1f044b 4944 U_ASSERT(opValue > 0 && opValue < fSets->size());
729e4ab9
A
4945
4946 // There is input left. Pick up one char and test it for set membership.
4947 UChar32 c;
4948 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4949 if (c<256) {
4950 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4951 if (s8->contains(c)) {
4952 // The character is in the set. A Match.
4953 break;
4954 }
4955 } else {
3d1f044b 4956 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
729e4ab9
A
4957 if (s->contains(c)) {
4958 // The character is in the set. A Match.
4959 break;
4960 }
4961 }
57a6839d 4962
729e4ab9 4963 // the character wasn't in the set.
729e4ab9 4964 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
46f4442e 4965 }
b75a7d8f 4966 break;
57a6839d
A
4967
4968
b75a7d8f
A
4969 case URX_DOTANY:
4970 {
4971 // . matches anything, but stops at end-of-line.
46f4442e 4972 if (fp->fInputIdx >= fActiveLimit) {
b75a7d8f 4973 // At end of input. Match failed. Backtrack out.
46f4442e
A
4974 fHitEnd = TRUE;
4975 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4976 break;
4977 }
57a6839d 4978
b75a7d8f 4979 // There is input left. Advance over one char, unless we've hit end-of-line
729e4ab9 4980 UChar32 c;
46f4442e 4981 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
b331163b 4982 if (isLineTerminator(c)) {
b75a7d8f 4983 // End of line in normal mode. . does not match.
729e4ab9 4984 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4985 break;
4986 }
4987 }
4988 break;
57a6839d
A
4989
4990
b75a7d8f
A
4991 case URX_DOTANY_ALL:
4992 {
729e4ab9 4993 // . in dot-matches-all (including new lines) mode
46f4442e 4994 if (fp->fInputIdx >= fActiveLimit) {
b75a7d8f 4995 // At end of input. Match failed. Backtrack out.
46f4442e
A
4996 fHitEnd = TRUE;
4997 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4998 break;
4999 }
57a6839d 5000
b75a7d8f
A
5001 // There is input left. Advance over one char, except if we are
5002 // at a cr/lf, advance over both of them.
57a6839d 5003 UChar32 c;
46f4442e
A
5004 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5005 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
b75a7d8f 5006 // In the case of a CR/LF, we need to advance over both.
729e4ab9
A
5007 if (inputBuf[fp->fInputIdx] == 0x0a) {
5008 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
b75a7d8f
A
5009 }
5010 }
5011 }
5012 break;
57a6839d
A
5013
5014
46f4442e 5015 case URX_DOTANY_UNIX:
b75a7d8f 5016 {
46f4442e
A
5017 // '.' operator, matches all, but stops at end-of-line.
5018 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
5019 if (fp->fInputIdx >= fActiveLimit) {
5020 // At end of input. Match failed. Backtrack out.
5021 fHitEnd = TRUE;
5022 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
5023 break;
5024 }
57a6839d 5025
46f4442e 5026 // There is input left. Advance over one char, unless we've hit end-of-line
57a6839d 5027 UChar32 c;
46f4442e
A
5028 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5029 if (c == 0x0a) {
5030 // End of line in normal mode. '.' does not match the \n
5031 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
5032 }
5033 }
5034 break;
57a6839d
A
5035
5036
b75a7d8f
A
5037 case URX_JMP:
5038 fp->fPatIdx = opValue;
5039 break;
57a6839d 5040
b75a7d8f
A
5041 case URX_FAIL:
5042 isMatch = FALSE;
5043 goto breakFromLoop;
57a6839d 5044
b75a7d8f
A
5045 case URX_JMP_SAV:
5046 U_ASSERT(opValue < fPattern->fCompiledPat->size());
46f4442e
A
5047 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
5048 fp->fPatIdx = opValue; // Then JMP.
b75a7d8f 5049 break;
57a6839d 5050
b75a7d8f
A
5051 case URX_JMP_SAV_X:
5052 // This opcode is used with (x)+, when x can match a zero length string.
5053 // Same as JMP_SAV, except conditional on the match having made forward progress.
5054 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
5055 // data address of the input position at the start of the loop.
5056 {
5057 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
729e4ab9 5058 int32_t stoOp = (int32_t)pat[opValue-1];
b75a7d8f
A
5059 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
5060 int32_t frameLoc = URX_VAL(stoOp);
46f4442e 5061 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
729e4ab9 5062 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
b75a7d8f
A
5063 U_ASSERT(prevInputIdx <= fp->fInputIdx);
5064 if (prevInputIdx < fp->fInputIdx) {
5065 // The match did make progress. Repeat the loop.
46f4442e 5066 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
b75a7d8f
A
5067 fp->fPatIdx = opValue;
5068 fp->fExtra[frameLoc] = fp->fInputIdx;
57a6839d 5069 }
b75a7d8f
A
5070 // If the input position did not advance, we do nothing here,
5071 // execution will fall out of the loop.
5072 }
5073 break;
57a6839d 5074
b75a7d8f
A
5075 case URX_CTR_INIT:
5076 {
46f4442e 5077 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
57a6839d
A
5078 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5079
b75a7d8f 5080 // Pick up the three extra operands that CTR_INIT has, and
57a6839d 5081 // skip the pattern location counter past
729e4ab9 5082 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
b75a7d8f
A
5083 fp->fPatIdx += 3;
5084 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
729e4ab9
A
5085 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5086 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
b75a7d8f
A
5087 U_ASSERT(minCount>=0);
5088 U_ASSERT(maxCount>=minCount || maxCount==-1);
57a6839d
A
5089 U_ASSERT(loopLoc>=fp->fPatIdx);
5090
b75a7d8f 5091 if (minCount == 0) {
46f4442e 5092 fp = StateSave(fp, loopLoc+1, status);
b75a7d8f 5093 }
57a6839d
A
5094 if (maxCount == -1) {
5095 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
5096 } else if (maxCount == 0) {
46f4442e 5097 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
5098 }
5099 }
5100 break;
57a6839d 5101
b75a7d8f
A
5102 case URX_CTR_LOOP:
5103 {
5104 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
729e4ab9 5105 int32_t initOp = (int32_t)pat[opValue];
b75a7d8f 5106 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
729e4ab9
A
5107 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5108 int32_t minCount = (int32_t)pat[opValue+2];
5109 int32_t maxCount = (int32_t)pat[opValue+3];
b75a7d8f 5110 (*pCounter)++;
57a6839d
A
5111 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5112 U_ASSERT(*pCounter == maxCount);
b75a7d8f
A
5113 break;
5114 }
5115 if (*pCounter >= minCount) {
57a6839d
A
5116 if (maxCount == -1) {
5117 // Loop has no hard upper bound.
5118 // Check that it is progressing through the input, break if it is not.
5119 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5120 if (fp->fInputIdx == *pLastInputIdx) {
5121 break;
5122 } else {
5123 *pLastInputIdx = fp->fInputIdx;
5124 }
5125 }
46f4442e 5126 fp = StateSave(fp, fp->fPatIdx, status);
f3c0d7a5
A
5127 } else {
5128 // Increment time-out counter. (StateSave() does it if count >= minCount)
5129 fTickCounter--;
5130 if (fTickCounter <= 0) {
5131 IncrementTime(status); // Re-initializes fTickCounter
5132 }
b75a7d8f
A
5133 }
5134 fp->fPatIdx = opValue + 4; // Loop back.
5135 }
5136 break;
57a6839d 5137
b75a7d8f
A
5138 case URX_CTR_INIT_NG:
5139 {
46f4442e
A
5140 // Initialize a non-greedy loop
5141 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
57a6839d
A
5142 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5143
5144 // Pick up the three extra operands that CTR_INIT_NG has, and
5145 // skip the pattern location counter past
729e4ab9 5146 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
b75a7d8f
A
5147 fp->fPatIdx += 3;
5148 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
729e4ab9
A
5149 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5150 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
b75a7d8f
A
5151 U_ASSERT(minCount>=0);
5152 U_ASSERT(maxCount>=minCount || maxCount==-1);
5153 U_ASSERT(loopLoc>fp->fPatIdx);
57a6839d
A
5154 if (maxCount == -1) {
5155 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
5156 }
5157
b75a7d8f
A
5158 if (minCount == 0) {
5159 if (maxCount != 0) {
46f4442e 5160 fp = StateSave(fp, fp->fPatIdx, status);
b75a7d8f
A
5161 }
5162 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
57a6839d 5163 }
b75a7d8f
A
5164 }
5165 break;
57a6839d 5166
b75a7d8f
A
5167 case URX_CTR_LOOP_NG:
5168 {
46f4442e 5169 // Non-greedy {min, max} loops
b75a7d8f 5170 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
729e4ab9 5171 int32_t initOp = (int32_t)pat[opValue];
b75a7d8f 5172 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
729e4ab9
A
5173 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5174 int32_t minCount = (int32_t)pat[opValue+2];
5175 int32_t maxCount = (int32_t)pat[opValue+3];
57a6839d 5176
b75a7d8f 5177 (*pCounter)++;
57a6839d 5178 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
b75a7d8f
A
5179 // The loop has matched the maximum permitted number of times.
5180 // Break out of here with no action. Matching will
5181 // continue with the following pattern.
57a6839d 5182 U_ASSERT(*pCounter == maxCount);
b75a7d8f
A
5183 break;
5184 }
57a6839d 5185
b75a7d8f
A
5186 if (*pCounter < minCount) {
5187 // We haven't met the minimum number of matches yet.
5188 // Loop back for another one.
5189 fp->fPatIdx = opValue + 4; // Loop back.
f3c0d7a5
A
5190 fTickCounter--;
5191 if (fTickCounter <= 0) {
5192 IncrementTime(status); // Re-initializes fTickCounter
5193 }
b75a7d8f
A
5194 } else {
5195 // We do have the minimum number of matches.
57a6839d
A
5196
5197 // If there is no upper bound on the loop iterations, check that the input index
5198 // is progressing, and stop the loop if it is not.
5199 if (maxCount == -1) {
5200 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5201 if (fp->fInputIdx == *pLastInputIdx) {
5202 break;
5203 }
5204 *pLastInputIdx = fp->fInputIdx;
5205 }
5206
5207 // Loop Continuation: we will fall into the pattern following the loop
5208 // (non-greedy, don't execute loop body first), but first do
5209 // a state save to the top of the loop, so that a match failure
b75a7d8f 5210 // in the following pattern will try another iteration of the loop.
46f4442e 5211 fp = StateSave(fp, opValue + 4, status);
b75a7d8f
A
5212 }
5213 }
5214 break;
57a6839d 5215
b75a7d8f
A
5216 case URX_STO_SP:
5217 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5218 fData[opValue] = fStack->size();
5219 break;
57a6839d 5220
b75a7d8f
A
5221 case URX_LD_SP:
5222 {
5223 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
729e4ab9 5224 int32_t newStackSize = (int32_t)fData[opValue];
b75a7d8f 5225 U_ASSERT(newStackSize <= fStack->size());
729e4ab9
A
5226 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5227 if (newFP == (int64_t *)fp) {
b75a7d8f
A
5228 break;
5229 }
3d1f044b
A
5230 int32_t j;
5231 for (j=0; j<fFrameSize; j++) {
5232 newFP[j] = ((int64_t *)fp)[j];
b75a7d8f
A
5233 }
5234 fp = (REStackFrame *)newFP;
5235 fStack->setSize(newStackSize);
5236 }
5237 break;
57a6839d 5238
b75a7d8f 5239 case URX_BACKREF:
4388f060
A
5240 {
5241 U_ASSERT(opValue < fFrameSize);
5242 int64_t groupStartIdx = fp->fExtra[opValue];
5243 int64_t groupEndIdx = fp->fExtra[opValue+1];
5244 U_ASSERT(groupStartIdx <= groupEndIdx);
5245 int64_t inputIndex = fp->fInputIdx;
5246 if (groupStartIdx < 0) {
5247 // This capture group has not participated in the match thus far,
5248 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5249 break;
5250 }
5251 UBool success = TRUE;
5252 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
5253 if (inputIndex >= fActiveLimit) {
5254 success = FALSE;
5255 fHitEnd = TRUE;
5256 break;
5257 }
5258 if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
5259 success = FALSE;
5260 break;
5261 }
5262 }
2ca993e8
A
5263 if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) &&
5264 inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) {
5265 // Capture group ended with an unpaired lead surrogate.
5266 // Back reference is not permitted to match lead only of a surrogate pair.
5267 success = FALSE;
5268 }
4388f060
A
5269 if (success) {
5270 fp->fInputIdx = inputIndex;
5271 } else {
5272 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5273 }
5274 }
5275 break;
57a6839d 5276
b75a7d8f
A
5277 case URX_BACKREF_I:
5278 {
46f4442e 5279 U_ASSERT(opValue < fFrameSize);
729e4ab9
A
5280 int64_t groupStartIdx = fp->fExtra[opValue];
5281 int64_t groupEndIdx = fp->fExtra[opValue+1];
b75a7d8f 5282 U_ASSERT(groupStartIdx <= groupEndIdx);
b75a7d8f
A
5283 if (groupStartIdx < 0) {
5284 // This capture group has not participated in the match thus far,
46f4442e 5285 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
4388f060 5286 break;
b75a7d8f 5287 }
4388f060
A
5288 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
5289 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
b75a7d8f 5290
4388f060 5291 // Note: if the capture group match was of an empty string the backref
57a6839d 5292 // match succeeds. Verified by testing: Perl matches succeed
4388f060 5293 // in this case, so we do too.
57a6839d 5294
4388f060
A
5295 UBool success = TRUE;
5296 for (;;) {
5297 UChar32 captureGroupChar = captureGroupItr.next();
5298 if (captureGroupChar == U_SENTINEL) {
5299 success = TRUE;
b75a7d8f
A
5300 break;
5301 }
4388f060
A
5302 UChar32 inputChar = inputItr.next();
5303 if (inputChar == U_SENTINEL) {
5304 success = FALSE;
5305 fHitEnd = TRUE;
5306 break;
b75a7d8f 5307 }
4388f060
A
5308 if (inputChar != captureGroupChar) {
5309 success = FALSE;
5310 break;
5311 }
5312 }
5313
5314 if (success && inputItr.inExpansion()) {
57a6839d
A
5315 // We obtained a match by consuming part of a string obtained from
5316 // case-folding a single code point of the input text.
4388f060
A
5317 // This does not count as an overall match.
5318 success = FALSE;
b75a7d8f 5319 }
4388f060
A
5320
5321 if (success) {
5322 fp->fInputIdx = inputItr.getIndex();
b75a7d8f 5323 } else {
4388f060 5324 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
5325 }
5326 }
5327 break;
4388f060 5328
b75a7d8f
A
5329 case URX_STO_INP_LOC:
5330 {
46f4442e 5331 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
b75a7d8f
A
5332 fp->fExtra[opValue] = fp->fInputIdx;
5333 }
5334 break;
57a6839d 5335
b75a7d8f
A
5336 case URX_JMPX:
5337 {
729e4ab9 5338 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
b75a7d8f
A
5339 fp->fPatIdx += 1;
5340 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
46f4442e 5341 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
729e4ab9 5342 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
b75a7d8f
A
5343 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5344 if (savedInputIdx < fp->fInputIdx) {
5345 fp->fPatIdx = opValue; // JMP
5346 } else {
729e4ab9 5347 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
b75a7d8f
A
5348 }
5349 }
5350 break;
57a6839d 5351
b75a7d8f
A
5352 case URX_LA_START:
5353 {
340931cb 5354 // Entering a look around block.
b75a7d8f 5355 // Save Stack Ptr, Input Pos.
340931cb 5356 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
b75a7d8f
A
5357 fData[opValue] = fStack->size();
5358 fData[opValue+1] = fp->fInputIdx;
340931cb
A
5359 fData[opValue+2] = fActiveStart;
5360 fData[opValue+3] = fActiveLimit;
46f4442e
A
5361 fActiveStart = fLookStart; // Set the match region change for
5362 fActiveLimit = fLookLimit; // transparent bounds.
b75a7d8f
A
5363 }
5364 break;
57a6839d 5365
b75a7d8f
A
5366 case URX_LA_END:
5367 {
340931cb 5368 // Leaving a look around block.
b75a7d8f 5369 // restore Stack Ptr, Input Pos to positions they had on entry to block.
340931cb 5370 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
b75a7d8f 5371 int32_t stackSize = fStack->size();
729e4ab9 5372 int32_t newStackSize = (int32_t)fData[opValue];
b75a7d8f
A
5373 U_ASSERT(stackSize >= newStackSize);
5374 if (stackSize > newStackSize) {
46f4442e
A
5375 // Copy the current top frame back to the new (cut back) top frame.
5376 // This makes the capture groups from within the look-ahead
5377 // expression available.
729e4ab9 5378 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3d1f044b
A
5379 int32_t j;
5380 for (j=0; j<fFrameSize; j++) {
5381 newFP[j] = ((int64_t *)fp)[j];
b75a7d8f
A
5382 }
5383 fp = (REStackFrame *)newFP;
5384 fStack->setSize(newStackSize);
5385 }
5386 fp->fInputIdx = fData[opValue+1];
57a6839d 5387
46f4442e
A
5388 // Restore the active region bounds in the input string; they may have
5389 // been changed because of transparent bounds on a Region.
340931cb
A
5390 fActiveStart = fData[opValue+2];
5391 fActiveLimit = fData[opValue+3];
5392 U_ASSERT(fActiveStart >= 0);
5393 U_ASSERT(fActiveLimit <= fInputLength);
b75a7d8f
A
5394 }
5395 break;
57a6839d 5396
b75a7d8f 5397 case URX_ONECHAR_I:
46f4442e 5398 if (fp->fInputIdx < fActiveLimit) {
57a6839d 5399 UChar32 c;
46f4442e
A
5400 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5401 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
b75a7d8f
A
5402 break;
5403 }
46f4442e
A
5404 } else {
5405 fHitEnd = TRUE;
5406 }
5407 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f 5408 break;
57a6839d 5409
b75a7d8f 5410 case URX_STRING_I:
4388f060
A
5411 // Case-insensitive test input against a literal string.
5412 // Strings require two slots in the compiled pattern, one for the
5413 // offset to the string text, and one for the length.
5414 // The compiled string has already been case folded.
b75a7d8f 5415 {
4388f060
A
5416 const UChar *patternString = litText + opValue;
5417
5418 op = (int32_t)pat[fp->fPatIdx];
5419 fp->fPatIdx++;
5420 opType = URX_TYPE(op);
5421 opValue = URX_VAL(op);
5422 U_ASSERT(opType == URX_STRING_LEN);
5423 int32_t patternStringLen = opValue; // Length of the string from the pattern.
57a6839d 5424
4388f060
A
5425 UChar32 cText;
5426 UChar32 cPattern;
5427 UBool success = TRUE;
5428 int32_t patternStringIdx = 0;
5429 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5430 while (patternStringIdx < patternStringLen) {
5431 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
5432 cText = inputIterator.next();
5433 if (cText != cPattern) {
5434 success = FALSE;
5435 if (cText == U_SENTINEL) {
5436 fHitEnd = TRUE;
729e4ab9 5437 }
4388f060 5438 break;
374ca955 5439 }
46f4442e 5440 }
4388f060
A
5441 if (inputIterator.inExpansion()) {
5442 success = FALSE;
5443 }
5444
5445 if (success) {
5446 fp->fInputIdx = inputIterator.getIndex();
5447 } else {
5448 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5449 }
b75a7d8f
A
5450 }
5451 break;
4388f060 5452
b75a7d8f
A
5453 case URX_LB_START:
5454 {
5455 // Entering a look-behind block.
340931cb 5456 // Save Stack Ptr, Input Pos and active input region.
46f4442e 5457 // TODO: implement transparent bounds. Ticket #6067
340931cb 5458 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
b75a7d8f
A
5459 fData[opValue] = fStack->size();
5460 fData[opValue+1] = fp->fInputIdx;
b75a7d8f
A
5461 // Save input string length, then reset to pin any matches to end at
5462 // the current position.
340931cb 5463 fData[opValue+2] = fActiveStart;
46f4442e 5464 fData[opValue+3] = fActiveLimit;
340931cb 5465 fActiveStart = fRegionStart;
46f4442e 5466 fActiveLimit = fp->fInputIdx;
340931cb
A
5467 // Init the variable containing the start index for attempted matches.
5468 fData[opValue+4] = -1;
b75a7d8f
A
5469 }
5470 break;
57a6839d
A
5471
5472
b75a7d8f
A
5473 case URX_LB_CONT:
5474 {
5475 // Positive Look-Behind, at top of loop checking for matches of LB expression
5476 // at all possible input starting positions.
57a6839d 5477
b75a7d8f
A
5478 // Fetch the min and max possible match lengths. They are the operands
5479 // of this op in the pattern.
729e4ab9
A
5480 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5481 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
b75a7d8f
A
5482 U_ASSERT(minML <= maxML);
5483 U_ASSERT(minML >= 0);
57a6839d 5484
b75a7d8f 5485 // Fetch (from data) the last input index where a match was attempted.
340931cb
A
5486 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5487 int64_t &lbStartIdx = fData[opValue+4];
2ca993e8 5488 if (lbStartIdx < 0) {
b75a7d8f 5489 // First time through loop.
2ca993e8 5490 lbStartIdx = fp->fInputIdx - minML;
0f5d89e8 5491 if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
2ca993e8
A
5492 U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5493 }
b75a7d8f
A
5494 } else {
5495 // 2nd through nth time through the loop.
5496 // Back up start position for match by one.
2ca993e8
A
5497 if (lbStartIdx == 0) {
5498 lbStartIdx--;
b75a7d8f 5499 } else {
2ca993e8 5500 U16_BACK_1(inputBuf, 0, lbStartIdx);
b75a7d8f
A
5501 }
5502 }
57a6839d 5503
2ca993e8 5504 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
b75a7d8f
A
5505 // We have tried all potential match starting points without
5506 // getting a match. Backtrack out, and out of the
5507 // Look Behind altogether.
46f4442e 5508 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
340931cb
A
5509 fActiveStart = fData[opValue+2];
5510 fActiveLimit = fData[opValue+3];
5511 U_ASSERT(fActiveStart >= 0);
5512 U_ASSERT(fActiveLimit <= fInputLength);
b75a7d8f
A
5513 break;
5514 }
57a6839d 5515
b75a7d8f
A
5516 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5517 // (successful match will fall off the end of the loop.)
46f4442e 5518 fp = StateSave(fp, fp->fPatIdx-3, status);
2ca993e8 5519 fp->fInputIdx = lbStartIdx;
b75a7d8f
A
5520 }
5521 break;
57a6839d 5522
b75a7d8f
A
5523 case URX_LB_END:
5524 // End of a look-behind block, after a successful match.
5525 {
340931cb 5526 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
46f4442e 5527 if (fp->fInputIdx != fActiveLimit) {
b75a7d8f
A
5528 // The look-behind expression matched, but the match did not
5529 // extend all the way to the point that we are looking behind from.
5530 // FAIL out of here, which will take us back to the LB_CONT, which
5531 // will retry the match starting at another position or fail
5532 // the look-behind altogether, whichever is appropriate.
46f4442e 5533 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
5534 break;
5535 }
57a6839d 5536
340931cb 5537 // Look-behind match is good. Restore the original input string region,
57a6839d 5538 // which had been truncated to pin the end of the lookbehind match to the
b75a7d8f 5539 // position being looked-behind.
340931cb
A
5540 fActiveStart = fData[opValue+2];
5541 fActiveLimit = fData[opValue+3];
5542 U_ASSERT(fActiveStart >= 0);
5543 U_ASSERT(fActiveLimit <= fInputLength);
b75a7d8f
A
5544 }
5545 break;
57a6839d
A
5546
5547
b75a7d8f
A
5548 case URX_LBN_CONT:
5549 {
5550 // Negative Look-Behind, at top of loop checking for matches of LB expression
5551 // at all possible input starting positions.
57a6839d 5552
b75a7d8f 5553 // Fetch the extra parameters of this op.
729e4ab9
A
5554 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5555 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5556 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5557 continueLoc = URX_VAL(continueLoc);
b75a7d8f
A
5558 U_ASSERT(minML <= maxML);
5559 U_ASSERT(minML >= 0);
5560 U_ASSERT(continueLoc > fp->fPatIdx);
57a6839d 5561
b75a7d8f 5562 // Fetch (from data) the last input index where a match was attempted.
340931cb
A
5563 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5564 int64_t &lbStartIdx = fData[opValue+4];
2ca993e8 5565 if (lbStartIdx < 0) {
b75a7d8f 5566 // First time through loop.
2ca993e8 5567 lbStartIdx = fp->fInputIdx - minML;
0f5d89e8 5568 if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
2ca993e8
A
5569 U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5570 }
b75a7d8f
A
5571 } else {
5572 // 2nd through nth time through the loop.
5573 // Back up start position for match by one.
2ca993e8
A
5574 if (lbStartIdx == 0) {
5575 lbStartIdx--; // Because U16_BACK is unsafe starting at 0.
b75a7d8f 5576 } else {
2ca993e8 5577 U16_BACK_1(inputBuf, 0, lbStartIdx);
b75a7d8f
A
5578 }
5579 }
57a6839d 5580
2ca993e8 5581 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
b75a7d8f
A
5582 // We have tried all potential match starting points without
5583 // getting a match, which means that the negative lookbehind as
5584 // a whole has succeeded. Jump forward to the continue location
340931cb
A
5585 fActiveStart = fData[opValue+2];
5586 fActiveLimit = fData[opValue+3];
5587 U_ASSERT(fActiveStart >= 0);
5588 U_ASSERT(fActiveLimit <= fInputLength);
b75a7d8f
A
5589 fp->fPatIdx = continueLoc;
5590 break;
5591 }
57a6839d 5592
b75a7d8f
A
5593 // Save state to this URX_LBN_CONT op, so failure to match will repeat the loop.
5594 // (successful match will cause a FAIL out of the loop altogether.)
46f4442e 5595 fp = StateSave(fp, fp->fPatIdx-4, status);
2ca993e8 5596 fp->fInputIdx = lbStartIdx;
b75a7d8f
A
5597 }
5598 break;
57a6839d 5599
b75a7d8f
A
5600 case URX_LBN_END:
5601 // End of a negative look-behind block, after a successful match.
5602 {
340931cb 5603 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
46f4442e 5604 if (fp->fInputIdx != fActiveLimit) {
b75a7d8f
A
5605 // The look-behind expression matched, but the match did not
5606 // extend all the way to the point that we are looking behind from.
5607 // FAIL out of here, which will take us back to the LB_CONT, which
5608 // will retry the match starting at another position or succeed
5609 // the look-behind altogether, whichever is appropriate.
46f4442e 5610 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
5611 break;
5612 }
57a6839d 5613
b75a7d8f
A
5614 // Look-behind expression matched, which means look-behind test as
5615 // a whole Fails
57a6839d
A
5616
5617 // Restore the original input string length, which had been truncated
5618 // in order to pin the end of the lookbehind match
b75a7d8f 5619 // to the position being looked-behind.
340931cb
A
5620 fActiveStart = fData[opValue+2];
5621 fActiveLimit = fData[opValue+3];
5622 U_ASSERT(fActiveStart >= 0);
5623 U_ASSERT(fActiveLimit <= fInputLength);
57a6839d 5624
b75a7d8f
A
5625 // Restore original stack position, discarding any state saved
5626 // by the successful pattern match.
5627 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
729e4ab9 5628 int32_t newStackSize = (int32_t)fData[opValue];
b75a7d8f
A
5629 U_ASSERT(fStack->size() > newStackSize);
5630 fStack->setSize(newStackSize);
57a6839d
A
5631
5632 // FAIL, which will take control back to someplace
b75a7d8f 5633 // prior to entering the look-behind test.
46f4442e 5634 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
5635 }
5636 break;
57a6839d
A
5637
5638
b75a7d8f
A
5639 case URX_LOOP_SR_I:
5640 // Loop Initialization for the optimized implementation of
5641 // [some character set]*
5642 // This op scans through all matching input.
5643 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5644 {
3d1f044b 5645 U_ASSERT(opValue > 0 && opValue < fSets->size());
b75a7d8f 5646 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3d1f044b 5647 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
57a6839d 5648
b75a7d8f
A
5649 // Loop through input, until either the input is exhausted or
5650 // we reach a character that is not a member of the set.
729e4ab9 5651 int32_t ix = (int32_t)fp->fInputIdx;
b75a7d8f 5652 for (;;) {
46f4442e
A
5653 if (ix >= fActiveLimit) {
5654 fHitEnd = TRUE;
b75a7d8f
A
5655 break;
5656 }
5657 UChar32 c;
46f4442e 5658 U16_NEXT(inputBuf, ix, fActiveLimit, c);
b75a7d8f
A
5659 if (c<256) {
5660 if (s8->contains(c) == FALSE) {
5661 U16_BACK_1(inputBuf, 0, ix);
5662 break;
5663 }
5664 } else {
5665 if (s->contains(c) == FALSE) {
5666 U16_BACK_1(inputBuf, 0, ix);
5667 break;
5668 }
5669 }
5670 }
57a6839d 5671
b75a7d8f
A
5672 // If there were no matching characters, skip over the loop altogether.
5673 // The loop doesn't run at all, a * op always succeeds.
5674 if (ix == fp->fInputIdx) {
5675 fp->fPatIdx++; // skip the URX_LOOP_C op.
5676 break;
5677 }
57a6839d 5678
b75a7d8f
A
5679 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5680 // must follow. Its operand is the stack location
5681 // that holds the starting input index for the match of this [set]*
729e4ab9 5682 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
b75a7d8f
A
5683 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5684 int32_t stackLoc = URX_VAL(loopcOp);
46f4442e 5685 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
b75a7d8f
A
5686 fp->fExtra[stackLoc] = fp->fInputIdx;
5687 fp->fInputIdx = ix;
57a6839d 5688
b75a7d8f
A
5689 // Save State to the URX_LOOP_C op that follows this one,
5690 // so that match failures in the following code will return to there.
5691 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
46f4442e 5692 fp = StateSave(fp, fp->fPatIdx, status);
b75a7d8f
A
5693 fp->fPatIdx++;
5694 }
5695 break;
57a6839d
A
5696
5697
b75a7d8f
A
5698 case URX_LOOP_DOT_I:
5699 // Loop Initialization for the optimized implementation of .*
5700 // This op scans through all remaining input.
5701 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5702 {
5703 // Loop through input until the input is exhausted (we reach an end-of-line)
46f4442e 5704 // In DOTALL mode, we can just go straight to the end of the input.
374ca955 5705 int32_t ix;
46f4442e
A
5706 if ((opValue & 1) == 1) {
5707 // Dot-matches-All mode. Jump straight to the end of the string.
729e4ab9 5708 ix = (int32_t)fActiveLimit;
46f4442e 5709 fHitEnd = TRUE;
374ca955 5710 } else {
46f4442e 5711 // NOT DOT ALL mode. Line endings do not match '.'
b75a7d8f 5712 // Scan forward until a line ending or end of input.
729e4ab9 5713 ix = (int32_t)fp->fInputIdx;
b75a7d8f 5714 for (;;) {
46f4442e
A
5715 if (ix >= fActiveLimit) {
5716 fHitEnd = TRUE;
b75a7d8f
A
5717 break;
5718 }
5719 UChar32 c;
46f4442e 5720 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++]
729e4ab9
A
5721 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
5722 if ((c == 0x0a) || // 0x0a is newline in both modes.
5723 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
b331163b 5724 isLineTerminator(c))) {
46f4442e
A
5725 // char is a line ending. Put the input pos back to the
5726 // line ending char, and exit the scanning loop.
5727 U16_BACK_1(inputBuf, 0, ix);
5728 break;
5729 }
b75a7d8f
A
5730 }
5731 }
5732 }
57a6839d 5733
b75a7d8f
A
5734 // If there were no matching characters, skip over the loop altogether.
5735 // The loop doesn't run at all, a * op always succeeds.
5736 if (ix == fp->fInputIdx) {
5737 fp->fPatIdx++; // skip the URX_LOOP_C op.
5738 break;
5739 }
57a6839d 5740
b75a7d8f
A
5741 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5742 // must follow. Its operand is the stack location
46f4442e 5743 // that holds the starting input index for the match of this .*
729e4ab9 5744 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
b75a7d8f
A
5745 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5746 int32_t stackLoc = URX_VAL(loopcOp);
46f4442e 5747 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
b75a7d8f
A
5748 fp->fExtra[stackLoc] = fp->fInputIdx;
5749 fp->fInputIdx = ix;
57a6839d 5750
b75a7d8f
A
5751 // Save State to the URX_LOOP_C op that follows this one,
5752 // so that match failures in the following code will return to there.
5753 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
46f4442e 5754 fp = StateSave(fp, fp->fPatIdx, status);
b75a7d8f
A
5755 fp->fPatIdx++;
5756 }
5757 break;
57a6839d
A
5758
5759
b75a7d8f
A
5760 case URX_LOOP_C:
5761 {
46f4442e 5762 U_ASSERT(opValue>=0 && opValue<fFrameSize);
729e4ab9
A
5763 backSearchIndex = (int32_t)fp->fExtra[opValue];
5764 U_ASSERT(backSearchIndex <= fp->fInputIdx);
5765 if (backSearchIndex == fp->fInputIdx) {
b75a7d8f 5766 // We've backed up the input idx to the point that the loop started.
57a6839d 5767 // The loop is done. Leave here without saving state.
b75a7d8f
A
5768 // Subsequent failures won't come back here.
5769 break;
5770 }
5771 // Set up for the next iteration of the loop, with input index
5772 // backed up by one from the last time through,
5773 // and a state save to this instruction in case the following code fails again.
5774 // (We're going backwards because this loop emulates stack unwinding, not
5775 // the initial scan forward.)
5776 U_ASSERT(fp->fInputIdx > 0);
729e4ab9
A
5777 UChar32 prevC;
5778 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
57a6839d
A
5779
5780 if (prevC == 0x0a &&
729e4ab9 5781 fp->fInputIdx > backSearchIndex &&
b75a7d8f 5782 inputBuf[fp->fInputIdx-1] == 0x0d) {
729e4ab9 5783 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
b75a7d8f
A
5784 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
5785 // .*, stepping back over CRLF pair.
729e4ab9 5786 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
b75a7d8f
A
5787 }
5788 }
57a6839d
A
5789
5790
46f4442e 5791 fp = StateSave(fp, fp->fPatIdx-1, status);
b75a7d8f
A
5792 }
5793 break;
57a6839d
A
5794
5795
5796
b75a7d8f
A
5797 default:
5798 // Trouble. The compiled pattern contains an entry with an
5799 // unrecognized type tag.
3d1f044b 5800 UPRV_UNREACHABLE;
b75a7d8f 5801 }
57a6839d 5802
b75a7d8f 5803 if (U_FAILURE(status)) {
46f4442e 5804 isMatch = FALSE;
b75a7d8f
A
5805 break;
5806 }
5807 }
57a6839d 5808
b75a7d8f
A
5809breakFromLoop:
5810 fMatch = isMatch;
5811 if (isMatch) {
5812 fLastMatchEnd = fMatchEnd;
5813 fMatchStart = startIdx;
5814 fMatchEnd = fp->fInputIdx;
b75a7d8f 5815 }
57a6839d
A
5816
5817#ifdef REGEX_RUN_DEBUG
5818 if (fTraceDebug) {
5819 if (isMatch) {
5820 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
5821 } else {
5822 printf("No match\n\n");
b75a7d8f
A
5823 }
5824 }
57a6839d
A
5825#endif
5826
b75a7d8f 5827 fFrame = fp; // The active stack frame when the engine stopped.
57a6839d
A
5828 // Contains the capture group results that we need to
5829 // access later.
b75a7d8f
A
5830
5831 return;
5832}
5833
5834
374ca955 5835UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
b75a7d8f
A
5836
5837U_NAMESPACE_END
5838
5839#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
0f5d89e8 5840