]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/rematch.cpp
ICU-57149.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / rematch.cpp
CommitLineData
b75a7d8f
A
1/*
2**************************************************************************
2ca993e8
A
3* Copyright (C) 2002-2016 International Business Machines Corporation
4* and others. All rights reserved.
b75a7d8f
A
5**************************************************************************
6*/
46f4442e
A
7//
8// file: rematch.cpp
9//
10// Contains the implementation of class RegexMatcher,
11// which is one of the main API classes for the ICU regular expression package.
12//
b75a7d8f
A
13
14#include "unicode/utypes.h"
15#if !UCONFIG_NO_REGULAR_EXPRESSIONS
16
17#include "unicode/regex.h"
18#include "unicode/uniset.h"
19#include "unicode/uchar.h"
20#include "unicode/ustring.h"
374ca955 21#include "unicode/rbbi.h"
4388f060
A
22#include "unicode/utf.h"
23#include "unicode/utf16.h"
b75a7d8f
A
24#include "uassert.h"
25#include "cmemory.h"
2ca993e8 26#include "cstr.h"
b75a7d8f
A
27#include "uvector.h"
28#include "uvectr32.h"
729e4ab9 29#include "uvectr64.h"
b75a7d8f
A
30#include "regeximp.h"
31#include "regexst.h"
729e4ab9
A
32#include "regextxt.h"
33#include "ucase.h"
b75a7d8f
A
34
35// #include <malloc.h> // Needed for heapcheck testing
36
2ca993e8 37
b75a7d8f
A
38U_NAMESPACE_BEGIN
39
46f4442e
A
40// Default limit for the size of the back track stack, to avoid system
41// failures caused by heap exhaustion. Units are in 32 bit words, not bytes.
42// This value puts ICU's limits higher than most other regexp implementations,
43// which use recursion rather than the heap, and take more storage per
44// backtrack point.
45//
46static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
47
48// Time limit counter constant.
49// Time limits for expression evaluation are in terms of quanta of work by
50// the engine, each of which is 10,000 state saves.
51// This constant sets the number of state saves that make up one tick.
52static const int32_t TIMER_INITIAL_VALUE = 10000;
53
b331163b
A
54
55// Test for any of the Unicode line terminating characters.
56static inline UBool isLineTerminator(UChar32 c) {
57 if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
58 return false;
59 }
60 return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029;
61}
62
b75a7d8f
A
63//-----------------------------------------------------------------------------
64//
65// Constructor and Destructor
66//
67//-----------------------------------------------------------------------------
57a6839d 68RegexMatcher::RegexMatcher(const RegexPattern *pat) {
46f4442e
A
69 fDeferredStatus = U_ZERO_ERROR;
70 init(fDeferredStatus);
71 if (U_FAILURE(fDeferredStatus)) {
72 return;
73 }
b75a7d8f
A
74 if (pat==NULL) {
75 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
76 return;
77 }
46f4442e 78 fPattern = pat;
729e4ab9 79 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
b75a7d8f
A
80}
81
82
83
84RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
85 uint32_t flags, UErrorCode &status) {
46f4442e 86 init(status);
b75a7d8f
A
87 if (U_FAILURE(status)) {
88 return;
89 }
46f4442e
A
90 UParseError pe;
91 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
729e4ab9 92 fPattern = fPatternOwned;
57a6839d 93
729e4ab9
A
94 UText inputText = UTEXT_INITIALIZER;
95 utext_openConstUnicodeString(&inputText, &input, &status);
96 init2(&inputText, status);
97 utext_close(&inputText);
98
57a6839d 99 fInputUniStrMaybeMutable = TRUE;
729e4ab9
A
100}
101
102
103RegexMatcher::RegexMatcher(UText *regexp, UText *input,
104 uint32_t flags, UErrorCode &status) {
105 init(status);
106 if (U_FAILURE(status)) {
107 return;
108 }
109 UParseError pe;
110 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
111 if (U_FAILURE(status)) {
112 return;
113 }
114
46f4442e
A
115 fPattern = fPatternOwned;
116 init2(input, status);
b75a7d8f
A
117}
118
119
57a6839d 120RegexMatcher::RegexMatcher(const UnicodeString &regexp,
b75a7d8f 121 uint32_t flags, UErrorCode &status) {
46f4442e 122 init(status);
b75a7d8f
A
123 if (U_FAILURE(status)) {
124 return;
125 }
46f4442e
A
126 UParseError pe;
127 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
729e4ab9
A
128 if (U_FAILURE(status)) {
129 return;
130 }
131 fPattern = fPatternOwned;
132 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
133}
134
57a6839d 135RegexMatcher::RegexMatcher(UText *regexp,
729e4ab9
A
136 uint32_t flags, UErrorCode &status) {
137 init(status);
138 if (U_FAILURE(status)) {
139 return;
140 }
141 UParseError pe;
142 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
143 if (U_FAILURE(status)) {
144 return;
145 }
146
46f4442e 147 fPattern = fPatternOwned;
729e4ab9 148 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
b75a7d8f
A
149}
150
151
152
46f4442e 153
b75a7d8f
A
154RegexMatcher::~RegexMatcher() {
155 delete fStack;
156 if (fData != fSmallData) {
374ca955 157 uprv_free(fData);
b75a7d8f
A
158 fData = NULL;
159 }
160 if (fPatternOwned) {
161 delete fPatternOwned;
162 fPatternOwned = NULL;
163 fPattern = NULL;
164 }
57a6839d 165
729e4ab9
A
166 if (fInput) {
167 delete fInput;
168 }
169 if (fInputText) {
170 utext_close(fInputText);
171 }
172 if (fAltInputText) {
173 utext_close(fAltInputText);
174 }
57a6839d 175
374ca955
A
176 #if UCONFIG_NO_BREAK_ITERATION==0
177 delete fWordBreakItr;
178 #endif
b75a7d8f
A
179}
180
46f4442e
A
181//
182// init() common initialization for use by all constructors.
183// Initialize all fields, get the object into a consistent state.
184// This must be done even when the initial status shows an error,
185// so that the object is initialized sufficiently well for the destructor
186// to run safely.
187//
188void RegexMatcher::init(UErrorCode &status) {
189 fPattern = NULL;
190 fPatternOwned = NULL;
46f4442e
A
191 fFrameSize = 0;
192 fRegionStart = 0;
193 fRegionLimit = 0;
194 fAnchorStart = 0;
195 fAnchorLimit = 0;
196 fLookStart = 0;
197 fLookLimit = 0;
198 fActiveStart = 0;
199 fActiveLimit = 0;
200 fTransparentBounds = FALSE;
201 fAnchoringBounds = TRUE;
202 fMatch = FALSE;
203 fMatchStart = 0;
204 fMatchEnd = 0;
205 fLastMatchEnd = -1;
206 fAppendPosition = 0;
207 fHitEnd = FALSE;
208 fRequireEnd = FALSE;
209 fStack = NULL;
210 fFrame = NULL;
211 fTimeLimit = 0;
212 fTime = 0;
213 fTickCounter = 0;
214 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY;
215 fCallbackFn = NULL;
216 fCallbackContext = NULL;
729e4ab9
A
217 fFindProgressCallbackFn = NULL;
218 fFindProgressCallbackContext = NULL;
46f4442e
A
219 fTraceDebug = FALSE;
220 fDeferredStatus = status;
221 fData = fSmallData;
222 fWordBreakItr = NULL;
57a6839d 223
4388f060 224 fStack = NULL;
729e4ab9
A
225 fInputText = NULL;
226 fAltInputText = NULL;
227 fInput = NULL;
228 fInputLength = 0;
229 fInputUniStrMaybeMutable = FALSE;
46f4442e
A
230}
231
232//
233// init2() Common initialization for use by RegexMatcher constructors, part 2.
234// This handles the common setup to be done after the Pattern is available.
235//
729e4ab9 236void RegexMatcher::init2(UText *input, UErrorCode &status) {
46f4442e
A
237 if (U_FAILURE(status)) {
238 fDeferredStatus = status;
239 return;
240 }
241
2ca993e8 242 if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) {
57a6839d 243 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
46f4442e
A
244 if (fData == NULL) {
245 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
246 return;
247 }
248 }
249
4388f060
A
250 fStack = new UVector64(status);
251 if (fStack == NULL) {
252 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
253 return;
254 }
255
46f4442e
A
256 reset(input);
257 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
258 if (U_FAILURE(status)) {
259 fDeferredStatus = status;
260 return;
261 }
262}
b75a7d8f
A
263
264
// Characters that appendReplacement() treats specially when scanning
// replacement text: escapes, $n group references and ${name} group references.
265static const UChar BACKSLASH = 0x5c; // U+005C, reverse solidus
266static const UChar DOLLARSIGN = 0x24; // U+0024 '$'
b331163b
A
267static const UChar LEFTBRACKET = 0x7b; // U+007B '{' (a curly brace, despite the name)
268static const UChar RIGHTBRACKET = 0x7d; // U+007D '}' (a curly brace, despite the name)
269
b75a7d8f
A
270//--------------------------------------------------------------------------------
271//
272// appendReplacement
273//
274//--------------------------------------------------------------------------------
275RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
276 const UnicodeString &replacement,
277 UErrorCode &status) {
729e4ab9 278 UText replacementText = UTEXT_INITIALIZER;
57a6839d 279
729e4ab9 280 utext_openConstUnicodeString(&replacementText, &replacement, &status);
57a6839d 281 if (U_SUCCESS(status)) {
729e4ab9
A
282 UText resultText = UTEXT_INITIALIZER;
283 utext_openUnicodeString(&resultText, &dest, &status);
57a6839d 284
729e4ab9
A
285 if (U_SUCCESS(status)) {
286 appendReplacement(&resultText, &replacementText, status);
287 utext_close(&resultText);
288 }
289 utext_close(&replacementText);
290 }
57a6839d 291
729e4ab9
A
292 return *this;
293}
294
295//
296// appendReplacement, UText mode
297//
298RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
299 UText *replacement,
300 UErrorCode &status) {
b75a7d8f
A
301 if (U_FAILURE(status)) {
302 return *this;
303 }
304 if (U_FAILURE(fDeferredStatus)) {
305 status = fDeferredStatus;
306 return *this;
307 }
308 if (fMatch == FALSE) {
309 status = U_REGEX_INVALID_STATE;
310 return *this;
311 }
57a6839d 312
b75a7d8f 313 // Copy input string from the end of previous match to start of current match
729e4ab9
A
314 int64_t destLen = utext_nativeLength(dest);
315 if (fMatchStart > fAppendPosition) {
316 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
57a6839d 317 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
729e4ab9
A
318 (int32_t)(fMatchStart-fAppendPosition), &status);
319 } else {
320 int32_t len16;
321 if (UTEXT_USES_U16(fInputText)) {
322 len16 = (int32_t)(fMatchStart-fAppendPosition);
323 } else {
324 UErrorCode lengthStatus = U_ZERO_ERROR;
325 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
326 }
327 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
328 if (inputChars == NULL) {
329 status = U_MEMORY_ALLOCATION_ERROR;
330 return *this;
331 }
332 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
333 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
334 uprv_free(inputChars);
335 }
b75a7d8f 336 }
46f4442e 337 fAppendPosition = fMatchEnd;
57a6839d
A
338
339
b75a7d8f
A
340 // scan the replacement text, looking for substitutions ($n) and \escapes.
341 // TODO: optimize this loop by efficiently scanning for '$' or '\',
342 // move entire ranges not containing substitutions.
729e4ab9 343 UTEXT_SETNATIVEINDEX(replacement, 0);
b331163b 344 for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL; c = UTEXT_NEXT32(replacement)) {
b75a7d8f
A
345 if (c == BACKSLASH) {
346 // Backslash Escape. Copy the following char out without further checks.
347 // Note: Surrogate pairs don't need any special handling
348 // The second half wont be a '$' or a '\', and
349 // will move to the dest normally on the next
350 // loop iteration.
729e4ab9
A
351 c = UTEXT_CURRENT32(replacement);
352 if (c == U_SENTINEL) {
b75a7d8f
A
353 break;
354 }
57a6839d 355
b75a7d8f
A
356 if (c==0x55/*U*/ || c==0x75/*u*/) {
357 // We have a \udddd or \Udddddddd escape sequence.
729e4ab9
A
358 int32_t offset = 0;
359 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
360 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
b75a7d8f 361 if (escapedChar != (UChar32)0xFFFFFFFF) {
729e4ab9
A
362 if (U_IS_BMP(escapedChar)) {
363 UChar c16 = (UChar)escapedChar;
364 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
365 } else {
366 UChar surrogate[2];
367 surrogate[0] = U16_LEAD(escapedChar);
368 surrogate[1] = U16_TRAIL(escapedChar);
369 if (U_SUCCESS(status)) {
370 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
371 }
372 }
b75a7d8f
A
373 // TODO: Report errors for mal-formed \u escapes?
374 // As this is, the original sequence is output, which may be OK.
729e4ab9 375 if (context.lastOffset == offset) {
4388f060 376 (void)UTEXT_PREVIOUS32(replacement);
729e4ab9
A
377 } else if (context.lastOffset != offset-1) {
378 utext_moveIndex32(replacement, offset - context.lastOffset - 1);
379 }
380 }
381 } else {
4388f060 382 (void)UTEXT_NEXT32(replacement);
729e4ab9
A
383 // Plain backslash escape. Just put out the escaped character.
384 if (U_IS_BMP(c)) {
385 UChar c16 = (UChar)c;
386 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
387 } else {
388 UChar surrogate[2];
389 surrogate[0] = U16_LEAD(c);
390 surrogate[1] = U16_TRAIL(c);
391 if (U_SUCCESS(status)) {
392 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
393 }
b75a7d8f
A
394 }
395 }
729e4ab9 396 } else if (c != DOLLARSIGN) {
b75a7d8f 397 // Normal char, not a $. Copy it out without further checks.
729e4ab9
A
398 if (U_IS_BMP(c)) {
399 UChar c16 = (UChar)c;
400 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
401 } else {
402 UChar surrogate[2];
403 surrogate[0] = U16_LEAD(c);
404 surrogate[1] = U16_TRAIL(c);
405 if (U_SUCCESS(status)) {
406 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
407 }
b75a7d8f 408 }
729e4ab9 409 } else {
b331163b
A
410 // We've got a $. Pick up a capture group name or number if one follows.
411 // Consume digits so long as the resulting group number <= the number of
412 // number of capture groups in the pattern.
57a6839d 413
729e4ab9 414 int32_t groupNum = 0;
b331163b
A
415 int32_t numDigits = 0;
416 UChar32 nextChar = utext_current32(replacement);
417 if (nextChar == LEFTBRACKET) {
418 // Scan for a Named Capture Group, ${name}.
419 UnicodeString groupName;
420 utext_next32(replacement);
421 while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
422 nextChar = utext_next32(replacement);
423 if (nextChar == U_SENTINEL) {
424 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
425 } else if ((nextChar >= 0x41 && nextChar <= 0x5a) || // A..Z
426 (nextChar >= 0x61 && nextChar <= 0x7a) || // a..z
427 (nextChar >= 0x31 && nextChar <= 0x39)) { // 0..9
428 groupName.append(nextChar);
429 } else if (nextChar == RIGHTBRACKET) {
430 groupNum = uhash_geti(fPattern->fNamedCaptureMap, &groupName);
431 if (groupNum == 0) {
432 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
433 }
434 } else {
435 // Character was something other than a name char or a closing '}'
436 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
437 }
729e4ab9 438 }
b331163b
A
439
440 } else if (u_isdigit(nextChar)) {
441 // $n Scan for a capture group number
442 int32_t numCaptureGroups = fPattern->fGroupMap->size();
443 for (;;) {
444 nextChar = UTEXT_CURRENT32(replacement);
445 if (nextChar == U_SENTINEL) {
446 break;
447 }
448 if (u_isdigit(nextChar) == FALSE) {
449 break;
450 }
451 int32_t nextDigitVal = u_charDigitValue(nextChar);
452 if (groupNum*10 + nextDigitVal > numCaptureGroups) {
453 // Don't consume the next digit if it makes the capture group number too big.
454 if (numDigits == 0) {
455 status = U_INDEX_OUTOFBOUNDS_ERROR;
456 }
457 break;
458 }
459 (void)UTEXT_NEXT32(replacement);
460 groupNum=groupNum*10 + nextDigitVal;
461 ++numDigits;
729e4ab9 462 }
b331163b
A
463 } else {
464 // $ not followed by capture group name or number.
465 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
b75a7d8f 466 }
57a6839d 467
b331163b 468 if (U_SUCCESS(status)) {
729e4ab9 469 destLen += appendGroup(groupNum, dest, status);
b75a7d8f 470 }
b331163b
A
471 } // End of $ capture group handling
472 } // End of per-character loop through the replacement string.
57a6839d 473
b75a7d8f
A
474 return *this;
475}
476
477
478
479//--------------------------------------------------------------------------------
480//
481// appendTail Intended to be used in conjunction with appendReplacement()
482// To the destination string, append everything following
483// the last match position from the input string.
484//
46f4442e
A
485// Note: Match ranges do not affect appendTail or appendReplacement
486//
b75a7d8f
A
487//--------------------------------------------------------------------------------
488UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
729e4ab9
A
489 UErrorCode status = U_ZERO_ERROR;
490 UText resultText = UTEXT_INITIALIZER;
491 utext_openUnicodeString(&resultText, &dest, &status);
57a6839d 492
729e4ab9
A
493 if (U_SUCCESS(status)) {
494 appendTail(&resultText, status);
495 utext_close(&resultText);
496 }
57a6839d 497
729e4ab9
A
498 return dest;
499}
500
501//
502// appendTail, UText mode
503//
504UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
729e4ab9 505 if (U_FAILURE(status)) {
57a6839d 506 return dest;
729e4ab9
A
507 }
508 if (U_FAILURE(fDeferredStatus)) {
509 status = fDeferredStatus;
57a6839d 510 return dest;
729e4ab9 511 }
57a6839d 512
729e4ab9
A
513 if (fInputLength > fAppendPosition) {
514 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
515 int64_t destLen = utext_nativeLength(dest);
57a6839d 516 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
729e4ab9
A
517 (int32_t)(fInputLength-fAppendPosition), &status);
518 } else {
519 int32_t len16;
520 if (UTEXT_USES_U16(fInputText)) {
521 len16 = (int32_t)(fInputLength-fAppendPosition);
522 } else {
523 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
524 status = U_ZERO_ERROR; // buffer overflow
525 }
57a6839d 526
729e4ab9
A
527 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
528 if (inputChars == NULL) {
529 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
530 } else {
57a6839d 531 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
729e4ab9
A
532 int64_t destLen = utext_nativeLength(dest);
533 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
534 uprv_free(inputChars);
535 }
536 }
b75a7d8f
A
537 }
538 return dest;
539}
540
541
542
543//--------------------------------------------------------------------------------
544//
545// end
546//
547//--------------------------------------------------------------------------------
548int32_t RegexMatcher::end(UErrorCode &err) const {
549 return end(0, err);
550}
551
729e4ab9
A
552int64_t RegexMatcher::end64(UErrorCode &err) const {
553 return end64(0, err);
554}
b75a7d8f 555
729e4ab9 556int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
b75a7d8f
A
557 if (U_FAILURE(err)) {
558 return -1;
559 }
560 if (fMatch == FALSE) {
561 err = U_REGEX_INVALID_STATE;
562 return -1;
563 }
564 if (group < 0 || group > fPattern->fGroupMap->size()) {
565 err = U_INDEX_OUTOFBOUNDS_ERROR;
566 return -1;
567 }
729e4ab9 568 int64_t e = -1;
b75a7d8f 569 if (group == 0) {
57a6839d 570 e = fMatchEnd;
b75a7d8f
A
571 } else {
572 // Get the position within the stack frame of the variables for
573 // this capture group.
574 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
575 U_ASSERT(groupOffset < fPattern->fFrameSize);
576 U_ASSERT(groupOffset >= 0);
577 e = fFrame->fExtra[groupOffset + 1];
578 }
57a6839d 579
729e4ab9 580 return e;
b75a7d8f
A
581}
582
729e4ab9
A
583int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
584 return (int32_t)end64(group, err);
585}
b75a7d8f 586
b331163b
A
587//--------------------------------------------------------------------------------
588//
589// findProgressInterrupt This function is called once for each advance in the target
590// string from the find() function, and calls the user progress callback
591// function if there is one installed.
592//
593// Return: TRUE if the find operation is to be terminated.
594// FALSE if the find operation is to continue running.
595//
596//--------------------------------------------------------------------------------
597UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) {
598 if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) {
599 status = U_REGEX_STOPPED_BY_CALLER;
600 return TRUE;
601 }
602 return FALSE;
603}
b75a7d8f
A
604
605//--------------------------------------------------------------------------------
606//
607// find()
608//
609//--------------------------------------------------------------------------------
610UBool RegexMatcher::find() {
b331163b
A
611 if (U_FAILURE(fDeferredStatus)) {
612 return FALSE;
613 }
614 UErrorCode status = U_ZERO_ERROR;
615 UBool result = find(status);
616 return result;
617}
618
619//--------------------------------------------------------------------------------
620//
621// find()
622//
623//--------------------------------------------------------------------------------
624UBool RegexMatcher::find(UErrorCode &status) {
b75a7d8f 625 // Start at the position of the last match end. (Will be zero if the
729e4ab9 626 // matcher has been reset.)
b75a7d8f 627 //
b331163b
A
628 if (U_FAILURE(status)) {
629 return FALSE;
630 }
b75a7d8f 631 if (U_FAILURE(fDeferredStatus)) {
b331163b 632 status = fDeferredStatus;
b75a7d8f
A
633 return FALSE;
634 }
57a6839d 635
729e4ab9 636 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
b331163b 637 return findUsingChunk(status);
729e4ab9 638 }
b75a7d8f 639
729e4ab9 640 int64_t startPos = fMatchEnd;
46f4442e
A
641 if (startPos==0) {
642 startPos = fActiveStart;
643 }
374ca955
A
644
645 if (fMatch) {
646 // Save the position of any previous successful match.
647 fLastMatchEnd = fMatchEnd;
648
649 if (fMatchStart == fMatchEnd) {
650 // Previous match had zero length. Move start position up one position
651 // to avoid sending find() into a loop on zero-length matches.
46f4442e 652 if (startPos >= fActiveLimit) {
374ca955 653 fMatch = FALSE;
46f4442e 654 fHitEnd = TRUE;
374ca955
A
655 return FALSE;
656 }
729e4ab9 657 UTEXT_SETNATIVEINDEX(fInputText, startPos);
4388f060 658 (void)UTEXT_NEXT32(fInputText);
729e4ab9 659 startPos = UTEXT_GETNATIVEINDEX(fInputText);
374ca955
A
660 }
661 } else {
662 if (fLastMatchEnd >= 0) {
663 // A previous find() failed to match. Don't try again.
664 // (without this test, a pattern with a zero-length match
665 // could match again at the end of an input string.)
46f4442e 666 fHitEnd = TRUE;
374ca955
A
667 return FALSE;
668 }
669 }
670
374ca955
A
671
672 // Compute the position in the input string beyond which a match can not begin, because
673 // the minimum length match would extend past the end of the input.
46f4442e
A
674 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
675 // Be aware of possible overflows if making changes here.
729e4ab9
A
676 int64_t testStartLimit;
677 if (UTEXT_USES_U16(fInputText)) {
678 testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
679 if (startPos > testStartLimit) {
680 fMatch = FALSE;
681 fHitEnd = TRUE;
682 return FALSE;
683 }
684 } else {
b331163b
A
685 // We don't know exactly how long the minimum match length is in native characters.
686 // Treat anything > 0 as 1.
687 testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
b75a7d8f
A
688 }
689
b75a7d8f
A
690 UChar32 c;
691 U_ASSERT(startPos >= 0);
692
693 switch (fPattern->fStartType) {
694 case START_NO_INFO:
57a6839d 695 // No optimization was found.
b75a7d8f
A
696 // Try a match at each input position.
697 for (;;) {
b331163b
A
698 MatchAt(startPos, FALSE, status);
699 if (U_FAILURE(status)) {
b75a7d8f
A
700 return FALSE;
701 }
702 if (fMatch) {
703 return TRUE;
704 }
729e4ab9 705 if (startPos >= testStartLimit) {
46f4442e 706 fHitEnd = TRUE;
b75a7d8f
A
707 return FALSE;
708 }
729e4ab9 709 UTEXT_SETNATIVEINDEX(fInputText, startPos);
4388f060 710 (void)UTEXT_NEXT32(fInputText);
729e4ab9 711 startPos = UTEXT_GETNATIVEINDEX(fInputText);
b75a7d8f
A
712 // Note that it's perfectly OK for a pattern to have a zero-length
713 // match at the end of a string, so we must make sure that the loop
729e4ab9 714 // runs with startPos == testStartLimit the last time through.
b331163b 715 if (findProgressInterrupt(startPos, status))
729e4ab9 716 return FALSE;
b75a7d8f
A
717 }
718 U_ASSERT(FALSE);
719
720 case START_START:
721 // Matches are only possible at the start of the input string
722 // (pattern begins with ^ or \A)
46f4442e 723 if (startPos > fActiveStart) {
374ca955 724 fMatch = FALSE;
b75a7d8f
A
725 return FALSE;
726 }
b331163b
A
727 MatchAt(startPos, FALSE, status);
728 if (U_FAILURE(status)) {
b75a7d8f
A
729 return FALSE;
730 }
731 return fMatch;
732
733
734 case START_SET:
735 {
736 // Match may start on any char from a pre-computed set.
737 U_ASSERT(fPattern->fMinMatchLen > 0);
729e4ab9 738 UTEXT_SETNATIVEINDEX(fInputText, startPos);
b75a7d8f 739 for (;;) {
b331163b 740 int64_t pos = startPos;
729e4ab9 741 c = UTEXT_NEXT32(fInputText);
b331163b 742 startPos = UTEXT_GETNATIVEINDEX(fInputText);
729e4ab9
A
743 // c will be -1 (U_SENTINEL) at end of text, in which case we
744 // skip this next block (so we don't have a negative array index)
745 // and handle end of text in the following block.
746 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
747 (c>=256 && fPattern->fInitialChars->contains(c)))) {
b331163b
A
748 MatchAt(pos, FALSE, status);
749 if (U_FAILURE(status)) {
b75a7d8f
A
750 return FALSE;
751 }
752 if (fMatch) {
753 return TRUE;
754 }
729e4ab9 755 UTEXT_SETNATIVEINDEX(fInputText, pos);
b75a7d8f 756 }
b331163b 757 if (startPos > testStartLimit) {
374ca955 758 fMatch = FALSE;
46f4442e 759 fHitEnd = TRUE;
b75a7d8f
A
760 return FALSE;
761 }
b331163b 762 if (findProgressInterrupt(startPos, status))
729e4ab9 763 return FALSE;
b75a7d8f
A
764 }
765 }
766 U_ASSERT(FALSE);
767
768 case START_STRING:
769 case START_CHAR:
770 {
771 // Match starts on exactly one char.
772 U_ASSERT(fPattern->fMinMatchLen > 0);
773 UChar32 theChar = fPattern->fInitialChar;
729e4ab9 774 UTEXT_SETNATIVEINDEX(fInputText, startPos);
b75a7d8f 775 for (;;) {
b331163b 776 int64_t pos = startPos;
729e4ab9 777 c = UTEXT_NEXT32(fInputText);
b331163b 778 startPos = UTEXT_GETNATIVEINDEX(fInputText);
b75a7d8f 779 if (c == theChar) {
b331163b
A
780 MatchAt(pos, FALSE, status);
781 if (U_FAILURE(status)) {
b75a7d8f
A
782 return FALSE;
783 }
784 if (fMatch) {
785 return TRUE;
786 }
2ca993e8 787 UTEXT_SETNATIVEINDEX(fInputText, startPos);
b75a7d8f 788 }
b331163b 789 if (startPos > testStartLimit) {
374ca955 790 fMatch = FALSE;
46f4442e 791 fHitEnd = TRUE;
b75a7d8f
A
792 return FALSE;
793 }
b331163b 794 if (findProgressInterrupt(startPos, status))
729e4ab9
A
795 return FALSE;
796 }
b75a7d8f
A
797 }
798 U_ASSERT(FALSE);
799
800 case START_LINE:
801 {
802 UChar32 c;
46f4442e 803 if (startPos == fAnchorStart) {
b331163b
A
804 MatchAt(startPos, FALSE, status);
805 if (U_FAILURE(status)) {
b75a7d8f
A
806 return FALSE;
807 }
808 if (fMatch) {
809 return TRUE;
810 }
729e4ab9
A
811 UTEXT_SETNATIVEINDEX(fInputText, startPos);
812 c = UTEXT_NEXT32(fInputText);
813 startPos = UTEXT_GETNATIVEINDEX(fInputText);
814 } else {
815 UTEXT_SETNATIVEINDEX(fInputText, startPos);
816 c = UTEXT_PREVIOUS32(fInputText);
817 UTEXT_SETNATIVEINDEX(fInputText, startPos);
b75a7d8f
A
818 }
819
46f4442e 820 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
729e4ab9 821 for (;;) {
46f4442e 822 if (c == 0x0a) {
b331163b
A
823 MatchAt(startPos, FALSE, status);
824 if (U_FAILURE(status)) {
46f4442e
A
825 return FALSE;
826 }
827 if (fMatch) {
828 return TRUE;
829 }
729e4ab9 830 UTEXT_SETNATIVEINDEX(fInputText, startPos);
46f4442e 831 }
729e4ab9 832 if (startPos >= testStartLimit) {
46f4442e
A
833 fMatch = FALSE;
834 fHitEnd = TRUE;
835 return FALSE;
836 }
729e4ab9
A
837 c = UTEXT_NEXT32(fInputText);
838 startPos = UTEXT_GETNATIVEINDEX(fInputText);
46f4442e
A
839 // Note that it's perfectly OK for a pattern to have a zero-length
840 // match at the end of a string, so we must make sure that the loop
729e4ab9 841 // runs with startPos == testStartLimit the last time through.
b331163b 842 if (findProgressInterrupt(startPos, status))
729e4ab9 843 return FALSE;
b75a7d8f 844 }
46f4442e
A
845 } else {
846 for (;;) {
b331163b
A
847 if (isLineTerminator(c)) {
848 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
849 (void)UTEXT_NEXT32(fInputText);
850 startPos = UTEXT_GETNATIVEINDEX(fInputText);
851 }
852 MatchAt(startPos, FALSE, status);
853 if (U_FAILURE(status)) {
854 return FALSE;
855 }
856 if (fMatch) {
857 return TRUE;
858 }
859 UTEXT_SETNATIVEINDEX(fInputText, startPos);
46f4442e 860 }
729e4ab9 861 if (startPos >= testStartLimit) {
46f4442e
A
862 fMatch = FALSE;
863 fHitEnd = TRUE;
864 return FALSE;
865 }
729e4ab9
A
866 c = UTEXT_NEXT32(fInputText);
867 startPos = UTEXT_GETNATIVEINDEX(fInputText);
46f4442e
A
868 // Note that it's perfectly OK for a pattern to have a zero-length
869 // match at the end of a string, so we must make sure that the loop
729e4ab9 870 // runs with startPos == testStartLimit the last time through.
b331163b 871 if (findProgressInterrupt(startPos, status))
729e4ab9 872 return FALSE;
b75a7d8f 873 }
b75a7d8f
A
874 }
875 }
876
877 default:
878 U_ASSERT(FALSE);
879 }
880
881 U_ASSERT(FALSE);
882 return FALSE;
883}
884
885
886
729e4ab9 887UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
b75a7d8f
A
888 if (U_FAILURE(status)) {
889 return FALSE;
890 }
891 if (U_FAILURE(fDeferredStatus)) {
892 status = fDeferredStatus;
893 return FALSE;
894 }
46f4442e
A
895 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
896 // This will reset the region to be the full input length.
729e4ab9
A
897 if (start < 0) {
898 status = U_INDEX_OUTOFBOUNDS_ERROR;
899 return FALSE;
900 }
57a6839d 901
729e4ab9
A
902 int64_t nativeStart = start;
903 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
b75a7d8f
A
904 status = U_INDEX_OUTOFBOUNDS_ERROR;
905 return FALSE;
906 }
57a6839d 907 fMatchEnd = nativeStart;
b331163b 908 return find(status);
b75a7d8f
A
909}
910
911
b75a7d8f
A
912//--------------------------------------------------------------------------------
913//
729e4ab9
A
914// findUsingChunk() -- like find(), but with the advance knowledge that the
915// entire string is available in the UText's chunk buffer.
b75a7d8f
A
916//
917//--------------------------------------------------------------------------------
// findUsingChunk: look for the next match, scanning the UText chunk buffer directly.
//
// The scan starts at fMatchEnd (or fActiveStart when fMatchEnd is zero) and is
// bounded by testLen = fActiveLimit - fPattern->fMinMatchLen.  The switch on
// fPattern->fStartType selects which input positions are handed to MatchChunkAt().
//
// Returns TRUE as soon as MatchChunkAt() leaves fMatch set.  Returns FALSE when
// status reports a failure, when findProgressInterrupt() returns true, or when
// the candidate positions are exhausted (fHitEnd is set on most of those exits).
b331163b 918UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
729e4ab9
A
919 // Start at the position of the last match end. (Will be zero if the
920 // matcher has been reset.
921 //
b75a7d8f 922
729e4ab9
A
923 int32_t startPos = (int32_t)fMatchEnd;
924 if (startPos==0) {
925 startPos = (int32_t)fActiveStart;
b75a7d8f 926 }
57a6839d 927
729e4ab9 928 const UChar *inputBuf = fInputText->chunkContents;
b75a7d8f 929
729e4ab9
A
930 if (fMatch) {
931 // Save the position of any previous successful match.
932 fLastMatchEnd = fMatchEnd;
57a6839d 933
729e4ab9
A
934 if (fMatchStart == fMatchEnd) {
935 // Previous match had zero length. Move start position up one position
936 // to avoid sending find() into a loop on zero-length matches.
937 if (startPos >= fActiveLimit) {
938 fMatch = FALSE;
939 fHitEnd = TRUE;
940 return FALSE;
941 }
942 U16_FWD_1(inputBuf, startPos, fInputLength);
943 }
944 } else {
945 if (fLastMatchEnd >= 0) {
946 // A previous find() failed to match. Don't try again.
947 // (without this test, a pattern with a zero-length match
948 // could match again at the end of an input string.)
949 fHitEnd = TRUE;
950 return FALSE;
951 }
b75a7d8f 952 }
57a6839d
A
953
954
729e4ab9
A
955 // Compute the position in the input string beyond which a match can not begin, because
956 // the minimum length match would extend past the end of the input.
957 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
958 // Be aware of possible overflows if making changes here.
b331163b 959 // Note: a match can begin at inputBuf + testLen; it is an inclusive limit.
729e4ab9
A
960 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
961 if (startPos > testLen) {
962 fMatch = FALSE;
963 fHitEnd = TRUE;
b75a7d8f
A
964 return FALSE;
965 }
57a6839d 966
729e4ab9
A
967 UChar32 c;
968 U_ASSERT(startPos >= 0);
57a6839d 969
729e4ab9
A
970 switch (fPattern->fStartType) {
971 case START_NO_INFO:
57a6839d 972 // No optimization was found.
729e4ab9
A
973 // Try a match at each input position.
974 for (;;) {
b331163b
A
975 MatchChunkAt(startPos, FALSE, status);
976 if (U_FAILURE(status)) {
729e4ab9
A
977 return FALSE;
978 }
979 if (fMatch) {
980 return TRUE;
981 }
982 if (startPos >= testLen) {
983 fHitEnd = TRUE;
984 return FALSE;
985 }
986 U16_FWD_1(inputBuf, startPos, fActiveLimit);
987 // Note that it's perfectly OK for a pattern to have a zero-length
988 // match at the end of a string, so we must make sure that the loop
989 // runs with startPos == testLen the last time through.
b331163b 990 if (findProgressInterrupt(startPos, status))
729e4ab9
A
991 return FALSE;
992 }
993 U_ASSERT(FALSE);
57a6839d 994
729e4ab9
A
995 case START_START:
996 // Matches are only possible at the start of the input string
997 // (pattern begins with ^ or \A)
998 if (startPos > fActiveStart) {
999 fMatch = FALSE;
1000 return FALSE;
1001 }
b331163b
A
1002 MatchChunkAt(startPos, FALSE, status);
1003 if (U_FAILURE(status)) {
729e4ab9
A
1004 return FALSE;
1005 }
1006 return fMatch;
57a6839d
A
1007
1008
729e4ab9
A
1009 case START_SET:
1010 {
1011 // Match may start on any char from a pre-computed set.
1012 U_ASSERT(fPattern->fMinMatchLen > 0);
1013 for (;;) {
// Remember where this code point begins; U16_NEXT advances startPos past it.
1014 int32_t pos = startPos;
1015 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1016 if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
1017 (c>=256 && fPattern->fInitialChars->contains(c))) {
b331163b
A
1018 MatchChunkAt(pos, FALSE, status);
1019 if (U_FAILURE(status)) {
729e4ab9
A
1020 return FALSE;
1021 }
1022 if (fMatch) {
1023 return TRUE;
1024 }
1025 }
b331163b 1026 if (startPos > testLen) {
729e4ab9
A
1027 fMatch = FALSE;
1028 fHitEnd = TRUE;
1029 return FALSE;
1030 }
b331163b 1031 if (findProgressInterrupt(startPos, status))
729e4ab9
A
1032 return FALSE;
1033 }
b75a7d8f 1034 }
729e4ab9 1035 U_ASSERT(FALSE);
57a6839d 1036
729e4ab9
A
1037 case START_STRING:
1038 case START_CHAR:
1039 {
1040 // Match starts on exactly one char.
1041 U_ASSERT(fPattern->fMinMatchLen > 0);
1042 UChar32 theChar = fPattern->fInitialChar;
1043 for (;;) {
// Remember where this code point begins; U16_NEXT advances startPos past it.
1044 int32_t pos = startPos;
1045 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1046 if (c == theChar) {
b331163b
A
1047 MatchChunkAt(pos, FALSE, status);
1048 if (U_FAILURE(status)) {
729e4ab9
A
1049 return FALSE;
1050 }
1051 if (fMatch) {
1052 return TRUE;
1053 }
1054 }
b331163b 1055 if (startPos > testLen) {
729e4ab9
A
1056 fMatch = FALSE;
1057 fHitEnd = TRUE;
1058 return FALSE;
1059 }
b331163b 1060 if (findProgressInterrupt(startPos, status))
729e4ab9
A
1061 return FALSE;
1062 }
1063 }
b331163b 1064 U_ASSERT(FALSE);
57a6839d 1065
729e4ab9
A
1066 case START_LINE:
1067 {
// NOTE(review): this declaration shadows the function-level 'c' declared before the switch.
1068 UChar32 c;
1069 if (startPos == fAnchorStart) {
b331163b
A
1070 MatchChunkAt(startPos, FALSE, status);
1071 if (U_FAILURE(status)) {
729e4ab9
A
1072 return FALSE;
1073 }
1074 if (fMatch) {
1075 return TRUE;
1076 }
1077 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1078 }
57a6839d 1079
729e4ab9
A
1080 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1081 for (;;) {
// Examine the code unit just before startPos; a match is attempted only right after a line feed (0x0a).
1082 c = inputBuf[startPos-1];
1083 if (c == 0x0a) {
b331163b
A
1084 MatchChunkAt(startPos, FALSE, status);
1085 if (U_FAILURE(status)) {
729e4ab9
A
1086 return FALSE;
1087 }
1088 if (fMatch) {
1089 return TRUE;
1090 }
1091 }
1092 if (startPos >= testLen) {
1093 fMatch = FALSE;
1094 fHitEnd = TRUE;
1095 return FALSE;
1096 }
1097 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1098 // Note that it's perfectly OK for a pattern to have a zero-length
1099 // match at the end of a string, so we must make sure that the loop
1100 // runs with startPos == testLen the last time through.
b331163b 1101 if (findProgressInterrupt(startPos, status))
729e4ab9
A
1102 return FALSE;
1103 }
1104 } else {
1105 for (;;) {
1106 c = inputBuf[startPos-1];
b331163b 1107 if (isLineTerminator(c)) {
729e4ab9
A
// A CR (0x0d) immediately followed by LF (0x0a) is stepped over as a pair before trying a match.
1108 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1109 startPos++;
1110 }
b331163b
A
1111 MatchChunkAt(startPos, FALSE, status);
1112 if (U_FAILURE(status)) {
729e4ab9
A
1113 return FALSE;
1114 }
1115 if (fMatch) {
1116 return TRUE;
1117 }
1118 }
1119 if (startPos >= testLen) {
1120 fMatch = FALSE;
1121 fHitEnd = TRUE;
1122 return FALSE;
1123 }
1124 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1125 // Note that it's perfectly OK for a pattern to have a zero-length
1126 // match at the end of a string, so we must make sure that the loop
1127 // runs with startPos == testLen the last time through.
b331163b 1128 if (findProgressInterrupt(startPos, status))
729e4ab9
A
1129 return FALSE;
1130 }
1131 }
1132 }
57a6839d 1133
729e4ab9
A
1134 default:
1135 U_ASSERT(FALSE);
1136 }
57a6839d 1137
729e4ab9
A
1138 U_ASSERT(FALSE);
1139 return FALSE;
1140}
1141
1142
1143
1144//--------------------------------------------------------------------------------
1145//
1146// group()
1147//
1148//--------------------------------------------------------------------------------
1149UnicodeString RegexMatcher::group(UErrorCode &status) const {
1150 return group(0, status);
b75a7d8f
A
1151}
1152
729e4ab9
A
1153// Return immutable shallow clone
1154UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1155 return group(0, dest, group_len, status);
1156}
b75a7d8f 1157
729e4ab9
A
1158// Return immutable shallow clone
1159UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1160 group_len = 0;
374ca955 1161 if (U_FAILURE(status)) {
729e4ab9 1162 return dest;
374ca955
A
1163 }
1164 if (U_FAILURE(fDeferredStatus)) {
1165 status = fDeferredStatus;
57a6839d 1166 } else if (fMatch == FALSE) {
729e4ab9 1167 status = U_REGEX_INVALID_STATE;
57a6839d 1168 } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
374ca955 1169 status = U_INDEX_OUTOFBOUNDS_ERROR;
374ca955 1170 }
57a6839d
A
1171
1172 if (U_FAILURE(status)) {
1173 return dest;
729e4ab9 1174 }
57a6839d 1175
729e4ab9
A
1176 int64_t s, e;
1177 if (groupNum == 0) {
1178 s = fMatchStart;
1179 e = fMatchEnd;
1180 } else {
1181 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1182 U_ASSERT(groupOffset < fPattern->fFrameSize);
1183 U_ASSERT(groupOffset >= 0);
1184 s = fFrame->fExtra[groupOffset];
1185 e = fFrame->fExtra[groupOffset+1];
1186 }
1187
1188 if (s < 0) {
1189 // A capture group wasn't part of the match
1190 return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1191 }
1192 U_ASSERT(s <= e);
1193 group_len = e - s;
57a6839d 1194
729e4ab9
A
1195 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1196 if (dest)
1197 UTEXT_SETNATIVEINDEX(dest, s);
1198 return dest;
374ca955
A
1199}
1200
729e4ab9
A
1201UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1202 UnicodeString result;
b331163b
A
1203 int64_t groupStart = start64(groupNum, status);
1204 int64_t groupEnd = end64(groupNum, status);
1205 if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) {
729e4ab9
A
1206 return result;
1207 }
57a6839d 1208
b331163b
A
1209 // Get the group length using a utext_extract preflight.
1210 // UText is actually pretty efficient at this when underlying encoding is UTF-16.
1211 int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &status);
1212 if (status != U_BUFFER_OVERFLOW_ERROR) {
1213 return result;
729e4ab9 1214 }
57a6839d 1215
b331163b
A
1216 status = U_ZERO_ERROR;
1217 UChar *buf = result.getBuffer(length);
1218 if (buf == NULL) {
1219 status = U_MEMORY_ALLOCATION_ERROR;
729e4ab9 1220 } else {
b331163b
A
1221 int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status);
1222 result.releaseBuffer(extractLength);
1223 U_ASSERT(length == extractLength);
729e4ab9 1224 }
b331163b 1225 return result;
b75a7d8f
A
1226}
1227
b331163b 1228
729e4ab9
A
1229//--------------------------------------------------------------------------------
1230//
1231// appendGroup() -- currently internal only, appends a group to a UText rather
1232// than replacing its contents
1233//
1234//--------------------------------------------------------------------------------
b75a7d8f 1235
729e4ab9 1236int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
374ca955 1237 if (U_FAILURE(status)) {
729e4ab9 1238 return 0;
374ca955
A
1239 }
1240 if (U_FAILURE(fDeferredStatus)) {
1241 status = fDeferredStatus;
729e4ab9 1242 return 0;
374ca955 1243 }
729e4ab9 1244 int64_t destLen = utext_nativeLength(dest);
57a6839d 1245
729e4ab9
A
1246 if (fMatch == FALSE) {
1247 status = U_REGEX_INVALID_STATE;
1248 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1249 }
1250 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
374ca955 1251 status = U_INDEX_OUTOFBOUNDS_ERROR;
729e4ab9 1252 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
374ca955 1253 }
57a6839d 1254
729e4ab9
A
1255 int64_t s, e;
1256 if (groupNum == 0) {
1257 s = fMatchStart;
1258 e = fMatchEnd;
1259 } else {
1260 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1261 U_ASSERT(groupOffset < fPattern->fFrameSize);
1262 U_ASSERT(groupOffset >= 0);
1263 s = fFrame->fExtra[groupOffset];
1264 e = fFrame->fExtra[groupOffset+1];
1265 }
57a6839d 1266
729e4ab9 1267 if (s < 0) {
57a6839d 1268 // A capture group wasn't part of the match
729e4ab9
A
1269 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1270 }
1271 U_ASSERT(s <= e);
57a6839d 1272
729e4ab9
A
1273 int64_t deltaLen;
1274 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1275 U_ASSERT(e <= fInputLength);
1276 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1277 } else {
1278 int32_t len16;
1279 if (UTEXT_USES_U16(fInputText)) {
1280 len16 = (int32_t)(e-s);
1281 } else {
1282 UErrorCode lengthStatus = U_ZERO_ERROR;
1283 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1284 }
1285 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1286 if (groupChars == NULL) {
1287 status = U_MEMORY_ALLOCATION_ERROR;
1288 return 0;
1289 }
1290 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
57a6839d 1291
729e4ab9
A
1292 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1293 uprv_free(groupChars);
1294 }
1295 return deltaLen;
374ca955
A
1296}
1297
b75a7d8f
A
1298
1299
46f4442e
A
1300//--------------------------------------------------------------------------------
1301//
729e4ab9 1302// groupCount()
46f4442e
A
1303//
1304//--------------------------------------------------------------------------------
729e4ab9
A
1305int32_t RegexMatcher::groupCount() const {
1306 return fPattern->fGroupMap->size();
b75a7d8f
A
1307}
1308
46f4442e
A
1309//--------------------------------------------------------------------------------
1310//
729e4ab9
A
1311// hasAnchoringBounds()
1312//
1313//--------------------------------------------------------------------------------
1314UBool RegexMatcher::hasAnchoringBounds() const {
1315 return fAnchoringBounds;
1316}
1317
1318
1319//--------------------------------------------------------------------------------
1320//
1321// hasTransparentBounds()
1322//
1323//--------------------------------------------------------------------------------
1324UBool RegexMatcher::hasTransparentBounds() const {
1325 return fTransparentBounds;
1326}
1327
1328
1329
1330//--------------------------------------------------------------------------------
1331//
1332// hitEnd()
1333//
1334//--------------------------------------------------------------------------------
1335UBool RegexMatcher::hitEnd() const {
1336 return fHitEnd;
1337}
1338
1339
1340//--------------------------------------------------------------------------------
1341//
1342// input()
1343//
1344//--------------------------------------------------------------------------------
1345const UnicodeString &RegexMatcher::input() const {
1346 if (!fInput) {
1347 UErrorCode status = U_ZERO_ERROR;
1348 int32_t len16;
1349 if (UTEXT_USES_U16(fInputText)) {
1350 len16 = (int32_t)fInputLength;
1351 } else {
1352 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1353 status = U_ZERO_ERROR; // overflow, length status
1354 }
1355 UnicodeString *result = new UnicodeString(len16, 0, 0);
57a6839d 1356
729e4ab9
A
1357 UChar *inputChars = result->getBuffer(len16);
1358 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1359 result->releaseBuffer(len16);
57a6839d 1360
729e4ab9
A
1361 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1362 }
57a6839d 1363
729e4ab9
A
1364 return *fInput;
1365}
1366
1367//--------------------------------------------------------------------------------
1368//
1369// inputText()
1370//
1371//--------------------------------------------------------------------------------
1372UText *RegexMatcher::inputText() const {
1373 return fInputText;
1374}
1375
1376
1377//--------------------------------------------------------------------------------
1378//
1379// getInput() -- like inputText(), but makes a clone or copies into another UText
1380//
1381//--------------------------------------------------------------------------------
1382UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
729e4ab9
A
1383 if (U_FAILURE(status)) {
1384 return dest;
1385 }
1386 if (U_FAILURE(fDeferredStatus)) {
1387 status = fDeferredStatus;
57a6839d 1388 return dest;
729e4ab9 1389 }
57a6839d 1390
729e4ab9
A
1391 if (dest) {
1392 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1393 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1394 } else {
1395 int32_t input16Len;
1396 if (UTEXT_USES_U16(fInputText)) {
1397 input16Len = (int32_t)fInputLength;
1398 } else {
1399 UErrorCode lengthStatus = U_ZERO_ERROR;
1400 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1401 }
1402 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1403 if (inputChars == NULL) {
1404 return dest;
1405 }
57a6839d 1406
729e4ab9
A
1407 status = U_ZERO_ERROR;
1408 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1409 status = U_ZERO_ERROR;
1410 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
57a6839d 1411
729e4ab9
A
1412 uprv_free(inputChars);
1413 }
1414 return dest;
1415 } else {
1416 return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1417 }
1418}
1419
1420
1421static UBool compat_SyncMutableUTextContents(UText *ut);
1422static UBool compat_SyncMutableUTextContents(UText *ut) {
1423 UBool retVal = FALSE;
57a6839d 1424
729e4ab9
A
1425 // In the following test, we're really only interested in whether the UText should switch
1426 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
1427 // will still point to the correct data.
1428 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1429 UnicodeString *us=(UnicodeString *)ut->context;
57a6839d 1430
729e4ab9
A
1431 // Update to the latest length.
1432 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1433 int32_t newLength = us->length();
57a6839d 1434
729e4ab9
A
1435 // Update the chunk description.
1436 // The buffer may have switched between stack- and heap-based.
1437 ut->chunkContents = us->getBuffer();
1438 ut->chunkLength = newLength;
1439 ut->chunkNativeLimit = newLength;
1440 ut->nativeIndexingLimit = newLength;
1441 retVal = TRUE;
1442 }
1443
1444 return retVal;
1445}
1446
1447//--------------------------------------------------------------------------------
1448//
1449// lookingAt()
1450//
1451//--------------------------------------------------------------------------------
1452UBool RegexMatcher::lookingAt(UErrorCode &status) {
1453 if (U_FAILURE(status)) {
1454 return FALSE;
1455 }
1456 if (U_FAILURE(fDeferredStatus)) {
1457 status = fDeferredStatus;
1458 return FALSE;
1459 }
57a6839d 1460
729e4ab9
A
1461 if (fInputUniStrMaybeMutable) {
1462 if (compat_SyncMutableUTextContents(fInputText)) {
1463 fInputLength = utext_nativeLength(fInputText);
1464 reset();
1465 }
1466 }
1467 else {
1468 resetPreserveRegion();
1469 }
1470 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1471 MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1472 } else {
1473 MatchAt(fActiveStart, FALSE, status);
1474 }
1475 return fMatch;
1476}
1477
1478
1479UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1480 if (U_FAILURE(status)) {
1481 return FALSE;
1482 }
1483 if (U_FAILURE(fDeferredStatus)) {
1484 status = fDeferredStatus;
1485 return FALSE;
1486 }
1487 reset();
57a6839d 1488
729e4ab9
A
1489 if (start < 0) {
1490 status = U_INDEX_OUTOFBOUNDS_ERROR;
1491 return FALSE;
1492 }
57a6839d 1493
729e4ab9
A
1494 if (fInputUniStrMaybeMutable) {
1495 if (compat_SyncMutableUTextContents(fInputText)) {
1496 fInputLength = utext_nativeLength(fInputText);
1497 reset();
1498 }
1499 }
1500
1501 int64_t nativeStart;
1502 nativeStart = start;
1503 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1504 status = U_INDEX_OUTOFBOUNDS_ERROR;
1505 return FALSE;
1506 }
57a6839d 1507
729e4ab9
A
1508 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1509 MatchChunkAt((int32_t)nativeStart, FALSE, status);
1510 } else {
1511 MatchAt(nativeStart, FALSE, status);
1512 }
1513 return fMatch;
1514}
1515
1516
1517
1518//--------------------------------------------------------------------------------
1519//
1520// matches()
1521//
1522//--------------------------------------------------------------------------------
1523UBool RegexMatcher::matches(UErrorCode &status) {
1524 if (U_FAILURE(status)) {
1525 return FALSE;
1526 }
1527 if (U_FAILURE(fDeferredStatus)) {
1528 status = fDeferredStatus;
1529 return FALSE;
1530 }
1531
1532 if (fInputUniStrMaybeMutable) {
1533 if (compat_SyncMutableUTextContents(fInputText)) {
1534 fInputLength = utext_nativeLength(fInputText);
1535 reset();
1536 }
1537 }
1538 else {
1539 resetPreserveRegion();
1540 }
1541
1542 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1543 MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1544 } else {
1545 MatchAt(fActiveStart, TRUE, status);
1546 }
1547 return fMatch;
1548}
1549
1550
1551UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1552 if (U_FAILURE(status)) {
1553 return FALSE;
1554 }
1555 if (U_FAILURE(fDeferredStatus)) {
1556 status = fDeferredStatus;
1557 return FALSE;
1558 }
1559 reset();
57a6839d 1560
729e4ab9
A
1561 if (start < 0) {
1562 status = U_INDEX_OUTOFBOUNDS_ERROR;
1563 return FALSE;
1564 }
1565
1566 if (fInputUniStrMaybeMutable) {
1567 if (compat_SyncMutableUTextContents(fInputText)) {
1568 fInputLength = utext_nativeLength(fInputText);
1569 reset();
1570 }
1571 }
1572
1573 int64_t nativeStart;
1574 nativeStart = start;
1575 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1576 status = U_INDEX_OUTOFBOUNDS_ERROR;
1577 return FALSE;
1578 }
1579
1580 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1581 MatchChunkAt((int32_t)nativeStart, TRUE, status);
1582 } else {
1583 MatchAt(nativeStart, TRUE, status);
1584 }
1585 return fMatch;
1586}
1587
1588
1589
1590//--------------------------------------------------------------------------------
1591//
1592// pattern
1593//
1594//--------------------------------------------------------------------------------
1595const RegexPattern &RegexMatcher::pattern() const {
1596 return *fPattern;
1597}
1598
1599
1600
1601//--------------------------------------------------------------------------------
1602//
1603// region
46f4442e
A
1604//
1605//--------------------------------------------------------------------------------
729e4ab9 1606RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
46f4442e
A
1607 if (U_FAILURE(status)) {
1608 return *this;
1609 }
57a6839d 1610
729e4ab9 1611 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
46f4442e
A
1612 status = U_ILLEGAL_ARGUMENT_ERROR;
1613 }
57a6839d 1614
729e4ab9
A
1615 int64_t nativeStart = regionStart;
1616 int64_t nativeLimit = regionLimit;
1617 if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1618 status = U_ILLEGAL_ARGUMENT_ERROR;
1619 }
1620
1621 if (startIndex == -1)
1622 this->reset();
1623 else
57a6839d
A
1624 resetPreserveRegion();
1625
729e4ab9
A
1626 fRegionStart = nativeStart;
1627 fRegionLimit = nativeLimit;
1628 fActiveStart = nativeStart;
1629 fActiveLimit = nativeLimit;
1630
1631 if (startIndex != -1) {
1632 if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1633 status = U_INDEX_OUTOFBOUNDS_ERROR;
1634 }
57a6839d 1635 fMatchEnd = startIndex;
729e4ab9
A
1636 }
1637
46f4442e 1638 if (!fTransparentBounds) {
729e4ab9
A
1639 fLookStart = nativeStart;
1640 fLookLimit = nativeLimit;
46f4442e
A
1641 }
1642 if (fAnchoringBounds) {
729e4ab9
A
1643 fAnchorStart = nativeStart;
1644 fAnchorLimit = nativeLimit;
46f4442e
A
1645 }
1646 return *this;
1647}
1648
729e4ab9
A
1649RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1650 return region(start, limit, -1, status);
1651}
46f4442e
A
1652
1653//--------------------------------------------------------------------------------
1654//
1655// regionEnd
1656//
1657//--------------------------------------------------------------------------------
1658int32_t RegexMatcher::regionEnd() const {
729e4ab9 1659 return (int32_t)fRegionLimit;
46f4442e
A
1660}
1661
729e4ab9
A
1662int64_t RegexMatcher::regionEnd64() const {
1663 return fRegionLimit;
1664}
46f4442e
A
1665
1666//--------------------------------------------------------------------------------
1667//
1668// regionStart
1669//
1670//--------------------------------------------------------------------------------
1671int32_t RegexMatcher::regionStart() const {
729e4ab9
A
1672 return (int32_t)fRegionStart;
1673}
1674
1675int64_t RegexMatcher::regionStart64() const {
46f4442e
A
1676 return fRegionStart;
1677}
1678
1679
b75a7d8f
A
1680//--------------------------------------------------------------------------------
1681//
1682// replaceAll
1683//
1684//--------------------------------------------------------------------------------
1685UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
729e4ab9
A
1686 UText replacementText = UTEXT_INITIALIZER;
1687 UText resultText = UTEXT_INITIALIZER;
1688 UnicodeString resultString;
1689 if (U_FAILURE(status)) {
1690 return resultString;
1691 }
57a6839d 1692
729e4ab9
A
1693 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1694 utext_openUnicodeString(&resultText, &resultString, &status);
57a6839d 1695
729e4ab9
A
1696 replaceAll(&replacementText, &resultText, status);
1697
1698 utext_close(&resultText);
1699 utext_close(&replacementText);
57a6839d 1700
729e4ab9
A
1701 return resultString;
1702}
1703
1704
1705//
1706// replaceAll, UText mode
1707//
1708UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
b75a7d8f 1709 if (U_FAILURE(status)) {
729e4ab9 1710 return dest;
b75a7d8f
A
1711 }
1712 if (U_FAILURE(fDeferredStatus)) {
1713 status = fDeferredStatus;
729e4ab9 1714 return dest;
b75a7d8f 1715 }
57a6839d 1716
729e4ab9
A
1717 if (dest == NULL) {
1718 UnicodeString emptyString;
1719 UText empty = UTEXT_INITIALIZER;
57a6839d 1720
729e4ab9
A
1721 utext_openUnicodeString(&empty, &emptyString, &status);
1722 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1723 utext_close(&empty);
1724 }
1725
1726 if (U_SUCCESS(status)) {
1727 reset();
1728 while (find()) {
1729 appendReplacement(dest, replacement, status);
1730 if (U_FAILURE(status)) {
1731 break;
1732 }
b75a7d8f 1733 }
729e4ab9 1734 appendTail(dest, status);
b75a7d8f 1735 }
57a6839d 1736
729e4ab9 1737 return dest;
b75a7d8f
A
1738}
1739
1740
b75a7d8f
A
1741//--------------------------------------------------------------------------------
1742//
1743// replaceFirst
1744//
1745//--------------------------------------------------------------------------------
1746UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
729e4ab9
A
1747 UText replacementText = UTEXT_INITIALIZER;
1748 UText resultText = UTEXT_INITIALIZER;
1749 UnicodeString resultString;
57a6839d 1750
729e4ab9
A
1751 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1752 utext_openUnicodeString(&resultText, &resultString, &status);
57a6839d 1753
729e4ab9 1754 replaceFirst(&replacementText, &resultText, status);
57a6839d 1755
729e4ab9
A
1756 utext_close(&resultText);
1757 utext_close(&replacementText);
57a6839d 1758
729e4ab9
A
1759 return resultString;
1760}
1761
1762//
1763// replaceFirst, UText mode
1764//
1765UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
b75a7d8f 1766 if (U_FAILURE(status)) {
729e4ab9 1767 return dest;
b75a7d8f
A
1768 }
1769 if (U_FAILURE(fDeferredStatus)) {
1770 status = fDeferredStatus;
729e4ab9 1771 return dest;
b75a7d8f
A
1772 }
1773
1774 reset();
1775 if (!find()) {
729e4ab9 1776 return getInput(dest, status);
b75a7d8f 1777 }
57a6839d 1778
729e4ab9
A
1779 if (dest == NULL) {
1780 UnicodeString emptyString;
1781 UText empty = UTEXT_INITIALIZER;
57a6839d 1782
729e4ab9
A
1783 utext_openUnicodeString(&empty, &emptyString, &status);
1784 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1785 utext_close(&empty);
1786 }
57a6839d 1787
729e4ab9
A
1788 appendReplacement(dest, replacement, status);
1789 appendTail(dest, status);
57a6839d 1790
729e4ab9 1791 return dest;
b75a7d8f
A
1792}
1793
1794
46f4442e
A
1795//--------------------------------------------------------------------------------
1796//
1797// requireEnd
1798//
1799//--------------------------------------------------------------------------------
1800UBool RegexMatcher::requireEnd() const {
1801 return fRequireEnd;
1802}
1803
b75a7d8f
A
1804
1805//--------------------------------------------------------------------------------
1806//
1807// reset
1808//
1809//--------------------------------------------------------------------------------
1810RegexMatcher &RegexMatcher::reset() {
46f4442e 1811 fRegionStart = 0;
729e4ab9 1812 fRegionLimit = fInputLength;
46f4442e 1813 fActiveStart = 0;
729e4ab9 1814 fActiveLimit = fInputLength;
46f4442e 1815 fAnchorStart = 0;
729e4ab9 1816 fAnchorLimit = fInputLength;
46f4442e 1817 fLookStart = 0;
729e4ab9 1818 fLookLimit = fInputLength;
46f4442e
A
1819 resetPreserveRegion();
1820 return *this;
1821}
1822
1823
1824
1825void RegexMatcher::resetPreserveRegion() {
374ca955
A
1826 fMatchStart = 0;
1827 fMatchEnd = 0;
1828 fLastMatchEnd = -1;
46f4442e 1829 fAppendPosition = 0;
374ca955 1830 fMatch = FALSE;
46f4442e
A
1831 fHitEnd = FALSE;
1832 fRequireEnd = FALSE;
1833 fTime = 0;
1834 fTickCounter = TIMER_INITIAL_VALUE;
729e4ab9 1835 //resetStack(); // more expensive than it looks...
b75a7d8f
A
1836}
1837
1838
b75a7d8f 1839RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
729e4ab9
A
1840 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1841 if (fPattern->fNeedsAltInput) {
1842 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1843 }
b331163b
A
1844 if (U_FAILURE(fDeferredStatus)) {
1845 return *this;
1846 }
729e4ab9 1847 fInputLength = utext_nativeLength(fInputText);
57a6839d 1848
b75a7d8f 1849 reset();
729e4ab9
A
1850 delete fInput;
1851 fInput = NULL;
1852
1853 // Do the following for any UnicodeString.
1854 // This is for compatibility for those clients who modify the input string "live" during regex operations.
57a6839d
A
1855 fInputUniStrMaybeMutable = TRUE;
1856
374ca955 1857 if (fWordBreakItr != NULL) {
729e4ab9
A
1858#if UCONFIG_NO_BREAK_ITERATION==0
1859 UErrorCode status = U_ZERO_ERROR;
1860 fWordBreakItr->setText(fInputText, status);
1861#endif
374ca955 1862 }
b75a7d8f
A
1863 return *this;
1864}
1865
b75a7d8f 1866
729e4ab9
A
1867RegexMatcher &RegexMatcher::reset(UText *input) {
1868 if (fInputText != input) {
1869 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1870 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
b331163b
A
1871 if (U_FAILURE(fDeferredStatus)) {
1872 return *this;
1873 }
729e4ab9 1874 fInputLength = utext_nativeLength(fInputText);
57a6839d 1875
729e4ab9
A
1876 delete fInput;
1877 fInput = NULL;
57a6839d 1878
729e4ab9
A
1879 if (fWordBreakItr != NULL) {
1880#if UCONFIG_NO_BREAK_ITERATION==0
1881 UErrorCode status = U_ZERO_ERROR;
1882 fWordBreakItr->setText(input, status);
1883#endif
1884 }
1885 }
1886 reset();
1887 fInputUniStrMaybeMutable = FALSE;
1888
1889 return *this;
1890}
1891
1892/*RegexMatcher &RegexMatcher::reset(const UChar *) {
1893 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1894 return *this;
1895}*/
1896
1897RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1898 if (U_FAILURE(status)) {
374ca955 1899 return *this;
b75a7d8f 1900 }
46f4442e 1901 reset(); // Reset also resets the region to be the entire string.
57a6839d 1902
729e4ab9 1903 if (position < 0 || position > fActiveLimit) {
374ca955
A
1904 status = U_INDEX_OUTOFBOUNDS_ERROR;
1905 return *this;
1906 }
1907 fMatchEnd = position;
1908 return *this;
b75a7d8f
A
1909}
1910
1911
4388f060
A
1912//--------------------------------------------------------------------------------
1913//
1914// refresh
1915//
1916//--------------------------------------------------------------------------------
1917RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1918 if (U_FAILURE(status)) {
1919 return *this;
1920 }
1921 if (input == NULL) {
1922 status = U_ILLEGAL_ARGUMENT_ERROR;
1923 return *this;
1924 }
1925 if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1926 status = U_ILLEGAL_ARGUMENT_ERROR;
1927 return *this;
1928 }
1929 int64_t pos = utext_getNativeIndex(fInputText);
1930 // Shallow read-only clone of the new UText into the existing input UText
1931 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1932 if (U_FAILURE(status)) {
1933 return *this;
1934 }
1935 utext_setNativeIndex(fInputText, pos);
1936
1937 if (fAltInputText != NULL) {
1938 pos = utext_getNativeIndex(fAltInputText);
1939 fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
1940 if (U_FAILURE(status)) {
1941 return *this;
1942 }
1943 utext_setNativeIndex(fAltInputText, pos);
1944 }
1945 return *this;
1946}
b75a7d8f 1947
374ca955
A
1948
1949
b75a7d8f
A
1950//--------------------------------------------------------------------------------
1951//
1952// setTrace
1953//
1954//--------------------------------------------------------------------------------
1955void RegexMatcher::setTrace(UBool state) {
1956 fTraceDebug = state;
1957}
1958
1959
1960
b331163b
A
1961/**
1962 * UText, replace entire contents of the destination UText with a substring of the source UText.
1963 *
1964 * @param src The source UText
1965 * @param dest The destination UText. Must be writable.
1966 * May be NULL, in which case a new UText will be allocated.
1967 * @param start Start index of source substring.
1968 * @param limit Limit index of source substring.
1969 * @param status An error code.
1970 */
1971static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) {
1972 if (U_FAILURE(*status)) {
1973 return dest;
1974 }
1975 if (start == limit) {
1976 if (dest) {
1977 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status);
1978 return dest;
1979 } else {
1980 return utext_openUChars(NULL, NULL, 0, status);
1981 }
1982 }
1983 int32_t length = utext_extract(src, start, limit, NULL, 0, status);
1984 if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
1985 return dest;
1986 }
1987 *status = U_ZERO_ERROR;
1988 MaybeStackArray<UChar, 40> buffer;
1989 if (length >= buffer.getCapacity()) {
1990 UChar *newBuf = buffer.resize(length+1); // Leave space for terminating Nul.
1991 if (newBuf == NULL) {
1992 *status = U_MEMORY_ALLOCATION_ERROR;
1993 }
1994 }
1995 utext_extract(src, start, limit, buffer.getAlias(), length+1, status);
1996 if (dest) {
1997 utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status);
1998 return dest;
1999 }
2000
2001 // Caller did not provide a prexisting UText.
2002 // Open a new one, and have it adopt the text buffer storage.
2003 if (U_FAILURE(*status)) {
2004 return NULL;
2005 }
2006 int32_t ownedLength = 0;
2007 UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);
2008 if (ownedBuf == NULL) {
2009 *status = U_MEMORY_ALLOCATION_ERROR;
2010 return NULL;
2011 }
2012 UText *result = utext_openUChars(NULL, ownedBuf, length, status);
2013 if (U_FAILURE(*status)) {
2014 uprv_free(ownedBuf);
2015 return NULL;
2016 }
2017 result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT);
2018 return result;
2019}
2020
2021
b75a7d8f
A
2022//---------------------------------------------------------------------
2023//
2024// split
2025//
2026//---------------------------------------------------------------------
2027int32_t RegexMatcher::split(const UnicodeString &input,
2028 UnicodeString dest[],
2029 int32_t destCapacity,
729e4ab9
A
2030 UErrorCode &status)
2031{
2032 UText inputText = UTEXT_INITIALIZER;
2033 utext_openConstUnicodeString(&inputText, &input, &status);
2034 if (U_FAILURE(status)) {
2035 return 0;
2036 }
2037
2038 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2039 if (destText == NULL) {
2040 status = U_MEMORY_ALLOCATION_ERROR;
2041 return 0;
2042 }
2043 int32_t i;
2044 for (i = 0; i < destCapacity; i++) {
2045 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2046 }
57a6839d 2047
729e4ab9 2048 int32_t fieldCount = split(&inputText, destText, destCapacity, status);
57a6839d 2049
729e4ab9
A
2050 for (i = 0; i < destCapacity; i++) {
2051 utext_close(destText[i]);
2052 }
2053
2054 uprv_free(destText);
2055 utext_close(&inputText);
2056 return fieldCount;
2057}
2058
2059//
2060// split, UText mode
2061//
2062int32_t RegexMatcher::split(UText *input,
2063 UText *dest[],
2064 int32_t destCapacity,
2065 UErrorCode &status)
b75a7d8f
A
2066{
2067 //
2068 // Check arguements for validity
2069 //
2070 if (U_FAILURE(status)) {
2071 return 0;
2072 };
2073
2074 if (destCapacity < 1) {
2075 status = U_ILLEGAL_ARGUMENT_ERROR;
2076 return 0;
2077 }
2078
b75a7d8f
A
2079 //
2080 // Reset for the input text
2081 //
2082 reset(input);
729e4ab9 2083 int64_t nextOutputStringStart = 0;
46f4442e 2084 if (fActiveLimit == 0) {
b75a7d8f
A
2085 return 0;
2086 }
2087
b75a7d8f
A
2088 //
2089 // Loop through the input text, searching for the delimiter pattern
2090 //
73c04bcf 2091 int32_t i;
b75a7d8f
A
2092 int32_t numCaptureGroups = fPattern->fGroupMap->size();
2093 for (i=0; ; i++) {
2094 if (i>=destCapacity-1) {
2095 // There is one or zero output string left.
2096 // Fill the last output string with whatever is left from the input, then exit the loop.
729e4ab9 2097 // ( i will be == destCapacity if we filled the output array while processing
b75a7d8f
A
2098 // capture groups of the delimiter expression, in which case we will discard the
2099 // last capture group saved in favor of the unprocessed remainder of the
2100 // input string.)
2101 i = destCapacity-1;
729e4ab9
A
2102 if (fActiveLimit > nextOutputStringStart) {
2103 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2104 if (dest[i]) {
57a6839d
A
2105 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2106 input->chunkContents+nextOutputStringStart,
729e4ab9
A
2107 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2108 } else {
2109 UText remainingText = UTEXT_INITIALIZER;
57a6839d 2110 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
729e4ab9
A
2111 fActiveLimit-nextOutputStringStart, &status);
2112 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2113 utext_close(&remainingText);
2114 }
2115 } else {
2116 UErrorCode lengthStatus = U_ZERO_ERROR;
57a6839d 2117 int32_t remaining16Length =
729e4ab9
A
2118 utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2119 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2120 if (remainingChars == NULL) {
2121 status = U_MEMORY_ALLOCATION_ERROR;
2122 break;
2123 }
2124
2125 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2126 if (dest[i]) {
2127 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2128 } else {
2129 UText remainingText = UTEXT_INITIALIZER;
2130 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2131 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2132 utext_close(&remainingText);
2133 }
57a6839d 2134
729e4ab9
A
2135 uprv_free(remainingChars);
2136 }
b75a7d8f
A
2137 }
2138 break;
2139 }
2140 if (find()) {
2141 // We found another delimiter. Move everything from where we started looking
2142 // up until the start of the delimiter into the next output string.
729e4ab9
A
2143 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2144 if (dest[i]) {
57a6839d
A
2145 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2146 input->chunkContents+nextOutputStringStart,
729e4ab9
A
2147 (int32_t)(fMatchStart-nextOutputStringStart), &status);
2148 } else {
2149 UText remainingText = UTEXT_INITIALIZER;
57a6839d 2150 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
729e4ab9
A
2151 fMatchStart-nextOutputStringStart, &status);
2152 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2153 utext_close(&remainingText);
2154 }
2155 } else {
2156 UErrorCode lengthStatus = U_ZERO_ERROR;
2157 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2158 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2159 if (remainingChars == NULL) {
2160 status = U_MEMORY_ALLOCATION_ERROR;
2161 break;
2162 }
2163 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2164 if (dest[i]) {
2165 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2166 } else {
2167 UText remainingText = UTEXT_INITIALIZER;
2168 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2169 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2170 utext_close(&remainingText);
2171 }
57a6839d 2172
729e4ab9
A
2173 uprv_free(remainingChars);
2174 }
b75a7d8f
A
2175 nextOutputStringStart = fMatchEnd;
2176
2177 // If the delimiter pattern has capturing parentheses, the captured
2178 // text goes out into the next n destination strings.
2179 int32_t groupNum;
2180 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
4388f060
A
2181 if (i >= destCapacity-2) {
2182 // Never fill the last available output string with capture group text.
2183 // It will filled with the last field, the remainder of the
2184 // unsplit input text.
b75a7d8f
A
2185 break;
2186 }
2187 i++;
b331163b
A
2188 dest[i] = utext_extract_replace(fInputText, dest[i],
2189 start64(groupNum, status), end64(groupNum, status), &status);
b75a7d8f
A
2190 }
2191
46f4442e 2192 if (nextOutputStringStart == fActiveLimit) {
4388f060
A
2193 // The delimiter was at the end of the string. We're done, but first
2194 // we output one last empty string, for the empty field following
2195 // the delimiter at the end of input.
2196 if (i+1 < destCapacity) {
2197 ++i;
2198 if (dest[i] == NULL) {
2199 dest[i] = utext_openUChars(NULL, NULL, 0, &status);
2200 } else {
2201 static UChar emptyString[] = {(UChar)0};
2202 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2203 }
729e4ab9 2204 }
4388f060 2205 break;
57a6839d
A
2206
2207 }
b75a7d8f
A
2208 }
2209 else
2210 {
2211 // We ran off the end of the input while looking for the next delimiter.
2212 // All the remaining text goes into the current output string.
729e4ab9
A
2213 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2214 if (dest[i]) {
57a6839d
A
2215 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2216 input->chunkContents+nextOutputStringStart,
729e4ab9
A
2217 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2218 } else {
2219 UText remainingText = UTEXT_INITIALIZER;
57a6839d 2220 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
729e4ab9
A
2221 fActiveLimit-nextOutputStringStart, &status);
2222 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2223 utext_close(&remainingText);
2224 }
2225 } else {
2226 UErrorCode lengthStatus = U_ZERO_ERROR;
2227 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2228 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2229 if (remainingChars == NULL) {
2230 status = U_MEMORY_ALLOCATION_ERROR;
2231 break;
2232 }
57a6839d 2233
729e4ab9
A
2234 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2235 if (dest[i]) {
2236 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2237 } else {
2238 UText remainingText = UTEXT_INITIALIZER;
2239 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2240 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2241 utext_close(&remainingText);
2242 }
57a6839d 2243
729e4ab9
A
2244 uprv_free(remainingChars);
2245 }
b75a7d8f
A
2246 break;
2247 }
729e4ab9
A
2248 if (U_FAILURE(status)) {
2249 break;
2250 }
2251 } // end of for loop
b75a7d8f
A
2252 return i+1;
2253}
2254
2255
b75a7d8f
A
2256//--------------------------------------------------------------------------------
2257//
2258// start
2259//
2260//--------------------------------------------------------------------------------
2261int32_t RegexMatcher::start(UErrorCode &status) const {
2262 return start(0, status);
2263}
2264
729e4ab9
A
2265int64_t RegexMatcher::start64(UErrorCode &status) const {
2266 return start64(0, status);
2267}
b75a7d8f 2268
46f4442e
A
2269//--------------------------------------------------------------------------------
2270//
2271// start(int32_t group, UErrorCode &status)
2272//
2273//--------------------------------------------------------------------------------
729e4ab9
A
2274
2275int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
b75a7d8f
A
2276 if (U_FAILURE(status)) {
2277 return -1;
2278 }
2279 if (U_FAILURE(fDeferredStatus)) {
2280 status = fDeferredStatus;
2281 return -1;
2282 }
2283 if (fMatch == FALSE) {
2284 status = U_REGEX_INVALID_STATE;
2285 return -1;
2286 }
2287 if (group < 0 || group > fPattern->fGroupMap->size()) {
2288 status = U_INDEX_OUTOFBOUNDS_ERROR;
2289 return -1;
2290 }
729e4ab9 2291 int64_t s;
b75a7d8f 2292 if (group == 0) {
57a6839d 2293 s = fMatchStart;
b75a7d8f
A
2294 } else {
2295 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2296 U_ASSERT(groupOffset < fPattern->fFrameSize);
2297 U_ASSERT(groupOffset >= 0);
2298 s = fFrame->fExtra[groupOffset];
2299 }
57a6839d 2300
b75a7d8f
A
2301 return s;
2302}
2303
2304
729e4ab9
A
2305int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2306 return (int32_t)start64(group, status);
2307}
b75a7d8f 2308
46f4442e
A
2309//--------------------------------------------------------------------------------
2310//
2311// useAnchoringBounds
2312//
2313//--------------------------------------------------------------------------------
2314RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2315 fAnchoringBounds = b;
729e4ab9
A
2316 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2317 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
46f4442e
A
2318 return *this;
2319}
2320
2321
2322//--------------------------------------------------------------------------------
2323//
2324// useTransparentBounds
2325//
2326//--------------------------------------------------------------------------------
2327RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2328 fTransparentBounds = b;
729e4ab9
A
2329 fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2330 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
46f4442e
A
2331 return *this;
2332}
2333
2334//--------------------------------------------------------------------------------
2335//
2336// setTimeLimit
2337//
2338//--------------------------------------------------------------------------------
2339void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2340 if (U_FAILURE(status)) {
2341 return;
2342 }
2343 if (U_FAILURE(fDeferredStatus)) {
2344 status = fDeferredStatus;
2345 return;
2346 }
2347 if (limit < 0) {
2348 status = U_ILLEGAL_ARGUMENT_ERROR;
2349 return;
2350 }
2351 fTimeLimit = limit;
2352}
2353
2354
2355//--------------------------------------------------------------------------------
2356//
2357// getTimeLimit
2358//
2359//--------------------------------------------------------------------------------
2360int32_t RegexMatcher::getTimeLimit() const {
2361 return fTimeLimit;
2362}
2363
2364
2365//--------------------------------------------------------------------------------
2366//
2367// setStackLimit
2368//
2369//--------------------------------------------------------------------------------
2370void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2371 if (U_FAILURE(status)) {
2372 return;
2373 }
2374 if (U_FAILURE(fDeferredStatus)) {
2375 status = fDeferredStatus;
2376 return;
2377 }
2378 if (limit < 0) {
2379 status = U_ILLEGAL_ARGUMENT_ERROR;
2380 return;
2381 }
57a6839d 2382
46f4442e 2383 // Reset the matcher. This is needed here in case there is a current match
57a6839d 2384 // whose final stack frame (containing the match results, pointed to by fFrame)
46f4442e
A
2385 // would be lost by resizing to a smaller stack size.
2386 reset();
57a6839d 2387
46f4442e
A
2388 if (limit == 0) {
2389 // Unlimited stack expansion
2390 fStack->setMaxCapacity(0);
2391 } else {
2392 // Change the units of the limit from bytes to ints, and bump the size up
57a6839d 2393 // to be big enough to hold at least one stack frame for the pattern,
46f4442e
A
2394 // if it isn't there already.
2395 int32_t adjustedLimit = limit / sizeof(int32_t);
2396 if (adjustedLimit < fPattern->fFrameSize) {
2397 adjustedLimit = fPattern->fFrameSize;
2398 }
2399 fStack->setMaxCapacity(adjustedLimit);
2400 }
2401 fStackLimit = limit;
2402}
2403
2404
2405//--------------------------------------------------------------------------------
2406//
2407// getStackLimit
2408//
2409//--------------------------------------------------------------------------------
2410int32_t RegexMatcher::getStackLimit() const {
2411 return fStackLimit;
2412}
2413
2414
2415//--------------------------------------------------------------------------------
2416//
2417// setMatchCallback
2418//
2419//--------------------------------------------------------------------------------
2420void RegexMatcher::setMatchCallback(URegexMatchCallback *callback,
2421 const void *context,
2422 UErrorCode &status) {
729e4ab9
A
2423 if (U_FAILURE(status)) {
2424 return;
2425 }
2426 fCallbackFn = callback;
2427 fCallbackContext = context;
46f4442e
A
2428}
2429
2430
2431//--------------------------------------------------------------------------------
2432//
2433// getMatchCallback
2434//
2435//--------------------------------------------------------------------------------
2436void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback,
2437 const void *&context,
2438 UErrorCode &status) {
2439 if (U_FAILURE(status)) {
2440 return;
2441 }
2442 callback = fCallbackFn;
2443 context = fCallbackContext;
2444}
2445
2446
729e4ab9
A
2447//--------------------------------------------------------------------------------
2448//
2449// setFindProgressCallback
2450//
2451//--------------------------------------------------------------------------------
2452void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback,
2453 const void *context,
2454 UErrorCode &status) {
2455 if (U_FAILURE(status)) {
2456 return;
2457 }
2458 fFindProgressCallbackFn = callback;
2459 fFindProgressCallbackContext = context;
2460}
2461
2462
2463//--------------------------------------------------------------------------------
2464//
2465// getFindProgressCallback
2466//
2467//--------------------------------------------------------------------------------
2468void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback,
2469 const void *&context,
2470 UErrorCode &status) {
2471 if (U_FAILURE(status)) {
2472 return;
2473 }
2474 callback = fFindProgressCallbackFn;
2475 context = fFindProgressCallbackContext;
2476}
2477
2478
374ca955
A
2479//================================================================================
2480//
2481// Code following this point in this file is the internal
2482// Match Engine Implementation.
2483//
2484//================================================================================
2485
2486
2487//--------------------------------------------------------------------------------
2488//
2489// resetStack
2490// Discard any previous contents of the state save stack, and initialize a
57a6839d 2491// new stack frame to all -1. The -1s are needed for capture group limits,
374ca955
A
2492// where they indicate that a group has not yet matched anything.
2493//--------------------------------------------------------------------------------
2494REStackFrame *RegexMatcher::resetStack() {
2495 // Discard any previous contents of the state save stack, and initialize a
729e4ab9
A
2496 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2497 // where they indicate that a group has not yet matched anything.
374ca955
A
2498 fStack->removeAllElements();
2499
729e4ab9 2500 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2ca993e8
A
2501 if(U_FAILURE(fDeferredStatus)) {
2502 return NULL;
2503 }
2504
729e4ab9
A
2505 int32_t i;
2506 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2507 iFrame->fExtra[i] = -1;
2508 }
2509 return iFrame;
2510}
2511
2512
2513
2514//--------------------------------------------------------------------------------
2515//
57a6839d 2516// isWordBoundary
729e4ab9
A
2517// in perl, "xab..cd..", \b is true at positions 0,3,5,7
2518// For us,
2519// If the current char is a combining mark,
2520// \b is FALSE.
2521// Else Scan backwards to the first non-combining char.
2522// We are at a boundary if this char and the original char are
2523// opposite in membership in \w set
2524//
2525// parameters: pos - the current position in the input buffer
2526//
2527// TODO: double-check edge cases at region boundaries.
2528//
2529//--------------------------------------------------------------------------------
2530UBool RegexMatcher::isWordBoundary(int64_t pos) {
2531 UBool isBoundary = FALSE;
2532 UBool cIsWord = FALSE;
57a6839d 2533
729e4ab9
A
2534 if (pos >= fLookLimit) {
2535 fHitEnd = TRUE;
2536 } else {
2537 // Determine whether char c at current position is a member of the word set of chars.
2538 // If we're off the end of the string, behave as though we're not at a word char.
2539 UTEXT_SETNATIVEINDEX(fInputText, pos);
2540 UChar32 c = UTEXT_CURRENT32(fInputText);
2541 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2542 // Current char is a combining one. Not a boundary.
2543 return FALSE;
2544 }
2545 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2546 }
57a6839d 2547
729e4ab9
A
2548 // Back up until we come to a non-combining char, determine whether
2549 // that char is a word char.
2550 UBool prevCIsWord = FALSE;
2551 for (;;) {
2552 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2553 break;
2554 }
2555 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2556 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2557 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2558 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2559 break;
2560 }
2561 }
2562 isBoundary = cIsWord ^ prevCIsWord;
2563 return isBoundary;
2564}
2565
2566UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2567 UBool isBoundary = FALSE;
2568 UBool cIsWord = FALSE;
57a6839d 2569
729e4ab9 2570 const UChar *inputBuf = fInputText->chunkContents;
57a6839d 2571
729e4ab9
A
2572 if (pos >= fLookLimit) {
2573 fHitEnd = TRUE;
2574 } else {
2575 // Determine whether char c at current position is a member of the word set of chars.
2576 // If we're off the end of the string, behave as though we're not at a word char.
2577 UChar32 c;
2578 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2579 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2580 // Current char is a combining one. Not a boundary.
2581 return FALSE;
2582 }
2583 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2584 }
57a6839d 2585
729e4ab9
A
2586 // Back up until we come to a non-combining char, determine whether
2587 // that char is a word char.
2588 UBool prevCIsWord = FALSE;
2589 for (;;) {
2590 if (pos <= fLookStart) {
2591 break;
2592 }
2593 UChar32 prevChar;
2594 U16_PREV(inputBuf, fLookStart, pos, prevChar);
2595 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2596 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2597 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2598 break;
2599 }
2600 }
2601 isBoundary = cIsWord ^ prevCIsWord;
2602 return isBoundary;
2603}
2604
2605//--------------------------------------------------------------------------------
2606//
57a6839d 2607// isUWordBoundary
729e4ab9
A
2608//
2609// Test for a word boundary using RBBI word break.
2610//
2611// parameters: pos - the current position in the input buffer
2612//
2613//--------------------------------------------------------------------------------
2614UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2615 UBool returnVal = FALSE;
2616#if UCONFIG_NO_BREAK_ITERATION==0
57a6839d 2617
729e4ab9
A
2618 // If we haven't yet created a break iterator for this matcher, do it now.
2619 if (fWordBreakItr == NULL) {
57a6839d 2620 fWordBreakItr =
729e4ab9
A
2621 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
2622 if (U_FAILURE(fDeferredStatus)) {
2623 return FALSE;
2624 }
2625 fWordBreakItr->setText(fInputText, fDeferredStatus);
2626 }
2627
2628 if (pos >= fLookLimit) {
2629 fHitEnd = TRUE;
2630 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real"
2631 // words are not boundaries. All non-word chars stand by themselves,
2632 // with word boundaries on both sides.
2633 } else {
2634 if (!UTEXT_USES_U16(fInputText)) {
2635 // !!!: Would like a better way to do this!
2636 UErrorCode status = U_ZERO_ERROR;
2637 pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
2638 }
2639 returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2640 }
2641#endif
2642 return returnVal;
2643}
2644
2645//--------------------------------------------------------------------------------
2646//
2647// IncrementTime This function is called once each TIMER_INITIAL_VALUE state
2648// saves. Increment the "time" counter, and call the
2649// user callback function if there is one installed.
2650//
2651// If the match operation needs to be aborted, either for a time-out
2652// or because the user callback asked for it, just set an error status.
2653// The engine will pick that up and stop in its outer loop.
2654//
2655//--------------------------------------------------------------------------------
2656void RegexMatcher::IncrementTime(UErrorCode &status) {
2657 fTickCounter = TIMER_INITIAL_VALUE;
2658 fTime++;
2659 if (fCallbackFn != NULL) {
2660 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2661 status = U_REGEX_STOPPED_BY_CALLER;
2662 return;
2663 }
2664 }
2665 if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2666 status = U_REGEX_TIME_OUT;
2667 }
2668}
2669
729e4ab9
A
2670//--------------------------------------------------------------------------------
2671//
2672// StateSave
2673// Make a new stack frame, initialized as a copy of the current stack frame.
2674// Set the pattern index in the original stack frame from the operand value
2675// in the opcode. Execution of the engine continues with the state in
2676// the newly created stack frame
2677//
2678// Note that reserveBlock() may grow the stack, resulting in the
2679// whole thing being relocated in memory.
2680//
2681// Parameters:
57a6839d 2682// fp The top frame pointer when called. At return, a new
729e4ab9
A
2683// frame will be present
2684// savePatIdx An index into the compiled pattern. Goes into the original
2685// (not new) frame. If execution ever back-tracks out of the
2686// new frame, this will be where we continue from in the pattern.
2687// Return
2688// The new frame pointer.
2689//
2690//--------------------------------------------------------------------------------
2691inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2ca993e8
A
2692 if (U_FAILURE(status)) {
2693 return fp;
2694 }
57a6839d 2695 // push storage for a new frame.
729e4ab9 2696 int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2ca993e8 2697 if (U_FAILURE(status)) {
729e4ab9
A
2698 // Failure on attempted stack expansion.
2699 // Stack function set some other error code, change it to a more
2700 // specific one for regular expressions.
2701 status = U_REGEX_STACK_OVERFLOW;
2702 // We need to return a writable stack frame, so just return the
2703 // previous frame. The match operation will stop quickly
2704 // because of the error status, after which the frame will never
2705 // be looked at again.
2706 return fp;
2707 }
2708 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack.
57a6839d 2709
729e4ab9
A
2710 // New stack frame = copy of old top frame.
2711 int64_t *source = (int64_t *)fp;
2712 int64_t *dest = newFP;
2713 for (;;) {
2714 *dest++ = *source++;
2715 if (source == newFP) {
2716 break;
2717 }
2718 }
57a6839d 2719
729e4ab9
A
2720 fTickCounter--;
2721 if (fTickCounter <= 0) {
2722 IncrementTime(status); // Re-initializes fTickCounter
2723 }
2724 fp->fPatIdx = savePatIdx;
2725 return (REStackFrame *)newFP;
2726}
2727
2ca993e8
A
#if defined(REGEX_DEBUG)
namespace {
// Debug-only helper: collect every code point of a UText, from index 0 to the
// end, into a UnicodeString.
UnicodeString StringFromUText(UText *ut) {
    UnicodeString collected;
    UChar32 codePoint = utext_next32From(ut, 0);
    while (codePoint != U_SENTINEL) {
        collected.append(codePoint);
        codePoint = UTEXT_NEXT32(ut);
    }
    return collected;
}
}
#endif // REGEX_DEBUG
2739
729e4ab9
A
2740
2741//--------------------------------------------------------------------------------
2742//
2743// MatchAt This is the actual matching engine.
2744//
2745// startIdx: begin matching at this index.
2746// toEnd: if true, match must extend to end of the input region
2747//
2748//--------------------------------------------------------------------------------
2749void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2750 UBool isMatch = FALSE; // True if the we have a match.
57a6839d 2751
729e4ab9
A
2752 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2753
2754 int32_t op; // Operation from the compiled pattern, split into
2755 int32_t opType; // the opcode
2756 int32_t opValue; // and the operand value.
57a6839d
A
2757
2758#ifdef REGEX_RUN_DEBUG
2ca993e8 2759 if (fTraceDebug) {
729e4ab9 2760 printf("MatchAt(startIdx=%ld)\n", startIdx);
2ca993e8
A
2761 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
2762 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
729e4ab9 2763 }
57a6839d 2764#endif
729e4ab9
A
2765
2766 if (U_FAILURE(status)) {
2767 return;
2768 }
2769
2770 // Cache frequently referenced items from the compiled pattern
2771 //
2772 int64_t *pat = fPattern->fCompiledPat->getBuffer();
2773
2774 const UChar *litText = fPattern->fLiteralText.getBuffer();
2775 UVector *sets = fPattern->fSets;
2776
2777 fFrameSize = fPattern->fFrameSize;
2778 REStackFrame *fp = resetStack();
2ca993e8
A
2779 if (U_FAILURE(fDeferredStatus)) {
2780 status = fDeferredStatus;
2781 return;
2782 }
729e4ab9
A
2783
2784 fp->fPatIdx = 0;
2785 fp->fInputIdx = startIdx;
2786
2787 // Zero out the pattern's static data
2788 int32_t i;
2789 for (i = 0; i<fPattern->fDataSize; i++) {
2790 fData[i] = 0;
2791 }
2792
2793 //
2794 // Main loop for interpreting the compiled pattern.
2795 // One iteration of the loop per pattern operation performed.
2796 //
2797 for (;;) {
729e4ab9
A
2798 op = (int32_t)pat[fp->fPatIdx];
2799 opType = URX_TYPE(op);
2800 opValue = URX_VAL(op);
57a6839d 2801#ifdef REGEX_RUN_DEBUG
729e4ab9
A
2802 if (fTraceDebug) {
2803 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 2804 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
729e4ab9
A
2805 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2806 fPattern->dumpOp(fp->fPatIdx);
2807 }
57a6839d 2808#endif
729e4ab9 2809 fp->fPatIdx++;
57a6839d 2810
729e4ab9
A
2811 switch (opType) {
2812
2813
2814 case URX_NOP:
2815 break;
2816
2817
2818 case URX_BACKTRACK:
2819 // Force a backtrack. In some circumstances, the pattern compiler
2820 // will notice that the pattern can't possibly match anything, and will
2821 // emit one of these at that point.
2822 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2823 break;
2824
2825
2826 case URX_ONECHAR:
2827 if (fp->fInputIdx < fActiveLimit) {
2828 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2829 UChar32 c = UTEXT_NEXT32(fInputText);
2830 if (c == opValue) {
2831 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2832 break;
2833 }
2834 } else {
2835 fHitEnd = TRUE;
2836 }
729e4ab9
A
2837 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2838 break;
2839
2840
2841 case URX_STRING:
2842 {
2843 // Test input against a literal string.
2844 // Strings require two slots in the compiled pattern, one for the
2845 // offset to the string text, and one for the length.
729e4ab9 2846
4388f060 2847 int32_t stringStartIdx = opValue;
729e4ab9
A
2848 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
2849 fp->fPatIdx++;
2850 opType = URX_TYPE(op);
4388f060 2851 int32_t stringLen = URX_VAL(op);
729e4ab9
A
2852 U_ASSERT(opType == URX_STRING_LEN);
2853 U_ASSERT(stringLen >= 2);
57a6839d 2854
4388f060
A
2855 const UChar *patternString = litText+stringStartIdx;
2856 int32_t patternStringIndex = 0;
729e4ab9 2857 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4388f060
A
2858 UChar32 inputChar;
2859 UChar32 patternChar;
729e4ab9 2860 UBool success = TRUE;
4388f060
A
2861 while (patternStringIndex < stringLen) {
2862 if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
729e4ab9 2863 success = FALSE;
4388f060
A
2864 fHitEnd = TRUE;
2865 break;
2866 }
2867 inputChar = UTEXT_NEXT32(fInputText);
2868 U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
2869 if (patternChar != inputChar) {
2870 success = FALSE;
2871 break;
729e4ab9
A
2872 }
2873 }
57a6839d 2874
729e4ab9
A
2875 if (success) {
2876 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2877 } else {
729e4ab9
A
2878 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2879 }
2880 }
2881 break;
2882
2883
2884 case URX_STATE_SAVE:
2885 fp = StateSave(fp, opValue, status);
2886 break;
2887
2888
2889 case URX_END:
2890 // The match loop will exit via this path on a successful match,
2891 // when we reach the end of the pattern.
2892 if (toEnd && fp->fInputIdx != fActiveLimit) {
2893 // The pattern matched, but not to the end of input. Try some more.
2894 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2895 break;
2896 }
2897 isMatch = TRUE;
2898 goto breakFromLoop;
2899
2900 // Start and End Capture stack frame variables are laid out like this:
2901 // fp->fExtra[opValue] - The start of a completed capture group
2902 // opValue+1 - The end of a completed capture group
2903 // opValue+2 - the start of a capture group whose end
2904 // has not yet been reached (and might not ever be).
2905 case URX_START_CAPTURE:
2906 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2907 fp->fExtra[opValue+2] = fp->fInputIdx;
2908 break;
2909
2910
2911 case URX_END_CAPTURE:
2912 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2913 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
2914 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
2915 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
2916 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2917 break;
2918
2919
2920 case URX_DOLLAR: // $, test for End of line
2921 // or for position before new line at end of input
2922 {
2923 if (fp->fInputIdx >= fAnchorLimit) {
2924 // We really are at the end of input. Success.
2925 fHitEnd = TRUE;
2926 fRequireEnd = TRUE;
2927 break;
2928 }
57a6839d 2929
729e4ab9 2930 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 2931
729e4ab9
A
2932 // If we are positioned just before a new-line that is located at the
2933 // end of input, succeed.
2934 UChar32 c = UTEXT_NEXT32(fInputText);
2935 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
b331163b 2936 if (isLineTerminator(c)) {
729e4ab9 2937 // If not in the middle of a CR/LF sequence
b331163b 2938 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
729e4ab9
A
2939 // At new-line at end of input. Success
2940 fHitEnd = TRUE;
2941 fRequireEnd = TRUE;
57a6839d 2942
729e4ab9
A
2943 break;
2944 }
2945 }
2946 } else {
2947 UChar32 nextC = UTEXT_NEXT32(fInputText);
2948 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2949 fHitEnd = TRUE;
2950 fRequireEnd = TRUE;
2951 break; // At CR/LF at end of input. Success
2952 }
2953 }
2954
2955 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2956 }
2957 break;
2958
2959
2960 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
2961 if (fp->fInputIdx >= fAnchorLimit) {
2962 // Off the end of input. Success.
2963 fHitEnd = TRUE;
2964 fRequireEnd = TRUE;
2965 break;
2966 } else {
2967 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2968 UChar32 c = UTEXT_NEXT32(fInputText);
2969 // Either at the last character of input, or off the end.
2970 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
2971 fHitEnd = TRUE;
2972 fRequireEnd = TRUE;
2973 break;
2974 }
2975 }
2976
2977 // Not at end of input. Back-track out.
2978 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2979 break;
2980
2981
2982 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
2983 {
2984 if (fp->fInputIdx >= fAnchorLimit) {
2985 // We really are at the end of input. Success.
2986 fHitEnd = TRUE;
2987 fRequireEnd = TRUE;
2988 break;
2989 }
2990 // If we are positioned just before a new-line, succeed.
2991 // It makes no difference where the new-line is within the input.
2992 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2993 UChar32 c = UTEXT_CURRENT32(fInputText);
b331163b 2994 if (isLineTerminator(c)) {
729e4ab9
A
2995 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
2996 // In multi-line mode, hitting a new-line just before the end of input does not
2997 // set the hitEnd or requireEnd flags
2998 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
2999 break;
3000 }
3001 }
3002 // not at a new line. Fail.
3003 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3004 }
3005 break;
3006
3007
3008 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
3009 {
3010 if (fp->fInputIdx >= fAnchorLimit) {
3011 // We really are at the end of input. Success.
3012 fHitEnd = TRUE;
3013 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
3014 break; // adding a new-line would not lose the match.
3015 }
3016 // If we are not positioned just before a new-line, the test fails; backtrack out.
3017 // It makes no difference where the new-line is within the input.
3018 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3019 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3020 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3021 }
3022 }
3023 break;
3024
3025
3026 case URX_CARET: // ^, test for start of line
3027 if (fp->fInputIdx != fAnchorStart) {
3028 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3029 }
3030 break;
3031
3032
3033 case URX_CARET_M: // ^, test for start of line in mulit-line mode
3034 {
3035 if (fp->fInputIdx == fAnchorStart) {
3036 // We are at the start of input. Success.
3037 break;
3038 }
3039 // Check whether character just before the current pos is a new-line
3040 // unless we are at the end of input
3041 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 3042 UChar32 c = UTEXT_PREVIOUS32(fInputText);
b331163b 3043 if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
729e4ab9
A
3044 // It's a new-line. ^ is true. Success.
3045 // TODO: what should be done with positions between a CR and LF?
3046 break;
3047 }
3048 // Not at the start of a line. Fail.
3049 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3050 }
3051 break;
3052
3053
3054 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
3055 {
3056 U_ASSERT(fp->fInputIdx >= fAnchorStart);
3057 if (fp->fInputIdx <= fAnchorStart) {
3058 // We are at the start of input. Success.
3059 break;
3060 }
3061 // Check whether character just before the current pos is a new-line
3062 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3063 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3064 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3065 if (c != 0x0a) {
3066 // Not at the start of a line. Back-track out.
3067 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3068 }
3069 }
3070 break;
3071
3072 case URX_BACKSLASH_B: // Test for word boundaries
3073 {
3074 UBool success = isWordBoundary(fp->fInputIdx);
51004dcb 3075 success ^= (UBool)(opValue != 0); // flip sense for \B
729e4ab9
A
3076 if (!success) {
3077 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3078 }
3079 }
3080 break;
3081
3082
3083 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
3084 {
3085 UBool success = isUWordBoundary(fp->fInputIdx);
51004dcb 3086 success ^= (UBool)(opValue != 0); // flip sense for \B
729e4ab9
A
3087 if (!success) {
3088 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3089 }
3090 }
3091 break;
3092
3093
3094 case URX_BACKSLASH_D: // Test for decimal digit
3095 {
3096 if (fp->fInputIdx >= fActiveLimit) {
3097 fHitEnd = TRUE;
3098 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3099 break;
3100 }
3101
3102 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3103
3104 UChar32 c = UTEXT_NEXT32(fInputText);
3105 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
3106 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
51004dcb 3107 success ^= (UBool)(opValue != 0); // flip sense for \D
729e4ab9
A
3108 if (success) {
3109 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3110 } else {
3111 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3112 }
3113 }
3114 break;
3115
3116
3117 case URX_BACKSLASH_G: // Test for position at end of previous match
3118 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3119 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3120 }
3121 break;
3122
3123
b331163b
A
3124 case URX_BACKSLASH_H: // Test for \h, horizontal white space.
3125 {
3126 if (fp->fInputIdx >= fActiveLimit) {
3127 fHitEnd = TRUE;
3128 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3129 break;
3130 }
3131 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3132 UChar32 c = UTEXT_NEXT32(fInputText);
3133 int8_t ctype = u_charType(c);
3134 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
3135 success ^= (UBool)(opValue != 0); // flip sense for \H
3136 if (success) {
3137 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3138 } else {
3139 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3140 }
3141 }
3142 break;
3143
3144
3145 case URX_BACKSLASH_R: // Test for \R, any line break sequence.
3146 {
3147 if (fp->fInputIdx >= fActiveLimit) {
3148 fHitEnd = TRUE;
3149 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3150 break;
3151 }
3152 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3153 UChar32 c = UTEXT_NEXT32(fInputText);
3154 if (isLineTerminator(c)) {
3155 if (c == 0x0d && utext_current32(fInputText) == 0x0a) {
3156 utext_next32(fInputText);
3157 }
3158 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3159 } else {
3160 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3161 }
3162 }
3163 break;
3164
3165
3166 case URX_BACKSLASH_V: // \v, any single line ending character.
3167 {
3168 if (fp->fInputIdx >= fActiveLimit) {
3169 fHitEnd = TRUE;
3170 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3171 break;
3172 }
3173 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3174 UChar32 c = UTEXT_NEXT32(fInputText);
3175 UBool success = isLineTerminator(c);
3176 success ^= (UBool)(opValue != 0); // flip sense for \V
3177 if (success) {
3178 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3179 } else {
3180 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3181 }
3182 }
3183 break;
3184
3185
57a6839d 3186 case URX_BACKSLASH_X:
729e4ab9
A
3187 // Match a Grapheme, as defined by Unicode TR 29.
3188 // Differs slightly from Perl, which consumes combining marks independently
3189 // of context.
3190 {
3191
3192 // Fail if at end of input
3193 if (fp->fInputIdx >= fActiveLimit) {
3194 fHitEnd = TRUE;
3195 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3196 break;
3197 }
57a6839d 3198
729e4ab9
A
3199 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3200
3201 // Examine (and consume) the current char.
3202 // Dispatch into a little state machine, based on the char.
3203 UChar32 c;
3204 c = UTEXT_NEXT32(fInputText);
3205 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3206 UnicodeSet **sets = fPattern->fStaticSets;
3207 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
3208 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
3209 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3210 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3211 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3212 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3213 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3214 goto GC_Extend;
3215
3216
3217
3218GC_L:
3219 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3220 c = UTEXT_NEXT32(fInputText);
3221 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3222 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3223 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3224 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3225 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4388f060 3226 (void)UTEXT_PREVIOUS32(fInputText);
729e4ab9
A
3227 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3228 goto GC_Extend;
3229
3230GC_V:
3231 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3232 c = UTEXT_NEXT32(fInputText);
3233 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3234 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3235 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4388f060 3236 (void)UTEXT_PREVIOUS32(fInputText);
729e4ab9
A
3237 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3238 goto GC_Extend;
3239
3240GC_T:
3241 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3242 c = UTEXT_NEXT32(fInputText);
3243 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3244 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4388f060 3245 (void)UTEXT_PREVIOUS32(fInputText);
729e4ab9
A
3246 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3247 goto GC_Extend;
3248
3249GC_Extend:
3250 // Combining characters are consumed here
3251 for (;;) {
3252 if (fp->fInputIdx >= fActiveLimit) {
3253 break;
3254 }
3255 c = UTEXT_CURRENT32(fInputText);
3256 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3257 break;
3258 }
4388f060 3259 (void)UTEXT_NEXT32(fInputText);
729e4ab9
A
3260 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3261 }
3262 goto GC_Done;
3263
3264GC_Control:
57a6839d 3265 // Most control chars stand alone (don't combine with combining chars),
729e4ab9
A
3266 // except that a CR/LF sequence is a single grapheme cluster.
3267 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
3268 c = UTEXT_NEXT32(fInputText);
3269 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3270 }
3271
3272GC_Done:
3273 if (fp->fInputIdx >= fActiveLimit) {
3274 fHitEnd = TRUE;
3275 }
3276 break;
3277 }
57a6839d 3278
729e4ab9
A
3279
3280
3281
3282 case URX_BACKSLASH_Z: // Test for end of Input
3283 if (fp->fInputIdx < fAnchorLimit) {
3284 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3285 } else {
3286 fHitEnd = TRUE;
3287 fRequireEnd = TRUE;
3288 }
3289 break;
3290
3291
3292
3293 case URX_STATIC_SETREF:
3294 {
3295 // Test input character against one of the predefined sets
3296 // (Word Characters, for example)
3297 // The high bit of the op value is a flag for the match polarity.
3298 // 0: success if input char is in set.
3299 // 1: success if input char is not in set.
3300 if (fp->fInputIdx >= fActiveLimit) {
3301 fHitEnd = TRUE;
3302 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3303 break;
3304 }
3305
57a6839d 3306 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
729e4ab9
A
3307 opValue &= ~URX_NEG_SET;
3308 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3309
3310 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3311 UChar32 c = UTEXT_NEXT32(fInputText);
3312 if (c < 256) {
3313 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3314 if (s8->contains(c)) {
3315 success = !success;
3316 }
3317 } else {
3318 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3319 if (s->contains(c)) {
3320 success = !success;
3321 }
3322 }
3323 if (success) {
3324 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3325 } else {
3326 // the character wasn't in the set.
729e4ab9
A
3327 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3328 }
3329 }
3330 break;
57a6839d 3331
729e4ab9
A
3332
3333 case URX_STAT_SETREF_N:
3334 {
57a6839d 3335 // Test input character for NOT being a member of one of
729e4ab9
A
3336 // the predefined sets (Word Characters, for example)
3337 if (fp->fInputIdx >= fActiveLimit) {
3338 fHitEnd = TRUE;
3339 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3340 break;
3341 }
3342
3343 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3344
3345 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 3346
729e4ab9
A
3347 UChar32 c = UTEXT_NEXT32(fInputText);
3348 if (c < 256) {
3349 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3350 if (s8->contains(c) == FALSE) {
3351 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3352 break;
3353 }
3354 } else {
3355 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3356 if (s->contains(c) == FALSE) {
3357 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3358 break;
3359 }
3360 }
3361 // the character wasn't in the set.
729e4ab9
A
3362 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3363 }
3364 break;
57a6839d 3365
729e4ab9
A
3366
3367 case URX_SETREF:
3368 if (fp->fInputIdx >= fActiveLimit) {
3369 fHitEnd = TRUE;
3370 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3371 break;
3372 } else {
3373 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 3374
729e4ab9
A
3375 // There is input left. Pick up one char and test it for set membership.
3376 UChar32 c = UTEXT_NEXT32(fInputText);
3377 U_ASSERT(opValue > 0 && opValue < sets->size());
3378 if (c<256) {
3379 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3380 if (s8->contains(c)) {
3381 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3382 break;
3383 }
3384 } else {
3385 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3386 if (s->contains(c)) {
3387 // The character is in the set. A Match.
3388 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3389 break;
3390 }
3391 }
57a6839d 3392
729e4ab9 3393 // the character wasn't in the set.
729e4ab9
A
3394 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3395 }
3396 break;
3397
3398
3399 case URX_DOTANY:
3400 {
3401 // . matches anything, but stops at end-of-line.
3402 if (fp->fInputIdx >= fActiveLimit) {
3403 // At end of input. Match failed. Backtrack out.
3404 fHitEnd = TRUE;
3405 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3406 break;
3407 }
57a6839d 3408
729e4ab9 3409 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 3410
729e4ab9
A
3411 // There is input left. Advance over one char, unless we've hit end-of-line
3412 UChar32 c = UTEXT_NEXT32(fInputText);
b331163b 3413 if (isLineTerminator(c)) {
729e4ab9
A
3414 // End of line in normal mode. . does not match.
3415 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3416 break;
3417 }
3418 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3419 }
3420 break;
3421
3422
3423 case URX_DOTANY_ALL:
3424 {
3425 // ., in dot-matches-all (including new lines) mode
3426 if (fp->fInputIdx >= fActiveLimit) {
3427 // At end of input. Match failed. Backtrack out.
3428 fHitEnd = TRUE;
3429 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3430 break;
3431 }
57a6839d 3432
729e4ab9 3433 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 3434
729e4ab9
A
3435 // There is input left. Advance over one char, except if we are
3436 // at a cr/lf, advance over both of them.
57a6839d 3437 UChar32 c;
729e4ab9
A
3438 c = UTEXT_NEXT32(fInputText);
3439 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3440 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3441 // In the case of a CR/LF, we need to advance over both.
3442 UChar32 nextc = UTEXT_CURRENT32(fInputText);
3443 if (nextc == 0x0a) {
4388f060 3444 (void)UTEXT_NEXT32(fInputText);
729e4ab9
A
3445 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3446 }
3447 }
3448 }
3449 break;
3450
3451
3452 case URX_DOTANY_UNIX:
3453 {
3454 // '.' operator, matches all, but stops at end-of-line.
3455 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
3456 if (fp->fInputIdx >= fActiveLimit) {
3457 // At end of input. Match failed. Backtrack out.
3458 fHitEnd = TRUE;
3459 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3460 break;
3461 }
3462
3463 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 3464
729e4ab9
A
3465 // There is input left. Advance over one char, unless we've hit end-of-line
3466 UChar32 c = UTEXT_NEXT32(fInputText);
3467 if (c == 0x0a) {
3468 // End of line in UNIX_LINES mode. '.' does not match the \n
3469 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3470 } else {
3471 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3472 }
3473 }
3474 break;
3475
3476
3477 case URX_JMP:
3478 fp->fPatIdx = opValue;
3479 break;
3480
3481 case URX_FAIL:
3482 isMatch = FALSE;
3483 goto breakFromLoop;
3484
3485 case URX_JMP_SAV:
3486 U_ASSERT(opValue < fPattern->fCompiledPat->size());
3487 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3488 fp->fPatIdx = opValue; // Then JMP.
3489 break;
3490
3491 case URX_JMP_SAV_X:
3492 // This opcode is used with (x)+, when x can match a zero length string.
3493 // Same as JMP_SAV, except conditional on the match having made forward progress.
3494 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3495 // data address of the input position at the start of the loop.
3496 {
3497 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3498 int32_t stoOp = (int32_t)pat[opValue-1];
3499 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3500 int32_t frameLoc = URX_VAL(stoOp);
3501 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3502 int64_t prevInputIdx = fp->fExtra[frameLoc];
3503 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3504 if (prevInputIdx < fp->fInputIdx) {
3505 // The match did make progress. Repeat the loop.
3506 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3507 fp->fPatIdx = opValue;
3508 fp->fExtra[frameLoc] = fp->fInputIdx;
57a6839d 3509 }
729e4ab9
A
3510 // If the input position did not advance, we do nothing here,
3511 // execution will fall out of the loop.
3512 }
3513 break;
3514
3515 case URX_CTR_INIT:
3516 {
3517 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
57a6839d 3518 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
729e4ab9
A
3519
3520 // Pick up the three extra operands that CTR_INIT has, and
57a6839d 3521 // skip the pattern location counter past
729e4ab9
A
3522 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3523 fp->fPatIdx += 3;
3524 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3525 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3526 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3527 U_ASSERT(minCount>=0);
3528 U_ASSERT(maxCount>=minCount || maxCount==-1);
57a6839d 3529 U_ASSERT(loopLoc>=fp->fPatIdx);
729e4ab9
A
3530
3531 if (minCount == 0) {
3532 fp = StateSave(fp, loopLoc+1, status);
3533 }
57a6839d
A
3534 if (maxCount == -1) {
3535 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
3536 } else if (maxCount == 0) {
729e4ab9
A
3537 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3538 }
3539 }
3540 break;
3541
3542 case URX_CTR_LOOP:
3543 {
3544 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3545 int32_t initOp = (int32_t)pat[opValue];
3546 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3547 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3548 int32_t minCount = (int32_t)pat[opValue+2];
3549 int32_t maxCount = (int32_t)pat[opValue+3];
729e4ab9 3550 (*pCounter)++;
57a6839d
A
3551 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3552 U_ASSERT(*pCounter == maxCount);
729e4ab9
A
3553 break;
3554 }
3555 if (*pCounter >= minCount) {
57a6839d
A
3556 if (maxCount == -1) {
3557 // Loop has no hard upper bound.
3558 // Check that it is progressing through the input, break if it is not.
3559 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3560 if (fp->fInputIdx == *pLastInputIdx) {
3561 break;
3562 } else {
3563 *pLastInputIdx = fp->fInputIdx;
3564 }
3565 }
729e4ab9
A
3566 fp = StateSave(fp, fp->fPatIdx, status);
3567 }
3568 fp->fPatIdx = opValue + 4; // Loop back.
3569 }
3570 break;
3571
3572 case URX_CTR_INIT_NG:
3573 {
3574 // Initialize a non-greedy loop
3575 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
57a6839d 3576 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
729e4ab9 3577
57a6839d
A
3578 // Pick up the three extra operands that CTR_INIT_NG has, and
3579 // skip the pattern location counter past
729e4ab9
A
3580 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3581 fp->fPatIdx += 3;
3582 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3583 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3584 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3585 U_ASSERT(minCount>=0);
3586 U_ASSERT(maxCount>=minCount || maxCount==-1);
3587 U_ASSERT(loopLoc>fp->fPatIdx);
57a6839d
A
3588 if (maxCount == -1) {
3589 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
3590 }
729e4ab9
A
3591
3592 if (minCount == 0) {
3593 if (maxCount != 0) {
3594 fp = StateSave(fp, fp->fPatIdx, status);
3595 }
3596 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
57a6839d 3597 }
729e4ab9
A
3598 }
3599 break;
3600
3601 case URX_CTR_LOOP_NG:
3602 {
3603 // Non-greedy {min, max} loops
3604 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3605 int32_t initOp = (int32_t)pat[opValue];
3606 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3607 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3608 int32_t minCount = (int32_t)pat[opValue+2];
3609 int32_t maxCount = (int32_t)pat[opValue+3];
729e4ab9 3610
57a6839d
A
3611 (*pCounter)++;
3612 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
729e4ab9
A
3613 // The loop has matched the maximum permitted number of times.
3614 // Break out of here with no action. Matching will
3615 // continue with the following pattern.
57a6839d 3616 U_ASSERT(*pCounter == maxCount);
729e4ab9
A
3617 break;
3618 }
3619
3620 if (*pCounter < minCount) {
3621 // We haven't met the minimum number of matches yet.
3622 // Loop back for another one.
3623 fp->fPatIdx = opValue + 4; // Loop back.
3624 } else {
3625 // We do have the minimum number of matches.
57a6839d
A
3626
3627 // If there is no upper bound on the loop iterations, check that the input index
3628 // is progressing, and stop the loop if it is not.
3629 if (maxCount == -1) {
3630 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3631 if (fp->fInputIdx == *pLastInputIdx) {
3632 break;
3633 }
3634 *pLastInputIdx = fp->fInputIdx;
3635 }
3636
3637 // Loop Continuation: we will fall into the pattern following the loop
3638 // (non-greedy, don't execute loop body first), but first do
3639 // a state save to the top of the loop, so that a match failure
729e4ab9
A
3640 // in the following pattern will try another iteration of the loop.
3641 fp = StateSave(fp, opValue + 4, status);
3642 }
3643 }
3644 break;
3645
3646 case URX_STO_SP:
3647 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3648 fData[opValue] = fStack->size();
3649 break;
3650
3651 case URX_LD_SP:
3652 {
3653 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3654 int32_t newStackSize = (int32_t)fData[opValue];
3655 U_ASSERT(newStackSize <= fStack->size());
3656 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3657 if (newFP == (int64_t *)fp) {
3658 break;
3659 }
3660 int32_t i;
3661 for (i=0; i<fFrameSize; i++) {
3662 newFP[i] = ((int64_t *)fp)[i];
3663 }
3664 fp = (REStackFrame *)newFP;
3665 fStack->setSize(newStackSize);
3666 }
3667 break;
3668
3669 case URX_BACKREF:
729e4ab9
A
3670 {
3671 U_ASSERT(opValue < fFrameSize);
3672 int64_t groupStartIdx = fp->fExtra[opValue];
3673 int64_t groupEndIdx = fp->fExtra[opValue+1];
3674 U_ASSERT(groupStartIdx <= groupEndIdx);
3675 if (groupStartIdx < 0) {
3676 // This capture group has not participated in the match thus far,
3677 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
729e4ab9
A
3678 break;
3679 }
729e4ab9
A
3680 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3681 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4388f060
A
3682
3683 // Note: if the capture group match was of an empty string the backref
57a6839d 3684 // match succeeds. Verified by testing: Perl matches succeed
4388f060 3685 // in this case, so we do too.
57a6839d 3686
4388f060
A
3687 UBool success = TRUE;
3688 for (;;) {
3689 if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3690 success = TRUE;
3691 break;
3692 }
3693 if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
3694 success = FALSE;
729e4ab9 3695 fHitEnd = TRUE;
4388f060
A
3696 break;
3697 }
3698 UChar32 captureGroupChar = utext_next32(fAltInputText);
3699 UChar32 inputChar = utext_next32(fInputText);
3700 if (inputChar != captureGroupChar) {
3701 success = FALSE;
3702 break;
729e4ab9 3703 }
4388f060
A
3704 }
3705
3706 if (success) {
3707 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3708 } else {
3709 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3710 }
3711 }
3712 break;
3713
3714
3715
3716 case URX_BACKREF_I:
3717 {
3718 U_ASSERT(opValue < fFrameSize);
3719 int64_t groupStartIdx = fp->fExtra[opValue];
3720 int64_t groupEndIdx = fp->fExtra[opValue+1];
3721 U_ASSERT(groupStartIdx <= groupEndIdx);
3722 if (groupStartIdx < 0) {
3723 // This capture group has not participated in the match thus far,
729e4ab9 3724 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
4388f060
A
3725 break;
3726 }
3727 utext_setNativeIndex(fAltInputText, groupStartIdx);
3728 utext_setNativeIndex(fInputText, fp->fInputIdx);
3729 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3730 CaseFoldingUTextIterator inputItr(*fInputText);
3731
3732 // Note: if the capture group match was of an empty string the backref
57a6839d 3733 // match succeeds. Verified by testing: Perl matches succeed
4388f060 3734 // in this case, so we do too.
57a6839d 3735
4388f060
A
3736 UBool success = TRUE;
3737 for (;;) {
3738 if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3739 success = TRUE;
3740 break;
3741 }
3742 if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
3743 success = FALSE;
3744 fHitEnd = TRUE;
3745 break;
3746 }
3747 UChar32 captureGroupChar = captureGroupItr.next();
3748 UChar32 inputChar = inputItr.next();
3749 if (inputChar != captureGroupChar) {
3750 success = FALSE;
3751 break;
3752 }
3753 }
3754
3755 if (success && inputItr.inExpansion()) {
57a6839d
A
3756 // We obtained a match by consuming part of a string obtained from
3757 // case-folding a single code point of the input text.
4388f060
A
3758 // This does not count as an overall match.
3759 success = FALSE;
3760 }
3761
3762 if (success) {
3763 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3764 } else {
3765 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
729e4ab9 3766 }
57a6839d 3767
729e4ab9
A
3768 }
3769 break;
57a6839d 3770
729e4ab9
A
3771 case URX_STO_INP_LOC:
3772 {
3773 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3774 fp->fExtra[opValue] = fp->fInputIdx;
3775 }
3776 break;
3777
3778 case URX_JMPX:
3779 {
3780 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3781 fp->fPatIdx += 1;
3782 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
3783 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3784 int64_t savedInputIdx = fp->fExtra[dataLoc];
3785 U_ASSERT(savedInputIdx <= fp->fInputIdx);
3786 if (savedInputIdx < fp->fInputIdx) {
3787 fp->fPatIdx = opValue; // JMP
3788 } else {
3789 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
3790 }
3791 }
3792 break;
3793
3794 case URX_LA_START:
3795 {
3796 // Entering a lookahead block.
3797 // Save Stack Ptr, Input Pos.
3798 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3799 fData[opValue] = fStack->size();
3800 fData[opValue+1] = fp->fInputIdx;
3801 fActiveStart = fLookStart; // Set the match region change for
3802 fActiveLimit = fLookLimit; // transparent bounds.
3803 }
3804 break;
3805
3806 case URX_LA_END:
3807 {
3808 // Leaving a look-ahead block.
3809 // restore Stack Ptr, Input Pos to positions they had on entry to block.
3810 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3811 int32_t stackSize = fStack->size();
3812 int32_t newStackSize =(int32_t)fData[opValue];
3813 U_ASSERT(stackSize >= newStackSize);
3814 if (stackSize > newStackSize) {
3815 // Copy the current top frame back to the new (cut back) top frame.
3816 // This makes the capture groups from within the look-ahead
3817 // expression available.
3818 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3819 int32_t i;
3820 for (i=0; i<fFrameSize; i++) {
3821 newFP[i] = ((int64_t *)fp)[i];
3822 }
3823 fp = (REStackFrame *)newFP;
3824 fStack->setSize(newStackSize);
3825 }
3826 fp->fInputIdx = fData[opValue+1];
3827
3828 // Restore the active region bounds in the input string; they may have
3829 // been changed because of transparent bounds on a Region.
3830 fActiveStart = fRegionStart;
3831 fActiveLimit = fRegionLimit;
3832 }
3833 break;
3834
3835 case URX_ONECHAR_I:
4388f060
A
3836 // Case insensitive one char. The char from the pattern is already case folded.
3837 // Input text is not, but case folding the input can not reduce two or more code
3838 // points to one.
729e4ab9
A
3839 if (fp->fInputIdx < fActiveLimit) {
3840 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3841
3842 UChar32 c = UTEXT_NEXT32(fInputText);
3843 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3844 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3845 break;
3846 }
3847 } else {
3848 fHitEnd = TRUE;
3849 }
57a6839d 3850
729e4ab9
A
3851 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3852 break;
3853
3854 case URX_STRING_I:
3855 {
4388f060 3856 // Case-insensitive test input against a literal string.
729e4ab9
A
3857 // Strings require two slots in the compiled pattern, one for the
3858 // offset to the string text, and one for the length.
4388f060 3859 // The compiled string has already been case folded.
729e4ab9 3860 {
4388f060
A
3861 const UChar *patternString = litText + opValue;
3862 int32_t patternStringIdx = 0;
729e4ab9
A
3863
3864 op = (int32_t)pat[fp->fPatIdx];
3865 fp->fPatIdx++;
3866 opType = URX_TYPE(op);
3867 opValue = URX_VAL(op);
3868 U_ASSERT(opType == URX_STRING_LEN);
4388f060 3869 int32_t patternStringLen = opValue; // Length of the string from the pattern.
57a6839d
A
3870
3871
4388f060
A
3872 UChar32 cPattern;
3873 UChar32 cText;
3874 UBool success = TRUE;
3875
729e4ab9 3876 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4388f060
A
3877 CaseFoldingUTextIterator inputIterator(*fInputText);
3878 while (patternStringIdx < patternStringLen) {
3879 if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
3880 success = FALSE;
3881 fHitEnd = TRUE;
3882 break;
729e4ab9 3883 }
4388f060
A
3884 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
3885 cText = inputIterator.next();
3886 if (cText != cPattern) {
3887 success = FALSE;
3888 break;
729e4ab9
A
3889 }
3890 }
4388f060
A
3891 if (inputIterator.inExpansion()) {
3892 success = FALSE;
3893 }
3894
3895 if (success) {
3896 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3897 } else {
729e4ab9
A
3898 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3899 }
3900 }
3901 }
3902 break;
3903
3904 case URX_LB_START:
3905 {
3906 // Entering a look-behind block.
3907 // Save Stack Ptr, Input Pos.
3908 // TODO: implement transparent bounds. Ticket #6067
3909 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3910 fData[opValue] = fStack->size();
3911 fData[opValue+1] = fp->fInputIdx;
3912 // Init the variable containing the start index for attempted matches.
3913 fData[opValue+2] = -1;
3914 // Save input string length, then reset to pin any matches to end at
3915 // the current position.
3916 fData[opValue+3] = fActiveLimit;
3917 fActiveLimit = fp->fInputIdx;
3918 }
3919 break;
3920
3921
3922 case URX_LB_CONT:
3923 {
3924 // Positive Look-Behind, at top of loop checking for matches of LB expression
3925 // at all possible input starting positions.
3926
3927 // Fetch the min and max possible match lengths. They are the operands
3928 // of this op in the pattern.
3929 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3930 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
2ca993e8
A
3931 if (!UTEXT_USES_U16(fInputText)) {
3932 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3933 // The max length need not be exact; it just needs to be >= actual maximum.
3934 maxML *= 3;
3935 }
729e4ab9
A
3936 U_ASSERT(minML <= maxML);
3937 U_ASSERT(minML >= 0);
3938
3939 // Fetch (from data) the last input index where a match was attempted.
3940 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2ca993e8
A
3941 int64_t &lbStartIdx = fData[opValue+2];
3942 if (lbStartIdx < 0) {
729e4ab9 3943 // First time through loop.
2ca993e8
A
3944 lbStartIdx = fp->fInputIdx - minML;
3945 if (lbStartIdx > 0) {
3946 // move index to a code point boundary, if it's not on one already.
3947 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3948 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3949 }
729e4ab9
A
3950 } else {
3951 // 2nd through nth time through the loop.
3952 // Back up start position for match by one.
2ca993e8
A
3953 if (lbStartIdx == 0) {
3954 (lbStartIdx)--;
729e4ab9 3955 } else {
2ca993e8 3956 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4388f060 3957 (void)UTEXT_PREVIOUS32(fInputText);
2ca993e8 3958 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
729e4ab9
A
3959 }
3960 }
3961
2ca993e8 3962 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
729e4ab9
A
3963 // We have tried all potential match starting points without
3964 // getting a match. Backtrack out, and out of the
3965 // Look Behind altogether.
3966 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3967 int64_t restoreInputLen = fData[opValue+3];
3968 U_ASSERT(restoreInputLen >= fActiveLimit);
3969 U_ASSERT(restoreInputLen <= fInputLength);
3970 fActiveLimit = restoreInputLen;
3971 break;
3972 }
3973
3974 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3975 // (successful match will fall off the end of the loop.)
3976 fp = StateSave(fp, fp->fPatIdx-3, status);
2ca993e8 3977 fp->fInputIdx = lbStartIdx;
729e4ab9
A
3978 }
3979 break;
3980
3981 case URX_LB_END:
3982 // End of a look-behind block, after a successful match.
3983 {
3984 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3985 if (fp->fInputIdx != fActiveLimit) {
3986 // The look-behind expression matched, but the match did not
3987 // extend all the way to the point that we are looking behind from.
3988 // FAIL out of here, which will take us back to the LB_CONT, which
3989 // will retry the match starting at another position or fail
3990 // the look-behind altogether, whichever is appropriate.
3991 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3992 break;
3993 }
3994
3995 // Look-behind match is good. Restore the original input string length,
57a6839d 3996 // which had been truncated to pin the end of the lookbehind match to the
729e4ab9
A
3997 // position being looked-behind.
3998 int64_t originalInputLen = fData[opValue+3];
3999 U_ASSERT(originalInputLen >= fActiveLimit);
4000 U_ASSERT(originalInputLen <= fInputLength);
4001 fActiveLimit = originalInputLen;
4002 }
4003 break;
4004
4005
4006 case URX_LBN_CONT:
4007 {
4008 // Negative Look-Behind, at top of loop checking for matches of LB expression
4009 // at all possible input starting positions.
4010
4011 // Fetch the extra parameters of this op.
4012 int32_t minML = (int32_t)pat[fp->fPatIdx++];
4013 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
2ca993e8
A
4014 if (!UTEXT_USES_U16(fInputText)) {
4015 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
4016 // The max length need not be exact; it just needs to be >= actual maximum.
4017 maxML *= 3;
4018 }
729e4ab9
A
4019 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
4020 continueLoc = URX_VAL(continueLoc);
4021 U_ASSERT(minML <= maxML);
4022 U_ASSERT(minML >= 0);
4023 U_ASSERT(continueLoc > fp->fPatIdx);
4024
4025 // Fetch (from data) the last input index where a match was attempted.
4026 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2ca993e8
A
4027 int64_t &lbStartIdx = fData[opValue+2];
4028 if (lbStartIdx < 0) {
729e4ab9 4029 // First time through loop.
2ca993e8
A
4030 lbStartIdx = fp->fInputIdx - minML;
4031 if (lbStartIdx > 0) {
4032 // move index to a code point boundary, if it's not on one already.
4033 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4034 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4035 }
729e4ab9
A
4036 } else {
4037 // 2nd through nth time through the loop.
4038 // Back up start position for match by one.
2ca993e8
A
4039 if (lbStartIdx == 0) {
4040 (lbStartIdx)--;
729e4ab9 4041 } else {
2ca993e8 4042 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4388f060 4043 (void)UTEXT_PREVIOUS32(fInputText);
2ca993e8 4044 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
729e4ab9
A
4045 }
4046 }
4047
2ca993e8 4048 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
729e4ab9
A
4049 // We have tried all potential match starting points without
4050 // getting a match, which means that the negative lookbehind as
4051 // a whole has succeeded. Jump forward to the continue location
4052 int64_t restoreInputLen = fData[opValue+3];
4053 U_ASSERT(restoreInputLen >= fActiveLimit);
4054 U_ASSERT(restoreInputLen <= fInputLength);
4055 fActiveLimit = restoreInputLen;
4056 fp->fPatIdx = continueLoc;
4057 break;
4058 }
4059
4060 // Save state to this URX_LBN_CONT op, so failure to match will repeat the loop.
4061 // (successful match will cause a FAIL out of the loop altogether.)
4062 fp = StateSave(fp, fp->fPatIdx-4, status);
2ca993e8 4063 fp->fInputIdx = lbStartIdx;
729e4ab9
A
4064 }
4065 break;
4066
4067 case URX_LBN_END:
4068 // End of a negative look-behind block, after a successful match.
4069 {
4070 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4071 if (fp->fInputIdx != fActiveLimit) {
4072 // The look-behind expression matched, but the match did not
4073 // extend all the way to the point that we are looking behind from.
4074 // FAIL out of here, which will take us back to the LB_CONT, which
4075 // will retry the match starting at another position or succeed
4076 // the look-behind altogether, whichever is appropriate.
4077 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4078 break;
4079 }
4080
4081 // Look-behind expression matched, which means look-behind test as
4082 // a whole Fails
57a6839d
A
4083
4084 // Restore the original input string length, which had been truncated
4085 // in order to pin the end of the lookbehind match
729e4ab9
A
4086 // to the position being looked-behind.
4087 int64_t originalInputLen = fData[opValue+3];
4088 U_ASSERT(originalInputLen >= fActiveLimit);
4089 U_ASSERT(originalInputLen <= fInputLength);
4090 fActiveLimit = originalInputLen;
4091
4092 // Restore original stack position, discarding any state saved
4093 // by the successful pattern match.
4094 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4095 int32_t newStackSize = (int32_t)fData[opValue];
4096 U_ASSERT(fStack->size() > newStackSize);
4097 fStack->setSize(newStackSize);
57a6839d
A
4098
4099 // FAIL, which will take control back to someplace
729e4ab9
A
4100 // prior to entering the look-behind test.
4101 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4102 }
4103 break;
4104
4105
4106 case URX_LOOP_SR_I:
4107 // Loop Initialization for the optimized implementation of
4108 // [some character set]*
4109 // This op scans through all matching input.
4110 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4111 {
4112 U_ASSERT(opValue > 0 && opValue < sets->size());
4113 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4114 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4115
4116 // Loop through input, until either the input is exhausted or
4117 // we reach a character that is not a member of the set.
4118 int64_t ix = fp->fInputIdx;
4119 UTEXT_SETNATIVEINDEX(fInputText, ix);
4120 for (;;) {
4121 if (ix >= fActiveLimit) {
4122 fHitEnd = TRUE;
4123 break;
4124 }
4125 UChar32 c = UTEXT_NEXT32(fInputText);
4126 if (c<256) {
4127 if (s8->contains(c) == FALSE) {
4128 break;
4129 }
4130 } else {
4131 if (s->contains(c) == FALSE) {
4132 break;
4133 }
4134 }
4135 ix = UTEXT_GETNATIVEINDEX(fInputText);
4136 }
4137
4138 // If there were no matching characters, skip over the loop altogether.
4139 // The loop doesn't run at all, a * op always succeeds.
4140 if (ix == fp->fInputIdx) {
4141 fp->fPatIdx++; // skip the URX_LOOP_C op.
4142 break;
4143 }
4144
4145 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4146 // must follow. Its operand is the stack location
4147 // that holds the starting input index for the match of this [set]*
4148 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4149 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4150 int32_t stackLoc = URX_VAL(loopcOp);
4151 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4152 fp->fExtra[stackLoc] = fp->fInputIdx;
729e4ab9
A
4153 fp->fInputIdx = ix;
4154
4155 // Save State to the URX_LOOP_C op that follows this one,
4156 // so that match failures in the following code will return to there.
4157 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4158 fp = StateSave(fp, fp->fPatIdx, status);
4159 fp->fPatIdx++;
4160 }
4161 break;
4162
4163
4164 case URX_LOOP_DOT_I:
4165 // Loop Initialization for the optimized implementation of .*
4166 // This op scans through all remaining input.
4167 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4168 {
4169 // Loop through input until the input is exhausted (we reach an end-of-line)
4170 // In DOTALL mode, we can just go straight to the end of the input.
4171 int64_t ix;
4172 if ((opValue & 1) == 1) {
4173 // Dot-matches-All mode. Jump straight to the end of the string.
4174 ix = fActiveLimit;
4175 fHitEnd = TRUE;
4176 } else {
4177 // NOT DOT ALL mode. Line endings do not match '.'
4178 // Scan forward until a line ending or end of input.
4179 ix = fp->fInputIdx;
4180 UTEXT_SETNATIVEINDEX(fInputText, ix);
4181 for (;;) {
4182 if (ix >= fActiveLimit) {
4183 fHitEnd = TRUE;
4184 break;
4185 }
4186 UChar32 c = UTEXT_NEXT32(fInputText);
4187 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
4188 if ((c == 0x0a) || // 0x0a is newline in both modes.
4189 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
b331163b 4190 isLineTerminator(c))) {
729e4ab9
A
4191 // char is a line ending. Exit the scanning loop.
4192 break;
4193 }
4194 }
4195 ix = UTEXT_GETNATIVEINDEX(fInputText);
4196 }
4197 }
4198
4199 // If there were no matching characters, skip over the loop altogether.
4200 // The loop doesn't run at all, a * op always succeeds.
4201 if (ix == fp->fInputIdx) {
4202 fp->fPatIdx++; // skip the URX_LOOP_C op.
4203 break;
4204 }
4205
4206 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4207 // must follow. Its operand is the stack location
4208 // that holds the starting input index for the match of this .*
4209 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4210 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4211 int32_t stackLoc = URX_VAL(loopcOp);
4212 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4213 fp->fExtra[stackLoc] = fp->fInputIdx;
729e4ab9
A
4214 fp->fInputIdx = ix;
4215
4216 // Save State to the URX_LOOP_C op that follows this one,
4217 // so that match failures in the following code will return to there.
4218 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4219 fp = StateSave(fp, fp->fPatIdx, status);
4220 fp->fPatIdx++;
4221 }
4222 break;
4223
4224
4225 case URX_LOOP_C:
4226 {
4227 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4228 backSearchIndex = fp->fExtra[opValue];
4229 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4230 if (backSearchIndex == fp->fInputIdx) {
4231 // We've backed up the input idx to the point that the loop started.
57a6839d 4232 // The loop is done. Leave here without saving state.
729e4ab9
A
4233 // Subsequent failures won't come back here.
4234 break;
4235 }
4236 // Set up for the next iteration of the loop, with input index
4237 // backed up by one from the last time through,
4238 // and a state save to this instruction in case the following code fails again.
4239 // (We're going backwards because this loop emulates stack unwinding, not
4240 // the initial scan forward.)
4241 U_ASSERT(fp->fInputIdx > 0);
4242 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4243 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4244 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
57a6839d 4245
729e4ab9 4246 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
57a6839d 4247 if (prevC == 0x0a &&
729e4ab9
A
4248 fp->fInputIdx > backSearchIndex &&
4249 twoPrevC == 0x0d) {
4250 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4251 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4252 // .*, stepping back over CRLF pair.
4253 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4254 }
4255 }
4256
374ca955 4257
729e4ab9
A
4258 fp = StateSave(fp, fp->fPatIdx-1, status);
4259 }
4260 break;
374ca955
A
4261
4262
729e4ab9
A
4263
4264 default:
4265 // Trouble. The compiled pattern contains an entry with an
4266 // unrecognized type tag.
4267 U_ASSERT(FALSE);
b75a7d8f 4268 }
729e4ab9
A
4269
4270 if (U_FAILURE(status)) {
4271 isMatch = FALSE;
b75a7d8f
A
4272 break;
4273 }
4274 }
57a6839d 4275
729e4ab9
A
4276breakFromLoop:
4277 fMatch = isMatch;
4278 if (isMatch) {
4279 fLastMatchEnd = fMatchEnd;
4280 fMatchStart = startIdx;
4281 fMatchEnd = fp->fInputIdx;
46f4442e 4282 }
57a6839d
A
4283
4284#ifdef REGEX_RUN_DEBUG
4285 if (fTraceDebug) {
4286 if (isMatch) {
4287 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
4288 } else {
4289 printf("No match\n\n");
46f4442e
A
4290 }
4291 }
57a6839d 4292#endif
46f4442e 4293
729e4ab9
A
4294 fFrame = fp; // The active stack frame when the engine stopped.
4295 // Contains the capture group results that we need to
4296 // access later.
4297 return;
b75a7d8f 4298}
46f4442e
A
4299
4300
b75a7d8f
A
4301//--------------------------------------------------------------------------------
4302//
729e4ab9
A
4303// MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
4304// assumption that the entire string is available in the UText's
4305// chunk buffer. For now, that means we can use int32_t indexes,
4306// except for anything that needs to be saved (like group starts
4307// and ends).
b75a7d8f 4308//
46f4442e
A
4309// startIdx: begin matching at this index.
4310// toEnd: if true, match must extend to end of the input region
4311//
b75a7d8f 4312//--------------------------------------------------------------------------------
729e4ab9 4313void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
b75a7d8f 4314 UBool isMatch = FALSE; // True if the we have a match.
57a6839d 4315
729e4ab9 4316 int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
b75a7d8f
A
4317
4318 int32_t op; // Operation from the compiled pattern, split into
4319 int32_t opType; // the opcode
4320 int32_t opValue; // and the operand value.
57a6839d 4321
729e4ab9 4322#ifdef REGEX_RUN_DEBUG
57a6839d
A
4323 if (fTraceDebug) {
4324 printf("MatchAt(startIdx=%d)\n", startIdx);
2ca993e8
A
4325 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
4326 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
b75a7d8f 4327 }
729e4ab9 4328#endif
57a6839d 4329
b75a7d8f
A
4330 if (U_FAILURE(status)) {
4331 return;
4332 }
57a6839d 4333
b75a7d8f 4334 // Cache frequently referenced items from the compiled pattern
b75a7d8f 4335 //
729e4ab9 4336 int64_t *pat = fPattern->fCompiledPat->getBuffer();
57a6839d 4337
b75a7d8f
A
4338 const UChar *litText = fPattern->fLiteralText.getBuffer();
4339 UVector *sets = fPattern->fSets;
57a6839d 4340
729e4ab9 4341 const UChar *inputBuf = fInputText->chunkContents;
57a6839d 4342
46f4442e 4343 fFrameSize = fPattern->fFrameSize;
b75a7d8f 4344 REStackFrame *fp = resetStack();
2ca993e8
A
4345 if (U_FAILURE(fDeferredStatus)) {
4346 status = fDeferredStatus;
4347 return;
4348 }
57a6839d 4349
b75a7d8f
A
4350 fp->fPatIdx = 0;
4351 fp->fInputIdx = startIdx;
57a6839d 4352
b75a7d8f
A
4353 // Zero out the pattern's static data
4354 int32_t i;
4355 for (i = 0; i<fPattern->fDataSize; i++) {
4356 fData[i] = 0;
4357 }
57a6839d 4358
b75a7d8f
A
4359 //
4360 // Main loop for interpreting the compiled pattern.
4361 // One iteration of the loop per pattern operation performed.
4362 //
4363 for (;;) {
729e4ab9 4364 op = (int32_t)pat[fp->fPatIdx];
b75a7d8f
A
4365 opType = URX_TYPE(op);
4366 opValue = URX_VAL(op);
729e4ab9 4367#ifdef REGEX_RUN_DEBUG
b75a7d8f 4368 if (fTraceDebug) {
729e4ab9 4369 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 4370 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
729e4ab9 4371 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
b75a7d8f
A
4372 fPattern->dumpOp(fp->fPatIdx);
4373 }
729e4ab9 4374#endif
b75a7d8f 4375 fp->fPatIdx++;
57a6839d 4376
b75a7d8f 4377 switch (opType) {
57a6839d
A
4378
4379
b75a7d8f
A
4380 case URX_NOP:
4381 break;
57a6839d
A
4382
4383
b75a7d8f
A
4384 case URX_BACKTRACK:
4385 // Force a backtrack. In some circumstances, the pattern compiler
4386 // will notice that the pattern can't possibly match anything, and will
4387 // emit one of these at that point.
46f4442e 4388 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f 4389 break;
57a6839d
A
4390
4391
b75a7d8f 4392 case URX_ONECHAR:
46f4442e 4393 if (fp->fInputIdx < fActiveLimit) {
729e4ab9 4394 UChar32 c;
46f4442e
A
4395 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4396 if (c == opValue) {
b75a7d8f
A
4397 break;
4398 }
46f4442e
A
4399 } else {
4400 fHitEnd = TRUE;
b75a7d8f 4401 }
729e4ab9
A
4402 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4403 break;
57a6839d
A
4404
4405
b75a7d8f
A
4406 case URX_STRING:
4407 {
4408 // Test input against a literal string.
4409 // Strings require two slots in the compiled pattern, one for the
4410 // offset to the string text, and one for the length.
4411 int32_t stringStartIdx = opValue;
4412 int32_t stringLen;
57a6839d 4413
729e4ab9 4414 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
b75a7d8f
A
4415 fp->fPatIdx++;
4416 opType = URX_TYPE(op);
4417 stringLen = URX_VAL(op);
4418 U_ASSERT(opType == URX_STRING_LEN);
4419 U_ASSERT(stringLen >= 2);
57a6839d 4420
b75a7d8f 4421 const UChar * pInp = inputBuf + fp->fInputIdx;
4388f060 4422 const UChar * pInpLimit = inputBuf + fActiveLimit;
b75a7d8f
A
4423 const UChar * pPat = litText+stringStartIdx;
4424 const UChar * pEnd = pInp + stringLen;
4388f060
A
4425 UBool success = TRUE;
4426 while (pInp < pEnd) {
4427 if (pInp >= pInpLimit) {
4428 fHitEnd = TRUE;
4429 success = FALSE;
4430 break;
4431 }
4432 if (*pInp++ != *pPat++) {
4433 success = FALSE;
b75a7d8f
A
4434 break;
4435 }
4436 }
57a6839d 4437
729e4ab9
A
4438 if (success) {
4439 fp->fInputIdx += stringLen;
4440 } else {
729e4ab9
A
4441 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4442 }
b75a7d8f 4443 }
729e4ab9 4444 break;
57a6839d
A
4445
4446
b75a7d8f 4447 case URX_STATE_SAVE:
46f4442e 4448 fp = StateSave(fp, opValue, status);
b75a7d8f 4449 break;
57a6839d
A
4450
4451
b75a7d8f
A
4452 case URX_END:
4453 // The match loop will exit via this path on a successful match,
4454 // when we reach the end of the pattern.
46f4442e
A
4455 if (toEnd && fp->fInputIdx != fActiveLimit) {
4456 // The pattern matched, but not to the end of input. Try some more.
4457 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4458 break;
4459 }
b75a7d8f
A
4460 isMatch = TRUE;
4461 goto breakFromLoop;
57a6839d 4462
729e4ab9 4463 // Start and End Capture stack frame variables are laid out like this:
b75a7d8f
A
4464 // fp->fExtra[opValue] - The start of a completed capture group
4465 // opValue+1 - The end of a completed capture group
4466 // opValue+2 - the start of a capture group whose end
4467 // has not yet been reached (and might not ever be).
4468 case URX_START_CAPTURE:
46f4442e 4469 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
b75a7d8f
A
4470 fp->fExtra[opValue+2] = fp->fInputIdx;
4471 break;
57a6839d
A
4472
4473
b75a7d8f 4474 case URX_END_CAPTURE:
46f4442e 4475 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
b75a7d8f
A
4476 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
4477 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
4478 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
4479 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4480 break;
57a6839d
A
4481
4482
b75a7d8f 4483 case URX_DOLLAR: // $, test for End of line
729e4ab9 4484 // or for position before new line at end of input
46f4442e 4485 if (fp->fInputIdx < fAnchorLimit-2) {
b75a7d8f 4486 // We are nowhere near the end of input. Fail.
46f4442e
A
4487 // This is the common case. Keep it first.
4488 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4489 break;
4490 }
46f4442e 4491 if (fp->fInputIdx >= fAnchorLimit) {
b75a7d8f 4492 // We really are at the end of input. Success.
46f4442e
A
4493 fHitEnd = TRUE;
4494 fRequireEnd = TRUE;
b75a7d8f
A
4495 break;
4496 }
57a6839d 4497
b75a7d8f
A
4498 // If we are positioned just before a new-line that is located at the
4499 // end of input, succeed.
46f4442e 4500 if (fp->fInputIdx == fAnchorLimit-1) {
729e4ab9
A
4501 UChar32 c;
4502 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
57a6839d 4503
b331163b 4504 if (isLineTerminator(c)) {
46f4442e 4505 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
374ca955 4506 // At new-line at end of input. Success
46f4442e
A
4507 fHitEnd = TRUE;
4508 fRequireEnd = TRUE;
4509 break;
374ca955 4510 }
b75a7d8f 4511 }
729e4ab9
A
4512 } else if (fp->fInputIdx == fAnchorLimit-2 &&
4513 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
46f4442e
A
4514 fHitEnd = TRUE;
4515 fRequireEnd = TRUE;
b75a7d8f 4516 break; // At CR/LF at end of input. Success
b75a7d8f 4517 }
57a6839d 4518
46f4442e 4519 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
57a6839d 4520
46f4442e 4521 break;
57a6839d
A
4522
4523
729e4ab9 4524 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
46f4442e
A
4525 if (fp->fInputIdx >= fAnchorLimit-1) {
4526 // Either at the last character of input, or off the end.
4527 if (fp->fInputIdx == fAnchorLimit-1) {
4528 // At last char of input. Success if it's a new line.
729e4ab9 4529 if (inputBuf[fp->fInputIdx] == 0x0a) {
46f4442e
A
4530 fHitEnd = TRUE;
4531 fRequireEnd = TRUE;
4532 break;
4533 }
4534 } else {
4535 // Off the end of input. Success.
4536 fHitEnd = TRUE;
4537 fRequireEnd = TRUE;
4538 break;
4539 }
4540 }
57a6839d 4541
46f4442e
A
4542 // Not at end of input. Back-track out.
4543 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f 4544 break;
57a6839d
A
4545
4546
729e4ab9
A
4547 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
4548 {
4549 if (fp->fInputIdx >= fAnchorLimit) {
4550 // We really are at the end of input. Success.
4551 fHitEnd = TRUE;
4552 fRequireEnd = TRUE;
4553 break;
4554 }
4555 // If we are positioned just before a new-line, succeed.
4556 // It makes no difference where the new-line is within the input.
4557 UChar32 c = inputBuf[fp->fInputIdx];
b331163b 4558 if (isLineTerminator(c)) {
729e4ab9
A
4559 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4560 // In multi-line mode, hitting a new-line just before the end of input does not
4561 // set the hitEnd or requireEnd flags
4562 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
46f4442e 4563 break;
729e4ab9
A
4564 }
4565 }
4566 // not at a new line. Fail.
4567 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4568 }
4569 break;
57a6839d
A
4570
4571
729e4ab9
A
4572 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
4573 {
4574 if (fp->fInputIdx >= fAnchorLimit) {
4575 // We really are at the end of input. Success.
4576 fHitEnd = TRUE;
4577 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
4578 break; // adding a new-line would not lose the match.
4579 }
4580 // If we are not positioned just before a new-line, the test fails; backtrack out.
4581 // It makes no difference where the new-line is within the input.
4582 if (inputBuf[fp->fInputIdx] != 0x0a) {
4583 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4584 }
4585 }
4586 break;
57a6839d
A
4587
4588
729e4ab9 4589 case URX_CARET: // ^, test for start of line
46f4442e
A
4590 if (fp->fInputIdx != fAnchorStart) {
4591 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4592 }
b75a7d8f 4593 break;
57a6839d
A
4594
4595
729e4ab9
A
4596 case URX_CARET_M: // ^, test for start of line in mulit-line mode
4597 {
4598 if (fp->fInputIdx == fAnchorStart) {
4599 // We are at the start of input. Success.
4600 break;
4601 }
4602 // Check whether character just before the current pos is a new-line
4603 // unless we are at the end of input
57a6839d
A
4604 UChar c = inputBuf[fp->fInputIdx - 1];
4605 if ((fp->fInputIdx < fAnchorLimit) &&
b331163b 4606 isLineTerminator(c)) {
729e4ab9
A
4607 // It's a new-line. ^ is true. Success.
4608 // TODO: what should be done with positions between a CR and LF?
4609 break;
4610 }
4611 // Not at the start of a line. Fail.
4612 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4613 }
4614 break;
57a6839d
A
4615
4616
729e4ab9
A
4617 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
4618 {
4619 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4620 if (fp->fInputIdx <= fAnchorStart) {
4621 // We are at the start of input. Success.
4622 break;
4623 }
4624 // Check whether character just before the current pos is a new-line
4625 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
57a6839d 4626 UChar c = inputBuf[fp->fInputIdx - 1];
729e4ab9
A
4627 if (c != 0x0a) {
4628 // Not at the start of a line. Back-track out.
4629 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4630 }
4631 }
4632 break;
57a6839d 4633
b75a7d8f
A
4634 case URX_BACKSLASH_B: // Test for word boundaries
4635 {
729e4ab9 4636 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
51004dcb 4637 success ^= (UBool)(opValue != 0); // flip sense for \B
b75a7d8f 4638 if (!success) {
46f4442e 4639 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4640 }
4641 }
4642 break;
57a6839d
A
4643
4644
374ca955
A
4645 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
4646 {
4647 UBool success = isUWordBoundary(fp->fInputIdx);
51004dcb 4648 success ^= (UBool)(opValue != 0); // flip sense for \B
374ca955 4649 if (!success) {
46f4442e 4650 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
374ca955
A
4651 }
4652 }
4653 break;
57a6839d
A
4654
4655
b75a7d8f
A
4656 case URX_BACKSLASH_D: // Test for decimal digit
4657 {
46f4442e
A
4658 if (fp->fInputIdx >= fActiveLimit) {
4659 fHitEnd = TRUE;
4660 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4661 break;
4662 }
57a6839d 4663
729e4ab9
A
4664 UChar32 c;
4665 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
46f4442e 4666 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
b75a7d8f 4667 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
51004dcb 4668 success ^= (UBool)(opValue != 0); // flip sense for \D
729e4ab9 4669 if (!success) {
46f4442e 4670 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4671 }
4672 }
4673 break;
57a6839d
A
4674
4675
b75a7d8f 4676 case URX_BACKSLASH_G: // Test for position at end of previous match
729e4ab9 4677 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
46f4442e 4678 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4679 }
4680 break;
57a6839d
A
4681
4682
b331163b
A
4683 case URX_BACKSLASH_H: // Test for \h, horizontal white space.
4684 {
4685 if (fp->fInputIdx >= fActiveLimit) {
4686 fHitEnd = TRUE;
4687 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4688 break;
4689 }
4690 UChar32 c;
4691 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4692 int8_t ctype = u_charType(c);
4693 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
4694 success ^= (UBool)(opValue != 0); // flip sense for \H
4695 if (!success) {
4696 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4697 }
4698 }
4699 break;
4700
4701
4702 case URX_BACKSLASH_R: // Test for \R, any line break sequence.
4703 {
4704 if (fp->fInputIdx >= fActiveLimit) {
4705 fHitEnd = TRUE;
4706 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4707 break;
4708 }
4709 UChar32 c;
4710 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4711 if (isLineTerminator(c)) {
4712 if (c == 0x0d && fp->fInputIdx < fActiveLimit) {
4713 // Check for CR/LF sequence. Consume both together when found.
4714 UChar c2;
4715 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2);
4716 if (c2 != 0x0a) {
4717 U16_PREV(inputBuf, 0, fp->fInputIdx, c2);
4718 }
4719 }
4720 } else {
4721 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4722 }
4723 }
4724 break;
4725
4726
4727 case URX_BACKSLASH_V: // Any single code point line ending.
4728 {
4729 if (fp->fInputIdx >= fActiveLimit) {
4730 fHitEnd = TRUE;
4731 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4732 break;
4733 }
4734 UChar32 c;
4735 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4736 UBool success = isLineTerminator(c);
4737 success ^= (UBool)(opValue != 0); // flip sense for \V
4738 if (!success) {
4739 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4740 }
4741 }
4742 break;
4743
4744
4745
57a6839d 4746 case URX_BACKSLASH_X:
729e4ab9
A
4747 // Match a Grapheme, as defined by Unicode TR 29.
4748 // Differs slightly from Perl, which consumes combining marks independently
4749 // of context.
4750 {
b75a7d8f 4751
729e4ab9
A
4752 // Fail if at end of input
4753 if (fp->fInputIdx >= fActiveLimit) {
4754 fHitEnd = TRUE;
4755 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4756 break;
4757 }
b75a7d8f 4758
729e4ab9
A
4759 // Examine (and consume) the current char.
4760 // Dispatch into a little state machine, based on the char.
4761 UChar32 c;
4762 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4763 UnicodeSet **sets = fPattern->fStaticSets;
4764 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
4765 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
4766 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4767 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4768 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4769 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4770 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4771 goto GC_Extend;
b75a7d8f
A
4772
4773
4774
4775GC_L:
729e4ab9
A
4776 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4777 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4778 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4779 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4780 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4781 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4782 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4783 goto GC_Extend;
b75a7d8f
A
4784
4785GC_V:
729e4ab9
A
4786 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4787 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4788 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4789 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4790 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4791 goto GC_Extend;
b75a7d8f
A
4792
4793GC_T:
729e4ab9
A
4794 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4795 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4796 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4797 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4798 goto GC_Extend;
b75a7d8f
A
4799
4800GC_Extend:
729e4ab9
A
4801 // Combining characters are consumed here
4802 for (;;) {
4803 if (fp->fInputIdx >= fActiveLimit) {
4804 break;
b75a7d8f 4805 }
729e4ab9
A
4806 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4807 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4808 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4809 break;
4810 }
4811 }
4812 goto GC_Done;
b75a7d8f
A
4813
4814GC_Control:
57a6839d 4815 // Most control chars stand alone (don't combine with combining chars),
729e4ab9
A
4816 // except for that CR/LF sequence is a single grapheme cluster.
4817 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
4818 fp->fInputIdx++;
4819 }
b75a7d8f
A
4820
4821GC_Done:
729e4ab9
A
4822 if (fp->fInputIdx >= fActiveLimit) {
4823 fHitEnd = TRUE;
b75a7d8f 4824 }
729e4ab9
A
4825 break;
4826 }
57a6839d
A
4827
4828
4829
4830
46f4442e
A
4831 case URX_BACKSLASH_Z: // Test for end of Input
4832 if (fp->fInputIdx < fAnchorLimit) {
4833 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4834 } else {
4835 fHitEnd = TRUE;
4836 fRequireEnd = TRUE;
b75a7d8f
A
4837 }
4838 break;
57a6839d
A
4839
4840
4841
b75a7d8f
A
4842 case URX_STATIC_SETREF:
4843 {
4844 // Test input character against one of the predefined sets
4845 // (Word Characters, for example)
4846 // The high bit of the op value is a flag for the match polarity.
4847 // 0: success if input char is in set.
4848 // 1: success if input char is not in set.
46f4442e
A
4849 if (fp->fInputIdx >= fActiveLimit) {
4850 fHitEnd = TRUE;
4851 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4852 break;
4853 }
57a6839d
A
4854
4855 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
b75a7d8f
A
4856 opValue &= ~URX_NEG_SET;
4857 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
57a6839d 4858
729e4ab9 4859 UChar32 c;
46f4442e 4860 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
b75a7d8f
A
4861 if (c < 256) {
4862 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4863 if (s8->contains(c)) {
4864 success = !success;
4865 }
4866 } else {
4867 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4868 if (s->contains(c)) {
4869 success = !success;
4870 }
4871 }
4872 if (!success) {
46f4442e 4873 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4874 }
4875 }
4876 break;
57a6839d
A
4877
4878
b75a7d8f
A
4879 case URX_STAT_SETREF_N:
4880 {
57a6839d 4881 // Test input character for NOT being a member of one of
b75a7d8f 4882 // the predefined sets (Word Characters, for example)
46f4442e
A
4883 if (fp->fInputIdx >= fActiveLimit) {
4884 fHitEnd = TRUE;
4885 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4886 break;
4887 }
57a6839d 4888
b75a7d8f 4889 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
57a6839d 4890
b75a7d8f 4891 UChar32 c;
46f4442e 4892 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
b75a7d8f
A
4893 if (c < 256) {
4894 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4895 if (s8->contains(c) == FALSE) {
4896 break;
4897 }
4898 } else {
4899 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4900 if (s->contains(c) == FALSE) {
4901 break;
4902 }
4903 }
46f4442e 4904 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4905 }
4906 break;
57a6839d
A
4907
4908
b75a7d8f 4909 case URX_SETREF:
729e4ab9
A
4910 {
4911 if (fp->fInputIdx >= fActiveLimit) {
4912 fHitEnd = TRUE;
4913 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
46f4442e
A
4914 break;
4915 }
57a6839d 4916
729e4ab9
A
4917 U_ASSERT(opValue > 0 && opValue < sets->size());
4918
4919 // There is input left. Pick up one char and test it for set membership.
4920 UChar32 c;
4921 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4922 if (c<256) {
4923 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4924 if (s8->contains(c)) {
4925 // The character is in the set. A Match.
4926 break;
4927 }
4928 } else {
4929 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4930 if (s->contains(c)) {
4931 // The character is in the set. A Match.
4932 break;
4933 }
4934 }
57a6839d 4935
729e4ab9 4936 // the character wasn't in the set.
729e4ab9 4937 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
46f4442e 4938 }
b75a7d8f 4939 break;
57a6839d
A
4940
4941
b75a7d8f
A
4942 case URX_DOTANY:
4943 {
4944 // . matches anything, but stops at end-of-line.
46f4442e 4945 if (fp->fInputIdx >= fActiveLimit) {
b75a7d8f 4946 // At end of input. Match failed. Backtrack out.
46f4442e
A
4947 fHitEnd = TRUE;
4948 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4949 break;
4950 }
57a6839d 4951
b75a7d8f 4952 // There is input left. Advance over one char, unless we've hit end-of-line
729e4ab9 4953 UChar32 c;
46f4442e 4954 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
b331163b 4955 if (isLineTerminator(c)) {
b75a7d8f 4956 // End of line in normal mode. . does not match.
729e4ab9 4957 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4958 break;
4959 }
4960 }
4961 break;
57a6839d
A
4962
4963
b75a7d8f
A
4964 case URX_DOTANY_ALL:
4965 {
729e4ab9 4966 // . in dot-matches-all (including new lines) mode
46f4442e 4967 if (fp->fInputIdx >= fActiveLimit) {
b75a7d8f 4968 // At end of input. Match failed. Backtrack out.
46f4442e
A
4969 fHitEnd = TRUE;
4970 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4971 break;
4972 }
57a6839d 4973
b75a7d8f
A
4974 // There is input left. Advance over one char, except if we are
4975 // at a cr/lf, advance over both of them.
57a6839d 4976 UChar32 c;
46f4442e
A
4977 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4978 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
b75a7d8f 4979 // In the case of a CR/LF, we need to advance over both.
729e4ab9
A
4980 if (inputBuf[fp->fInputIdx] == 0x0a) {
4981 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
b75a7d8f
A
4982 }
4983 }
4984 }
4985 break;
57a6839d
A
4986
4987
46f4442e 4988 case URX_DOTANY_UNIX:
b75a7d8f 4989 {
46f4442e
A
4990 // '.' operator, matches all, but stops at end-of-line.
4991 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
4992 if (fp->fInputIdx >= fActiveLimit) {
4993 // At end of input. Match failed. Backtrack out.
4994 fHitEnd = TRUE;
4995 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4996 break;
4997 }
57a6839d 4998
46f4442e 4999 // There is input left. Advance over one char, unless we've hit end-of-line
57a6839d 5000 UChar32 c;
46f4442e
A
5001 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5002 if (c == 0x0a) {
5003 // End of line in normal mode. '.' does not match the \n
5004 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
5005 }
5006 }
5007 break;
57a6839d
A
5008
5009
b75a7d8f
A
5010 case URX_JMP:
5011 fp->fPatIdx = opValue;
5012 break;
57a6839d 5013
b75a7d8f
A
5014 case URX_FAIL:
5015 isMatch = FALSE;
5016 goto breakFromLoop;
57a6839d 5017
b75a7d8f
A
5018 case URX_JMP_SAV:
5019 U_ASSERT(opValue < fPattern->fCompiledPat->size());
46f4442e
A
5020 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
5021 fp->fPatIdx = opValue; // Then JMP.
b75a7d8f 5022 break;
57a6839d 5023
b75a7d8f
A
5024 case URX_JMP_SAV_X:
5025 // This opcode is used with (x)+, when x can match a zero length string.
5026 // Same as JMP_SAV, except conditional on the match having made forward progress.
5027 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
5028 // data address of the input position at the start of the loop.
5029 {
5030 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
729e4ab9 5031 int32_t stoOp = (int32_t)pat[opValue-1];
b75a7d8f
A
5032 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
5033 int32_t frameLoc = URX_VAL(stoOp);
46f4442e 5034 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
729e4ab9 5035 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
b75a7d8f
A
5036 U_ASSERT(prevInputIdx <= fp->fInputIdx);
5037 if (prevInputIdx < fp->fInputIdx) {
5038 // The match did make progress. Repeat the loop.
46f4442e 5039 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
b75a7d8f
A
5040 fp->fPatIdx = opValue;
5041 fp->fExtra[frameLoc] = fp->fInputIdx;
57a6839d 5042 }
b75a7d8f
A
5043 // If the input position did not advance, we do nothing here,
5044 // execution will fall out of the loop.
5045 }
5046 break;
57a6839d 5047
b75a7d8f
A
5048 case URX_CTR_INIT:
5049 {
46f4442e 5050 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
57a6839d
A
5051 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5052
b75a7d8f 5053 // Pick up the three extra operands that CTR_INIT has, and
57a6839d 5054 // skip the pattern location counter past
729e4ab9 5055 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
b75a7d8f
A
5056 fp->fPatIdx += 3;
5057 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
729e4ab9
A
5058 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5059 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
b75a7d8f
A
5060 U_ASSERT(minCount>=0);
5061 U_ASSERT(maxCount>=minCount || maxCount==-1);
57a6839d
A
5062 U_ASSERT(loopLoc>=fp->fPatIdx);
5063
b75a7d8f 5064 if (minCount == 0) {
46f4442e 5065 fp = StateSave(fp, loopLoc+1, status);
b75a7d8f 5066 }
57a6839d
A
5067 if (maxCount == -1) {
5068 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
5069 } else if (maxCount == 0) {
46f4442e 5070 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
5071 }
5072 }
5073 break;
57a6839d 5074
b75a7d8f
A
5075 case URX_CTR_LOOP:
5076 {
5077 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
729e4ab9 5078 int32_t initOp = (int32_t)pat[opValue];
b75a7d8f 5079 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
729e4ab9
A
5080 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5081 int32_t minCount = (int32_t)pat[opValue+2];
5082 int32_t maxCount = (int32_t)pat[opValue+3];
b75a7d8f 5083 (*pCounter)++;
57a6839d
A
5084 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5085 U_ASSERT(*pCounter == maxCount);
b75a7d8f
A
5086 break;
5087 }
5088 if (*pCounter >= minCount) {
57a6839d
A
5089 if (maxCount == -1) {
5090 // Loop has no hard upper bound.
5091 // Check that it is progressing through the input, break if it is not.
5092 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5093 if (fp->fInputIdx == *pLastInputIdx) {
5094 break;
5095 } else {
5096 *pLastInputIdx = fp->fInputIdx;
5097 }
5098 }
46f4442e 5099 fp = StateSave(fp, fp->fPatIdx, status);
b75a7d8f
A
5100 }
5101 fp->fPatIdx = opValue + 4; // Loop back.
5102 }
5103 break;
57a6839d 5104
b75a7d8f
A
5105 case URX_CTR_INIT_NG:
5106 {
46f4442e
A
5107 // Initialize a non-greedy loop
5108 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
57a6839d
A
5109 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5110
5111 // Pick up the three extra operands that CTR_INIT_NG has, and
5112 // skip the pattern location counter past
729e4ab9 5113 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
b75a7d8f
A
5114 fp->fPatIdx += 3;
5115 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
729e4ab9
A
5116 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5117 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
b75a7d8f
A
5118 U_ASSERT(minCount>=0);
5119 U_ASSERT(maxCount>=minCount || maxCount==-1);
5120 U_ASSERT(loopLoc>fp->fPatIdx);
57a6839d
A
5121 if (maxCount == -1) {
5122 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
5123 }
5124
b75a7d8f
A
5125 if (minCount == 0) {
5126 if (maxCount != 0) {
46f4442e 5127 fp = StateSave(fp, fp->fPatIdx, status);
b75a7d8f
A
5128 }
5129 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
57a6839d 5130 }
b75a7d8f
A
5131 }
5132 break;
57a6839d 5133
b75a7d8f
A
5134 case URX_CTR_LOOP_NG:
5135 {
46f4442e 5136 // Non-greedy {min, max} loops
b75a7d8f 5137 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
729e4ab9 5138 int32_t initOp = (int32_t)pat[opValue];
b75a7d8f 5139 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
729e4ab9
A
5140 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5141 int32_t minCount = (int32_t)pat[opValue+2];
5142 int32_t maxCount = (int32_t)pat[opValue+3];
57a6839d 5143
b75a7d8f 5144 (*pCounter)++;
57a6839d 5145 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
b75a7d8f
A
5146 // The loop has matched the maximum permitted number of times.
5147 // Break out of here with no action. Matching will
5148 // continue with the following pattern.
57a6839d 5149 U_ASSERT(*pCounter == maxCount);
b75a7d8f
A
5150 break;
5151 }
57a6839d 5152
b75a7d8f
A
5153 if (*pCounter < minCount) {
5154 // We haven't met the minimum number of matches yet.
5155 // Loop back for another one.
5156 fp->fPatIdx = opValue + 4; // Loop back.
5157 } else {
5158 // We do have the minimum number of matches.
57a6839d
A
5159
5160 // If there is no upper bound on the loop iterations, check that the input index
5161 // is progressing, and stop the loop if it is not.
5162 if (maxCount == -1) {
5163 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5164 if (fp->fInputIdx == *pLastInputIdx) {
5165 break;
5166 }
5167 *pLastInputIdx = fp->fInputIdx;
5168 }
5169
5170 // Loop Continuation: we will fall into the pattern following the loop
5171 // (non-greedy, don't execute loop body first), but first do
5172 // a state save to the top of the loop, so that a match failure
b75a7d8f 5173 // in the following pattern will try another iteration of the loop.
46f4442e 5174 fp = StateSave(fp, opValue + 4, status);
b75a7d8f
A
5175 }
5176 }
5177 break;
57a6839d 5178
b75a7d8f
A
5179 case URX_STO_SP:
5180 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5181 fData[opValue] = fStack->size();
5182 break;
57a6839d 5183
b75a7d8f
A
5184 case URX_LD_SP:
5185 {
5186 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
729e4ab9 5187 int32_t newStackSize = (int32_t)fData[opValue];
b75a7d8f 5188 U_ASSERT(newStackSize <= fStack->size());
729e4ab9
A
5189 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5190 if (newFP == (int64_t *)fp) {
b75a7d8f
A
5191 break;
5192 }
5193 int32_t i;
46f4442e 5194 for (i=0; i<fFrameSize; i++) {
729e4ab9 5195 newFP[i] = ((int64_t *)fp)[i];
b75a7d8f
A
5196 }
5197 fp = (REStackFrame *)newFP;
5198 fStack->setSize(newStackSize);
5199 }
5200 break;
57a6839d 5201
b75a7d8f 5202 case URX_BACKREF:
4388f060
A
5203 {
5204 U_ASSERT(opValue < fFrameSize);
5205 int64_t groupStartIdx = fp->fExtra[opValue];
5206 int64_t groupEndIdx = fp->fExtra[opValue+1];
5207 U_ASSERT(groupStartIdx <= groupEndIdx);
5208 int64_t inputIndex = fp->fInputIdx;
5209 if (groupStartIdx < 0) {
5210 // This capture group has not participated in the match thus far,
5211 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5212 break;
5213 }
5214 UBool success = TRUE;
5215 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
5216 if (inputIndex >= fActiveLimit) {
5217 success = FALSE;
5218 fHitEnd = TRUE;
5219 break;
5220 }
5221 if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
5222 success = FALSE;
5223 break;
5224 }
5225 }
2ca993e8
A
5226 if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) &&
5227 inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) {
5228 // Capture group ended with an unpaired lead surrogate.
5229 // Back reference is not permitted to match lead only of a surrogate pair.
5230 success = FALSE;
5231 }
4388f060
A
5232 if (success) {
5233 fp->fInputIdx = inputIndex;
5234 } else {
5235 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5236 }
5237 }
5238 break;
57a6839d 5239
b75a7d8f
A
5240 case URX_BACKREF_I:
5241 {
46f4442e 5242 U_ASSERT(opValue < fFrameSize);
729e4ab9
A
5243 int64_t groupStartIdx = fp->fExtra[opValue];
5244 int64_t groupEndIdx = fp->fExtra[opValue+1];
b75a7d8f 5245 U_ASSERT(groupStartIdx <= groupEndIdx);
b75a7d8f
A
5246 if (groupStartIdx < 0) {
5247 // This capture group has not participated in the match thus far,
46f4442e 5248 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
4388f060 5249 break;
b75a7d8f 5250 }
4388f060
A
5251 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
5252 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
b75a7d8f 5253
4388f060 5254 // Note: if the capture group match was of an empty string the backref
57a6839d 5255 // match succeeds. Verified by testing: Perl matches succeed
4388f060 5256 // in this case, so we do too.
57a6839d 5257
4388f060
A
5258 UBool success = TRUE;
5259 for (;;) {
5260 UChar32 captureGroupChar = captureGroupItr.next();
5261 if (captureGroupChar == U_SENTINEL) {
5262 success = TRUE;
b75a7d8f
A
5263 break;
5264 }
4388f060
A
5265 UChar32 inputChar = inputItr.next();
5266 if (inputChar == U_SENTINEL) {
5267 success = FALSE;
5268 fHitEnd = TRUE;
5269 break;
b75a7d8f 5270 }
4388f060
A
5271 if (inputChar != captureGroupChar) {
5272 success = FALSE;
5273 break;
5274 }
5275 }
5276
5277 if (success && inputItr.inExpansion()) {
57a6839d
A
5278 // We obtained a match by consuming part of a string obtained from
5279 // case-folding a single code point of the input text.
4388f060
A
5280 // This does not count as an overall match.
5281 success = FALSE;
b75a7d8f 5282 }
4388f060
A
5283
5284 if (success) {
5285 fp->fInputIdx = inputItr.getIndex();
b75a7d8f 5286 } else {
4388f060 5287 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
5288 }
5289 }
5290 break;
4388f060 5291
b75a7d8f
A
5292 case URX_STO_INP_LOC:
5293 {
46f4442e 5294 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
b75a7d8f
A
5295 fp->fExtra[opValue] = fp->fInputIdx;
5296 }
5297 break;
57a6839d 5298
b75a7d8f
A
5299 case URX_JMPX:
5300 {
729e4ab9 5301 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
b75a7d8f
A
5302 fp->fPatIdx += 1;
5303 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
46f4442e 5304 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
729e4ab9 5305 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
b75a7d8f
A
5306 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5307 if (savedInputIdx < fp->fInputIdx) {
5308 fp->fPatIdx = opValue; // JMP
5309 } else {
729e4ab9 5310 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
b75a7d8f
A
5311 }
5312 }
5313 break;
57a6839d 5314
b75a7d8f
A
5315 case URX_LA_START:
5316 {
5317 // Entering a lookahead block.
5318 // Save Stack Ptr, Input Pos.
5319 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5320 fData[opValue] = fStack->size();
5321 fData[opValue+1] = fp->fInputIdx;
46f4442e
A
5322 fActiveStart = fLookStart; // Set the match region change for
5323 fActiveLimit = fLookLimit; // transparent bounds.
b75a7d8f
A
5324 }
5325 break;
57a6839d 5326
b75a7d8f
A
5327 case URX_LA_END:
5328 {
5329 // Leaving a look-ahead block.
5330 // restore Stack Ptr, Input Pos to positions they had on entry to block.
5331 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5332 int32_t stackSize = fStack->size();
729e4ab9 5333 int32_t newStackSize = (int32_t)fData[opValue];
b75a7d8f
A
5334 U_ASSERT(stackSize >= newStackSize);
5335 if (stackSize > newStackSize) {
46f4442e
A
5336 // Copy the current top frame back to the new (cut back) top frame.
5337 // This makes the capture groups from within the look-ahead
5338 // expression available.
729e4ab9 5339 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
b75a7d8f 5340 int32_t i;
46f4442e 5341 for (i=0; i<fFrameSize; i++) {
729e4ab9 5342 newFP[i] = ((int64_t *)fp)[i];
b75a7d8f
A
5343 }
5344 fp = (REStackFrame *)newFP;
5345 fStack->setSize(newStackSize);
5346 }
5347 fp->fInputIdx = fData[opValue+1];
57a6839d 5348
46f4442e
A
5349 // Restore the active region bounds in the input string; they may have
5350 // been changed because of transparent bounds on a Region.
5351 fActiveStart = fRegionStart;
5352 fActiveLimit = fRegionLimit;
b75a7d8f
A
5353 }
5354 break;
57a6839d 5355
b75a7d8f 5356 case URX_ONECHAR_I:
46f4442e 5357 if (fp->fInputIdx < fActiveLimit) {
57a6839d 5358 UChar32 c;
46f4442e
A
5359 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5360 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
b75a7d8f
A
5361 break;
5362 }
46f4442e
A
5363 } else {
5364 fHitEnd = TRUE;
5365 }
5366 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f 5367 break;
57a6839d 5368
b75a7d8f 5369 case URX_STRING_I:
4388f060
A
5370 // Case-insensitive test input against a literal string.
5371 // Strings require two slots in the compiled pattern, one for the
5372 // offset to the string text, and one for the length.
5373 // The compiled string has already been case folded.
b75a7d8f 5374 {
4388f060
A
5375 const UChar *patternString = litText + opValue;
5376
5377 op = (int32_t)pat[fp->fPatIdx];
5378 fp->fPatIdx++;
5379 opType = URX_TYPE(op);
5380 opValue = URX_VAL(op);
5381 U_ASSERT(opType == URX_STRING_LEN);
5382 int32_t patternStringLen = opValue; // Length of the string from the pattern.
57a6839d 5383
4388f060
A
5384 UChar32 cText;
5385 UChar32 cPattern;
5386 UBool success = TRUE;
5387 int32_t patternStringIdx = 0;
5388 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5389 while (patternStringIdx < patternStringLen) {
5390 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
5391 cText = inputIterator.next();
5392 if (cText != cPattern) {
5393 success = FALSE;
5394 if (cText == U_SENTINEL) {
5395 fHitEnd = TRUE;
729e4ab9 5396 }
4388f060 5397 break;
374ca955 5398 }
46f4442e 5399 }
4388f060
A
5400 if (inputIterator.inExpansion()) {
5401 success = FALSE;
5402 }
5403
5404 if (success) {
5405 fp->fInputIdx = inputIterator.getIndex();
5406 } else {
5407 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5408 }
b75a7d8f
A
5409 }
5410 break;
4388f060 5411
b75a7d8f
A
5412 case URX_LB_START:
5413 {
5414 // Entering a look-behind block.
5415 // Save Stack Ptr, Input Pos.
46f4442e 5416 // TODO: implement transparent bounds. Ticket #6067
b75a7d8f
A
5417 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5418 fData[opValue] = fStack->size();
5419 fData[opValue+1] = fp->fInputIdx;
5420 // Init the variable containing the start index for attempted matches.
5421 fData[opValue+2] = -1;
5422 // Save input string length, then reset to pin any matches to end at
5423 // the current position.
46f4442e
A
5424 fData[opValue+3] = fActiveLimit;
5425 fActiveLimit = fp->fInputIdx;
b75a7d8f
A
5426 }
5427 break;
57a6839d
A
5428
5429
b75a7d8f
A
5430 case URX_LB_CONT:
5431 {
5432 // Positive Look-Behind, at top of loop checking for matches of LB expression
5433 // at all possible input starting positions.
57a6839d 5434
b75a7d8f
A
5435 // Fetch the min and max possible match lengths. They are the operands
5436 // of this op in the pattern.
729e4ab9
A
5437 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5438 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
b75a7d8f
A
5439 U_ASSERT(minML <= maxML);
5440 U_ASSERT(minML >= 0);
57a6839d 5441
b75a7d8f
A
5442 // Fetch (from data) the last input index where a match was attempted.
5443 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2ca993e8
A
5444 int64_t &lbStartIdx = fData[opValue+2];
5445 if (lbStartIdx < 0) {
b75a7d8f 5446 // First time through loop.
2ca993e8
A
5447 lbStartIdx = fp->fInputIdx - minML;
5448 if (lbStartIdx > 0) {
5449 U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5450 }
b75a7d8f
A
5451 } else {
5452 // 2nd through nth time through the loop.
5453 // Back up start position for match by one.
2ca993e8
A
5454 if (lbStartIdx == 0) {
5455 lbStartIdx--;
b75a7d8f 5456 } else {
2ca993e8 5457 U16_BACK_1(inputBuf, 0, lbStartIdx);
b75a7d8f
A
5458 }
5459 }
57a6839d 5460
2ca993e8 5461 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
b75a7d8f
A
5462 // We have tried all potential match starting points without
5463 // getting a match. Backtrack out, and out of the
5464 // Look Behind altogether.
46f4442e 5465 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
729e4ab9 5466 int64_t restoreInputLen = fData[opValue+3];
46f4442e 5467 U_ASSERT(restoreInputLen >= fActiveLimit);
729e4ab9 5468 U_ASSERT(restoreInputLen <= fInputLength);
46f4442e 5469 fActiveLimit = restoreInputLen;
b75a7d8f
A
5470 break;
5471 }
57a6839d 5472
b75a7d8f
A
5473 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5474 // (successful match will fall off the end of the loop.)
46f4442e 5475 fp = StateSave(fp, fp->fPatIdx-3, status);
2ca993e8 5476 fp->fInputIdx = lbStartIdx;
b75a7d8f
A
5477 }
5478 break;
57a6839d 5479
b75a7d8f
A
5480 case URX_LB_END:
5481 // End of a look-behind block, after a successful match.
5482 {
5483 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
46f4442e 5484 if (fp->fInputIdx != fActiveLimit) {
b75a7d8f
A
5485 // The look-behind expression matched, but the match did not
5486 // extend all the way to the point that we are looking behind from.
5487 // FAIL out of here, which will take us back to the LB_CONT, which
5488 // will retry the match starting at another position or fail
5489 // the look-behind altogether, whichever is appropriate.
46f4442e 5490 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
5491 break;
5492 }
57a6839d 5493
b75a7d8f 5494 // Look-behind match is good. Restore the original input string length,
57a6839d 5495 // which had been truncated to pin the end of the lookbehind match to the
b75a7d8f 5496 // position being looked-behind.
729e4ab9 5497 int64_t originalInputLen = fData[opValue+3];
46f4442e 5498 U_ASSERT(originalInputLen >= fActiveLimit);
729e4ab9 5499 U_ASSERT(originalInputLen <= fInputLength);
46f4442e 5500 fActiveLimit = originalInputLen;
b75a7d8f
A
5501 }
5502 break;
57a6839d
A
5503
5504
b75a7d8f
A
5505 case URX_LBN_CONT:
5506 {
5507 // Negative Look-Behind, at top of loop checking for matches of LB expression
5508 // at all possible input starting positions.
57a6839d 5509
b75a7d8f 5510 // Fetch the extra parameters of this op.
729e4ab9
A
5511 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5512 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5513 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5514 continueLoc = URX_VAL(continueLoc);
b75a7d8f
A
5515 U_ASSERT(minML <= maxML);
5516 U_ASSERT(minML >= 0);
5517 U_ASSERT(continueLoc > fp->fPatIdx);
57a6839d 5518
b75a7d8f
A
5519 // Fetch (from data) the last input index where a match was attempted.
5520 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2ca993e8
A
5521 int64_t &lbStartIdx = fData[opValue+2];
5522 if (lbStartIdx < 0) {
b75a7d8f 5523 // First time through loop.
2ca993e8
A
5524 lbStartIdx = fp->fInputIdx - minML;
5525 if (lbStartIdx > 0) {
5526 U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5527 }
b75a7d8f
A
5528 } else {
5529 // 2nd through nth time through the loop.
5530 // Back up start position for match by one.
2ca993e8
A
5531 if (lbStartIdx == 0) {
5532 lbStartIdx--; // Because U16_BACK is unsafe starting at 0.
b75a7d8f 5533 } else {
2ca993e8 5534 U16_BACK_1(inputBuf, 0, lbStartIdx);
b75a7d8f
A
5535 }
5536 }
57a6839d 5537
2ca993e8 5538 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
b75a7d8f
A
5539 // We have tried all potential match starting points without
5540 // getting a match, which means that the negative lookbehind as
5541 // a whole has succeeded. Jump forward to the continue location
729e4ab9 5542 int64_t restoreInputLen = fData[opValue+3];
46f4442e 5543 U_ASSERT(restoreInputLen >= fActiveLimit);
729e4ab9 5544 U_ASSERT(restoreInputLen <= fInputLength);
46f4442e 5545 fActiveLimit = restoreInputLen;
b75a7d8f
A
5546 fp->fPatIdx = continueLoc;
5547 break;
5548 }
57a6839d 5549
b75a7d8f
A
5550 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5551 // (successful match will cause a FAIL out of the loop altogether.)
46f4442e 5552 fp = StateSave(fp, fp->fPatIdx-4, status);
2ca993e8 5553 fp->fInputIdx = lbStartIdx;
b75a7d8f
A
5554 }
5555 break;
57a6839d 5556
b75a7d8f
A
5557 case URX_LBN_END:
5558 // End of a negative look-behind block, after a successful match.
5559 {
5560 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
46f4442e 5561 if (fp->fInputIdx != fActiveLimit) {
b75a7d8f
A
5562 // The look-behind expression matched, but the match did not
5563 // extend all the way to the point that we are looking behind from.
5564 // FAIL out of here, which will take us back to the LB_CONT, which
5565 // will retry the match starting at another position or succeed
5566 // the look-behind altogether, whichever is appropriate.
46f4442e 5567 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
5568 break;
5569 }
57a6839d 5570
b75a7d8f
A
5571 // Look-behind expression matched, which means look-behind test as
5572 // a whole Fails
57a6839d
A
5573
5574 // Restore the original input string length, which had been truncated
5575 // in order to pin the end of the lookbehind match
b75a7d8f 5576 // to the position being looked-behind.
729e4ab9 5577 int64_t originalInputLen = fData[opValue+3];
46f4442e 5578 U_ASSERT(originalInputLen >= fActiveLimit);
729e4ab9 5579 U_ASSERT(originalInputLen <= fInputLength);
46f4442e 5580 fActiveLimit = originalInputLen;
57a6839d 5581
b75a7d8f
A
5582 // Restore original stack position, discarding any state saved
5583 // by the successful pattern match.
5584 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
729e4ab9 5585 int32_t newStackSize = (int32_t)fData[opValue];
b75a7d8f
A
5586 U_ASSERT(fStack->size() > newStackSize);
5587 fStack->setSize(newStackSize);
57a6839d
A
5588
5589 // FAIL, which will take control back to someplace
b75a7d8f 5590 // prior to entering the look-behind test.
46f4442e 5591 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
5592 }
5593 break;
57a6839d
A
5594
5595
b75a7d8f
A
5596 case URX_LOOP_SR_I:
5597 // Loop Initialization for the optimized implementation of
5598 // [some character set]*
5599 // This op scans through all matching input.
5600 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5601 {
5602 U_ASSERT(opValue > 0 && opValue < sets->size());
5603 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5604 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
57a6839d 5605
b75a7d8f
A
5606 // Loop through input, until either the input is exhausted or
5607 // we reach a character that is not a member of the set.
729e4ab9 5608 int32_t ix = (int32_t)fp->fInputIdx;
b75a7d8f 5609 for (;;) {
46f4442e
A
5610 if (ix >= fActiveLimit) {
5611 fHitEnd = TRUE;
b75a7d8f
A
5612 break;
5613 }
5614 UChar32 c;
46f4442e 5615 U16_NEXT(inputBuf, ix, fActiveLimit, c);
b75a7d8f
A
5616 if (c<256) {
5617 if (s8->contains(c) == FALSE) {
5618 U16_BACK_1(inputBuf, 0, ix);
5619 break;
5620 }
5621 } else {
5622 if (s->contains(c) == FALSE) {
5623 U16_BACK_1(inputBuf, 0, ix);
5624 break;
5625 }
5626 }
5627 }
57a6839d 5628
b75a7d8f
A
5629 // If there were no matching characters, skip over the loop altogether.
5630 // The loop doesn't run at all, a * op always succeeds.
5631 if (ix == fp->fInputIdx) {
5632 fp->fPatIdx++; // skip the URX_LOOP_C op.
5633 break;
5634 }
57a6839d 5635
b75a7d8f
A
5636 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5637 // must follow. Its operand is the stack location
5638 // that holds the starting input index for the match of this [set]*
729e4ab9 5639 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
b75a7d8f
A
5640 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5641 int32_t stackLoc = URX_VAL(loopcOp);
46f4442e 5642 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
b75a7d8f
A
5643 fp->fExtra[stackLoc] = fp->fInputIdx;
5644 fp->fInputIdx = ix;
57a6839d 5645
b75a7d8f
A
5646 // Save State to the URX_LOOP_C op that follows this one,
5647 // so that match failures in the following code will return to there.
5648 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
46f4442e 5649 fp = StateSave(fp, fp->fPatIdx, status);
b75a7d8f
A
5650 fp->fPatIdx++;
5651 }
5652 break;
57a6839d
A
5653
5654
b75a7d8f
A
5655 case URX_LOOP_DOT_I:
5656 // Loop Initialization for the optimized implementation of .*
5657 // This op scans through all remaining input.
5658 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5659 {
5660 // Loop through input until the input is exhausted (we reach an end-of-line)
46f4442e 5661 // In DOTALL mode, we can just go straight to the end of the input.
374ca955 5662 int32_t ix;
46f4442e
A
5663 if ((opValue & 1) == 1) {
5664 // Dot-matches-All mode. Jump straight to the end of the string.
729e4ab9 5665 ix = (int32_t)fActiveLimit;
46f4442e 5666 fHitEnd = TRUE;
374ca955 5667 } else {
46f4442e 5668 // NOT DOT ALL mode. Line endings do not match '.'
b75a7d8f 5669 // Scan forward until a line ending or end of input.
729e4ab9 5670 ix = (int32_t)fp->fInputIdx;
b75a7d8f 5671 for (;;) {
46f4442e
A
5672 if (ix >= fActiveLimit) {
5673 fHitEnd = TRUE;
b75a7d8f
A
5674 break;
5675 }
5676 UChar32 c;
46f4442e 5677 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++]
729e4ab9
A
5678 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
5679 if ((c == 0x0a) || // 0x0a is newline in both modes.
5680 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
b331163b 5681 isLineTerminator(c))) {
46f4442e
A
5682 // char is a line ending. Put the input pos back to the
5683 // line ending char, and exit the scanning loop.
5684 U16_BACK_1(inputBuf, 0, ix);
5685 break;
5686 }
b75a7d8f
A
5687 }
5688 }
5689 }
57a6839d 5690
b75a7d8f
A
5691 // If there were no matching characters, skip over the loop altogether.
5692 // The loop doesn't run at all, a * op always succeeds.
5693 if (ix == fp->fInputIdx) {
5694 fp->fPatIdx++; // skip the URX_LOOP_C op.
5695 break;
5696 }
57a6839d 5697
b75a7d8f
A
5698 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5699 // must follow. Its operand is the stack location
46f4442e 5700 // that holds the starting input index for the match of this .*
729e4ab9 5701 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
b75a7d8f
A
5702 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5703 int32_t stackLoc = URX_VAL(loopcOp);
46f4442e 5704 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
b75a7d8f
A
5705 fp->fExtra[stackLoc] = fp->fInputIdx;
5706 fp->fInputIdx = ix;
57a6839d 5707
b75a7d8f
A
5708 // Save State to the URX_LOOP_C op that follows this one,
5709 // so that match failures in the following code will return to there.
5710 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
46f4442e 5711 fp = StateSave(fp, fp->fPatIdx, status);
b75a7d8f
A
5712 fp->fPatIdx++;
5713 }
5714 break;
57a6839d
A
5715
5716
b75a7d8f
A
5717 case URX_LOOP_C:
5718 {
46f4442e 5719 U_ASSERT(opValue>=0 && opValue<fFrameSize);
729e4ab9
A
5720 backSearchIndex = (int32_t)fp->fExtra[opValue];
5721 U_ASSERT(backSearchIndex <= fp->fInputIdx);
5722 if (backSearchIndex == fp->fInputIdx) {
b75a7d8f 5723 // We've backed up the input idx to the point that the loop started.
57a6839d 5724 // The loop is done. Leave here without saving state.
b75a7d8f
A
5725 // Subsequent failures won't come back here.
5726 break;
5727 }
5728 // Set up for the next iteration of the loop, with input index
5729 // backed up by one from the last time through,
5730 // and a state save to this instruction in case the following code fails again.
5731 // (We're going backwards because this loop emulates stack unwinding, not
5732 // the initial scan forward.)
5733 U_ASSERT(fp->fInputIdx > 0);
729e4ab9
A
5734 UChar32 prevC;
5735 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
57a6839d
A
5736
5737 if (prevC == 0x0a &&
729e4ab9 5738 fp->fInputIdx > backSearchIndex &&
b75a7d8f 5739 inputBuf[fp->fInputIdx-1] == 0x0d) {
729e4ab9 5740 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
b75a7d8f
A
5741 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
5742 // .*, stepping back over CRLF pair.
729e4ab9 5743 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
b75a7d8f
A
5744 }
5745 }
57a6839d
A
5746
5747
46f4442e 5748 fp = StateSave(fp, fp->fPatIdx-1, status);
b75a7d8f
A
5749 }
5750 break;
57a6839d
A
5751
5752
5753
b75a7d8f
A
5754 default:
5755 // Trouble. The compiled pattern contains an entry with an
5756 // unrecognized type tag.
5757 U_ASSERT(FALSE);
5758 }
57a6839d 5759
b75a7d8f 5760 if (U_FAILURE(status)) {
46f4442e 5761 isMatch = FALSE;
b75a7d8f
A
5762 break;
5763 }
5764 }
57a6839d 5765
b75a7d8f
A
5766breakFromLoop:
5767 fMatch = isMatch;
5768 if (isMatch) {
5769 fLastMatchEnd = fMatchEnd;
5770 fMatchStart = startIdx;
5771 fMatchEnd = fp->fInputIdx;
b75a7d8f 5772 }
57a6839d
A
5773
5774#ifdef REGEX_RUN_DEBUG
5775 if (fTraceDebug) {
5776 if (isMatch) {
5777 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
5778 } else {
5779 printf("No match\n\n");
b75a7d8f
A
5780 }
5781 }
57a6839d
A
5782#endif
5783
b75a7d8f 5784 fFrame = fp; // The active stack frame when the engine stopped.
57a6839d
A
5785 // Contains the capture group results that we need to
5786 // access later.
b75a7d8f
A
5787
5788 return;
5789}
5790
5791
374ca955 5792UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
b75a7d8f
A
5793
5794U_NAMESPACE_END
5795
5796#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS