]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/rematch.cpp
ICU-531.48.tar.gz
[apple/icu.git] / icuSources / i18n / rematch.cpp
CommitLineData
b75a7d8f
A
1/*
2**************************************************************************
57a6839d 3* Copyright (C) 2002-2013 International Business Machines Corporation *
b75a7d8f
A
4* and others. All rights reserved. *
5**************************************************************************
6*/
46f4442e
A
7//
8// file: rematch.cpp
9//
10// Contains the implementation of class RegexMatcher,
11// which is one of the main API classes for the ICU regular expression package.
12//
b75a7d8f
A
13
14#include "unicode/utypes.h"
15#if !UCONFIG_NO_REGULAR_EXPRESSIONS
16
17#include "unicode/regex.h"
18#include "unicode/uniset.h"
19#include "unicode/uchar.h"
20#include "unicode/ustring.h"
374ca955 21#include "unicode/rbbi.h"
4388f060
A
22#include "unicode/utf.h"
23#include "unicode/utf16.h"
b75a7d8f
A
24#include "uassert.h"
25#include "cmemory.h"
26#include "uvector.h"
27#include "uvectr32.h"
729e4ab9 28#include "uvectr64.h"
b75a7d8f
A
29#include "regeximp.h"
30#include "regexst.h"
729e4ab9
A
31#include "regextxt.h"
32#include "ucase.h"
b75a7d8f
A
33
34// #include <malloc.h> // Needed for heapcheck testing
35
729e4ab9
A
36
37// Find progress callback
38// ----------------------
39// Macro to inline test & call to ReportFindProgress(). Eliminates unnecessary function call.
40//
41#define REGEXFINDPROGRESS_INTERRUPT(pos, status) \
42 (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE)
43
44
45// Smart Backtracking
46// ------------------
47// When a failure would go back to a LOOP_C instruction,
48// strings, characters, and setrefs scan backwards for a valid start
49// character themselves, pop the stack, and save state, emulating the
50// LOOP_C's effect but assured that the next character of input is a
51// possible matching character.
52//
53// Good idea in theory; unfortunately it only helps out a few specific
54// cases and slows the engine down a little in the rest.
55
b75a7d8f
A
56U_NAMESPACE_BEGIN
57
46f4442e
A
58// Default limit for the size of the back track stack, to avoid system
59// failures causedby heap exhaustion. Units are in 32 bit words, not bytes.
60// This value puts ICU's limits higher than most other regexp implementations,
61// which use recursion rather than the heap, and take more storage per
62// backtrack point.
63//
64static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
65
66// Time limit counter constant.
67// Time limits for expression evaluation are in terms of quanta of work by
68// the engine, each of which is 10,000 state saves.
69// This constant determines that state saves per tick number.
70static const int32_t TIMER_INITIAL_VALUE = 10000;
71
b75a7d8f
A
72//-----------------------------------------------------------------------------
73//
74// Constructor and Destructor
75//
76//-----------------------------------------------------------------------------
57a6839d 77RegexMatcher::RegexMatcher(const RegexPattern *pat) {
46f4442e
A
78 fDeferredStatus = U_ZERO_ERROR;
79 init(fDeferredStatus);
80 if (U_FAILURE(fDeferredStatus)) {
81 return;
82 }
b75a7d8f
A
83 if (pat==NULL) {
84 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
85 return;
86 }
46f4442e 87 fPattern = pat;
729e4ab9 88 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
b75a7d8f
A
89}
90
91
92
93RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
94 uint32_t flags, UErrorCode &status) {
46f4442e 95 init(status);
b75a7d8f
A
96 if (U_FAILURE(status)) {
97 return;
98 }
46f4442e
A
99 UParseError pe;
100 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
729e4ab9 101 fPattern = fPatternOwned;
57a6839d 102
729e4ab9
A
103 UText inputText = UTEXT_INITIALIZER;
104 utext_openConstUnicodeString(&inputText, &input, &status);
105 init2(&inputText, status);
106 utext_close(&inputText);
107
57a6839d 108 fInputUniStrMaybeMutable = TRUE;
729e4ab9
A
109}
110
111
112RegexMatcher::RegexMatcher(UText *regexp, UText *input,
113 uint32_t flags, UErrorCode &status) {
114 init(status);
115 if (U_FAILURE(status)) {
116 return;
117 }
118 UParseError pe;
119 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
120 if (U_FAILURE(status)) {
121 return;
122 }
123
46f4442e
A
124 fPattern = fPatternOwned;
125 init2(input, status);
b75a7d8f
A
126}
127
128
57a6839d 129RegexMatcher::RegexMatcher(const UnicodeString &regexp,
b75a7d8f 130 uint32_t flags, UErrorCode &status) {
46f4442e 131 init(status);
b75a7d8f
A
132 if (U_FAILURE(status)) {
133 return;
134 }
46f4442e
A
135 UParseError pe;
136 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
729e4ab9
A
137 if (U_FAILURE(status)) {
138 return;
139 }
140 fPattern = fPatternOwned;
141 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
142}
143
57a6839d 144RegexMatcher::RegexMatcher(UText *regexp,
729e4ab9
A
145 uint32_t flags, UErrorCode &status) {
146 init(status);
147 if (U_FAILURE(status)) {
148 return;
149 }
150 UParseError pe;
151 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
152 if (U_FAILURE(status)) {
153 return;
154 }
155
46f4442e 156 fPattern = fPatternOwned;
729e4ab9 157 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
b75a7d8f
A
158}
159
160
161
46f4442e 162
b75a7d8f
A
163RegexMatcher::~RegexMatcher() {
164 delete fStack;
165 if (fData != fSmallData) {
374ca955 166 uprv_free(fData);
b75a7d8f
A
167 fData = NULL;
168 }
169 if (fPatternOwned) {
170 delete fPatternOwned;
171 fPatternOwned = NULL;
172 fPattern = NULL;
173 }
57a6839d 174
729e4ab9
A
175 if (fInput) {
176 delete fInput;
177 }
178 if (fInputText) {
179 utext_close(fInputText);
180 }
181 if (fAltInputText) {
182 utext_close(fAltInputText);
183 }
57a6839d 184
374ca955
A
185 #if UCONFIG_NO_BREAK_ITERATION==0
186 delete fWordBreakItr;
187 #endif
b75a7d8f
A
188}
189
46f4442e
A
190//
191// init() common initialization for use by all constructors.
192// Initialize all fields, get the object into a consistent state.
193// This must be done even when the initial status shows an error,
194// so that the object is initialized sufficiently well for the destructor
195// to run safely.
196//
197void RegexMatcher::init(UErrorCode &status) {
198 fPattern = NULL;
199 fPatternOwned = NULL;
46f4442e
A
200 fFrameSize = 0;
201 fRegionStart = 0;
202 fRegionLimit = 0;
203 fAnchorStart = 0;
204 fAnchorLimit = 0;
205 fLookStart = 0;
206 fLookLimit = 0;
207 fActiveStart = 0;
208 fActiveLimit = 0;
209 fTransparentBounds = FALSE;
210 fAnchoringBounds = TRUE;
211 fMatch = FALSE;
212 fMatchStart = 0;
213 fMatchEnd = 0;
214 fLastMatchEnd = -1;
215 fAppendPosition = 0;
216 fHitEnd = FALSE;
217 fRequireEnd = FALSE;
218 fStack = NULL;
219 fFrame = NULL;
220 fTimeLimit = 0;
221 fTime = 0;
222 fTickCounter = 0;
223 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY;
224 fCallbackFn = NULL;
225 fCallbackContext = NULL;
729e4ab9
A
226 fFindProgressCallbackFn = NULL;
227 fFindProgressCallbackContext = NULL;
46f4442e
A
228 fTraceDebug = FALSE;
229 fDeferredStatus = status;
230 fData = fSmallData;
231 fWordBreakItr = NULL;
57a6839d 232
4388f060 233 fStack = NULL;
729e4ab9
A
234 fInputText = NULL;
235 fAltInputText = NULL;
236 fInput = NULL;
237 fInputLength = 0;
238 fInputUniStrMaybeMutable = FALSE;
239
46f4442e
A
240 if (U_FAILURE(status)) {
241 fDeferredStatus = status;
242 }
243}
244
245//
246// init2() Common initialization for use by RegexMatcher constructors, part 2.
247// This handles the common setup to be done after the Pattern is available.
248//
729e4ab9 249void RegexMatcher::init2(UText *input, UErrorCode &status) {
46f4442e
A
250 if (U_FAILURE(status)) {
251 fDeferredStatus = status;
252 return;
253 }
254
729e4ab9 255 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
57a6839d 256 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
46f4442e
A
257 if (fData == NULL) {
258 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
259 return;
260 }
261 }
262
4388f060
A
263 fStack = new UVector64(status);
264 if (fStack == NULL) {
265 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
266 return;
267 }
268
46f4442e
A
269 reset(input);
270 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
271 if (U_FAILURE(status)) {
272 fDeferredStatus = status;
273 return;
274 }
275}
b75a7d8f
A
276
277
278static const UChar BACKSLASH = 0x5c;
279static const UChar DOLLARSIGN = 0x24;
280//--------------------------------------------------------------------------------
281//
282// appendReplacement
283//
284//--------------------------------------------------------------------------------
285RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
286 const UnicodeString &replacement,
287 UErrorCode &status) {
729e4ab9 288 UText replacementText = UTEXT_INITIALIZER;
57a6839d 289
729e4ab9 290 utext_openConstUnicodeString(&replacementText, &replacement, &status);
57a6839d 291 if (U_SUCCESS(status)) {
729e4ab9
A
292 UText resultText = UTEXT_INITIALIZER;
293 utext_openUnicodeString(&resultText, &dest, &status);
57a6839d 294
729e4ab9
A
295 if (U_SUCCESS(status)) {
296 appendReplacement(&resultText, &replacementText, status);
297 utext_close(&resultText);
298 }
299 utext_close(&replacementText);
300 }
57a6839d 301
729e4ab9
A
302 return *this;
303}
304
305//
306// appendReplacement, UText mode
307//
308RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
309 UText *replacement,
310 UErrorCode &status) {
b75a7d8f
A
311 if (U_FAILURE(status)) {
312 return *this;
313 }
314 if (U_FAILURE(fDeferredStatus)) {
315 status = fDeferredStatus;
316 return *this;
317 }
318 if (fMatch == FALSE) {
319 status = U_REGEX_INVALID_STATE;
320 return *this;
321 }
57a6839d 322
b75a7d8f 323 // Copy input string from the end of previous match to start of current match
729e4ab9
A
324 int64_t destLen = utext_nativeLength(dest);
325 if (fMatchStart > fAppendPosition) {
326 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
57a6839d 327 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
729e4ab9
A
328 (int32_t)(fMatchStart-fAppendPosition), &status);
329 } else {
330 int32_t len16;
331 if (UTEXT_USES_U16(fInputText)) {
332 len16 = (int32_t)(fMatchStart-fAppendPosition);
333 } else {
334 UErrorCode lengthStatus = U_ZERO_ERROR;
335 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
336 }
337 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
338 if (inputChars == NULL) {
339 status = U_MEMORY_ALLOCATION_ERROR;
340 return *this;
341 }
342 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
343 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
344 uprv_free(inputChars);
345 }
b75a7d8f 346 }
46f4442e 347 fAppendPosition = fMatchEnd;
57a6839d
A
348
349
b75a7d8f
A
350 // scan the replacement text, looking for substitutions ($n) and \escapes.
351 // TODO: optimize this loop by efficiently scanning for '$' or '\',
352 // move entire ranges not containing substitutions.
729e4ab9
A
353 UTEXT_SETNATIVEINDEX(replacement, 0);
354 UChar32 c = UTEXT_NEXT32(replacement);
355 while (c != U_SENTINEL) {
b75a7d8f
A
356 if (c == BACKSLASH) {
357 // Backslash Escape. Copy the following char out without further checks.
358 // Note: Surrogate pairs don't need any special handling
359 // The second half wont be a '$' or a '\', and
360 // will move to the dest normally on the next
361 // loop iteration.
729e4ab9
A
362 c = UTEXT_CURRENT32(replacement);
363 if (c == U_SENTINEL) {
b75a7d8f
A
364 break;
365 }
57a6839d 366
b75a7d8f
A
367 if (c==0x55/*U*/ || c==0x75/*u*/) {
368 // We have a \udddd or \Udddddddd escape sequence.
729e4ab9
A
369 int32_t offset = 0;
370 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
371 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
b75a7d8f 372 if (escapedChar != (UChar32)0xFFFFFFFF) {
729e4ab9
A
373 if (U_IS_BMP(escapedChar)) {
374 UChar c16 = (UChar)escapedChar;
375 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
376 } else {
377 UChar surrogate[2];
378 surrogate[0] = U16_LEAD(escapedChar);
379 surrogate[1] = U16_TRAIL(escapedChar);
380 if (U_SUCCESS(status)) {
381 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
382 }
383 }
b75a7d8f
A
384 // TODO: Report errors for mal-formed \u escapes?
385 // As this is, the original sequence is output, which may be OK.
729e4ab9 386 if (context.lastOffset == offset) {
4388f060 387 (void)UTEXT_PREVIOUS32(replacement);
729e4ab9
A
388 } else if (context.lastOffset != offset-1) {
389 utext_moveIndex32(replacement, offset - context.lastOffset - 1);
390 }
391 }
392 } else {
4388f060 393 (void)UTEXT_NEXT32(replacement);
729e4ab9
A
394 // Plain backslash escape. Just put out the escaped character.
395 if (U_IS_BMP(c)) {
396 UChar c16 = (UChar)c;
397 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
398 } else {
399 UChar surrogate[2];
400 surrogate[0] = U16_LEAD(c);
401 surrogate[1] = U16_TRAIL(c);
402 if (U_SUCCESS(status)) {
403 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
404 }
b75a7d8f
A
405 }
406 }
729e4ab9 407 } else if (c != DOLLARSIGN) {
b75a7d8f 408 // Normal char, not a $. Copy it out without further checks.
729e4ab9
A
409 if (U_IS_BMP(c)) {
410 UChar c16 = (UChar)c;
411 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
412 } else {
413 UChar surrogate[2];
414 surrogate[0] = U16_LEAD(c);
415 surrogate[1] = U16_TRAIL(c);
416 if (U_SUCCESS(status)) {
417 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
418 }
b75a7d8f 419 }
729e4ab9
A
420 } else {
421 // We've got a $. Pick up a capture group number if one follows.
422 // Consume at most the number of digits necessary for the largest capture
423 // number that is valid for this pattern.
57a6839d 424
729e4ab9
A
425 int32_t numDigits = 0;
426 int32_t groupNum = 0;
427 UChar32 digitC;
428 for (;;) {
429 digitC = UTEXT_CURRENT32(replacement);
430 if (digitC == U_SENTINEL) {
431 break;
432 }
433 if (u_isdigit(digitC) == FALSE) {
434 break;
435 }
4388f060 436 (void)UTEXT_NEXT32(replacement);
729e4ab9
A
437 groupNum=groupNum*10 + u_charDigitValue(digitC);
438 numDigits++;
439 if (numDigits >= fPattern->fMaxCaptureDigits) {
440 break;
441 }
b75a7d8f 442 }
57a6839d
A
443
444
729e4ab9
A
445 if (numDigits == 0) {
446 // The $ didn't introduce a group number at all.
447 // Treat it as just part of the substitution text.
448 UChar c16 = DOLLARSIGN;
449 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
450 } else {
451 // Finally, append the capture group data to the destination.
452 destLen += appendGroup(groupNum, dest, status);
453 if (U_FAILURE(status)) {
454 // Can fail if group number is out of range.
455 break;
456 }
b75a7d8f
A
457 }
458 }
57a6839d 459
b75a7d8f 460 if (U_FAILURE(status)) {
b75a7d8f 461 break;
729e4ab9
A
462 } else {
463 c = UTEXT_NEXT32(replacement);
b75a7d8f 464 }
b75a7d8f 465 }
57a6839d 466
b75a7d8f
A
467 return *this;
468}
469
470
471
472//--------------------------------------------------------------------------------
473//
474// appendTail Intended to be used in conjunction with appendReplacement()
475// To the destination string, append everything following
476// the last match position from the input string.
477//
46f4442e
A
478// Note: Match ranges do not affect appendTail or appendReplacement
479//
b75a7d8f
A
480//--------------------------------------------------------------------------------
481UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
729e4ab9
A
482 UErrorCode status = U_ZERO_ERROR;
483 UText resultText = UTEXT_INITIALIZER;
484 utext_openUnicodeString(&resultText, &dest, &status);
57a6839d 485
729e4ab9
A
486 if (U_SUCCESS(status)) {
487 appendTail(&resultText, status);
488 utext_close(&resultText);
489 }
57a6839d 490
729e4ab9
A
491 return dest;
492}
493
494//
495// appendTail, UText mode
496//
497UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
729e4ab9 498 if (U_FAILURE(status)) {
57a6839d 499 return dest;
729e4ab9
A
500 }
501 if (U_FAILURE(fDeferredStatus)) {
502 status = fDeferredStatus;
57a6839d 503 return dest;
729e4ab9 504 }
57a6839d 505
729e4ab9
A
506 if (fInputLength > fAppendPosition) {
507 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
508 int64_t destLen = utext_nativeLength(dest);
57a6839d 509 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
729e4ab9
A
510 (int32_t)(fInputLength-fAppendPosition), &status);
511 } else {
512 int32_t len16;
513 if (UTEXT_USES_U16(fInputText)) {
514 len16 = (int32_t)(fInputLength-fAppendPosition);
515 } else {
516 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
517 status = U_ZERO_ERROR; // buffer overflow
518 }
57a6839d 519
729e4ab9
A
520 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
521 if (inputChars == NULL) {
522 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
523 } else {
57a6839d 524 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
729e4ab9
A
525 int64_t destLen = utext_nativeLength(dest);
526 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
527 uprv_free(inputChars);
528 }
529 }
b75a7d8f
A
530 }
531 return dest;
532}
533
534
535
536//--------------------------------------------------------------------------------
537//
538// end
539//
540//--------------------------------------------------------------------------------
541int32_t RegexMatcher::end(UErrorCode &err) const {
542 return end(0, err);
543}
544
729e4ab9
A
545int64_t RegexMatcher::end64(UErrorCode &err) const {
546 return end64(0, err);
547}
b75a7d8f 548
729e4ab9 549int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
b75a7d8f
A
550 if (U_FAILURE(err)) {
551 return -1;
552 }
553 if (fMatch == FALSE) {
554 err = U_REGEX_INVALID_STATE;
555 return -1;
556 }
557 if (group < 0 || group > fPattern->fGroupMap->size()) {
558 err = U_INDEX_OUTOFBOUNDS_ERROR;
559 return -1;
560 }
729e4ab9 561 int64_t e = -1;
b75a7d8f 562 if (group == 0) {
57a6839d 563 e = fMatchEnd;
b75a7d8f
A
564 } else {
565 // Get the position within the stack frame of the variables for
566 // this capture group.
567 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
568 U_ASSERT(groupOffset < fPattern->fFrameSize);
569 U_ASSERT(groupOffset >= 0);
570 e = fFrame->fExtra[groupOffset + 1];
571 }
57a6839d 572
729e4ab9 573 return e;
b75a7d8f
A
574}
575
729e4ab9
A
576int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
577 return (int32_t)end64(group, err);
578}
b75a7d8f
A
579
580
581//--------------------------------------------------------------------------------
582//
583// find()
584//
585//--------------------------------------------------------------------------------
586UBool RegexMatcher::find() {
587 // Start at the position of the last match end. (Will be zero if the
729e4ab9 588 // matcher has been reset.)
b75a7d8f
A
589 //
590 if (U_FAILURE(fDeferredStatus)) {
591 return FALSE;
592 }
57a6839d 593
729e4ab9
A
594 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
595 return findUsingChunk();
596 }
b75a7d8f 597
729e4ab9 598 int64_t startPos = fMatchEnd;
46f4442e
A
599 if (startPos==0) {
600 startPos = fActiveStart;
601 }
374ca955
A
602
603 if (fMatch) {
604 // Save the position of any previous successful match.
605 fLastMatchEnd = fMatchEnd;
606
607 if (fMatchStart == fMatchEnd) {
608 // Previous match had zero length. Move start position up one position
609 // to avoid sending find() into a loop on zero-length matches.
46f4442e 610 if (startPos >= fActiveLimit) {
374ca955 611 fMatch = FALSE;
46f4442e 612 fHitEnd = TRUE;
374ca955
A
613 return FALSE;
614 }
729e4ab9 615 UTEXT_SETNATIVEINDEX(fInputText, startPos);
4388f060 616 (void)UTEXT_NEXT32(fInputText);
729e4ab9 617 startPos = UTEXT_GETNATIVEINDEX(fInputText);
374ca955
A
618 }
619 } else {
620 if (fLastMatchEnd >= 0) {
621 // A previous find() failed to match. Don't try again.
622 // (without this test, a pattern with a zero-length match
623 // could match again at the end of an input string.)
46f4442e 624 fHitEnd = TRUE;
374ca955
A
625 return FALSE;
626 }
627 }
628
374ca955
A
629
630 // Compute the position in the input string beyond which a match can not begin, because
631 // the minimum length match would extend past the end of the input.
46f4442e
A
632 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
633 // Be aware of possible overflows if making changes here.
729e4ab9
A
634 int64_t testStartLimit;
635 if (UTEXT_USES_U16(fInputText)) {
636 testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
637 if (startPos > testStartLimit) {
638 fMatch = FALSE;
639 fHitEnd = TRUE;
640 return FALSE;
641 }
642 } else {
643 // For now, let the matcher discover that it can't match on its own
644 // We don't know how long the match len is in native characters
645 testStartLimit = fActiveLimit;
b75a7d8f
A
646 }
647
b75a7d8f
A
648 UChar32 c;
649 U_ASSERT(startPos >= 0);
650
651 switch (fPattern->fStartType) {
652 case START_NO_INFO:
57a6839d 653 // No optimization was found.
b75a7d8f
A
654 // Try a match at each input position.
655 for (;;) {
46f4442e 656 MatchAt(startPos, FALSE, fDeferredStatus);
b75a7d8f
A
657 if (U_FAILURE(fDeferredStatus)) {
658 return FALSE;
659 }
660 if (fMatch) {
661 return TRUE;
662 }
729e4ab9 663 if (startPos >= testStartLimit) {
46f4442e 664 fHitEnd = TRUE;
b75a7d8f
A
665 return FALSE;
666 }
729e4ab9 667 UTEXT_SETNATIVEINDEX(fInputText, startPos);
4388f060 668 (void)UTEXT_NEXT32(fInputText);
729e4ab9 669 startPos = UTEXT_GETNATIVEINDEX(fInputText);
b75a7d8f
A
670 // Note that it's perfectly OK for a pattern to have a zero-length
671 // match at the end of a string, so we must make sure that the loop
729e4ab9
A
672 // runs with startPos == testStartLimit the last time through.
673 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
674 return FALSE;
b75a7d8f
A
675 }
676 U_ASSERT(FALSE);
677
678 case START_START:
679 // Matches are only possible at the start of the input string
680 // (pattern begins with ^ or \A)
46f4442e 681 if (startPos > fActiveStart) {
374ca955 682 fMatch = FALSE;
b75a7d8f
A
683 return FALSE;
684 }
46f4442e 685 MatchAt(startPos, FALSE, fDeferredStatus);
b75a7d8f
A
686 if (U_FAILURE(fDeferredStatus)) {
687 return FALSE;
688 }
689 return fMatch;
690
691
692 case START_SET:
693 {
694 // Match may start on any char from a pre-computed set.
695 U_ASSERT(fPattern->fMinMatchLen > 0);
729e4ab9
A
696 int64_t pos;
697 UTEXT_SETNATIVEINDEX(fInputText, startPos);
b75a7d8f 698 for (;;) {
729e4ab9
A
699 c = UTEXT_NEXT32(fInputText);
700 pos = UTEXT_GETNATIVEINDEX(fInputText);
701 // c will be -1 (U_SENTINEL) at end of text, in which case we
702 // skip this next block (so we don't have a negative array index)
703 // and handle end of text in the following block.
704 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
705 (c>=256 && fPattern->fInitialChars->contains(c)))) {
706 MatchAt(startPos, FALSE, fDeferredStatus);
b75a7d8f
A
707 if (U_FAILURE(fDeferredStatus)) {
708 return FALSE;
709 }
710 if (fMatch) {
711 return TRUE;
712 }
729e4ab9 713 UTEXT_SETNATIVEINDEX(fInputText, pos);
b75a7d8f 714 }
729e4ab9 715 if (startPos >= testStartLimit) {
374ca955 716 fMatch = FALSE;
46f4442e 717 fHitEnd = TRUE;
b75a7d8f
A
718 return FALSE;
719 }
729e4ab9
A
720 startPos = pos;
721 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
722 return FALSE;
b75a7d8f
A
723 }
724 }
725 U_ASSERT(FALSE);
726
727 case START_STRING:
728 case START_CHAR:
729 {
730 // Match starts on exactly one char.
731 U_ASSERT(fPattern->fMinMatchLen > 0);
732 UChar32 theChar = fPattern->fInitialChar;
729e4ab9
A
733 int64_t pos;
734 UTEXT_SETNATIVEINDEX(fInputText, startPos);
b75a7d8f 735 for (;;) {
729e4ab9
A
736 c = UTEXT_NEXT32(fInputText);
737 pos = UTEXT_GETNATIVEINDEX(fInputText);
b75a7d8f 738 if (c == theChar) {
729e4ab9 739 MatchAt(startPos, FALSE, fDeferredStatus);
b75a7d8f
A
740 if (U_FAILURE(fDeferredStatus)) {
741 return FALSE;
742 }
743 if (fMatch) {
744 return TRUE;
745 }
729e4ab9 746 UTEXT_SETNATIVEINDEX(fInputText, pos);
b75a7d8f 747 }
729e4ab9 748 if (startPos >= testStartLimit) {
374ca955 749 fMatch = FALSE;
46f4442e 750 fHitEnd = TRUE;
b75a7d8f
A
751 return FALSE;
752 }
729e4ab9
A
753 startPos = pos;
754 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
755 return FALSE;
756 }
b75a7d8f
A
757 }
758 U_ASSERT(FALSE);
759
760 case START_LINE:
761 {
762 UChar32 c;
46f4442e
A
763 if (startPos == fAnchorStart) {
764 MatchAt(startPos, FALSE, fDeferredStatus);
b75a7d8f
A
765 if (U_FAILURE(fDeferredStatus)) {
766 return FALSE;
767 }
768 if (fMatch) {
769 return TRUE;
770 }
729e4ab9
A
771 UTEXT_SETNATIVEINDEX(fInputText, startPos);
772 c = UTEXT_NEXT32(fInputText);
773 startPos = UTEXT_GETNATIVEINDEX(fInputText);
774 } else {
775 UTEXT_SETNATIVEINDEX(fInputText, startPos);
776 c = UTEXT_PREVIOUS32(fInputText);
777 UTEXT_SETNATIVEINDEX(fInputText, startPos);
b75a7d8f
A
778 }
779
46f4442e 780 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
729e4ab9 781 for (;;) {
46f4442e
A
782 if (c == 0x0a) {
783 MatchAt(startPos, FALSE, fDeferredStatus);
784 if (U_FAILURE(fDeferredStatus)) {
785 return FALSE;
786 }
787 if (fMatch) {
788 return TRUE;
789 }
729e4ab9 790 UTEXT_SETNATIVEINDEX(fInputText, startPos);
46f4442e 791 }
729e4ab9 792 if (startPos >= testStartLimit) {
46f4442e
A
793 fMatch = FALSE;
794 fHitEnd = TRUE;
795 return FALSE;
796 }
729e4ab9
A
797 c = UTEXT_NEXT32(fInputText);
798 startPos = UTEXT_GETNATIVEINDEX(fInputText);
46f4442e
A
799 // Note that it's perfectly OK for a pattern to have a zero-length
800 // match at the end of a string, so we must make sure that the loop
729e4ab9
A
801 // runs with startPos == testStartLimit the last time through.
802 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
803 return FALSE;
b75a7d8f 804 }
46f4442e
A
805 } else {
806 for (;;) {
46f4442e
A
807 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
808 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
729e4ab9 809 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
4388f060 810 (void)UTEXT_NEXT32(fInputText);
729e4ab9 811 startPos = UTEXT_GETNATIVEINDEX(fInputText);
46f4442e
A
812 }
813 MatchAt(startPos, FALSE, fDeferredStatus);
814 if (U_FAILURE(fDeferredStatus)) {
815 return FALSE;
816 }
817 if (fMatch) {
818 return TRUE;
819 }
729e4ab9 820 UTEXT_SETNATIVEINDEX(fInputText, startPos);
46f4442e 821 }
729e4ab9 822 if (startPos >= testStartLimit) {
46f4442e
A
823 fMatch = FALSE;
824 fHitEnd = TRUE;
825 return FALSE;
826 }
729e4ab9
A
827 c = UTEXT_NEXT32(fInputText);
828 startPos = UTEXT_GETNATIVEINDEX(fInputText);
46f4442e
A
829 // Note that it's perfectly OK for a pattern to have a zero-length
830 // match at the end of a string, so we must make sure that the loop
729e4ab9
A
831 // runs with startPos == testStartLimit the last time through.
832 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
833 return FALSE;
b75a7d8f 834 }
b75a7d8f
A
835 }
836 }
837
838 default:
839 U_ASSERT(FALSE);
840 }
841
842 U_ASSERT(FALSE);
843 return FALSE;
844}
845
846
847
729e4ab9 848UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
b75a7d8f
A
849 if (U_FAILURE(status)) {
850 return FALSE;
851 }
852 if (U_FAILURE(fDeferredStatus)) {
853 status = fDeferredStatus;
854 return FALSE;
855 }
46f4442e
A
856 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
857 // This will reset the region to be the full input length.
729e4ab9
A
858 if (start < 0) {
859 status = U_INDEX_OUTOFBOUNDS_ERROR;
860 return FALSE;
861 }
57a6839d 862
729e4ab9
A
863 int64_t nativeStart = start;
864 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
b75a7d8f
A
865 status = U_INDEX_OUTOFBOUNDS_ERROR;
866 return FALSE;
867 }
57a6839d 868 fMatchEnd = nativeStart;
b75a7d8f
A
869 return find();
870}
871
872
b75a7d8f
A
873//--------------------------------------------------------------------------------
874//
729e4ab9
A
875// findUsingChunk() -- like find(), but with the advance knowledge that the
876// entire string is available in the UText's chunk buffer.
b75a7d8f
A
877//
878//--------------------------------------------------------------------------------
729e4ab9
A
879UBool RegexMatcher::findUsingChunk() {
880 // Start at the position of the last match end. (Will be zero if the
881 // matcher has been reset.
882 //
b75a7d8f 883
729e4ab9
A
884 int32_t startPos = (int32_t)fMatchEnd;
885 if (startPos==0) {
886 startPos = (int32_t)fActiveStart;
b75a7d8f 887 }
57a6839d 888
729e4ab9 889 const UChar *inputBuf = fInputText->chunkContents;
b75a7d8f 890
729e4ab9
A
891 if (fMatch) {
892 // Save the position of any previous successful match.
893 fLastMatchEnd = fMatchEnd;
57a6839d 894
729e4ab9
A
895 if (fMatchStart == fMatchEnd) {
896 // Previous match had zero length. Move start position up one position
897 // to avoid sending find() into a loop on zero-length matches.
898 if (startPos >= fActiveLimit) {
899 fMatch = FALSE;
900 fHitEnd = TRUE;
901 return FALSE;
902 }
903 U16_FWD_1(inputBuf, startPos, fInputLength);
904 }
905 } else {
906 if (fLastMatchEnd >= 0) {
907 // A previous find() failed to match. Don't try again.
908 // (without this test, a pattern with a zero-length match
909 // could match again at the end of an input string.)
910 fHitEnd = TRUE;
911 return FALSE;
912 }
b75a7d8f 913 }
57a6839d
A
914
915
729e4ab9
A
916 // Compute the position in the input string beyond which a match can not begin, because
917 // the minimum length match would extend past the end of the input.
918 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
919 // Be aware of possible overflows if making changes here.
920 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
921 if (startPos > testLen) {
922 fMatch = FALSE;
923 fHitEnd = TRUE;
b75a7d8f
A
924 return FALSE;
925 }
57a6839d 926
729e4ab9
A
927 UChar32 c;
928 U_ASSERT(startPos >= 0);
57a6839d 929
729e4ab9
A
930 switch (fPattern->fStartType) {
931 case START_NO_INFO:
57a6839d 932 // No optimization was found.
729e4ab9
A
933 // Try a match at each input position.
934 for (;;) {
935 MatchChunkAt(startPos, FALSE, fDeferredStatus);
936 if (U_FAILURE(fDeferredStatus)) {
937 return FALSE;
938 }
939 if (fMatch) {
940 return TRUE;
941 }
942 if (startPos >= testLen) {
943 fHitEnd = TRUE;
944 return FALSE;
945 }
946 U16_FWD_1(inputBuf, startPos, fActiveLimit);
947 // Note that it's perfectly OK for a pattern to have a zero-length
948 // match at the end of a string, so we must make sure that the loop
949 // runs with startPos == testLen the last time through.
950 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
951 return FALSE;
952 }
953 U_ASSERT(FALSE);
57a6839d 954
729e4ab9
A
955 case START_START:
956 // Matches are only possible at the start of the input string
957 // (pattern begins with ^ or \A)
958 if (startPos > fActiveStart) {
959 fMatch = FALSE;
960 return FALSE;
961 }
962 MatchChunkAt(startPos, FALSE, fDeferredStatus);
963 if (U_FAILURE(fDeferredStatus)) {
964 return FALSE;
965 }
966 return fMatch;
57a6839d
A
967
968
729e4ab9
A
969 case START_SET:
970 {
971 // Match may start on any char from a pre-computed set.
972 U_ASSERT(fPattern->fMinMatchLen > 0);
973 for (;;) {
974 int32_t pos = startPos;
975 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
976 if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
977 (c>=256 && fPattern->fInitialChars->contains(c))) {
978 MatchChunkAt(pos, FALSE, fDeferredStatus);
979 if (U_FAILURE(fDeferredStatus)) {
980 return FALSE;
981 }
982 if (fMatch) {
983 return TRUE;
984 }
985 }
986 if (pos >= testLen) {
987 fMatch = FALSE;
988 fHitEnd = TRUE;
989 return FALSE;
990 }
991 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
992 return FALSE;
993 }
b75a7d8f 994 }
729e4ab9 995 U_ASSERT(FALSE);
57a6839d 996
729e4ab9
A
997 case START_STRING:
998 case START_CHAR:
999 {
1000 // Match starts on exactly one char.
1001 U_ASSERT(fPattern->fMinMatchLen > 0);
1002 UChar32 theChar = fPattern->fInitialChar;
1003 for (;;) {
1004 int32_t pos = startPos;
1005 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1006 if (c == theChar) {
1007 MatchChunkAt(pos, FALSE, fDeferredStatus);
1008 if (U_FAILURE(fDeferredStatus)) {
1009 return FALSE;
1010 }
1011 if (fMatch) {
1012 return TRUE;
1013 }
1014 }
1015 if (pos >= testLen) {
1016 fMatch = FALSE;
1017 fHitEnd = TRUE;
1018 return FALSE;
1019 }
1020 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1021 return FALSE;
1022 }
1023 }
1024 U_ASSERT(FALSE);
57a6839d 1025
729e4ab9
A
1026 case START_LINE:
1027 {
1028 UChar32 c;
1029 if (startPos == fAnchorStart) {
1030 MatchChunkAt(startPos, FALSE, fDeferredStatus);
1031 if (U_FAILURE(fDeferredStatus)) {
1032 return FALSE;
1033 }
1034 if (fMatch) {
1035 return TRUE;
1036 }
1037 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1038 }
57a6839d 1039
729e4ab9
A
1040 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1041 for (;;) {
1042 c = inputBuf[startPos-1];
1043 if (c == 0x0a) {
1044 MatchChunkAt(startPos, FALSE, fDeferredStatus);
1045 if (U_FAILURE(fDeferredStatus)) {
1046 return FALSE;
1047 }
1048 if (fMatch) {
1049 return TRUE;
1050 }
1051 }
1052 if (startPos >= testLen) {
1053 fMatch = FALSE;
1054 fHitEnd = TRUE;
1055 return FALSE;
1056 }
1057 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1058 // Note that it's perfectly OK for a pattern to have a zero-length
1059 // match at the end of a string, so we must make sure that the loop
1060 // runs with startPos == testLen the last time through.
1061 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1062 return FALSE;
1063 }
1064 } else {
1065 for (;;) {
1066 c = inputBuf[startPos-1];
1067 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
1068 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
1069 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1070 startPos++;
1071 }
1072 MatchChunkAt(startPos, FALSE, fDeferredStatus);
1073 if (U_FAILURE(fDeferredStatus)) {
1074 return FALSE;
1075 }
1076 if (fMatch) {
1077 return TRUE;
1078 }
1079 }
1080 if (startPos >= testLen) {
1081 fMatch = FALSE;
1082 fHitEnd = TRUE;
1083 return FALSE;
1084 }
1085 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1086 // Note that it's perfectly OK for a pattern to have a zero-length
1087 // match at the end of a string, so we must make sure that the loop
1088 // runs with startPos == testLen the last time through.
1089 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1090 return FALSE;
1091 }
1092 }
1093 }
57a6839d 1094
729e4ab9
A
1095 default:
1096 U_ASSERT(FALSE);
1097 }
57a6839d 1098
729e4ab9
A
1099 U_ASSERT(FALSE);
1100 return FALSE;
1101}
1102
1103
1104
1105//--------------------------------------------------------------------------------
1106//
1107// group()
1108//
1109//--------------------------------------------------------------------------------
1110UnicodeString RegexMatcher::group(UErrorCode &status) const {
1111 return group(0, status);
b75a7d8f
A
1112}
1113
729e4ab9
A
1114// Return immutable shallow clone
1115UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1116 return group(0, dest, group_len, status);
1117}
b75a7d8f 1118
729e4ab9
A
1119// Return immutable shallow clone
1120UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1121 group_len = 0;
374ca955 1122 if (U_FAILURE(status)) {
729e4ab9 1123 return dest;
374ca955
A
1124 }
1125 if (U_FAILURE(fDeferredStatus)) {
1126 status = fDeferredStatus;
57a6839d 1127 } else if (fMatch == FALSE) {
729e4ab9 1128 status = U_REGEX_INVALID_STATE;
57a6839d 1129 } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
374ca955 1130 status = U_INDEX_OUTOFBOUNDS_ERROR;
374ca955 1131 }
57a6839d
A
1132
1133 if (U_FAILURE(status)) {
1134 return dest;
729e4ab9 1135 }
57a6839d 1136
729e4ab9
A
1137 int64_t s, e;
1138 if (groupNum == 0) {
1139 s = fMatchStart;
1140 e = fMatchEnd;
1141 } else {
1142 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1143 U_ASSERT(groupOffset < fPattern->fFrameSize);
1144 U_ASSERT(groupOffset >= 0);
1145 s = fFrame->fExtra[groupOffset];
1146 e = fFrame->fExtra[groupOffset+1];
1147 }
1148
1149 if (s < 0) {
1150 // A capture group wasn't part of the match
1151 return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1152 }
1153 U_ASSERT(s <= e);
1154 group_len = e - s;
57a6839d 1155
729e4ab9
A
1156 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1157 if (dest)
1158 UTEXT_SETNATIVEINDEX(dest, s);
1159 return dest;
374ca955
A
1160}
1161
729e4ab9
A
1162UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1163 UnicodeString result;
1164 if (U_FAILURE(status)) {
1165 return result;
1166 }
1167 UText resultText = UTEXT_INITIALIZER;
1168 utext_openUnicodeString(&resultText, &result, &status);
1169 group(groupNum, &resultText, status);
1170 utext_close(&resultText);
1171 return result;
1172}
374ca955 1173
b75a7d8f 1174
729e4ab9
A
1175// Return deep (mutable) clone
1176// Technology Preview (as an API), but note that the UnicodeString API is implemented
1177// using this function.
1178UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {
b75a7d8f 1179 if (U_FAILURE(status)) {
729e4ab9 1180 return dest;
b75a7d8f 1181 }
57a6839d 1182
b75a7d8f
A
1183 if (U_FAILURE(fDeferredStatus)) {
1184 status = fDeferredStatus;
57a6839d 1185 } else if (fMatch == FALSE) {
729e4ab9 1186 status = U_REGEX_INVALID_STATE;
57a6839d 1187 } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
729e4ab9 1188 status = U_INDEX_OUTOFBOUNDS_ERROR;
729e4ab9 1189 }
57a6839d
A
1190 if (U_FAILURE(status)) {
1191 return dest;
729e4ab9 1192 }
57a6839d 1193
729e4ab9
A
1194 int64_t s, e;
1195 if (groupNum == 0) {
1196 s = fMatchStart;
1197 e = fMatchEnd;
1198 } else {
1199 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1200 U_ASSERT(groupOffset < fPattern->fFrameSize);
1201 U_ASSERT(groupOffset >= 0);
1202 s = fFrame->fExtra[groupOffset];
1203 e = fFrame->fExtra[groupOffset+1];
1204 }
57a6839d 1205
729e4ab9 1206 if (s < 0) {
57a6839d 1207 // A capture group wasn't part of the match
729e4ab9
A
1208 if (dest) {
1209 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1210 return dest;
1211 } else {
1212 return utext_openUChars(NULL, NULL, 0, &status);
1213 }
1214 }
1215 U_ASSERT(s <= e);
57a6839d 1216
729e4ab9
A
1217 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1218 U_ASSERT(e <= fInputLength);
1219 if (dest) {
1220 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status);
1221 } else {
1222 UText groupText = UTEXT_INITIALIZER;
1223 utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status);
1224 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1225 utext_close(&groupText);
1226 }
1227 } else {
1228 int32_t len16;
1229 if (UTEXT_USES_U16(fInputText)) {
1230 len16 = (int32_t)(e-s);
1231 } else {
1232 UErrorCode lengthStatus = U_ZERO_ERROR;
1233 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1234 }
1235 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1236 if (groupChars == NULL) {
1237 status = U_MEMORY_ALLOCATION_ERROR;
1238 return dest;
1239 }
1240 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1241
1242 if (dest) {
1243 utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);
1244 } else {
1245 UText groupText = UTEXT_INITIALIZER;
1246 utext_openUChars(&groupText, groupChars, len16, &status);
1247 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1248 utext_close(&groupText);
1249 }
57a6839d 1250
729e4ab9
A
1251 uprv_free(groupChars);
1252 }
1253 return dest;
b75a7d8f
A
1254}
1255
729e4ab9
A
1256//--------------------------------------------------------------------------------
1257//
1258// appendGroup() -- currently internal only, appends a group to a UText rather
1259// than replacing its contents
1260//
1261//--------------------------------------------------------------------------------
b75a7d8f 1262
729e4ab9 1263int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
374ca955 1264 if (U_FAILURE(status)) {
729e4ab9 1265 return 0;
374ca955
A
1266 }
1267 if (U_FAILURE(fDeferredStatus)) {
1268 status = fDeferredStatus;
729e4ab9 1269 return 0;
374ca955 1270 }
729e4ab9 1271 int64_t destLen = utext_nativeLength(dest);
57a6839d 1272
729e4ab9
A
1273 if (fMatch == FALSE) {
1274 status = U_REGEX_INVALID_STATE;
1275 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1276 }
1277 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
374ca955 1278 status = U_INDEX_OUTOFBOUNDS_ERROR;
729e4ab9 1279 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
374ca955 1280 }
57a6839d 1281
729e4ab9
A
1282 int64_t s, e;
1283 if (groupNum == 0) {
1284 s = fMatchStart;
1285 e = fMatchEnd;
1286 } else {
1287 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1288 U_ASSERT(groupOffset < fPattern->fFrameSize);
1289 U_ASSERT(groupOffset >= 0);
1290 s = fFrame->fExtra[groupOffset];
1291 e = fFrame->fExtra[groupOffset+1];
1292 }
57a6839d 1293
729e4ab9 1294 if (s < 0) {
57a6839d 1295 // A capture group wasn't part of the match
729e4ab9
A
1296 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1297 }
1298 U_ASSERT(s <= e);
57a6839d 1299
729e4ab9
A
1300 int64_t deltaLen;
1301 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1302 U_ASSERT(e <= fInputLength);
1303 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1304 } else {
1305 int32_t len16;
1306 if (UTEXT_USES_U16(fInputText)) {
1307 len16 = (int32_t)(e-s);
1308 } else {
1309 UErrorCode lengthStatus = U_ZERO_ERROR;
1310 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1311 }
1312 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1313 if (groupChars == NULL) {
1314 status = U_MEMORY_ALLOCATION_ERROR;
1315 return 0;
1316 }
1317 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
57a6839d 1318
729e4ab9
A
1319 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1320 uprv_free(groupChars);
1321 }
1322 return deltaLen;
374ca955
A
1323}
1324
b75a7d8f
A
1325
1326
46f4442e
A
1327//--------------------------------------------------------------------------------
1328//
729e4ab9 1329// groupCount()
46f4442e
A
1330//
1331//--------------------------------------------------------------------------------
729e4ab9
A
1332int32_t RegexMatcher::groupCount() const {
1333 return fPattern->fGroupMap->size();
b75a7d8f
A
1334}
1335
1336
1337
46f4442e
A
1338//--------------------------------------------------------------------------------
1339//
729e4ab9
A
1340// hasAnchoringBounds()
1341//
1342//--------------------------------------------------------------------------------
1343UBool RegexMatcher::hasAnchoringBounds() const {
1344 return fAnchoringBounds;
1345}
1346
1347
1348//--------------------------------------------------------------------------------
1349//
1350// hasTransparentBounds()
1351//
1352//--------------------------------------------------------------------------------
1353UBool RegexMatcher::hasTransparentBounds() const {
1354 return fTransparentBounds;
1355}
1356
1357
1358
1359//--------------------------------------------------------------------------------
1360//
1361// hitEnd()
1362//
1363//--------------------------------------------------------------------------------
1364UBool RegexMatcher::hitEnd() const {
1365 return fHitEnd;
1366}
1367
1368
1369//--------------------------------------------------------------------------------
1370//
1371// input()
1372//
1373//--------------------------------------------------------------------------------
1374const UnicodeString &RegexMatcher::input() const {
1375 if (!fInput) {
1376 UErrorCode status = U_ZERO_ERROR;
1377 int32_t len16;
1378 if (UTEXT_USES_U16(fInputText)) {
1379 len16 = (int32_t)fInputLength;
1380 } else {
1381 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1382 status = U_ZERO_ERROR; // overflow, length status
1383 }
1384 UnicodeString *result = new UnicodeString(len16, 0, 0);
57a6839d 1385
729e4ab9
A
1386 UChar *inputChars = result->getBuffer(len16);
1387 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1388 result->releaseBuffer(len16);
57a6839d 1389
729e4ab9
A
1390 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1391 }
57a6839d 1392
729e4ab9
A
1393 return *fInput;
1394}
1395
1396//--------------------------------------------------------------------------------
1397//
1398// inputText()
1399//
1400//--------------------------------------------------------------------------------
1401UText *RegexMatcher::inputText() const {
1402 return fInputText;
1403}
1404
1405
1406//--------------------------------------------------------------------------------
1407//
1408// getInput() -- like inputText(), but makes a clone or copies into another UText
1409//
1410//--------------------------------------------------------------------------------
1411UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
729e4ab9
A
1412 if (U_FAILURE(status)) {
1413 return dest;
1414 }
1415 if (U_FAILURE(fDeferredStatus)) {
1416 status = fDeferredStatus;
57a6839d 1417 return dest;
729e4ab9 1418 }
57a6839d 1419
729e4ab9
A
1420 if (dest) {
1421 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1422 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1423 } else {
1424 int32_t input16Len;
1425 if (UTEXT_USES_U16(fInputText)) {
1426 input16Len = (int32_t)fInputLength;
1427 } else {
1428 UErrorCode lengthStatus = U_ZERO_ERROR;
1429 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1430 }
1431 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1432 if (inputChars == NULL) {
1433 return dest;
1434 }
57a6839d 1435
729e4ab9
A
1436 status = U_ZERO_ERROR;
1437 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1438 status = U_ZERO_ERROR;
1439 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
57a6839d 1440
729e4ab9
A
1441 uprv_free(inputChars);
1442 }
1443 return dest;
1444 } else {
1445 return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1446 }
1447}
1448
1449
1450static UBool compat_SyncMutableUTextContents(UText *ut);
1451static UBool compat_SyncMutableUTextContents(UText *ut) {
1452 UBool retVal = FALSE;
57a6839d 1453
729e4ab9
A
1454 // In the following test, we're really only interested in whether the UText should switch
1455 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
1456 // will still point to the correct data.
1457 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1458 UnicodeString *us=(UnicodeString *)ut->context;
57a6839d 1459
729e4ab9
A
1460 // Update to the latest length.
1461 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1462 int32_t newLength = us->length();
57a6839d 1463
729e4ab9
A
1464 // Update the chunk description.
1465 // The buffer may have switched between stack- and heap-based.
1466 ut->chunkContents = us->getBuffer();
1467 ut->chunkLength = newLength;
1468 ut->chunkNativeLimit = newLength;
1469 ut->nativeIndexingLimit = newLength;
1470 retVal = TRUE;
1471 }
1472
1473 return retVal;
1474}
1475
1476//--------------------------------------------------------------------------------
1477//
1478// lookingAt()
1479//
1480//--------------------------------------------------------------------------------
1481UBool RegexMatcher::lookingAt(UErrorCode &status) {
1482 if (U_FAILURE(status)) {
1483 return FALSE;
1484 }
1485 if (U_FAILURE(fDeferredStatus)) {
1486 status = fDeferredStatus;
1487 return FALSE;
1488 }
57a6839d 1489
729e4ab9
A
1490 if (fInputUniStrMaybeMutable) {
1491 if (compat_SyncMutableUTextContents(fInputText)) {
1492 fInputLength = utext_nativeLength(fInputText);
1493 reset();
1494 }
1495 }
1496 else {
1497 resetPreserveRegion();
1498 }
1499 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1500 MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1501 } else {
1502 MatchAt(fActiveStart, FALSE, status);
1503 }
1504 return fMatch;
1505}
1506
1507
1508UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1509 if (U_FAILURE(status)) {
1510 return FALSE;
1511 }
1512 if (U_FAILURE(fDeferredStatus)) {
1513 status = fDeferredStatus;
1514 return FALSE;
1515 }
1516 reset();
57a6839d 1517
729e4ab9
A
1518 if (start < 0) {
1519 status = U_INDEX_OUTOFBOUNDS_ERROR;
1520 return FALSE;
1521 }
57a6839d 1522
729e4ab9
A
1523 if (fInputUniStrMaybeMutable) {
1524 if (compat_SyncMutableUTextContents(fInputText)) {
1525 fInputLength = utext_nativeLength(fInputText);
1526 reset();
1527 }
1528 }
1529
1530 int64_t nativeStart;
1531 nativeStart = start;
1532 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1533 status = U_INDEX_OUTOFBOUNDS_ERROR;
1534 return FALSE;
1535 }
57a6839d 1536
729e4ab9
A
1537 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1538 MatchChunkAt((int32_t)nativeStart, FALSE, status);
1539 } else {
1540 MatchAt(nativeStart, FALSE, status);
1541 }
1542 return fMatch;
1543}
1544
1545
1546
1547//--------------------------------------------------------------------------------
1548//
1549// matches()
1550//
1551//--------------------------------------------------------------------------------
1552UBool RegexMatcher::matches(UErrorCode &status) {
1553 if (U_FAILURE(status)) {
1554 return FALSE;
1555 }
1556 if (U_FAILURE(fDeferredStatus)) {
1557 status = fDeferredStatus;
1558 return FALSE;
1559 }
1560
1561 if (fInputUniStrMaybeMutable) {
1562 if (compat_SyncMutableUTextContents(fInputText)) {
1563 fInputLength = utext_nativeLength(fInputText);
1564 reset();
1565 }
1566 }
1567 else {
1568 resetPreserveRegion();
1569 }
1570
1571 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1572 MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1573 } else {
1574 MatchAt(fActiveStart, TRUE, status);
1575 }
1576 return fMatch;
1577}
1578
1579
1580UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1581 if (U_FAILURE(status)) {
1582 return FALSE;
1583 }
1584 if (U_FAILURE(fDeferredStatus)) {
1585 status = fDeferredStatus;
1586 return FALSE;
1587 }
1588 reset();
57a6839d 1589
729e4ab9
A
1590 if (start < 0) {
1591 status = U_INDEX_OUTOFBOUNDS_ERROR;
1592 return FALSE;
1593 }
1594
1595 if (fInputUniStrMaybeMutable) {
1596 if (compat_SyncMutableUTextContents(fInputText)) {
1597 fInputLength = utext_nativeLength(fInputText);
1598 reset();
1599 }
1600 }
1601
1602 int64_t nativeStart;
1603 nativeStart = start;
1604 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1605 status = U_INDEX_OUTOFBOUNDS_ERROR;
1606 return FALSE;
1607 }
1608
1609 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1610 MatchChunkAt((int32_t)nativeStart, TRUE, status);
1611 } else {
1612 MatchAt(nativeStart, TRUE, status);
1613 }
1614 return fMatch;
1615}
1616
1617
1618
1619//--------------------------------------------------------------------------------
1620//
1621// pattern
1622//
1623//--------------------------------------------------------------------------------
1624const RegexPattern &RegexMatcher::pattern() const {
1625 return *fPattern;
1626}
1627
1628
1629
1630//--------------------------------------------------------------------------------
1631//
1632// region
46f4442e
A
1633//
1634//--------------------------------------------------------------------------------
729e4ab9 1635RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
46f4442e
A
1636 if (U_FAILURE(status)) {
1637 return *this;
1638 }
57a6839d 1639
729e4ab9 1640 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
46f4442e
A
1641 status = U_ILLEGAL_ARGUMENT_ERROR;
1642 }
57a6839d 1643
729e4ab9
A
1644 int64_t nativeStart = regionStart;
1645 int64_t nativeLimit = regionLimit;
1646 if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1647 status = U_ILLEGAL_ARGUMENT_ERROR;
1648 }
1649
1650 if (startIndex == -1)
1651 this->reset();
1652 else
57a6839d
A
1653 resetPreserveRegion();
1654
729e4ab9
A
1655 fRegionStart = nativeStart;
1656 fRegionLimit = nativeLimit;
1657 fActiveStart = nativeStart;
1658 fActiveLimit = nativeLimit;
1659
1660 if (startIndex != -1) {
1661 if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1662 status = U_INDEX_OUTOFBOUNDS_ERROR;
1663 }
57a6839d 1664 fMatchEnd = startIndex;
729e4ab9
A
1665 }
1666
46f4442e 1667 if (!fTransparentBounds) {
729e4ab9
A
1668 fLookStart = nativeStart;
1669 fLookLimit = nativeLimit;
46f4442e
A
1670 }
1671 if (fAnchoringBounds) {
729e4ab9
A
1672 fAnchorStart = nativeStart;
1673 fAnchorLimit = nativeLimit;
46f4442e
A
1674 }
1675 return *this;
1676}
1677
729e4ab9
A
1678RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1679 return region(start, limit, -1, status);
1680}
46f4442e
A
1681
1682//--------------------------------------------------------------------------------
1683//
1684// regionEnd
1685//
1686//--------------------------------------------------------------------------------
1687int32_t RegexMatcher::regionEnd() const {
729e4ab9 1688 return (int32_t)fRegionLimit;
46f4442e
A
1689}
1690
729e4ab9
A
1691int64_t RegexMatcher::regionEnd64() const {
1692 return fRegionLimit;
1693}
46f4442e
A
1694
1695//--------------------------------------------------------------------------------
1696//
1697// regionStart
1698//
1699//--------------------------------------------------------------------------------
1700int32_t RegexMatcher::regionStart() const {
729e4ab9
A
1701 return (int32_t)fRegionStart;
1702}
1703
1704int64_t RegexMatcher::regionStart64() const {
46f4442e
A
1705 return fRegionStart;
1706}
1707
1708
b75a7d8f
A
1709//--------------------------------------------------------------------------------
1710//
1711// replaceAll
1712//
1713//--------------------------------------------------------------------------------
1714UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
729e4ab9
A
1715 UText replacementText = UTEXT_INITIALIZER;
1716 UText resultText = UTEXT_INITIALIZER;
1717 UnicodeString resultString;
1718 if (U_FAILURE(status)) {
1719 return resultString;
1720 }
57a6839d 1721
729e4ab9
A
1722 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1723 utext_openUnicodeString(&resultText, &resultString, &status);
57a6839d 1724
729e4ab9
A
1725 replaceAll(&replacementText, &resultText, status);
1726
1727 utext_close(&resultText);
1728 utext_close(&replacementText);
57a6839d 1729
729e4ab9
A
1730 return resultString;
1731}
1732
1733
1734//
1735// replaceAll, UText mode
1736//
1737UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
b75a7d8f 1738 if (U_FAILURE(status)) {
729e4ab9 1739 return dest;
b75a7d8f
A
1740 }
1741 if (U_FAILURE(fDeferredStatus)) {
1742 status = fDeferredStatus;
729e4ab9 1743 return dest;
b75a7d8f 1744 }
57a6839d 1745
729e4ab9
A
1746 if (dest == NULL) {
1747 UnicodeString emptyString;
1748 UText empty = UTEXT_INITIALIZER;
57a6839d 1749
729e4ab9
A
1750 utext_openUnicodeString(&empty, &emptyString, &status);
1751 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1752 utext_close(&empty);
1753 }
1754
1755 if (U_SUCCESS(status)) {
1756 reset();
1757 while (find()) {
1758 appendReplacement(dest, replacement, status);
1759 if (U_FAILURE(status)) {
1760 break;
1761 }
b75a7d8f 1762 }
729e4ab9 1763 appendTail(dest, status);
b75a7d8f 1764 }
57a6839d 1765
729e4ab9 1766 return dest;
b75a7d8f
A
1767}
1768
1769
b75a7d8f
A
1770//--------------------------------------------------------------------------------
1771//
1772// replaceFirst
1773//
1774//--------------------------------------------------------------------------------
1775UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
729e4ab9
A
1776 UText replacementText = UTEXT_INITIALIZER;
1777 UText resultText = UTEXT_INITIALIZER;
1778 UnicodeString resultString;
57a6839d 1779
729e4ab9
A
1780 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1781 utext_openUnicodeString(&resultText, &resultString, &status);
57a6839d 1782
729e4ab9 1783 replaceFirst(&replacementText, &resultText, status);
57a6839d 1784
729e4ab9
A
1785 utext_close(&resultText);
1786 utext_close(&replacementText);
57a6839d 1787
729e4ab9
A
1788 return resultString;
1789}
1790
1791//
1792// replaceFirst, UText mode
1793//
1794UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
b75a7d8f 1795 if (U_FAILURE(status)) {
729e4ab9 1796 return dest;
b75a7d8f
A
1797 }
1798 if (U_FAILURE(fDeferredStatus)) {
1799 status = fDeferredStatus;
729e4ab9 1800 return dest;
b75a7d8f
A
1801 }
1802
1803 reset();
1804 if (!find()) {
729e4ab9 1805 return getInput(dest, status);
b75a7d8f 1806 }
57a6839d 1807
729e4ab9
A
1808 if (dest == NULL) {
1809 UnicodeString emptyString;
1810 UText empty = UTEXT_INITIALIZER;
57a6839d 1811
729e4ab9
A
1812 utext_openUnicodeString(&empty, &emptyString, &status);
1813 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1814 utext_close(&empty);
1815 }
57a6839d 1816
729e4ab9
A
1817 appendReplacement(dest, replacement, status);
1818 appendTail(dest, status);
57a6839d 1819
729e4ab9 1820 return dest;
b75a7d8f
A
1821}
1822
1823
46f4442e
A
1824//--------------------------------------------------------------------------------
1825//
1826// requireEnd
1827//
1828//--------------------------------------------------------------------------------
1829UBool RegexMatcher::requireEnd() const {
1830 return fRequireEnd;
1831}
1832
b75a7d8f
A
1833
1834//--------------------------------------------------------------------------------
1835//
1836// reset
1837//
1838//--------------------------------------------------------------------------------
1839RegexMatcher &RegexMatcher::reset() {
46f4442e 1840 fRegionStart = 0;
729e4ab9 1841 fRegionLimit = fInputLength;
46f4442e 1842 fActiveStart = 0;
729e4ab9 1843 fActiveLimit = fInputLength;
46f4442e 1844 fAnchorStart = 0;
729e4ab9 1845 fAnchorLimit = fInputLength;
46f4442e 1846 fLookStart = 0;
729e4ab9 1847 fLookLimit = fInputLength;
46f4442e
A
1848 resetPreserveRegion();
1849 return *this;
1850}
1851
1852
1853
1854void RegexMatcher::resetPreserveRegion() {
374ca955
A
1855 fMatchStart = 0;
1856 fMatchEnd = 0;
1857 fLastMatchEnd = -1;
46f4442e 1858 fAppendPosition = 0;
374ca955 1859 fMatch = FALSE;
46f4442e
A
1860 fHitEnd = FALSE;
1861 fRequireEnd = FALSE;
1862 fTime = 0;
1863 fTickCounter = TIMER_INITIAL_VALUE;
729e4ab9 1864 //resetStack(); // more expensive than it looks...
b75a7d8f
A
1865}
1866
1867
b75a7d8f 1868RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
729e4ab9
A
1869 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1870 if (fPattern->fNeedsAltInput) {
1871 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1872 }
1873 fInputLength = utext_nativeLength(fInputText);
57a6839d 1874
b75a7d8f 1875 reset();
729e4ab9
A
1876 delete fInput;
1877 fInput = NULL;
1878
1879 // Do the following for any UnicodeString.
1880 // This is for compatibility for those clients who modify the input string "live" during regex operations.
57a6839d
A
1881 fInputUniStrMaybeMutable = TRUE;
1882
374ca955 1883 if (fWordBreakItr != NULL) {
729e4ab9
A
1884#if UCONFIG_NO_BREAK_ITERATION==0
1885 UErrorCode status = U_ZERO_ERROR;
1886 fWordBreakItr->setText(fInputText, status);
1887#endif
374ca955 1888 }
b75a7d8f
A
1889 return *this;
1890}
1891
b75a7d8f 1892
729e4ab9
A
1893RegexMatcher &RegexMatcher::reset(UText *input) {
1894 if (fInputText != input) {
1895 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1896 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1897 fInputLength = utext_nativeLength(fInputText);
57a6839d 1898
729e4ab9
A
1899 delete fInput;
1900 fInput = NULL;
57a6839d 1901
729e4ab9
A
1902 if (fWordBreakItr != NULL) {
1903#if UCONFIG_NO_BREAK_ITERATION==0
1904 UErrorCode status = U_ZERO_ERROR;
1905 fWordBreakItr->setText(input, status);
1906#endif
1907 }
1908 }
1909 reset();
1910 fInputUniStrMaybeMutable = FALSE;
1911
1912 return *this;
1913}
1914
1915/*RegexMatcher &RegexMatcher::reset(const UChar *) {
1916 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1917 return *this;
1918}*/
1919
1920RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1921 if (U_FAILURE(status)) {
374ca955 1922 return *this;
b75a7d8f 1923 }
46f4442e 1924 reset(); // Reset also resets the region to be the entire string.
57a6839d 1925
729e4ab9 1926 if (position < 0 || position > fActiveLimit) {
374ca955
A
1927 status = U_INDEX_OUTOFBOUNDS_ERROR;
1928 return *this;
1929 }
1930 fMatchEnd = position;
1931 return *this;
b75a7d8f
A
1932}
1933
1934
4388f060
A
1935//--------------------------------------------------------------------------------
1936//
1937// refresh
1938//
1939//--------------------------------------------------------------------------------
1940RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1941 if (U_FAILURE(status)) {
1942 return *this;
1943 }
1944 if (input == NULL) {
1945 status = U_ILLEGAL_ARGUMENT_ERROR;
1946 return *this;
1947 }
1948 if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1949 status = U_ILLEGAL_ARGUMENT_ERROR;
1950 return *this;
1951 }
1952 int64_t pos = utext_getNativeIndex(fInputText);
1953 // Shallow read-only clone of the new UText into the existing input UText
1954 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1955 if (U_FAILURE(status)) {
1956 return *this;
1957 }
1958 utext_setNativeIndex(fInputText, pos);
1959
1960 if (fAltInputText != NULL) {
1961 pos = utext_getNativeIndex(fAltInputText);
1962 fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
1963 if (U_FAILURE(status)) {
1964 return *this;
1965 }
1966 utext_setNativeIndex(fAltInputText, pos);
1967 }
1968 return *this;
1969}
b75a7d8f 1970
374ca955
A
1971
1972
b75a7d8f
A
1973//--------------------------------------------------------------------------------
1974//
1975// setTrace
1976//
1977//--------------------------------------------------------------------------------
1978void RegexMatcher::setTrace(UBool state) {
1979 fTraceDebug = state;
1980}
1981
1982
1983
1984//---------------------------------------------------------------------
1985//
1986// split
1987//
1988//---------------------------------------------------------------------
1989int32_t RegexMatcher::split(const UnicodeString &input,
1990 UnicodeString dest[],
1991 int32_t destCapacity,
729e4ab9
A
1992 UErrorCode &status)
1993{
1994 UText inputText = UTEXT_INITIALIZER;
1995 utext_openConstUnicodeString(&inputText, &input, &status);
1996 if (U_FAILURE(status)) {
1997 return 0;
1998 }
1999
2000 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2001 if (destText == NULL) {
2002 status = U_MEMORY_ALLOCATION_ERROR;
2003 return 0;
2004 }
2005 int32_t i;
2006 for (i = 0; i < destCapacity; i++) {
2007 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2008 }
57a6839d 2009
729e4ab9 2010 int32_t fieldCount = split(&inputText, destText, destCapacity, status);
57a6839d 2011
729e4ab9
A
2012 for (i = 0; i < destCapacity; i++) {
2013 utext_close(destText[i]);
2014 }
2015
2016 uprv_free(destText);
2017 utext_close(&inputText);
2018 return fieldCount;
2019}
2020
2021//
2022// split, UText mode
2023//
2024int32_t RegexMatcher::split(UText *input,
2025 UText *dest[],
2026 int32_t destCapacity,
2027 UErrorCode &status)
b75a7d8f
A
2028{
2029 //
2030 // Check arguements for validity
2031 //
2032 if (U_FAILURE(status)) {
2033 return 0;
2034 };
2035
2036 if (destCapacity < 1) {
2037 status = U_ILLEGAL_ARGUMENT_ERROR;
2038 return 0;
2039 }
2040
b75a7d8f
A
2041 //
2042 // Reset for the input text
2043 //
2044 reset(input);
729e4ab9 2045 int64_t nextOutputStringStart = 0;
46f4442e 2046 if (fActiveLimit == 0) {
b75a7d8f
A
2047 return 0;
2048 }
2049
b75a7d8f
A
2050 //
2051 // Loop through the input text, searching for the delimiter pattern
2052 //
73c04bcf 2053 int32_t i;
b75a7d8f
A
2054 int32_t numCaptureGroups = fPattern->fGroupMap->size();
2055 for (i=0; ; i++) {
2056 if (i>=destCapacity-1) {
2057 // There is one or zero output string left.
2058 // Fill the last output string with whatever is left from the input, then exit the loop.
729e4ab9 2059 // ( i will be == destCapacity if we filled the output array while processing
b75a7d8f
A
2060 // capture groups of the delimiter expression, in which case we will discard the
2061 // last capture group saved in favor of the unprocessed remainder of the
2062 // input string.)
2063 i = destCapacity-1;
729e4ab9
A
2064 if (fActiveLimit > nextOutputStringStart) {
2065 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2066 if (dest[i]) {
57a6839d
A
2067 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2068 input->chunkContents+nextOutputStringStart,
729e4ab9
A
2069 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2070 } else {
2071 UText remainingText = UTEXT_INITIALIZER;
57a6839d 2072 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
729e4ab9
A
2073 fActiveLimit-nextOutputStringStart, &status);
2074 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2075 utext_close(&remainingText);
2076 }
2077 } else {
2078 UErrorCode lengthStatus = U_ZERO_ERROR;
57a6839d 2079 int32_t remaining16Length =
729e4ab9
A
2080 utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2081 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2082 if (remainingChars == NULL) {
2083 status = U_MEMORY_ALLOCATION_ERROR;
2084 break;
2085 }
2086
2087 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2088 if (dest[i]) {
2089 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2090 } else {
2091 UText remainingText = UTEXT_INITIALIZER;
2092 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2093 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2094 utext_close(&remainingText);
2095 }
57a6839d 2096
729e4ab9
A
2097 uprv_free(remainingChars);
2098 }
b75a7d8f
A
2099 }
2100 break;
2101 }
2102 if (find()) {
2103 // We found another delimiter. Move everything from where we started looking
2104 // up until the start of the delimiter into the next output string.
729e4ab9
A
2105 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2106 if (dest[i]) {
57a6839d
A
2107 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2108 input->chunkContents+nextOutputStringStart,
729e4ab9
A
2109 (int32_t)(fMatchStart-nextOutputStringStart), &status);
2110 } else {
2111 UText remainingText = UTEXT_INITIALIZER;
57a6839d 2112 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
729e4ab9
A
2113 fMatchStart-nextOutputStringStart, &status);
2114 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2115 utext_close(&remainingText);
2116 }
2117 } else {
2118 UErrorCode lengthStatus = U_ZERO_ERROR;
2119 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2120 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2121 if (remainingChars == NULL) {
2122 status = U_MEMORY_ALLOCATION_ERROR;
2123 break;
2124 }
2125 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2126 if (dest[i]) {
2127 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2128 } else {
2129 UText remainingText = UTEXT_INITIALIZER;
2130 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2131 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2132 utext_close(&remainingText);
2133 }
57a6839d 2134
729e4ab9
A
2135 uprv_free(remainingChars);
2136 }
b75a7d8f
A
2137 nextOutputStringStart = fMatchEnd;
2138
2139 // If the delimiter pattern has capturing parentheses, the captured
2140 // text goes out into the next n destination strings.
2141 int32_t groupNum;
2142 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
4388f060
A
2143 if (i >= destCapacity-2) {
2144 // Never fill the last available output string with capture group text.
2145 // It will filled with the last field, the remainder of the
2146 // unsplit input text.
b75a7d8f
A
2147 break;
2148 }
2149 i++;
729e4ab9 2150 dest[i] = group(groupNum, dest[i], status);
b75a7d8f
A
2151 }
2152
46f4442e 2153 if (nextOutputStringStart == fActiveLimit) {
4388f060
A
2154 // The delimiter was at the end of the string. We're done, but first
2155 // we output one last empty string, for the empty field following
2156 // the delimiter at the end of input.
2157 if (i+1 < destCapacity) {
2158 ++i;
2159 if (dest[i] == NULL) {
2160 dest[i] = utext_openUChars(NULL, NULL, 0, &status);
2161 } else {
2162 static UChar emptyString[] = {(UChar)0};
2163 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2164 }
729e4ab9 2165 }
4388f060 2166 break;
57a6839d
A
2167
2168 }
b75a7d8f
A
2169 }
2170 else
2171 {
2172 // We ran off the end of the input while looking for the next delimiter.
2173 // All the remaining text goes into the current output string.
729e4ab9
A
2174 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2175 if (dest[i]) {
57a6839d
A
2176 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2177 input->chunkContents+nextOutputStringStart,
729e4ab9
A
2178 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2179 } else {
2180 UText remainingText = UTEXT_INITIALIZER;
57a6839d 2181 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
729e4ab9
A
2182 fActiveLimit-nextOutputStringStart, &status);
2183 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2184 utext_close(&remainingText);
2185 }
2186 } else {
2187 UErrorCode lengthStatus = U_ZERO_ERROR;
2188 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2189 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2190 if (remainingChars == NULL) {
2191 status = U_MEMORY_ALLOCATION_ERROR;
2192 break;
2193 }
57a6839d 2194
729e4ab9
A
2195 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2196 if (dest[i]) {
2197 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2198 } else {
2199 UText remainingText = UTEXT_INITIALIZER;
2200 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2201 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2202 utext_close(&remainingText);
2203 }
57a6839d 2204
729e4ab9
A
2205 uprv_free(remainingChars);
2206 }
b75a7d8f
A
2207 break;
2208 }
729e4ab9
A
2209 if (U_FAILURE(status)) {
2210 break;
2211 }
2212 } // end of for loop
b75a7d8f
A
2213 return i+1;
2214}
2215
2216
b75a7d8f
A
2217//--------------------------------------------------------------------------------
2218//
2219// start
2220//
2221//--------------------------------------------------------------------------------
2222int32_t RegexMatcher::start(UErrorCode &status) const {
2223 return start(0, status);
2224}
2225
729e4ab9
A
2226int64_t RegexMatcher::start64(UErrorCode &status) const {
2227 return start64(0, status);
2228}
b75a7d8f 2229
46f4442e
A
2230//--------------------------------------------------------------------------------
2231//
2232// start(int32_t group, UErrorCode &status)
2233//
2234//--------------------------------------------------------------------------------
729e4ab9
A
2235
2236int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
b75a7d8f
A
2237 if (U_FAILURE(status)) {
2238 return -1;
2239 }
2240 if (U_FAILURE(fDeferredStatus)) {
2241 status = fDeferredStatus;
2242 return -1;
2243 }
2244 if (fMatch == FALSE) {
2245 status = U_REGEX_INVALID_STATE;
2246 return -1;
2247 }
2248 if (group < 0 || group > fPattern->fGroupMap->size()) {
2249 status = U_INDEX_OUTOFBOUNDS_ERROR;
2250 return -1;
2251 }
729e4ab9 2252 int64_t s;
b75a7d8f 2253 if (group == 0) {
57a6839d 2254 s = fMatchStart;
b75a7d8f
A
2255 } else {
2256 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2257 U_ASSERT(groupOffset < fPattern->fFrameSize);
2258 U_ASSERT(groupOffset >= 0);
2259 s = fFrame->fExtra[groupOffset];
2260 }
57a6839d 2261
b75a7d8f
A
2262 return s;
2263}
2264
2265
729e4ab9
A
2266int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2267 return (int32_t)start64(group, status);
2268}
b75a7d8f 2269
46f4442e
A
2270//--------------------------------------------------------------------------------
2271//
2272// useAnchoringBounds
2273//
2274//--------------------------------------------------------------------------------
2275RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2276 fAnchoringBounds = b;
729e4ab9
A
2277 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2278 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
46f4442e
A
2279 return *this;
2280}
2281
2282
2283//--------------------------------------------------------------------------------
2284//
2285// useTransparentBounds
2286//
2287//--------------------------------------------------------------------------------
2288RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2289 fTransparentBounds = b;
729e4ab9
A
2290 fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2291 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
46f4442e
A
2292 return *this;
2293}
2294
2295//--------------------------------------------------------------------------------
2296//
2297// setTimeLimit
2298//
2299//--------------------------------------------------------------------------------
2300void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2301 if (U_FAILURE(status)) {
2302 return;
2303 }
2304 if (U_FAILURE(fDeferredStatus)) {
2305 status = fDeferredStatus;
2306 return;
2307 }
2308 if (limit < 0) {
2309 status = U_ILLEGAL_ARGUMENT_ERROR;
2310 return;
2311 }
2312 fTimeLimit = limit;
2313}
2314
2315
2316//--------------------------------------------------------------------------------
2317//
2318// getTimeLimit
2319//
2320//--------------------------------------------------------------------------------
2321int32_t RegexMatcher::getTimeLimit() const {
2322 return fTimeLimit;
2323}
2324
2325
2326//--------------------------------------------------------------------------------
2327//
2328// setStackLimit
2329//
2330//--------------------------------------------------------------------------------
2331void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2332 if (U_FAILURE(status)) {
2333 return;
2334 }
2335 if (U_FAILURE(fDeferredStatus)) {
2336 status = fDeferredStatus;
2337 return;
2338 }
2339 if (limit < 0) {
2340 status = U_ILLEGAL_ARGUMENT_ERROR;
2341 return;
2342 }
57a6839d 2343
46f4442e 2344 // Reset the matcher. This is needed here in case there is a current match
57a6839d 2345 // whose final stack frame (containing the match results, pointed to by fFrame)
46f4442e
A
2346 // would be lost by resizing to a smaller stack size.
2347 reset();
57a6839d 2348
46f4442e
A
2349 if (limit == 0) {
2350 // Unlimited stack expansion
2351 fStack->setMaxCapacity(0);
2352 } else {
2353 // Change the units of the limit from bytes to ints, and bump the size up
57a6839d 2354 // to be big enough to hold at least one stack frame for the pattern,
46f4442e
A
2355 // if it isn't there already.
2356 int32_t adjustedLimit = limit / sizeof(int32_t);
2357 if (adjustedLimit < fPattern->fFrameSize) {
2358 adjustedLimit = fPattern->fFrameSize;
2359 }
2360 fStack->setMaxCapacity(adjustedLimit);
2361 }
2362 fStackLimit = limit;
2363}
2364
2365
2366//--------------------------------------------------------------------------------
2367//
2368// getStackLimit
2369//
2370//--------------------------------------------------------------------------------
2371int32_t RegexMatcher::getStackLimit() const {
2372 return fStackLimit;
2373}
2374
2375
2376//--------------------------------------------------------------------------------
2377//
2378// setMatchCallback
2379//
2380//--------------------------------------------------------------------------------
2381void RegexMatcher::setMatchCallback(URegexMatchCallback *callback,
2382 const void *context,
2383 UErrorCode &status) {
729e4ab9
A
2384 if (U_FAILURE(status)) {
2385 return;
2386 }
2387 fCallbackFn = callback;
2388 fCallbackContext = context;
46f4442e
A
2389}
2390
2391
2392//--------------------------------------------------------------------------------
2393//
2394// getMatchCallback
2395//
2396//--------------------------------------------------------------------------------
2397void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback,
2398 const void *&context,
2399 UErrorCode &status) {
2400 if (U_FAILURE(status)) {
2401 return;
2402 }
2403 callback = fCallbackFn;
2404 context = fCallbackContext;
2405}
2406
2407
729e4ab9
A
2408//--------------------------------------------------------------------------------
2409//
2410// setMatchCallback
2411//
2412//--------------------------------------------------------------------------------
2413void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback,
2414 const void *context,
2415 UErrorCode &status) {
2416 if (U_FAILURE(status)) {
2417 return;
2418 }
2419 fFindProgressCallbackFn = callback;
2420 fFindProgressCallbackContext = context;
2421}
2422
2423
2424//--------------------------------------------------------------------------------
2425//
2426// getMatchCallback
2427//
2428//--------------------------------------------------------------------------------
2429void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback,
2430 const void *&context,
2431 UErrorCode &status) {
2432 if (U_FAILURE(status)) {
2433 return;
2434 }
2435 callback = fFindProgressCallbackFn;
2436 context = fFindProgressCallbackContext;
2437}
2438
2439
374ca955
A
2440//================================================================================
2441//
2442// Code following this point in this file is the internal
2443// Match Engine Implementation.
2444//
2445//================================================================================
2446
2447
2448//--------------------------------------------------------------------------------
2449//
2450// resetStack
2451// Discard any previous contents of the state save stack, and initialize a
57a6839d 2452// new stack frame to all -1. The -1s are needed for capture group limits,
374ca955
A
2453// where they indicate that a group has not yet matched anything.
2454//--------------------------------------------------------------------------------
2455REStackFrame *RegexMatcher::resetStack() {
2456 // Discard any previous contents of the state save stack, and initialize a
729e4ab9
A
2457 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2458 // where they indicate that a group has not yet matched anything.
374ca955
A
2459 fStack->removeAllElements();
2460
729e4ab9
A
2461 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2462 int32_t i;
2463 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2464 iFrame->fExtra[i] = -1;
2465 }
2466 return iFrame;
2467}
2468
2469
2470
2471//--------------------------------------------------------------------------------
2472//
57a6839d 2473// isWordBoundary
729e4ab9
A
2474// in perl, "xab..cd..", \b is true at positions 0,3,5,7
2475// For us,
2476// If the current char is a combining mark,
2477// \b is FALSE.
2478// Else Scan backwards to the first non-combining char.
2479// We are at a boundary if the this char and the original chars are
2480// opposite in membership in \w set
2481//
2482// parameters: pos - the current position in the input buffer
2483//
2484// TODO: double-check edge cases at region boundaries.
2485//
2486//--------------------------------------------------------------------------------
2487UBool RegexMatcher::isWordBoundary(int64_t pos) {
2488 UBool isBoundary = FALSE;
2489 UBool cIsWord = FALSE;
57a6839d 2490
729e4ab9
A
2491 if (pos >= fLookLimit) {
2492 fHitEnd = TRUE;
2493 } else {
2494 // Determine whether char c at current position is a member of the word set of chars.
2495 // If we're off the end of the string, behave as though we're not at a word char.
2496 UTEXT_SETNATIVEINDEX(fInputText, pos);
2497 UChar32 c = UTEXT_CURRENT32(fInputText);
2498 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2499 // Current char is a combining one. Not a boundary.
2500 return FALSE;
2501 }
2502 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2503 }
57a6839d 2504
729e4ab9
A
2505 // Back up until we come to a non-combining char, determine whether
2506 // that char is a word char.
2507 UBool prevCIsWord = FALSE;
2508 for (;;) {
2509 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2510 break;
2511 }
2512 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2513 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2514 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2515 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2516 break;
2517 }
2518 }
2519 isBoundary = cIsWord ^ prevCIsWord;
2520 return isBoundary;
2521}
2522
2523UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2524 UBool isBoundary = FALSE;
2525 UBool cIsWord = FALSE;
57a6839d 2526
729e4ab9 2527 const UChar *inputBuf = fInputText->chunkContents;
57a6839d 2528
729e4ab9
A
2529 if (pos >= fLookLimit) {
2530 fHitEnd = TRUE;
2531 } else {
2532 // Determine whether char c at current position is a member of the word set of chars.
2533 // If we're off the end of the string, behave as though we're not at a word char.
2534 UChar32 c;
2535 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2536 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2537 // Current char is a combining one. Not a boundary.
2538 return FALSE;
2539 }
2540 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2541 }
57a6839d 2542
729e4ab9
A
2543 // Back up until we come to a non-combining char, determine whether
2544 // that char is a word char.
2545 UBool prevCIsWord = FALSE;
2546 for (;;) {
2547 if (pos <= fLookStart) {
2548 break;
2549 }
2550 UChar32 prevChar;
2551 U16_PREV(inputBuf, fLookStart, pos, prevChar);
2552 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2553 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2554 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2555 break;
2556 }
2557 }
2558 isBoundary = cIsWord ^ prevCIsWord;
2559 return isBoundary;
2560}
2561
2562//--------------------------------------------------------------------------------
2563//
57a6839d 2564// isUWordBoundary
729e4ab9
A
2565//
2566// Test for a word boundary using RBBI word break.
2567//
2568// parameters: pos - the current position in the input buffer
2569//
2570//--------------------------------------------------------------------------------
2571UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2572 UBool returnVal = FALSE;
2573#if UCONFIG_NO_BREAK_ITERATION==0
57a6839d 2574
729e4ab9
A
2575 // If we haven't yet created a break iterator for this matcher, do it now.
2576 if (fWordBreakItr == NULL) {
57a6839d 2577 fWordBreakItr =
729e4ab9
A
2578 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
2579 if (U_FAILURE(fDeferredStatus)) {
2580 return FALSE;
2581 }
2582 fWordBreakItr->setText(fInputText, fDeferredStatus);
2583 }
2584
2585 if (pos >= fLookLimit) {
2586 fHitEnd = TRUE;
2587 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real"
2588 // words are not boundaries. All non-word chars stand by themselves,
2589 // with word boundaries on both sides.
2590 } else {
2591 if (!UTEXT_USES_U16(fInputText)) {
2592 // !!!: Would like a better way to do this!
2593 UErrorCode status = U_ZERO_ERROR;
2594 pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
2595 }
2596 returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2597 }
2598#endif
2599 return returnVal;
2600}
2601
2602//--------------------------------------------------------------------------------
2603//
2604// IncrementTime This function is called once each TIMER_INITIAL_VALUE state
2605// saves. Increment the "time" counter, and call the
2606// user callback function if there is one installed.
2607//
2608// If the match operation needs to be aborted, either for a time-out
2609// or because the user callback asked for it, just set an error status.
2610// The engine will pick that up and stop in its outer loop.
2611//
2612//--------------------------------------------------------------------------------
2613void RegexMatcher::IncrementTime(UErrorCode &status) {
2614 fTickCounter = TIMER_INITIAL_VALUE;
2615 fTime++;
2616 if (fCallbackFn != NULL) {
2617 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2618 status = U_REGEX_STOPPED_BY_CALLER;
2619 return;
2620 }
2621 }
2622 if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2623 status = U_REGEX_TIME_OUT;
2624 }
2625}
2626
2627//--------------------------------------------------------------------------------
2628//
2629// ReportFindProgress This function is called once for each advance in the target
2630// string from the find() function, and calls the user progress callback
2631// function if there is one installed.
57a6839d
A
2632//
2633// NOTE:
729e4ab9
A
2634//
2635// If the match operation needs to be aborted because the user
2636// callback asked for it, just set an error status.
2637// The engine will pick that up and stop in its outer loop.
2638//
2639//--------------------------------------------------------------------------------
2640UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) {
2641 if (fFindProgressCallbackFn != NULL) {
2642 if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) {
2643 status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/;
2644 return FALSE;
2645 }
2646 }
2647 return TRUE;
2648}
2649
2650//--------------------------------------------------------------------------------
2651//
2652// StateSave
2653// Make a new stack frame, initialized as a copy of the current stack frame.
2654// Set the pattern index in the original stack frame from the operand value
2655// in the opcode. Execution of the engine continues with the state in
2656// the newly created stack frame
2657//
2658// Note that reserveBlock() may grow the stack, resulting in the
2659// whole thing being relocated in memory.
2660//
2661// Parameters:
57a6839d 2662// fp The top frame pointer when called. At return, a new
729e4ab9
A
2663// fame will be present
2664// savePatIdx An index into the compiled pattern. Goes into the original
2665// (not new) frame. If execution ever back-tracks out of the
2666// new frame, this will be where we continue from in the pattern.
2667// Return
2668// The new frame pointer.
2669//
2670//--------------------------------------------------------------------------------
2671inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
57a6839d 2672 // push storage for a new frame.
729e4ab9
A
2673 int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2674 if (newFP == NULL) {
2675 // Failure on attempted stack expansion.
2676 // Stack function set some other error code, change it to a more
2677 // specific one for regular expressions.
2678 status = U_REGEX_STACK_OVERFLOW;
2679 // We need to return a writable stack frame, so just return the
2680 // previous frame. The match operation will stop quickly
2681 // because of the error status, after which the frame will never
2682 // be looked at again.
2683 return fp;
2684 }
2685 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack.
57a6839d 2686
729e4ab9
A
2687 // New stack frame = copy of old top frame.
2688 int64_t *source = (int64_t *)fp;
2689 int64_t *dest = newFP;
2690 for (;;) {
2691 *dest++ = *source++;
2692 if (source == newFP) {
2693 break;
2694 }
2695 }
57a6839d 2696
729e4ab9
A
2697 fTickCounter--;
2698 if (fTickCounter <= 0) {
2699 IncrementTime(status); // Re-initializes fTickCounter
2700 }
2701 fp->fPatIdx = savePatIdx;
2702 return (REStackFrame *)newFP;
2703}
2704
2705
2706//--------------------------------------------------------------------------------
2707//
2708// MatchAt This is the actual matching engine.
2709//
2710// startIdx: begin matching a this index.
2711// toEnd: if true, match must extend to end of the input region
2712//
2713//--------------------------------------------------------------------------------
2714void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2715 UBool isMatch = FALSE; // True if the we have a match.
57a6839d 2716
729e4ab9
A
2717 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2718
2719 int32_t op; // Operation from the compiled pattern, split into
2720 int32_t opType; // the opcode
2721 int32_t opValue; // and the operand value.
57a6839d
A
2722
2723#ifdef REGEX_RUN_DEBUG
729e4ab9
A
2724 if (fTraceDebug)
2725 {
2726 printf("MatchAt(startIdx=%ld)\n", startIdx);
2727 printf("Original Pattern: ");
2728 UChar32 c = utext_next32From(fPattern->fPattern, 0);
2729 while (c != U_SENTINEL) {
2730 if (c<32 || c>256) {
2731 c = '.';
2732 }
57a6839d
A
2733 printf("%c", c);
2734
729e4ab9
A
2735 c = UTEXT_NEXT32(fPattern->fPattern);
2736 }
2737 printf("\n");
2738 printf("Input String: ");
2739 c = utext_next32From(fInputText, 0);
2740 while (c != U_SENTINEL) {
2741 if (c<32 || c>256) {
2742 c = '.';
2743 }
2744 printf("%c", c);
57a6839d 2745
729e4ab9
A
2746 c = UTEXT_NEXT32(fInputText);
2747 }
2748 printf("\n");
2749 printf("\n");
2750 }
57a6839d 2751#endif
729e4ab9
A
2752
2753 if (U_FAILURE(status)) {
2754 return;
2755 }
2756
2757 // Cache frequently referenced items from the compiled pattern
2758 //
2759 int64_t *pat = fPattern->fCompiledPat->getBuffer();
2760
2761 const UChar *litText = fPattern->fLiteralText.getBuffer();
2762 UVector *sets = fPattern->fSets;
2763
2764 fFrameSize = fPattern->fFrameSize;
2765 REStackFrame *fp = resetStack();
2766
2767 fp->fPatIdx = 0;
2768 fp->fInputIdx = startIdx;
2769
2770 // Zero out the pattern's static data
2771 int32_t i;
2772 for (i = 0; i<fPattern->fDataSize; i++) {
2773 fData[i] = 0;
2774 }
2775
2776 //
2777 // Main loop for interpreting the compiled pattern.
2778 // One iteration of the loop per pattern operation performed.
2779 //
2780 for (;;) {
729e4ab9
A
2781 op = (int32_t)pat[fp->fPatIdx];
2782 opType = URX_TYPE(op);
2783 opValue = URX_VAL(op);
57a6839d 2784#ifdef REGEX_RUN_DEBUG
729e4ab9
A
2785 if (fTraceDebug) {
2786 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 2787 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
729e4ab9
A
2788 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2789 fPattern->dumpOp(fp->fPatIdx);
2790 }
57a6839d 2791#endif
729e4ab9 2792 fp->fPatIdx++;
57a6839d 2793
729e4ab9
A
2794 switch (opType) {
2795
2796
2797 case URX_NOP:
2798 break;
2799
2800
2801 case URX_BACKTRACK:
2802 // Force a backtrack. In some circumstances, the pattern compiler
2803 // will notice that the pattern can't possibly match anything, and will
2804 // emit one of these at that point.
2805 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2806 break;
2807
2808
2809 case URX_ONECHAR:
2810 if (fp->fInputIdx < fActiveLimit) {
2811 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2812 UChar32 c = UTEXT_NEXT32(fInputText);
2813 if (c == opValue) {
2814 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2815 break;
2816 }
2817 } else {
2818 fHitEnd = TRUE;
2819 }
729e4ab9
A
2820 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2821 break;
2822
2823
2824 case URX_STRING:
2825 {
2826 // Test input against a literal string.
2827 // Strings require two slots in the compiled pattern, one for the
2828 // offset to the string text, and one for the length.
729e4ab9 2829
4388f060 2830 int32_t stringStartIdx = opValue;
729e4ab9
A
2831 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
2832 fp->fPatIdx++;
2833 opType = URX_TYPE(op);
4388f060 2834 int32_t stringLen = URX_VAL(op);
729e4ab9
A
2835 U_ASSERT(opType == URX_STRING_LEN);
2836 U_ASSERT(stringLen >= 2);
57a6839d 2837
4388f060
A
2838 const UChar *patternString = litText+stringStartIdx;
2839 int32_t patternStringIndex = 0;
729e4ab9 2840 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4388f060
A
2841 UChar32 inputChar;
2842 UChar32 patternChar;
729e4ab9 2843 UBool success = TRUE;
4388f060
A
2844 while (patternStringIndex < stringLen) {
2845 if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
729e4ab9 2846 success = FALSE;
4388f060
A
2847 fHitEnd = TRUE;
2848 break;
2849 }
2850 inputChar = UTEXT_NEXT32(fInputText);
2851 U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
2852 if (patternChar != inputChar) {
2853 success = FALSE;
2854 break;
729e4ab9
A
2855 }
2856 }
57a6839d 2857
729e4ab9
A
2858 if (success) {
2859 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2860 } else {
729e4ab9
A
2861 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2862 }
2863 }
2864 break;
2865
2866
2867 case URX_STATE_SAVE:
2868 fp = StateSave(fp, opValue, status);
2869 break;
2870
2871
2872 case URX_END:
2873 // The match loop will exit via this path on a successful match,
2874 // when we reach the end of the pattern.
2875 if (toEnd && fp->fInputIdx != fActiveLimit) {
2876 // The pattern matched, but not to the end of input. Try some more.
2877 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2878 break;
2879 }
2880 isMatch = TRUE;
2881 goto breakFromLoop;
2882
2883 // Start and End Capture stack frame variables are laid out out like this:
2884 // fp->fExtra[opValue] - The start of a completed capture group
2885 // opValue+1 - The end of a completed capture group
2886 // opValue+2 - the start of a capture group whose end
2887 // has not yet been reached (and might not ever be).
2888 case URX_START_CAPTURE:
2889 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2890 fp->fExtra[opValue+2] = fp->fInputIdx;
2891 break;
2892
2893
2894 case URX_END_CAPTURE:
2895 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2896 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
2897 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
2898 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
2899 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2900 break;
2901
2902
2903 case URX_DOLLAR: // $, test for End of line
2904 // or for position before new line at end of input
2905 {
2906 if (fp->fInputIdx >= fAnchorLimit) {
2907 // We really are at the end of input. Success.
2908 fHitEnd = TRUE;
2909 fRequireEnd = TRUE;
2910 break;
2911 }
57a6839d 2912
729e4ab9 2913 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 2914
729e4ab9
A
2915 // If we are positioned just before a new-line that is located at the
2916 // end of input, succeed.
2917 UChar32 c = UTEXT_NEXT32(fInputText);
2918 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2919 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
2920 // If not in the middle of a CR/LF sequence
4388f060 2921 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
729e4ab9
A
2922 // At new-line at end of input. Success
2923 fHitEnd = TRUE;
2924 fRequireEnd = TRUE;
57a6839d 2925
729e4ab9
A
2926 break;
2927 }
2928 }
2929 } else {
2930 UChar32 nextC = UTEXT_NEXT32(fInputText);
2931 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2932 fHitEnd = TRUE;
2933 fRequireEnd = TRUE;
2934 break; // At CR/LF at end of input. Success
2935 }
2936 }
2937
2938 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2939 }
2940 break;
2941
2942
2943 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
2944 if (fp->fInputIdx >= fAnchorLimit) {
2945 // Off the end of input. Success.
2946 fHitEnd = TRUE;
2947 fRequireEnd = TRUE;
2948 break;
2949 } else {
2950 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2951 UChar32 c = UTEXT_NEXT32(fInputText);
2952 // Either at the last character of input, or off the end.
2953 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
2954 fHitEnd = TRUE;
2955 fRequireEnd = TRUE;
2956 break;
2957 }
2958 }
2959
2960 // Not at end of input. Back-track out.
2961 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2962 break;
2963
2964
2965 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
2966 {
2967 if (fp->fInputIdx >= fAnchorLimit) {
2968 // We really are at the end of input. Success.
2969 fHitEnd = TRUE;
2970 fRequireEnd = TRUE;
2971 break;
2972 }
2973 // If we are positioned just before a new-line, succeed.
2974 // It makes no difference where the new-line is within the input.
2975 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2976 UChar32 c = UTEXT_CURRENT32(fInputText);
2977 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
2978 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
2979 // In multi-line mode, hitting a new-line just before the end of input does not
2980 // set the hitEnd or requireEnd flags
2981 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
2982 break;
2983 }
2984 }
2985 // not at a new line. Fail.
2986 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2987 }
2988 break;
2989
2990
2991 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
2992 {
2993 if (fp->fInputIdx >= fAnchorLimit) {
2994 // We really are at the end of input. Success.
2995 fHitEnd = TRUE;
2996 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
2997 break; // adding a new-line would not lose the match.
2998 }
2999 // If we are not positioned just before a new-line, the test fails; backtrack out.
3000 // It makes no difference where the new-line is within the input.
3001 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3002 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3003 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3004 }
3005 }
3006 break;
3007
3008
3009 case URX_CARET: // ^, test for start of line
3010 if (fp->fInputIdx != fAnchorStart) {
3011 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3012 }
3013 break;
3014
3015
3016 case URX_CARET_M: // ^, test for start of line in mulit-line mode
3017 {
3018 if (fp->fInputIdx == fAnchorStart) {
3019 // We are at the start input. Success.
3020 break;
3021 }
3022 // Check whether character just before the current pos is a new-line
3023 // unless we are at the end of input
3024 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d
A
3025 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3026 if ((fp->fInputIdx < fAnchorLimit) &&
729e4ab9
A
3027 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3028 // It's a new-line. ^ is true. Success.
3029 // TODO: what should be done with positions between a CR and LF?
3030 break;
3031 }
3032 // Not at the start of a line. Fail.
3033 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3034 }
3035 break;
3036
3037
3038 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
3039 {
3040 U_ASSERT(fp->fInputIdx >= fAnchorStart);
3041 if (fp->fInputIdx <= fAnchorStart) {
3042 // We are at the start input. Success.
3043 break;
3044 }
3045 // Check whether character just before the current pos is a new-line
3046 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3047 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3048 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3049 if (c != 0x0a) {
3050 // Not at the start of a line. Back-track out.
3051 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3052 }
3053 }
3054 break;
3055
3056 case URX_BACKSLASH_B: // Test for word boundaries
3057 {
3058 UBool success = isWordBoundary(fp->fInputIdx);
51004dcb 3059 success ^= (UBool)(opValue != 0); // flip sense for \B
729e4ab9
A
3060 if (!success) {
3061 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3062 }
3063 }
3064 break;
3065
3066
3067 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
3068 {
3069 UBool success = isUWordBoundary(fp->fInputIdx);
51004dcb 3070 success ^= (UBool)(opValue != 0); // flip sense for \B
729e4ab9
A
3071 if (!success) {
3072 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3073 }
3074 }
3075 break;
3076
3077
3078 case URX_BACKSLASH_D: // Test for decimal digit
3079 {
3080 if (fp->fInputIdx >= fActiveLimit) {
3081 fHitEnd = TRUE;
3082 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3083 break;
3084 }
3085
3086 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3087
3088 UChar32 c = UTEXT_NEXT32(fInputText);
3089 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
3090 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
51004dcb 3091 success ^= (UBool)(opValue != 0); // flip sense for \D
729e4ab9
A
3092 if (success) {
3093 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3094 } else {
3095 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3096 }
3097 }
3098 break;
3099
3100
3101 case URX_BACKSLASH_G: // Test for position at end of previous match
3102 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3103 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3104 }
3105 break;
3106
3107
57a6839d 3108 case URX_BACKSLASH_X:
729e4ab9
A
3109 // Match a Grapheme, as defined by Unicode TR 29.
3110 // Differs slightly from Perl, which consumes combining marks independently
3111 // of context.
3112 {
3113
3114 // Fail if at end of input
3115 if (fp->fInputIdx >= fActiveLimit) {
3116 fHitEnd = TRUE;
3117 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3118 break;
3119 }
57a6839d 3120
729e4ab9
A
3121 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3122
3123 // Examine (and consume) the current char.
3124 // Dispatch into a little state machine, based on the char.
3125 UChar32 c;
3126 c = UTEXT_NEXT32(fInputText);
3127 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3128 UnicodeSet **sets = fPattern->fStaticSets;
3129 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
3130 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
3131 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3132 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3133 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3134 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3135 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3136 goto GC_Extend;
3137
3138
3139
3140GC_L:
3141 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3142 c = UTEXT_NEXT32(fInputText);
3143 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3144 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3145 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3146 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3147 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4388f060 3148 (void)UTEXT_PREVIOUS32(fInputText);
729e4ab9
A
3149 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3150 goto GC_Extend;
3151
3152GC_V:
3153 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3154 c = UTEXT_NEXT32(fInputText);
3155 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3156 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3157 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4388f060 3158 (void)UTEXT_PREVIOUS32(fInputText);
729e4ab9
A
3159 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3160 goto GC_Extend;
3161
3162GC_T:
3163 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3164 c = UTEXT_NEXT32(fInputText);
3165 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3166 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4388f060 3167 (void)UTEXT_PREVIOUS32(fInputText);
729e4ab9
A
3168 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3169 goto GC_Extend;
3170
3171GC_Extend:
3172 // Combining characters are consumed here
3173 for (;;) {
3174 if (fp->fInputIdx >= fActiveLimit) {
3175 break;
3176 }
3177 c = UTEXT_CURRENT32(fInputText);
3178 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3179 break;
3180 }
4388f060 3181 (void)UTEXT_NEXT32(fInputText);
729e4ab9
A
3182 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3183 }
3184 goto GC_Done;
3185
3186GC_Control:
57a6839d 3187 // Most control chars stand alone (don't combine with combining chars),
729e4ab9
A
3188 // except for that CR/LF sequence is a single grapheme cluster.
3189 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
3190 c = UTEXT_NEXT32(fInputText);
3191 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3192 }
3193
3194GC_Done:
3195 if (fp->fInputIdx >= fActiveLimit) {
3196 fHitEnd = TRUE;
3197 }
3198 break;
3199 }
57a6839d 3200
729e4ab9
A
3201
3202
3203
3204 case URX_BACKSLASH_Z: // Test for end of Input
3205 if (fp->fInputIdx < fAnchorLimit) {
3206 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3207 } else {
3208 fHitEnd = TRUE;
3209 fRequireEnd = TRUE;
3210 }
3211 break;
3212
3213
3214
3215 case URX_STATIC_SETREF:
3216 {
3217 // Test input character against one of the predefined sets
3218 // (Word Characters, for example)
3219 // The high bit of the op value is a flag for the match polarity.
3220 // 0: success if input char is in set.
3221 // 1: success if input char is not in set.
3222 if (fp->fInputIdx >= fActiveLimit) {
3223 fHitEnd = TRUE;
3224 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3225 break;
3226 }
3227
57a6839d 3228 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
729e4ab9
A
3229 opValue &= ~URX_NEG_SET;
3230 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3231
3232 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3233 UChar32 c = UTEXT_NEXT32(fInputText);
3234 if (c < 256) {
3235 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3236 if (s8->contains(c)) {
3237 success = !success;
3238 }
3239 } else {
3240 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3241 if (s->contains(c)) {
3242 success = !success;
3243 }
3244 }
3245 if (success) {
3246 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3247 } else {
3248 // the character wasn't in the set.
729e4ab9
A
3249 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3250 }
3251 }
3252 break;
57a6839d 3253
729e4ab9
A
3254
3255 case URX_STAT_SETREF_N:
3256 {
57a6839d 3257 // Test input character for NOT being a member of one of
729e4ab9
A
3258 // the predefined sets (Word Characters, for example)
3259 if (fp->fInputIdx >= fActiveLimit) {
3260 fHitEnd = TRUE;
3261 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3262 break;
3263 }
3264
3265 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3266
3267 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 3268
729e4ab9
A
3269 UChar32 c = UTEXT_NEXT32(fInputText);
3270 if (c < 256) {
3271 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3272 if (s8->contains(c) == FALSE) {
3273 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3274 break;
3275 }
3276 } else {
3277 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3278 if (s->contains(c) == FALSE) {
3279 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3280 break;
3281 }
3282 }
3283 // the character wasn't in the set.
729e4ab9
A
3284 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3285 }
3286 break;
57a6839d 3287
729e4ab9
A
3288
3289 case URX_SETREF:
3290 if (fp->fInputIdx >= fActiveLimit) {
3291 fHitEnd = TRUE;
3292 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3293 break;
3294 } else {
3295 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 3296
729e4ab9
A
3297 // There is input left. Pick up one char and test it for set membership.
3298 UChar32 c = UTEXT_NEXT32(fInputText);
3299 U_ASSERT(opValue > 0 && opValue < sets->size());
3300 if (c<256) {
3301 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3302 if (s8->contains(c)) {
3303 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3304 break;
3305 }
3306 } else {
3307 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3308 if (s->contains(c)) {
3309 // The character is in the set. A Match.
3310 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3311 break;
3312 }
3313 }
57a6839d 3314
729e4ab9 3315 // the character wasn't in the set.
729e4ab9
A
3316 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3317 }
3318 break;
3319
3320
3321 case URX_DOTANY:
3322 {
3323 // . matches anything, but stops at end-of-line.
3324 if (fp->fInputIdx >= fActiveLimit) {
3325 // At end of input. Match failed. Backtrack out.
3326 fHitEnd = TRUE;
3327 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3328 break;
3329 }
57a6839d 3330
729e4ab9 3331 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 3332
729e4ab9
A
3333 // There is input left. Advance over one char, unless we've hit end-of-line
3334 UChar32 c = UTEXT_NEXT32(fInputText);
3335 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
3336 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3337 // End of line in normal mode. . does not match.
3338 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3339 break;
3340 }
3341 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3342 }
3343 break;
3344
3345
3346 case URX_DOTANY_ALL:
3347 {
3348 // ., in dot-matches-all (including new lines) mode
3349 if (fp->fInputIdx >= fActiveLimit) {
3350 // At end of input. Match failed. Backtrack out.
3351 fHitEnd = TRUE;
3352 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3353 break;
3354 }
57a6839d 3355
729e4ab9 3356 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 3357
729e4ab9
A
3358 // There is input left. Advance over one char, except if we are
3359 // at a cr/lf, advance over both of them.
57a6839d 3360 UChar32 c;
729e4ab9
A
3361 c = UTEXT_NEXT32(fInputText);
3362 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3363 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3364 // In the case of a CR/LF, we need to advance over both.
3365 UChar32 nextc = UTEXT_CURRENT32(fInputText);
3366 if (nextc == 0x0a) {
4388f060 3367 (void)UTEXT_NEXT32(fInputText);
729e4ab9
A
3368 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3369 }
3370 }
3371 }
3372 break;
3373
3374
3375 case URX_DOTANY_UNIX:
3376 {
3377 // '.' operator, matches all, but stops at end-of-line.
3378 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
3379 if (fp->fInputIdx >= fActiveLimit) {
3380 // At end of input. Match failed. Backtrack out.
3381 fHitEnd = TRUE;
3382 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3383 break;
3384 }
3385
3386 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 3387
729e4ab9
A
3388 // There is input left. Advance over one char, unless we've hit end-of-line
3389 UChar32 c = UTEXT_NEXT32(fInputText);
3390 if (c == 0x0a) {
3391 // End of line in normal mode. '.' does not match the \n
3392 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3393 } else {
3394 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3395 }
3396 }
3397 break;
3398
3399
3400 case URX_JMP:
3401 fp->fPatIdx = opValue;
3402 break;
3403
3404 case URX_FAIL:
3405 isMatch = FALSE;
3406 goto breakFromLoop;
3407
3408 case URX_JMP_SAV:
3409 U_ASSERT(opValue < fPattern->fCompiledPat->size());
3410 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3411 fp->fPatIdx = opValue; // Then JMP.
3412 break;
3413
3414 case URX_JMP_SAV_X:
3415 // This opcode is used with (x)+, when x can match a zero length string.
3416 // Same as JMP_SAV, except conditional on the match having made forward progress.
3417 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3418 // data address of the input position at the start of the loop.
3419 {
3420 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3421 int32_t stoOp = (int32_t)pat[opValue-1];
3422 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3423 int32_t frameLoc = URX_VAL(stoOp);
3424 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3425 int64_t prevInputIdx = fp->fExtra[frameLoc];
3426 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3427 if (prevInputIdx < fp->fInputIdx) {
3428 // The match did make progress. Repeat the loop.
3429 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3430 fp->fPatIdx = opValue;
3431 fp->fExtra[frameLoc] = fp->fInputIdx;
57a6839d 3432 }
729e4ab9
A
3433 // If the input position did not advance, we do nothing here,
3434 // execution will fall out of the loop.
3435 }
3436 break;
3437
3438 case URX_CTR_INIT:
3439 {
3440 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
57a6839d 3441 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
729e4ab9
A
3442
3443 // Pick up the three extra operands that CTR_INIT has, and
57a6839d 3444 // skip the pattern location counter past
729e4ab9
A
3445 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3446 fp->fPatIdx += 3;
3447 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3448 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3449 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3450 U_ASSERT(minCount>=0);
3451 U_ASSERT(maxCount>=minCount || maxCount==-1);
57a6839d 3452 U_ASSERT(loopLoc>=fp->fPatIdx);
729e4ab9
A
3453
3454 if (minCount == 0) {
3455 fp = StateSave(fp, loopLoc+1, status);
3456 }
57a6839d
A
3457 if (maxCount == -1) {
3458 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
3459 } else if (maxCount == 0) {
729e4ab9
A
3460 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3461 }
3462 }
3463 break;
3464
3465 case URX_CTR_LOOP:
3466 {
3467 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3468 int32_t initOp = (int32_t)pat[opValue];
3469 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3470 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3471 int32_t minCount = (int32_t)pat[opValue+2];
3472 int32_t maxCount = (int32_t)pat[opValue+3];
729e4ab9 3473 (*pCounter)++;
57a6839d
A
3474 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3475 U_ASSERT(*pCounter == maxCount);
729e4ab9
A
3476 break;
3477 }
3478 if (*pCounter >= minCount) {
57a6839d
A
3479 if (maxCount == -1) {
3480 // Loop has no hard upper bound.
3481 // Check that it is progressing through the input, break if it is not.
3482 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3483 if (fp->fInputIdx == *pLastInputIdx) {
3484 break;
3485 } else {
3486 *pLastInputIdx = fp->fInputIdx;
3487 }
3488 }
729e4ab9
A
3489 fp = StateSave(fp, fp->fPatIdx, status);
3490 }
3491 fp->fPatIdx = opValue + 4; // Loop back.
3492 }
3493 break;
3494
3495 case URX_CTR_INIT_NG:
3496 {
3497 // Initialize a non-greedy loop
3498 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
57a6839d 3499 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
729e4ab9 3500
57a6839d
A
3501 // Pick up the three extra operands that CTR_INIT_NG has, and
3502 // skip the pattern location counter past
729e4ab9
A
3503 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3504 fp->fPatIdx += 3;
3505 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3506 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3507 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3508 U_ASSERT(minCount>=0);
3509 U_ASSERT(maxCount>=minCount || maxCount==-1);
3510 U_ASSERT(loopLoc>fp->fPatIdx);
57a6839d
A
3511 if (maxCount == -1) {
3512 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
3513 }
729e4ab9
A
3514
3515 if (minCount == 0) {
3516 if (maxCount != 0) {
3517 fp = StateSave(fp, fp->fPatIdx, status);
3518 }
3519 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
57a6839d 3520 }
729e4ab9
A
3521 }
3522 break;
3523
3524 case URX_CTR_LOOP_NG:
3525 {
3526 // Non-greedy {min, max} loops
3527 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3528 int32_t initOp = (int32_t)pat[opValue];
3529 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3530 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3531 int32_t minCount = (int32_t)pat[opValue+2];
3532 int32_t maxCount = (int32_t)pat[opValue+3];
729e4ab9 3533
57a6839d
A
3534 (*pCounter)++;
3535 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
729e4ab9
A
3536 // The loop has matched the maximum permitted number of times.
3537 // Break out of here with no action. Matching will
3538 // continue with the following pattern.
57a6839d 3539 U_ASSERT(*pCounter == maxCount);
729e4ab9
A
3540 break;
3541 }
3542
3543 if (*pCounter < minCount) {
3544 // We haven't met the minimum number of matches yet.
3545 // Loop back for another one.
3546 fp->fPatIdx = opValue + 4; // Loop back.
3547 } else {
3548 // We do have the minimum number of matches.
57a6839d
A
3549
3550 // If there is no upper bound on the loop iterations, check that the input index
3551 // is progressing, and stop the loop if it is not.
3552 if (maxCount == -1) {
3553 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3554 if (fp->fInputIdx == *pLastInputIdx) {
3555 break;
3556 }
3557 *pLastInputIdx = fp->fInputIdx;
3558 }
3559
3560 // Loop Continuation: we will fall into the pattern following the loop
3561 // (non-greedy, don't execute loop body first), but first do
3562 // a state save to the top of the loop, so that a match failure
729e4ab9
A
3563 // in the following pattern will try another iteration of the loop.
3564 fp = StateSave(fp, opValue + 4, status);
3565 }
3566 }
3567 break;
3568
3569 case URX_STO_SP:
3570 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3571 fData[opValue] = fStack->size();
3572 break;
3573
3574 case URX_LD_SP:
3575 {
3576 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3577 int32_t newStackSize = (int32_t)fData[opValue];
3578 U_ASSERT(newStackSize <= fStack->size());
3579 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3580 if (newFP == (int64_t *)fp) {
3581 break;
3582 }
3583 int32_t i;
3584 for (i=0; i<fFrameSize; i++) {
3585 newFP[i] = ((int64_t *)fp)[i];
3586 }
3587 fp = (REStackFrame *)newFP;
3588 fStack->setSize(newStackSize);
3589 }
3590 break;
3591
3592 case URX_BACKREF:
729e4ab9
A
3593 {
3594 U_ASSERT(opValue < fFrameSize);
3595 int64_t groupStartIdx = fp->fExtra[opValue];
3596 int64_t groupEndIdx = fp->fExtra[opValue+1];
3597 U_ASSERT(groupStartIdx <= groupEndIdx);
3598 if (groupStartIdx < 0) {
3599 // This capture group has not participated in the match thus far,
3600 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
729e4ab9
A
3601 break;
3602 }
729e4ab9
A
3603 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3604 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4388f060
A
3605
3606 // Note: if the capture group match was of an empty string the backref
57a6839d 3607 // match succeeds. Verified by testing: Perl matches succeed
4388f060 3608 // in this case, so we do too.
57a6839d 3609
4388f060
A
3610 UBool success = TRUE;
3611 for (;;) {
3612 if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3613 success = TRUE;
3614 break;
3615 }
3616 if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
3617 success = FALSE;
729e4ab9 3618 fHitEnd = TRUE;
4388f060
A
3619 break;
3620 }
3621 UChar32 captureGroupChar = utext_next32(fAltInputText);
3622 UChar32 inputChar = utext_next32(fInputText);
3623 if (inputChar != captureGroupChar) {
3624 success = FALSE;
3625 break;
729e4ab9 3626 }
4388f060
A
3627 }
3628
3629 if (success) {
3630 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3631 } else {
3632 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3633 }
3634 }
3635 break;
3636
3637
3638
3639 case URX_BACKREF_I:
3640 {
3641 U_ASSERT(opValue < fFrameSize);
3642 int64_t groupStartIdx = fp->fExtra[opValue];
3643 int64_t groupEndIdx = fp->fExtra[opValue+1];
3644 U_ASSERT(groupStartIdx <= groupEndIdx);
3645 if (groupStartIdx < 0) {
3646 // This capture group has not participated in the match thus far,
729e4ab9 3647 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
4388f060
A
3648 break;
3649 }
3650 utext_setNativeIndex(fAltInputText, groupStartIdx);
3651 utext_setNativeIndex(fInputText, fp->fInputIdx);
3652 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3653 CaseFoldingUTextIterator inputItr(*fInputText);
3654
3655 // Note: if the capture group match was of an empty string the backref
57a6839d 3656 // match succeeds. Verified by testing: Perl matches succeed
4388f060 3657 // in this case, so we do too.
57a6839d 3658
4388f060
A
3659 UBool success = TRUE;
3660 for (;;) {
3661 if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3662 success = TRUE;
3663 break;
3664 }
3665 if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
3666 success = FALSE;
3667 fHitEnd = TRUE;
3668 break;
3669 }
3670 UChar32 captureGroupChar = captureGroupItr.next();
3671 UChar32 inputChar = inputItr.next();
3672 if (inputChar != captureGroupChar) {
3673 success = FALSE;
3674 break;
3675 }
3676 }
3677
3678 if (success && inputItr.inExpansion()) {
57a6839d
A
3679 // We otained a match by consuming part of a string obtained from
3680 // case-folding a single code point of the input text.
4388f060
A
3681 // This does not count as an overall match.
3682 success = FALSE;
3683 }
3684
3685 if (success) {
3686 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3687 } else {
3688 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
729e4ab9 3689 }
57a6839d 3690
729e4ab9
A
3691 }
3692 break;
57a6839d 3693
729e4ab9
A
3694 case URX_STO_INP_LOC:
3695 {
3696 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3697 fp->fExtra[opValue] = fp->fInputIdx;
3698 }
3699 break;
3700
3701 case URX_JMPX:
3702 {
3703 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3704 fp->fPatIdx += 1;
3705 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
3706 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3707 int64_t savedInputIdx = fp->fExtra[dataLoc];
3708 U_ASSERT(savedInputIdx <= fp->fInputIdx);
3709 if (savedInputIdx < fp->fInputIdx) {
3710 fp->fPatIdx = opValue; // JMP
3711 } else {
3712 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
3713 }
3714 }
3715 break;
3716
3717 case URX_LA_START:
3718 {
3719 // Entering a lookahead block.
3720 // Save Stack Ptr, Input Pos.
3721 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3722 fData[opValue] = fStack->size();
3723 fData[opValue+1] = fp->fInputIdx;
3724 fActiveStart = fLookStart; // Set the match region change for
3725 fActiveLimit = fLookLimit; // transparent bounds.
3726 }
3727 break;
3728
3729 case URX_LA_END:
3730 {
3731 // Leaving a look-ahead block.
3732 // restore Stack Ptr, Input Pos to positions they had on entry to block.
3733 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3734 int32_t stackSize = fStack->size();
3735 int32_t newStackSize =(int32_t)fData[opValue];
3736 U_ASSERT(stackSize >= newStackSize);
3737 if (stackSize > newStackSize) {
3738 // Copy the current top frame back to the new (cut back) top frame.
3739 // This makes the capture groups from within the look-ahead
3740 // expression available.
3741 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3742 int32_t i;
3743 for (i=0; i<fFrameSize; i++) {
3744 newFP[i] = ((int64_t *)fp)[i];
3745 }
3746 fp = (REStackFrame *)newFP;
3747 fStack->setSize(newStackSize);
3748 }
3749 fp->fInputIdx = fData[opValue+1];
3750
3751 // Restore the active region bounds in the input string; they may have
3752 // been changed because of transparent bounds on a Region.
3753 fActiveStart = fRegionStart;
3754 fActiveLimit = fRegionLimit;
3755 }
3756 break;
3757
3758 case URX_ONECHAR_I:
4388f060
A
3759 // Case insensitive one char. The char from the pattern is already case folded.
3760 // Input text is not, but case folding the input can not reduce two or more code
3761 // points to one.
729e4ab9
A
3762 if (fp->fInputIdx < fActiveLimit) {
3763 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3764
3765 UChar32 c = UTEXT_NEXT32(fInputText);
3766 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3767 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3768 break;
3769 }
3770 } else {
3771 fHitEnd = TRUE;
3772 }
57a6839d 3773
729e4ab9
A
3774 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3775 break;
3776
3777 case URX_STRING_I:
3778 {
4388f060 3779 // Case-insensitive test input against a literal string.
729e4ab9
A
3780 // Strings require two slots in the compiled pattern, one for the
3781 // offset to the string text, and one for the length.
4388f060 3782 // The compiled string has already been case folded.
729e4ab9 3783 {
4388f060
A
3784 const UChar *patternString = litText + opValue;
3785 int32_t patternStringIdx = 0;
729e4ab9
A
3786
3787 op = (int32_t)pat[fp->fPatIdx];
3788 fp->fPatIdx++;
3789 opType = URX_TYPE(op);
3790 opValue = URX_VAL(op);
3791 U_ASSERT(opType == URX_STRING_LEN);
4388f060 3792 int32_t patternStringLen = opValue; // Length of the string from the pattern.
57a6839d
A
3793
3794
4388f060
A
3795 UChar32 cPattern;
3796 UChar32 cText;
3797 UBool success = TRUE;
3798
729e4ab9 3799 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4388f060
A
3800 CaseFoldingUTextIterator inputIterator(*fInputText);
3801 while (patternStringIdx < patternStringLen) {
3802 if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
3803 success = FALSE;
3804 fHitEnd = TRUE;
3805 break;
729e4ab9 3806 }
4388f060
A
3807 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
3808 cText = inputIterator.next();
3809 if (cText != cPattern) {
3810 success = FALSE;
3811 break;
729e4ab9
A
3812 }
3813 }
4388f060
A
3814 if (inputIterator.inExpansion()) {
3815 success = FALSE;
3816 }
3817
3818 if (success) {
3819 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3820 } else {
729e4ab9
A
3821 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3822 }
3823 }
3824 }
3825 break;
3826
3827 case URX_LB_START:
3828 {
3829 // Entering a look-behind block.
3830 // Save Stack Ptr, Input Pos.
3831 // TODO: implement transparent bounds. Ticket #6067
3832 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3833 fData[opValue] = fStack->size();
3834 fData[opValue+1] = fp->fInputIdx;
3835 // Init the variable containing the start index for attempted matches.
3836 fData[opValue+2] = -1;
3837 // Save input string length, then reset to pin any matches to end at
3838 // the current position.
3839 fData[opValue+3] = fActiveLimit;
3840 fActiveLimit = fp->fInputIdx;
3841 }
3842 break;
3843
3844
3845 case URX_LB_CONT:
3846 {
3847 // Positive Look-Behind, at top of loop checking for matches of LB expression
3848 // at all possible input starting positions.
3849
3850 // Fetch the min and max possible match lengths. They are the operands
3851 // of this op in the pattern.
3852 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3853 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3854 U_ASSERT(minML <= maxML);
3855 U_ASSERT(minML >= 0);
3856
3857 // Fetch (from data) the last input index where a match was attempted.
3858 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3859 int64_t *lbStartIdx = &fData[opValue+2];
3860 if (*lbStartIdx < 0) {
3861 // First time through loop.
3862 *lbStartIdx = fp->fInputIdx - minML;
3863 } else {
3864 // 2nd through nth time through the loop.
3865 // Back up start position for match by one.
3866 if (*lbStartIdx == 0) {
3867 (*lbStartIdx)--;
3868 } else {
3869 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
4388f060 3870 (void)UTEXT_PREVIOUS32(fInputText);
729e4ab9
A
3871 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3872 }
3873 }
3874
3875 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
3876 // We have tried all potential match starting points without
3877 // getting a match. Backtrack out, and out of the
3878 // Look Behind altogether.
3879 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3880 int64_t restoreInputLen = fData[opValue+3];
3881 U_ASSERT(restoreInputLen >= fActiveLimit);
3882 U_ASSERT(restoreInputLen <= fInputLength);
3883 fActiveLimit = restoreInputLen;
3884 break;
3885 }
3886
3887 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3888 // (successful match will fall off the end of the loop.)
3889 fp = StateSave(fp, fp->fPatIdx-3, status);
3890 fp->fInputIdx = *lbStartIdx;
3891 }
3892 break;
3893
3894 case URX_LB_END:
3895 // End of a look-behind block, after a successful match.
3896 {
3897 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3898 if (fp->fInputIdx != fActiveLimit) {
3899 // The look-behind expression matched, but the match did not
3900 // extend all the way to the point that we are looking behind from.
3901 // FAIL out of here, which will take us back to the LB_CONT, which
3902 // will retry the match starting at another position or fail
3903 // the look-behind altogether, whichever is appropriate.
3904 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3905 break;
3906 }
3907
3908 // Look-behind match is good. Restore the orignal input string length,
57a6839d 3909 // which had been truncated to pin the end of the lookbehind match to the
729e4ab9
A
3910 // position being looked-behind.
3911 int64_t originalInputLen = fData[opValue+3];
3912 U_ASSERT(originalInputLen >= fActiveLimit);
3913 U_ASSERT(originalInputLen <= fInputLength);
3914 fActiveLimit = originalInputLen;
3915 }
3916 break;
3917
3918
3919 case URX_LBN_CONT:
3920 {
3921 // Negative Look-Behind, at top of loop checking for matches of LB expression
3922 // at all possible input starting positions.
3923
3924 // Fetch the extra parameters of this op.
3925 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3926 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3927 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
3928 continueLoc = URX_VAL(continueLoc);
3929 U_ASSERT(minML <= maxML);
3930 U_ASSERT(minML >= 0);
3931 U_ASSERT(continueLoc > fp->fPatIdx);
3932
3933 // Fetch (from data) the last input index where a match was attempted.
3934 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3935 int64_t *lbStartIdx = &fData[opValue+2];
3936 if (*lbStartIdx < 0) {
3937 // First time through loop.
3938 *lbStartIdx = fp->fInputIdx - minML;
3939 } else {
3940 // 2nd through nth time through the loop.
3941 // Back up start position for match by one.
3942 if (*lbStartIdx == 0) {
3943 (*lbStartIdx)--;
3944 } else {
3945 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
4388f060 3946 (void)UTEXT_PREVIOUS32(fInputText);
729e4ab9
A
3947 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3948 }
3949 }
3950
3951 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
3952 // We have tried all potential match starting points without
3953 // getting a match, which means that the negative lookbehind as
3954 // a whole has succeeded. Jump forward to the continue location
3955 int64_t restoreInputLen = fData[opValue+3];
3956 U_ASSERT(restoreInputLen >= fActiveLimit);
3957 U_ASSERT(restoreInputLen <= fInputLength);
3958 fActiveLimit = restoreInputLen;
3959 fp->fPatIdx = continueLoc;
3960 break;
3961 }
3962
3963 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3964 // (successful match will cause a FAIL out of the loop altogether.)
3965 fp = StateSave(fp, fp->fPatIdx-4, status);
3966 fp->fInputIdx = *lbStartIdx;
3967 }
3968 break;
3969
3970 case URX_LBN_END:
3971 // End of a negative look-behind block, after a successful match.
3972 {
3973 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3974 if (fp->fInputIdx != fActiveLimit) {
3975 // The look-behind expression matched, but the match did not
3976 // extend all the way to the point that we are looking behind from.
3977 // FAIL out of here, which will take us back to the LB_CONT, which
3978 // will retry the match starting at another position or succeed
3979 // the look-behind altogether, whichever is appropriate.
3980 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3981 break;
3982 }
3983
3984 // Look-behind expression matched, which means look-behind test as
3985 // a whole Fails
57a6839d
A
3986
3987 // Restore the orignal input string length, which had been truncated
3988 // inorder to pin the end of the lookbehind match
729e4ab9
A
3989 // to the position being looked-behind.
3990 int64_t originalInputLen = fData[opValue+3];
3991 U_ASSERT(originalInputLen >= fActiveLimit);
3992 U_ASSERT(originalInputLen <= fInputLength);
3993 fActiveLimit = originalInputLen;
3994
3995 // Restore original stack position, discarding any state saved
3996 // by the successful pattern match.
3997 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3998 int32_t newStackSize = (int32_t)fData[opValue];
3999 U_ASSERT(fStack->size() > newStackSize);
4000 fStack->setSize(newStackSize);
57a6839d
A
4001
4002 // FAIL, which will take control back to someplace
729e4ab9
A
4003 // prior to entering the look-behind test.
4004 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4005 }
4006 break;
4007
4008
4009 case URX_LOOP_SR_I:
4010 // Loop Initialization for the optimized implementation of
4011 // [some character set]*
4012 // This op scans through all matching input.
4013 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4014 {
4015 U_ASSERT(opValue > 0 && opValue < sets->size());
4016 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4017 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4018
4019 // Loop through input, until either the input is exhausted or
4020 // we reach a character that is not a member of the set.
4021 int64_t ix = fp->fInputIdx;
4022 UTEXT_SETNATIVEINDEX(fInputText, ix);
4023 for (;;) {
4024 if (ix >= fActiveLimit) {
4025 fHitEnd = TRUE;
4026 break;
4027 }
4028 UChar32 c = UTEXT_NEXT32(fInputText);
4029 if (c<256) {
4030 if (s8->contains(c) == FALSE) {
4031 break;
4032 }
4033 } else {
4034 if (s->contains(c) == FALSE) {
4035 break;
4036 }
4037 }
4038 ix = UTEXT_GETNATIVEINDEX(fInputText);
4039 }
4040
4041 // If there were no matching characters, skip over the loop altogether.
4042 // The loop doesn't run at all, a * op always succeeds.
4043 if (ix == fp->fInputIdx) {
4044 fp->fPatIdx++; // skip the URX_LOOP_C op.
4045 break;
4046 }
4047
4048 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4049 // must follow. It's operand is the stack location
4050 // that holds the starting input index for the match of this [set]*
4051 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4052 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4053 int32_t stackLoc = URX_VAL(loopcOp);
4054 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4055 fp->fExtra[stackLoc] = fp->fInputIdx;
729e4ab9
A
4056 fp->fInputIdx = ix;
4057
4058 // Save State to the URX_LOOP_C op that follows this one,
4059 // so that match failures in the following code will return to there.
4060 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4061 fp = StateSave(fp, fp->fPatIdx, status);
4062 fp->fPatIdx++;
4063 }
4064 break;
4065
4066
4067 case URX_LOOP_DOT_I:
4068 // Loop Initialization for the optimized implementation of .*
4069 // This op scans through all remaining input.
4070 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4071 {
4072 // Loop through input until the input is exhausted (we reach an end-of-line)
4073 // In DOTALL mode, we can just go straight to the end of the input.
4074 int64_t ix;
4075 if ((opValue & 1) == 1) {
4076 // Dot-matches-All mode. Jump straight to the end of the string.
4077 ix = fActiveLimit;
4078 fHitEnd = TRUE;
4079 } else {
4080 // NOT DOT ALL mode. Line endings do not match '.'
4081 // Scan forward until a line ending or end of input.
4082 ix = fp->fInputIdx;
4083 UTEXT_SETNATIVEINDEX(fInputText, ix);
4084 for (;;) {
4085 if (ix >= fActiveLimit) {
4086 fHitEnd = TRUE;
4087 break;
4088 }
4089 UChar32 c = UTEXT_NEXT32(fInputText);
4090 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
4091 if ((c == 0x0a) || // 0x0a is newline in both modes.
4092 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
4093 (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) {
4094 // char is a line ending. Exit the scanning loop.
4095 break;
4096 }
4097 }
4098 ix = UTEXT_GETNATIVEINDEX(fInputText);
4099 }
4100 }
4101
4102 // If there were no matching characters, skip over the loop altogether.
4103 // The loop doesn't run at all, a * op always succeeds.
4104 if (ix == fp->fInputIdx) {
4105 fp->fPatIdx++; // skip the URX_LOOP_C op.
4106 break;
4107 }
4108
4109 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4110 // must follow. It's operand is the stack location
4111 // that holds the starting input index for the match of this .*
4112 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4113 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4114 int32_t stackLoc = URX_VAL(loopcOp);
4115 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4116 fp->fExtra[stackLoc] = fp->fInputIdx;
729e4ab9
A
4117 fp->fInputIdx = ix;
4118
4119 // Save State to the URX_LOOP_C op that follows this one,
4120 // so that match failures in the following code will return to there.
4121 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4122 fp = StateSave(fp, fp->fPatIdx, status);
4123 fp->fPatIdx++;
4124 }
4125 break;
4126
4127
4128 case URX_LOOP_C:
4129 {
4130 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4131 backSearchIndex = fp->fExtra[opValue];
4132 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4133 if (backSearchIndex == fp->fInputIdx) {
4134 // We've backed up the input idx to the point that the loop started.
57a6839d 4135 // The loop is done. Leave here without saving state.
729e4ab9
A
4136 // Subsequent failures won't come back here.
4137 break;
4138 }
4139 // Set up for the next iteration of the loop, with input index
4140 // backed up by one from the last time through,
4141 // and a state save to this instruction in case the following code fails again.
4142 // (We're going backwards because this loop emulates stack unwinding, not
4143 // the initial scan forward.)
4144 U_ASSERT(fp->fInputIdx > 0);
4145 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4146 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4147 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
57a6839d 4148
729e4ab9 4149 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
57a6839d 4150 if (prevC == 0x0a &&
729e4ab9
A
4151 fp->fInputIdx > backSearchIndex &&
4152 twoPrevC == 0x0d) {
4153 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4154 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4155 // .*, stepping back over CRLF pair.
4156 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4157 }
4158 }
4159
374ca955 4160
729e4ab9
A
4161 fp = StateSave(fp, fp->fPatIdx-1, status);
4162 }
4163 break;
374ca955
A
4164
4165
729e4ab9
A
4166
4167 default:
4168 // Trouble. The compiled pattern contains an entry with an
4169 // unrecognized type tag.
4170 U_ASSERT(FALSE);
b75a7d8f 4171 }
729e4ab9
A
4172
4173 if (U_FAILURE(status)) {
4174 isMatch = FALSE;
b75a7d8f
A
4175 break;
4176 }
4177 }
57a6839d 4178
729e4ab9
A
4179breakFromLoop:
4180 fMatch = isMatch;
4181 if (isMatch) {
4182 fLastMatchEnd = fMatchEnd;
4183 fMatchStart = startIdx;
4184 fMatchEnd = fp->fInputIdx;
46f4442e 4185 }
57a6839d
A
4186
4187#ifdef REGEX_RUN_DEBUG
4188 if (fTraceDebug) {
4189 if (isMatch) {
4190 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
4191 } else {
4192 printf("No match\n\n");
46f4442e
A
4193 }
4194 }
57a6839d 4195#endif
46f4442e 4196
729e4ab9
A
4197 fFrame = fp; // The active stack frame when the engine stopped.
4198 // Contains the capture group results that we need to
4199 // access later.
4200 return;
b75a7d8f 4201}
46f4442e
A
4202
4203
b75a7d8f
A
4204//--------------------------------------------------------------------------------
4205//
729e4ab9
A
4206// MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
4207// assumption that the entire string is available in the UText's
4208// chunk buffer. For now, that means we can use int32_t indexes,
4209// except for anything that needs to be saved (like group starts
4210// and ends).
b75a7d8f 4211//
46f4442e
A
4212// startIdx: begin matching a this index.
4213// toEnd: if true, match must extend to end of the input region
4214//
b75a7d8f 4215//--------------------------------------------------------------------------------
729e4ab9 4216void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
b75a7d8f 4217 UBool isMatch = FALSE; // True if the we have a match.
57a6839d 4218
729e4ab9 4219 int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
b75a7d8f
A
4220
4221 int32_t op; // Operation from the compiled pattern, split into
4222 int32_t opType; // the opcode
4223 int32_t opValue; // and the operand value.
57a6839d 4224
729e4ab9 4225#ifdef REGEX_RUN_DEBUG
57a6839d
A
4226 if (fTraceDebug) {
4227 printf("MatchAt(startIdx=%d)\n", startIdx);
b75a7d8f 4228 printf("Original Pattern: ");
729e4ab9
A
4229 UChar32 c = utext_next32From(fPattern->fPattern, 0);
4230 while (c != U_SENTINEL) {
4231 if (c<32 || c>256) {
4232 c = '.';
4233 }
57a6839d
A
4234 printf("%c", c);
4235
729e4ab9 4236 c = UTEXT_NEXT32(fPattern->fPattern);
b75a7d8f
A
4237 }
4238 printf("\n");
4239 printf("Input String: ");
729e4ab9
A
4240 c = utext_next32From(fInputText, 0);
4241 while (c != U_SENTINEL) {
b75a7d8f
A
4242 if (c<32 || c>256) {
4243 c = '.';
4244 }
4245 printf("%c", c);
57a6839d 4246
729e4ab9 4247 c = UTEXT_NEXT32(fInputText);
b75a7d8f
A
4248 }
4249 printf("\n");
4250 printf("\n");
4251 }
729e4ab9 4252#endif
57a6839d 4253
b75a7d8f
A
4254 if (U_FAILURE(status)) {
4255 return;
4256 }
57a6839d 4257
b75a7d8f 4258 // Cache frequently referenced items from the compiled pattern
b75a7d8f 4259 //
729e4ab9 4260 int64_t *pat = fPattern->fCompiledPat->getBuffer();
57a6839d 4261
b75a7d8f
A
4262 const UChar *litText = fPattern->fLiteralText.getBuffer();
4263 UVector *sets = fPattern->fSets;
57a6839d 4264
729e4ab9 4265 const UChar *inputBuf = fInputText->chunkContents;
57a6839d 4266
46f4442e 4267 fFrameSize = fPattern->fFrameSize;
b75a7d8f 4268 REStackFrame *fp = resetStack();
57a6839d 4269
b75a7d8f
A
4270 fp->fPatIdx = 0;
4271 fp->fInputIdx = startIdx;
57a6839d 4272
b75a7d8f
A
4273 // Zero out the pattern's static data
4274 int32_t i;
4275 for (i = 0; i<fPattern->fDataSize; i++) {
4276 fData[i] = 0;
4277 }
57a6839d 4278
b75a7d8f
A
4279 //
4280 // Main loop for interpreting the compiled pattern.
4281 // One iteration of the loop per pattern operation performed.
4282 //
4283 for (;;) {
729e4ab9 4284 op = (int32_t)pat[fp->fPatIdx];
b75a7d8f
A
4285 opType = URX_TYPE(op);
4286 opValue = URX_VAL(op);
729e4ab9 4287#ifdef REGEX_RUN_DEBUG
b75a7d8f 4288 if (fTraceDebug) {
729e4ab9 4289 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
57a6839d 4290 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
729e4ab9 4291 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
b75a7d8f
A
4292 fPattern->dumpOp(fp->fPatIdx);
4293 }
729e4ab9 4294#endif
b75a7d8f 4295 fp->fPatIdx++;
57a6839d 4296
b75a7d8f 4297 switch (opType) {
57a6839d
A
4298
4299
b75a7d8f
A
4300 case URX_NOP:
4301 break;
57a6839d
A
4302
4303
b75a7d8f
A
4304 case URX_BACKTRACK:
4305 // Force a backtrack. In some circumstances, the pattern compiler
4306 // will notice that the pattern can't possibly match anything, and will
4307 // emit one of these at that point.
46f4442e 4308 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f 4309 break;
57a6839d
A
4310
4311
b75a7d8f 4312 case URX_ONECHAR:
46f4442e 4313 if (fp->fInputIdx < fActiveLimit) {
729e4ab9 4314 UChar32 c;
46f4442e
A
4315 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4316 if (c == opValue) {
b75a7d8f
A
4317 break;
4318 }
46f4442e
A
4319 } else {
4320 fHitEnd = TRUE;
b75a7d8f 4321 }
729e4ab9
A
4322 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4323 break;
57a6839d
A
4324
4325
b75a7d8f
A
4326 case URX_STRING:
4327 {
4328 // Test input against a literal string.
4329 // Strings require two slots in the compiled pattern, one for the
4330 // offset to the string text, and one for the length.
4331 int32_t stringStartIdx = opValue;
4332 int32_t stringLen;
57a6839d 4333
729e4ab9 4334 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
b75a7d8f
A
4335 fp->fPatIdx++;
4336 opType = URX_TYPE(op);
4337 stringLen = URX_VAL(op);
4338 U_ASSERT(opType == URX_STRING_LEN);
4339 U_ASSERT(stringLen >= 2);
57a6839d 4340
b75a7d8f 4341 const UChar * pInp = inputBuf + fp->fInputIdx;
4388f060 4342 const UChar * pInpLimit = inputBuf + fActiveLimit;
b75a7d8f
A
4343 const UChar * pPat = litText+stringStartIdx;
4344 const UChar * pEnd = pInp + stringLen;
4388f060
A
4345 UBool success = TRUE;
4346 while (pInp < pEnd) {
4347 if (pInp >= pInpLimit) {
4348 fHitEnd = TRUE;
4349 success = FALSE;
4350 break;
4351 }
4352 if (*pInp++ != *pPat++) {
4353 success = FALSE;
b75a7d8f
A
4354 break;
4355 }
4356 }
57a6839d 4357
729e4ab9
A
4358 if (success) {
4359 fp->fInputIdx += stringLen;
4360 } else {
729e4ab9
A
4361 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4362 }
b75a7d8f 4363 }
729e4ab9 4364 break;
57a6839d
A
4365
4366
b75a7d8f 4367 case URX_STATE_SAVE:
46f4442e 4368 fp = StateSave(fp, opValue, status);
b75a7d8f 4369 break;
57a6839d
A
4370
4371
b75a7d8f
A
4372 case URX_END:
4373 // The match loop will exit via this path on a successful match,
4374 // when we reach the end of the pattern.
46f4442e
A
4375 if (toEnd && fp->fInputIdx != fActiveLimit) {
4376 // The pattern matched, but not to the end of input. Try some more.
4377 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4378 break;
4379 }
b75a7d8f
A
4380 isMatch = TRUE;
4381 goto breakFromLoop;
57a6839d 4382
729e4ab9 4383 // Start and End Capture stack frame variables are laid out out like this:
b75a7d8f
A
4384 // fp->fExtra[opValue] - The start of a completed capture group
4385 // opValue+1 - The end of a completed capture group
4386 // opValue+2 - the start of a capture group whose end
4387 // has not yet been reached (and might not ever be).
4388 case URX_START_CAPTURE:
46f4442e 4389 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
b75a7d8f
A
4390 fp->fExtra[opValue+2] = fp->fInputIdx;
4391 break;
57a6839d
A
4392
4393
b75a7d8f 4394 case URX_END_CAPTURE:
46f4442e 4395 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
b75a7d8f
A
4396 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
4397 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
4398 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
4399 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4400 break;
57a6839d
A
4401
4402
b75a7d8f 4403 case URX_DOLLAR: // $, test for End of line
729e4ab9 4404 // or for position before new line at end of input
46f4442e 4405 if (fp->fInputIdx < fAnchorLimit-2) {
b75a7d8f 4406 // We are no where near the end of input. Fail.
46f4442e
A
4407 // This is the common case. Keep it first.
4408 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4409 break;
4410 }
46f4442e 4411 if (fp->fInputIdx >= fAnchorLimit) {
b75a7d8f 4412 // We really are at the end of input. Success.
46f4442e
A
4413 fHitEnd = TRUE;
4414 fRequireEnd = TRUE;
b75a7d8f
A
4415 break;
4416 }
57a6839d 4417
b75a7d8f
A
4418 // If we are positioned just before a new-line that is located at the
4419 // end of input, succeed.
46f4442e 4420 if (fp->fInputIdx == fAnchorLimit-1) {
729e4ab9
A
4421 UChar32 c;
4422 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
57a6839d 4423
46f4442e 4424 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
46f4442e 4425 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
374ca955 4426 // At new-line at end of input. Success
46f4442e
A
4427 fHitEnd = TRUE;
4428 fRequireEnd = TRUE;
4429 break;
374ca955 4430 }
b75a7d8f 4431 }
729e4ab9
A
4432 } else if (fp->fInputIdx == fAnchorLimit-2 &&
4433 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
46f4442e
A
4434 fHitEnd = TRUE;
4435 fRequireEnd = TRUE;
b75a7d8f 4436 break; // At CR/LF at end of input. Success
b75a7d8f 4437 }
57a6839d 4438
46f4442e 4439 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
57a6839d 4440
46f4442e 4441 break;
57a6839d
A
4442
4443
729e4ab9 4444 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
46f4442e
A
4445 if (fp->fInputIdx >= fAnchorLimit-1) {
4446 // Either at the last character of input, or off the end.
4447 if (fp->fInputIdx == fAnchorLimit-1) {
4448 // At last char of input. Success if it's a new line.
729e4ab9 4449 if (inputBuf[fp->fInputIdx] == 0x0a) {
46f4442e
A
4450 fHitEnd = TRUE;
4451 fRequireEnd = TRUE;
4452 break;
4453 }
4454 } else {
4455 // Off the end of input. Success.
4456 fHitEnd = TRUE;
4457 fRequireEnd = TRUE;
4458 break;
4459 }
4460 }
57a6839d 4461
46f4442e
A
4462 // Not at end of input. Back-track out.
4463 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f 4464 break;
57a6839d
A
4465
4466
729e4ab9
A
4467 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
4468 {
4469 if (fp->fInputIdx >= fAnchorLimit) {
4470 // We really are at the end of input. Success.
4471 fHitEnd = TRUE;
4472 fRequireEnd = TRUE;
4473 break;
4474 }
4475 // If we are positioned just before a new-line, succeed.
4476 // It makes no difference where the new-line is within the input.
4477 UChar32 c = inputBuf[fp->fInputIdx];
4478 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
4479 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4480 // In multi-line mode, hitting a new-line just before the end of input does not
4481 // set the hitEnd or requireEnd flags
4482 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
46f4442e 4483 break;
729e4ab9
A
4484 }
4485 }
4486 // not at a new line. Fail.
4487 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4488 }
4489 break;
57a6839d
A
4490
4491
729e4ab9
A
4492 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
4493 {
4494 if (fp->fInputIdx >= fAnchorLimit) {
4495 // We really are at the end of input. Success.
4496 fHitEnd = TRUE;
4497 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
4498 break; // adding a new-line would not lose the match.
4499 }
4500 // If we are not positioned just before a new-line, the test fails; backtrack out.
4501 // It makes no difference where the new-line is within the input.
4502 if (inputBuf[fp->fInputIdx] != 0x0a) {
4503 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4504 }
4505 }
4506 break;
57a6839d
A
4507
4508
729e4ab9 4509 case URX_CARET: // ^, test for start of line
46f4442e
A
4510 if (fp->fInputIdx != fAnchorStart) {
4511 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4512 }
b75a7d8f 4513 break;
57a6839d
A
4514
4515
729e4ab9
A
4516 case URX_CARET_M: // ^, test for start of line in mulit-line mode
4517 {
4518 if (fp->fInputIdx == fAnchorStart) {
4519 // We are at the start input. Success.
4520 break;
4521 }
4522 // Check whether character just before the current pos is a new-line
4523 // unless we are at the end of input
57a6839d
A
4524 UChar c = inputBuf[fp->fInputIdx - 1];
4525 if ((fp->fInputIdx < fAnchorLimit) &&
729e4ab9
A
4526 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
4527 // It's a new-line. ^ is true. Success.
4528 // TODO: what should be done with positions between a CR and LF?
4529 break;
4530 }
4531 // Not at the start of a line. Fail.
4532 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4533 }
4534 break;
57a6839d
A
4535
4536
729e4ab9
A
4537 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
4538 {
4539 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4540 if (fp->fInputIdx <= fAnchorStart) {
4541 // We are at the start input. Success.
4542 break;
4543 }
4544 // Check whether character just before the current pos is a new-line
4545 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
57a6839d 4546 UChar c = inputBuf[fp->fInputIdx - 1];
729e4ab9
A
4547 if (c != 0x0a) {
4548 // Not at the start of a line. Back-track out.
4549 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4550 }
4551 }
4552 break;
57a6839d 4553
b75a7d8f
A
4554 case URX_BACKSLASH_B: // Test for word boundaries
4555 {
729e4ab9 4556 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
51004dcb 4557 success ^= (UBool)(opValue != 0); // flip sense for \B
b75a7d8f 4558 if (!success) {
46f4442e 4559 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4560 }
4561 }
4562 break;
57a6839d
A
4563
4564
374ca955
A
4565 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
4566 {
4567 UBool success = isUWordBoundary(fp->fInputIdx);
51004dcb 4568 success ^= (UBool)(opValue != 0); // flip sense for \B
374ca955 4569 if (!success) {
46f4442e 4570 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
374ca955
A
4571 }
4572 }
4573 break;
57a6839d
A
4574
4575
b75a7d8f
A
4576 case URX_BACKSLASH_D: // Test for decimal digit
4577 {
46f4442e
A
4578 if (fp->fInputIdx >= fActiveLimit) {
4579 fHitEnd = TRUE;
4580 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4581 break;
4582 }
57a6839d 4583
729e4ab9
A
4584 UChar32 c;
4585 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
46f4442e 4586 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
b75a7d8f 4587 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
51004dcb 4588 success ^= (UBool)(opValue != 0); // flip sense for \D
729e4ab9 4589 if (!success) {
46f4442e 4590 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4591 }
4592 }
4593 break;
57a6839d
A
4594
4595
b75a7d8f 4596 case URX_BACKSLASH_G: // Test for position at end of previous match
729e4ab9 4597 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
46f4442e 4598 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4599 }
4600 break;
57a6839d
A
4601
4602
4603 case URX_BACKSLASH_X:
729e4ab9
A
4604 // Match a Grapheme, as defined by Unicode TR 29.
4605 // Differs slightly from Perl, which consumes combining marks independently
4606 // of context.
4607 {
b75a7d8f 4608
729e4ab9
A
4609 // Fail if at end of input
4610 if (fp->fInputIdx >= fActiveLimit) {
4611 fHitEnd = TRUE;
4612 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4613 break;
4614 }
b75a7d8f 4615
729e4ab9
A
4616 // Examine (and consume) the current char.
4617 // Dispatch into a little state machine, based on the char.
4618 UChar32 c;
4619 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4620 UnicodeSet **sets = fPattern->fStaticSets;
4621 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
4622 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
4623 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4624 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4625 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4626 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4627 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4628 goto GC_Extend;
b75a7d8f
A
4629
4630
4631
4632GC_L:
729e4ab9
A
4633 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4634 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4635 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4636 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4637 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4638 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4639 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4640 goto GC_Extend;
b75a7d8f
A
4641
4642GC_V:
729e4ab9
A
4643 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4644 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4645 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4646 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4647 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4648 goto GC_Extend;
b75a7d8f
A
4649
4650GC_T:
729e4ab9
A
4651 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4652 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4653 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4654 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4655 goto GC_Extend;
b75a7d8f
A
4656
4657GC_Extend:
729e4ab9
A
4658 // Combining characters are consumed here
4659 for (;;) {
4660 if (fp->fInputIdx >= fActiveLimit) {
4661 break;
b75a7d8f 4662 }
729e4ab9
A
4663 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4664 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4665 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4666 break;
4667 }
4668 }
4669 goto GC_Done;
b75a7d8f
A
4670
4671GC_Control:
57a6839d 4672 // Most control chars stand alone (don't combine with combining chars),
729e4ab9
A
4673 // except for that CR/LF sequence is a single grapheme cluster.
4674 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
4675 fp->fInputIdx++;
4676 }
b75a7d8f
A
4677
4678GC_Done:
729e4ab9
A
4679 if (fp->fInputIdx >= fActiveLimit) {
4680 fHitEnd = TRUE;
b75a7d8f 4681 }
729e4ab9
A
4682 break;
4683 }
57a6839d
A
4684
4685
4686
4687
46f4442e
A
4688 case URX_BACKSLASH_Z: // Test for end of Input
4689 if (fp->fInputIdx < fAnchorLimit) {
4690 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4691 } else {
4692 fHitEnd = TRUE;
4693 fRequireEnd = TRUE;
b75a7d8f
A
4694 }
4695 break;
57a6839d
A
4696
4697
4698
b75a7d8f
A
4699 case URX_STATIC_SETREF:
4700 {
4701 // Test input character against one of the predefined sets
4702 // (Word Characters, for example)
4703 // The high bit of the op value is a flag for the match polarity.
4704 // 0: success if input char is in set.
4705 // 1: success if input char is not in set.
46f4442e
A
4706 if (fp->fInputIdx >= fActiveLimit) {
4707 fHitEnd = TRUE;
4708 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4709 break;
4710 }
57a6839d
A
4711
4712 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
b75a7d8f
A
4713 opValue &= ~URX_NEG_SET;
4714 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
57a6839d 4715
729e4ab9 4716 UChar32 c;
46f4442e 4717 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
b75a7d8f
A
4718 if (c < 256) {
4719 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4720 if (s8->contains(c)) {
4721 success = !success;
4722 }
4723 } else {
4724 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4725 if (s->contains(c)) {
4726 success = !success;
4727 }
4728 }
4729 if (!success) {
46f4442e 4730 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4731 }
4732 }
4733 break;
57a6839d
A
4734
4735
b75a7d8f
A
4736 case URX_STAT_SETREF_N:
4737 {
57a6839d 4738 // Test input character for NOT being a member of one of
b75a7d8f 4739 // the predefined sets (Word Characters, for example)
46f4442e
A
4740 if (fp->fInputIdx >= fActiveLimit) {
4741 fHitEnd = TRUE;
4742 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4743 break;
4744 }
57a6839d 4745
b75a7d8f 4746 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
57a6839d 4747
b75a7d8f 4748 UChar32 c;
46f4442e 4749 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
b75a7d8f
A
4750 if (c < 256) {
4751 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4752 if (s8->contains(c) == FALSE) {
4753 break;
4754 }
4755 } else {
4756 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4757 if (s->contains(c) == FALSE) {
4758 break;
4759 }
4760 }
46f4442e 4761 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4762 }
4763 break;
57a6839d
A
4764
4765
b75a7d8f 4766 case URX_SETREF:
729e4ab9
A
4767 {
4768 if (fp->fInputIdx >= fActiveLimit) {
4769 fHitEnd = TRUE;
4770 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
46f4442e
A
4771 break;
4772 }
57a6839d 4773
729e4ab9
A
4774 U_ASSERT(opValue > 0 && opValue < sets->size());
4775
4776 // There is input left. Pick up one char and test it for set membership.
4777 UChar32 c;
4778 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4779 if (c<256) {
4780 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4781 if (s8->contains(c)) {
4782 // The character is in the set. A Match.
4783 break;
4784 }
4785 } else {
4786 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4787 if (s->contains(c)) {
4788 // The character is in the set. A Match.
4789 break;
4790 }
4791 }
57a6839d 4792
729e4ab9 4793 // the character wasn't in the set.
729e4ab9 4794 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
46f4442e 4795 }
b75a7d8f 4796 break;
57a6839d
A
4797
4798
b75a7d8f
A
4799 case URX_DOTANY:
4800 {
4801 // . matches anything, but stops at end-of-line.
46f4442e 4802 if (fp->fInputIdx >= fActiveLimit) {
b75a7d8f 4803 // At end of input. Match failed. Backtrack out.
46f4442e
A
4804 fHitEnd = TRUE;
4805 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4806 break;
4807 }
57a6839d 4808
b75a7d8f 4809 // There is input left. Advance over one char, unless we've hit end-of-line
729e4ab9 4810 UChar32 c;
46f4442e 4811 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
b75a7d8f 4812 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
73c04bcf 4813 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
b75a7d8f 4814 // End of line in normal mode. . does not match.
729e4ab9 4815 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4816 break;
4817 }
4818 }
4819 break;
57a6839d
A
4820
4821
b75a7d8f
A
4822 case URX_DOTANY_ALL:
4823 {
729e4ab9 4824 // . in dot-matches-all (including new lines) mode
46f4442e 4825 if (fp->fInputIdx >= fActiveLimit) {
b75a7d8f 4826 // At end of input. Match failed. Backtrack out.
46f4442e
A
4827 fHitEnd = TRUE;
4828 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4829 break;
4830 }
57a6839d 4831
b75a7d8f
A
4832 // There is input left. Advance over one char, except if we are
4833 // at a cr/lf, advance over both of them.
57a6839d 4834 UChar32 c;
46f4442e
A
4835 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4836 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
b75a7d8f 4837 // In the case of a CR/LF, we need to advance over both.
729e4ab9
A
4838 if (inputBuf[fp->fInputIdx] == 0x0a) {
4839 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
b75a7d8f
A
4840 }
4841 }
4842 }
4843 break;
57a6839d
A
4844
4845
46f4442e 4846 case URX_DOTANY_UNIX:
b75a7d8f 4847 {
46f4442e
A
4848 // '.' operator, matches all, but stops at end-of-line.
4849 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
4850 if (fp->fInputIdx >= fActiveLimit) {
4851 // At end of input. Match failed. Backtrack out.
4852 fHitEnd = TRUE;
4853 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4854 break;
4855 }
57a6839d 4856
46f4442e 4857 // There is input left. Advance over one char, unless we've hit end-of-line
57a6839d 4858 UChar32 c;
46f4442e
A
4859 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4860 if (c == 0x0a) {
4861 // End of line in normal mode. '.' does not match the \n
4862 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4863 }
4864 }
4865 break;
57a6839d
A
4866
4867
b75a7d8f
A
4868 case URX_JMP:
4869 fp->fPatIdx = opValue;
4870 break;
57a6839d 4871
b75a7d8f
A
4872 case URX_FAIL:
4873 isMatch = FALSE;
4874 goto breakFromLoop;
57a6839d 4875
b75a7d8f
A
4876 case URX_JMP_SAV:
4877 U_ASSERT(opValue < fPattern->fCompiledPat->size());
46f4442e
A
4878 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
4879 fp->fPatIdx = opValue; // Then JMP.
b75a7d8f 4880 break;
57a6839d 4881
b75a7d8f
A
4882 case URX_JMP_SAV_X:
4883 // This opcode is used with (x)+, when x can match a zero length string.
4884 // Same as JMP_SAV, except conditional on the match having made forward progress.
4885 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
4886 // data address of the input position at the start of the loop.
4887 {
4888 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
729e4ab9 4889 int32_t stoOp = (int32_t)pat[opValue-1];
b75a7d8f
A
4890 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
4891 int32_t frameLoc = URX_VAL(stoOp);
46f4442e 4892 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
729e4ab9 4893 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
b75a7d8f
A
4894 U_ASSERT(prevInputIdx <= fp->fInputIdx);
4895 if (prevInputIdx < fp->fInputIdx) {
4896 // The match did make progress. Repeat the loop.
46f4442e 4897 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
b75a7d8f
A
4898 fp->fPatIdx = opValue;
4899 fp->fExtra[frameLoc] = fp->fInputIdx;
57a6839d 4900 }
b75a7d8f
A
4901 // If the input position did not advance, we do nothing here,
4902 // execution will fall out of the loop.
4903 }
4904 break;
57a6839d 4905
b75a7d8f
A
4906 case URX_CTR_INIT:
4907 {
46f4442e 4908 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
57a6839d
A
4909 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
4910
b75a7d8f 4911 // Pick up the three extra operands that CTR_INIT has, and
57a6839d 4912 // skip the pattern location counter past
729e4ab9 4913 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
b75a7d8f
A
4914 fp->fPatIdx += 3;
4915 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
729e4ab9
A
4916 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
4917 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
b75a7d8f
A
4918 U_ASSERT(minCount>=0);
4919 U_ASSERT(maxCount>=minCount || maxCount==-1);
57a6839d
A
4920 U_ASSERT(loopLoc>=fp->fPatIdx);
4921
b75a7d8f 4922 if (minCount == 0) {
46f4442e 4923 fp = StateSave(fp, loopLoc+1, status);
b75a7d8f 4924 }
57a6839d
A
4925 if (maxCount == -1) {
4926 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
4927 } else if (maxCount == 0) {
46f4442e 4928 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
4929 }
4930 }
4931 break;
57a6839d 4932
b75a7d8f
A
4933 case URX_CTR_LOOP:
4934 {
4935 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
729e4ab9 4936 int32_t initOp = (int32_t)pat[opValue];
b75a7d8f 4937 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
729e4ab9
A
4938 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
4939 int32_t minCount = (int32_t)pat[opValue+2];
4940 int32_t maxCount = (int32_t)pat[opValue+3];
b75a7d8f 4941 (*pCounter)++;
57a6839d
A
4942 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
4943 U_ASSERT(*pCounter == maxCount);
b75a7d8f
A
4944 break;
4945 }
4946 if (*pCounter >= minCount) {
57a6839d
A
4947 if (maxCount == -1) {
4948 // Loop has no hard upper bound.
4949 // Check that it is progressing through the input, break if it is not.
4950 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
4951 if (fp->fInputIdx == *pLastInputIdx) {
4952 break;
4953 } else {
4954 *pLastInputIdx = fp->fInputIdx;
4955 }
4956 }
46f4442e 4957 fp = StateSave(fp, fp->fPatIdx, status);
b75a7d8f
A
4958 }
4959 fp->fPatIdx = opValue + 4; // Loop back.
4960 }
4961 break;
57a6839d 4962
b75a7d8f
A
4963 case URX_CTR_INIT_NG:
4964 {
46f4442e
A
4965 // Initialize a non-greedy loop
4966 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
57a6839d
A
4967 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
4968
4969 // Pick up the three extra operands that CTR_INIT_NG has, and
4970 // skip the pattern location counter past
729e4ab9 4971 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
b75a7d8f
A
4972 fp->fPatIdx += 3;
4973 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
729e4ab9
A
4974 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
4975 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
b75a7d8f
A
4976 U_ASSERT(minCount>=0);
4977 U_ASSERT(maxCount>=minCount || maxCount==-1);
4978 U_ASSERT(loopLoc>fp->fPatIdx);
57a6839d
A
4979 if (maxCount == -1) {
4980 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
4981 }
4982
b75a7d8f
A
4983 if (minCount == 0) {
4984 if (maxCount != 0) {
46f4442e 4985 fp = StateSave(fp, fp->fPatIdx, status);
b75a7d8f
A
4986 }
4987 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
57a6839d 4988 }
b75a7d8f
A
4989 }
4990 break;
57a6839d 4991
b75a7d8f
A
4992 case URX_CTR_LOOP_NG:
4993 {
46f4442e 4994 // Non-greedy {min, max} loops
b75a7d8f 4995 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
729e4ab9 4996 int32_t initOp = (int32_t)pat[opValue];
b75a7d8f 4997 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
729e4ab9
A
4998 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
4999 int32_t minCount = (int32_t)pat[opValue+2];
5000 int32_t maxCount = (int32_t)pat[opValue+3];
57a6839d 5001
b75a7d8f 5002 (*pCounter)++;
57a6839d 5003 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
b75a7d8f
A
5004 // The loop has matched the maximum permitted number of times.
5005 // Break out of here with no action. Matching will
5006 // continue with the following pattern.
57a6839d 5007 U_ASSERT(*pCounter == maxCount);
b75a7d8f
A
5008 break;
5009 }
57a6839d 5010
b75a7d8f
A
5011 if (*pCounter < minCount) {
5012 // We haven't met the minimum number of matches yet.
5013 // Loop back for another one.
5014 fp->fPatIdx = opValue + 4; // Loop back.
5015 } else {
5016 // We do have the minimum number of matches.
57a6839d
A
5017
5018 // If there is no upper bound on the loop iterations, check that the input index
5019 // is progressing, and stop the loop if it is not.
5020 if (maxCount == -1) {
5021 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5022 if (fp->fInputIdx == *pLastInputIdx) {
5023 break;
5024 }
5025 *pLastInputIdx = fp->fInputIdx;
5026 }
5027
5028 // Loop Continuation: we will fall into the pattern following the loop
5029 // (non-greedy, don't execute loop body first), but first do
5030 // a state save to the top of the loop, so that a match failure
b75a7d8f 5031 // in the following pattern will try another iteration of the loop.
46f4442e 5032 fp = StateSave(fp, opValue + 4, status);
b75a7d8f
A
5033 }
5034 }
5035 break;
57a6839d 5036
b75a7d8f
A
5037 case URX_STO_SP:
5038 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5039 fData[opValue] = fStack->size();
5040 break;
57a6839d 5041
b75a7d8f
A
5042 case URX_LD_SP:
5043 {
5044 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
729e4ab9 5045 int32_t newStackSize = (int32_t)fData[opValue];
b75a7d8f 5046 U_ASSERT(newStackSize <= fStack->size());
729e4ab9
A
5047 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5048 if (newFP == (int64_t *)fp) {
b75a7d8f
A
5049 break;
5050 }
5051 int32_t i;
46f4442e 5052 for (i=0; i<fFrameSize; i++) {
729e4ab9 5053 newFP[i] = ((int64_t *)fp)[i];
b75a7d8f
A
5054 }
5055 fp = (REStackFrame *)newFP;
5056 fStack->setSize(newStackSize);
5057 }
5058 break;
57a6839d 5059
b75a7d8f 5060 case URX_BACKREF:
4388f060
A
5061 {
5062 U_ASSERT(opValue < fFrameSize);
5063 int64_t groupStartIdx = fp->fExtra[opValue];
5064 int64_t groupEndIdx = fp->fExtra[opValue+1];
5065 U_ASSERT(groupStartIdx <= groupEndIdx);
5066 int64_t inputIndex = fp->fInputIdx;
5067 if (groupStartIdx < 0) {
5068 // This capture group has not participated in the match thus far,
5069 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5070 break;
5071 }
5072 UBool success = TRUE;
5073 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
5074 if (inputIndex >= fActiveLimit) {
5075 success = FALSE;
5076 fHitEnd = TRUE;
5077 break;
5078 }
5079 if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
5080 success = FALSE;
5081 break;
5082 }
5083 }
5084 if (success) {
5085 fp->fInputIdx = inputIndex;
5086 } else {
5087 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5088 }
5089 }
5090 break;
57a6839d 5091
b75a7d8f
A
5092 case URX_BACKREF_I:
5093 {
46f4442e 5094 U_ASSERT(opValue < fFrameSize);
729e4ab9
A
5095 int64_t groupStartIdx = fp->fExtra[opValue];
5096 int64_t groupEndIdx = fp->fExtra[opValue+1];
b75a7d8f 5097 U_ASSERT(groupStartIdx <= groupEndIdx);
b75a7d8f
A
5098 if (groupStartIdx < 0) {
5099 // This capture group has not participated in the match thus far,
46f4442e 5100 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
4388f060 5101 break;
b75a7d8f 5102 }
4388f060
A
5103 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
5104 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
b75a7d8f 5105
4388f060 5106 // Note: if the capture group match was of an empty string the backref
57a6839d 5107 // match succeeds. Verified by testing: Perl matches succeed
4388f060 5108 // in this case, so we do too.
57a6839d 5109
4388f060
A
5110 UBool success = TRUE;
5111 for (;;) {
5112 UChar32 captureGroupChar = captureGroupItr.next();
5113 if (captureGroupChar == U_SENTINEL) {
5114 success = TRUE;
b75a7d8f
A
5115 break;
5116 }
4388f060
A
5117 UChar32 inputChar = inputItr.next();
5118 if (inputChar == U_SENTINEL) {
5119 success = FALSE;
5120 fHitEnd = TRUE;
5121 break;
b75a7d8f 5122 }
4388f060
A
5123 if (inputChar != captureGroupChar) {
5124 success = FALSE;
5125 break;
5126 }
5127 }
5128
5129 if (success && inputItr.inExpansion()) {
57a6839d
A
5130 // We otained a match by consuming part of a string obtained from
5131 // case-folding a single code point of the input text.
4388f060
A
5132 // This does not count as an overall match.
5133 success = FALSE;
b75a7d8f 5134 }
4388f060
A
5135
5136 if (success) {
5137 fp->fInputIdx = inputItr.getIndex();
b75a7d8f 5138 } else {
4388f060 5139 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
5140 }
5141 }
5142 break;
4388f060 5143
b75a7d8f
A
5144 case URX_STO_INP_LOC:
5145 {
46f4442e 5146 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
b75a7d8f
A
5147 fp->fExtra[opValue] = fp->fInputIdx;
5148 }
5149 break;
57a6839d 5150
b75a7d8f
A
5151 case URX_JMPX:
5152 {
729e4ab9 5153 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
b75a7d8f
A
5154 fp->fPatIdx += 1;
5155 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
46f4442e 5156 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
729e4ab9 5157 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
b75a7d8f
A
5158 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5159 if (savedInputIdx < fp->fInputIdx) {
5160 fp->fPatIdx = opValue; // JMP
5161 } else {
729e4ab9 5162 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
b75a7d8f
A
5163 }
5164 }
5165 break;
57a6839d 5166
b75a7d8f
A
5167 case URX_LA_START:
5168 {
5169 // Entering a lookahead block.
5170 // Save Stack Ptr, Input Pos.
5171 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5172 fData[opValue] = fStack->size();
5173 fData[opValue+1] = fp->fInputIdx;
46f4442e
A
5174 fActiveStart = fLookStart; // Set the match region change for
5175 fActiveLimit = fLookLimit; // transparent bounds.
b75a7d8f
A
5176 }
5177 break;
57a6839d 5178
b75a7d8f
A
5179 case URX_LA_END:
5180 {
5181 // Leaving a look-ahead block.
5182 // restore Stack Ptr, Input Pos to positions they had on entry to block.
5183 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5184 int32_t stackSize = fStack->size();
729e4ab9 5185 int32_t newStackSize = (int32_t)fData[opValue];
b75a7d8f
A
5186 U_ASSERT(stackSize >= newStackSize);
5187 if (stackSize > newStackSize) {
46f4442e
A
5188 // Copy the current top frame back to the new (cut back) top frame.
5189 // This makes the capture groups from within the look-ahead
5190 // expression available.
729e4ab9 5191 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
b75a7d8f 5192 int32_t i;
46f4442e 5193 for (i=0; i<fFrameSize; i++) {
729e4ab9 5194 newFP[i] = ((int64_t *)fp)[i];
b75a7d8f
A
5195 }
5196 fp = (REStackFrame *)newFP;
5197 fStack->setSize(newStackSize);
5198 }
5199 fp->fInputIdx = fData[opValue+1];
57a6839d 5200
46f4442e
A
5201 // Restore the active region bounds in the input string; they may have
5202 // been changed because of transparent bounds on a Region.
5203 fActiveStart = fRegionStart;
5204 fActiveLimit = fRegionLimit;
b75a7d8f
A
5205 }
5206 break;
57a6839d 5207
b75a7d8f 5208 case URX_ONECHAR_I:
46f4442e 5209 if (fp->fInputIdx < fActiveLimit) {
57a6839d 5210 UChar32 c;
46f4442e
A
5211 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5212 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
b75a7d8f
A
5213 break;
5214 }
46f4442e
A
5215 } else {
5216 fHitEnd = TRUE;
5217 }
5218 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f 5219 break;
57a6839d 5220
b75a7d8f 5221 case URX_STRING_I:
4388f060
A
5222 // Case-insensitive test input against a literal string.
5223 // Strings require two slots in the compiled pattern, one for the
5224 // offset to the string text, and one for the length.
5225 // The compiled string has already been case folded.
b75a7d8f 5226 {
4388f060
A
5227 const UChar *patternString = litText + opValue;
5228
5229 op = (int32_t)pat[fp->fPatIdx];
5230 fp->fPatIdx++;
5231 opType = URX_TYPE(op);
5232 opValue = URX_VAL(op);
5233 U_ASSERT(opType == URX_STRING_LEN);
5234 int32_t patternStringLen = opValue; // Length of the string from the pattern.
57a6839d 5235
4388f060
A
5236 UChar32 cText;
5237 UChar32 cPattern;
5238 UBool success = TRUE;
5239 int32_t patternStringIdx = 0;
5240 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5241 while (patternStringIdx < patternStringLen) {
5242 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
5243 cText = inputIterator.next();
5244 if (cText != cPattern) {
5245 success = FALSE;
5246 if (cText == U_SENTINEL) {
5247 fHitEnd = TRUE;
729e4ab9 5248 }
4388f060 5249 break;
374ca955 5250 }
46f4442e 5251 }
4388f060
A
5252 if (inputIterator.inExpansion()) {
5253 success = FALSE;
5254 }
5255
5256 if (success) {
5257 fp->fInputIdx = inputIterator.getIndex();
5258 } else {
5259 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5260 }
b75a7d8f
A
5261 }
5262 break;
4388f060 5263
b75a7d8f
A
5264 case URX_LB_START:
5265 {
5266 // Entering a look-behind block.
5267 // Save Stack Ptr, Input Pos.
46f4442e 5268 // TODO: implement transparent bounds. Ticket #6067
b75a7d8f
A
5269 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5270 fData[opValue] = fStack->size();
5271 fData[opValue+1] = fp->fInputIdx;
5272 // Init the variable containing the start index for attempted matches.
5273 fData[opValue+2] = -1;
5274 // Save input string length, then reset to pin any matches to end at
5275 // the current position.
46f4442e
A
5276 fData[opValue+3] = fActiveLimit;
5277 fActiveLimit = fp->fInputIdx;
b75a7d8f
A
5278 }
5279 break;
57a6839d
A
5280
5281
b75a7d8f
A
5282 case URX_LB_CONT:
5283 {
5284 // Positive Look-Behind, at top of loop checking for matches of LB expression
5285 // at all possible input starting positions.
57a6839d 5286
b75a7d8f
A
5287 // Fetch the min and max possible match lengths. They are the operands
5288 // of this op in the pattern.
729e4ab9
A
5289 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5290 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
b75a7d8f
A
5291 U_ASSERT(minML <= maxML);
5292 U_ASSERT(minML >= 0);
57a6839d 5293
b75a7d8f
A
5294 // Fetch (from data) the last input index where a match was attempted.
5295 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
729e4ab9 5296 int64_t *lbStartIdx = &fData[opValue+2];
b75a7d8f
A
5297 if (*lbStartIdx < 0) {
5298 // First time through loop.
5299 *lbStartIdx = fp->fInputIdx - minML;
5300 } else {
5301 // 2nd through nth time through the loop.
5302 // Back up start position for match by one.
5303 if (*lbStartIdx == 0) {
729e4ab9 5304 (*lbStartIdx)--;
b75a7d8f
A
5305 } else {
5306 U16_BACK_1(inputBuf, 0, *lbStartIdx);
5307 }
5308 }
57a6839d 5309
b75a7d8f
A
5310 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5311 // We have tried all potential match starting points without
5312 // getting a match. Backtrack out, and out of the
5313 // Look Behind altogether.
46f4442e 5314 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
729e4ab9 5315 int64_t restoreInputLen = fData[opValue+3];
46f4442e 5316 U_ASSERT(restoreInputLen >= fActiveLimit);
729e4ab9 5317 U_ASSERT(restoreInputLen <= fInputLength);
46f4442e 5318 fActiveLimit = restoreInputLen;
b75a7d8f
A
5319 break;
5320 }
57a6839d 5321
b75a7d8f
A
5322 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5323 // (successful match will fall off the end of the loop.)
46f4442e 5324 fp = StateSave(fp, fp->fPatIdx-3, status);
b75a7d8f
A
5325 fp->fInputIdx = *lbStartIdx;
5326 }
5327 break;
57a6839d 5328
b75a7d8f
A
5329 case URX_LB_END:
5330 // End of a look-behind block, after a successful match.
5331 {
5332 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
46f4442e 5333 if (fp->fInputIdx != fActiveLimit) {
b75a7d8f
A
5334 // The look-behind expression matched, but the match did not
5335 // extend all the way to the point that we are looking behind from.
5336 // FAIL out of here, which will take us back to the LB_CONT, which
5337 // will retry the match starting at another position or fail
5338 // the look-behind altogether, whichever is appropriate.
46f4442e 5339 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
5340 break;
5341 }
57a6839d 5342
b75a7d8f 5343 // Look-behind match is good. Restore the orignal input string length,
57a6839d 5344 // which had been truncated to pin the end of the lookbehind match to the
b75a7d8f 5345 // position being looked-behind.
729e4ab9 5346 int64_t originalInputLen = fData[opValue+3];
46f4442e 5347 U_ASSERT(originalInputLen >= fActiveLimit);
729e4ab9 5348 U_ASSERT(originalInputLen <= fInputLength);
46f4442e 5349 fActiveLimit = originalInputLen;
b75a7d8f
A
5350 }
5351 break;
57a6839d
A
5352
5353
b75a7d8f
A
5354 case URX_LBN_CONT:
5355 {
5356 // Negative Look-Behind, at top of loop checking for matches of LB expression
5357 // at all possible input starting positions.
57a6839d 5358
b75a7d8f 5359 // Fetch the extra parameters of this op.
729e4ab9
A
5360 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5361 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5362 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5363 continueLoc = URX_VAL(continueLoc);
b75a7d8f
A
5364 U_ASSERT(minML <= maxML);
5365 U_ASSERT(minML >= 0);
5366 U_ASSERT(continueLoc > fp->fPatIdx);
57a6839d 5367
b75a7d8f
A
5368 // Fetch (from data) the last input index where a match was attempted.
5369 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
729e4ab9 5370 int64_t *lbStartIdx = &fData[opValue+2];
b75a7d8f
A
5371 if (*lbStartIdx < 0) {
5372 // First time through loop.
5373 *lbStartIdx = fp->fInputIdx - minML;
5374 } else {
5375 // 2nd through nth time through the loop.
5376 // Back up start position for match by one.
5377 if (*lbStartIdx == 0) {
5378 (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0.
5379 } else {
5380 U16_BACK_1(inputBuf, 0, *lbStartIdx);
5381 }
5382 }
57a6839d 5383
b75a7d8f
A
5384 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5385 // We have tried all potential match starting points without
5386 // getting a match, which means that the negative lookbehind as
5387 // a whole has succeeded. Jump forward to the continue location
729e4ab9 5388 int64_t restoreInputLen = fData[opValue+3];
46f4442e 5389 U_ASSERT(restoreInputLen >= fActiveLimit);
729e4ab9 5390 U_ASSERT(restoreInputLen <= fInputLength);
46f4442e 5391 fActiveLimit = restoreInputLen;
b75a7d8f
A
5392 fp->fPatIdx = continueLoc;
5393 break;
5394 }
57a6839d 5395
b75a7d8f
A
5396 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5397 // (successful match will cause a FAIL out of the loop altogether.)
46f4442e 5398 fp = StateSave(fp, fp->fPatIdx-4, status);
b75a7d8f
A
5399 fp->fInputIdx = *lbStartIdx;
5400 }
5401 break;
57a6839d 5402
b75a7d8f
A
5403 case URX_LBN_END:
5404 // End of a negative look-behind block, after a successful match.
5405 {
5406 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
46f4442e 5407 if (fp->fInputIdx != fActiveLimit) {
b75a7d8f
A
5408 // The look-behind expression matched, but the match did not
5409 // extend all the way to the point that we are looking behind from.
5410 // FAIL out of here, which will take us back to the LB_CONT, which
5411 // will retry the match starting at another position or succeed
5412 // the look-behind altogether, whichever is appropriate.
46f4442e 5413 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
5414 break;
5415 }
57a6839d 5416
b75a7d8f
A
5417 // Look-behind expression matched, which means look-behind test as
5418 // a whole Fails
57a6839d
A
5419
5420 // Restore the orignal input string length, which had been truncated
5421 // inorder to pin the end of the lookbehind match
b75a7d8f 5422 // to the position being looked-behind.
729e4ab9 5423 int64_t originalInputLen = fData[opValue+3];
46f4442e 5424 U_ASSERT(originalInputLen >= fActiveLimit);
729e4ab9 5425 U_ASSERT(originalInputLen <= fInputLength);
46f4442e 5426 fActiveLimit = originalInputLen;
57a6839d 5427
b75a7d8f
A
5428 // Restore original stack position, discarding any state saved
5429 // by the successful pattern match.
5430 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
729e4ab9 5431 int32_t newStackSize = (int32_t)fData[opValue];
b75a7d8f
A
5432 U_ASSERT(fStack->size() > newStackSize);
5433 fStack->setSize(newStackSize);
57a6839d
A
5434
5435 // FAIL, which will take control back to someplace
b75a7d8f 5436 // prior to entering the look-behind test.
46f4442e 5437 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
b75a7d8f
A
5438 }
5439 break;
57a6839d
A
5440
5441
b75a7d8f
A
5442 case URX_LOOP_SR_I:
5443 // Loop Initialization for the optimized implementation of
5444 // [some character set]*
5445 // This op scans through all matching input.
5446 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5447 {
5448 U_ASSERT(opValue > 0 && opValue < sets->size());
5449 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5450 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
57a6839d 5451
b75a7d8f
A
5452 // Loop through input, until either the input is exhausted or
5453 // we reach a character that is not a member of the set.
729e4ab9 5454 int32_t ix = (int32_t)fp->fInputIdx;
b75a7d8f 5455 for (;;) {
46f4442e
A
5456 if (ix >= fActiveLimit) {
5457 fHitEnd = TRUE;
b75a7d8f
A
5458 break;
5459 }
5460 UChar32 c;
46f4442e 5461 U16_NEXT(inputBuf, ix, fActiveLimit, c);
b75a7d8f
A
5462 if (c<256) {
5463 if (s8->contains(c) == FALSE) {
5464 U16_BACK_1(inputBuf, 0, ix);
5465 break;
5466 }
5467 } else {
5468 if (s->contains(c) == FALSE) {
5469 U16_BACK_1(inputBuf, 0, ix);
5470 break;
5471 }
5472 }
5473 }
57a6839d 5474
b75a7d8f
A
5475 // If there were no matching characters, skip over the loop altogether.
5476 // The loop doesn't run at all, a * op always succeeds.
5477 if (ix == fp->fInputIdx) {
5478 fp->fPatIdx++; // skip the URX_LOOP_C op.
5479 break;
5480 }
57a6839d 5481
b75a7d8f
A
5482 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5483 // must follow. It's operand is the stack location
5484 // that holds the starting input index for the match of this [set]*
729e4ab9 5485 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
b75a7d8f
A
5486 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5487 int32_t stackLoc = URX_VAL(loopcOp);
46f4442e 5488 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
b75a7d8f
A
5489 fp->fExtra[stackLoc] = fp->fInputIdx;
5490 fp->fInputIdx = ix;
57a6839d 5491
b75a7d8f
A
5492 // Save State to the URX_LOOP_C op that follows this one,
5493 // so that match failures in the following code will return to there.
5494 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
46f4442e 5495 fp = StateSave(fp, fp->fPatIdx, status);
b75a7d8f
A
5496 fp->fPatIdx++;
5497 }
5498 break;
57a6839d
A
5499
5500
b75a7d8f
A
5501 case URX_LOOP_DOT_I:
5502 // Loop Initialization for the optimized implementation of .*
5503 // This op scans through all remaining input.
5504 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5505 {
5506 // Loop through input until the input is exhausted (we reach an end-of-line)
46f4442e 5507 // In DOTALL mode, we can just go straight to the end of the input.
374ca955 5508 int32_t ix;
46f4442e
A
5509 if ((opValue & 1) == 1) {
5510 // Dot-matches-All mode. Jump straight to the end of the string.
729e4ab9 5511 ix = (int32_t)fActiveLimit;
46f4442e 5512 fHitEnd = TRUE;
374ca955 5513 } else {
46f4442e 5514 // NOT DOT ALL mode. Line endings do not match '.'
b75a7d8f 5515 // Scan forward until a line ending or end of input.
729e4ab9 5516 ix = (int32_t)fp->fInputIdx;
b75a7d8f 5517 for (;;) {
46f4442e
A
5518 if (ix >= fActiveLimit) {
5519 fHitEnd = TRUE;
b75a7d8f
A
5520 break;
5521 }
5522 UChar32 c;
46f4442e 5523 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++]
729e4ab9
A
5524 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
5525 if ((c == 0x0a) || // 0x0a is newline in both modes.
5526 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
5527 ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
46f4442e
A
5528 // char is a line ending. Put the input pos back to the
5529 // line ending char, and exit the scanning loop.
5530 U16_BACK_1(inputBuf, 0, ix);
5531 break;
5532 }
b75a7d8f
A
5533 }
5534 }
5535 }
57a6839d 5536
b75a7d8f
A
5537 // If there were no matching characters, skip over the loop altogether.
5538 // The loop doesn't run at all, a * op always succeeds.
5539 if (ix == fp->fInputIdx) {
5540 fp->fPatIdx++; // skip the URX_LOOP_C op.
5541 break;
5542 }
57a6839d 5543
b75a7d8f
A
5544 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5545 // must follow. It's operand is the stack location
46f4442e 5546 // that holds the starting input index for the match of this .*
729e4ab9 5547 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
b75a7d8f
A
5548 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5549 int32_t stackLoc = URX_VAL(loopcOp);
46f4442e 5550 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
b75a7d8f
A
5551 fp->fExtra[stackLoc] = fp->fInputIdx;
5552 fp->fInputIdx = ix;
57a6839d 5553
b75a7d8f
A
5554 // Save State to the URX_LOOP_C op that follows this one,
5555 // so that match failures in the following code will return to there.
5556 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
46f4442e 5557 fp = StateSave(fp, fp->fPatIdx, status);
b75a7d8f
A
5558 fp->fPatIdx++;
5559 }
5560 break;
57a6839d
A
5561
5562
b75a7d8f
A
5563 case URX_LOOP_C:
5564 {
46f4442e 5565 U_ASSERT(opValue>=0 && opValue<fFrameSize);
729e4ab9
A
5566 backSearchIndex = (int32_t)fp->fExtra[opValue];
5567 U_ASSERT(backSearchIndex <= fp->fInputIdx);
5568 if (backSearchIndex == fp->fInputIdx) {
b75a7d8f 5569 // We've backed up the input idx to the point that the loop started.
57a6839d 5570 // The loop is done. Leave here without saving state.
b75a7d8f
A
5571 // Subsequent failures won't come back here.
5572 break;
5573 }
5574 // Set up for the next iteration of the loop, with input index
5575 // backed up by one from the last time through,
5576 // and a state save to this instruction in case the following code fails again.
5577 // (We're going backwards because this loop emulates stack unwinding, not
5578 // the initial scan forward.)
5579 U_ASSERT(fp->fInputIdx > 0);
729e4ab9
A
5580 UChar32 prevC;
5581 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
57a6839d
A
5582
5583 if (prevC == 0x0a &&
729e4ab9 5584 fp->fInputIdx > backSearchIndex &&
b75a7d8f 5585 inputBuf[fp->fInputIdx-1] == 0x0d) {
729e4ab9 5586 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
b75a7d8f
A
5587 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
5588 // .*, stepping back over CRLF pair.
729e4ab9 5589 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
b75a7d8f
A
5590 }
5591 }
57a6839d
A
5592
5593
46f4442e 5594 fp = StateSave(fp, fp->fPatIdx-1, status);
b75a7d8f
A
5595 }
5596 break;
57a6839d
A
5597
5598
5599
b75a7d8f
A
5600 default:
5601 // Trouble. The compiled pattern contains an entry with an
5602 // unrecognized type tag.
5603 U_ASSERT(FALSE);
5604 }
57a6839d 5605
b75a7d8f 5606 if (U_FAILURE(status)) {
46f4442e 5607 isMatch = FALSE;
b75a7d8f
A
5608 break;
5609 }
5610 }
57a6839d 5611
b75a7d8f
A
5612breakFromLoop:
5613 fMatch = isMatch;
5614 if (isMatch) {
5615 fLastMatchEnd = fMatchEnd;
5616 fMatchStart = startIdx;
5617 fMatchEnd = fp->fInputIdx;
b75a7d8f 5618 }
57a6839d
A
5619
5620#ifdef REGEX_RUN_DEBUG
5621 if (fTraceDebug) {
5622 if (isMatch) {
5623 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
5624 } else {
5625 printf("No match\n\n");
b75a7d8f
A
5626 }
5627 }
57a6839d
A
5628#endif
5629
b75a7d8f 5630 fFrame = fp; // The active stack frame when the engine stopped.
57a6839d
A
5631 // Contains the capture group results that we need to
5632 // access later.
b75a7d8f
A
5633
5634 return;
5635}
5636
5637
374ca955 5638UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
b75a7d8f
A
5639
5640U_NAMESPACE_END
5641
5642#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS