1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
5 * Copyright (c) 2002-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
12 // ICU Regular Expressions test, part of intltest.
18 PLEASE be careful about ASCII assumptions in this test.
19 This test is one of the worst repeat offenders.
20 If you have questions, contact someone on the ICU PMC
21 who has access to an EBCDIC system.
26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
32 #include "unicode/localpointer.h"
33 #include "unicode/regex.h"
34 #include "unicode/uchar.h"
35 #include "unicode/ucnv.h"
36 #include "unicode/uniset.h"
37 #include "unicode/uregex.h"
38 #include "unicode/usetiter.h"
39 #include "unicode/ustring.h"
40 #include "unicode/utext.h"
41 #include "unicode/utf16.h"
51 #define SUPPORT_MUTATING_INPUT_STRING 0
53 //---------------------------------------------------------------------------
55 // Test class boilerplate
57 //---------------------------------------------------------------------------
58 RegexTest::RegexTest()
63 RegexTest::~RegexTest()
69 void RegexTest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* /*par*/ )
71 if (exec
) logln("TestSuite RegexTest: ");
74 TESTCASE_AUTO(API_Match
);
75 TESTCASE_AUTO(API_Replace
);
76 TESTCASE_AUTO(API_Pattern
);
77 #if !UCONFIG_NO_FILE_IO
78 TESTCASE_AUTO(Extended
);
80 TESTCASE_AUTO(Errors
);
81 TESTCASE_AUTO(PerlTests
);
82 TESTCASE_AUTO(Callbacks
);
83 TESTCASE_AUTO(FindProgressCallbacks
);
84 TESTCASE_AUTO(Bug6149
);
85 TESTCASE_AUTO(UTextBasic
);
86 TESTCASE_AUTO(API_Match_UTF8
);
87 TESTCASE_AUTO(API_Replace_UTF8
);
88 TESTCASE_AUTO(API_Pattern_UTF8
);
89 TESTCASE_AUTO(PerlTestsUTF8
);
90 TESTCASE_AUTO(PreAllocatedUTextCAPI
);
91 TESTCASE_AUTO(Bug7651
);
92 TESTCASE_AUTO(Bug7740
);
93 TESTCASE_AUTO(Bug8479
);
94 TESTCASE_AUTO(Bug7029
);
95 TESTCASE_AUTO(CheckInvBufSize
);
96 TESTCASE_AUTO(Bug9283
);
97 TESTCASE_AUTO(Bug10459
);
98 TESTCASE_AUTO(TestCaseInsensitiveStarters
);
99 TESTCASE_AUTO(TestBug11049
);
100 TESTCASE_AUTO(TestBug11371
);
101 TESTCASE_AUTO(TestBug11480
);
102 TESTCASE_AUTO(NamedCapture
);
103 TESTCASE_AUTO(NamedCaptureLimits
);
104 TESTCASE_AUTO(TestBug12884
);
105 TESTCASE_AUTO(TestBug13631
);
106 TESTCASE_AUTO(TestBug13632
);
112 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
114 * @see utext_openUTF8
116 static UText
* regextst_openUTF8FromInvariant(UText
* ut
, const char *inv
, int64_t length
, UErrorCode
*status
);
118 //---------------------------------------------------------------------------
120 // Error Checking / Reporting macros used in all of the tests.
122 //---------------------------------------------------------------------------
124 static void utextToPrintable(char *buf
, int32_t bufLen
, UText
*text
) {
125 int64_t oldIndex
= utext_getNativeIndex(text
);
126 utext_setNativeIndex(text
, 0);
128 UChar32 c
= utext_next32From(text
, 0);
129 while ((c
!= U_SENTINEL
) && (bufPtr
< buf
+bufLen
)) {
130 if (0x000020<=c
&& c
<0x00007e) {
134 sprintf(bufPtr
,"U+%04X", c
);
135 bufPtr
+= strlen(bufPtr
)-1;
141 c
= UTEXT_NEXT32(text
);
144 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
145 char *ebuf
= (char*)malloc(bufLen
);
146 uprv_eastrncpy((unsigned char*)ebuf
, (const unsigned char*)buf
, bufLen
);
147 uprv_strncpy(buf
, ebuf
, bufLen
);
150 utext_setNativeIndex(text
, oldIndex
);
// File-local scratch buffer (1024 bytes) that RegexTest::extractToAssertBuf()
// writes its printable text into. Being a single static buffer, each call
// overwrites what the previous call stored there.
154 static char ASSERT_BUF
[1024];
156 const char* RegexTest::extractToAssertBuf(const UnicodeString
& message
) {
157 if(message
.length()==0) {
158 strcpy(ASSERT_BUF
, "[[empty UnicodeString]]");
161 IntlTest::prettify(message
,buf
);
162 if(buf
.length()==0) {
163 strcpy(ASSERT_BUF
, "[[escape() returned 0 chars]]");
165 buf
.extract(0, 0x7FFFFFFF, ASSERT_BUF
, sizeof(ASSERT_BUF
)-1);
166 if(ASSERT_BUF
[0]==0) {
168 for(int32_t i
=0;i
<buf
.length();i
++) {
170 sprintf(ASSERT_BUF
+strlen(ASSERT_BUF
),"\\u%02x",ch
);
175 ASSERT_BUF
[sizeof(ASSERT_BUF
)-1] = 0;
// REGEX_VERBOSE_TEXT(text): log a printable rendering of a UText.
//   text - UText* ; it is passed to utextToPrintable() and also stringified
//          (#text) so the log line names the expression that was dumped.
// The rendering goes through a 200-byte local buffer and is emitted with
// logln(), prefixed by the call site's __FILE__ and __LINE__.
179 #define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,UPRV_LENGTHOF(buf),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}
// REGEX_CHECK_STATUS: bail out of the current test if `status` is a failure.
// Reads a variable literally named `status` from the enclosing scope. On
// U_FAILURE(status) it reports the error name with file/line via dataerrln()
// and executes a bare `return;`, so it can only be used inside functions
// that return void.
181 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure. status=%s", \
182 __FILE__, __LINE__, u_errorName(status)); return;}}
// REGEX_ASSERT(expr): report a test failure when expr compares equal to FALSE.
// The failure message (via errln()) carries __FILE__, __LINE__ and the
// stringified expression. Unlike REGEX_CHECK_STATUS there is no `return`:
// execution continues with the statement after the macro.
184 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}
// REGEX_ASSERT_FAIL(expr, errcode): check that evaluating expr sets a
// specific error code.
//   expr    - expression to evaluate; it must refer to the error code by the
//             name `status`, because the macro declares its own local
//             `UErrorCode status` (initialized to U_ZERO_ERROR) that shadows
//             any `status` in the enclosing scope for the duration of expr.
//   errcode - the UErrorCode the local `status` is expected to hold afterward.
// A mismatch is reported through dataerrln() with the call site's line and
// both error names. The macro does not return, and the shadowed outer
// `status` is not written by it.
186 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
187 if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \
188 __LINE__, u_errorName(errcode), u_errorName(status));};}
// REGEX_CHECK_STATUS_L(line): status check that also reports a caller-supplied
// line number.
//   line - line number to print after "from" (in addition to __LINE__ of the
//          macro use itself).
// Reads `status` from the enclosing scope; on failure it reports through
// errln(), printing the status as a number (%d) rather than by name.
// It does not return from the enclosing function.
190 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
191 "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }}
// REGEX_ASSERT_L(expr, line): assertion that also reports a caller-supplied
// line number and aborts the current function.
//   expr - condition; a failure is reported when it compares equal to FALSE.
//   line - line number printed after "from" (in addition to __LINE__).
// On failure it calls errln() and then executes a bare `return;`, so it is
// only usable inside functions returning void.
193 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
194 errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
// REGEX_ASSERT_UNISTR(expected, actual): compare a UnicodeString against an
// invariant-character C string and report a failure on mismatch.
// `expected` is converted with UnicodeString(expected, -1, US_INV) for the
// comparison and is also passed straight to the "%s" of the failure message;
// `actual` is rendered for the message through extractToAssertBuf().
// Both arguments appear twice in the expansion, so avoid arguments with
// side effects. The macro reports via errln() and does not return.
196 // expected: const char * , restricted to invariant characters.
197 // actual: const UnicodeString &
198 #define REGEX_ASSERT_UNISTR(expected, actual) { \
199 if (UnicodeString(expected, -1, US_INV) != (actual)) { \
200 errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \
201 __FILE__, __LINE__, expected, extractToAssertBuf(actual));};}
204 static UBool
testUTextEqual(UText
*uta
, UText
*utb
) {
207 utext_setNativeIndex(uta
, 0);
208 utext_setNativeIndex(utb
, 0);
210 ca
= utext_next32(uta
);
211 cb
= utext_next32(utb
);
215 } while (ca
!= U_SENTINEL
);
221 * @param expected expected text in UTF-8 (not platform) codepage
223 void RegexTest::assertUText(const char *expected
, UText
*actual
, const char *file
, int line
) {
224 UErrorCode status
= U_ZERO_ERROR
;
225 UText expectedText
= UTEXT_INITIALIZER
;
226 utext_openUTF8(&expectedText
, expected
, -1, &status
);
227 if(U_FAILURE(status
)) {
228 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file
, line
, u_errorName(status
), strlen(expected
));
231 if(utext_nativeLength(&expectedText
)==0 && (strlen(expected
)!=0)) {
232 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file
, line
, strlen(expected
));
235 utext_setNativeIndex(actual
, 0);
236 if (!testUTextEqual(&expectedText
, actual
)) {
237 char buf
[201 /*21*/];
238 char expectedBuf
[201];
239 utextToPrintable(buf
, UPRV_LENGTHOF(buf
), actual
);
240 utextToPrintable(expectedBuf
, UPRV_LENGTHOF(expectedBuf
), &expectedText
);
241 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file
, line
, expectedBuf
, (int)utext_nativeLength(&expectedText
), buf
, (int)utext_nativeLength(actual
));
243 utext_close(&expectedText
);
246 * @param expected invariant (platform local text) input
249 void RegexTest::assertUTextInvariant(const char *expected
, UText
*actual
, const char *file
, int line
) {
250 UErrorCode status
= U_ZERO_ERROR
;
251 UText expectedText
= UTEXT_INITIALIZER
;
252 regextst_openUTF8FromInvariant(&expectedText
, expected
, -1, &status
);
253 if(U_FAILURE(status
)) {
254 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file
, line
, u_errorName(status
), strlen(expected
));
257 utext_setNativeIndex(actual
, 0);
258 if (!testUTextEqual(&expectedText
, actual
)) {
259 char buf
[201 /*21*/];
260 char expectedBuf
[201];
261 utextToPrintable(buf
, UPRV_LENGTHOF(buf
), actual
);
262 utextToPrintable(expectedBuf
, UPRV_LENGTHOF(expectedBuf
), &expectedText
);
263 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file
, line
, expectedBuf
, (int)utext_nativeLength(&expectedText
), buf
, (int)utext_nativeLength(actual
));
265 utext_close(&expectedText
);
269 * Assumes utf-8 input
// REGEX_ASSERT_UTEXT_UTF8(expected, actual): forwards to assertUText(),
// supplying the call site's __FILE__ and __LINE__ so failures are reported
// against the test line rather than the helper.
//   expected - const char* handed to assertUText(), which opens it with
//              utext_openUTF8().
//   actual   - UText* to compare against.
271 #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
273 * Assumes Invariant input
// REGEX_ASSERT_UTEXT_INVARIANT(expected, actual): forwards to
// assertUTextInvariant(), supplying the call site's __FILE__ and __LINE__.
//   expected - const char* handed to assertUTextInvariant(), which opens it
//              with regextst_openUTF8FromInvariant().
//   actual   - UText* to compare against.
275 #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
278 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
279 * passed into utext_openUTF8. An error will be given if
280 * INV_BUFSIZ is too small. It's only used on EBCDIC systems.
283 #define INV_BUFSIZ 2048 /* increase this if too small */
285 static int64_t inv_next
=0;
287 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
288 static char inv_buf
[INV_BUFSIZ
];
291 static UText
* regextst_openUTF8FromInvariant(UText
*ut
, const char *inv
, int64_t length
, UErrorCode
*status
) {
292 if(length
==-1) length
=strlen(inv
);
293 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
295 return utext_openUTF8(ut
, inv
, length
, status
);
297 if(inv_next
+length
+1>INV_BUFSIZ
) {
298 fprintf(stderr
, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
299 __FILE__
, __LINE__
, INV_BUFSIZ
, (inv_next
+length
+1));
300 *status
= U_MEMORY_ALLOCATION_ERROR
;
304 unsigned char *buf
= (unsigned char*)inv_buf
+inv_next
;
305 uprv_aestrncpy(buf
, (const uint8_t*)inv
, length
);
309 fprintf(stderr
, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ
, inv_next
);
312 return utext_openUTF8(ut
, (const char*)buf
, length
, status
);
317 //---------------------------------------------------------------------------
319 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests
320 // for the LookingAt() and Match() functions.
323 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
325 // The expected results are UBool - TRUE or FALSE.
326 // The input text is unescaped. The pattern is not.
329 //---------------------------------------------------------------------------
// REGEX_TESTLM(pat, text, looking, match): run one lookingAt()/matches()
// expectation through both test drivers, doRegexLMTest() (UnicodeString
// based) and doRegexLMTestUTF8() (UText over UTF-8), tagging each with the
// call site's __LINE__. The UBool results of both drivers are discarded.
// Every argument is expanded twice, once per driver call.
331 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
333 UBool
RegexTest::doRegexLMTest(const char *pat
, const char *text
, UBool looking
, UBool match
, int32_t line
) {
334 const UnicodeString
pattern(pat
, -1, US_INV
);
335 const UnicodeString
inputText(text
, -1, US_INV
);
336 UErrorCode status
= U_ZERO_ERROR
;
338 RegexPattern
*REPattern
= NULL
;
339 RegexMatcher
*REMatcher
= NULL
;
342 UnicodeString
patString(pat
, -1, US_INV
);
343 REPattern
= RegexPattern::compile(patString
, 0, pe
, status
);
344 if (U_FAILURE(status
)) {
345 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
346 line
, u_errorName(status
));
349 if (line
==376) { REPattern
->dumpPattern();}
351 UnicodeString
inputString(inputText
);
352 UnicodeString unEscapedInput
= inputString
.unescape();
353 REMatcher
= REPattern
->matcher(unEscapedInput
, status
);
354 if (U_FAILURE(status
)) {
355 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
356 line
, u_errorName(status
));
361 actualmatch
= REMatcher
->lookingAt(status
);
362 if (U_FAILURE(status
)) {
363 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
364 line
, u_errorName(status
));
367 if (actualmatch
!= looking
) {
368 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line
);
372 status
= U_ZERO_ERROR
;
373 actualmatch
= REMatcher
->matches(status
);
374 if (U_FAILURE(status
)) {
375 errln("RegexTest failure in matches() at line %d. Status = %s\n",
376 line
, u_errorName(status
));
379 if (actualmatch
!= match
) {
380 errln("RegexTest: wrong return from matches() at line %d.\n", line
);
384 if (retVal
== FALSE
) {
385 REPattern
->dumpPattern();
394 UBool
RegexTest::doRegexLMTestUTF8(const char *pat
, const char *text
, UBool looking
, UBool match
, int32_t line
) {
395 UText pattern
= UTEXT_INITIALIZER
;
396 int32_t inputUTF8Length
;
397 char *textChars
= NULL
;
398 UText inputText
= UTEXT_INITIALIZER
;
399 UErrorCode status
= U_ZERO_ERROR
;
401 RegexPattern
*REPattern
= NULL
;
402 RegexMatcher
*REMatcher
= NULL
;
405 regextst_openUTF8FromInvariant(&pattern
, pat
, -1, &status
);
406 REPattern
= RegexPattern::compile(&pattern
, 0, pe
, status
);
407 if (U_FAILURE(status
)) {
408 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
409 line
, u_errorName(status
));
413 UnicodeString
inputString(text
, -1, US_INV
);
414 UnicodeString unEscapedInput
= inputString
.unescape();
415 LocalUConverterPointer
UTF8Converter(ucnv_open("UTF8", &status
));
416 ucnv_setFromUCallBack(UTF8Converter
.getAlias(), UCNV_FROM_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &status
);
418 inputUTF8Length
= unEscapedInput
.extract(NULL
, 0, UTF8Converter
.getAlias(), status
);
419 if (U_FAILURE(status
) && status
!= U_BUFFER_OVERFLOW_ERROR
) {
420 // UTF-8 does not allow unpaired surrogates, so this could actually happen
421 logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line
, u_errorName(status
));
422 return TRUE
; // not a failure of the Regex engine
424 status
= U_ZERO_ERROR
; // buffer overflow
425 textChars
= new char[inputUTF8Length
+1];
426 unEscapedInput
.extract(textChars
, inputUTF8Length
+1, UTF8Converter
.getAlias(), status
);
427 utext_openUTF8(&inputText
, textChars
, inputUTF8Length
, &status
);
429 REMatcher
= &REPattern
->matcher(status
)->reset(&inputText
);
430 if (U_FAILURE(status
)) {
431 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
432 line
, u_errorName(status
));
437 actualmatch
= REMatcher
->lookingAt(status
);
438 if (U_FAILURE(status
)) {
439 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
440 line
, u_errorName(status
));
443 if (actualmatch
!= looking
) {
444 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line
);
448 status
= U_ZERO_ERROR
;
449 actualmatch
= REMatcher
->matches(status
);
450 if (U_FAILURE(status
)) {
451 errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
452 line
, u_errorName(status
));
455 if (actualmatch
!= match
) {
456 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line
);
460 if (retVal
== FALSE
) {
461 REPattern
->dumpPattern();
466 utext_close(&inputText
);
467 utext_close(&pattern
);
474 //---------------------------------------------------------------------------
476 // REGEX_ERR Macro + invocation function to simplify writing
477 // regex tests for incorrect patterns
480 // REGEX_ERR("pattern", expected error line, column, expected status);
482 //---------------------------------------------------------------------------
// REGEX_ERR(pat, line, col, status): forwards to regex_err(), appending the
// call site's __LINE__ as the final argument.
//   pat    - pattern text (const char*) expected to produce the given status.
//   line   - expected UParseError line.
//   col    - expected UParseError offset.
//   status - expected UErrorCode from compiling pat.
// NOTE(review): the expansion already ends with ';'. A use written as
// `REGEX_ERR(...);` therefore produces an extra empty statement, which is
// harmless in a statement list but would break an unbraced if/else.
483 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
485 void RegexTest::regex_err(const char *pat
, int32_t errLine
, int32_t errCol
,
486 UErrorCode expectedStatus
, int32_t line
) {
487 UnicodeString
pattern(pat
);
489 UErrorCode status
= U_ZERO_ERROR
;
491 RegexPattern
*callerPattern
= NULL
;
494 // Compile the caller's pattern
496 UnicodeString
patString(pat
);
497 callerPattern
= RegexPattern::compile(patString
, 0, pe
, status
);
498 if (status
!= expectedStatus
) {
499 dataerrln("Line %d: unexpected error %s compiling pattern.", line
, u_errorName(status
));
501 if (status
!= U_ZERO_ERROR
) {
502 if (pe
.line
!= errLine
|| pe
.offset
!= errCol
) {
503 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
504 line
, errLine
, errCol
, pe
.line
, pe
.offset
);
509 delete callerPattern
;
512 // Compile again, using a UTF-8-based UText
514 UText patternText
= UTEXT_INITIALIZER
;
515 regextst_openUTF8FromInvariant(&patternText
, pat
, -1, &status
);
516 callerPattern
= RegexPattern::compile(&patternText
, 0, pe
, status
);
517 if (status
!= expectedStatus
) {
518 dataerrln("Line %d: unexpected error %s compiling pattern.", line
, u_errorName(status
));
520 if (status
!= U_ZERO_ERROR
) {
521 if (pe
.line
!= errLine
|| pe
.offset
!= errCol
) {
522 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
523 line
, errLine
, errCol
, pe
.line
, pe
.offset
);
528 delete callerPattern
;
529 utext_close(&patternText
);
534 //---------------------------------------------------------------------------
536 // Basic Check for basic functionality of regex pattern matching.
537 // Avoid the use of REGEX_FIND test macro, which has
538 // substantial dependencies on basic Regex functionality.
540 //---------------------------------------------------------------------------
541 void RegexTest::Basic() {
545 // Debug - slide failing test cases early
549 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
551 UErrorCode status
= U_ZERO_ERROR
;
552 RegexPattern
*pattern
;
553 pattern
= RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE
, pe
, status
);
554 pattern
->dumpPattern();
555 RegexMatcher
*m
= pattern
->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status
);
556 UBool result
= m
->find();
557 printf("result = %d\n", result
);
558 // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
559 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
566 // Pattern with parentheses
568 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE
, FALSE
);
569 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE
, TRUE
);
570 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE
, FALSE
);
575 REGEX_TESTLM("st(abc)*ring", "string", TRUE
, TRUE
);
576 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE
, TRUE
);
577 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE
, TRUE
);
578 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE
, FALSE
);
579 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE
, FALSE
);
581 REGEX_TESTLM("a*", "", TRUE
, TRUE
);
582 REGEX_TESTLM("a*", "b", TRUE
, FALSE
);
588 REGEX_TESTLM(".", "abc", TRUE
, FALSE
);
589 REGEX_TESTLM("...", "abc", TRUE
, TRUE
);
590 REGEX_TESTLM("....", "abc", FALSE
, FALSE
);
591 REGEX_TESTLM(".*", "abcxyz123", TRUE
, TRUE
);
592 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE
, FALSE
);
593 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE
, TRUE
);
594 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE
, TRUE
);
595 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE
, FALSE
);
598 // Patterns with * applied to chars at end of literal string
600 REGEX_TESTLM("abc*", "ab", TRUE
, TRUE
);
601 REGEX_TESTLM("abc*", "abccccc", TRUE
, TRUE
);
604 // Supplemental chars match as single chars, not a pair of surrogates.
606 REGEX_TESTLM(".", "\\U00011000", TRUE
, TRUE
);
607 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE
, TRUE
);
608 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE
, FALSE
);
612 // UnicodeSets in the pattern
614 REGEX_TESTLM("[1-6]", "1", TRUE
, TRUE
);
615 REGEX_TESTLM("[1-6]", "3", TRUE
, TRUE
);
616 REGEX_TESTLM("[1-6]", "7", FALSE
, FALSE
);
617 REGEX_TESTLM("a[1-6]", "a3", TRUE
, TRUE
);
618 REGEX_TESTLM("a[1-6]", "a3", TRUE
, TRUE
);
619 REGEX_TESTLM("a[1-6]b", "a3b", TRUE
, TRUE
);
621 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE
, TRUE
);
622 REGEX_TESTLM("a[0-9]*b", "abc", TRUE
, FALSE
);
623 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE
, TRUE
);
624 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE
, FALSE
); // note that * matches 0 occurrences.
625 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE
, TRUE
);
628 // OR operator in patterns
630 REGEX_TESTLM("(a|b)", "a", TRUE
, TRUE
);
631 REGEX_TESTLM("(a|b)", "b", TRUE
, TRUE
);
632 REGEX_TESTLM("(a|b)", "c", FALSE
, FALSE
);
633 REGEX_TESTLM("a|b", "b", TRUE
, TRUE
);
635 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE
, TRUE
);
636 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE
, FALSE
);
637 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE
, TRUE
);
638 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE
, TRUE
);
639 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE
, TRUE
);
640 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE
, FALSE
);
645 REGEX_TESTLM("ab+", "abbc", TRUE
, FALSE
);
646 REGEX_TESTLM("ab+c", "ac", FALSE
, FALSE
);
647 REGEX_TESTLM("b+", "", FALSE
, FALSE
);
648 REGEX_TESTLM("(abc|def)+", "defabc", TRUE
, TRUE
);
649 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE
, FALSE
);
650 REGEX_TESTLM(".+y", "zippity dooy", TRUE
, TRUE
);
655 REGEX_TESTLM("ab?", "ab", TRUE
, TRUE
);
656 REGEX_TESTLM("ab?", "a", TRUE
, TRUE
);
657 REGEX_TESTLM("ab?", "ac", TRUE
, FALSE
);
658 REGEX_TESTLM("ab?", "abb", TRUE
, FALSE
);
659 REGEX_TESTLM("a(b|c)?d", "abd", TRUE
, TRUE
);
660 REGEX_TESTLM("a(b|c)?d", "acd", TRUE
, TRUE
);
661 REGEX_TESTLM("a(b|c)?d", "ad", TRUE
, TRUE
);
662 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE
, FALSE
);
663 REGEX_TESTLM("a(b|c)?d", "ab", FALSE
, FALSE
);
666 // Escape sequences that become single literal chars, handled internally
667 // by ICU's Unescape.
670 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
671 REGEX_TESTLM("\\a", "\\u0007", TRUE
, TRUE
); // BEL
672 REGEX_TESTLM("\\cL", "\\u000c", TRUE
, TRUE
); // Control-L
673 REGEX_TESTLM("\\e", "\\u001b", TRUE
, TRUE
); // Escape
674 REGEX_TESTLM("\\f", "\\u000c", TRUE
, TRUE
); // Form Feed
675 REGEX_TESTLM("\\n", "\\u000a", TRUE
, TRUE
); // new line
676 REGEX_TESTLM("\\r", "\\u000d", TRUE
, TRUE
); // CR
677 REGEX_TESTLM("\\t", "\\u0009", TRUE
, TRUE
); // Tab
678 REGEX_TESTLM("\\u1234", "\\u1234", TRUE
, TRUE
);
679 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE
, TRUE
);
681 REGEX_TESTLM(".*\\Ax", "xyz", TRUE
, FALSE
); // \A matches only at the beginning of input
682 REGEX_TESTLM(".*\\Ax", " xyz", FALSE
, FALSE
); // \A matches only at the beginning of input
684 // Escape of special chars in patterns
685 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE
, TRUE
);
689 //---------------------------------------------------------------------------
691 // UTextBasic Check for quirks that are specific to the UText
694 //---------------------------------------------------------------------------
695 void RegexTest::UTextBasic() {
696 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
697 UErrorCode status
= U_ZERO_ERROR
;
698 UText pattern
= UTEXT_INITIALIZER
;
699 utext_openUTF8(&pattern
, str_abc
, -1, &status
);
700 RegexMatcher
matcher(&pattern
, 0, status
);
703 UText input
= UTEXT_INITIALIZER
;
704 utext_openUTF8(&input
, str_abc
, -1, &status
);
706 matcher
.reset(&input
);
708 REGEX_ASSERT_UTEXT_UTF8(str_abc
, matcher
.inputText());
710 matcher
.reset(matcher
.inputText());
712 REGEX_ASSERT_UTEXT_UTF8(str_abc
, matcher
.inputText());
714 utext_close(&pattern
);
719 //---------------------------------------------------------------------------
721 // API_Match Test that the API for class RegexMatcher
722 // is present and nominally working, but excluding functions
723 // implementing replace operations.
725 //---------------------------------------------------------------------------
726 void RegexTest::API_Match() {
728 UErrorCode status
=U_ZERO_ERROR
;
732 // Debug - slide failing test cases early
741 // Simple pattern compilation
744 UnicodeString
re("abc");
746 pat2
= RegexPattern::compile(re
, flags
, pe
, status
);
749 UnicodeString inStr1
= "abcdef this is a test";
750 UnicodeString instr2
= "not abc";
751 UnicodeString empty
= "";
755 // Matcher creation and reset.
757 RegexMatcher
*m1
= pat2
->matcher(inStr1
, status
);
759 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
760 REGEX_ASSERT(m1
->input() == inStr1
);
762 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
763 REGEX_ASSERT(m1
->input() == instr2
);
765 REGEX_ASSERT(m1
->input() == inStr1
);
766 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
768 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
769 REGEX_ASSERT(m1
->input() == empty
);
770 REGEX_ASSERT(&m1
->pattern() == pat2
);
773 // reset(pos, status)
776 m1
->reset(4, status
);
778 REGEX_ASSERT(m1
->input() == inStr1
);
779 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
781 m1
->reset(-1, status
);
782 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
783 status
= U_ZERO_ERROR
;
785 m1
->reset(0, status
);
787 status
= U_ZERO_ERROR
;
789 int32_t len
= m1
->input().length();
790 m1
->reset(len
-1, status
);
792 status
= U_ZERO_ERROR
;
794 m1
->reset(len
, status
);
796 status
= U_ZERO_ERROR
;
798 m1
->reset(len
+1, status
);
799 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
800 status
= U_ZERO_ERROR
;
803 // match(pos, status)
806 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
808 REGEX_ASSERT(m1
->matches(3, status
) == FALSE
);
810 REGEX_ASSERT(m1
->matches(5, status
) == FALSE
);
811 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
812 REGEX_ASSERT(m1
->matches(-1, status
) == FALSE
);
813 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
815 // Match() at end of string should fail, but should not
817 status
= U_ZERO_ERROR
;
818 len
= m1
->input().length();
819 REGEX_ASSERT(m1
->matches(len
, status
) == FALSE
);
822 // Match beyond end of string should fail with an error.
823 status
= U_ZERO_ERROR
;
824 REGEX_ASSERT(m1
->matches(len
+1, status
) == FALSE
);
825 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
827 // Successful match at end of string.
829 status
= U_ZERO_ERROR
;
830 RegexMatcher
m("A?", 0, status
); // will match zero length string.
833 len
= inStr1
.length();
834 REGEX_ASSERT(m
.matches(len
, status
) == TRUE
);
837 REGEX_ASSERT(m
.matches(0, status
) == TRUE
);
843 // lookingAt(pos, status)
845 status
= U_ZERO_ERROR
;
846 m1
->reset(instr2
); // "not abc"
847 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
848 REGEX_ASSERT(m1
->lookingAt(5, status
) == FALSE
);
849 REGEX_ASSERT(m1
->lookingAt(3, status
) == FALSE
);
850 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
851 REGEX_ASSERT(m1
->lookingAt(-1, status
) == FALSE
);
852 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
853 status
= U_ZERO_ERROR
;
854 len
= m1
->input().length();
855 REGEX_ASSERT(m1
->lookingAt(len
, status
) == FALSE
);
857 REGEX_ASSERT(m1
->lookingAt(len
+1, status
) == FALSE
);
858 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
867 // RegexMatcher::start();
868 // RegexMatcher::end();
869 // RegexMatcher::groupCount();
874 UErrorCode status
=U_ZERO_ERROR
;
876 UnicodeString
re("01(23(45)67)(.*)");
877 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
879 UnicodeString data
= "0123456789";
881 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
883 REGEX_ASSERT(matcher
->lookingAt(status
) == TRUE
);
884 static const int32_t matchStarts
[] = {0, 2, 4, 8};
885 static const int32_t matchEnds
[] = {10, 8, 6, 10};
887 for (i
=0; i
<4; i
++) {
888 int32_t actualStart
= matcher
->start(i
, status
);
890 if (actualStart
!= matchStarts
[i
]) {
891 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
892 __LINE__
, i
, matchStarts
[i
], actualStart
);
894 int32_t actualEnd
= matcher
->end(i
, status
);
896 if (actualEnd
!= matchEnds
[i
]) {
897 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
898 __LINE__
, i
, matchEnds
[i
], actualEnd
);
902 REGEX_ASSERT(matcher
->start(0, status
) == matcher
->start(status
));
903 REGEX_ASSERT(matcher
->end(0, status
) == matcher
->end(status
));
905 REGEX_ASSERT_FAIL(matcher
->start(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
906 REGEX_ASSERT_FAIL(matcher
->start( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
908 REGEX_ASSERT_FAIL(matcher
->start( 0, status
), U_REGEX_INVALID_STATE
);
910 matcher
->lookingAt(status
);
911 REGEX_ASSERT(matcher
->group(status
) == "0123456789");
912 REGEX_ASSERT(matcher
->group(0, status
) == "0123456789");
913 REGEX_ASSERT(matcher
->group(1, status
) == "234567" );
914 REGEX_ASSERT(matcher
->group(2, status
) == "45" );
915 REGEX_ASSERT(matcher
->group(3, status
) == "89" );
917 REGEX_ASSERT_FAIL(matcher
->group(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
918 REGEX_ASSERT_FAIL(matcher
->group( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
920 REGEX_ASSERT_FAIL(matcher
->group( 0, status
), U_REGEX_INVALID_STATE
);
933 UErrorCode status
=U_ZERO_ERROR
;
935 UnicodeString
re("abc");
936 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
938 UnicodeString data
= ".abc..abc...abc..";
939 // 012345678901234567
941 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
943 REGEX_ASSERT(matcher
->find());
944 REGEX_ASSERT(matcher
->start(status
) == 1);
945 REGEX_ASSERT(matcher
->find());
946 REGEX_ASSERT(matcher
->start(status
) == 6);
947 REGEX_ASSERT(matcher
->find());
948 REGEX_ASSERT(matcher
->start(status
) == 12);
949 REGEX_ASSERT(matcher
->find() == FALSE
);
950 REGEX_ASSERT(matcher
->find() == FALSE
);
953 REGEX_ASSERT(matcher
->find());
954 REGEX_ASSERT(matcher
->start(status
) == 1);
956 REGEX_ASSERT(matcher
->find(0, status
));
957 REGEX_ASSERT(matcher
->start(status
) == 1);
958 REGEX_ASSERT(matcher
->find(1, status
));
959 REGEX_ASSERT(matcher
->start(status
) == 1);
960 REGEX_ASSERT(matcher
->find(2, status
));
961 REGEX_ASSERT(matcher
->start(status
) == 6);
962 REGEX_ASSERT(matcher
->find(12, status
));
963 REGEX_ASSERT(matcher
->start(status
) == 12);
964 REGEX_ASSERT(matcher
->find(13, status
) == FALSE
);
965 REGEX_ASSERT(matcher
->find(16, status
) == FALSE
);
966 REGEX_ASSERT(matcher
->find(17, status
) == FALSE
);
967 REGEX_ASSERT_FAIL(matcher
->start(status
), U_REGEX_INVALID_STATE
);
969 status
= U_ZERO_ERROR
;
970 REGEX_ASSERT_FAIL(matcher
->find(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
971 status
= U_ZERO_ERROR
;
972 REGEX_ASSERT_FAIL(matcher
->find(18, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
974 REGEX_ASSERT(matcher
->groupCount() == 0);
982 // find, with \G in pattern (true if at the end of a previous match).
987 UErrorCode status
=U_ZERO_ERROR
;
989 UnicodeString
re(".*?(?:(\\Gabc)|(abc))", -1, US_INV
);
990 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
992 UnicodeString data
= ".abcabc.abc..";
993 // 012345678901234567
995 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
997 REGEX_ASSERT(matcher
->find());
998 REGEX_ASSERT(matcher
->start(status
) == 0);
999 REGEX_ASSERT(matcher
->start(1, status
) == -1);
1000 REGEX_ASSERT(matcher
->start(2, status
) == 1);
1002 REGEX_ASSERT(matcher
->find());
1003 REGEX_ASSERT(matcher
->start(status
) == 4);
1004 REGEX_ASSERT(matcher
->start(1, status
) == 4);
1005 REGEX_ASSERT(matcher
->start(2, status
) == -1);
1013 // find with zero length matches, match position should bump ahead
1014 // to prevent loops.
1018 UErrorCode status
=U_ZERO_ERROR
;
1019 RegexMatcher
m("(?= ?)", 0, status
); // This pattern will produce zero-length matches anywhere,
1020 // using an always-true look-ahead.
1022 UnicodeString
s(" ");
1025 if (m
.find() == FALSE
) {
1028 REGEX_ASSERT(m
.start(status
) == i
);
1029 REGEX_ASSERT(m
.end(status
) == i
);
1033 // Check that the bump goes over surrogate pairs OK
1034 s
= UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1038 if (m
.find() == FALSE
) {
1041 REGEX_ASSERT(m
.start(status
) == i
);
1042 REGEX_ASSERT(m
.end(status
) == i
);
1044 REGEX_ASSERT(i
==10);
1047 // find() loop breaking test.
1048 // with pattern of /.?/, should see a series of one char matches, then a single
1049 // match of zero length at the end of the input string.
1051 UErrorCode status
=U_ZERO_ERROR
;
1052 RegexMatcher
m(".?", 0, status
);
1054 UnicodeString
s(" ");
1057 if (m
.find() == FALSE
) {
1060 REGEX_ASSERT(m
.start(status
) == i
);
1061 REGEX_ASSERT(m
.end(status
) == (i
<4 ? i
+1 : i
));
1068 // Matchers with no input string behave as if they had an empty input string.
1072 UErrorCode status
= U_ZERO_ERROR
;
1073 RegexMatcher
m(".?", 0, status
);
1075 REGEX_ASSERT(m
.find());
1076 REGEX_ASSERT(m
.start(status
) == 0);
1077 REGEX_ASSERT(m
.input() == "");
1080 UErrorCode status
= U_ZERO_ERROR
;
1081 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
1082 RegexMatcher
*m
= p
->matcher(status
);
1085 REGEX_ASSERT(m
->find() == FALSE
);
1086 REGEX_ASSERT(m
->input() == "");
1095 UErrorCode status
= U_ZERO_ERROR
;
1096 UnicodeString
testString("This is test data");
1097 RegexMatcher
m(".*", testString
, 0, status
);
1099 REGEX_ASSERT(m
.regionStart() == 0);
1100 REGEX_ASSERT(m
.regionEnd() == testString
.length());
1101 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1102 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1104 m
.region(2,4, status
);
1106 REGEX_ASSERT(m
.matches(status
));
1107 REGEX_ASSERT(m
.start(status
)==2);
1108 REGEX_ASSERT(m
.end(status
)==4);
1112 REGEX_ASSERT(m
.regionStart() == 0);
1113 REGEX_ASSERT(m
.regionEnd() == testString
.length());
1115 UnicodeString
shorterString("short");
1116 m
.reset(shorterString
);
1117 REGEX_ASSERT(m
.regionStart() == 0);
1118 REGEX_ASSERT(m
.regionEnd() == shorterString
.length());
1120 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1121 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(FALSE
));
1122 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
1123 REGEX_ASSERT(&m
== &m
.reset());
1124 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
1126 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(TRUE
));
1127 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1128 REGEX_ASSERT(&m
== &m
.reset());
1129 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1131 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1132 REGEX_ASSERT(&m
== &m
.useTransparentBounds(TRUE
));
1133 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
1134 REGEX_ASSERT(&m
== &m
.reset());
1135 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
1137 REGEX_ASSERT(&m
== &m
.useTransparentBounds(FALSE
));
1138 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1139 REGEX_ASSERT(&m
== &m
.reset());
1140 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1145 // hitEnd() and requireEnd()
1148 UErrorCode status
= U_ZERO_ERROR
;
1149 UnicodeString
testString("aabb");
1150 RegexMatcher
m1(".*", testString
, 0, status
);
1151 REGEX_ASSERT(m1
.lookingAt(status
) == TRUE
);
1152 REGEX_ASSERT(m1
.hitEnd() == TRUE
);
1153 REGEX_ASSERT(m1
.requireEnd() == FALSE
);
1156 status
= U_ZERO_ERROR
;
1157 RegexMatcher
m2("a*", testString
, 0, status
);
1158 REGEX_ASSERT(m2
.lookingAt(status
) == TRUE
);
1159 REGEX_ASSERT(m2
.hitEnd() == FALSE
);
1160 REGEX_ASSERT(m2
.requireEnd() == FALSE
);
1163 status
= U_ZERO_ERROR
;
1164 RegexMatcher
m3(".*$", testString
, 0, status
);
1165 REGEX_ASSERT(m3
.lookingAt(status
) == TRUE
);
1166 REGEX_ASSERT(m3
.hitEnd() == TRUE
);
1167 REGEX_ASSERT(m3
.requireEnd() == TRUE
);
1173 // Compilation error on reset with UChar *
1174 // These were a hazard that people were stumbling over with runtime errors.
1175 // Changed them to compiler errors by adding private methods that more closely
1176 // matched the incorrect use of the functions.
1180 UErrorCode status
= U_ZERO_ERROR
;
1181 UChar ucharString
[20];
1182 RegexMatcher
m(".", 0, status
);
1183 m
.reset(ucharString
); // should not compile.
1185 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
1186 RegexMatcher
*m2
= p
->matcher(ucharString
, status
); // should not compile.
1188 RegexMatcher
m3(".", ucharString
, 0, status
); // Should not compile
1194 // Note: These tests will need to be changed when the regexp engine is
1195 // able to detect and cut short the exponential time behavior on
1196 // this type of match.
1199 UErrorCode status
= U_ZERO_ERROR
;
1200 // Enough 'a's in the string to cause the match to time out.
1201 // (Each additional 'a' doubles the time)
1202 UnicodeString
testString("aaaaaaaaaaaaaaaaaaaaa");
1203 RegexMatcher
matcher("(a+)+b", testString
, 0, status
);
1205 REGEX_ASSERT(matcher
.getTimeLimit() == 0);
1206 matcher
.setTimeLimit(100, status
);
1207 REGEX_ASSERT(matcher
.getTimeLimit() == 100);
1208 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1209 REGEX_ASSERT(status
== U_REGEX_TIME_OUT
);
1212 UErrorCode status
= U_ZERO_ERROR
;
1213 // Few enough 'a's to slip in under the time limit.
1214 UnicodeString
testString("aaaaaaaaaaaaaaaaaa");
1215 RegexMatcher
matcher("(a+)+b", testString
, 0, status
);
1217 matcher
.setTimeLimit(100, status
);
1218 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1226 UErrorCode status
= U_ZERO_ERROR
;
1227 UnicodeString
testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
1229 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1230 // of the '+', and makes the stack frames larger.
1231 RegexMatcher
matcher("(A)+A$", testString
, 0, status
);
1233 // With the default stack, this match should fail to run
1234 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1235 REGEX_ASSERT(status
== U_REGEX_STACK_OVERFLOW
);
1237 // With unlimited stack, it should run
1238 status
= U_ZERO_ERROR
;
1239 matcher
.setStackLimit(0, status
);
1241 REGEX_ASSERT(matcher
.lookingAt(status
) == TRUE
);
1243 REGEX_ASSERT(matcher
.getStackLimit() == 0);
1245 // With a limited stack, the match should fail
1246 status
= U_ZERO_ERROR
;
1247 matcher
.setStackLimit(10000, status
);
1248 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1249 REGEX_ASSERT(status
== U_REGEX_STACK_OVERFLOW
);
1250 REGEX_ASSERT(matcher
.getStackLimit() == 10000);
1253 // A pattern that doesn't save state should work with
1254 // a minimal sized stack
1256 UErrorCode status
= U_ZERO_ERROR
;
1257 UnicodeString testString
= "abc";
1258 RegexMatcher
matcher("abc", testString
, 0, status
);
1260 matcher
.setStackLimit(30, status
);
1262 REGEX_ASSERT(matcher
.matches(status
) == TRUE
);
1264 REGEX_ASSERT(matcher
.getStackLimit() == 30);
1266 // Negative stack sizes should fail
1267 status
= U_ZERO_ERROR
;
1268 matcher
.setStackLimit(1000, status
);
1270 matcher
.setStackLimit(-1, status
);
1271 REGEX_ASSERT(status
== U_ILLEGAL_ARGUMENT_ERROR
);
1272 REGEX_ASSERT(matcher
.getStackLimit() == 1000);
1283 //---------------------------------------------------------------------------
1285 // API_Replace API test for class RegexMatcher, testing the
1286 // Replace family of functions.
1288 //---------------------------------------------------------------------------
1289 void RegexTest::API_Replace() {
// Exercises the replace-family methods of RegexMatcher that are called below:
// replaceFirst(), replaceAll(), appendReplacement() and appendTail().
// Each REGEX_ASSERT compares the produced text against a literal expected value.
1295 UErrorCode status
=U_ZERO_ERROR
;
// Pattern under test for the first group of checks: the literal "abc".
1297 UnicodeString
re("abc");
1298 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
1300 UnicodeString data
= ".abc..abc...abc..";
1301 // 012345678901234567
1302 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
1305 // Plain vanilla matches.
// replaceFirst() is expected to change only the first "abc"; replaceAll() every one.
1308 dest
= matcher
->replaceFirst("yz", status
);
1310 REGEX_ASSERT(dest
== ".yz..abc...abc..");
1312 dest
= matcher
->replaceAll("yz", status
);
1314 REGEX_ASSERT(dest
== ".yz..yz...yz..");
1317 // Plain vanilla non-matches.
// The expected output is identical to d2, i.e. nothing is replaced.
1319 UnicodeString d2
= ".abx..abx...abx..";
1321 dest
= matcher
->replaceFirst("yz", status
);
1323 REGEX_ASSERT(dest
== ".abx..abx...abx..");
1325 dest
= matcher
->replaceAll("yz", status
);
1327 REGEX_ASSERT(dest
== ".abx..abx...abx..");
1330 // Empty source string
// Both replace calls are expected to yield an empty result.
1332 UnicodeString d3
= "";
1334 dest
= matcher
->replaceFirst("yz", status
);
1336 REGEX_ASSERT(dest
== "");
1338 dest
= matcher
->replaceAll("yz", status
);
1340 REGEX_ASSERT(dest
== "");
1343 // Empty substitution string
// Replacing with "" is expected to delete the matched text.
1345 matcher
->reset(data
); // ".abc..abc...abc.."
1346 dest
= matcher
->replaceFirst("", status
);
1348 REGEX_ASSERT(dest
== "...abc...abc..");
1350 dest
= matcher
->replaceAll("", status
);
1352 REGEX_ASSERT(dest
== "........");
1355 // match whole string
1357 UnicodeString d4
= "abc";
1359 dest
= matcher
->replaceFirst("xyz", status
);
1361 REGEX_ASSERT(dest
== "xyz");
1363 dest
= matcher
->replaceAll("xyz", status
);
1365 REGEX_ASSERT(dest
== "xyz");
1368 // Capture Group, simple case
// Pattern "a(..)" has one capture group; "$1" in the replacement text refers to it.
1370 UnicodeString
re2("a(..)");
1371 RegexPattern
*pat2
= RegexPattern::compile(re2
, flags
, pe
, status
);
1373 UnicodeString d5
= "abcdefg";
1374 RegexMatcher
*matcher2
= pat2
->matcher(d5
, status
);
1376 dest
= matcher2
->replaceFirst("$1$1", status
);
1378 REGEX_ASSERT(dest
== "bcbcdefg");
// A backslash-escaped "\$1" is expected to come out as the literal text "$1",
// while the unescaped "$1" is substituted with the group contents.
1380 dest
= matcher2
->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status
);
1382 REGEX_ASSERT(dest
== "The value of $1 is bc.defg");
// A '$' that is not followed by a group number is expected to leave status
// holding a failure code; status is cleared again afterwards.
1384 dest
= matcher2
->replaceFirst("$ by itself, no group number $$$", status
);
1385 REGEX_ASSERT(U_FAILURE(status
));
1386 status
= U_ZERO_ERROR
;
// After unescape() the '$' is followed by the supplementary code point U+1D7CF.
// The expected output shows it being treated as a reference to group 1.
1388 UnicodeString replacement
= UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1389 replacement
= replacement
.unescape();
1390 dest
= matcher2
->replaceFirst(replacement
, status
);
1392 REGEX_ASSERT(dest
== "Supplemental Digit 1 bc.defg");
// The pattern has only one group, so "$5" is expected to fail with
// U_INDEX_OUTOFBOUNDS_ERROR.
1394 REGEX_ASSERT_FAIL(matcher2
->replaceFirst("bad capture group number $5...",status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1398 // Replacement String with \u hex escapes
// "\u0043" in the replacement text is expected to appear as 'C' in the result.
1401 UnicodeString src
= "abc 1 abc 2 abc 3";
1402 UnicodeString substitute
= UNICODE_STRING_SIMPLE("--\\u0043--");
1403 matcher
->reset(src
);
1404 UnicodeString result
= matcher
->replaceAll(substitute
, status
);
1406 REGEX_ASSERT(result
== "--C-- 1 --C-- 2 --C-- 3");
// Same check with an eight-digit "\U00010000" escape; the expected string is
// built by appending code point 0x10000 directly.
1409 UnicodeString src
= "abc !";
1410 UnicodeString substitute
= UNICODE_STRING_SIMPLE("--\\U00010000--");
1411 matcher
->reset(src
);
1412 UnicodeString result
= matcher
->replaceAll(substitute
, status
);
1414 UnicodeString expected
= UnicodeString("--");
1415 expected
.append((UChar32
)0x10000);
1416 expected
.append("-- !");
1417 REGEX_ASSERT(result
== expected
);
1419 // TODO: need more thorough testing of capture substitutions.
// appendReplacement() / appendTail() checks, using the non-greedy pattern "ss(.*?)ee".
1424 status
= U_ZERO_ERROR
;
1425 UnicodeString s
= "The matches start with ss and end with ee ss stuff ee fin";
1426 RegexMatcher
m("ss(.*?)ee", 0, status
);
1428 UnicodeString result
;
1430 // Multiple finds do NOT bump up the previous appendReplacement position.
1434 m
.appendReplacement(result
, "ooh", status
);
1436 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
1438 // After a reset into the interior of a string, appendReplacement still starts at beginning.
1439 status
= U_ZERO_ERROR
;
1441 m
.reset(10, status
);
1444 m
.appendReplacement(result
, "ooh", status
);
1446 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
1448 // find() at interior of string, appendReplacement still starts at beginning.
1449 status
= U_ZERO_ERROR
;
1454 m
.appendReplacement(result
, "ooh", status
);
1456 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
// appendTail() is expected to add the remaining " fin" after the last replacement.
1458 m
.appendTail(result
);
1459 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh fin");
1470 //---------------------------------------------------------------------------
1472 // API_Pattern Test that the API for class RegexPattern is
1473 // present and nominally working.
1475 //---------------------------------------------------------------------------
1476 void RegexTest::API_Pattern() {
// Exercises the RegexPattern API calls that appear below: construction and
// comparison, compile() with and without flags, clone(), the static matches()
// convenience function, split(), pattern(), and the class ID functions.
1477 RegexPattern pata
; // Test default constructor to not crash.
// Equality checks on default-constructed patterns.
1480 REGEX_ASSERT(pata
== patb
);
1481 REGEX_ASSERT(pata
== pata
);
1483 UnicodeString
re1("abc[a-l][m-z]");
1484 UnicodeString
re2("def");
1485 UErrorCode status
= U_ZERO_ERROR
;
1488 RegexPattern
*pat1
= RegexPattern::compile(re1
, 0, pe
, status
);
1489 RegexPattern
*pat2
= RegexPattern::compile(re2
, 0, pe
, status
);
// A compiled pattern equals itself and differs from the default-constructed pata.
1491 REGEX_ASSERT(*pat1
== *pat1
);
1492 REGEX_ASSERT(*pat1
!= pata
);
1496 REGEX_ASSERT(patb
== *pat1
);
// patc is copy-constructed from *pat1 and must compare equal to it.
1499 RegexPattern
patc(*pat1
);
1500 REGEX_ASSERT(patc
== *pat1
);
1501 REGEX_ASSERT(patb
== patc
);
// NOTE(review): pat1 and pat2 are RegexPattern pointers, so this compares the
// pointer values rather than the patterns; confirm whether *pat1 != *pat2 was intended.
1502 REGEX_ASSERT(pat1
!= pat2
);
1504 REGEX_ASSERT(patb
!= patc
);
1505 REGEX_ASSERT(patb
== *pat2
);
1507 // Compile with no flags.
// The compile() overload without a flags argument is expected to give a pattern
// equal to one compiled with flags 0, and flags() is expected to report 0.
1508 RegexPattern
*pat1a
= RegexPattern::compile(re1
, pe
, status
);
1509 REGEX_ASSERT(*pat1a
== *pat1
);
1511 REGEX_ASSERT(pat1a
->flags() == 0);
1513 // Compile with different flags should be not equal
1514 RegexPattern
*pat1b
= RegexPattern::compile(re1
, UREGEX_CASE_INSENSITIVE
, pe
, status
);
1517 REGEX_ASSERT(*pat1b
!= *pat1a
);
1518 REGEX_ASSERT(pat1b
->flags() == UREGEX_CASE_INSENSITIVE
);
1519 REGEX_ASSERT(pat1a
->flags() == 0);
// clone() is expected to give a pattern equal to its source and unequal to a different one.
1523 RegexPattern
*pat1c
= pat1
->clone();
1524 REGEX_ASSERT(*pat1c
== *pat1
);
1525 REGEX_ASSERT(*pat1c
!= *pat2
);
1534 // Verify that a matcher created from a cloned pattern works.
// With pattern \p{L}+ on "Hello World", two matches are expected
// ("Hello", then "World"), followed by a failed find().
1538 UErrorCode status
= U_ZERO_ERROR
;
1539 RegexPattern
*pSource
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status
);
1540 RegexPattern
*pClone
= pSource
->clone();
1542 RegexMatcher
*mFromClone
= pClone
->matcher(status
);
1544 UnicodeString s
= "Hello World";
1545 mFromClone
->reset(s
);
1546 REGEX_ASSERT(mFromClone
->find() == TRUE
);
1547 REGEX_ASSERT(mFromClone
->group(status
) == "Hello");
1548 REGEX_ASSERT(mFromClone
->find() == TRUE
);
1549 REGEX_ASSERT(mFromClone
->group(status
) == "World");
1550 REGEX_ASSERT(mFromClone
->find() == FALSE
);
1556 // matches convenience API
// The expected results show the pattern must cover the entire input
// ("abc" and ".*u" are expected to fail against "random input").
1558 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe
, status
) == TRUE
);
1560 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe
, status
) == FALSE
);
1562 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe
, status
) == TRUE
);
1564 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe
, status
) == TRUE
);
1566 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe
, status
) == FALSE
);
// With status already holding a failure code, matches() is expected to return
// FALSE and to leave the incoming status value unchanged.
1568 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1569 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe
, status
) == FALSE
);
1570 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
// split() checks. The delimiter pattern is one or more spaces; results go into
// the ten-element fields array.
1576 status
= U_ZERO_ERROR
;
1577 pat1
= RegexPattern::compile(" +", pe
, status
);
1579 UnicodeString fields
[10];
1582 n
= pat1
->split("Now is the time", fields
, 10, status
);
1585 REGEX_ASSERT(fields
[0]=="Now");
1586 REGEX_ASSERT(fields
[1]=="is");
1587 REGEX_ASSERT(fields
[2]=="the");
1588 REGEX_ASSERT(fields
[3]=="time");
1589 REGEX_ASSERT(fields
[4]=="");
// Capacity 2: the second field is expected to hold all of the remaining input.
1591 n
= pat1
->split("Now is the time", fields
, 2, status
);
1594 REGEX_ASSERT(fields
[0]=="Now");
1595 REGEX_ASSERT(fields
[1]=="is the time");
1596 REGEX_ASSERT(fields
[2]=="the"); // left over from previous test
// Capacity 1: the whole input is expected in fields[0].
1599 status
= U_ZERO_ERROR
;
1600 n
= pat1
->split("Now is the time", fields
, 1, status
);
1603 REGEX_ASSERT(fields
[0]=="Now is the time");
1604 REGEX_ASSERT(fields
[1]=="*");
1605 status
= U_ZERO_ERROR
;
// Input with a leading delimiter: an empty first field is expected.
1607 n
= pat1
->split(" Now is the time ", fields
, 10, status
);
1610 REGEX_ASSERT(fields
[0]=="");
1611 REGEX_ASSERT(fields
[1]=="Now");
1612 REGEX_ASSERT(fields
[2]=="is");
1613 REGEX_ASSERT(fields
[3]=="the");
1614 REGEX_ASSERT(fields
[4]=="time");
1615 REGEX_ASSERT(fields
[5]=="");
// Input consisting only of the delimiter.
1617 n
= pat1
->split(" ", fields
, 10, status
);
1620 REGEX_ASSERT(fields
[0]=="");
1621 REGEX_ASSERT(fields
[1]=="");
// Empty input: fields[0] is expected to still hold "foo" afterwards.
1624 n
= pat1
->split("", fields
, 10, status
);
1627 REGEX_ASSERT(fields
[0]=="foo");
1631 // split, with a pattern with (capture)
// The asserts expect the captured delimiter text ("a", "b", "c") to appear as
// fields of its own between the split pieces.
1632 pat1
= RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe
, status
);
1635 status
= U_ZERO_ERROR
;
1636 n
= pat1
->split("<a>Now is <b>the time<c>", fields
, 10, status
);
1639 REGEX_ASSERT(fields
[0]=="");
1640 REGEX_ASSERT(fields
[1]=="a");
1641 REGEX_ASSERT(fields
[2]=="Now is ");
1642 REGEX_ASSERT(fields
[3]=="b");
1643 REGEX_ASSERT(fields
[4]=="the time");
1644 REGEX_ASSERT(fields
[5]=="c");
1645 REGEX_ASSERT(fields
[6]=="");
1646 REGEX_ASSERT(status
==U_ZERO_ERROR
);
1648 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 10, status
);
1651 REGEX_ASSERT(fields
[0]==" ");
1652 REGEX_ASSERT(fields
[1]=="a");
1653 REGEX_ASSERT(fields
[2]=="Now is ");
1654 REGEX_ASSERT(fields
[3]=="b");
1655 REGEX_ASSERT(fields
[4]=="the time");
1656 REGEX_ASSERT(fields
[5]=="c");
1657 REGEX_ASSERT(fields
[6]=="");
// Same input with the capacity reduced to 6, 5 and then 4 fields.
1659 status
= U_ZERO_ERROR
;
1661 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 6, status
);
1664 REGEX_ASSERT(fields
[0]==" ");
1665 REGEX_ASSERT(fields
[1]=="a");
1666 REGEX_ASSERT(fields
[2]=="Now is ");
1667 REGEX_ASSERT(fields
[3]=="b");
1668 REGEX_ASSERT(fields
[4]=="the time");
1669 REGEX_ASSERT(fields
[5]==""); // All text following "<c>" field delimiter.
1670 REGEX_ASSERT(fields
[6]=="foo");
1672 status
= U_ZERO_ERROR
;
1674 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 5, status
);
1677 REGEX_ASSERT(fields
[0]==" ");
1678 REGEX_ASSERT(fields
[1]=="a");
1679 REGEX_ASSERT(fields
[2]=="Now is ");
1680 REGEX_ASSERT(fields
[3]=="b");
1681 REGEX_ASSERT(fields
[4]=="the time<c>");
1682 REGEX_ASSERT(fields
[5]=="foo");
// Capacity 5 again, on input without the trailing "<c>" delimiter.
1684 status
= U_ZERO_ERROR
;
1686 n
= pat1
->split(" <a>Now is <b>the time", fields
, 5, status
);
1689 REGEX_ASSERT(fields
[0]==" ");
1690 REGEX_ASSERT(fields
[1]=="a");
1691 REGEX_ASSERT(fields
[2]=="Now is ");
1692 REGEX_ASSERT(fields
[3]=="b");
1693 REGEX_ASSERT(fields
[4]=="the time");
1694 REGEX_ASSERT(fields
[5]=="foo");
1696 status
= U_ZERO_ERROR
;
1697 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 4, status
);
1700 REGEX_ASSERT(fields
[0]==" ");
1701 REGEX_ASSERT(fields
[1]=="a");
1702 REGEX_ASSERT(fields
[2]=="Now is ");
1703 REGEX_ASSERT(fields
[3]=="the time<c>");
1704 status
= U_ZERO_ERROR
;
// The delimiter is a captured '-' or ','; each delimiter character is expected
// as its own field.
1707 pat1
= RegexPattern::compile("([-,])", pe
, status
);
1709 n
= pat1
->split("1-10,20", fields
, 10, status
);
1712 REGEX_ASSERT(fields
[0]=="1");
1713 REGEX_ASSERT(fields
[1]=="-");
1714 REGEX_ASSERT(fields
[2]=="10");
1715 REGEX_ASSERT(fields
[3]==",");
1716 REGEX_ASSERT(fields
[4]=="20");
1719 // Test split of string with empty trailing fields
1720 pat1
= RegexPattern::compile(",", pe
, status
);
1722 n
= pat1
->split("a,b,c,", fields
, 10, status
);
1725 REGEX_ASSERT(fields
[0]=="a");
1726 REGEX_ASSERT(fields
[1]=="b");
1727 REGEX_ASSERT(fields
[2]=="c");
1728 REGEX_ASSERT(fields
[3]=="");
1730 n
= pat1
->split("a,,,", fields
, 10, status
);
1733 REGEX_ASSERT(fields
[0]=="a");
1734 REGEX_ASSERT(fields
[1]=="");
1735 REGEX_ASSERT(fields
[2]=="");
1736 REGEX_ASSERT(fields
[3]=="");
1739 // Split Separator with zero length match.
// ":?" can match the empty string; the expected fields are "", "a", "b", "c", "".
1740 pat1
= RegexPattern::compile(":?", pe
, status
);
1742 n
= pat1
->split("abc", fields
, 10, status
);
1745 REGEX_ASSERT(fields
[0]=="");
1746 REGEX_ASSERT(fields
[1]=="a");
1747 REGEX_ASSERT(fields
[2]=="b");
1748 REGEX_ASSERT(fields
[3]=="c");
1749 REGEX_ASSERT(fields
[4]=="");
1754 // RegexPattern::pattern()
// A default-constructed pattern is expected to report an empty pattern string;
// a compiled one is expected to report the source text it was compiled from.
1756 pat1
= new RegexPattern();
1757 REGEX_ASSERT(pat1
->pattern() == "");
1760 pat1
= RegexPattern::compile("(Hello, world)*", pe
, status
);
1762 REGEX_ASSERT(pat1
->pattern() == "(Hello, world)*");
1767 // classID functions
// The dynamic class ID of each object must equal its class's static ID, be
// non-NULL, and differ between RegexPattern and RegexMatcher.
1769 pat1
= RegexPattern::compile("(Hello, world)*", pe
, status
);
1771 REGEX_ASSERT(pat1
->getDynamicClassID() == RegexPattern::getStaticClassID());
1772 REGEX_ASSERT(pat1
->getDynamicClassID() != NULL
);
1773 UnicodeString
Hello("Hello, world.");
1774 RegexMatcher
*m
= pat1
->matcher(Hello
, status
);
1775 REGEX_ASSERT(pat1
->getDynamicClassID() != m
->getDynamicClassID());
1776 REGEX_ASSERT(m
->getDynamicClassID() == RegexMatcher::getStaticClassID());
1777 REGEX_ASSERT(m
->getDynamicClassID() != NULL
);
1783 //---------------------------------------------------------------------------
1785 // API_Match_UTF8 Test that the alternate engine for class RegexMatcher
1786 // is present and working, but excluding functions
1787 // implementing replace operations.
1789 //---------------------------------------------------------------------------
1790 void RegexTest::API_Match_UTF8() {
1792 UErrorCode status
=U_ZERO_ERROR
;
1796 // Debug - slide failing test cases early
1805 // Simple pattern compilation
1808 UText re
= UTEXT_INITIALIZER
;
1809 regextst_openUTF8FromInvariant(&re
, "abc", -1, &status
);
1810 REGEX_VERBOSE_TEXT(&re
);
1812 pat2
= RegexPattern::compile(&re
, flags
, pe
, status
);
1815 UText input1
= UTEXT_INITIALIZER
;
1816 UText input2
= UTEXT_INITIALIZER
;
1817 UText empty
= UTEXT_INITIALIZER
;
1818 regextst_openUTF8FromInvariant(&input1
, "abcdef this is a test", -1, &status
);
1819 REGEX_VERBOSE_TEXT(&input1
);
1820 regextst_openUTF8FromInvariant(&input2
, "not abc", -1, &status
);
1821 REGEX_VERBOSE_TEXT(&input2
);
1822 utext_openUChars(&empty
, NULL
, 0, &status
);
1824 int32_t input1Len
= strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1825 int32_t input2Len
= strlen("not abc");
1829 // Matcher creation and reset.
1831 RegexMatcher
*m1
= &pat2
->matcher(status
)->reset(&input1
);
1833 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
1834 const char str_abcdefthisisatest
[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1835 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest
, m1
->inputText());
1837 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
1838 const char str_notabc
[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1839 REGEX_ASSERT_UTEXT_UTF8(str_notabc
, m1
->inputText());
1841 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest
, m1
->inputText());
1842 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
1844 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
1845 REGEX_ASSERT(utext_nativeLength(&empty
) == 0);
1848 // reset(pos, status)
1851 m1
->reset(4, status
);
1853 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest
, m1
->inputText());
1854 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
1856 m1
->reset(-1, status
);
1857 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1858 status
= U_ZERO_ERROR
;
1860 m1
->reset(0, status
);
1862 status
= U_ZERO_ERROR
;
1864 m1
->reset(input1Len
-1, status
);
1866 status
= U_ZERO_ERROR
;
1868 m1
->reset(input1Len
, status
);
1870 status
= U_ZERO_ERROR
;
1872 m1
->reset(input1Len
+1, status
);
1873 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1874 status
= U_ZERO_ERROR
;
1877 // match(pos, status)
1880 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
1882 REGEX_ASSERT(m1
->matches(3, status
) == FALSE
);
1884 REGEX_ASSERT(m1
->matches(5, status
) == FALSE
);
1885 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
1886 REGEX_ASSERT(m1
->matches(-1, status
) == FALSE
);
1887 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1889 // Match() at end of string should fail, but should not
1891 status
= U_ZERO_ERROR
;
1892 REGEX_ASSERT(m1
->matches(input2Len
, status
) == FALSE
);
1895 // Match beyond end of string should fail with an error.
1896 status
= U_ZERO_ERROR
;
1897 REGEX_ASSERT(m1
->matches(input2Len
+1, status
) == FALSE
);
1898 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1900 // Successful match at end of string.
1902 status
= U_ZERO_ERROR
;
1903 RegexMatcher
m("A?", 0, status
); // will match zero length string.
1906 REGEX_ASSERT(m
.matches(input1Len
, status
) == TRUE
);
1909 REGEX_ASSERT(m
.matches(0, status
) == TRUE
);
1915 // lookingAt(pos, status)
1917 status
= U_ZERO_ERROR
;
1918 m1
->reset(&input2
); // "not abc"
1919 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
1920 REGEX_ASSERT(m1
->lookingAt(5, status
) == FALSE
);
1921 REGEX_ASSERT(m1
->lookingAt(3, status
) == FALSE
);
1922 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
1923 REGEX_ASSERT(m1
->lookingAt(-1, status
) == FALSE
);
1924 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1925 status
= U_ZERO_ERROR
;
1926 REGEX_ASSERT(m1
->lookingAt(input2Len
, status
) == FALSE
);
1928 REGEX_ASSERT(m1
->lookingAt(input2Len
+1, status
) == FALSE
);
1929 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1935 utext_close(&input1
);
1936 utext_close(&input2
);
1937 utext_close(&empty
);
1943 // RegexMatcher::start();
1944 // RegexMatcher::end();
1945 // RegexMatcher::groupCount();
1950 UErrorCode status
=U_ZERO_ERROR
;
1951 UText re
=UTEXT_INITIALIZER
;
1952 const char str_01234567_pat
[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1953 utext_openUTF8(&re
, str_01234567_pat
, -1, &status
);
1955 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
1958 UText input
= UTEXT_INITIALIZER
;
1959 const char str_0123456789
[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1960 utext_openUTF8(&input
, str_0123456789
, -1, &status
);
1962 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&input
);
1964 REGEX_ASSERT(matcher
->lookingAt(status
) == TRUE
);
1965 static const int32_t matchStarts
[] = {0, 2, 4, 8};
1966 static const int32_t matchEnds
[] = {10, 8, 6, 10};
1968 for (i
=0; i
<4; i
++) {
1969 int32_t actualStart
= matcher
->start(i
, status
);
1971 if (actualStart
!= matchStarts
[i
]) {
1972 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n",
1973 __FILE__
, __LINE__
, i
, matchStarts
[i
], actualStart
);
1975 int32_t actualEnd
= matcher
->end(i
, status
);
1977 if (actualEnd
!= matchEnds
[i
]) {
1978 errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n",
1979 __FILE__
, __LINE__
, i
, matchEnds
[i
], actualEnd
);
1983 REGEX_ASSERT(matcher
->start(0, status
) == matcher
->start(status
));
1984 REGEX_ASSERT(matcher
->end(0, status
) == matcher
->end(status
));
1986 REGEX_ASSERT_FAIL(matcher
->start(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1987 REGEX_ASSERT_FAIL(matcher
->start( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1989 REGEX_ASSERT_FAIL(matcher
->start( 0, status
), U_REGEX_INVALID_STATE
);
1991 matcher
->lookingAt(status
);
1994 UText destText
= UTEXT_INITIALIZER
;
1995 utext_openUnicodeString(&destText
, &dest
, &status
);
1997 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1998 // Test shallow-clone API
2000 result
= matcher
->group((UText
*)NULL
, group_len
, status
);
2002 REGEX_ASSERT_UTEXT_UTF8(str_0123456789
, result
);
2003 utext_close(result
);
2004 result
= matcher
->group(0, &destText
, group_len
, status
);
2006 REGEX_ASSERT(result
== &destText
);
2007 REGEX_ASSERT_UTEXT_UTF8(str_0123456789
, result
);
2008 // destText is now immutable, reopen it
2009 utext_close(&destText
);
2010 utext_openUnicodeString(&destText
, &dest
, &status
);
2013 result
= matcher
->group(0, NULL
, length
, status
);
2015 REGEX_ASSERT_UTEXT_UTF8(str_0123456789
, result
);
2016 utext_close(result
);
2017 result
= matcher
->group(0, &destText
, length
, status
);
2019 REGEX_ASSERT(result
== &destText
);
2020 REGEX_ASSERT(utext_getNativeIndex(result
) == 0);
2021 REGEX_ASSERT(length
== 10);
2022 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2024 // Capture Group 1 == "234567"
2025 result
= matcher
->group(1, NULL
, length
, status
);
2027 REGEX_ASSERT(utext_getNativeIndex(result
) == 2);
2028 REGEX_ASSERT(length
== 6);
2029 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2030 utext_close(result
);
2032 result
= matcher
->group(1, &destText
, length
, status
);
2034 REGEX_ASSERT(result
== &destText
);
2035 REGEX_ASSERT(utext_getNativeIndex(result
) == 2);
2036 REGEX_ASSERT(length
== 6);
2037 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2038 utext_close(result
);
2040 // Capture Group 2 == "45"
2041 result
= matcher
->group(2, NULL
, length
, status
);
2043 REGEX_ASSERT(utext_getNativeIndex(result
) == 4);
2044 REGEX_ASSERT(length
== 2);
2045 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2046 utext_close(result
);
2048 result
= matcher
->group(2, &destText
, length
, status
);
2050 REGEX_ASSERT(result
== &destText
);
2051 REGEX_ASSERT(utext_getNativeIndex(result
) == 4);
2052 REGEX_ASSERT(length
== 2);
2053 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2054 utext_close(result
);
2056 // Capture Group 3 == "89"
2057 result
= matcher
->group(3, NULL
, length
, status
);
2059 REGEX_ASSERT(utext_getNativeIndex(result
) == 8);
2060 REGEX_ASSERT(length
== 2);
2061 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2062 utext_close(result
);
2064 result
= matcher
->group(3, &destText
, length
, status
);
2066 REGEX_ASSERT(result
== &destText
);
2067 REGEX_ASSERT(utext_getNativeIndex(result
) == 8);
2068 REGEX_ASSERT(length
== 2);
2069 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2070 utext_close(result
);
2072 // Capture Group number out of range.
2073 status
= U_ZERO_ERROR
;
2074 REGEX_ASSERT_FAIL(matcher
->group(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2075 status
= U_ZERO_ERROR
;
2076 REGEX_ASSERT_FAIL(matcher
->group( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2077 status
= U_ZERO_ERROR
;
2079 REGEX_ASSERT_FAIL(matcher
->group( 0, status
), U_REGEX_INVALID_STATE
);
2084 utext_close(&destText
);
2085 utext_close(&input
);
2095 UErrorCode status
=U_ZERO_ERROR
;
2096 UText re
=UTEXT_INITIALIZER
;
2097 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2098 utext_openUTF8(&re
, str_abc
, -1, &status
);
2100 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
2102 UText input
= UTEXT_INITIALIZER
;
2103 const char str_abcabcabc
[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2104 utext_openUTF8(&input
, str_abcabcabc
, -1, &status
);
2105 // 012345678901234567
2107 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&input
);
2109 REGEX_ASSERT(matcher
->find());
2110 REGEX_ASSERT(matcher
->start(status
) == 1);
2111 REGEX_ASSERT(matcher
->find());
2112 REGEX_ASSERT(matcher
->start(status
) == 6);
2113 REGEX_ASSERT(matcher
->find());
2114 REGEX_ASSERT(matcher
->start(status
) == 12);
2115 REGEX_ASSERT(matcher
->find() == FALSE
);
2116 REGEX_ASSERT(matcher
->find() == FALSE
);
2119 REGEX_ASSERT(matcher
->find());
2120 REGEX_ASSERT(matcher
->start(status
) == 1);
2122 REGEX_ASSERT(matcher
->find(0, status
));
2123 REGEX_ASSERT(matcher
->start(status
) == 1);
2124 REGEX_ASSERT(matcher
->find(1, status
));
2125 REGEX_ASSERT(matcher
->start(status
) == 1);
2126 REGEX_ASSERT(matcher
->find(2, status
));
2127 REGEX_ASSERT(matcher
->start(status
) == 6);
2128 REGEX_ASSERT(matcher
->find(12, status
));
2129 REGEX_ASSERT(matcher
->start(status
) == 12);
2130 REGEX_ASSERT(matcher
->find(13, status
) == FALSE
);
2131 REGEX_ASSERT(matcher
->find(16, status
) == FALSE
);
2132 REGEX_ASSERT(matcher
->find(17, status
) == FALSE
);
2133 REGEX_ASSERT_FAIL(matcher
->start(status
), U_REGEX_INVALID_STATE
);
2135 status
= U_ZERO_ERROR
;
2136 REGEX_ASSERT_FAIL(matcher
->find(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2137 status
= U_ZERO_ERROR
;
2138 REGEX_ASSERT_FAIL(matcher
->find(18, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2140 REGEX_ASSERT(matcher
->groupCount() == 0);
2145 utext_close(&input
);
2151 // find, with \G in pattern (true if at the end of a previous match).
2156 UErrorCode status
=U_ZERO_ERROR
;
2157 UText re
=UTEXT_INITIALIZER
;
2158 const char str_Gabcabc
[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2159 utext_openUTF8(&re
, str_Gabcabc
, -1, &status
);
2161 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
2164 UText input
= UTEXT_INITIALIZER
;
2165 const char str_abcabcabc
[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2166 utext_openUTF8(&input
, str_abcabcabc
, -1, &status
);
2167 // 012345678901234567
2169 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&input
);
2171 REGEX_ASSERT(matcher
->find());
2172 REGEX_ASSERT(matcher
->start(status
) == 0);
2173 REGEX_ASSERT(matcher
->start(1, status
) == -1);
2174 REGEX_ASSERT(matcher
->start(2, status
) == 1);
2176 REGEX_ASSERT(matcher
->find());
2177 REGEX_ASSERT(matcher
->start(status
) == 4);
2178 REGEX_ASSERT(matcher
->start(1, status
) == 4);
2179 REGEX_ASSERT(matcher
->start(2, status
) == -1);
2185 utext_close(&input
);
2190 // find with zero length matches, match position should bump ahead
2191 // to prevent loops.
2195 UErrorCode status
=U_ZERO_ERROR
;
2196 RegexMatcher
m("(?= ?)", 0, status
); // This pattern will zero-length matches anywhere,
2197 // using an always-true look-ahead.
2199 UText s
= UTEXT_INITIALIZER
;
2200 utext_openUTF8(&s
, " ", -1, &status
);
2203 if (m
.find() == FALSE
) {
2206 REGEX_ASSERT(m
.start(status
) == i
);
2207 REGEX_ASSERT(m
.end(status
) == i
);
2211 // Check that the bump goes over characters outside the BMP OK
2212 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2213 unsigned char aboveBMP
[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2214 utext_openUTF8(&s
, (char *)aboveBMP
, -1, &status
);
2217 if (m
.find() == FALSE
) {
2220 REGEX_ASSERT(m
.start(status
) == i
);
2221 REGEX_ASSERT(m
.end(status
) == i
);
2223 REGEX_ASSERT(i
==20);
2228 // find() loop breaking test.
2229 // with pattern of /.?/, should see a series of one char matches, then a single
2230 // match of zero length at the end of the input string.
2232 UErrorCode status
=U_ZERO_ERROR
;
2233 RegexMatcher
m(".?", 0, status
);
2235 UText s
= UTEXT_INITIALIZER
;
2236 utext_openUTF8(&s
, " ", -1, &status
);
2239 if (m
.find() == FALSE
) {
2242 REGEX_ASSERT(m
.start(status
) == i
);
2243 REGEX_ASSERT(m
.end(status
) == (i
<4 ? i
+1 : i
));
2252 // Matchers with no input string behave as if they had an empty input string.
2256 UErrorCode status
= U_ZERO_ERROR
;
2257 RegexMatcher
m(".?", 0, status
);
2259 REGEX_ASSERT(m
.find());
2260 REGEX_ASSERT(m
.start(status
) == 0);
2261 REGEX_ASSERT(m
.input() == "");
2264 UErrorCode status
= U_ZERO_ERROR
;
2265 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
2266 RegexMatcher
*m
= p
->matcher(status
);
2269 REGEX_ASSERT(m
->find() == FALSE
);
2270 REGEX_ASSERT(utext_nativeLength(m
->inputText()) == 0);
2279 UErrorCode status
= U_ZERO_ERROR
;
2280 UText testPattern
= UTEXT_INITIALIZER
;
2281 UText testText
= UTEXT_INITIALIZER
;
2282 regextst_openUTF8FromInvariant(&testPattern
, ".*", -1, &status
);
2283 REGEX_VERBOSE_TEXT(&testPattern
);
2284 regextst_openUTF8FromInvariant(&testText
, "This is test data", -1, &status
);
2285 REGEX_VERBOSE_TEXT(&testText
);
2287 RegexMatcher
m(&testPattern
, &testText
, 0, status
);
2289 REGEX_ASSERT(m
.regionStart() == 0);
2290 REGEX_ASSERT(m
.regionEnd() == (int32_t)strlen("This is test data"));
2291 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2292 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2294 m
.region(2,4, status
);
2296 REGEX_ASSERT(m
.matches(status
));
2297 REGEX_ASSERT(m
.start(status
)==2);
2298 REGEX_ASSERT(m
.end(status
)==4);
2302 REGEX_ASSERT(m
.regionStart() == 0);
2303 REGEX_ASSERT(m
.regionEnd() == (int32_t)strlen("This is test data"));
2305 regextst_openUTF8FromInvariant(&testText
, "short", -1, &status
);
2306 REGEX_VERBOSE_TEXT(&testText
);
2308 REGEX_ASSERT(m
.regionStart() == 0);
2309 REGEX_ASSERT(m
.regionEnd() == (int32_t)strlen("short"));
2311 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2312 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(FALSE
));
2313 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
2314 REGEX_ASSERT(&m
== &m
.reset());
2315 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
2317 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(TRUE
));
2318 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2319 REGEX_ASSERT(&m
== &m
.reset());
2320 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2322 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2323 REGEX_ASSERT(&m
== &m
.useTransparentBounds(TRUE
));
2324 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
2325 REGEX_ASSERT(&m
== &m
.reset());
2326 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
2328 REGEX_ASSERT(&m
== &m
.useTransparentBounds(FALSE
));
2329 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2330 REGEX_ASSERT(&m
== &m
.reset());
2331 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2333 utext_close(&testText
);
2334 utext_close(&testPattern
);
2338 // hitEnd() and requireEnd()
2341 UErrorCode status
= U_ZERO_ERROR
;
2342 UText testPattern
= UTEXT_INITIALIZER
;
2343 UText testText
= UTEXT_INITIALIZER
;
2344 const char str_
[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2345 const char str_aabb
[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2346 utext_openUTF8(&testPattern
, str_
, -1, &status
);
2347 utext_openUTF8(&testText
, str_aabb
, -1, &status
);
2349 RegexMatcher
m1(&testPattern
, &testText
, 0, status
);
2350 REGEX_ASSERT(m1
.lookingAt(status
) == TRUE
);
2351 REGEX_ASSERT(m1
.hitEnd() == TRUE
);
2352 REGEX_ASSERT(m1
.requireEnd() == FALSE
);
2355 status
= U_ZERO_ERROR
;
2356 const char str_a
[] = { 0x61, 0x2a, 0x00 }; /* a* */
2357 utext_openUTF8(&testPattern
, str_a
, -1, &status
);
2358 RegexMatcher
m2(&testPattern
, &testText
, 0, status
);
2359 REGEX_ASSERT(m2
.lookingAt(status
) == TRUE
);
2360 REGEX_ASSERT(m2
.hitEnd() == FALSE
);
2361 REGEX_ASSERT(m2
.requireEnd() == FALSE
);
2364 status
= U_ZERO_ERROR
;
2365 const char str_dotstardollar
[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2366 utext_openUTF8(&testPattern
, str_dotstardollar
, -1, &status
);
2367 RegexMatcher
m3(&testPattern
, &testText
, 0, status
);
2368 REGEX_ASSERT(m3
.lookingAt(status
) == TRUE
);
2369 REGEX_ASSERT(m3
.hitEnd() == TRUE
);
2370 REGEX_ASSERT(m3
.requireEnd() == TRUE
);
2373 utext_close(&testText
);
2374 utext_close(&testPattern
);
2379 //---------------------------------------------------------------------------
2381 // API_Replace_UTF8 API test for class RegexMatcher, testing the
2382 // Replace family of functions.
2384 //---------------------------------------------------------------------------
2385 void RegexTest::API_Replace_UTF8() {
// Exercises the UText-based RegexMatcher::replaceFirst() and replaceAll()
// overloads, then appendReplacement()/appendTail().
// Most replacements are run twice: once with a NULL destination, where the
// returned UText is closed by this test, and once writing into the
// caller-supplied destText, where the assertion checks that the returned
// pointer is &destText.
// Test strings are spelled out as explicit byte values with the readable text
// in a trailing comment; presumably this keeps the UTF-8 data independent of
// the compiler's execution character set (see the ASCII/EBCDIC warning at the
// top of this file).
2391 UErrorCode status
=U_ZERO_ERROR
;
2393 UText re
=UTEXT_INITIALIZER
;
2394 regextst_openUTF8FromInvariant(&re
, "abc", -1, &status
);
2395 REGEX_VERBOSE_TEXT(&re
);
2396 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
2399 char data
[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2400 // 012345678901234567
2401 UText dataText
= UTEXT_INITIALIZER
;
2402 utext_openUTF8(&dataText
, data
, -1, &status
);
2404 REGEX_VERBOSE_TEXT(&dataText
);
2405 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&dataText
);
2408 // Plain vanilla matches.
2411 UText destText
= UTEXT_INITIALIZER
;
2412 utext_openUnicodeString(&destText
, &dest
, &status
);
2415 UText replText
= UTEXT_INITIALIZER
;
2417 const char str_yz
[] = { 0x79, 0x7a, 0x00 }; /* yz */
2418 utext_openUTF8(&replText
, str_yz
, -1, &status
);
2419 REGEX_VERBOSE_TEXT(&replText
);
2420 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2422 const char str_yzabcabc
[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2423 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc
, result
);
2424 utext_close(result
);
2425 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2427 REGEX_ASSERT(result
== &destText
);
2428 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc
, result
);
2430 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2432 const char str_yzyzyz
[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2433 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz
, result
);
2434 utext_close(result
);
// Empty destText (replace its whole native range with zero-length text)
// before it is reused as the destination of the next replacement.
2436 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2437 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2439 REGEX_ASSERT(result
== &destText
);
2440 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz
, result
);
2443 // Plain vanilla non-matches.
2445 const char str_abxabxabx
[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2446 utext_openUTF8(&dataText
, str_abxabxabx
, -1, &status
);
2447 matcher
->reset(&dataText
);
2449 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2451 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2452 utext_close(result
);
2453 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2455 REGEX_ASSERT(result
== &destText
);
2456 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2458 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2460 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2461 utext_close(result
);
2462 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2463 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2465 REGEX_ASSERT(result
== &destText
);
2466 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2469 // Empty source string
2471 utext_openUTF8(&dataText
, NULL
, 0, &status
);
2472 matcher
->reset(&dataText
);
2474 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2476 REGEX_ASSERT_UTEXT_UTF8("", result
);
2477 utext_close(result
);
2478 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2480 REGEX_ASSERT(result
== &destText
);
2481 REGEX_ASSERT_UTEXT_UTF8("", result
);
2483 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2485 REGEX_ASSERT_UTEXT_UTF8("", result
);
2486 utext_close(result
);
2487 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2489 REGEX_ASSERT(result
== &destText
);
2490 REGEX_ASSERT_UTEXT_UTF8("", result
);
2493 // Empty substitution string
2495 utext_openUTF8(&dataText
, data
, -1, &status
); // ".abc..abc...abc.."
2496 matcher
->reset(&dataText
);
2498 utext_openUTF8(&replText
, NULL
, 0, &status
);
2499 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2501 const char str_abcabc
[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2502 REGEX_ASSERT_UTEXT_UTF8(str_abcabc
, result
);
2503 utext_close(result
);
2504 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2506 REGEX_ASSERT(result
== &destText
);
2507 REGEX_ASSERT_UTEXT_UTF8(str_abcabc
, result
);
2509 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2511 const char str_dots
[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2512 REGEX_ASSERT_UTEXT_UTF8(str_dots
, result
);
2513 utext_close(result
);
2514 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2515 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2517 REGEX_ASSERT(result
== &destText
);
2518 REGEX_ASSERT_UTEXT_UTF8(str_dots
, result
);
2521 // match whole string
2523 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2524 utext_openUTF8(&dataText
, str_abc
, -1, &status
);
2525 matcher
->reset(&dataText
);
2527 const char str_xyz
[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2528 utext_openUTF8(&replText
, str_xyz
, -1, &status
);
2529 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2531 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2532 utext_close(result
);
2533 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2534 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2536 REGEX_ASSERT(result
== &destText
);
2537 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2539 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2541 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2542 utext_close(result
);
2543 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2544 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2546 REGEX_ASSERT(result
== &destText
);
2547 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2550 // Capture Group, simple case
2552 const char str_add
[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2553 utext_openUTF8(&re
, str_add
, -1, &status
);
2554 RegexPattern
*pat2
= RegexPattern::compile(&re
, flags
, pe
, status
);
2557 const char str_abcdefg
[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2558 utext_openUTF8(&dataText
, str_abcdefg
, -1, &status
);
2559 RegexMatcher
*matcher2
= &pat2
->matcher(status
)->reset(&dataText
);
2562 const char str_11
[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2563 utext_openUTF8(&replText
, str_11
, -1, &status
);
2564 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2566 const char str_bcbcdefg
[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2567 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg
, result
);
2568 utext_close(result
);
2569 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2570 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2572 REGEX_ASSERT(result
== &destText
);
2573 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg
, result
);
2575 const char str_v
[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2576 utext_openUTF8(&replText
, str_v
, -1, &status
);
2577 REGEX_VERBOSE_TEXT(&replText
);
2578 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2580 const char str_Thevalueof1isbcdefg
[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2581 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg
, result
);
2582 utext_close(result
);
2583 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2584 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2586 REGEX_ASSERT(result
== &destText
);
2587 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg
, result
);
2589 const char str_byitselfnogroupnumber
[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2590 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2591 0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2592 utext_openUTF8(&replText
, str_byitselfnogroupnumber
, -1, &status
);
2593 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2595 const char str_byitselfnogroupnumberdefg
[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2596 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg
, result
);
2597 utext_close(result
);
2598 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2599 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2601 REGEX_ASSERT(result
== &destText
);
2602 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg
, result
);
2604 unsigned char supplDigitChars
[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2605 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2606 // 012345678901234567890123456
// Overwrite the four 'x' placeholder bytes at offsets 22-25 (right after the
// '$') with F0 9D 9F 8F, the UTF-8 encoding of U+1D7CF.
2607 supplDigitChars
[22] = 0xF0;
2608 supplDigitChars
[23] = 0x9D;
2609 supplDigitChars
[24] = 0x9F;
2610 supplDigitChars
[25] = 0x8F;
2611 utext_openUTF8(&replText
, (char *)supplDigitChars
, -1, &status
);
2613 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2615 const char str_SupplementalDigit1bcdefg
[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2616 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg
, result
);
2617 utext_close(result
);
2618 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2619 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2621 REGEX_ASSERT(result
== &destText
);
2622 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg
, result
);
2623 const char str_badcapturegroupnumber5
[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5... */
2624 utext_openUTF8(&replText
, str_badcapturegroupnumber5
, -1, &status
);
// The pattern a(..) has a single capture group, so the $5 reference is
// invalid; both calls below are asserted to report U_INDEX_OUTOFBOUNDS_ERROR.
2625 REGEX_ASSERT_FAIL((result
= matcher2
->replaceFirst(&replText
, NULL
, status
)), U_INDEX_OUTOFBOUNDS_ERROR
);
2626 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2627 utext_close(result
);
2628 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2629 REGEX_ASSERT_FAIL((result
= matcher2
->replaceFirst(&replText
, &destText
, status
)), U_INDEX_OUTOFBOUNDS_ERROR
);
2630 REGEX_ASSERT(result
== &destText
);
2631 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2634 // Replacement String with \u hex escapes
2637 const char str_abc1abc2abc3
[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2638 const char str_u0043
[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2639 utext_openUTF8(&dataText
, str_abc1abc2abc3
, -1, &status
);
2640 utext_openUTF8(&replText
, str_u0043
, -1, &status
);
2641 matcher
->reset(&dataText
);
2643 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2645 const char str_C1C2C3
[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2646 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3
, result
);
2647 utext_close(result
);
2648 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2649 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2651 REGEX_ASSERT(result
== &destText
);
2652 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3
, result
);
2655 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2656 utext_openUTF8(&dataText
, str_abc
, -1, &status
);
2657 const char str_U00010000
[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2658 utext_openUTF8(&replText
, str_U00010000
, -1, &status
);
2659 matcher
->reset(&dataText
);
// NOTE(review): the four 'x' bytes look like placeholders for the UTF-8 form
// of U+10000; the statements that would fill them in are not visible in this
// listing - confirm against the full file.
2661 unsigned char expected
[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2668 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2670 REGEX_ASSERT_UTEXT_UTF8((char *)expected
, result
);
2671 utext_close(result
);
2672 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2673 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2675 REGEX_ASSERT(result
== &destText
);
2676 REGEX_ASSERT_UTEXT_UTF8((char *)expected
, result
);
2678 // TODO: need more thorough testing of capture substitutions.
2683 status
= U_ZERO_ERROR
;
2684 const char str_ssee
[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2685 const char str_blah
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2686 const char str_ooh
[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2687 utext_openUTF8(&re
, str_ssee
, -1, &status
);
2688 utext_openUTF8(&dataText
, str_blah
, -1, &status
);
2689 utext_openUTF8(&replText
, str_ooh
, -1, &status
);
2691 RegexMatcher
m(&re
, 0, status
);
2694 UnicodeString result
;
2695 UText resultText
= UTEXT_INITIALIZER
;
2696 utext_openUnicodeString(&resultText
, &result
, &status
);
2698 // Multiple finds do NOT bump up the previous appendReplacement position.
2702 m
.appendReplacement(&resultText
, &replText
, status
);
2704 const char str_blah2
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2705 REGEX_ASSERT_UTEXT_UTF8(str_blah2
, &resultText
);
2707 // After a reset into the interior of a string, appendReplacement still starts at beginning.
2708 status
= U_ZERO_ERROR
;
2710 utext_openUnicodeString(&resultText
, &result
, &status
);
2711 m
.reset(10, status
);
2714 m
.appendReplacement(&resultText
, &replText
, status
);
2716 const char str_blah3
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2717 REGEX_ASSERT_UTEXT_UTF8(str_blah3
, &resultText
);
2719 // find() at interior of string, appendReplacement still starts at beginning.
2720 status
= U_ZERO_ERROR
;
2722 utext_openUnicodeString(&resultText
, &result
, &status
);
2726 m
.appendReplacement(&resultText
, &replText
, status
);
2728 const char str_blah8
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2729 REGEX_ASSERT_UTEXT_UTF8(str_blah8
, &resultText
);
2731 m
.appendTail(&resultText
, status
);
2732 const char str_blah9
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2733 REGEX_ASSERT_UTEXT_UTF8(str_blah9
, &resultText
);
2735 utext_close(&resultText
);
2743 utext_close(&dataText
);
2744 utext_close(&replText
);
2745 utext_close(&destText
);
2750 //---------------------------------------------------------------------------
2752 // API_Pattern_UTF8 Test that the API for class RegexPattern is
2753 // present and nominally working.
2755 //---------------------------------------------------------------------------
2756 void RegexTest::API_Pattern_UTF8() {
// Smoke-tests the RegexPattern API with patterns supplied as UTF-8 UTexts:
// default and copy construction, ==/!= comparison, the compile() overloads
// (with and without flags), clone(), the static matches() helper, split()
// into UnicodeString and UText outputs, and pattern()/patternText().
2757 RegexPattern pata
; // Test default constructor to not crash.
2760 REGEX_ASSERT(pata
== patb
);
2761 REGEX_ASSERT(pata
== pata
);
2763 UText re1
= UTEXT_INITIALIZER
;
2764 UText re2
= UTEXT_INITIALIZER
;
2765 UErrorCode status
= U_ZERO_ERROR
;
2768 const char str_abcalmz
[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2769 const char str_def
[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2770 utext_openUTF8(&re1
, str_abcalmz
, -1, &status
);
2771 utext_openUTF8(&re2
, str_def
, -1, &status
);
2773 RegexPattern
*pat1
= RegexPattern::compile(&re1
, 0, pe
, status
);
2774 RegexPattern
*pat2
= RegexPattern::compile(&re2
, 0, pe
, status
);
2776 REGEX_ASSERT(*pat1
== *pat1
);
2777 REGEX_ASSERT(*pat1
!= pata
);
2781 REGEX_ASSERT(patb
== *pat1
);
2784 RegexPattern
patc(*pat1
);
2785 REGEX_ASSERT(patc
== *pat1
);
2786 REGEX_ASSERT(patb
== patc
);
// NOTE(review): pat1 and pat2 are RegexPattern pointers, so this compares the
// two addresses rather than the patterns; *pat1 != *pat2 may have been
// intended - confirm.
2787 REGEX_ASSERT(pat1
!= pat2
);
2789 REGEX_ASSERT(patb
!= patc
);
2790 REGEX_ASSERT(patb
== *pat2
);
2792 // Compile with no flags.
2793 RegexPattern
*pat1a
= RegexPattern::compile(&re1
, pe
, status
);
2794 REGEX_ASSERT(*pat1a
== *pat1
);
2796 REGEX_ASSERT(pat1a
->flags() == 0);
2798 // Compile with different flags should be not equal
2799 RegexPattern
*pat1b
= RegexPattern::compile(&re1
, UREGEX_CASE_INSENSITIVE
, pe
, status
);
2802 REGEX_ASSERT(*pat1b
!= *pat1a
);
2803 REGEX_ASSERT(pat1b
->flags() == UREGEX_CASE_INSENSITIVE
);
2804 REGEX_ASSERT(pat1a
->flags() == 0);
2808 RegexPattern
*pat1c
= pat1
->clone();
2809 REGEX_ASSERT(*pat1c
== *pat1
);
2810 REGEX_ASSERT(*pat1c
!= *pat2
);
2822 // Verify that a matcher created from a cloned pattern works.
2826 UErrorCode status
= U_ZERO_ERROR
;
2827 UText pattern
= UTEXT_INITIALIZER
;
2828 const char str_pL
[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2829 utext_openUTF8(&pattern
, str_pL
, -1, &status
);
2831 RegexPattern
*pSource
= RegexPattern::compile(&pattern
, 0, status
);
2832 RegexPattern
*pClone
= pSource
->clone();
2834 RegexMatcher
*mFromClone
= pClone
->matcher(status
);
2837 UText input
= UTEXT_INITIALIZER
;
2838 const char str_HelloWorld
[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2839 utext_openUTF8(&input
, str_HelloWorld
, -1, &status
);
2840 mFromClone
->reset(&input
);
2841 REGEX_ASSERT(mFromClone
->find() == TRUE
);
2842 REGEX_ASSERT(mFromClone
->group(status
) == "Hello");
2843 REGEX_ASSERT(mFromClone
->find() == TRUE
);
2844 REGEX_ASSERT(mFromClone
->group(status
) == "World");
2845 REGEX_ASSERT(mFromClone
->find() == FALSE
);
2849 utext_close(&input
);
2850 utext_close(&pattern
);
2854 // matches convenience API
2857 UErrorCode status
= U_ZERO_ERROR
;
2858 UText pattern
= UTEXT_INITIALIZER
;
2859 UText input
= UTEXT_INITIALIZER
;
2861 const char str_randominput
[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2862 utext_openUTF8(&input
, str_randominput
, -1, &status
);
2864 const char str_dotstar
[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2865 utext_openUTF8(&pattern
, str_dotstar
, -1, &status
);
2866 REGEX_ASSERT(RegexPattern::matches(&pattern
, &input
, pe
, status
) == TRUE
);
// NOTE(review): from here on the matches() calls are given string literals,
// while the `pattern` UText opened just before each call is not passed to it.
// Confirm whether the UText overload (&pattern, &input) was meant to be used.
2869 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2870 utext_openUTF8(&pattern
, str_abc
, -1, &status
);
2871 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe
, status
) == FALSE
);
2874 const char str_nput
[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2875 utext_openUTF8(&pattern
, str_nput
, -1, &status
);
2876 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe
, status
) == TRUE
);
2879 utext_openUTF8(&pattern
, str_randominput
, -1, &status
);
2880 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe
, status
) == TRUE
);
2883 const char str_u
[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2884 utext_openUTF8(&pattern
, str_u
, -1, &status
);
2885 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe
, status
) == FALSE
);
2888 utext_openUTF8(&input
, str_abc
, -1, &status
);
2889 utext_openUTF8(&pattern
, str_abc
, -1, &status
);
// Enter matches() with a failure code already set: the assertions below expect
// it to return FALSE and to leave that status value unchanged.
2890 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
2891 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe
, status
) == FALSE
);
2892 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
2894 utext_close(&input
);
2895 utext_close(&pattern
);
2902 status
= U_ZERO_ERROR
;
2903 const char str_spaceplus
[] = { 0x20, 0x2b, 0x00 }; /* + */
2904 utext_openUTF8(&re1
, str_spaceplus
, -1, &status
);
2905 pat1
= RegexPattern::compile(&re1
, pe
, status
);
2907 UnicodeString fields
[10];
2910 n
= pat1
->split("Now is the time", fields
, 10, status
);
2913 REGEX_ASSERT(fields
[0]=="Now");
2914 REGEX_ASSERT(fields
[1]=="is");
2915 REGEX_ASSERT(fields
[2]=="the");
2916 REGEX_ASSERT(fields
[3]=="time");
2917 REGEX_ASSERT(fields
[4]=="");
2919 n
= pat1
->split("Now is the time", fields
, 2, status
);
2922 REGEX_ASSERT(fields
[0]=="Now");
2923 REGEX_ASSERT(fields
[1]=="is the time");
2924 REGEX_ASSERT(fields
[2]=="the"); // left over from previous test
2927 status
= U_ZERO_ERROR
;
2928 n
= pat1
->split("Now is the time", fields
, 1, status
);
2931 REGEX_ASSERT(fields
[0]=="Now is the time");
2932 REGEX_ASSERT(fields
[1]=="*");
2933 status
= U_ZERO_ERROR
;
2935 n
= pat1
->split(" Now is the time ", fields
, 10, status
);
2938 REGEX_ASSERT(fields
[0]=="");
2939 REGEX_ASSERT(fields
[1]=="Now");
2940 REGEX_ASSERT(fields
[2]=="is");
2941 REGEX_ASSERT(fields
[3]=="the");
2942 REGEX_ASSERT(fields
[4]=="time");
2943 REGEX_ASSERT(fields
[5]=="");
2944 REGEX_ASSERT(fields
[6]=="");
2947 n
= pat1
->split(" ", fields
, 10, status
);
2950 REGEX_ASSERT(fields
[0]=="");
2951 REGEX_ASSERT(fields
[1]=="");
2952 REGEX_ASSERT(fields
[2]=="*");
2955 n
= pat1
->split("", fields
, 10, status
);
2958 REGEX_ASSERT(fields
[0]=="foo");
2962 // split, with a pattern with (capture)
2963 regextst_openUTF8FromInvariant(&re1
, "<(\\w*)>", -1, &status
);
2964 pat1
= RegexPattern::compile(&re1
, pe
, status
);
2967 status
= U_ZERO_ERROR
;
2968 fields
[6] = fields
[7] = "*";
2969 n
= pat1
->split("<a>Now is <b>the time<c>", fields
, 10, status
);
2972 REGEX_ASSERT(fields
[0]=="");
2973 REGEX_ASSERT(fields
[1]=="a");
2974 REGEX_ASSERT(fields
[2]=="Now is ");
2975 REGEX_ASSERT(fields
[3]=="b");
2976 REGEX_ASSERT(fields
[4]=="the time");
2977 REGEX_ASSERT(fields
[5]=="c");
2978 REGEX_ASSERT(fields
[6]=="");
2979 REGEX_ASSERT(fields
[7]=="*");
2980 REGEX_ASSERT(status
==U_ZERO_ERROR
);
2982 fields
[6] = fields
[7] = "*";
2983 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 10, status
);
2986 REGEX_ASSERT(fields
[0]==" ");
2987 REGEX_ASSERT(fields
[1]=="a");
2988 REGEX_ASSERT(fields
[2]=="Now is ");
2989 REGEX_ASSERT(fields
[3]=="b");
2990 REGEX_ASSERT(fields
[4]=="the time");
2991 REGEX_ASSERT(fields
[5]=="c");
2992 REGEX_ASSERT(fields
[6]=="");
2993 REGEX_ASSERT(fields
[7]=="*");
2995 status
= U_ZERO_ERROR
;
2997 n
= pat1
->split(" <a>Now is <b>the time<c> ", fields
, 6, status
);
3000 REGEX_ASSERT(fields
[0]==" ");
3001 REGEX_ASSERT(fields
[1]=="a");
3002 REGEX_ASSERT(fields
[2]=="Now is ");
3003 REGEX_ASSERT(fields
[3]=="b");
3004 REGEX_ASSERT(fields
[4]=="the time");
3005 REGEX_ASSERT(fields
[5]==" ");
3006 REGEX_ASSERT(fields
[6]=="foo");
3008 status
= U_ZERO_ERROR
;
3010 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 5, status
);
3013 REGEX_ASSERT(fields
[0]==" ");
3014 REGEX_ASSERT(fields
[1]=="a");
3015 REGEX_ASSERT(fields
[2]=="Now is ");
3016 REGEX_ASSERT(fields
[3]=="b");
3017 REGEX_ASSERT(fields
[4]=="the time<c>");
3018 REGEX_ASSERT(fields
[5]=="foo");
3020 status
= U_ZERO_ERROR
;
3022 n
= pat1
->split(" <a>Now is <b>the time", fields
, 5, status
);
3025 REGEX_ASSERT(fields
[0]==" ");
3026 REGEX_ASSERT(fields
[1]=="a");
3027 REGEX_ASSERT(fields
[2]=="Now is ");
3028 REGEX_ASSERT(fields
[3]=="b");
3029 REGEX_ASSERT(fields
[4]=="the time");
3030 REGEX_ASSERT(fields
[5]=="foo");
3032 status
= U_ZERO_ERROR
;
3033 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 4, status
);
3036 REGEX_ASSERT(fields
[0]==" ");
3037 REGEX_ASSERT(fields
[1]=="a");
3038 REGEX_ASSERT(fields
[2]=="Now is ");
3039 REGEX_ASSERT(fields
[3]=="the time<c>");
3040 status
= U_ZERO_ERROR
;
3043 regextst_openUTF8FromInvariant(&re1
, "([-,])", -1, &status
);
3044 pat1
= RegexPattern::compile(&re1
, pe
, status
);
3046 n
= pat1
->split("1-10,20", fields
, 10, status
);
3049 REGEX_ASSERT(fields
[0]=="1");
3050 REGEX_ASSERT(fields
[1]=="-");
3051 REGEX_ASSERT(fields
[2]=="10");
3052 REGEX_ASSERT(fields
[3]==",");
3053 REGEX_ASSERT(fields
[4]=="20");
3058 // split of a UText based string, with library allocating output UTexts.
3061 status
= U_ZERO_ERROR
;
3062 RegexMatcher
matcher(UnicodeString("(:)"), 0, status
);
3063 UnicodeString
stringToSplit("first:second:third");
3064 UText
*textToSplit
= utext_openUnicodeString(NULL
, &stringToSplit
, &status
);
3067 UText
*splits
[10] = {NULL
};
3068 int32_t numFields
= matcher
.split(textToSplit
, splits
, UPRV_LENGTHOF(splits
), status
);
3070 REGEX_ASSERT(numFields
== 5);
3071 REGEX_ASSERT_UTEXT_INVARIANT("first", splits
[0]);
3072 REGEX_ASSERT_UTEXT_INVARIANT(":", splits
[1]);
3073 REGEX_ASSERT_UTEXT_INVARIANT("second", splits
[2]);
3074 REGEX_ASSERT_UTEXT_INVARIANT(":", splits
[3]);
3075 REGEX_ASSERT_UTEXT_INVARIANT("third", splits
[4]);
3076 REGEX_ASSERT(splits
[5] == NULL
);
3078 for (int i
=0; i
<UPRV_LENGTHOF(splits
); i
++) {
3080 utext_close(splits
[i
]);
3084 utext_close(textToSplit
);
3089 // RegexPattern::pattern() and patternText()
3091 pat1
= new RegexPattern();
3092 REGEX_ASSERT(pat1
->pattern() == "");
3093 REGEX_ASSERT_UTEXT_UTF8("", pat1
->patternText(status
));
3095 const char *helloWorldInvariant
= "(Hello, world)*";
3096 regextst_openUTF8FromInvariant(&re1
, helloWorldInvariant
, -1, &status
);
3097 pat1
= RegexPattern::compile(&re1
, pe
, status
);
3099 REGEX_ASSERT_UNISTR("(Hello, world)*", pat1
->pattern());
3100 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1
->patternText(status
));
3107 //---------------------------------------------------------------------------
3109 // Extended A more thorough check for features of regex patterns
3110 // The test cases are in a separate data file,
3111 // source/tests/testdata/regextst.txt
3112 // A description of the test data format is included in that file.
3114 //---------------------------------------------------------------------------
3117 RegexTest::getPath(char buffer
[2048], const char *filename
) {
3118 UErrorCode status
=U_ZERO_ERROR
;
3119 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
3120 if (U_FAILURE(status
)) {
3121 errln("ERROR: loadTestData() failed - %s", u_errorName(status
));
3125 strcpy(buffer
, testDataDirectory
);
3126 strcat(buffer
, filename
);
3130 void RegexTest::Extended() {
3132 const char *srcPath
;
3133 UErrorCode status
= U_ZERO_ERROR
;
3134 int32_t lineNum
= 0;
3137 // Open and read the test data file.
3139 srcPath
=getPath(tdd
, "regextst.txt");
3141 return; /* something went wrong, error already output */
3145 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, "utf-8", status
);
3146 if (U_FAILURE(status
)) {
3147 return; /* something went wrong, error already output */
3151 // Put the test data into a UnicodeString
3153 UnicodeString
testString(FALSE
, testData
, len
);
3155 RegexMatcher
quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status
);
3156 RegexMatcher
commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status
);
3157 RegexMatcher
flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status
);
3159 RegexMatcher
lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString
, 0, status
);
3160 UnicodeString testPattern
; // The pattern for test from the test file.
3161 UnicodeString testFlags
; // the flags for a test.
3162 UnicodeString matchString
; // The marked up string to be used as input
3164 if (U_FAILURE(status
)){
3165 dataerrln("Construct RegexMatcher() error - %s", u_errorName(status
));
3171 // Loop over the test data file, once per line.
3173 while (lineMat
.find()) {
3175 if (U_FAILURE(status
)) {
3176 errln("%s:%d: ICU Error \"%s\"", srcPath
, lineNum
, u_errorName(status
));
3179 status
= U_ZERO_ERROR
;
3180 UnicodeString testLine
= lineMat
.group(1, status
);
3181 if (testLine
.length() == 0) {
3186 // Parse the test line. Skip blank and comment only lines.
3187 // Separate out the three main fields - pattern, flags, target.
3190 commentMat
.reset(testLine
);
3191 if (commentMat
.lookingAt(status
)) {
3192 // This line is a comment, or blank.
3197 // Pull out the pattern field, remove it from the test file line.
3199 quotedStuffMat
.reset(testLine
);
3200 if (quotedStuffMat
.lookingAt(status
)) {
3201 testPattern
= quotedStuffMat
.group(2, status
);
3202 testLine
.remove(0, quotedStuffMat
.end(0, status
));
3204 errln("Bad pattern (missing quotes?) at %s:%d", srcPath
, lineNum
);
3210 // Pull out the flags from the test file line.
3212 flagsMat
.reset(testLine
);
3213 flagsMat
.lookingAt(status
); // Will always match, possibly an empty string.
3214 testFlags
= flagsMat
.group(1, status
);
3215 if (flagsMat
.group(2, status
).length() > 0) {
3216 errln("Bad Match flag at line %d. Scanning %c\n",
3217 lineNum
, flagsMat
.group(2, status
).charAt(0));
3220 testLine
.remove(0, flagsMat
.end(0, status
));
3223 // Pull out the match string, as a whole.
3224 // We'll process the <tags> later.
3226 quotedStuffMat
.reset(testLine
);
3227 if (quotedStuffMat
.lookingAt(status
)) {
3228 matchString
= quotedStuffMat
.group(2, status
);
3229 testLine
.remove(0, quotedStuffMat
.end(0, status
));
3231 errln("Bad match string at test file line %d", lineNum
);
3236 // The only thing left from the input line should be an optional trailing comment.
3238 commentMat
.reset(testLine
);
3239 if (commentMat
.lookingAt(status
) == FALSE
) {
3240 errln("Line %d: unexpected characters at end of test line.", lineNum
);
3247 regex_find(testPattern
, testFlags
, matchString
, srcPath
, lineNum
);
3256 //---------------------------------------------------------------------------
3258 // regex_find(pattern, flags, inputString, lineNumber)
3260 // Function to run a single test from the Extended (data driven) tests.
3261 // See file test/testdata/regextst.txt for a description of the
3262 // pattern and inputString fields, and the allowed flags.
3263 // lineNumber is the source line in regextst.txt of the test.
3265 //---------------------------------------------------------------------------
//  Set a value into a UVector at the position specified by a decimal number in
//  a UnicodeString. This is a utility function needed by the actual test function below.
3271 static void set(UVector
&vec
, int32_t val
, UnicodeString index
) {
3272 UErrorCode status
=U_ZERO_ERROR
;
3274 for (int32_t i
=0; i
<index
.length(); i
++) {
3275 int32_t d
=u_charDigitValue(index
.charAt(i
));
3279 while (vec
.size()<idx
+1) {vec
.addElement(-1, status
);}
3280 vec
.setElementAt(val
, idx
);
3283 static void setInt(UVector
&vec
, int32_t val
, int32_t idx
) {
3284 UErrorCode status
=U_ZERO_ERROR
;
3285 while (vec
.size()<idx
+1) {vec
.addElement(-1, status
);}
3286 vec
.setElementAt(val
, idx
);
3289 static UBool
utextOffsetToNative(UText
*utext
, int32_t unistrOffset
, int32_t& nativeIndex
)
3291 UBool couldFind
= TRUE
;
3292 UTEXT_SETNATIVEINDEX(utext
, 0);
3294 while (i
< unistrOffset
) {
3295 UChar32 c
= UTEXT_NEXT32(utext
);
3296 if (c
!= U_SENTINEL
) {
3303 nativeIndex
= (int32_t)UTEXT_GETNATIVEINDEX(utext
);
3308 void RegexTest::regex_find(const UnicodeString
&pattern
,
3309 const UnicodeString
&flags
,
3310 const UnicodeString
&inputString
,
3311 const char *srcPath
,
3313 UnicodeString unEscapedInput
;
3314 UnicodeString deTaggedInput
;
3316 int32_t patternUTF8Length
, inputUTF8Length
;
3317 char *patternChars
= NULL
, *inputChars
= NULL
;
3318 UText patternText
= UTEXT_INITIALIZER
;
3319 UText inputText
= UTEXT_INITIALIZER
;
3320 UConverter
*UTF8Converter
= NULL
;
3322 UErrorCode status
= U_ZERO_ERROR
;
3324 RegexPattern
*parsePat
= NULL
;
3325 RegexMatcher
*parseMatcher
= NULL
;
3326 RegexPattern
*callerPattern
= NULL
, *UTF8Pattern
= NULL
;
3327 RegexMatcher
*matcher
= NULL
, *UTF8Matcher
= NULL
;
3328 UVector
groupStarts(status
);
3329 UVector
groupEnds(status
);
3330 UVector
groupStartsUTF8(status
);
3331 UVector
groupEndsUTF8(status
);
3332 UBool isMatch
= FALSE
, isUTF8Match
= FALSE
;
3333 UBool failed
= FALSE
;
3336 UBool useMatchesFunc
= FALSE
;
3337 UBool useLookingAtFunc
= FALSE
;
3338 int32_t regionStart
= -1;
3339 int32_t regionEnd
= -1;
3340 int32_t regionStartUTF8
= -1;
3341 int32_t regionEndUTF8
= -1;
3345 // Compile the caller's pattern
3347 uint32_t bflags
= 0;
3348 if (flags
.indexOf((UChar
)0x69) >= 0) { // 'i' flag
3349 bflags
|= UREGEX_CASE_INSENSITIVE
;
3351 if (flags
.indexOf((UChar
)0x78) >= 0) { // 'x' flag
3352 bflags
|= UREGEX_COMMENTS
;
3354 if (flags
.indexOf((UChar
)0x73) >= 0) { // 's' flag
3355 bflags
|= UREGEX_DOTALL
;
3357 if (flags
.indexOf((UChar
)0x6d) >= 0) { // 'm' flag
3358 bflags
|= UREGEX_MULTILINE
;
3361 if (flags
.indexOf((UChar
)0x65) >= 0) { // 'e' flag
3362 bflags
|= UREGEX_ERROR_ON_UNKNOWN_ESCAPES
;
3364 if (flags
.indexOf((UChar
)0x44) >= 0) { // 'D' flag
3365 bflags
|= UREGEX_UNIX_LINES
;
3367 if (flags
.indexOf((UChar
)0x51) >= 0) { // 'Q' flag
3368 bflags
|= UREGEX_LITERAL
;
3372 callerPattern
= RegexPattern::compile(pattern
, bflags
, pe
, status
);
3373 if (status
!= U_ZERO_ERROR
) {
3374 #if UCONFIG_NO_BREAK_ITERATION==1
3375 // 'v' test flag means that the test pattern should not compile if ICU was configured
3376 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3377 if (flags
.indexOf((UChar
)0x76) >= 0 /*'v'*/ && status
== U_UNSUPPORTED_ERROR
) {
3378 goto cleanupAndReturn
;
3381 if (flags
.indexOf((UChar
)0x45) >= 0) { // flags contain 'E'
3382 // Expected pattern compilation error.
3383 if (flags
.indexOf((UChar
)0x64) >= 0) { // flags contain 'd'
3384 logln("Pattern Compile returns \"%s\"", u_errorName(status
));
3386 goto cleanupAndReturn
;
3388 // Unexpected pattern compilation error.
3389 dataerrln("Line %d: error %s compiling pattern.", line
, u_errorName(status
));
3390 goto cleanupAndReturn
;
3394 UTF8Converter
= ucnv_open("UTF8", &status
);
3395 ucnv_setFromUCallBack(UTF8Converter
, UCNV_FROM_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &status
);
3397 patternUTF8Length
= pattern
.extract(NULL
, 0, UTF8Converter
, status
);
3398 status
= U_ZERO_ERROR
; // buffer overflow
3399 patternChars
= new char[patternUTF8Length
+1];
3400 pattern
.extract(patternChars
, patternUTF8Length
+1, UTF8Converter
, status
);
3401 utext_openUTF8(&patternText
, patternChars
, patternUTF8Length
, &status
);
3403 if (status
== U_ZERO_ERROR
) {
3404 UTF8Pattern
= RegexPattern::compile(&patternText
, bflags
, pe
, status
);
3406 if (status
!= U_ZERO_ERROR
) {
3407 #if UCONFIG_NO_BREAK_ITERATION==1
3408 // 'v' test flag means that the test pattern should not compile if ICU was configured
3409 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3410 if (flags
.indexOf((UChar
)0x76) >= 0 /*'v'*/ && status
== U_UNSUPPORTED_ERROR
) {
3411 goto cleanupAndReturn
;
3414 if (flags
.indexOf((UChar
)0x45) >= 0) { // flags contain 'E'
3415 // Expected pattern compilation error.
3416 if (flags
.indexOf((UChar
)0x64) >= 0) { // flags contain 'd'
3417 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status
));
3419 goto cleanupAndReturn
;
3421 // Unexpected pattern compilation error.
3422 errln("Line %d: error %s compiling pattern. (UTF8)", line
, u_errorName(status
));
3423 goto cleanupAndReturn
;
3428 if (UTF8Pattern
== NULL
) {
3429 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3430 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath
, line
);
3431 status
= U_ZERO_ERROR
;
3434 if (flags
.indexOf((UChar
)0x64) >= 0) { // 'd' flag
3435 callerPattern
->dumpPattern();
3438 if (flags
.indexOf((UChar
)0x45) >= 0) { // 'E' flag
3439 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath
, line
);
3440 goto cleanupAndReturn
;
3445 // Number of times find() should be called on the test string, default to 1
3448 for (i
=2; i
<=9; i
++) {
3449 if (flags
.indexOf((UChar
)(0x30 + i
)) >= 0) { // digit flag
3450 if (numFinds
!= 1) {
3451 errln("Line %d: more than one digit flag. Scanning %d.", line
, i
);
3452 goto cleanupAndReturn
;
3458 // 'M' flag. Use matches() instead of find()
3459 if (flags
.indexOf((UChar
)0x4d) >= 0) {
3460 useMatchesFunc
= TRUE
;
3462 if (flags
.indexOf((UChar
)0x4c) >= 0) {
3463 useLookingAtFunc
= TRUE
;
3467 // Find the tags in the input data, remove them, and record the group boundary
3470 parsePat
= RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe
, status
);
3471 REGEX_CHECK_STATUS_L(line
);
3473 unEscapedInput
= inputString
.unescape();
3474 parseMatcher
= parsePat
->matcher(unEscapedInput
, status
);
3475 REGEX_CHECK_STATUS_L(line
);
3476 while(parseMatcher
->find()) {
3477 parseMatcher
->appendReplacement(deTaggedInput
, "", status
);
3479 UnicodeString groupNum
= parseMatcher
->group(2, status
);
3480 if (groupNum
== "r") {
3481 // <r> or </r>, a region specification within the string
3482 if (parseMatcher
->group(1, status
) == "/") {
3483 regionEnd
= deTaggedInput
.length();
3485 regionStart
= deTaggedInput
.length();
3488 // <digits> or </digits>, a group match boundary tag.
3489 if (parseMatcher
->group(1, status
) == "/") {
3490 set(groupEnds
, deTaggedInput
.length(), groupNum
);
3492 set(groupStarts
, deTaggedInput
.length(), groupNum
);
3496 parseMatcher
->appendTail(deTaggedInput
);
3497 REGEX_ASSERT_L(groupStarts
.size() == groupEnds
.size(), line
);
3498 if ((regionStart
>=0 || regionEnd
>=0) && (regionStart
<0 || regionStart
>regionEnd
)) {
3499 errln("mismatched <r> tags");
3501 goto cleanupAndReturn
;
3505 // Configure the matcher according to the flags specified with this test.
3507 matcher
= callerPattern
->matcher(deTaggedInput
, status
);
3508 REGEX_CHECK_STATUS_L(line
);
3509 if (flags
.indexOf((UChar
)0x74) >= 0) { // 't' trace flag
3510 matcher
->setTrace(TRUE
);
3513 if (UTF8Pattern
!= NULL
) {
3514 inputUTF8Length
= deTaggedInput
.extract(NULL
, 0, UTF8Converter
, status
);
3515 status
= U_ZERO_ERROR
; // buffer overflow
3516 inputChars
= new char[inputUTF8Length
+1];
3517 deTaggedInput
.extract(inputChars
, inputUTF8Length
+1, UTF8Converter
, status
);
3518 utext_openUTF8(&inputText
, inputChars
, inputUTF8Length
, &status
);
3520 if (status
== U_ZERO_ERROR
) {
3521 UTF8Matcher
= &UTF8Pattern
->matcher(status
)->reset(&inputText
);
3522 REGEX_CHECK_STATUS_L(line
);
3525 if (UTF8Matcher
== NULL
) {
3526 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3527 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath
, line
);
3528 status
= U_ZERO_ERROR
;
3533 // Generate native indices for UTF8 versions of region and capture group info
3535 if (UTF8Matcher
!= NULL
) {
3536 if (flags
.indexOf((UChar
)0x74) >= 0) { // 't' trace flag
3537 UTF8Matcher
->setTrace(TRUE
);
3539 if (regionStart
>=0) (void) utextOffsetToNative(&inputText
, regionStart
, regionStartUTF8
);
3540 if (regionEnd
>=0) (void) utextOffsetToNative(&inputText
, regionEnd
, regionEndUTF8
);
3542 // Fill out the native index UVector info.
3543 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3544 for (i
=0; i
<groupStarts
.size(); i
++) {
3545 int32_t start
= groupStarts
.elementAti(i
);
3546 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3549 if (!utextOffsetToNative(&inputText
, start
, startUTF8
)) {
3550 errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line
, i
, start
);
3552 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3554 setInt(groupStartsUTF8
, startUTF8
, i
);
3557 int32_t end
= groupEnds
.elementAti(i
);
3558 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3561 if (!utextOffsetToNative(&inputText
, end
, endUTF8
)) {
3562 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line
, i
, end
);
3564 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3566 setInt(groupEndsUTF8
, endUTF8
, i
);
3571 if (regionStart
>=0) {
3572 matcher
->region(regionStart
, regionEnd
, status
);
3573 REGEX_CHECK_STATUS_L(line
);
3574 if (UTF8Matcher
!= NULL
) {
3575 UTF8Matcher
->region(regionStartUTF8
, regionEndUTF8
, status
);
3576 REGEX_CHECK_STATUS_L(line
);
3579 if (flags
.indexOf((UChar
)0x61) >= 0) { // 'a' anchoring bounds flag
3580 matcher
->useAnchoringBounds(FALSE
);
3581 if (UTF8Matcher
!= NULL
) {
3582 UTF8Matcher
->useAnchoringBounds(FALSE
);
3585 if (flags
.indexOf((UChar
)0x62) >= 0) { // 'b' transparent bounds flag
3586 matcher
->useTransparentBounds(TRUE
);
3587 if (UTF8Matcher
!= NULL
) {
3588 UTF8Matcher
->useTransparentBounds(TRUE
);
3595 // Do a find on the de-tagged input using the caller's pattern
3596 // TODO: error on count>1 and not find().
3597 // error on both matches() and lookingAt().
3599 for (i
=0; i
<numFinds
; i
++) {
3600 if (useMatchesFunc
) {
3601 isMatch
= matcher
->matches(status
);
3602 if (UTF8Matcher
!= NULL
) {
3603 isUTF8Match
= UTF8Matcher
->matches(status
);
3605 } else if (useLookingAtFunc
) {
3606 isMatch
= matcher
->lookingAt(status
);
3607 if (UTF8Matcher
!= NULL
) {
3608 isUTF8Match
= UTF8Matcher
->lookingAt(status
);
3611 isMatch
= matcher
->find();
3612 if (UTF8Matcher
!= NULL
) {
3613 isUTF8Match
= UTF8Matcher
->find();
3617 matcher
->setTrace(FALSE
);
3619 UTF8Matcher
->setTrace(FALSE
);
3621 if (U_FAILURE(status
)) {
3622 errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status
));
3626 // Match up the groups from the find() with the groups from the tags
3629 // number of tags should match number of groups from find operation.
3630 // matcher->groupCount does not include group 0, the entire match, hence the +1.
3631 // G option in test means that capture group data is not available in the
3632 // expected results, so the check needs to be suppressed.
3633 if (isMatch
== FALSE
&& groupStarts
.size() != 0) {
3634 dataerrln("Error at line %d: Match expected, but none found.", line
);
3636 goto cleanupAndReturn
;
3637 } else if (UTF8Matcher
!= NULL
&& isUTF8Match
== FALSE
&& groupStarts
.size() != 0) {
3638 errln("Error at line %d: Match expected, but none found. (UTF8)", line
);
3640 goto cleanupAndReturn
;
3642 if (isMatch
&& groupStarts
.size() == 0) {
3643 errln("Error at line %d: No match expected, but one found at position %d.", line
, matcher
->start(status
));
3646 if (UTF8Matcher
&& isUTF8Match
&& groupStarts
.size() == 0) {
3647 errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line
, UTF8Matcher
->start(status
));
3651 if (flags
.indexOf((UChar
)0x47 /*G*/) >= 0) {
3652 // Only check for match / no match. Don't check capture groups.
3653 goto cleanupAndReturn
;
3656 REGEX_CHECK_STATUS_L(line
);
3657 for (i
=0; i
<=matcher
->groupCount(); i
++) {
3658 int32_t expectedStart
= (i
>= groupStarts
.size()? -1 : groupStarts
.elementAti(i
));
3659 int32_t expectedStartUTF8
= (i
>= groupStartsUTF8
.size()? -1 : groupStartsUTF8
.elementAti(i
));
3660 if (matcher
->start(i
, status
) != expectedStart
) {
3661 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
3662 line
, i
, expectedStart
, matcher
->start(i
, status
));
3664 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3665 } else if (UTF8Matcher
!= NULL
&& UTF8Matcher
->start(i
, status
) != expectedStartUTF8
) {
3666 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
3667 line
, i
, expectedStartUTF8
, UTF8Matcher
->start(i
, status
));
3669 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3672 int32_t expectedEnd
= (i
>= groupEnds
.size()? -1 : groupEnds
.elementAti(i
));
3673 int32_t expectedEndUTF8
= (i
>= groupEndsUTF8
.size()? -1 : groupEndsUTF8
.elementAti(i
));
3674 if (matcher
->end(i
, status
) != expectedEnd
) {
3675 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
3676 line
, i
, expectedEnd
, matcher
->end(i
, status
));
3678 // Error on end position; keep going; real error is probably yet to come as group
3679 // end positions work from end of the input data towards the front.
3680 } else if (UTF8Matcher
!= NULL
&& UTF8Matcher
->end(i
, status
) != expectedEndUTF8
) {
3681 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
3682 line
, i
, expectedEndUTF8
, UTF8Matcher
->end(i
, status
));
3684 // Error on end position; keep going; real error is probably yet to come as group
3685 // end positions work from end of the input data towards the front.
3688 if ( matcher
->groupCount()+1 < groupStarts
.size()) {
3689 errln("Error at line %d: Expected %d capture groups, found %d.",
3690 line
, groupStarts
.size()-1, matcher
->groupCount());
3693 else if (UTF8Matcher
!= NULL
&& UTF8Matcher
->groupCount()+1 < groupStarts
.size()) {
3694 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3695 line
, groupStarts
.size()-1, UTF8Matcher
->groupCount());
3699 if ((flags
.indexOf((UChar
)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3700 matcher
->requireEnd() == TRUE
) {
3701 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line
);
3703 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3704 UTF8Matcher
->requireEnd() == TRUE
) {
3705 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line
);
3709 if ((flags
.indexOf((UChar
)0x79) >= 0) && // 'y' flag: RequireEnd() == true
3710 matcher
->requireEnd() == FALSE
) {
3711 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line
);
3713 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x79) >= 0) && // 'Y' flag: RequireEnd() == false
3714 UTF8Matcher
->requireEnd() == FALSE
) {
3715 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line
);
3719 if ((flags
.indexOf((UChar
)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3720 matcher
->hitEnd() == TRUE
) {
3721 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line
);
3723 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3724 UTF8Matcher
->hitEnd() == TRUE
) {
3725 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line
);
3729 if ((flags
.indexOf((UChar
)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3730 matcher
->hitEnd() == FALSE
) {
3731 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line
);
3733 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3734 UTF8Matcher
->hitEnd() == FALSE
) {
3735 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line
);
3742 infoln((UnicodeString
)"\""+pattern
+(UnicodeString
)"\" "
3743 +flags
+(UnicodeString
)" \""+inputString
+(UnicodeString
)"\"");
3744 // callerPattern->dump();
3746 delete parseMatcher
;
3751 delete callerPattern
;
3753 utext_close(&inputText
);
3754 delete[] inputChars
;
3755 utext_close(&patternText
);
3756 delete[] patternChars
;
3757 ucnv_close(UTF8Converter
);
3763 //---------------------------------------------------------------------------
3765 // Errors Check for error handling in patterns.
3767 //---------------------------------------------------------------------------
3768 void RegexTest::Errors() {
3769 // \escape sequences that aren't implemented yet.
3770 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3772 // Missing close parentheses
3773 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN
);
3774 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN
);
3775 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN
);
3777 // Extra close paren
3778 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN
);
3779 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN
);
3780 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN
);
3782 // Look-ahead, Look-behind
3783 // TODO: add tests for unbounded length look-behinds.
3784 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX
); // illegal construct
3786 // Attempt to use non-default flags
3789 UErrorCode status
= U_ZERO_ERROR
;
3790 int32_t flags
= UREGEX_CANON_EQ
|
3791 UREGEX_COMMENTS
| UREGEX_DOTALL
|
3793 RegexPattern
*pat1
= RegexPattern::compile(".*", flags
, pe
, status
);
3794 REGEX_ASSERT(status
== U_REGEX_UNIMPLEMENTED
);
3799 // Quantifiers are allowed only after something that can be quantified.
3800 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX
);
3801 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX
);
3802 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX
);
3804 // Mal-formed {min,max} quantifiers
3805 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL
);
3806 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN
);
3807 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL
);
3808 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL
);
3809 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL
);
3810 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG
);
3811 REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG
); // Overflows int during scan
3812 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG
); // Overflows regex binary format
3813 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG
);
3816 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX
);
3818 // Invalid Back Reference \0
3819 // For ICU 3.8 and earlier
3820 // For ICU versions newer than 3.8, \0 introduces an octal escape.
3822 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE
);
3827 //-------------------------------------------------------------------------------
3829 // Read a text data file, convert it to UChars, and return the data
3830 // in one big UChar * buffer, which the caller must delete.
3832 //--------------------------------------------------------------------------------
3833 UChar
*RegexTest::ReadAndConvertFile(const char *fileName
, int32_t &ulen
,
3834 const char *defEncoding
, UErrorCode
&status
) {
3835 UChar
*retPtr
= NULL
;
3836 char *fileBuf
= NULL
;
3837 UConverter
* conv
= NULL
;
3841 if (U_FAILURE(status
)) {
3848 f
= fopen(fileName
, "rb");
3850 dataerrln("Error opening test data file %s\n", fileName
);
3851 status
= U_FILE_ACCESS_ERROR
;
3860 fseek( f
, 0, SEEK_END
);
3861 fileSize
= ftell(f
);
3862 fileBuf
= new char[fileSize
];
3863 fseek(f
, 0, SEEK_SET
);
3864 amt_read
= fread(fileBuf
, 1, fileSize
, f
);
3865 if (amt_read
!= fileSize
|| fileSize
<= 0) {
3866 errln("Error reading test data file.");
3867 goto cleanUpAndReturn
;
3871 // Look for a Unicode Signature (BOM) on the data just read
3873 int32_t signatureLength
;
3874 const char * fileBufC
;
3875 const char* encoding
;
3878 encoding
= ucnv_detectUnicodeSignature(
3879 fileBuf
, fileSize
, &signatureLength
, &status
);
3880 if(encoding
!=NULL
){
3881 fileBufC
+= signatureLength
;
3882 fileSize
-= signatureLength
;
3884 encoding
= defEncoding
;
3885 if (strcmp(encoding
, "utf-8") == 0) {
3886 errln("file %s is missing its BOM", fileName
);
3891 // Open a converter to take the rule file to UTF-16
3893 conv
= ucnv_open(encoding
, &status
);
3894 if (U_FAILURE(status
)) {
3895 goto cleanUpAndReturn
;
3899 // Convert the rules to UChar.
3900 // Preflight first to determine required buffer size.
3902 ulen
= ucnv_toUChars(conv
,
3908 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
3909 // Buffer Overflow is expected from the preflight operation.
3910 status
= U_ZERO_ERROR
;
3912 retPtr
= new UChar
[ulen
+1];
3925 if (U_FAILURE(status
)) {
3926 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
3935 //-------------------------------------------------------------------------------
3937 // PerlTests - Run Perl's regular expression tests
3938 // The input file for this test is re_tests, the standard regular
3939 // expression test data distributed with the Perl source code.
3941 // Here is Perl's description of the test data file:
3943 // # The tests are in a separate file 't/op/re_tests'.
3944 // # Each line in that file is a separate test.
3945 // # There are five columns, separated by tabs.
3947 // # Column 1 contains the pattern, optionally enclosed in C<''>.
3948 // # Modifiers can be put after the closing C<'>.
3950 // # Column 2 contains the string to be matched.
3952 // # Column 3 contains the expected result:
3953 // # y expect a match
3954 // # n expect no match
3955 // # c expect an error
3956 // # B test exposes a known bug in Perl, should be skipped
3957 // # b test exposes a known bug in Perl, should be skipped if noamp
3959 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3961 // # Column 4 contains a string, usually C<$&>.
3963 // # Column 5 contains the expected result of double-quote
3964 // # interpolating that string after the match, or start of error message.
3966 // # Column 6, if present, contains a reason why the test is skipped.
3967 // # This is printed with "skipped", for harness to pick up.
3969 // # \n in the tests are interpolated, as are variables of the form ${\w+}.
3971 // # If you want to add a regular expression test that can't be expressed
3972 // # in this format, don't add it here: put it in op/pat.t instead.
//  For ICU, if field 3 contains an 'i', the test will be skipped.
//  The test exposes some known incompatibility between ICU and Perl regexps.
//  (The i is in addition to whatever was there before.)
3978 //-------------------------------------------------------------------------------
3979 void RegexTest::PerlTests() {
3981 const char *srcPath
;
3982 UErrorCode status
= U_ZERO_ERROR
;
3986 // Open and read the test data file.
3988 srcPath
=getPath(tdd
, "re_tests.txt");
3990 return; /* something went wrong, error already output */
3994 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, "iso-8859-1", status
);
3995 if (U_FAILURE(status
)) {
3996 return; /* something went wrong, error already output */
4000 // Put the test data into a UnicodeString
4002 UnicodeString
testDataString(FALSE
, testData
, len
);
4005 // Regex to break the input file into lines, and strip the new lines.
4006 // One line per match, capture group one is the desired data.
4008 RegexPattern
* linePat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe
, status
);
4009 if (U_FAILURE(status
)) {
4010 dataerrln("RegexPattern::compile() error");
4013 RegexMatcher
* lineMat
= linePat
->matcher(testDataString
, status
);
4016 // Regex to split a test file line into fields.
4017 // There are six fields, separated by tabs.
4019 RegexPattern
* fieldPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe
, status
);
4022 // Regex to identify test patterns with flag settings, and to separate them.
4023 // Test patterns with flags look like 'pattern'i
4024 // Test patterns without flags are not quoted: pattern
4025 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4027 RegexPattern
*flagPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe
, status
);
4028 RegexMatcher
* flagMat
= flagPat
->matcher(status
);
4031 // The Perl tests reference several perl-isms, which are evaluated/substituted
4032 // in the test data. Not being perl, this must be done explicitly. Here
4033 // are string constants and REs for these constructs.
4035 UnicodeString
nulnulSrc("${nulnul}");
4036 UnicodeString
nulnul("\\u0000\\u0000", -1, US_INV
);
4037 nulnul
= nulnul
.unescape();
4039 UnicodeString
ffffSrc("${ffff}");
4040 UnicodeString
ffff("\\uffff", -1, US_INV
);
4041 ffff
= ffff
.unescape();
4043 // regexp for $-[0], $+[2], etc.
4044 RegexPattern
*groupsPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe
, status
);
4045 RegexMatcher
*groupsMat
= groupsPat
->matcher(status
);
4047 // regexp for $0, $1, $2, etc.
4048 RegexPattern
*cgPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe
, status
);
4049 RegexMatcher
*cgMat
= cgPat
->matcher(status
);
4053 // Main Loop for the Perl Tests, runs once per line from the
4056 int32_t lineNum
= 0;
4057 int32_t skippedUnimplementedCount
= 0;
4058 while (lineMat
->find()) {
4062 // Get a line, break it into its fields, do the Perl
4063 // variable substitutions.
4065 UnicodeString line
= lineMat
->group(1, status
);
4066 UnicodeString fields
[7];
4067 fieldPat
->split(line
, fields
, 7, status
);
4069 flagMat
->reset(fields
[0]);
4070 flagMat
->matches(status
);
4071 UnicodeString pattern
= flagMat
->group(2, status
);
4072 pattern
.findAndReplace("${bang}", "!");
4073 pattern
.findAndReplace(nulnulSrc
, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4074 pattern
.findAndReplace(ffffSrc
, ffff
);
4077 // Identify patterns that include match flag settings,
4078 // split off the flags, remove the extra quotes.
4080 UnicodeString flagStr
= flagMat
->group(3, status
);
4081 if (U_FAILURE(status
)) {
4082 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
4086 const UChar UChar_c
= 0x63; // Char constants for the flag letters.
4087 const UChar UChar_i
= 0x69; // (Damn the lack of Unicode support in C)
4088 const UChar UChar_m
= 0x6d;
4089 const UChar UChar_x
= 0x78;
4090 const UChar UChar_y
= 0x79;
4091 if (flagStr
.indexOf(UChar_i
) != -1) {
4092 flags
|= UREGEX_CASE_INSENSITIVE
;
4094 if (flagStr
.indexOf(UChar_m
) != -1) {
4095 flags
|= UREGEX_MULTILINE
;
4097 if (flagStr
.indexOf(UChar_x
) != -1) {
4098 flags
|= UREGEX_COMMENTS
;
4102 // Compile the test pattern.
4104 status
= U_ZERO_ERROR
;
4105 RegexPattern
*testPat
= RegexPattern::compile(pattern
, flags
, pe
, status
);
4106 if (status
== U_REGEX_UNIMPLEMENTED
) {
4108 // Test of a feature that is planned for ICU, but not yet implemented.
4110 skippedUnimplementedCount
++;
4112 status
= U_ZERO_ERROR
;
4116 if (U_FAILURE(status
)) {
4117 // Some tests are supposed to generate errors.
4118 // Only report an error for tests that are supposed to succeed.
4119 if (fields
[2].indexOf(UChar_c
) == -1 && // Compilation is not supposed to fail AND
4120 fields
[2].indexOf(UChar_i
) == -1) // it's not an accepted ICU incompatibility
4122 errln("line %d: ICU Error \"%s\"\n", lineNum
, u_errorName(status
));
4124 status
= U_ZERO_ERROR
;
4129 if (fields
[2].indexOf(UChar_i
) >= 0) {
4130 // ICU should skip this test.
4135 if (fields
[2].indexOf(UChar_c
) >= 0) {
4136 // This pattern should have caused a compilation error, but didn't.
4137 errln("line %d: Expected a pattern compile error, got success.", lineNum
);
4143 // replace the Perl variables that appear in some of the
4144 // match data strings.
4146 UnicodeString matchString
= fields
[1];
4147 matchString
.findAndReplace(nulnulSrc
, nulnul
);
4148 matchString
.findAndReplace(ffffSrc
, ffff
);
4150 // Replace any \n in the match string with an actual new-line char.
4151 // Don't do full unescape, as this unescapes more than Perl does, which
4152 // causes other spurious failures in the tests.
4153 matchString
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4158 // Run the test, check for expected match/don't match result.
4160 RegexMatcher
*testMat
= testPat
->matcher(matchString
, status
);
4161 UBool found
= testMat
->find();
4162 UBool expected
= FALSE
;
4163 if (fields
[2].indexOf(UChar_y
) >=0) {
4166 if (expected
!= found
) {
4167 errln("line %d: Expected %smatch, got %smatch",
4168 lineNum
, expected
?"":"no ", found
?"":"no " );
4172 // Don't try to check expected results if there is no match.
4173 // (Some have stuff in the expected fields)
4181 // Interpret the Perl expression from the fourth field of the data file,
4182 // building up an ICU string from the results of the ICU match.
4183 // The Perl expression will contain references to the results of
4184 // a regex match, including the matched string, capture group strings,
4185 // group starting and ending indices, etc.
4187 UnicodeString resultString
;
4188 UnicodeString perlExpr
= fields
[3];
4189 #if SUPPORT_MUTATING_INPUT_STRING
4190 groupsMat
->reset(perlExpr
);
4191 cgMat
->reset(perlExpr
);
4194 while (perlExpr
.length() > 0) {
4195 #if !SUPPORT_MUTATING_INPUT_STRING
4196 // Preferred usage. Reset after any modification to input string.
4197 groupsMat
->reset(perlExpr
);
4198 cgMat
->reset(perlExpr
);
4201 if (perlExpr
.startsWith("$&")) {
4202 resultString
.append(testMat
->group(status
));
4203 perlExpr
.remove(0, 2);
4206 else if (groupsMat
->lookingAt(status
)) {
4208 UnicodeString digitString
= groupsMat
->group(2, status
);
4210 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4211 UnicodeString plusOrMinus
= groupsMat
->group(1, status
);
4212 int32_t matchPosition
;
4213 if (plusOrMinus
.compare("+") == 0) {
4214 matchPosition
= testMat
->end(groupNum
, status
);
4216 matchPosition
= testMat
->start(groupNum
, status
);
4218 if (matchPosition
!= -1) {
4219 ICU_Utility::appendNumber(resultString
, matchPosition
);
4221 perlExpr
.remove(0, groupsMat
->end(status
));
4224 else if (cgMat
->lookingAt(status
)) {
4226 UnicodeString digitString
= cgMat
->group(1, status
);
4228 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4229 if (U_SUCCESS(status
)) {
4230 resultString
.append(testMat
->group(groupNum
, status
));
4231 status
= U_ZERO_ERROR
;
4233 perlExpr
.remove(0, cgMat
->end(status
));
4236 else if (perlExpr
.startsWith("@-")) {
4238 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4240 resultString
.append(" ");
4242 ICU_Utility::appendNumber(resultString
, testMat
->start(i
, status
));
4244 perlExpr
.remove(0, 2);
4247 else if (perlExpr
.startsWith("@+")) {
4249 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4251 resultString
.append(" ");
4253 ICU_Utility::appendNumber(resultString
, testMat
->end(i
, status
));
4255 perlExpr
.remove(0, 2);
4258 else if (perlExpr
.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4259 // or as an escaped sequence (e.g. \n)
4260 if (perlExpr
.length() > 1) {
4261 perlExpr
.remove(0, 1); // Remove the '\', but only if not last char.
4263 UChar c
= perlExpr
.charAt(0);
4265 case 'n': c
= '\n'; break;
4266 // add any other escape sequences that show up in the test expected results.
4268 resultString
.append(c
);
4269 perlExpr
.remove(0, 1);
4273 // Any characters from the perl expression that we don't explicitly
4274 // recognize before here are assumed to be literals and copied
4275 // as-is to the expected results.
4276 resultString
.append(perlExpr
.charAt(0));
4277 perlExpr
.remove(0, 1);
4280 if (U_FAILURE(status
)) {
4281 errln("Line %d: ICU Error \"%s\"", lineNum
, u_errorName(status
));
4287 // Expected Results Compare
4289 UnicodeString
expectedS(fields
[4]);
4290 expectedS
.findAndReplace(nulnulSrc
, nulnul
);
4291 expectedS
.findAndReplace(ffffSrc
, ffff
);
4292 expectedS
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4295 if (expectedS
.compare(resultString
) != 0) {
4296 err("Line %d: Incorrect perl expression results.", lineNum
);
4297 infoln((UnicodeString
)"Expected \""+expectedS
+(UnicodeString
)"\"; got \""+resultString
+(UnicodeString
)"\"");
4305 // All done. Clean up allocated stuff.
4323 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount
);
4328 //-------------------------------------------------------------------------------
4330 // PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts
4331 // (instead of using UnicodeStrings) to test the alternate engine.
4332 // The input file for this test is re_tests, the standard regular
4333 // expression test data distributed with the Perl source code.
4334 // See PerlTests() for more information.
4336 //-------------------------------------------------------------------------------
// Runs the Perl regular-expression test data (re_tests.txt) through the
// UText-based regex API: every pattern and every input string is converted to
// UTF-8 and wrapped in a UText before being compiled / matched.
// Takes no arguments and returns nothing; problems are reported through
// errln()/err()/dataerrln(), and the number of tests skipped for
// U_REGEX_UNIMPLEMENTED is logged at the end.
4337 void RegexTest::PerlTestsUTF8() {
4339 const char *srcPath
;
4340 UErrorCode status
= U_ZERO_ERROR
;
4342 LocalUConverterPointer
UTF8Converter(ucnv_open("UTF-8", &status
));
// UTF-8 scratch buffers for the pattern and the input. They are reused across
// test lines, grown on demand, and released with delete[] at the end.
4343 UText patternText
= UTEXT_INITIALIZER
;
4344 char *patternChars
= NULL
;
4345 int32_t patternLength
;
4346 int32_t patternCapacity
= 0;
4347 UText inputText
= UTEXT_INITIALIZER
;
4348 char *inputChars
= NULL
;
4349 int32_t inputLength
;
4350 int32_t inputCapacity
= 0;
4352 ucnv_setFromUCallBack(UTF8Converter
.getAlias(), UCNV_FROM_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &status
);
4355 // Open and read the test data file.
4357 srcPath
=getPath(tdd
, "re_tests.txt");
4359 return; /* something went wrong, error already output */
4363 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, "iso-8859-1", status
);
4364 if (U_FAILURE(status
)) {
4365 return; /* something went wrong, error already output */
4369 // Put the test data into a UnicodeString
4371 UnicodeString
testDataString(FALSE
, testData
, len
);
4374 // Regex to break the input file into lines, and strip the new lines.
4375 // One line per match, capture group one is the desired data.
4377 RegexPattern
* linePat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe
, status
);
4378 if (U_FAILURE(status
)) {
4379 dataerrln("RegexPattern::compile() error");
4382 RegexMatcher
* lineMat
= linePat
->matcher(testDataString
, status
);
4385 // Regex to split a test file line into fields.
4386 // There are six fields, separated by tabs.
4388 RegexPattern
* fieldPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe
, status
);
4391 // Regex to identify test patterns with flag settings, and to separate them.
4392 // Test patterns with flags look like 'pattern'i
4393 // Test patterns without flags are not quoted: pattern
4394 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4396 RegexPattern
*flagPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe
, status
);
4397 RegexMatcher
* flagMat
= flagPat
->matcher(status
);
4400 // The Perl tests reference several perl-isms, which are evaluated/substituted
4401 // in the test data. Not being perl, this must be done explicitly. Here
4402 // are string constants and REs for these constructs.
4404 UnicodeString
nulnulSrc("${nulnul}");
4405 UnicodeString
nulnul("\\u0000\\u0000", -1, US_INV
);
4406 nulnul
= nulnul
.unescape();
4408 UnicodeString
ffffSrc("${ffff}");
4409 UnicodeString
ffff("\\uffff", -1, US_INV
);
4410 ffff
= ffff
.unescape();
4412 // regexp for $-[0], $+[2], etc.
4413 RegexPattern
*groupsPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe
, status
);
4414 RegexMatcher
*groupsMat
= groupsPat
->matcher(status
);
4416 // regexp for $0, $1, $2, etc.
4417 RegexPattern
*cgPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe
, status
);
4418 RegexMatcher
*cgMat
= cgPat
->matcher(status
);
4422 // Main Loop for the Perl Tests, runs once per line from the
4425 int32_t lineNum
= 0;
4426 int32_t skippedUnimplementedCount
= 0;
4427 while (lineMat
->find()) {
4431 // Get a line, break it into its fields, do the Perl
4432 // variable substitutions.
4434 UnicodeString line
= lineMat
->group(1, status
);
4435 UnicodeString fields
[7];
4436 fieldPat
->split(line
, fields
, 7, status
);
4438 flagMat
->reset(fields
[0]);
4439 flagMat
->matches(status
);
4440 UnicodeString pattern
= flagMat
->group(2, status
);
4441 pattern
.findAndReplace("${bang}", "!");
// In the pattern, ${nulnul} is replaced by the escaped text \u0000\u0000
// rather than by the unescaped 'nulnul' string used for match data below.
4442 pattern
.findAndReplace(nulnulSrc
, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4443 pattern
.findAndReplace(ffffSrc
, ffff
);
4446 // Identify patterns that include match flag settings,
4447 // split off the flags, remove the extra quotes.
4449 UnicodeString flagStr
= flagMat
->group(3, status
);
4450 if (U_FAILURE(status
)) {
// Report the failing data line; the status here comes from the regex
// operations used to parse the line, not from a converter call.
4451 errln("line %d: ICU Error \"%s\"\n", lineNum
, u_errorName(status
));
4455 const UChar UChar_c
= 0x63; // Char constants for the flag letters.
4456 const UChar UChar_i
= 0x69; // (Damn the lack of Unicode support in C)
4457 const UChar UChar_m
= 0x6d;
4458 const UChar UChar_x
= 0x78;
4459 const UChar UChar_y
= 0x79;
4460 if (flagStr
.indexOf(UChar_i
) != -1) {
4461 flags
|= UREGEX_CASE_INSENSITIVE
;
4463 if (flagStr
.indexOf(UChar_m
) != -1) {
4464 flags
|= UREGEX_MULTILINE
;
4466 if (flagStr
.indexOf(UChar_x
) != -1) {
4467 flags
|= UREGEX_COMMENTS
;
4471 // Put the pattern in a UTF-8 UText
// The first extract() also acts as a size probe: if the current buffer is too
// small it reports U_BUFFER_OVERFLOW_ERROR, and the buffer is then reallocated
// to the returned length + 1 and the extraction is repeated.
4473 status
= U_ZERO_ERROR
;
4474 patternLength
= pattern
.extract(patternChars
, patternCapacity
, UTF8Converter
.getAlias(), status
);
4475 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
4476 status
= U_ZERO_ERROR
;
4477 delete[] patternChars
;
4478 patternCapacity
= patternLength
+ 1;
4479 patternChars
= new char[patternCapacity
];
4480 pattern
.extract(patternChars
, patternCapacity
, UTF8Converter
.getAlias(), status
);
4482 utext_openUTF8(&patternText
, patternChars
, patternLength
, &status
);
4485 // Compile the test pattern.
4487 RegexPattern
*testPat
= RegexPattern::compile(&patternText
, flags
, pe
, status
);
4488 if (status
== U_REGEX_UNIMPLEMENTED
) {
4490 // Test of a feature that is planned for ICU, but not yet implemented.
4492 skippedUnimplementedCount
++;
4494 status
= U_ZERO_ERROR
;
4498 if (U_FAILURE(status
)) {
4499 // Some tests are supposed to generate errors.
4500 // Only report an error for tests that are supposed to succeed.
4501 if (fields
[2].indexOf(UChar_c
) == -1 && // Compilation is not supposed to fail AND
4502 fields
[2].indexOf(UChar_i
) == -1) // it's not an accepted ICU incompatibility
4504 errln("line %d: ICU Error \"%s\"\n", lineNum
, u_errorName(status
));
4506 status
= U_ZERO_ERROR
;
4511 if (fields
[2].indexOf(UChar_i
) >= 0) {
4512 // ICU should skip this test.
4517 if (fields
[2].indexOf(UChar_c
) >= 0) {
4518 // This pattern should have caused a compilation error, but didn't.
4519 errln("line %d: Expected a pattern compile error, got success.", lineNum
);
4526 // replace the Perl variables that appear in some of the
4527 // match data strings.
4529 UnicodeString matchString
= fields
[1];
4530 matchString
.findAndReplace(nulnulSrc
, nulnul
);
4531 matchString
.findAndReplace(ffffSrc
, ffff
);
4533 // Replace any \n in the match string with an actual new-line char.
4534 // Don't do full unescape, as this unescapes more than Perl does, which
4535 // causes other spurious failures in the tests.
4536 matchString
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4539 // Put the input in a UTF-8 UText
// Same probe-then-grow extraction as used for the pattern above.
4541 status
= U_ZERO_ERROR
;
4542 inputLength
= matchString
.extract(inputChars
, inputCapacity
, UTF8Converter
.getAlias(), status
);
4543 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
4544 status
= U_ZERO_ERROR
;
4545 delete[] inputChars
;
4546 inputCapacity
= inputLength
+ 1;
4547 inputChars
= new char[inputCapacity
];
4548 matchString
.extract(inputChars
, inputCapacity
, UTF8Converter
.getAlias(), status
);
4550 utext_openUTF8(&inputText
, inputChars
, inputLength
, &status
);
4553 // Run the test, check for expected match/don't match result.
// The matcher is created without input and then reset onto the UTF-8 UText;
// the address of reset()'s result is taken to get the matcher pointer back.
4555 RegexMatcher
*testMat
= &testPat
->matcher(status
)->reset(&inputText
);
4556 UBool found
= testMat
->find();
4557 UBool expected
= FALSE
;
4558 if (fields
[2].indexOf(UChar_y
) >=0) {
4561 if (expected
!= found
) {
4562 errln("line %d: Expected %smatch, got %smatch",
4563 lineNum
, expected
?"":"no ", found
?"":"no " );
4567 // Don't try to check expected results if there is no match.
4568 // (Some have stuff in the expected fields)
4576 // Interpret the Perl expression from the fourth field of the data file,
4577 // building up an ICU string from the results of the ICU match.
4578 // The Perl expression will contain references to the results of
4579 // a regex match, including the matched string, capture group strings,
4580 // group starting and ending indices, etc.
4582 UnicodeString resultString
;
4583 UnicodeString perlExpr
= fields
[3];
// Each pass consumes a prefix of perlExpr, so both helper matchers are reset
// onto the shortened string at the top of every iteration.
4585 while (perlExpr
.length() > 0) {
4586 groupsMat
->reset(perlExpr
);
4587 cgMat
->reset(perlExpr
);
4589 if (perlExpr
.startsWith("$&")) {
4590 resultString
.append(testMat
->group(status
));
4591 perlExpr
.remove(0, 2);
4594 else if (groupsMat
->lookingAt(status
)) {
4596 UnicodeString digitString
= groupsMat
->group(2, status
);
4598 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4599 UnicodeString plusOrMinus
= groupsMat
->group(1, status
);
4600 int32_t matchPosition
;
4601 if (plusOrMinus
.compare("+") == 0) {
4602 matchPosition
= testMat
->end(groupNum
, status
);
4604 matchPosition
= testMat
->start(groupNum
, status
);
4606 if (matchPosition
!= -1) {
4607 ICU_Utility::appendNumber(resultString
, matchPosition
);
4609 perlExpr
.remove(0, groupsMat
->end(status
));
4612 else if (cgMat
->lookingAt(status
)) {
4614 UnicodeString digitString
= cgMat
->group(1, status
);
4616 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4617 if (U_SUCCESS(status
)) {
4618 resultString
.append(testMat
->group(groupNum
, status
));
4619 status
= U_ZERO_ERROR
;
4621 perlExpr
.remove(0, cgMat
->end(status
));
4624 else if (perlExpr
.startsWith("@-")) {
4626 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4628 resultString
.append(" ");
4630 ICU_Utility::appendNumber(resultString
, testMat
->start(i
, status
));
4632 perlExpr
.remove(0, 2);
4635 else if (perlExpr
.startsWith("@+")) {
4637 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4639 resultString
.append(" ");
4641 ICU_Utility::appendNumber(resultString
, testMat
->end(i
, status
));
4643 perlExpr
.remove(0, 2);
4646 else if (perlExpr
.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4647 // or as an escaped sequence (e.g. \n)
4648 if (perlExpr
.length() > 1) {
4649 perlExpr
.remove(0, 1); // Remove the '\', but only if not last char.
4651 UChar c
= perlExpr
.charAt(0);
4653 case 'n': c
= '\n'; break;
4654 // add any other escape sequences that show up in the test expected results.
4656 resultString
.append(c
);
4657 perlExpr
.remove(0, 1);
4661 // Any characters from the perl expression that we don't explicitly
4662 // recognize before here are assumed to be literals and copied
4663 // as-is to the expected results.
4664 resultString
.append(perlExpr
.charAt(0));
4665 perlExpr
.remove(0, 1);
4668 if (U_FAILURE(status
)) {
4669 errln("Line %d: ICU Error \"%s\"", lineNum
, u_errorName(status
));
4675 // Expected Results Compare
4677 UnicodeString
expectedS(fields
[4]);
4678 expectedS
.findAndReplace(nulnulSrc
, nulnul
);
4679 expectedS
.findAndReplace(ffffSrc
, ffff
);
4680 expectedS
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4683 if (expectedS
.compare(resultString
) != 0) {
4684 err("Line %d: Incorrect perl expression results.", lineNum
);
4685 infoln((UnicodeString
)"Expected \""+expectedS
+(UnicodeString
)"\"; got \""+resultString
+(UnicodeString
)"\"");
4693 // All done. Clean up allocated stuff.
4710 utext_close(&patternText
);
4711 utext_close(&inputText
);
4713 delete [] patternChars
;
4714 delete [] inputChars
;
4717 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount
);
4722 //--------------------------------------------------------------
4724 // Bug6149 Verify limits to heap expansion for backtrack stack.
4725 // Use this pattern,
4726 // "(a?){1,8000000}"
4727 // Note: was an unbounded upperbounds, but that now has loop-breaking enabled.
4728 // This test is likely to be fragile, as further optimizations stop
4729 // more cases of pointless looping in the match engine.
4731 //---------------------------------------------------------------
// Matches the pattern "(a?){1,8000000}" against the input "xyz" and checks
// that matches() is reported as failing with U_REGEX_STACK_OVERFLOW and that
// the returned result is FALSE.
4732 void RegexTest::Bug6149() {
4733 UnicodeString
pattern("(a?){1,8000000}");
4734 UnicodeString
s("xyz");
4736 UErrorCode status
= U_ZERO_ERROR
;
4738 RegexMatcher
matcher(pattern
, s
, flags
, status
);
4739 UBool result
= false;
// The match attempt is evaluated inside REGEX_ASSERT_FAIL, which is given the
// error code the attempt is expected to produce.
4740 REGEX_ASSERT_FAIL(result
=matcher
.matches(status
), U_REGEX_STACK_OVERFLOW
);
4741 REGEX_ASSERT(result
== FALSE
);
4746 // Callbacks() Test the callback function.
4747 // When set, callbacks occur periodically during matching operations,
4748 // giving the application code the ability to abort the operation
4749 // before its normal completion.
// State shared between Callbacks() and testCallBackFn; a pointer to it is
// passed as the match callback's context argument.
4752 struct callBackContext
{
// Re-arm the context: set the call limit and zero the call and step counters.
4757 void reset(int32_t max
) {maxCalls
=max
; numCalls
=0; lastSteps
=0;};
// Match callback installed by Callbacks().
//   context - pointer to the callBackContext supplied to setMatchCallback().
//   steps   - step count passed in by the matcher.
// Reports an error through the owning test if 'steps' is not exactly one more
// than on the previous call, records 'steps', and returns TRUE while numCalls
// is below maxCalls (FALSE presumably asks the matcher to stop -- see the
// U_REGEX_STOPPED_BY_CALLER checks in Callbacks()).
4761 static UBool U_CALLCONV
4762 testCallBackFn(const void *context
, int32_t steps
) {
4763 callBackContext
*info
= (callBackContext
*)context
;
4764 if (info
->lastSteps
+1 != steps
) {
4765 info
->test
->errln("incorrect steps in callback. Expected %d, got %d\n", info
->lastSteps
+1, steps
);
4767 info
->lastSteps
= steps
;
4769 return (info
->numCalls
< info
->maxCalls
);
// Exercises RegexMatcher::setMatchCallback()/getMatchCallback():
//  - the getter reports NULL function and context when nothing has been set;
//  - after setting testCallBackFn, the getter returns that function and context;
//  - short, medium and long-running matches are checked against the number of
//    callback invocations and against U_REGEX_STOPPED_BY_CALLER.
4773 void RegexTest::Callbacks() {
4775 // Getter returns NULLs if no callback has been set
4777 // The variables that the getter will fill in.
4778 // Init to non-null values so that the action of the getter can be seen.
4779 const void *returnedContext
= &returnedContext
;
4780 URegexMatchCallback
*returnedFn
= &testCallBackFn
;
4782 UErrorCode status
= U_ZERO_ERROR
;
4783 RegexMatcher
matcher("x", 0, status
);
4785 matcher
.getMatchCallback(returnedFn
, returnedContext
, status
);
4787 REGEX_ASSERT(returnedFn
== NULL
);
4788 REGEX_ASSERT(returnedContext
== NULL
);
// Install a callback, then verify the getter hands back the same function
// pointer and the same context pointer.
4793 callBackContext cbInfo
= {this, 0, 0, 0};
4794 const void *returnedContext
;
4795 URegexMatchCallback
*returnedFn
;
4796 UErrorCode status
= U_ZERO_ERROR
;
4797 RegexMatcher
matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status
); // A pattern that can run long.
4799 matcher
.setMatchCallback(testCallBackFn
, &cbInfo
, status
);
4801 matcher
.getMatchCallback(returnedFn
, returnedContext
, status
);
4803 REGEX_ASSERT(returnedFn
== testCallBackFn
);
4804 REGEX_ASSERT(returnedContext
== &cbInfo
);
4806 // A short-running match shouldn't invoke the callback
4807 status
= U_ZERO_ERROR
;
4809 UnicodeString s
= "xxx";
4811 REGEX_ASSERT(matcher
.matches(status
));
4813 REGEX_ASSERT(cbInfo
.numCalls
== 0);
4815 // A medium-length match that runs long enough to invoke the
4816 // callback, but not so long that the callback aborts it.
4817 status
= U_ZERO_ERROR
;
4819 s
= "aaaaaaaaaaaaaaaaaaab";
4821 REGEX_ASSERT(matcher
.matches(status
)==FALSE
);
4823 REGEX_ASSERT(cbInfo
.numCalls
> 0);
4825 // A longer running match that the callback function will abort.
4826 status
= U_ZERO_ERROR
;
4828 s
= "aaaaaaaaaaaaaaaaaaaaaaab";
4830 REGEX_ASSERT(matcher
.matches(status
)==FALSE
);
4831 REGEX_ASSERT(status
== U_REGEX_STOPPED_BY_CALLER
);
4832 REGEX_ASSERT(cbInfo
.numCalls
== 4);
4834 // A longer running find that the callback function will abort.
4835 status
= U_ZERO_ERROR
;
4837 s
= "aaaaaaaaaaaaaaaaaaaaaaab";
4839 REGEX_ASSERT(matcher
.find(status
)==FALSE
);
4840 REGEX_ASSERT(status
== U_REGEX_STOPPED_BY_CALLER
);
4841 REGEX_ASSERT(cbInfo
.numCalls
== 4);
4849 // FindProgressCallbacks() Test the find "progress" callback function.
4850 // When set, the find progress callback will be invoked during find operations
4851 // after each return from a match attempt, giving the application the opportunity
4852 // to terminate a long-running find operation before its normal completion.
// State shared between FindProgressCallbacks() and testProgressCallBackFn; a
// pointer to it is passed as the find-progress callback's context argument.
4855 struct progressCallBackContext
{
// Re-arm the context: set the call limit and zero the call counter and the
// last reported match index.
4860 void reset(int32_t max
) {maxCalls
=max
; numCalls
=0;lastIndex
=0;};
4863 // call-back function for find().
4864 // Return TRUE to continue the find().
4865 // Return FALSE to stop the find().
// Find-progress callback installed by FindProgressCallbacks().
//   context    - pointer to the progressCallBackContext supplied when the
//                callback was set.
//   matchIndex - index reported by the matcher; stored in lastIndex so the
//                test can resume a find() from it.
// Returns TRUE while numCalls is below maxCalls, FALSE otherwise.
4867 static UBool U_CALLCONV
4868 testProgressCallBackFn(const void *context
, int64_t matchIndex
) {
4869 progressCallBackContext
*info
= (progressCallBackContext
*)context
;
4871 info
->lastIndex
= matchIndex
;
4872 // info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4873 return (info
->numCalls
< info
->maxCalls
);
// Exercises RegexMatcher::setFindProgressCallback()/getFindProgressCallback():
//  - the getter reports NULL function and context when nothing has been set;
//  - after setting testProgressCallBackFn, the getter returns it and its context;
//  - find() calls of increasing length are checked against the number of
//    callback invocations and against U_REGEX_STOPPED_BY_CALLER;
//  - an interrupted find() is resumed from the last index seen by the callback.
4877 void RegexTest::FindProgressCallbacks() {
4879 // Getter returns NULLs if no callback has been set
4881 // The variables that the getter will fill in.
4882 // Init to non-null values so that the action of the getter can be seen.
4883 const void *returnedContext
= &returnedContext
;
4884 URegexFindProgressCallback
*returnedFn
= &testProgressCallBackFn
;
4886 UErrorCode status
= U_ZERO_ERROR
;
4887 RegexMatcher
matcher("x", 0, status
);
4889 matcher
.getFindProgressCallback(returnedFn
, returnedContext
, status
);
4891 REGEX_ASSERT(returnedFn
== NULL
);
4892 REGEX_ASSERT(returnedContext
== NULL
);
// Install a callback, then verify the getter hands back the same function
// pointer and the same context pointer.
4897 progressCallBackContext cbInfo
= {this, 0, 0, 0};
4898 const void *returnedContext
;
4899 URegexFindProgressCallback
*returnedFn
;
4900 UErrorCode status
= U_ZERO_ERROR
;
4901 RegexMatcher
matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status
);
4903 matcher
.setFindProgressCallback(testProgressCallBackFn
, &cbInfo
, status
);
4905 matcher
.getFindProgressCallback(returnedFn
, returnedContext
, status
);
4907 REGEX_ASSERT(returnedFn
== testProgressCallBackFn
);
4908 REGEX_ASSERT(returnedContext
== &cbInfo
);
4910 // A find that matches on the initial position does NOT invoke the callback.
4911 status
= U_ZERO_ERROR
;
4913 UnicodeString s
= "aaxxx";
4916 matcher
.setTrace(TRUE
);
4918 REGEX_ASSERT(matcher
.find(0, status
));
4920 REGEX_ASSERT(cbInfo
.numCalls
== 0);
4922 // A medium running find() that causes matcher.find() to invoke our callback for each index,
4923 // but not so many times that we interrupt the operation.
4924 status
= U_ZERO_ERROR
;
4925 s
= "aaaaaaaaaaaaaaaaaaab";
4926 cbInfo
.reset(s
.length()); // Some upper limit for number of calls that is greater than size of our input string
4928 REGEX_ASSERT(matcher
.find(0, status
)==FALSE
);
4930 REGEX_ASSERT(cbInfo
.numCalls
> 0 && cbInfo
.numCalls
< 25);
4932 // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4933 status
= U_ZERO_ERROR
;
4934 UnicodeString s1
= "aaaaaaaaaaaaaaaaaaaaaaab";
4935 cbInfo
.reset(s1
.length() - 5); // Bail early somewhere near the end of input string
4937 REGEX_ASSERT(matcher
.find(0, status
)==FALSE
);
4938 REGEX_ASSERT(status
== U_REGEX_STOPPED_BY_CALLER
);
4939 REGEX_ASSERT(cbInfo
.numCalls
== s1
.length() - 5);
4941 // Now a match that will succeed, but after an interruption
4942 status
= U_ZERO_ERROR
;
4943 UnicodeString s2
= "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4944 cbInfo
.reset(s2
.length() - 10); // Bail early somewhere near the end of input string
4946 REGEX_ASSERT(matcher
.find(0, status
)==FALSE
);
4947 REGEX_ASSERT(status
== U_REGEX_STOPPED_BY_CALLER
);
4948 // Now retry the match from where left off
4949 cbInfo
.maxCalls
= 100; // No callback limit
4950 status
= U_ZERO_ERROR
;
4951 REGEX_ASSERT(matcher
.find(cbInfo
.lastIndex
, status
));
4959 //---------------------------------------------------------------------------
4961 // PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
4962 // UTexts. The pure-C implementation of UText
4963 // has no mutable backing stores, but we can
4964 // use UnicodeString here to test the functionality.
4966 //---------------------------------------------------------------------------
// Exercises the C regex API entry points that write their result into a
// caller-supplied, pre-allocated mutable UText: uregex_getUText(),
// uregex_groupUText(), uregex_replaceFirstUText() and uregex_replaceAllUText().
// The destination 'bufferText' is a UText opened over the UnicodeString
// 'buffer'; each call is expected to return that same UText.
4967 void RegexTest::PreAllocatedUTextCAPI () {
4968 UErrorCode status
= U_ZERO_ERROR
;
4969 URegularExpression
*re
;
4970 UText patternText
= UTEXT_INITIALIZER
;
4971 UnicodeString buffer
;
4972 UText bufferText
= UTEXT_INITIALIZER
;
4974 utext_openUnicodeString(&bufferText
, &buffer
, &status
);
4977 * getText() and getUText()
4980 UText text1
= UTEXT_INITIALIZER
;
4981 UText text2
= UTEXT_INITIALIZER
;
4982 UChar text2Chars
[20];
4985 status
= U_ZERO_ERROR
;
4986 regextst_openUTF8FromInvariant(&text1
, "abcccd", -1, &status
);
4987 regextst_openUTF8FromInvariant(&text2
, "abcccxd", -1, &status
);
// Bound the copy by the destination array text2Chars, not by the size of the
// unrelated UText struct 'text2'.
4988 u_uastrncpy(text2Chars
, "abcccxd", UPRV_LENGTHOF(text2Chars
));
4989 utext_openUChars(&text2
, text2Chars
, -1, &status
);
4991 regextst_openUTF8FromInvariant(&patternText
, "abc*d", -1, &status
);
4992 re
= uregex_openUText(&patternText
, 0, NULL
, &status
);
4994 /* First set a UText */
4995 uregex_setUText(re
, &text1
, &status
);
4996 resultText
= uregex_getUText(re
, &bufferText
, &status
);
4998 REGEX_ASSERT(resultText
== &bufferText
);
4999 utext_setNativeIndex(resultText
, 0);
5000 utext_setNativeIndex(&text1
, 0);
5001 REGEX_ASSERT(testUTextEqual(resultText
, &text1
));
5003 resultText
= uregex_getUText(re
, &bufferText
, &status
);
5005 REGEX_ASSERT(resultText
== &bufferText
);
5006 utext_setNativeIndex(resultText
, 0);
5007 utext_setNativeIndex(&text1
, 0);
5008 REGEX_ASSERT(testUTextEqual(resultText
, &text1
));
5010 /* Then set a UChar * */
5011 uregex_setText(re
, text2Chars
, 7, &status
);
5012 resultText
= uregex_getUText(re
, &bufferText
, &status
);
5014 REGEX_ASSERT(resultText
== &bufferText
);
5015 utext_setNativeIndex(resultText
, 0);
5016 utext_setNativeIndex(&text2
, 0);
5017 REGEX_ASSERT(testUTextEqual(resultText
, &text2
));
5020 utext_close(&text1
);
5021 utext_close(&text2
);
5033 u_uastrncpy(text1
, "noise abc interior def, and this is off the end", UPRV_LENGTHOF(text1
));
5034 // 012345678901234567890123456789012345678901234567
5037 status
= U_ZERO_ERROR
;
5038 re
= uregex_openC("abc(.*?)def", 0, NULL
, &status
);
5041 uregex_setText(re
, text1
, -1, &status
);
5042 result
= uregex_find(re
, 0, &status
);
5043 REGEX_ASSERT(result
==TRUE
);
5045 /* Capture Group 0, the full match. Should succeed. "abc interior def" */
5046 status
= U_ZERO_ERROR
;
5047 actual
= uregex_groupUText(re
, 0, &bufferText
, &length
, &status
);
5049 REGEX_ASSERT(actual
== &bufferText
);
5050 REGEX_ASSERT(utext_getNativeIndex(actual
) == 6);
5051 REGEX_ASSERT(length
== 16);
5052 REGEX_ASSERT(utext_nativeLength(actual
) == 47);
5054 /* Capture group #1. Should succeed, matching " interior ". */
5055 status
= U_ZERO_ERROR
;
5056 actual
= uregex_groupUText(re
, 1, &bufferText
, &length
, &status
);
5058 REGEX_ASSERT(actual
== &bufferText
);
5059 REGEX_ASSERT(utext_getNativeIndex(actual
) == 9); // position of " interior "
5060 REGEX_ASSERT(length
== 10);
5061 REGEX_ASSERT(utext_nativeLength(actual
) == 47);
5063 /* Capture group out of range. Error. */
5064 status
= U_ZERO_ERROR
;
5065 actual
= uregex_groupUText(re
, 2, &bufferText
, &length
, &status
);
5066 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
5067 REGEX_ASSERT(actual
== &bufferText
);
5078 UText replText
= UTEXT_INITIALIZER
;
5080 status
= U_ZERO_ERROR
;
5081 utext_openUnicodeString(&bufferText
, &buffer
, &status
);
5083 status
= U_ZERO_ERROR
;
5084 u_uastrncpy(text1
, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1
));
// UPRV_LENGTHOF already yields an element count, so it is not halved.
5085 u_uastrncpy(text2
, "No match here.", UPRV_LENGTHOF(text2
));
5086 regextst_openUTF8FromInvariant(&replText
, "<$1>", -1, &status
);
5088 re
= uregex_openC("x(.*?)x", 0, NULL
, &status
);
5091 /* Normal case, with match */
5092 uregex_setText(re
, text1
, -1, &status
);
// Empty the destination before each replace call so it holds only the new result.
5094 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5096 result
= uregex_replaceFirstUText(re
, &replText
, &bufferText
, &status
);
5098 REGEX_ASSERT(result
== &bufferText
);
5099 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result
);
5101 /* No match. Text should copy to output with no changes. */
5102 uregex_setText(re
, text2
, -1, &status
);
5103 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5104 result
= uregex_replaceFirstUText(re
, &replText
, &bufferText
, &status
);
5106 REGEX_ASSERT(result
== &bufferText
);
5107 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result
);
5109 /* Unicode escapes */
5110 uregex_setText(re
, text1
, -1, &status
);
5111 regextst_openUTF8FromInvariant(&replText
, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status
);
5112 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5113 result
= uregex_replaceFirstUText(re
, &replText
, &bufferText
, &status
);
5115 REGEX_ASSERT(result
== &bufferText
);
5116 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result
);
5119 utext_close(&replText
);
5129 UText replText
= UTEXT_INITIALIZER
;
5132 status
= U_ZERO_ERROR
;
// Element-count bounds, written with the same macro as the other copies in
// this function instead of a hard-coded sizeof(...)/2.
5133 u_uastrncpy(text1
, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1
));
5134 u_uastrncpy(text2
, "No match here.", UPRV_LENGTHOF(text2
));
5135 regextst_openUTF8FromInvariant(&replText
, "<$1>", -1, &status
);
5137 re
= uregex_openC("x(.*?)x", 0, NULL
, &status
);
5140 /* Normal case, with match */
5141 uregex_setText(re
, text1
, -1, &status
);
5142 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5143 result
= uregex_replaceAllUText(re
, &replText
, &bufferText
, &status
);
5145 REGEX_ASSERT(result
== &bufferText
);
5146 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result
);
5148 /* No match. Text should copy to output with no changes. */
5149 uregex_setText(re
, text2
, -1, &status
);
5150 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5151 result
= uregex_replaceAllUText(re
, &replText
, &bufferText
, &status
);
5153 REGEX_ASSERT(result
== &bufferText
);
5154 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result
);
5157 utext_close(&replText
);
5162 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5163 * so we don't need to test it here.
5166 utext_close(&bufferText
);
5167 utext_close(&patternText
);
5171 //--------------------------------------------------------------
5173 // NamedCapture Check basic named capture group functionality
5175 //--------------------------------------------------------------
5176 void RegexTest::NamedCapture() {
5177 UErrorCode status
= U_ZERO_ERROR
;
5178 RegexPattern
*pat
= RegexPattern::compile(UnicodeString(
5179 "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status
);
5181 int32_t group
= pat
->groupNumberFromName("five", -1, status
);
5183 REGEX_ASSERT(5 == group
);
5184 group
= pat
->groupNumberFromName("three", -1, status
);
5186 REGEX_ASSERT(3 == group
);
5188 status
= U_ZERO_ERROR
;
5189 group
= pat
->groupNumberFromName(UnicodeString("six"), status
);
5191 REGEX_ASSERT(6 == group
);
5193 status
= U_ZERO_ERROR
;
5194 group
= pat
->groupNumberFromName(UnicodeString("nosuch"), status
);
5195 U_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5197 status
= U_ZERO_ERROR
;
5199 // After copying a pattern, named capture should still work in the copy.
5200 RegexPattern
*copiedPat
= new RegexPattern(*pat
);
5201 REGEX_ASSERT(*copiedPat
== *pat
);
5202 delete pat
; pat
= NULL
; // Delete original, copy should have no references back to it.
5204 group
= copiedPat
->groupNumberFromName("five", -1, status
);
5206 REGEX_ASSERT(5 == group
);
5207 group
= copiedPat
->groupNumberFromName("three", -1, status
);
5209 REGEX_ASSERT(3 == group
);
5212 // ReplaceAll with named capture group.
5213 status
= U_ZERO_ERROR
;
5214 UnicodeString
text("Substitution of <<quotes>> for <<double brackets>>");
5215 RegexMatcher
*m
= new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text
, 0, status
);
5217 // m.pattern().dumpPattern();
5218 UnicodeString replacedText
= m
->replaceAll("'${mid}'", status
);
5220 REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText
);
5223 // ReplaceAll, allowed capture group numbers.
5224 text
= UnicodeString("abcmxyz");
5225 m
= new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text
, 0, status
);
5228 status
= U_ZERO_ERROR
;
5229 replacedText
= m
->replaceAll(UnicodeString("<$0>"), status
); // group 0, full match, is allowed.
5231 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText
);
5233 status
= U_ZERO_ERROR
;
5234 replacedText
= m
->replaceAll(UnicodeString("<$1>"), status
); // group 1 by number.
5236 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText
);
5238 status
= U_ZERO_ERROR
;
5239 replacedText
= m
->replaceAll(UnicodeString("<${one}>"), status
); // group 1 by name.
5241 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText
);
5243 status
= U_ZERO_ERROR
;
5244 replacedText
= m
->replaceAll(UnicodeString("<$2>"), status
); // group 2.
5246 REGEX_ASSERT(UnicodeString("a<x>z") == replacedText
);
5248 status
= U_ZERO_ERROR
;
5249 replacedText
= m
->replaceAll(UnicodeString("<$3>"), status
);
5251 REGEX_ASSERT(UnicodeString("a<y>z") == replacedText
);
5253 status
= U_ZERO_ERROR
;
5254 replacedText
= m
->replaceAll(UnicodeString("<$4>"), status
);
5255 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
5257 status
= U_ZERO_ERROR
;
5258 replacedText
= m
->replaceAll(UnicodeString("<$04>"), status
); // group 0, leading 0,
5259 REGEX_CHECK_STATUS
; // trailing out-of-range 4 passes through.
5260 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText
);
5262 status
= U_ZERO_ERROR
;
5263 replacedText
= m
->replaceAll(UnicodeString("<$000016>"), status
); // Consume leading zeroes. Don't consume digits
5264 REGEX_CHECK_STATUS
; // that push group num out of range.
5265 REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText
); // This is group 1.
5267 status
= U_ZERO_ERROR
;
5268 replacedText
= m
->replaceAll(UnicodeString("<$3$2$1${one}>"), status
);
5270 REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText
);
5272 status
= U_ZERO_ERROR
;
5273 replacedText
= m
->replaceAll(UnicodeString("$3$2$1${one}"), status
);
5275 REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText
);
5277 status
= U_ZERO_ERROR
;
5278 replacedText
= m
->replaceAll(UnicodeString("<${noSuchName}>"), status
);
5279 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5281 status
= U_ZERO_ERROR
;
5282 replacedText
= m
->replaceAll(UnicodeString("<${invalid-name}>"), status
);
5283 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5285 status
= U_ZERO_ERROR
;
5286 replacedText
= m
->replaceAll(UnicodeString("<${one"), status
);
5287 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5289 status
= U_ZERO_ERROR
;
5290 replacedText
= m
->replaceAll(UnicodeString("$not a capture group"), status
);
5291 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5295 // Repeat the above replaceAll() tests using the plain C API, which
5296 // has a separate implementation internally.
5297 // TODO: factor out the test data.
5299 status
= U_ZERO_ERROR
;
5300 URegularExpression
*re
= uregex_openC("..(?<one>m)(.)(.)", 0, NULL
, &status
);
5302 text
= UnicodeString("abcmxyz");
5303 uregex_setText(re
, text
.getBuffer(), text
.length(), &status
);
5306 UChar resultBuf
[100];
5307 int32_t resultLength
;
5310 status
= U_ZERO_ERROR
;
5311 repl
= UnicodeString("<$0>");
5312 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5314 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf
, resultLength
));
5316 status
= U_ZERO_ERROR
;
5317 repl
= UnicodeString("<$1>");
5318 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5320 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf
, resultLength
));
5322 status
= U_ZERO_ERROR
;
5323 repl
= UnicodeString("<${one}>");
5324 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5326 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf
, resultLength
));
5328 status
= U_ZERO_ERROR
;
5329 repl
= UnicodeString("<$2>");
5330 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5332 REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf
, resultLength
));
5334 status
= U_ZERO_ERROR
;
5335 repl
= UnicodeString("<$3>");
5336 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5338 REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf
, resultLength
));
5340 status
= U_ZERO_ERROR
;
5341 repl
= UnicodeString("<$4>");
5342 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5343 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
5345 status
= U_ZERO_ERROR
;
5346 repl
= UnicodeString("<$04>");
5347 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5349 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf
, resultLength
));
5351 status
= U_ZERO_ERROR
;
5352 repl
= UnicodeString("<$000016>");
5353 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5355 REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf
, resultLength
));
5357 status
= U_ZERO_ERROR
;
5358 repl
= UnicodeString("<$3$2$1${one}>");
5359 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5361 REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf
, resultLength
));
5363 status
= U_ZERO_ERROR
;
5364 repl
= UnicodeString("$3$2$1${one}");
5365 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5367 REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf
, resultLength
));
5369 status
= U_ZERO_ERROR
;
5370 repl
= UnicodeString("<${noSuchName}>");
5371 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5372 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5374 status
= U_ZERO_ERROR
;
5375 repl
= UnicodeString("<${invalid-name}>");
5376 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5377 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5379 status
= U_ZERO_ERROR
;
5380 repl
= UnicodeString("<${one");
5381 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5382 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5384 status
= U_ZERO_ERROR
;
5385 repl
= UnicodeString("$not a capture group");
5386 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5387 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5392 //--------------------------------------------------------------
5394 // NamedCaptureLimits Patterns with huge numbers of named capture groups.
5395 // The point is not so much what the exact limit is,
5396 // but that a largish number doesn't hit bad non-linear performance,
5397 // and that exceeding the limit fails cleanly.
5399 //--------------------------------------------------------------
5400 void RegexTest::NamedCaptureLimits() {
5402 logln("Skipping test. Runs in exhuastive mode only.");
5405 const int32_t goodLimit
= 1000000; // Pattern w this many groups builds successfully.
5406 const int32_t failLimit
= 10000000; // Pattern exceeds internal limits, fails to compile.
5408 UnicodeString pattern
;
5411 for (nn
=1; nn
<goodLimit
; nn
++) {
5412 sprintf(nnbuf
, "(?<nn%d>)", nn
);
5413 pattern
.append(UnicodeString(nnbuf
, -1, US_INV
));
5415 UErrorCode status
= U_ZERO_ERROR
;
5416 RegexPattern
*pat
= RegexPattern::compile(pattern
, 0, status
);
5418 for (nn
=1; nn
<goodLimit
; nn
++) {
5419 sprintf(nnbuf
, "nn%d", nn
);
5420 int32_t groupNum
= pat
->groupNumberFromName(nnbuf
, -1, status
);
5421 REGEX_ASSERT(nn
== groupNum
);
5422 if (nn
!= groupNum
) {
5429 for (nn
=1; nn
<failLimit
; nn
++) {
5430 sprintf(nnbuf
, "(?<nn%d>)", nn
);
5431 pattern
.append(UnicodeString(nnbuf
, -1, US_INV
));
5433 status
= U_ZERO_ERROR
;
5434 pat
= RegexPattern::compile(pattern
, 0, status
);
5435 REGEX_ASSERT(status
== U_REGEX_PATTERN_TOO_BIG
);
5440 //--------------------------------------------------------------
5442 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
5444 //---------------------------------------------------------------
5445 void RegexTest::Bug7651() {
5446 UnicodeString
pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5447 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5448 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5449 UnicodeString
pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5450 UnicodeString
s("#ff @abcd This is test");
5451 RegexPattern
*REPattern
= NULL
;
5452 RegexMatcher
*REMatcher
= NULL
;
5453 UErrorCode status
= U_ZERO_ERROR
;
5456 REPattern
= RegexPattern::compile(pattern1
, 0, pe
, status
);
5458 REMatcher
= REPattern
->matcher(s
, status
);
5460 REGEX_ASSERT(REMatcher
->find());
5461 REGEX_ASSERT(REMatcher
->start(status
) == 0);
5464 status
= U_ZERO_ERROR
;
5466 REPattern
= RegexPattern::compile(pattern2
, 0, pe
, status
);
5468 REMatcher
= REPattern
->matcher(s
, status
);
5470 REGEX_ASSERT(REMatcher
->find());
5471 REGEX_ASSERT(REMatcher
->start(status
) == 0);
5474 status
= U_ZERO_ERROR
;
5477 void RegexTest::Bug7740() {
5478 UErrorCode status
= U_ZERO_ERROR
;
5479 UnicodeString pattern
= "(a)";
5480 UnicodeString text
= "abcdef";
5481 RegexMatcher
*m
= new RegexMatcher(pattern
, text
, 0, status
);
5483 REGEX_ASSERT(m
->lookingAt(status
));
5485 status
= U_ILLEGAL_ARGUMENT_ERROR
;
5486 UnicodeString s
= m
->group(1, status
); // Bug 7740: segfault here.
5487 REGEX_ASSERT(status
== U_ILLEGAL_ARGUMENT_ERROR
);
5488 REGEX_ASSERT(s
== "");
5492 // Bug 8479: was crashing with a Bogus UnicodeString as input.
5494 void RegexTest::Bug8479() {
5495 UErrorCode status
= U_ZERO_ERROR
;
5497 RegexMatcher
* const pMatcher
= new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL
|UREGEX_CASE_INSENSITIVE
, status
);
5499 if (U_SUCCESS(status
))
5503 pMatcher
->reset(str
);
5504 status
= U_ZERO_ERROR
;
5505 pMatcher
->matches(status
);
5506 REGEX_ASSERT(status
== U_ILLEGAL_ARGUMENT_ERROR
);
5513 void RegexTest::Bug7029() {
5514 UErrorCode status
= U_ZERO_ERROR
;
5516 RegexMatcher
* const pMatcher
= new RegexMatcher(".", 0, status
);
5517 UnicodeString text
= "abc.def";
5518 UnicodeString splits
[10];
5520 int32_t numFields
= pMatcher
->split(text
, splits
, 10, status
);
5522 REGEX_ASSERT(numFields
== 8);
5527 // This test is checking for the existence of any supplemental characters that case-fold
5528 // to a BMP character.
5530 // At the time of this writing there are none. If any should appear in a subsequent release
5531 // of Unicode, the code in regular expressions compilation that determines the longest
5532 // possible match for a literal string will need to be enhanced.
5534 // See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5535 // for details on what to do in case of a failure of this test.
5537 void RegexTest::Bug9283() {
5538 #if !UCONFIG_NO_NORMALIZATION
5539 UErrorCode status
= U_ZERO_ERROR
;
5540 UnicodeSet
supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status
);
5544 for (index
=0; ; index
++) {
5545 c
= supplementalsWithCaseFolding
.charAt(index
);
5549 UnicodeString cf
= UnicodeString(c
).foldCase();
5550 REGEX_ASSERT(cf
.length() >= 2);
5552 #endif /* #if !UCONFIG_NO_NORMALIZATION */
5556 void RegexTest::CheckInvBufSize() {
5557 if(inv_next
>=INV_BUFSIZ
) {
5558 errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5559 __FILE__
, INV_BUFSIZ
, inv_next
);
5561 logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__
, INV_BUFSIZ
, inv_next
);
5566 void RegexTest::Bug10459() {
5567 UErrorCode status
= U_ZERO_ERROR
;
5568 UnicodeString
patternString("(txt)");
5569 UnicodeString
txtString("txt");
5571 UText
*utext_pat
= utext_openUnicodeString(NULL
, &patternString
, &status
);
5573 UText
*utext_txt
= utext_openUnicodeString(NULL
, &txtString
, &status
);
5576 URegularExpression
*icu_re
= uregex_openUText(utext_pat
, 0, NULL
, &status
);
5579 uregex_setUText(icu_re
, utext_txt
, &status
);
5582 // The bug was that calling uregex_group() before doing a matching operation
5583 // was causing a segfault. Only for Regular Expressions created from UText.
5584 // It should set an U_REGEX_INVALID_STATE.
5587 int32_t len
= uregex_group(icu_re
, 0, buf
, UPRV_LENGTHOF(buf
), &status
);
5588 REGEX_ASSERT(status
== U_REGEX_INVALID_STATE
);
5589 REGEX_ASSERT(len
== 0);
5591 uregex_close(icu_re
);
5592 utext_close(utext_pat
);
5593 utext_close(utext_txt
);
5596 void RegexTest::TestCaseInsensitiveStarters() {
5597 // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5598 // become stale because of new Unicode characters.
5599 // If it is stale, rerun the generation tool
5600 // svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
5601 // and replace the embedded data in i18n/regexcmp.cpp
5603 for (UChar32 cp
=0; cp
<=0x10ffff; cp
++) {
5604 if (!u_hasBinaryProperty(cp
, UCHAR_CASE_SENSITIVE
)) {
5607 UnicodeSet
s(cp
, cp
);
5608 s
.closeOver(USET_CASE_INSENSITIVE
);
5609 UnicodeSetIterator
setIter(s
);
5610 while (setIter
.next()) {
5611 if (!setIter
.isString()) {
5614 const UnicodeString
&str
= setIter
.getString();
5615 UChar32 firstChar
= str
.char32At(0);
5616 UnicodeSet starters
;
5617 RegexCompile::findCaseInsensitiveStarters(firstChar
, &starters
);
5618 if (!starters
.contains(cp
)) {
5619 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp
, firstChar
);
5627 void RegexTest::TestBug11049() {
5628 // Original bug report: pattern with match start consisting of one of several individual characters,
5629 // and the text being matched ending with a supplementary character. find() would read past the
5630 // end of the input text when searching for potential match starting points.
5632 // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5633 // detect the bad read.
5635 TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE
, __LINE__
);
5636 TestCase11049("A|B|C", "string matches at end C", TRUE
, __LINE__
);
5638 // Test again with a pattern starting with a single character,
5639 // which takes a different code path than starting with an OR expression,
5640 // but with similar logic.
5641 TestCase11049("C", "a string \\ud800\\udc00", FALSE
, __LINE__
);
5642 TestCase11049("C", "string matches at end C", TRUE
, __LINE__
);
5645 // Run a single test case from TestBug11049(). Internal function.
5646 void RegexTest::TestCase11049(const char *pattern
, const char *data
, UBool expectMatch
, int32_t lineNumber
) {
5647 UErrorCode status
= U_ZERO_ERROR
;
5648 UnicodeString patternString
= UnicodeString(pattern
).unescape();
5649 LocalPointer
<RegexPattern
> compiledPat(RegexPattern::compile(patternString
, 0, status
));
5651 UnicodeString dataString
= UnicodeString(data
).unescape();
5652 UChar
*exactBuffer
= new UChar
[dataString
.length()];
5653 dataString
.extract(exactBuffer
, dataString
.length(), status
);
5654 UText
*ut
= utext_openUChars(NULL
, exactBuffer
, dataString
.length(), &status
);
5656 LocalPointer
<RegexMatcher
> matcher(compiledPat
->matcher(status
));
5659 UBool result
= matcher
->find();
5660 if (result
!= expectMatch
) {
5661 errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5662 __FILE__
, lineNumber
, expectMatch
, result
, pattern
, data
);
5665 // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5666 // off-by-one on find() with match at the last code point.
5667 // Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
5668 // because string.unescape() will only shrink it.
5669 char * utf8Buffer
= new char[uprv_strlen(data
)+1];
5670 u_strToUTF8(utf8Buffer
, uprv_strlen(data
)+1, NULL
, dataString
.getBuffer(), dataString
.length(), &status
);
5672 ut
= utext_openUTF8(ut
, utf8Buffer
, -1, &status
);
5675 result
= matcher
->find();
5676 if (result
!= expectMatch
) {
5677 errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5678 __FILE__
, lineNumber
, expectMatch
, result
, pattern
, data
);
5680 delete [] utf8Buffer
;
5683 delete [] exactBuffer
;
5687 void RegexTest::TestBug11371() {
5689 logln("Skipping test. Runs in exhuastive mode only.");
5692 UErrorCode status
= U_ZERO_ERROR
;
5693 UnicodeString patternString
;
5695 for (int i
=0; i
<8000000; i
++) {
5696 patternString
.append(UnicodeString("()"));
5698 LocalPointer
<RegexPattern
> compiledPat(RegexPattern::compile(patternString
, 0, status
));
5699 if (status
!= U_REGEX_PATTERN_TOO_BIG
) {
5700 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5701 __FILE__
, __LINE__
, u_errorName(status
));
5704 status
= U_ZERO_ERROR
;
5705 patternString
= "(";
5706 for (int i
=0; i
<20000000; i
++) {
5707 patternString
.append(UnicodeString("A++"));
5709 patternString
.append(UnicodeString("){0}B++"));
5710 LocalPointer
<RegexPattern
> compiledPat2(RegexPattern::compile(patternString
, 0, status
));
5711 if (status
!= U_REGEX_PATTERN_TOO_BIG
) {
5712 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5713 __FILE__
, __LINE__
, u_errorName(status
));
5716 // Pattern with too much string data, such that string indexes overflow operand data field size
5717 // in compiled instruction.
5718 status
= U_ZERO_ERROR
;
5720 while (patternString
.length() < 0x00ffffff) {
5721 patternString
.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5723 patternString
.append(UnicodeString("X? trailing string"));
5724 LocalPointer
<RegexPattern
> compiledPat3(RegexPattern::compile(patternString
, 0, status
));
5725 if (status
!= U_REGEX_PATTERN_TOO_BIG
) {
5726 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5727 __FILE__
, __LINE__
, u_errorName(status
));
5731 void RegexTest::TestBug11480() {
5732 // C API, get capture group of a group that does not participate in the match.
5733 // (Returns a zero length string, with nul termination,
5734 // indistinguishable from a group with a zero length match.)
5736 UErrorCode status
= U_ZERO_ERROR
;
5737 URegularExpression
*re
= uregex_openC("(A)|(B)", 0, NULL
, &status
);
5739 UnicodeString text
= UNICODE_STRING_SIMPLE("A");
5740 uregex_setText(re
, text
.getBuffer(), text
.length(), &status
);
5742 REGEX_ASSERT(uregex_lookingAt(re
, 0, &status
));
5743 UChar buf
[10] = {(UChar
)13, (UChar
)13, (UChar
)13, (UChar
)13};
5744 int32_t length
= uregex_group(re
, 2, buf
+1, UPRV_LENGTHOF(buf
)-1, &status
);
5745 REGEX_ASSERT(length
== 0);
5746 REGEX_ASSERT(buf
[0] == 13);
5747 REGEX_ASSERT(buf
[1] == 0);
5748 REGEX_ASSERT(buf
[2] == 13);
5751 // UText C++ API, length of match is 0 for non-participating matches.
5752 UText ut
= UTEXT_INITIALIZER
;
5753 utext_openUnicodeString(&ut
, &text
, &status
);
5754 RegexMatcher
matcher(UnicodeString("(A)|(B)"), 0, status
);
5757 REGEX_ASSERT(matcher
.lookingAt(0, status
));
5759 // UText C++ API, Capture group 1 matches "A", position 0, length 1.
5760 int64_t groupLen
= -666;
5761 UText group
= UTEXT_INITIALIZER
;
5762 matcher
.group(1, &group
, groupLen
, status
);
5764 REGEX_ASSERT(groupLen
== 1);
5765 REGEX_ASSERT(utext_getNativeIndex(&group
) == 0);
5767 // Capture group 2, the (B), does not participate in the match.
5768 matcher
.group(2, &group
, groupLen
, status
);
5770 REGEX_ASSERT(groupLen
== 0);
5771 REGEX_ASSERT(matcher
.start(2, status
) == -1);
5775 void RegexTest::TestBug12884() {
5776 // setTimeLimit() was not effective for empty sub-patterns with large {minimum counts}
5777 UnicodeString
pattern(u
"(((((((){120}){11}){11}){11}){80}){11}){4}");
5778 UnicodeString
text(u
"hello");
5779 UErrorCode status
= U_ZERO_ERROR
;
5780 RegexMatcher
m(pattern
, text
, 0, status
);
5782 m
.setTimeLimit(5, status
);
5784 REGEX_ASSERT(status
== U_REGEX_TIME_OUT
);
5786 // Non-greedy loops. They take a different code path during matching.
5787 UnicodeString
ngPattern(u
"(((((((){120}?){11}?){11}?){11}?){80}?){11}?){4}?");
5788 status
= U_ZERO_ERROR
;
5789 RegexMatcher
ngM(ngPattern
, text
, 0, status
);
5791 ngM
.setTimeLimit(5, status
);
5793 REGEX_ASSERT(status
== U_REGEX_TIME_OUT
);
5795 // UText, wrapping non-UTF-16 text, also takes a different execution path.
5796 const char *text8
= u8
"¿Qué es Unicode? Unicode proporciona un número único para cada"
5797 "carácter, sin importar la plataforma, sin importar el programa,"
5798 "sin importar el idioma.";
5799 status
= U_ZERO_ERROR
;
5800 LocalUTextPointer
ut(utext_openUTF8(NULL
, text8
, -1, &status
));
5802 m
.reset(ut
.getAlias());
5804 REGEX_ASSERT(status
== U_REGEX_TIME_OUT
);
5806 status
= U_ZERO_ERROR
;
5807 ngM
.reset(ut
.getAlias());
5809 REGEX_ASSERT(status
== U_REGEX_TIME_OUT
);
5812 // Bug 13631. A find() of a pattern with a zero length look-behind assertion
5813 // can cause a read past the end of the input text.
5814 // The failure is seen when running this test with Clang's Address Sanitizer.
5816 void RegexTest::TestBug13631() {
5817 const UChar
*pats
[] = { u
"(?<!^)",
5821 for (const UChar
**pat
=pats
; *pat
; ++pat
) {
5822 UErrorCode status
= U_ZERO_ERROR
;
5823 UnicodeString
upat(*pat
);
5824 RegexMatcher
matcher(upat
, 0, status
);
5825 const UChar s
=u
'a';
5826 UText
*ut
= utext_openUChars(nullptr, &s
, 1, &status
);
5829 while (matcher
.find()) {
5835 // Bug 13632 Out of bounds memory reference if a replacement string ends with a '$',
5836 // where a following group specification would be expected.
5837 // Failure shows when running the test under Clang's Address Sanitizer.
5839 void RegexTest::TestBug13632() {
5840 UErrorCode status
= U_ZERO_ERROR
;
5841 URegularExpression
*re
= uregex_openC(" ", 0, nullptr, &status
);
5842 const char16_t *sourceString
= u
"Hello, world.";
5843 uregex_setText(re
, sourceString
, u_strlen(sourceString
), &status
);
5845 const int32_t destCap
= 20;
5846 char16_t dest
[destCap
] = {};
5847 const char16_t replacement
[] = {u
'x', u
'$'}; // Not nul terminated string.
5848 uregex_replaceAll(re
, replacement
, 2, dest
, destCap
, &status
);
5850 assertEquals("", U_REGEX_INVALID_CAPTURE_GROUP_NAME
, status
);
5854 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */