1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
5 * Copyright (c) 2002-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
12 // ICU Regular Expressions test, part of intltest.
18 PLEASE be careful about ASCII assumptions in this test.
19 This test is one of the worst repeat offenders.
20 If you have questions, contact someone on the ICU PMC
21 who has access to an EBCDIC system.
26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
32 #include "unicode/localpointer.h"
33 #include "unicode/regex.h"
34 #include "unicode/uchar.h"
35 #include "unicode/ucnv.h"
36 #include "unicode/uniset.h"
37 #include "unicode/uregex.h"
38 #include "unicode/usetiter.h"
39 #include "unicode/ustring.h"
40 #include "unicode/utext.h"
41 #include "unicode/utf16.h"
51 #define SUPPORT_MUTATING_INPUT_STRING 0
53 //---------------------------------------------------------------------------
55 // Test class boilerplate
57 //---------------------------------------------------------------------------
58 RegexTest::RegexTest()
63 RegexTest::~RegexTest()
// Test dispatcher for the RegexTest suite.
// When `exec` is set, a suite banner is logged; each test method is then
// registered through TESTCASE_AUTO in the order listed below.
// NOTE(review): the TESTCASE_AUTO machinery is defined outside this file, so how
// `index` and `name` are consumed is not visible here -- confirm in intltest.h.
69 void RegexTest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* /*par*/ )
71 if (exec
) logln("TestSuite RegexTest: ");
74 TESTCASE_AUTO(API_Match
);
75 TESTCASE_AUTO(API_Replace
);
76 TESTCASE_AUTO(API_Pattern
);
// Extended is compiled in only when ICU is built with file I/O support.
77 #if !UCONFIG_NO_FILE_IO
78 TESTCASE_AUTO(Extended
);
80 TESTCASE_AUTO(Errors
);
81 TESTCASE_AUTO(PerlTests
);
82 TESTCASE_AUTO(Callbacks
);
83 TESTCASE_AUTO(FindProgressCallbacks
);
84 TESTCASE_AUTO(Bug6149
);
85 TESTCASE_AUTO(UTextBasic
);
86 TESTCASE_AUTO(API_Match_UTF8
);
87 TESTCASE_AUTO(API_Replace_UTF8
);
88 TESTCASE_AUTO(API_Pattern_UTF8
);
89 TESTCASE_AUTO(PerlTestsUTF8
);
90 TESTCASE_AUTO(PreAllocatedUTextCAPI
);
91 TESTCASE_AUTO(Bug7651
);
92 TESTCASE_AUTO(Bug7740
);
93 TESTCASE_AUTO(Bug8479
);
94 TESTCASE_AUTO(Bug7029
);
95 TESTCASE_AUTO(CheckInvBufSize
);
96 TESTCASE_AUTO(Bug9283
);
97 TESTCASE_AUTO(Bug10459
);
98 TESTCASE_AUTO(TestCaseInsensitiveStarters
);
99 TESTCASE_AUTO(TestBug11049
);
100 TESTCASE_AUTO(TestBug11371
);
101 TESTCASE_AUTO(TestBug11480
);
102 TESTCASE_AUTO(NamedCapture
);
103 TESTCASE_AUTO(NamedCaptureLimits
);
104 TESTCASE_AUTO(TestBug12884
);
105 TESTCASE_AUTO(TestBug13631
);
106 TESTCASE_AUTO(TestBug13632
);
107 TESTCASE_AUTO(TestBug20359
);
108 TESTCASE_AUTO(TestBug20863
);
114 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
116 * @see utext_openUTF8
118 static UText
* regextst_openUTF8FromInvariant(UText
* ut
, const char *inv
, int64_t length
, UErrorCode
*status
);
120 //---------------------------------------------------------------------------
122 // Error Checking / Reporting macros used in all of the tests.
124 //---------------------------------------------------------------------------
// Produces a printable rendering of `text` in `buf` (capacity `bufLen` chars) for
// use in test failure messages. The UText's native index is saved on entry and
// restored on exit, so the caller's iteration position is not disturbed.
//   buf    - destination character buffer
//   bufLen - capacity of buf
//   text   - UText to render; iterated from native index 0
// NOTE(review): the loop guard only checks bufPtr < buf+bufLen, while the
// sprintf("U+%04X") path writes several characters plus a terminator in one step.
// If that path is active it can run past the end of buf -- confirm and bound it.
126 static void utextToPrintable(char *buf
, int32_t bufLen
, UText
*text
) {
127 int64_t oldIndex
= utext_getNativeIndex(text
);
128 utext_setNativeIndex(text
, 0);
130 UChar32 c
= utext_next32From(text
, 0);
131 while ((c
!= U_SENTINEL
) && (bufPtr
< buf
+bufLen
)) {
// Code points in [0x20, 0x7e) are handled separately from everything else.
// NOTE(review): the upper bound is exclusive, so '~' (0x7e) is not in this range.
132 if (0x000020<=c
&& c
<0x00007e) {
136 sprintf(bufPtr
,"U+%04X", c
);
// Step to the last character just written by sprintf.
137 bufPtr
+= strlen(bufPtr
)-1;
143 c
= UTEXT_NEXT32(text
);
// On EBCDIC platforms, pass the buffer through uprv_eastrncpy via a temporary
// copy so that the result is in the platform charset.
146 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
147 char *ebuf
= (char*)malloc(bufLen
);
148 uprv_eastrncpy((unsigned char*)ebuf
, (const unsigned char*)buf
, bufLen
);
149 uprv_strncpy(buf
, ebuf
, bufLen
);
// Restore the caller's iteration position.
152 utext_setNativeIndex(text
, oldIndex
);
156 static char ASSERT_BUF
[1024];
// Renders `message` into the file-static ASSERT_BUF so it can be passed to a
// printf-style "%s" in failure messages.
//   - an empty message yields the marker "[[empty UnicodeString]]"
//   - otherwise the message is passed through IntlTest::prettify(); an empty
//     prettified result yields "[[escape() returned 0 chars]]"
//   - otherwise the prettified text is extracted into ASSERT_BUF; if that leaves
//     the buffer empty, each code unit is appended as a \u escape instead
// ASSERT_BUF is always NUL-terminated in its last element before use.
// Because the buffer is shared and static, the result is only valid until the
// next call.
158 const char* RegexTest::extractToAssertBuf(const UnicodeString
& message
) {
159 if(message
.length()==0) {
160 strcpy(ASSERT_BUF
, "[[empty UnicodeString]]");
163 IntlTest::prettify(message
,buf
);
164 if(buf
.length()==0) {
165 strcpy(ASSERT_BUF
, "[[escape() returned 0 chars]]");
167 buf
.extract(0, 0x7FFFFFFF, ASSERT_BUF
, sizeof(ASSERT_BUF
)-1);
168 if(ASSERT_BUF
[0]==0) {
170 for(int32_t i
=0;i
<buf
.length();i
++) {
// Bounded append: the loop runs once per code unit of `buf`, which can be longer
// than ASSERT_BUF can hold as escapes. snprintf is given the space that remains,
// so a long message is truncated instead of overrunning the static buffer.
172 snprintf(ASSERT_BUF
+strlen(ASSERT_BUF
), sizeof(ASSERT_BUF
)-strlen(ASSERT_BUF
),"\\u%02x",ch
);
// Guarantee termination regardless of which path filled the buffer.
177 ASSERT_BUF
[sizeof(ASSERT_BUF
)-1] = 0;
// REGEX_VERBOSE_TEXT(text): logs a printable rendering of the UText `text`,
// produced by utextToPrintable(), together with the calling file and line.
181 #define REGEX_VERBOSE_TEXT(text) UPRV_BLOCK_MACRO_BEGIN { \
183 utextToPrintable(buf,UPRV_LENGTHOF(buf),text); \
184 logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf); \
185 } UPRV_BLOCK_MACRO_END
// REGEX_CHECK_STATUS: if the in-scope `status` is a failure, reports it through
// dataerrln() with the file, line and ICU error name.
187 #define REGEX_CHECK_STATUS UPRV_BLOCK_MACRO_BEGIN { \
188 if (U_FAILURE(status)) { \
189 dataerrln("%s:%d: RegexTest failure. status=%s", \
190 __FILE__, __LINE__, u_errorName(status)); \
193 } UPRV_BLOCK_MACRO_END
// REGEX_ASSERT(expr): reports a test error, quoting the expression text, when
// `expr` evaluates to FALSE.
195 #define REGEX_ASSERT(expr) UPRV_BLOCK_MACRO_BEGIN { \
196 if ((expr)==FALSE) { \
197 errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr); \
199 } UPRV_BLOCK_MACRO_END
// REGEX_ASSERT_FAIL(expr, errcode): declares a fresh local `status` (shadowing
// any outer one) and reports an error if it does not equal `errcode` afterwards.
// NOTE(review): the line that evaluates `expr` is not visible in this excerpt.
201 #define REGEX_ASSERT_FAIL(expr, errcode) UPRV_BLOCK_MACRO_BEGIN { \
202 UErrorCode status=U_ZERO_ERROR; \
204 if (status!=errcode) { \
205 dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \
206 __LINE__, u_errorName(errcode), u_errorName(status)); \
208 } UPRV_BLOCK_MACRO_END
// REGEX_CHECK_STATUS_L(line): like REGEX_CHECK_STATUS, but also reports the
// caller-supplied `line`; the status is printed as a number here, not as a name.
210 #define REGEX_CHECK_STATUS_L(line) UPRV_BLOCK_MACRO_BEGIN { \
211 if (U_FAILURE(status)) { \
212 errln("RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); \
214 } UPRV_BLOCK_MACRO_END
// REGEX_ASSERT_L(expr, line): like REGEX_ASSERT, but reports the caller-supplied
// `line` instead of the expression text.
216 #define REGEX_ASSERT_L(expr, line) UPRV_BLOCK_MACRO_BEGIN { \
217 if ((expr)==FALSE) { \
218 errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); \
221 } UPRV_BLOCK_MACRO_END
// REGEX_ASSERT_UNISTR(expected, actual): compares an invariant-character C string
// with a UnicodeString and reports both on mismatch.
223 // expected: const char * , restricted to invariant characters.
224 // actual: const UnicodeString &
225 #define REGEX_ASSERT_UNISTR(expected, actual) UPRV_BLOCK_MACRO_BEGIN { \
226 if (UnicodeString(expected, -1, US_INV) != (actual)) { \
227 errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \
228 __FILE__, __LINE__, expected, extractToAssertBuf(actual)); \
230 } UPRV_BLOCK_MACRO_END
// Compares the contents of two UTexts code point by code point.
// Both texts are rewound to native index 0, then read in lock step with
// utext_next32() until `uta` is exhausted (U_SENTINEL).
// Side effect: the iteration position of both UTexts is changed.
// NOTE(review): the per-code-point comparison and the return statements are not
// visible in this excerpt; presumably returns FALSE on the first difference.
233 static UBool
testUTextEqual(UText
*uta
, UText
*utb
) {
236 utext_setNativeIndex(uta
, 0);
237 utext_setNativeIndex(utb
, 0);
239 ca
= utext_next32(uta
);
240 cb
= utext_next32(utb
);
244 } while (ca
!= U_SENTINEL
);
250 * @param expected expected text in UTF-8 (not platform) codepage
// Asserts that the UText `actual` has the same content as the UTF-8 string
// `expected`; reports a test error (tagged with `file`/`line`) otherwise.
//   expected - NUL-terminated UTF-8 text (not platform codepage)
//   actual   - UText under test; its native index is reset to 0
//   file     - source file of the assertion, for messages
//   line     - source line of the assertion, for messages
// Errors are also reported if the expected text cannot be opened as a UText, or
// if it opens with native length 0 although the C string is non-empty.
252 void RegexTest::assertUText(const char *expected
, UText
*actual
, const char *file
, int line
) {
253 UErrorCode status
= U_ZERO_ERROR
;
254 UText expectedText
= UTEXT_INITIALIZER
;
255 utext_openUTF8(&expectedText
, expected
, -1, &status
);
256 if(U_FAILURE(status
)) {
// strlen() returns size_t; cast to int so the argument matches the %d conversion.
257 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file
, line
, u_errorName(status
), (int)strlen(expected
));
260 if(utext_nativeLength(&expectedText
)==0 && (strlen(expected
)!=0)) {
// Same size_t -> int cast as above, to match %d.
261 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file
, line
, (int)strlen(expected
));
264 utext_setNativeIndex(actual
, 0);
265 if (!testUTextEqual(&expectedText
, actual
)) {
// Render both sides printable for the failure message.
266 char buf
[201 /*21*/];
267 char expectedBuf
[201];
268 utextToPrintable(buf
, UPRV_LENGTHOF(buf
), actual
);
269 utextToPrintable(expectedBuf
, UPRV_LENGTHOF(expectedBuf
), &expectedText
);
270 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file
, line
, expectedBuf
, (int)utext_nativeLength(&expectedText
), buf
, (int)utext_nativeLength(actual
));
272 utext_close(&expectedText
);
275 * @param expected invariant (platform local text) input
// Asserts that the UText `actual` has the same content as `expected`, where
// `expected` is written in invariant characters in the platform codepage and is
// opened through regextst_openUTF8FromInvariant().
//   expected - NUL-terminated invariant-character string
//   actual   - UText under test; its native index is reset to 0
//   file     - source file of the assertion, for messages
//   line     - source line of the assertion, for messages
278 void RegexTest::assertUTextInvariant(const char *expected
, UText
*actual
, const char *file
, int line
) {
279 UErrorCode status
= U_ZERO_ERROR
;
280 UText expectedText
= UTEXT_INITIALIZER
;
281 regextst_openUTF8FromInvariant(&expectedText
, expected
, -1, &status
);
282 if(U_FAILURE(status
)) {
// strlen() returns size_t; cast to int so the argument matches the %d conversion.
283 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file
, line
, u_errorName(status
), (int)strlen(expected
));
286 utext_setNativeIndex(actual
, 0);
287 if (!testUTextEqual(&expectedText
, actual
)) {
// Render both sides printable for the failure message.
288 char buf
[201 /*21*/];
289 char expectedBuf
[201];
290 utextToPrintable(buf
, UPRV_LENGTHOF(buf
), actual
);
291 utextToPrintable(expectedBuf
, UPRV_LENGTHOF(expectedBuf
), &expectedText
);
292 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file
, line
, expectedBuf
, (int)utext_nativeLength(&expectedText
), buf
, (int)utext_nativeLength(actual
));
294 utext_close(&expectedText
);
298 * Assumes utf-8 input
300 #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
302 * Assumes Invariant input
304 #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
307 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
308 * passed into utext_openUTF8. An error will be given if
309 * INV_BUFSIZ is too small. It's only used on EBCDIC systems.
312 #define INV_BUFSIZ 2048 /* increase this if too small */
314 static int64_t inv_next
=0;
316 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
317 static char inv_buf
[INV_BUFSIZ
];
// Opens `ut` over the invariant-character string `inv`.
//   ut     - UText to open (passed through to utext_openUTF8)
//   inv    - invariant-character text in the compilation codepage
//   length - length of inv, or -1 to use strlen(inv)
//   status - ICU error code, set to U_MEMORY_ALLOCATION_ERROR when the static
//            conversion buffer is too small
// On ASCII-family platforms the text is handed to utext_openUTF8() unchanged.
// Otherwise it is first converted with uprv_aestrncpy() into the static inv_buf
// at offset inv_next, and the UText is opened over that copy.
// NOTE(review): the lines that advance inv_next are not visible in this excerpt.
320 static UText
* regextst_openUTF8FromInvariant(UText
*ut
, const char *inv
, int64_t length
, UErrorCode
*status
) {
321 if(length
==-1) length
=strlen(inv
);
322 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
324 return utext_openUTF8(ut
, inv
, length
, status
);
326 if(inv_next
+length
+1>INV_BUFSIZ
) {
// inv_next and length are int64_t: print the required size with %lld and an
// explicit long long cast so the argument width matches the conversion.
327 fprintf(stderr
, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %lld.\n",
328 __FILE__
, __LINE__
, INV_BUFSIZ
, (long long)(inv_next
+length
+1));
329 *status
= U_MEMORY_ALLOCATION_ERROR
;
// Convert into the next free region of the static buffer.
333 unsigned char *buf
= (unsigned char*)inv_buf
+inv_next
;
334 uprv_aestrncpy(buf
, (const uint8_t*)inv
, length
);
// Same int64_t -> long long treatment for the usage note.
338 fprintf(stderr
, " Note: INV_BUFSIZ at %d, used=%lld\n", INV_BUFSIZ
, (long long)inv_next
);
341 return utext_openUTF8(ut
, (const char*)buf
, length
, status
);
346 //---------------------------------------------------------------------------
348 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests
349 // for the LookingAt() and Match() functions.
352 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
354 // The expected results are UBool - TRUE or FALSE.
355 // The input text is unescaped. The pattern is not.
358 //---------------------------------------------------------------------------
360 #define REGEX_TESTLM(pat, text, looking, match) UPRV_BLOCK_MACRO_BEGIN { \
361 doRegexLMTest(pat, text, looking, match, __LINE__); \
362 doRegexLMTestUTF8(pat, text, looking, match, __LINE__); \
363 } UPRV_BLOCK_MACRO_END
// Worker for REGEX_TESTLM, UnicodeString flavor.
// Compiles `pat`, unescapes `text`, then checks that lookingAt() and matches()
// on the resulting matcher return `looking` and `match` respectively.
//   pat     - pattern, invariant characters; used as-is (not unescaped)
//   text    - input, invariant characters; unescaped before matching
//   looking - expected result of lookingAt()
//   match   - expected result of matches()
//   line    - caller's source line, used in failure messages
// Each failure is reported via errln()/dataerrln(); the compiled pattern is
// dumped when `retVal` is FALSE.
// NOTE(review): the declarations of `pe`/`retVal`, the early returns and the
// final return/cleanup are not visible in this excerpt.
365 UBool
RegexTest::doRegexLMTest(const char *pat
, const char *text
, UBool looking
, UBool match
, int32_t line
) {
366 const UnicodeString
pattern(pat
, -1, US_INV
);
367 const UnicodeString
inputText(text
, -1, US_INV
);
368 UErrorCode status
= U_ZERO_ERROR
;
370 RegexPattern
*REPattern
= NULL
;
371 RegexMatcher
*REMatcher
= NULL
;
// Compile the pattern from a UnicodeString.
374 UnicodeString
patString(pat
, -1, US_INV
);
375 REPattern
= RegexPattern::compile(patString
, 0, pe
, status
);
376 if (U_FAILURE(status
)) {
377 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
378 line
, u_errorName(status
));
// Debugging hook: dump the compiled pattern for the call made from source line 376.
381 if (line
==376) { REPattern
->dumpPattern();}
// Only the input text goes through unescape().
383 UnicodeString
inputString(inputText
);
384 UnicodeString unEscapedInput
= inputString
.unescape();
385 REMatcher
= REPattern
->matcher(unEscapedInput
, status
);
386 if (U_FAILURE(status
)) {
387 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
388 line
, u_errorName(status
));
// Check lookingAt() against the expectation.
393 actualmatch
= REMatcher
->lookingAt(status
);
394 if (U_FAILURE(status
)) {
395 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
396 line
, u_errorName(status
));
399 if (actualmatch
!= looking
) {
400 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line
);
// Check matches() against the expectation, starting from a clean status.
404 status
= U_ZERO_ERROR
;
405 actualmatch
= REMatcher
->matches(status
);
406 if (U_FAILURE(status
)) {
407 errln("RegexTest failure in matches() at line %d. Status = %s\n",
408 line
, u_errorName(status
));
411 if (actualmatch
!= match
) {
412 errln("RegexTest: wrong return from matches() at line %d.\n", line
);
// On any recorded failure, dump the compiled pattern to aid diagnosis.
416 if (retVal
== FALSE
) {
417 REPattern
->dumpPattern();
// Worker for REGEX_TESTLM, UTF-8 / UText flavor.
// Same checks as doRegexLMTest(), but the pattern is compiled from a UTF-8 UText
// and the matcher is run over a UTF-8 UText built from the unescaped input.
//   pat     - pattern, invariant characters
//   text    - input, invariant characters; unescaped, then converted to UTF-8
//   looking - expected result of lookingAt()
//   match   - expected result of matches()
//   line    - caller's source line, used in failure messages
// If the unescaped input cannot be converted to UTF-8, the case is logged and
// TRUE is returned, because that is not a regex engine failure.
// NOTE(review): the declarations of `pe`/`retVal`, the early returns, the release
// of `textChars`/matcher/pattern and the final return are not visible here.
426 UBool
RegexTest::doRegexLMTestUTF8(const char *pat
, const char *text
, UBool looking
, UBool match
, int32_t line
) {
427 UText pattern
= UTEXT_INITIALIZER
;
428 int32_t inputUTF8Length
;
429 char *textChars
= NULL
;
430 UText inputText
= UTEXT_INITIALIZER
;
431 UErrorCode status
= U_ZERO_ERROR
;
433 RegexPattern
*REPattern
= NULL
;
434 RegexMatcher
*REMatcher
= NULL
;
// Compile the pattern from a UTF-8 UText.
437 regextst_openUTF8FromInvariant(&pattern
, pat
, -1, &status
);
438 REPattern
= RegexPattern::compile(&pattern
, 0, pe
, status
);
439 if (U_FAILURE(status
)) {
440 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
441 line
, u_errorName(status
));
// Unescape the input, then convert it to UTF-8. The STOP callback makes the
// converter fail on unconvertible input rather than substitute.
445 UnicodeString
inputString(text
, -1, US_INV
);
446 UnicodeString unEscapedInput
= inputString
.unescape();
447 LocalUConverterPointer
UTF8Converter(ucnv_open("UTF8", &status
));
448 ucnv_setFromUCallBack(UTF8Converter
.getAlias(), UCNV_FROM_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &status
);
// Preflight with a NULL destination to learn the required UTF-8 length;
// U_BUFFER_OVERFLOW_ERROR is the expected outcome of this call.
450 inputUTF8Length
= unEscapedInput
.extract(NULL
, 0, UTF8Converter
.getAlias(), status
);
451 if (U_FAILURE(status
) && status
!= U_BUFFER_OVERFLOW_ERROR
) {
452 // UTF-8 does not allow unpaired surrogates, so this could actually happen
453 logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line
, u_errorName(status
));
454 return TRUE
; // not a failure of the Regex engine
456 status
= U_ZERO_ERROR
; // buffer overflow
// Real conversion into a heap buffer sized from the preflight (+1 for the NUL).
457 textChars
= new char[inputUTF8Length
+1];
458 unEscapedInput
.extract(textChars
, inputUTF8Length
+1, UTF8Converter
.getAlias(), status
);
459 utext_openUTF8(&inputText
, textChars
, inputUTF8Length
, &status
);
// Create a matcher without input, then point it at the UTF-8 UText.
461 REMatcher
= &REPattern
->matcher(status
)->reset(&inputText
);
462 if (U_FAILURE(status
)) {
463 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
464 line
, u_errorName(status
));
// Check lookingAt() against the expectation.
469 actualmatch
= REMatcher
->lookingAt(status
);
470 if (U_FAILURE(status
)) {
471 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
472 line
, u_errorName(status
));
475 if (actualmatch
!= looking
) {
476 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line
);
// Check matches() against the expectation, starting from a clean status.
480 status
= U_ZERO_ERROR
;
481 actualmatch
= REMatcher
->matches(status
);
482 if (U_FAILURE(status
)) {
483 errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
484 line
, u_errorName(status
));
487 if (actualmatch
!= match
) {
488 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line
);
// On any recorded failure, dump the compiled pattern to aid diagnosis.
492 if (retVal
== FALSE
) {
493 REPattern
->dumpPattern();
// Release the stack-allocated UTexts.
498 utext_close(&inputText
);
499 utext_close(&pattern
);
506 //---------------------------------------------------------------------------
508 // REGEX_ERR Macro + invocation function to simplify writing
509 // tests for incorrect regex patterns
512 // REGEX_ERR("pattern", expected error line, column, expected status);
514 //---------------------------------------------------------------------------
515 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__)
// Worker for REGEX_ERR: checks that compiling `pat` produces the expected status
// and, for a non-zero status, the expected error position.
//   pat            - pattern to compile
//   errLine        - expected UParseError.line
//   errCol         - expected UParseError.offset
//   expectedStatus - expected UErrorCode from RegexPattern::compile()
//   line           - caller's source line, used in failure messages
// The check is done twice: once compiling from a UnicodeString and once from a
// UTF-8 UText opened with regextst_openUTF8FromInvariant().
// NOTE(review): `pattern` below is not referenced in any visible line (patString
// is what gets compiled); it may be an unused local -- confirm.
// NOTE(review): `status` does not appear to be reset between the two passes in
// the visible lines -- confirm against the full source.
517 void RegexTest::regex_err(const char *pat
, int32_t errLine
, int32_t errCol
,
518 UErrorCode expectedStatus
, int32_t line
) {
519 UnicodeString
pattern(pat
);
521 UErrorCode status
= U_ZERO_ERROR
;
523 RegexPattern
*callerPattern
= NULL
;
526 // Compile the caller's pattern
528 UnicodeString
patString(pat
);
529 callerPattern
= RegexPattern::compile(patString
, 0, pe
, status
);
530 if (status
!= expectedStatus
) {
531 dataerrln("Line %d: unexpected error %s compiling pattern.", line
, u_errorName(status
));
// The parse-error position is only checked when compile() reported something.
533 if (status
!= U_ZERO_ERROR
) {
534 if (pe
.line
!= errLine
|| pe
.offset
!= errCol
) {
535 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
536 line
, errLine
, errCol
, pe
.line
, pe
.offset
);
541 delete callerPattern
;
544 // Compile again, using a UTF-8-based UText
546 UText patternText
= UTEXT_INITIALIZER
;
547 regextst_openUTF8FromInvariant(&patternText
, pat
, -1, &status
);
548 callerPattern
= RegexPattern::compile(&patternText
, 0, pe
, status
);
549 if (status
!= expectedStatus
) {
550 dataerrln("Line %d: unexpected error %s compiling pattern.", line
, u_errorName(status
));
// Same position check as for the UnicodeString pass.
552 if (status
!= U_ZERO_ERROR
) {
553 if (pe
.line
!= errLine
|| pe
.offset
!= errCol
) {
554 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
555 line
, errLine
, errCol
, pe
.line
, pe
.offset
);
560 delete callerPattern
;
561 utext_close(&patternText
);
566 //---------------------------------------------------------------------------
568 // Basic Check for basic functionality of regex pattern matching.
569 // Avoid the use of REGEX_FIND test macro, which has
570 // substantial dependencies on basic Regex functionality.
572 //---------------------------------------------------------------------------
// Basic sanity checks of pattern matching, expressed as REGEX_TESTLM cases:
//   REGEX_TESTLM(pattern, input, expected lookingAt(), expected matches())
// Each case is run through both the UnicodeString and the UTF-8 UText workers.
573 void RegexTest::Basic() {
577 // Debug - slide failing test cases early
581 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
// NOTE(review): the statements below print with printf() and never release
// `pattern`/`m`; they look like debugging scaffolding. Whether they are compiled
// out is not visible in this excerpt -- confirm.
583 UErrorCode status
= U_ZERO_ERROR
;
584 RegexPattern
*pattern
;
585 pattern
= RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE
, pe
, status
);
586 pattern
->dumpPattern();
587 RegexMatcher
*m
= pattern
->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status
);
588 UBool result
= m
->find();
589 printf("result = %d\n", result
);
590 // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
591 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
598 // Pattern with parentheses
600 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE
, FALSE
);
601 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE
, TRUE
);
602 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE
, FALSE
);
// '*' applied to a parenthesized group, and to a single char on empty input
607 REGEX_TESTLM("st(abc)*ring", "string", TRUE
, TRUE
);
608 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE
, TRUE
);
609 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE
, TRUE
);
610 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE
, FALSE
);
611 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE
, FALSE
);
613 REGEX_TESTLM("a*", "", TRUE
, TRUE
);
614 REGEX_TESTLM("a*", "b", TRUE
, FALSE
);
// '.' and '.*'
620 REGEX_TESTLM(".", "abc", TRUE
, FALSE
);
621 REGEX_TESTLM("...", "abc", TRUE
, TRUE
);
622 REGEX_TESTLM("....", "abc", FALSE
, FALSE
);
623 REGEX_TESTLM(".*", "abcxyz123", TRUE
, TRUE
);
624 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE
, FALSE
);
625 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE
, TRUE
);
626 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE
, TRUE
);
627 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE
, FALSE
);
630 // Patterns with * applied to chars at end of literal string
632 REGEX_TESTLM("abc*", "ab", TRUE
, TRUE
);
633 REGEX_TESTLM("abc*", "abccccc", TRUE
, TRUE
);
636 // Supplemental chars match as single chars, not a pair of surrogates.
638 REGEX_TESTLM(".", "\\U00011000", TRUE
, TRUE
);
639 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE
, TRUE
);
640 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE
, FALSE
);
644 // UnicodeSets in the pattern
646 REGEX_TESTLM("[1-6]", "1", TRUE
, TRUE
);
647 REGEX_TESTLM("[1-6]", "3", TRUE
, TRUE
);
648 REGEX_TESTLM("[1-6]", "7", FALSE
, FALSE
);
649 REGEX_TESTLM("a[1-6]", "a3", TRUE
, TRUE
);
// NOTE(review): identical to the preceding case.
650 REGEX_TESTLM("a[1-6]", "a3", TRUE
, TRUE
);
651 REGEX_TESTLM("a[1-6]b", "a3b", TRUE
, TRUE
);
653 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE
, TRUE
);
654 REGEX_TESTLM("a[0-9]*b", "abc", TRUE
, FALSE
);
655 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE
, TRUE
);
656 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE
, FALSE
); // note that * matches 0 occurrences.
657 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE
, TRUE
);
660 // OR operator in patterns
662 REGEX_TESTLM("(a|b)", "a", TRUE
, TRUE
);
663 REGEX_TESTLM("(a|b)", "b", TRUE
, TRUE
);
664 REGEX_TESTLM("(a|b)", "c", FALSE
, FALSE
);
665 REGEX_TESTLM("a|b", "b", TRUE
, TRUE
);
667 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE
, TRUE
);
668 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE
, FALSE
);
669 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE
, TRUE
);
670 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE
, TRUE
);
671 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE
, TRUE
);
672 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE
, FALSE
);
// '+' quantifier
677 REGEX_TESTLM("ab+", "abbc", TRUE
, FALSE
);
678 REGEX_TESTLM("ab+c", "ac", FALSE
, FALSE
);
679 REGEX_TESTLM("b+", "", FALSE
, FALSE
);
680 REGEX_TESTLM("(abc|def)+", "defabc", TRUE
, TRUE
);
681 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE
, FALSE
);
682 REGEX_TESTLM(".+y", "zippity dooy", TRUE
, TRUE
);
// '?' quantifier
687 REGEX_TESTLM("ab?", "ab", TRUE
, TRUE
);
688 REGEX_TESTLM("ab?", "a", TRUE
, TRUE
);
689 REGEX_TESTLM("ab?", "ac", TRUE
, FALSE
);
690 REGEX_TESTLM("ab?", "abb", TRUE
, FALSE
);
691 REGEX_TESTLM("a(b|c)?d", "abd", TRUE
, TRUE
);
692 REGEX_TESTLM("a(b|c)?d", "acd", TRUE
, TRUE
);
693 REGEX_TESTLM("a(b|c)?d", "ad", TRUE
, TRUE
);
694 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE
, FALSE
);
695 REGEX_TESTLM("a(b|c)?d", "ab", FALSE
, FALSE
);
698 // Escape sequences that become single literal chars, handled internally
699 // by ICU's Unescape.
702 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
703 REGEX_TESTLM("\\a", "\\u0007", TRUE
, TRUE
); // BEL
704 REGEX_TESTLM("\\cL", "\\u000c", TRUE
, TRUE
); // Control-L
705 REGEX_TESTLM("\\e", "\\u001b", TRUE
, TRUE
); // Escape
706 REGEX_TESTLM("\\f", "\\u000c", TRUE
, TRUE
); // Form Feed
707 REGEX_TESTLM("\\n", "\\u000a", TRUE
, TRUE
); // new line
708 REGEX_TESTLM("\\r", "\\u000d", TRUE
, TRUE
); // CR
709 REGEX_TESTLM("\\t", "\\u0009", TRUE
, TRUE
); // Tab
710 REGEX_TESTLM("\\u1234", "\\u1234", TRUE
, TRUE
);
711 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE
, TRUE
);
713 REGEX_TESTLM(".*\\Ax", "xyz", TRUE
, FALSE
); // \A matches only at the beginning of input
714 REGEX_TESTLM(".*\\Ax", " xyz", FALSE
, FALSE
); // \A matches only at the beginning of input
716 // Escape of special chars in patterns
717 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE
, TRUE
);
721 //---------------------------------------------------------------------------
723 // UTextBasic Check for quirks that are specific to the UText
726 //---------------------------------------------------------------------------
// Checks UText-specific behavior of RegexMatcher::reset(UText*) and inputText().
// The text "abc" is spelled as explicit byte values so that it is UTF-8
// regardless of the compilation codepage.
727 void RegexTest::UTextBasic() {
728 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
729 UErrorCode status
= U_ZERO_ERROR
;
// Build a matcher directly from a UTF-8 UText pattern.
730 UText pattern
= UTEXT_INITIALIZER
;
731 utext_openUTF8(&pattern
, str_abc
, -1, &status
);
732 RegexMatcher
matcher(&pattern
, 0, status
);
// Give the matcher a UText input and confirm inputText() reflects it.
735 UText input
= UTEXT_INITIALIZER
;
736 utext_openUTF8(&input
, str_abc
, -1, &status
);
738 matcher
.reset(&input
);
740 REGEX_ASSERT_UTEXT_UTF8(str_abc
, matcher
.inputText());
// Resetting the matcher with its own inputText() must leave the input intact.
742 matcher
.reset(matcher
.inputText());
744 REGEX_ASSERT_UTEXT_UTF8(str_abc
, matcher
.inputText());
746 utext_close(&pattern
);
751 //---------------------------------------------------------------------------
753 // API_Match Test that the API for class RegexMatcher
754 // is present and nominally working, but excluding functions
755 // implementing replace operations.
757 //---------------------------------------------------------------------------
758 void RegexTest::API_Match() {
760 UErrorCode status
=U_ZERO_ERROR
;
764 // Debug - slide failing test cases early
773 // Simple pattern compilation
776 UnicodeString
re("abc");
778 pat2
= RegexPattern::compile(re
, flags
, pe
, status
);
781 UnicodeString inStr1
= "abcdef this is a test";
782 UnicodeString instr2
= "not abc";
783 UnicodeString empty
= "";
787 // Matcher creation and reset.
789 RegexMatcher
*m1
= pat2
->matcher(inStr1
, status
);
791 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
792 REGEX_ASSERT(m1
->input() == inStr1
);
794 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
795 REGEX_ASSERT(m1
->input() == instr2
);
797 REGEX_ASSERT(m1
->input() == inStr1
);
798 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
800 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
801 REGEX_ASSERT(m1
->input() == empty
);
802 REGEX_ASSERT(&m1
->pattern() == pat2
);
805 // reset(pos, status)
808 m1
->reset(4, status
);
810 REGEX_ASSERT(m1
->input() == inStr1
);
811 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
813 m1
->reset(-1, status
);
814 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
815 status
= U_ZERO_ERROR
;
817 m1
->reset(0, status
);
819 status
= U_ZERO_ERROR
;
821 int32_t len
= m1
->input().length();
822 m1
->reset(len
-1, status
);
824 status
= U_ZERO_ERROR
;
826 m1
->reset(len
, status
);
828 status
= U_ZERO_ERROR
;
830 m1
->reset(len
+1, status
);
831 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
832 status
= U_ZERO_ERROR
;
835 // match(pos, status)
838 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
840 REGEX_ASSERT(m1
->matches(3, status
) == FALSE
);
842 REGEX_ASSERT(m1
->matches(5, status
) == FALSE
);
843 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
844 REGEX_ASSERT(m1
->matches(-1, status
) == FALSE
);
845 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
847 // Match() at end of string should fail, but should not
849 status
= U_ZERO_ERROR
;
850 len
= m1
->input().length();
851 REGEX_ASSERT(m1
->matches(len
, status
) == FALSE
);
854 // Match beyond end of string should fail with an error.
855 status
= U_ZERO_ERROR
;
856 REGEX_ASSERT(m1
->matches(len
+1, status
) == FALSE
);
857 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
859 // Successful match at end of string.
861 status
= U_ZERO_ERROR
;
862 RegexMatcher
m("A?", 0, status
); // will match zero length string.
865 len
= inStr1
.length();
866 REGEX_ASSERT(m
.matches(len
, status
) == TRUE
);
869 REGEX_ASSERT(m
.matches(0, status
) == TRUE
);
875 // lookingAt(pos, status)
877 status
= U_ZERO_ERROR
;
878 m1
->reset(instr2
); // "not abc"
879 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
880 REGEX_ASSERT(m1
->lookingAt(5, status
) == FALSE
);
881 REGEX_ASSERT(m1
->lookingAt(3, status
) == FALSE
);
882 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
883 REGEX_ASSERT(m1
->lookingAt(-1, status
) == FALSE
);
884 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
885 status
= U_ZERO_ERROR
;
886 len
= m1
->input().length();
887 REGEX_ASSERT(m1
->lookingAt(len
, status
) == FALSE
);
889 REGEX_ASSERT(m1
->lookingAt(len
+1, status
) == FALSE
);
890 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
899 // RegexMatcher::start();
900 // RegexMatcher::end();
901 // RegexMatcher::groupCount();
906 UErrorCode status
=U_ZERO_ERROR
;
908 UnicodeString
re("01(23(45)67)(.*)");
909 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
911 UnicodeString data
= "0123456789";
913 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
915 REGEX_ASSERT(matcher
->lookingAt(status
) == TRUE
);
916 static const int32_t matchStarts
[] = {0, 2, 4, 8};
917 static const int32_t matchEnds
[] = {10, 8, 6, 10};
919 for (i
=0; i
<4; i
++) {
920 int32_t actualStart
= matcher
->start(i
, status
);
922 if (actualStart
!= matchStarts
[i
]) {
923 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
924 __LINE__
, i
, matchStarts
[i
], actualStart
);
926 int32_t actualEnd
= matcher
->end(i
, status
);
928 if (actualEnd
!= matchEnds
[i
]) {
929 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
930 __LINE__
, i
, matchEnds
[i
], actualEnd
);
934 REGEX_ASSERT(matcher
->start(0, status
) == matcher
->start(status
));
935 REGEX_ASSERT(matcher
->end(0, status
) == matcher
->end(status
));
937 REGEX_ASSERT_FAIL(matcher
->start(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
938 REGEX_ASSERT_FAIL(matcher
->start( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
940 REGEX_ASSERT_FAIL(matcher
->start( 0, status
), U_REGEX_INVALID_STATE
);
942 matcher
->lookingAt(status
);
943 REGEX_ASSERT(matcher
->group(status
) == "0123456789");
944 REGEX_ASSERT(matcher
->group(0, status
) == "0123456789");
945 REGEX_ASSERT(matcher
->group(1, status
) == "234567" );
946 REGEX_ASSERT(matcher
->group(2, status
) == "45" );
947 REGEX_ASSERT(matcher
->group(3, status
) == "89" );
949 REGEX_ASSERT_FAIL(matcher
->group(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
950 REGEX_ASSERT_FAIL(matcher
->group( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
952 REGEX_ASSERT_FAIL(matcher
->group( 0, status
), U_REGEX_INVALID_STATE
);
965 UErrorCode status
=U_ZERO_ERROR
;
967 UnicodeString
re("abc");
968 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
970 UnicodeString data
= ".abc..abc...abc..";
971 // 012345678901234567
973 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
975 REGEX_ASSERT(matcher
->find());
976 REGEX_ASSERT(matcher
->start(status
) == 1);
977 REGEX_ASSERT(matcher
->find());
978 REGEX_ASSERT(matcher
->start(status
) == 6);
979 REGEX_ASSERT(matcher
->find());
980 REGEX_ASSERT(matcher
->start(status
) == 12);
981 REGEX_ASSERT(matcher
->find() == FALSE
);
982 REGEX_ASSERT(matcher
->find() == FALSE
);
985 REGEX_ASSERT(matcher
->find());
986 REGEX_ASSERT(matcher
->start(status
) == 1);
988 REGEX_ASSERT(matcher
->find(0, status
));
989 REGEX_ASSERT(matcher
->start(status
) == 1);
990 REGEX_ASSERT(matcher
->find(1, status
));
991 REGEX_ASSERT(matcher
->start(status
) == 1);
992 REGEX_ASSERT(matcher
->find(2, status
));
993 REGEX_ASSERT(matcher
->start(status
) == 6);
994 REGEX_ASSERT(matcher
->find(12, status
));
995 REGEX_ASSERT(matcher
->start(status
) == 12);
996 REGEX_ASSERT(matcher
->find(13, status
) == FALSE
);
997 REGEX_ASSERT(matcher
->find(16, status
) == FALSE
);
998 REGEX_ASSERT(matcher
->find(17, status
) == FALSE
);
999 REGEX_ASSERT_FAIL(matcher
->start(status
), U_REGEX_INVALID_STATE
);
1001 status
= U_ZERO_ERROR
;
1002 REGEX_ASSERT_FAIL(matcher
->find(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1003 status
= U_ZERO_ERROR
;
1004 REGEX_ASSERT_FAIL(matcher
->find(18, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1006 REGEX_ASSERT(matcher
->groupCount() == 0);
1014 // find, with \G in pattern (true if at the end of a previous match).
1019 UErrorCode status
=U_ZERO_ERROR
;
1021 UnicodeString
re(".*?(?:(\\Gabc)|(abc))", -1, US_INV
);
1022 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
1024 UnicodeString data
= ".abcabc.abc..";
1025 // 012345678901234567
1027 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
1029 REGEX_ASSERT(matcher
->find());
1030 REGEX_ASSERT(matcher
->start(status
) == 0);
1031 REGEX_ASSERT(matcher
->start(1, status
) == -1);
1032 REGEX_ASSERT(matcher
->start(2, status
) == 1);
1034 REGEX_ASSERT(matcher
->find());
1035 REGEX_ASSERT(matcher
->start(status
) == 4);
1036 REGEX_ASSERT(matcher
->start(1, status
) == 4);
1037 REGEX_ASSERT(matcher
->start(2, status
) == -1);
1045 // find with zero length matches, match position should bump ahead
1046 // to prevent loops.
1050 UErrorCode status
=U_ZERO_ERROR
;
1051 RegexMatcher
m("(?= ?)", 0, status
); // This pattern will produce zero-length matches anywhere,
1052 // using an always-true look-ahead.
1054 UnicodeString
s(" ");
1057 if (m
.find() == FALSE
) {
1060 REGEX_ASSERT(m
.start(status
) == i
);
1061 REGEX_ASSERT(m
.end(status
) == i
);
1065 // Check that the bump goes over surrogate pairs OK
1066 s
= UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1070 if (m
.find() == FALSE
) {
1073 REGEX_ASSERT(m
.start(status
) == i
);
1074 REGEX_ASSERT(m
.end(status
) == i
);
1076 REGEX_ASSERT(i
==10);
1079 // find() loop breaking test.
1080 // with pattern of /.?/, should see a series of one char matches, then a single
1081 // match of zero length at the end of the input string.
1083 UErrorCode status
=U_ZERO_ERROR
;
1084 RegexMatcher
m(".?", 0, status
);
1086 UnicodeString
s(" ");
1089 if (m
.find() == FALSE
) {
1092 REGEX_ASSERT(m
.start(status
) == i
);
1093 REGEX_ASSERT(m
.end(status
) == (i
<4 ? i
+1 : i
));
1100 // Matchers with no input string behave as if they had an empty input string.
1104 UErrorCode status
= U_ZERO_ERROR
;
1105 RegexMatcher
m(".?", 0, status
);
1107 REGEX_ASSERT(m
.find());
1108 REGEX_ASSERT(m
.start(status
) == 0);
1109 REGEX_ASSERT(m
.input() == "");
1112 UErrorCode status
= U_ZERO_ERROR
;
1113 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
1114 RegexMatcher
*m
= p
->matcher(status
);
1117 REGEX_ASSERT(m
->find() == FALSE
);
1118 REGEX_ASSERT(m
->input() == "");
1127 UErrorCode status
= U_ZERO_ERROR
;
1128 UnicodeString
testString("This is test data");
1129 RegexMatcher
m(".*", testString
, 0, status
);
1131 REGEX_ASSERT(m
.regionStart() == 0);
1132 REGEX_ASSERT(m
.regionEnd() == testString
.length());
1133 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1134 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1136 m
.region(2,4, status
);
1138 REGEX_ASSERT(m
.matches(status
));
1139 REGEX_ASSERT(m
.start(status
)==2);
1140 REGEX_ASSERT(m
.end(status
)==4);
1144 REGEX_ASSERT(m
.regionStart() == 0);
1145 REGEX_ASSERT(m
.regionEnd() == testString
.length());
1147 UnicodeString
shorterString("short");
1148 m
.reset(shorterString
);
1149 REGEX_ASSERT(m
.regionStart() == 0);
1150 REGEX_ASSERT(m
.regionEnd() == shorterString
.length());
1152 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1153 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(FALSE
));
1154 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
1155 REGEX_ASSERT(&m
== &m
.reset());
1156 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
1158 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(TRUE
));
1159 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1160 REGEX_ASSERT(&m
== &m
.reset());
1161 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1163 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1164 REGEX_ASSERT(&m
== &m
.useTransparentBounds(TRUE
));
1165 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
1166 REGEX_ASSERT(&m
== &m
.reset());
1167 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
1169 REGEX_ASSERT(&m
== &m
.useTransparentBounds(FALSE
));
1170 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1171 REGEX_ASSERT(&m
== &m
.reset());
1172 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1177 // hitEnd() and requireEnd()
1180 UErrorCode status
= U_ZERO_ERROR
;
1181 UnicodeString
testString("aabb");
1182 RegexMatcher
m1(".*", testString
, 0, status
);
1183 REGEX_ASSERT(m1
.lookingAt(status
) == TRUE
);
1184 REGEX_ASSERT(m1
.hitEnd() == TRUE
);
1185 REGEX_ASSERT(m1
.requireEnd() == FALSE
);
1188 status
= U_ZERO_ERROR
;
1189 RegexMatcher
m2("a*", testString
, 0, status
);
1190 REGEX_ASSERT(m2
.lookingAt(status
) == TRUE
);
1191 REGEX_ASSERT(m2
.hitEnd() == FALSE
);
1192 REGEX_ASSERT(m2
.requireEnd() == FALSE
);
1195 status
= U_ZERO_ERROR
;
1196 RegexMatcher
m3(".*$", testString
, 0, status
);
1197 REGEX_ASSERT(m3
.lookingAt(status
) == TRUE
);
1198 REGEX_ASSERT(m3
.hitEnd() == TRUE
);
1199 REGEX_ASSERT(m3
.requireEnd() == TRUE
);
1205 // Compilation error on reset with UChar *
1206 // These were a hazard that people were stumbling over with runtime errors.
1207 // Changed them to compiler errors by adding private methods that more closely
1208 // matched the incorrect use of the functions.
1212 UErrorCode status
= U_ZERO_ERROR
;
1213 UChar ucharString
[20];
1214 RegexMatcher
m(".", 0, status
);
1215 m
.reset(ucharString
); // should not compile.
1217 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
1218 RegexMatcher
*m2
= p
->matcher(ucharString
, status
); // should not compile.
1220 RegexMatcher
m3(".", ucharString
, 0, status
); // Should not compile
1226 // Note: These tests will need to be changed when the regexp engine is
1227 // able to detect and cut short the exponential time behavior on
1228 // this type of match.
1231 UErrorCode status
= U_ZERO_ERROR
;
1232 // Enough 'a's in the string to cause the match to time out.
1233 // (Each additional 'a' doubles the time)
1234 UnicodeString
testString("aaaaaaaaaaaaaaaaaaaaa");
1235 RegexMatcher
matcher("(a+)+b", testString
, 0, status
);
1237 REGEX_ASSERT(matcher
.getTimeLimit() == 0);
1238 matcher
.setTimeLimit(100, status
);
1239 REGEX_ASSERT(matcher
.getTimeLimit() == 100);
1240 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1241 REGEX_ASSERT(status
== U_REGEX_TIME_OUT
);
1244 UErrorCode status
= U_ZERO_ERROR
;
1245 // Few enough 'a's to slip in under the time limit.
1246 UnicodeString
testString("aaaaaaaaaaaaaaaaaa");
1247 RegexMatcher
matcher("(a+)+b", testString
, 0, status
);
1249 matcher
.setTimeLimit(100, status
);
1250 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1258 UErrorCode status
= U_ZERO_ERROR
;
1259 UnicodeString
testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
1261 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1262 // of the '+', and makes the stack frames larger.
1263 RegexMatcher
matcher("(A)+A$", testString
, 0, status
);
1265 // With the default stack, this match should fail to run
1266 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1267 REGEX_ASSERT(status
== U_REGEX_STACK_OVERFLOW
);
1269 // With unlimited stack, it should run
1270 status
= U_ZERO_ERROR
;
1271 matcher
.setStackLimit(0, status
);
1273 REGEX_ASSERT(matcher
.lookingAt(status
) == TRUE
);
1275 REGEX_ASSERT(matcher
.getStackLimit() == 0);
1277 // With a limited stack, the match should fail
1278 status
= U_ZERO_ERROR
;
1279 matcher
.setStackLimit(10000, status
);
1280 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1281 REGEX_ASSERT(status
== U_REGEX_STACK_OVERFLOW
);
1282 REGEX_ASSERT(matcher
.getStackLimit() == 10000);
1285 // A pattern that doesn't save state should work with
1286 // a minimal sized stack
1288 UErrorCode status
= U_ZERO_ERROR
;
1289 UnicodeString testString
= "abc";
1290 RegexMatcher
matcher("abc", testString
, 0, status
);
1292 matcher
.setStackLimit(30, status
);
1294 REGEX_ASSERT(matcher
.matches(status
) == TRUE
);
1296 REGEX_ASSERT(matcher
.getStackLimit() == 30);
1298 // Negative stack sizes should fail
1299 status
= U_ZERO_ERROR
;
1300 matcher
.setStackLimit(1000, status
);
1302 matcher
.setStackLimit(-1, status
);
1303 REGEX_ASSERT(status
== U_ILLEGAL_ARGUMENT_ERROR
);
1304 REGEX_ASSERT(matcher
.getStackLimit() == 1000);
1315 //---------------------------------------------------------------------------
1317 // API_Replace API test for class RegexMatcher, testing the
1318 // Replace family of functions.
//
// Exercises RegexMatcher::replaceFirst() and replaceAll() (plain matches,
// non-matches, empty source string, empty replacement string, whole-string
// match, capture-group references), replacement text containing \u and \U
// escapes, and appendReplacement() / appendTail().
//
// NOTE(review): `flags`, `pe` and `dest` are used below but their
// declarations are not visible in this excerpt.
1320 //---------------------------------------------------------------------------
1321 void RegexTest::API_Replace() {
1327 UErrorCode status
=U_ZERO_ERROR
;
1329 UnicodeString
re("abc");
1330 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
1332 UnicodeString data
= ".abc..abc...abc..";
1333 // 012345678901234567
1334 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
1337 // Plain vanilla matches.
1340 dest
= matcher
->replaceFirst("yz", status
);
1342 REGEX_ASSERT(dest
== ".yz..abc...abc..");
1344 dest
= matcher
->replaceAll("yz", status
);
1346 REGEX_ASSERT(dest
== ".yz..yz...yz..");
1349 // Plain vanilla non-matches.
// The expected result is the input text, unchanged.
1351 UnicodeString d2
= ".abx..abx...abx..";
1353 dest
= matcher
->replaceFirst("yz", status
);
1355 REGEX_ASSERT(dest
== ".abx..abx...abx..");
1357 dest
= matcher
->replaceAll("yz", status
);
1359 REGEX_ASSERT(dest
== ".abx..abx...abx..");
1362 // Empty source string
1364 UnicodeString d3
= "";
1366 dest
= matcher
->replaceFirst("yz", status
);
1368 REGEX_ASSERT(dest
== "");
1370 dest
= matcher
->replaceAll("yz", status
);
1372 REGEX_ASSERT(dest
== "");
1375 // Empty substitution string
// Each replaced match is simply removed from the result.
1377 matcher
->reset(data
); // ".abc..abc...abc.."
1378 dest
= matcher
->replaceFirst("", status
);
1380 REGEX_ASSERT(dest
== "...abc...abc..");
1382 dest
= matcher
->replaceAll("", status
);
1384 REGEX_ASSERT(dest
== "........");
1387 // match whole string
1389 UnicodeString d4
= "abc";
1391 dest
= matcher
->replaceFirst("xyz", status
);
1393 REGEX_ASSERT(dest
== "xyz");
1395 dest
= matcher
->replaceAll("xyz", status
);
1397 REGEX_ASSERT(dest
== "xyz");
1400 // Capture Group, simple case
1402 UnicodeString
re2("a(..)");
1403 RegexPattern
*pat2
= RegexPattern::compile(re2
, flags
, pe
, status
);
1405 UnicodeString d5
= "abcdefg";
1406 RegexMatcher
*matcher2
= pat2
->matcher(d5
, status
);
// "$1" in the replacement text stands for the text of capture group 1
// ("bc" for this input).
1408 dest
= matcher2
->replaceFirst("$1$1", status
);
1410 REGEX_ASSERT(dest
== "bcbcdefg");
// A backslash-escaped '$' is expected to appear literally in the output.
1412 dest
= matcher2
->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status
);
1414 REGEX_ASSERT(dest
== "The value of $1 is bc.defg");
// A '$' that is not followed by a group reference is expected to leave a
// failure code in status; status is cleared again afterwards.
1416 dest
= matcher2
->replaceFirst("$ by itself, no group number $$$", status
);
1417 REGEX_ASSERT(U_FAILURE(status
));
1418 status
= U_ZERO_ERROR
;
// '$' followed by the supplementary-plane character U+1D7CF is expected to
// be substituted like a reference to group 1 (the result contains "bc").
1420 UnicodeString replacement
= UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1421 replacement
= replacement
.unescape();
1422 dest
= matcher2
->replaceFirst(replacement
, status
);
1424 REGEX_ASSERT(dest
== "Supplemental Digit 1 bc.defg");
// The pattern has a single capture group, so "$5" is out of range.
1426 REGEX_ASSERT_FAIL(matcher2
->replaceFirst("bad capture group number $5...",status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1430 // Replacement String with \u hex escapes
1433 UnicodeString src
= "abc 1 abc 2 abc 3";
1434 UnicodeString substitute
= UNICODE_STRING_SIMPLE("--\\u0043--");
1435 matcher
->reset(src
);
1436 UnicodeString result
= matcher
->replaceAll(substitute
, status
);
1438 REGEX_ASSERT(result
== "--C-- 1 --C-- 2 --C-- 3");
// Same check with an eight-digit \U escape for a supplementary code point.
1441 UnicodeString src
= "abc !";
1442 UnicodeString substitute
= UNICODE_STRING_SIMPLE("--\\U00010000--");
1443 matcher
->reset(src
);
1444 UnicodeString result
= matcher
->replaceAll(substitute
, status
);
1446 UnicodeString expected
= UnicodeString("--");
1447 expected
.append((UChar32
)0x10000);
1448 expected
.append("-- !");
1449 REGEX_ASSERT(result
== expected
);
1451 // TODO: need more thorough testing of capture substitutions.
// appendReplacement() / appendTail().
// NOTE(review): the find() / reset of `result` calls that set up each of the
// three cases below are not visible in this excerpt.
1456 status
= U_ZERO_ERROR
;
1457 UnicodeString s
= "The matches start with ss and end with ee ss stuff ee fin";
1458 RegexMatcher
m("ss(.*?)ee", 0, status
);
1460 UnicodeString result
;
1462 // Multiple finds do NOT bump up the previous appendReplacement position.
1466 m
.appendReplacement(result
, "ooh", status
);
1468 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
1470 // After a reset into the interior of a string, appendReplacement still starts at beginning.
1471 status
= U_ZERO_ERROR
;
1473 m
.reset(10, status
);
1476 m
.appendReplacement(result
, "ooh", status
);
1478 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
1480 // find() at interior of string, appendReplacement still starts at beginning.
1481 status
= U_ZERO_ERROR
;
1486 m
.appendReplacement(result
, "ooh", status
);
1488 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
// appendTail() adds the input text that follows the last match (" fin").
1490 m
.appendTail(result
);
1491 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh fin");
1502 //---------------------------------------------------------------------------
1504 // API_Pattern Test that the API for class RegexPattern is
1505 // present and nominally working.
//
// Covers default construction, operator== / operator!=, copy construction,
// clone(), flags(), the static matches() convenience functions, split(),
// pattern(), and the class ID functions.
//
// NOTE(review): `patb`, `pe`, `n` and several assignments (for example to
// `patb` and to elements of `fields`) are used or relied on below, but the
// lines that declare or perform them are not visible in this excerpt.
1507 //---------------------------------------------------------------------------
1508 void RegexTest::API_Pattern() {
1509 RegexPattern pata
; // Test default constructor to not crash.
1512 REGEX_ASSERT(pata
== patb
);
1513 REGEX_ASSERT(pata
== pata
);
1515 UnicodeString
re1("abc[a-l][m-z]");
1516 UnicodeString
re2("def");
1517 UErrorCode status
= U_ZERO_ERROR
;
1520 RegexPattern
*pat1
= RegexPattern::compile(re1
, 0, pe
, status
);
1521 RegexPattern
*pat2
= RegexPattern::compile(re2
, 0, pe
, status
);
// A compiled pattern equals itself and differs from a default-constructed one.
1523 REGEX_ASSERT(*pat1
== *pat1
);
1524 REGEX_ASSERT(*pat1
!= pata
);
1528 REGEX_ASSERT(patb
== *pat1
);
// Copy constructor.
1531 RegexPattern
patc(*pat1
);
1532 REGEX_ASSERT(patc
== *pat1
);
1533 REGEX_ASSERT(patb
== patc
);
// NOTE(review): the next assertion compares the two pointers themselves,
// not the patterns they point to.
1534 REGEX_ASSERT(pat1
!= pat2
);
1536 REGEX_ASSERT(patb
!= patc
);
1537 REGEX_ASSERT(patb
== *pat2
);
1539 // Compile with no flags.
1540 RegexPattern
*pat1a
= RegexPattern::compile(re1
, pe
, status
);
1541 REGEX_ASSERT(*pat1a
== *pat1
);
1543 REGEX_ASSERT(pat1a
->flags() == 0);
1545 // Compile with different flags should be not equal
1546 RegexPattern
*pat1b
= RegexPattern::compile(re1
, UREGEX_CASE_INSENSITIVE
, pe
, status
);
1549 REGEX_ASSERT(*pat1b
!= *pat1a
);
1550 REGEX_ASSERT(pat1b
->flags() == UREGEX_CASE_INSENSITIVE
);
1551 REGEX_ASSERT(pat1a
->flags() == 0);
// clone(): the copy compares equal to its source and unequal to a different pattern.
1555 RegexPattern
*pat1c
= pat1
->clone();
1556 REGEX_ASSERT(*pat1c
== *pat1
);
1557 REGEX_ASSERT(*pat1c
!= *pat2
);
1566 // Verify that a matcher created from a cloned pattern works.
1570 UErrorCode status
= U_ZERO_ERROR
;
1571 RegexPattern
*pSource
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status
);
1572 RegexPattern
*pClone
= pSource
->clone();
1574 RegexMatcher
*mFromClone
= pClone
->matcher(status
);
1576 UnicodeString s
= "Hello World";
1577 mFromClone
->reset(s
);
1578 REGEX_ASSERT(mFromClone
->find() == TRUE
);
1579 REGEX_ASSERT(mFromClone
->group(status
) == "Hello");
1580 REGEX_ASSERT(mFromClone
->find() == TRUE
);
1581 REGEX_ASSERT(mFromClone
->group(status
) == "World");
1582 REGEX_ASSERT(mFromClone
->find() == FALSE
);
1588 // matches convenience API
// The expectations below show that the pattern must match the entire input
// (".*u" and "abc" both fail against "random input").
1590 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe
, status
) == TRUE
);
1592 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe
, status
) == FALSE
);
1594 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe
, status
) == TRUE
);
1596 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe
, status
) == TRUE
);
1598 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe
, status
) == FALSE
);
// With a failure code already in status on entry, matches() is expected to
// return FALSE and to leave status as it was.
1600 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1601 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe
, status
) == FALSE
);
1602 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
// split(): fields are delimited by matches of the pattern " +".
1608 status
= U_ZERO_ERROR
;
1609 pat1
= RegexPattern::compile(" +", pe
, status
);
1611 UnicodeString fields
[10];
1614 n
= pat1
->split("Now is the time", fields
, 10, status
);
1617 REGEX_ASSERT(fields
[0]=="Now");
1618 REGEX_ASSERT(fields
[1]=="is");
1619 REGEX_ASSERT(fields
[2]=="the");
1620 REGEX_ASSERT(fields
[3]=="time");
1621 REGEX_ASSERT(fields
[4]=="");
// Destination capacity of 2: the last available field receives all of the
// remaining, unsplit input.
1623 n
= pat1
->split("Now is the time", fields
, 2, status
);
1626 REGEX_ASSERT(fields
[0]=="Now");
1627 REGEX_ASSERT(fields
[1]=="is the time");
1628 REGEX_ASSERT(fields
[2]=="the"); // left over from previous test
// Destination capacity of 1: the whole input lands in fields[0].
// NOTE(review): fields[1]=="*" presumably relies on an assignment made on a
// line not visible here - confirm against the full file.
1631 status
= U_ZERO_ERROR
;
1632 n
= pat1
->split("Now is the time", fields
, 1, status
);
1635 REGEX_ASSERT(fields
[0]=="Now is the time");
1636 REGEX_ASSERT(fields
[1]=="*");
1637 status
= U_ZERO_ERROR
;
// Input that begins and ends with a delimiter: an empty field is expected at
// the start, and the field after the last word is empty.
1639 n
= pat1
->split(" Now is the time ", fields
, 10, status
);
1642 REGEX_ASSERT(fields
[0]=="");
1643 REGEX_ASSERT(fields
[1]=="Now");
1644 REGEX_ASSERT(fields
[2]=="is");
1645 REGEX_ASSERT(fields
[3]=="the");
1646 REGEX_ASSERT(fields
[4]=="time");
1647 REGEX_ASSERT(fields
[5]=="");
// Input consisting only of a delimiter.
1649 n
= pat1
->split(" ", fields
, 10, status
);
1652 REGEX_ASSERT(fields
[0]=="");
1653 REGEX_ASSERT(fields
[1]=="");
// Empty input.
// NOTE(review): fields[0]=="foo" presumably means the element was preset to
// "foo" on a line not visible here and is expected to stay untouched - confirm.
1656 n
= pat1
->split("", fields
, 10, status
);
1659 REGEX_ASSERT(fields
[0]=="foo");
1663 // split, with a pattern with (capture)
// The text matched by the capture group is itself emitted as a field, between
// the fields on either side of the delimiter.
1664 pat1
= RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe
, status
);
1667 status
= U_ZERO_ERROR
;
1668 n
= pat1
->split("<a>Now is <b>the time<c>", fields
, 10, status
);
1671 REGEX_ASSERT(fields
[0]=="");
1672 REGEX_ASSERT(fields
[1]=="a");
1673 REGEX_ASSERT(fields
[2]=="Now is ");
1674 REGEX_ASSERT(fields
[3]=="b");
1675 REGEX_ASSERT(fields
[4]=="the time");
1676 REGEX_ASSERT(fields
[5]=="c");
1677 REGEX_ASSERT(fields
[6]=="");
1678 REGEX_ASSERT(status
==U_ZERO_ERROR
);
// Same input with a leading space, so the first field is " " rather than empty.
1680 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 10, status
);
1683 REGEX_ASSERT(fields
[0]==" ");
1684 REGEX_ASSERT(fields
[1]=="a");
1685 REGEX_ASSERT(fields
[2]=="Now is ");
1686 REGEX_ASSERT(fields
[3]=="b");
1687 REGEX_ASSERT(fields
[4]=="the time");
1688 REGEX_ASSERT(fields
[5]=="c");
1689 REGEX_ASSERT(fields
[6]=="");
// The following cases shrink the destination capacity (6, 5, 5, 4) to check
// what ends up in the last available field.
// NOTE(review): the "foo" expectations presumably rely on elements preset on
// lines not visible here - confirm.
1691 status
= U_ZERO_ERROR
;
1693 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 6, status
);
1696 REGEX_ASSERT(fields
[0]==" ");
1697 REGEX_ASSERT(fields
[1]=="a");
1698 REGEX_ASSERT(fields
[2]=="Now is ");
1699 REGEX_ASSERT(fields
[3]=="b");
1700 REGEX_ASSERT(fields
[4]=="the time");
1701 REGEX_ASSERT(fields
[5]==""); // All text following "<c>" field delimiter.
1702 REGEX_ASSERT(fields
[6]=="foo");
1704 status
= U_ZERO_ERROR
;
1706 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 5, status
);
1709 REGEX_ASSERT(fields
[0]==" ");
1710 REGEX_ASSERT(fields
[1]=="a");
1711 REGEX_ASSERT(fields
[2]=="Now is ");
1712 REGEX_ASSERT(fields
[3]=="b");
1713 REGEX_ASSERT(fields
[4]=="the time<c>");
1714 REGEX_ASSERT(fields
[5]=="foo");
1716 status
= U_ZERO_ERROR
;
1718 n
= pat1
->split(" <a>Now is <b>the time", fields
, 5, status
);
1721 REGEX_ASSERT(fields
[0]==" ");
1722 REGEX_ASSERT(fields
[1]=="a");
1723 REGEX_ASSERT(fields
[2]=="Now is ");
1724 REGEX_ASSERT(fields
[3]=="b");
1725 REGEX_ASSERT(fields
[4]=="the time");
1726 REGEX_ASSERT(fields
[5]=="foo");
1728 status
= U_ZERO_ERROR
;
1729 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 4, status
);
1732 REGEX_ASSERT(fields
[0]==" ");
1733 REGEX_ASSERT(fields
[1]=="a");
1734 REGEX_ASSERT(fields
[2]=="Now is ");
1735 REGEX_ASSERT(fields
[3]=="the time<c>");
1736 status
= U_ZERO_ERROR
;
// Delimiter pattern that is entirely a capture group: each delimiter
// character shows up as its own field.
1739 pat1
= RegexPattern::compile("([-,])", pe
, status
);
1741 n
= pat1
->split("1-10,20", fields
, 10, status
);
1744 REGEX_ASSERT(fields
[0]=="1");
1745 REGEX_ASSERT(fields
[1]=="-");
1746 REGEX_ASSERT(fields
[2]=="10");
1747 REGEX_ASSERT(fields
[3]==",");
1748 REGEX_ASSERT(fields
[4]=="20");
1751 // Test split of string with empty trailing fields
1752 pat1
= RegexPattern::compile(",", pe
, status
);
1754 n
= pat1
->split("a,b,c,", fields
, 10, status
);
1757 REGEX_ASSERT(fields
[0]=="a");
1758 REGEX_ASSERT(fields
[1]=="b");
1759 REGEX_ASSERT(fields
[2]=="c");
1760 REGEX_ASSERT(fields
[3]=="");
1762 n
= pat1
->split("a,,,", fields
, 10, status
);
1765 REGEX_ASSERT(fields
[0]=="a");
1766 REGEX_ASSERT(fields
[1]=="");
1767 REGEX_ASSERT(fields
[2]=="");
1768 REGEX_ASSERT(fields
[3]=="");
1771 // Split Separator with zero length match.
1772 pat1
= RegexPattern::compile(":?", pe
, status
);
1774 n
= pat1
->split("abc", fields
, 10, status
);
1777 REGEX_ASSERT(fields
[0]=="");
1778 REGEX_ASSERT(fields
[1]=="a");
1779 REGEX_ASSERT(fields
[2]=="b");
1780 REGEX_ASSERT(fields
[3]=="c");
1781 REGEX_ASSERT(fields
[4]=="");
1786 // RegexPattern::pattern()
// A default-constructed pattern reports an empty pattern string; a compiled
// one reports the source text it was compiled from.
1788 pat1
= new RegexPattern();
1789 REGEX_ASSERT(pat1
->pattern() == "");
1792 pat1
= RegexPattern::compile("(Hello, world)*", pe
, status
);
1794 REGEX_ASSERT(pat1
->pattern() == "(Hello, world)*");
1799 // classID functions
// RegexPattern and RegexMatcher each report their own non-NULL class ID, and
// the two IDs differ from each other.
1801 pat1
= RegexPattern::compile("(Hello, world)*", pe
, status
);
1803 REGEX_ASSERT(pat1
->getDynamicClassID() == RegexPattern::getStaticClassID());
1804 REGEX_ASSERT(pat1
->getDynamicClassID() != NULL
);
1805 UnicodeString
Hello("Hello, world.");
1806 RegexMatcher
*m
= pat1
->matcher(Hello
, status
);
1807 REGEX_ASSERT(pat1
->getDynamicClassID() != m
->getDynamicClassID());
1808 REGEX_ASSERT(m
->getDynamicClassID() == RegexMatcher::getStaticClassID());
1809 REGEX_ASSERT(m
->getDynamicClassID() != NULL
);
1815 //---------------------------------------------------------------------------
1817 // API_Match_UTF8 Test that the alternate engine for class RegexMatcher
1818 // is present and working, but excluding functions
1819 // implementing replace operations.
1821 //---------------------------------------------------------------------------
1822 void RegexTest::API_Match_UTF8() {
1824 UErrorCode status
=U_ZERO_ERROR
;
1828 // Debug - slide failing test cases early
1837 // Simple pattern compilation
1840 UText re
= UTEXT_INITIALIZER
;
1841 regextst_openUTF8FromInvariant(&re
, "abc", -1, &status
);
1842 REGEX_VERBOSE_TEXT(&re
);
1844 pat2
= RegexPattern::compile(&re
, flags
, pe
, status
);
1847 UText input1
= UTEXT_INITIALIZER
;
1848 UText input2
= UTEXT_INITIALIZER
;
1849 UText empty
= UTEXT_INITIALIZER
;
1850 regextst_openUTF8FromInvariant(&input1
, "abcdef this is a test", -1, &status
);
1851 REGEX_VERBOSE_TEXT(&input1
);
1852 regextst_openUTF8FromInvariant(&input2
, "not abc", -1, &status
);
1853 REGEX_VERBOSE_TEXT(&input2
);
1854 utext_openUChars(&empty
, NULL
, 0, &status
);
1856 int32_t input1Len
= static_cast<int32_t>(strlen("abcdef this is a test")); /* TODO: why not nativelen (input1) ? */
1857 int32_t input2Len
= static_cast<int32_t>(strlen("not abc"));
1861 // Matcher creation and reset.
1863 RegexMatcher
*m1
= &pat2
->matcher(status
)->reset(&input1
);
1865 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
1866 const char str_abcdefthisisatest
[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1867 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest
, m1
->inputText());
1869 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
1870 const char str_notabc
[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1871 REGEX_ASSERT_UTEXT_UTF8(str_notabc
, m1
->inputText());
1873 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest
, m1
->inputText());
1874 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
1876 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
1877 REGEX_ASSERT(utext_nativeLength(&empty
) == 0);
1880 // reset(pos, status)
1883 m1
->reset(4, status
);
1885 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest
, m1
->inputText());
1886 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
1888 m1
->reset(-1, status
);
1889 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1890 status
= U_ZERO_ERROR
;
1892 m1
->reset(0, status
);
1894 status
= U_ZERO_ERROR
;
1896 m1
->reset(input1Len
-1, status
);
1898 status
= U_ZERO_ERROR
;
1900 m1
->reset(input1Len
, status
);
1902 status
= U_ZERO_ERROR
;
1904 m1
->reset(input1Len
+1, status
);
1905 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1906 status
= U_ZERO_ERROR
;
1909 // match(pos, status)
1912 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
1914 REGEX_ASSERT(m1
->matches(3, status
) == FALSE
);
1916 REGEX_ASSERT(m1
->matches(5, status
) == FALSE
);
1917 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
1918 REGEX_ASSERT(m1
->matches(-1, status
) == FALSE
);
1919 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1921 // Match() at end of string should fail, but should not
1923 status
= U_ZERO_ERROR
;
1924 REGEX_ASSERT(m1
->matches(input2Len
, status
) == FALSE
);
1927 // Match beyond end of string should fail with an error.
1928 status
= U_ZERO_ERROR
;
1929 REGEX_ASSERT(m1
->matches(input2Len
+1, status
) == FALSE
);
1930 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1932 // Successful match at end of string.
1934 status
= U_ZERO_ERROR
;
1935 RegexMatcher
m("A?", 0, status
); // will match zero length string.
1938 REGEX_ASSERT(m
.matches(input1Len
, status
) == TRUE
);
1941 REGEX_ASSERT(m
.matches(0, status
) == TRUE
);
1947 // lookingAt(pos, status)
1949 status
= U_ZERO_ERROR
;
1950 m1
->reset(&input2
); // "not abc"
1951 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
1952 REGEX_ASSERT(m1
->lookingAt(5, status
) == FALSE
);
1953 REGEX_ASSERT(m1
->lookingAt(3, status
) == FALSE
);
1954 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
1955 REGEX_ASSERT(m1
->lookingAt(-1, status
) == FALSE
);
1956 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1957 status
= U_ZERO_ERROR
;
1958 REGEX_ASSERT(m1
->lookingAt(input2Len
, status
) == FALSE
);
1960 REGEX_ASSERT(m1
->lookingAt(input2Len
+1, status
) == FALSE
);
1961 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1967 utext_close(&input1
);
1968 utext_close(&input2
);
1969 utext_close(&empty
);
1975 // RegexMatcher::start();
1976 // RegexMatcher::end();
1977 // RegexMatcher::groupCount();
1982 UErrorCode status
=U_ZERO_ERROR
;
1983 UText re
=UTEXT_INITIALIZER
;
1984 const char str_01234567_pat
[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1985 utext_openUTF8(&re
, str_01234567_pat
, -1, &status
);
1987 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
1990 UText input
= UTEXT_INITIALIZER
;
1991 const char str_0123456789
[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1992 utext_openUTF8(&input
, str_0123456789
, -1, &status
);
1994 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&input
);
1996 REGEX_ASSERT(matcher
->lookingAt(status
) == TRUE
);
1997 static const int32_t matchStarts
[] = {0, 2, 4, 8};
1998 static const int32_t matchEnds
[] = {10, 8, 6, 10};
2000 for (i
=0; i
<4; i
++) {
2001 int32_t actualStart
= matcher
->start(i
, status
);
2003 if (actualStart
!= matchStarts
[i
]) {
2004 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n",
2005 __FILE__
, __LINE__
, i
, matchStarts
[i
], actualStart
);
2007 int32_t actualEnd
= matcher
->end(i
, status
);
2009 if (actualEnd
!= matchEnds
[i
]) {
2010 errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n",
2011 __FILE__
, __LINE__
, i
, matchEnds
[i
], actualEnd
);
2015 REGEX_ASSERT(matcher
->start(0, status
) == matcher
->start(status
));
2016 REGEX_ASSERT(matcher
->end(0, status
) == matcher
->end(status
));
2018 REGEX_ASSERT_FAIL(matcher
->start(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2019 REGEX_ASSERT_FAIL(matcher
->start( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2021 REGEX_ASSERT_FAIL(matcher
->start( 0, status
), U_REGEX_INVALID_STATE
);
2023 matcher
->lookingAt(status
);
2026 UText destText
= UTEXT_INITIALIZER
;
2027 utext_openUnicodeString(&destText
, &dest
, &status
);
2029 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2030 // Test shallow-clone API
2032 result
= matcher
->group((UText
*)NULL
, group_len
, status
);
2034 REGEX_ASSERT_UTEXT_UTF8(str_0123456789
, result
);
2035 utext_close(result
);
2036 result
= matcher
->group(0, &destText
, group_len
, status
);
2038 REGEX_ASSERT(result
== &destText
);
2039 REGEX_ASSERT_UTEXT_UTF8(str_0123456789
, result
);
2040 // destText is now immutable, reopen it
2041 utext_close(&destText
);
2042 utext_openUnicodeString(&destText
, &dest
, &status
);
2045 result
= matcher
->group(0, NULL
, length
, status
);
2047 REGEX_ASSERT_UTEXT_UTF8(str_0123456789
, result
);
2048 utext_close(result
);
2049 result
= matcher
->group(0, &destText
, length
, status
);
2051 REGEX_ASSERT(result
== &destText
);
2052 REGEX_ASSERT(utext_getNativeIndex(result
) == 0);
2053 REGEX_ASSERT(length
== 10);
2054 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2056 // Capture Group 1 == "234567"
2057 result
= matcher
->group(1, NULL
, length
, status
);
2059 REGEX_ASSERT(utext_getNativeIndex(result
) == 2);
2060 REGEX_ASSERT(length
== 6);
2061 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2062 utext_close(result
);
2064 result
= matcher
->group(1, &destText
, length
, status
);
2066 REGEX_ASSERT(result
== &destText
);
2067 REGEX_ASSERT(utext_getNativeIndex(result
) == 2);
2068 REGEX_ASSERT(length
== 6);
2069 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2070 utext_close(result
);
2072 // Capture Group 2 == "45"
2073 result
= matcher
->group(2, NULL
, length
, status
);
2075 REGEX_ASSERT(utext_getNativeIndex(result
) == 4);
2076 REGEX_ASSERT(length
== 2);
2077 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2078 utext_close(result
);
2080 result
= matcher
->group(2, &destText
, length
, status
);
2082 REGEX_ASSERT(result
== &destText
);
2083 REGEX_ASSERT(utext_getNativeIndex(result
) == 4);
2084 REGEX_ASSERT(length
== 2);
2085 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2086 utext_close(result
);
2088 // Capture Group 3 == "89"
2089 result
= matcher
->group(3, NULL
, length
, status
);
2091 REGEX_ASSERT(utext_getNativeIndex(result
) == 8);
2092 REGEX_ASSERT(length
== 2);
2093 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2094 utext_close(result
);
2096 result
= matcher
->group(3, &destText
, length
, status
);
2098 REGEX_ASSERT(result
== &destText
);
2099 REGEX_ASSERT(utext_getNativeIndex(result
) == 8);
2100 REGEX_ASSERT(length
== 2);
2101 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2102 utext_close(result
);
2104 // Capture Group number out of range.
2105 status
= U_ZERO_ERROR
;
2106 REGEX_ASSERT_FAIL(matcher
->group(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2107 status
= U_ZERO_ERROR
;
2108 REGEX_ASSERT_FAIL(matcher
->group( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2109 status
= U_ZERO_ERROR
;
2111 REGEX_ASSERT_FAIL(matcher
->group( 0, status
), U_REGEX_INVALID_STATE
);
2116 utext_close(&destText
);
2117 utext_close(&input
);
2127 UErrorCode status
=U_ZERO_ERROR
;
2128 UText re
=UTEXT_INITIALIZER
;
2129 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2130 utext_openUTF8(&re
, str_abc
, -1, &status
);
2132 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
2134 UText input
= UTEXT_INITIALIZER
;
2135 const char str_abcabcabc
[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2136 utext_openUTF8(&input
, str_abcabcabc
, -1, &status
);
2137 // 012345678901234567
2139 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&input
);
2141 REGEX_ASSERT(matcher
->find());
2142 REGEX_ASSERT(matcher
->start(status
) == 1);
2143 REGEX_ASSERT(matcher
->find());
2144 REGEX_ASSERT(matcher
->start(status
) == 6);
2145 REGEX_ASSERT(matcher
->find());
2146 REGEX_ASSERT(matcher
->start(status
) == 12);
2147 REGEX_ASSERT(matcher
->find() == FALSE
);
2148 REGEX_ASSERT(matcher
->find() == FALSE
);
2151 REGEX_ASSERT(matcher
->find());
2152 REGEX_ASSERT(matcher
->start(status
) == 1);
2154 REGEX_ASSERT(matcher
->find(0, status
));
2155 REGEX_ASSERT(matcher
->start(status
) == 1);
2156 REGEX_ASSERT(matcher
->find(1, status
));
2157 REGEX_ASSERT(matcher
->start(status
) == 1);
2158 REGEX_ASSERT(matcher
->find(2, status
));
2159 REGEX_ASSERT(matcher
->start(status
) == 6);
2160 REGEX_ASSERT(matcher
->find(12, status
));
2161 REGEX_ASSERT(matcher
->start(status
) == 12);
2162 REGEX_ASSERT(matcher
->find(13, status
) == FALSE
);
2163 REGEX_ASSERT(matcher
->find(16, status
) == FALSE
);
2164 REGEX_ASSERT(matcher
->find(17, status
) == FALSE
);
2165 REGEX_ASSERT_FAIL(matcher
->start(status
), U_REGEX_INVALID_STATE
);
2167 status
= U_ZERO_ERROR
;
2168 REGEX_ASSERT_FAIL(matcher
->find(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2169 status
= U_ZERO_ERROR
;
2170 REGEX_ASSERT_FAIL(matcher
->find(18, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2172 REGEX_ASSERT(matcher
->groupCount() == 0);
2177 utext_close(&input
);
2183 // find, with \G in pattern (true if at the end of a previous match).
2188 UErrorCode status
=U_ZERO_ERROR
;
2189 UText re
=UTEXT_INITIALIZER
;
2190 const char str_Gabcabc
[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2191 utext_openUTF8(&re
, str_Gabcabc
, -1, &status
);
2193 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
2196 UText input
= UTEXT_INITIALIZER
;
2197 const char str_abcabcabc
[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2198 utext_openUTF8(&input
, str_abcabcabc
, -1, &status
);
2199 // 012345678901234567
2201 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&input
);
2203 REGEX_ASSERT(matcher
->find());
2204 REGEX_ASSERT(matcher
->start(status
) == 0);
2205 REGEX_ASSERT(matcher
->start(1, status
) == -1);
2206 REGEX_ASSERT(matcher
->start(2, status
) == 1);
2208 REGEX_ASSERT(matcher
->find());
2209 REGEX_ASSERT(matcher
->start(status
) == 4);
2210 REGEX_ASSERT(matcher
->start(1, status
) == 4);
2211 REGEX_ASSERT(matcher
->start(2, status
) == -1);
2217 utext_close(&input
);
2222 // find with zero length matches, match position should bump ahead
2223 // to prevent loops.
2227 UErrorCode status
=U_ZERO_ERROR
;
2228 RegexMatcher
m("(?= ?)", 0, status
); // This pattern will produce zero-length matches anywhere,
2229 // using an always-true look-ahead.
2231 UText s
= UTEXT_INITIALIZER
;
2232 utext_openUTF8(&s
, " ", -1, &status
);
2235 if (m
.find() == FALSE
) {
2238 REGEX_ASSERT(m
.start(status
) == i
);
2239 REGEX_ASSERT(m
.end(status
) == i
);
2243 // Check that the bump goes over characters outside the BMP OK
2244 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2245 unsigned char aboveBMP
[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2246 utext_openUTF8(&s
, (char *)aboveBMP
, -1, &status
);
2249 if (m
.find() == FALSE
) {
2252 REGEX_ASSERT(m
.start(status
) == i
);
2253 REGEX_ASSERT(m
.end(status
) == i
);
2255 REGEX_ASSERT(i
==20);
2260 // find() loop breaking test.
2261 // with pattern of /.?/, should see a series of one char matches, then a single
2262 // match of zero length at the end of the input string.
2264 UErrorCode status
=U_ZERO_ERROR
;
2265 RegexMatcher
m(".?", 0, status
);
2267 UText s
= UTEXT_INITIALIZER
;
2268 utext_openUTF8(&s
, " ", -1, &status
);
2271 if (m
.find() == FALSE
) {
2274 REGEX_ASSERT(m
.start(status
) == i
);
2275 REGEX_ASSERT(m
.end(status
) == (i
<4 ? i
+1 : i
));
2284 // Matchers with no input string behave as if they had an empty input string.
2288 UErrorCode status
= U_ZERO_ERROR
;
2289 RegexMatcher
m(".?", 0, status
);
2291 REGEX_ASSERT(m
.find());
2292 REGEX_ASSERT(m
.start(status
) == 0);
2293 REGEX_ASSERT(m
.input() == "");
2296 UErrorCode status
= U_ZERO_ERROR
;
2297 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
2298 RegexMatcher
*m
= p
->matcher(status
);
2301 REGEX_ASSERT(m
->find() == FALSE
);
2302 REGEX_ASSERT(utext_nativeLength(m
->inputText()) == 0);
2311 UErrorCode status
= U_ZERO_ERROR
;
2312 UText testPattern
= UTEXT_INITIALIZER
;
2313 UText testText
= UTEXT_INITIALIZER
;
2314 regextst_openUTF8FromInvariant(&testPattern
, ".*", -1, &status
);
2315 REGEX_VERBOSE_TEXT(&testPattern
);
2316 regextst_openUTF8FromInvariant(&testText
, "This is test data", -1, &status
);
2317 REGEX_VERBOSE_TEXT(&testText
);
2319 RegexMatcher
m(&testPattern
, &testText
, 0, status
);
2321 REGEX_ASSERT(m
.regionStart() == 0);
2322 REGEX_ASSERT(m
.regionEnd() == (int32_t)strlen("This is test data"));
2323 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2324 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2326 m
.region(2,4, status
);
2328 REGEX_ASSERT(m
.matches(status
));
2329 REGEX_ASSERT(m
.start(status
)==2);
2330 REGEX_ASSERT(m
.end(status
)==4);
2334 REGEX_ASSERT(m
.regionStart() == 0);
2335 REGEX_ASSERT(m
.regionEnd() == (int32_t)strlen("This is test data"));
2337 regextst_openUTF8FromInvariant(&testText
, "short", -1, &status
);
2338 REGEX_VERBOSE_TEXT(&testText
);
2340 REGEX_ASSERT(m
.regionStart() == 0);
2341 REGEX_ASSERT(m
.regionEnd() == (int32_t)strlen("short"));
2343 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2344 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(FALSE
));
2345 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
2346 REGEX_ASSERT(&m
== &m
.reset());
2347 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
2349 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(TRUE
));
2350 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2351 REGEX_ASSERT(&m
== &m
.reset());
2352 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2354 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2355 REGEX_ASSERT(&m
== &m
.useTransparentBounds(TRUE
));
2356 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
2357 REGEX_ASSERT(&m
== &m
.reset());
2358 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
2360 REGEX_ASSERT(&m
== &m
.useTransparentBounds(FALSE
));
2361 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2362 REGEX_ASSERT(&m
== &m
.reset());
2363 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2365 utext_close(&testText
);
2366 utext_close(&testPattern
);
2370 // hitEnd() and requireEnd()
2373 UErrorCode status
= U_ZERO_ERROR
;
2374 UText testPattern
= UTEXT_INITIALIZER
;
2375 UText testText
= UTEXT_INITIALIZER
;
2376 const char str_
[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2377 const char str_aabb
[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2378 utext_openUTF8(&testPattern
, str_
, -1, &status
);
2379 utext_openUTF8(&testText
, str_aabb
, -1, &status
);
2381 RegexMatcher
m1(&testPattern
, &testText
, 0, status
);
2382 REGEX_ASSERT(m1
.lookingAt(status
) == TRUE
);
2383 REGEX_ASSERT(m1
.hitEnd() == TRUE
);
2384 REGEX_ASSERT(m1
.requireEnd() == FALSE
);
2387 status
= U_ZERO_ERROR
;
2388 const char str_a
[] = { 0x61, 0x2a, 0x00 }; /* a* */
2389 utext_openUTF8(&testPattern
, str_a
, -1, &status
);
2390 RegexMatcher
m2(&testPattern
, &testText
, 0, status
);
2391 REGEX_ASSERT(m2
.lookingAt(status
) == TRUE
);
2392 REGEX_ASSERT(m2
.hitEnd() == FALSE
);
2393 REGEX_ASSERT(m2
.requireEnd() == FALSE
);
2396 status
= U_ZERO_ERROR
;
2397 const char str_dotstardollar
[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2398 utext_openUTF8(&testPattern
, str_dotstardollar
, -1, &status
);
2399 RegexMatcher
m3(&testPattern
, &testText
, 0, status
);
2400 REGEX_ASSERT(m3
.lookingAt(status
) == TRUE
);
2401 REGEX_ASSERT(m3
.hitEnd() == TRUE
);
2402 REGEX_ASSERT(m3
.requireEnd() == TRUE
);
2405 utext_close(&testText
);
2406 utext_close(&testPattern
);
2411 //---------------------------------------------------------------------------
2413 // API_Replace_UTF8 API test for class RegexMatcher, testing the
2414 // Replace family of functions.
2416 //---------------------------------------------------------------------------
2417 void RegexTest::API_Replace_UTF8() {
// Exercises RegexMatcher::replaceFirst() and replaceAll() on UText input, then
// appendReplacement() and appendTail(). Each replace case is run twice: once with a
// NULL destination (the returned UText is closed by the test with utext_close) and
// once with the caller-supplied destText, in which case the returned pointer is
// asserted to be &destText.
// Test strings are spelled as explicit byte values rather than string literals.
2423 UErrorCode status
=U_ZERO_ERROR
;
2425 UText re
=UTEXT_INITIALIZER
;
2426 regextst_openUTF8FromInvariant(&re
, "abc", -1, &status
);
2427 REGEX_VERBOSE_TEXT(&re
);
2428 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
2431 char data
[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2432 // 012345678901234567
2433 UText dataText
= UTEXT_INITIALIZER
;
2434 utext_openUTF8(&dataText
, data
, -1, &status
);
2436 REGEX_VERBOSE_TEXT(&dataText
);
2437 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&dataText
);
2440 // Plain vanilla matches.
2443 UText destText
= UTEXT_INITIALIZER
;
2444 utext_openUnicodeString(&destText
, &dest
, &status
);
2447 UText replText
= UTEXT_INITIALIZER
;
2449 const char str_yz
[] = { 0x79, 0x7a, 0x00 }; /* yz */
2450 utext_openUTF8(&replText
, str_yz
, -1, &status
);
2451 REGEX_VERBOSE_TEXT(&replText
);
2452 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2454 const char str_yzabcabc
[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2455 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc
, result
);
2456 utext_close(result
);
2457 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2459 REGEX_ASSERT(result
== &destText
);
2460 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc
, result
);
2462 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2464 const char str_yzyzyz
[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2465 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz
, result
);
2466 utext_close(result
);
// Replacing the full range [0, nativeLength) of destText with zero characters
// empties the destination before it is reused for the next result.
2468 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2469 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2471 REGEX_ASSERT(result
== &destText
);
2472 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz
, result
);
2475 // Plain vanilla non-matches.
// The input contains "abx" instead of "abc", so every result is expected to equal the input.
2477 const char str_abxabxabx
[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2478 utext_openUTF8(&dataText
, str_abxabxabx
, -1, &status
);
2479 matcher
->reset(&dataText
);
2481 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2483 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2484 utext_close(result
);
2485 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2487 REGEX_ASSERT(result
== &destText
);
2488 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2490 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2492 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2493 utext_close(result
);
2494 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2495 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2497 REGEX_ASSERT(result
== &destText
);
2498 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2501 // Empty source string
2503 utext_openUTF8(&dataText
, NULL
, 0, &status
);
2504 matcher
->reset(&dataText
);
2506 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2508 REGEX_ASSERT_UTEXT_UTF8("", result
);
2509 utext_close(result
);
2510 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2512 REGEX_ASSERT(result
== &destText
);
2513 REGEX_ASSERT_UTEXT_UTF8("", result
);
2515 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2517 REGEX_ASSERT_UTEXT_UTF8("", result
);
2518 utext_close(result
);
2519 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2521 REGEX_ASSERT(result
== &destText
);
2522 REGEX_ASSERT_UTEXT_UTF8("", result
);
2525 // Empty substitution string
2527 utext_openUTF8(&dataText
, data
, -1, &status
); // ".abc..abc...abc.."
2528 matcher
->reset(&dataText
);
2530 utext_openUTF8(&replText
, NULL
, 0, &status
);
2531 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2533 const char str_abcabc
[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2534 REGEX_ASSERT_UTEXT_UTF8(str_abcabc
, result
);
2535 utext_close(result
);
2536 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2538 REGEX_ASSERT(result
== &destText
);
2539 REGEX_ASSERT_UTEXT_UTF8(str_abcabc
, result
);
2541 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2543 const char str_dots
[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2544 REGEX_ASSERT_UTEXT_UTF8(str_dots
, result
);
2545 utext_close(result
);
2546 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2547 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2549 REGEX_ASSERT(result
== &destText
);
2550 REGEX_ASSERT_UTEXT_UTF8(str_dots
, result
);
2553 // match whole string
2555 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2556 utext_openUTF8(&dataText
, str_abc
, -1, &status
);
2557 matcher
->reset(&dataText
);
2559 const char str_xyz
[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2560 utext_openUTF8(&replText
, str_xyz
, -1, &status
);
2561 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2563 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2564 utext_close(result
);
2565 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2566 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2568 REGEX_ASSERT(result
== &destText
);
2569 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2571 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2573 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2574 utext_close(result
);
2575 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2576 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2578 REGEX_ASSERT(result
== &destText
);
2579 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2582 // Capture Group, simple case
// A second pattern, a(..), with a single capture group is compiled into pat2/matcher2.
2584 const char str_add
[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2585 utext_openUTF8(&re
, str_add
, -1, &status
);
2586 RegexPattern
*pat2
= RegexPattern::compile(&re
, flags
, pe
, status
);
2589 const char str_abcdefg
[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2590 utext_openUTF8(&dataText
, str_abcdefg
, -1, &status
);
2591 RegexMatcher
*matcher2
= &pat2
->matcher(status
)->reset(&dataText
);
2594 const char str_11
[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2595 utext_openUTF8(&replText
, str_11
, -1, &status
);
2596 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2598 const char str_bcbcdefg
[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2599 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg
, result
);
2600 utext_close(result
);
2601 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2602 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2604 REGEX_ASSERT(result
== &destText
);
2605 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg
, result
);
// A backslash-escaped dollar sign in the replacement is expected to come out as a
// literal '$', while the unescaped $1 is replaced by the captured text.
2607 const char str_v
[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2608 utext_openUTF8(&replText
, str_v
, -1, &status
);
2609 REGEX_VERBOSE_TEXT(&replText
);
2610 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2612 const char str_Thevalueof1isbcdefg
[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2613 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg
, result
);
2614 utext_close(result
);
2615 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2616 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2618 REGEX_ASSERT(result
== &destText
);
2619 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg
, result
);
2621 const char str_byitselfnogroupnumber
[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2622 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2623 0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2624 utext_openUTF8(&replText
, str_byitselfnogroupnumber
, -1, &status
);
2625 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2627 const char str_byitselfnogroupnumberdefg
[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2628 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg
, result
);
2629 utext_close(result
);
2630 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2631 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2633 REGEX_ASSERT(result
== &destText
);
2634 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg
, result
);
2636 unsigned char supplDigitChars
[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2637 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2638 // 012345678901234567890123456
// The four 'x' placeholder bytes at indices 22..25 (right after the '$') are
// overwritten with a four-byte UTF-8 sequence, F0 9D 9F 8F.
2639 supplDigitChars
[22] = 0xF0;
2640 supplDigitChars
[23] = 0x9D;
2641 supplDigitChars
[24] = 0x9F;
2642 supplDigitChars
[25] = 0x8F;
2643 utext_openUTF8(&replText
, (char *)supplDigitChars
, -1, &status
);
2645 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2647 const char str_SupplementalDigit1bcdefg
[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2648 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg
, result
);
2649 utext_close(result
);
2650 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2651 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2653 REGEX_ASSERT(result
== &destText
);
2654 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg
, result
);
// The pattern a(..) has a single capture group, so a reference to group 5 in the
// replacement text is expected to fail with U_INDEX_OUTOFBOUNDS_ERROR.
2655 const char str_badcapturegroupnumber5
[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5... */
2656 utext_openUTF8(&replText
, str_badcapturegroupnumber5
, -1, &status
);
2657 REGEX_ASSERT_FAIL((result
= matcher2
->replaceFirst(&replText
, NULL
, status
)), U_INDEX_OUTOFBOUNDS_ERROR
);
2658 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2659 utext_close(result
);
2660 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2661 REGEX_ASSERT_FAIL((result
= matcher2
->replaceFirst(&replText
, &destText
, status
)), U_INDEX_OUTOFBOUNDS_ERROR
);
2662 REGEX_ASSERT(result
== &destText
);
2663 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2666 // Replacement String with \u hex escapes
2669 const char str_abc1abc2abc3
[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2670 const char str_u0043
[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2671 utext_openUTF8(&dataText
, str_abc1abc2abc3
, -1, &status
);
2672 utext_openUTF8(&replText
, str_u0043
, -1, &status
);
2673 matcher
->reset(&dataText
);
2675 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2677 const char str_C1C2C3
[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2678 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3
, result
);
2679 utext_close(result
);
2680 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2681 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2683 REGEX_ASSERT(result
== &destText
);
2684 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3
, result
);
2687 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2688 utext_openUTF8(&dataText
, str_abc
, -1, &status
);
2689 const char str_U00010000
[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2690 utext_openUTF8(&replText
, str_U00010000
, -1, &status
);
2691 matcher
->reset(&dataText
);
// NOTE(review): the "xxxx" bytes below look like placeholders that are presumably
// overwritten with the UTF-8 encoding of U+10000 before the comparison; those
// assignments are not visible in this listing - confirm against the full file.
2693 unsigned char expected
[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2700 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2702 REGEX_ASSERT_UTEXT_UTF8((char *)expected
, result
);
2703 utext_close(result
);
2704 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2705 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2707 REGEX_ASSERT(result
== &destText
);
2708 REGEX_ASSERT_UTEXT_UTF8((char *)expected
, result
);
2710 // TODO: need more thorough testing of capture substitutions.
// appendReplacement() / appendTail(): the pattern is ss(.*?)ee and the replacement
// text is "ooh"; output is accumulated in resultText, which is backed by the
// UnicodeString result.
2715 status
= U_ZERO_ERROR
;
2716 const char str_ssee
[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2717 const char str_blah
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2718 const char str_ooh
[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2719 utext_openUTF8(&re
, str_ssee
, -1, &status
);
2720 utext_openUTF8(&dataText
, str_blah
, -1, &status
);
2721 utext_openUTF8(&replText
, str_ooh
, -1, &status
);
2723 RegexMatcher
m(&re
, 0, status
);
2726 UnicodeString result
;
2727 UText resultText
= UTEXT_INITIALIZER
;
2728 utext_openUnicodeString(&resultText
, &result
, &status
);
2730 // Multiple finds do NOT bump up the previous appendReplacement position.
2734 m
.appendReplacement(&resultText
, &replText
, status
);
2736 const char str_blah2
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2737 REGEX_ASSERT_UTEXT_UTF8(str_blah2
, &resultText
);
2739 // After a reset into the interior of a string, appendReplacement still starts at beginning.
2740 status
= U_ZERO_ERROR
;
2742 utext_openUnicodeString(&resultText
, &result
, &status
);
2743 m
.reset(10, status
);
2746 m
.appendReplacement(&resultText
, &replText
, status
);
2748 const char str_blah3
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2749 REGEX_ASSERT_UTEXT_UTF8(str_blah3
, &resultText
);
2751 // find() at interior of string, appendReplacement still starts at beginning.
2752 status
= U_ZERO_ERROR
;
2754 utext_openUnicodeString(&resultText
, &result
, &status
);
2758 m
.appendReplacement(&resultText
, &replText
, status
);
2760 const char str_blah8
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2761 REGEX_ASSERT_UTEXT_UTF8(str_blah8
, &resultText
);
// appendTail() is expected to add the remainder of the input (" fin") after the last replacement.
2763 m
.appendTail(&resultText
, status
);
2764 const char str_blah9
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2765 REGEX_ASSERT_UTEXT_UTF8(str_blah9
, &resultText
);
2767 utext_close(&resultText
);
2775 utext_close(&dataText
);
2776 utext_close(&replText
);
2777 utext_close(&destText
);
2782 //---------------------------------------------------------------------------
2784 // API_Pattern_UTF8 Test that the API for class RegexPattern is
2785 // present and nominally working.
2787 //---------------------------------------------------------------------------
2788 void RegexTest::API_Pattern_UTF8() {
// Exercises the RegexPattern API with UText-based patterns: default construction,
// operator== / operator!=, copy construction, compile() with and without flags,
// flags(), clone(), the static matches() helper, split() into UnicodeString and
// UText outputs, and pattern() / patternText().
2789 RegexPattern pata
; // Test default constructor to not crash.
2792 REGEX_ASSERT(pata
== patb
);
2793 REGEX_ASSERT(pata
== pata
);
2795 UText re1
= UTEXT_INITIALIZER
;
2796 UText re2
= UTEXT_INITIALIZER
;
2797 UErrorCode status
= U_ZERO_ERROR
;
2800 const char str_abcalmz
[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2801 const char str_def
[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2802 utext_openUTF8(&re1
, str_abcalmz
, -1, &status
);
2803 utext_openUTF8(&re2
, str_def
, -1, &status
);
2805 RegexPattern
*pat1
= RegexPattern::compile(&re1
, 0, pe
, status
);
2806 RegexPattern
*pat2
= RegexPattern::compile(&re2
, 0, pe
, status
);
2808 REGEX_ASSERT(*pat1
== *pat1
);
2809 REGEX_ASSERT(*pat1
!= pata
);
2813 REGEX_ASSERT(patb
== *pat1
);
2816 RegexPattern
patc(*pat1
);
2817 REGEX_ASSERT(patc
== *pat1
);
2818 REGEX_ASSERT(patb
== patc
);
// NOTE(review): pat1 and pat2 are both pointers here, so this compares the two
// addresses rather than the pattern objects - confirm that is intended.
2819 REGEX_ASSERT(pat1
!= pat2
);
2821 REGEX_ASSERT(patb
!= patc
);
2822 REGEX_ASSERT(patb
== *pat2
);
2824 // Compile with no flags.
2825 RegexPattern
*pat1a
= RegexPattern::compile(&re1
, pe
, status
);
2826 REGEX_ASSERT(*pat1a
== *pat1
);
2828 REGEX_ASSERT(pat1a
->flags() == 0);
2830 // Compile with different flags should be not equal
2831 RegexPattern
*pat1b
= RegexPattern::compile(&re1
, UREGEX_CASE_INSENSITIVE
, pe
, status
);
2834 REGEX_ASSERT(*pat1b
!= *pat1a
);
2835 REGEX_ASSERT(pat1b
->flags() == UREGEX_CASE_INSENSITIVE
);
2836 REGEX_ASSERT(pat1a
->flags() == 0);
// A clone is expected to compare equal to its source and unequal to a different pattern.
2840 RegexPattern
*pat1c
= pat1
->clone();
2841 REGEX_ASSERT(*pat1c
== *pat1
);
2842 REGEX_ASSERT(*pat1c
!= *pat2
);
2854 // Verify that a matcher created from a cloned pattern works.
2858 UErrorCode status
= U_ZERO_ERROR
;
2859 UText pattern
= UTEXT_INITIALIZER
;
2860 const char str_pL
[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2861 utext_openUTF8(&pattern
, str_pL
, -1, &status
);
2863 RegexPattern
*pSource
= RegexPattern::compile(&pattern
, 0, status
);
2864 RegexPattern
*pClone
= pSource
->clone();
2866 RegexMatcher
*mFromClone
= pClone
->matcher(status
);
2869 UText input
= UTEXT_INITIALIZER
;
2870 const char str_HelloWorld
[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2871 utext_openUTF8(&input
, str_HelloWorld
, -1, &status
);
2872 mFromClone
->reset(&input
);
2873 REGEX_ASSERT(mFromClone
->find() == TRUE
);
2874 REGEX_ASSERT(mFromClone
->group(status
) == "Hello");
2875 REGEX_ASSERT(mFromClone
->find() == TRUE
);
2876 REGEX_ASSERT(mFromClone
->group(status
) == "World");
2877 REGEX_ASSERT(mFromClone
->find() == FALSE
);
2881 utext_close(&input
);
2882 utext_close(&pattern
);
2886 // matches convenience API
2889 UErrorCode status
= U_ZERO_ERROR
;
2890 UText pattern
= UTEXT_INITIALIZER
;
2891 UText input
= UTEXT_INITIALIZER
;
2893 const char str_randominput
[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2894 utext_openUTF8(&input
, str_randominput
, -1, &status
);
2896 const char str_dotstar
[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2897 utext_openUTF8(&pattern
, str_dotstar
, -1, &status
);
2898 REGEX_ASSERT(RegexPattern::matches(&pattern
, &input
, pe
, status
) == TRUE
);
// NOTE(review): from here on the assertions pass string literals to matches(), so
// the UText opened immediately before each one is not what gets matched - confirm
// whether the UText arguments (&pattern, &input) were meant, as in the first case.
2901 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2902 utext_openUTF8(&pattern
, str_abc
, -1, &status
);
2903 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe
, status
) == FALSE
);
2906 const char str_nput
[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2907 utext_openUTF8(&pattern
, str_nput
, -1, &status
);
2908 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe
, status
) == TRUE
);
2911 utext_openUTF8(&pattern
, str_randominput
, -1, &status
);
2912 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe
, status
) == TRUE
);
2915 const char str_u
[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2916 utext_openUTF8(&pattern
, str_u
, -1, &status
);
2917 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe
, status
) == FALSE
);
2920 utext_openUTF8(&input
, str_abc
, -1, &status
);
2921 utext_openUTF8(&pattern
, str_abc
, -1, &status
);
// With a failure code already in status on entry, matches() is expected to return
// FALSE and to leave status unchanged.
2922 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
2923 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe
, status
) == FALSE
);
2924 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
2926 utext_close(&input
);
2927 utext_close(&pattern
);
// split() into a UnicodeString array, using a pattern of one or more spaces.
// Several assertions below check slots beyond the returned fields to confirm they
// keep values set before the call.
2934 status
= U_ZERO_ERROR
;
2935 const char str_spaceplus
[] = { 0x20, 0x2b, 0x00 }; /* + */
2936 utext_openUTF8(&re1
, str_spaceplus
, -1, &status
);
2937 pat1
= RegexPattern::compile(&re1
, pe
, status
);
2939 UnicodeString fields
[10];
2942 n
= pat1
->split("Now is the time", fields
, 10, status
);
2945 REGEX_ASSERT(fields
[0]=="Now");
2946 REGEX_ASSERT(fields
[1]=="is");
2947 REGEX_ASSERT(fields
[2]=="the");
2948 REGEX_ASSERT(fields
[3]=="time");
2949 REGEX_ASSERT(fields
[4]=="");
2951 n
= pat1
->split("Now is the time", fields
, 2, status
);
2954 REGEX_ASSERT(fields
[0]=="Now");
2955 REGEX_ASSERT(fields
[1]=="is the time");
2956 REGEX_ASSERT(fields
[2]=="the"); // left over from previous test
2959 status
= U_ZERO_ERROR
;
2960 n
= pat1
->split("Now is the time", fields
, 1, status
);
2963 REGEX_ASSERT(fields
[0]=="Now is the time");
2964 REGEX_ASSERT(fields
[1]=="*");
2965 status
= U_ZERO_ERROR
;
2967 n
= pat1
->split(" Now is the time ", fields
, 10, status
);
2970 REGEX_ASSERT(fields
[0]=="");
2971 REGEX_ASSERT(fields
[1]=="Now");
2972 REGEX_ASSERT(fields
[2]=="is");
2973 REGEX_ASSERT(fields
[3]=="the");
2974 REGEX_ASSERT(fields
[4]=="time");
2975 REGEX_ASSERT(fields
[5]=="");
2976 REGEX_ASSERT(fields
[6]=="");
2979 n
= pat1
->split(" ", fields
, 10, status
);
2982 REGEX_ASSERT(fields
[0]=="");
2983 REGEX_ASSERT(fields
[1]=="");
2984 REGEX_ASSERT(fields
[2]=="*");
2987 n
= pat1
->split("", fields
, 10, status
);
2990 REGEX_ASSERT(fields
[0]=="foo");
2994 // split, with a pattern with (capture)
2995 regextst_openUTF8FromInvariant(&re1
, "<(\\w*)>", -1, &status
);
2996 pat1
= RegexPattern::compile(&re1
, pe
, status
);
2999 status
= U_ZERO_ERROR
;
3000 fields
[6] = fields
[7] = "*";
3001 n
= pat1
->split("<a>Now is <b>the time<c>", fields
, 10, status
);
3004 REGEX_ASSERT(fields
[0]=="");
3005 REGEX_ASSERT(fields
[1]=="a");
3006 REGEX_ASSERT(fields
[2]=="Now is ");
3007 REGEX_ASSERT(fields
[3]=="b");
3008 REGEX_ASSERT(fields
[4]=="the time");
3009 REGEX_ASSERT(fields
[5]=="c");
3010 REGEX_ASSERT(fields
[6]=="");
3011 REGEX_ASSERT(fields
[7]=="*");
3012 REGEX_ASSERT(status
==U_ZERO_ERROR
);
3014 fields
[6] = fields
[7] = "*";
3015 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 10, status
);
3018 REGEX_ASSERT(fields
[0]==" ");
3019 REGEX_ASSERT(fields
[1]=="a");
3020 REGEX_ASSERT(fields
[2]=="Now is ");
3021 REGEX_ASSERT(fields
[3]=="b");
3022 REGEX_ASSERT(fields
[4]=="the time");
3023 REGEX_ASSERT(fields
[5]=="c");
3024 REGEX_ASSERT(fields
[6]=="");
3025 REGEX_ASSERT(fields
[7]=="*");
3027 status
= U_ZERO_ERROR
;
3029 n
= pat1
->split(" <a>Now is <b>the time<c> ", fields
, 6, status
);
3032 REGEX_ASSERT(fields
[0]==" ");
3033 REGEX_ASSERT(fields
[1]=="a");
3034 REGEX_ASSERT(fields
[2]=="Now is ");
3035 REGEX_ASSERT(fields
[3]=="b");
3036 REGEX_ASSERT(fields
[4]=="the time");
3037 REGEX_ASSERT(fields
[5]==" ");
3038 REGEX_ASSERT(fields
[6]=="foo");
3040 status
= U_ZERO_ERROR
;
3042 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 5, status
);
3045 REGEX_ASSERT(fields
[0]==" ");
3046 REGEX_ASSERT(fields
[1]=="a");
3047 REGEX_ASSERT(fields
[2]=="Now is ");
3048 REGEX_ASSERT(fields
[3]=="b");
3049 REGEX_ASSERT(fields
[4]=="the time<c>");
3050 REGEX_ASSERT(fields
[5]=="foo");
3052 status
= U_ZERO_ERROR
;
3054 n
= pat1
->split(" <a>Now is <b>the time", fields
, 5, status
);
3057 REGEX_ASSERT(fields
[0]==" ");
3058 REGEX_ASSERT(fields
[1]=="a");
3059 REGEX_ASSERT(fields
[2]=="Now is ");
3060 REGEX_ASSERT(fields
[3]=="b");
3061 REGEX_ASSERT(fields
[4]=="the time");
3062 REGEX_ASSERT(fields
[5]=="foo");
3064 status
= U_ZERO_ERROR
;
3065 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 4, status
);
3068 REGEX_ASSERT(fields
[0]==" ");
3069 REGEX_ASSERT(fields
[1]=="a");
3070 REGEX_ASSERT(fields
[2]=="Now is ");
3071 REGEX_ASSERT(fields
[3]=="the time<c>");
3072 status
= U_ZERO_ERROR
;
3075 regextst_openUTF8FromInvariant(&re1
, "([-,])", -1, &status
);
3076 pat1
= RegexPattern::compile(&re1
, pe
, status
);
3078 n
= pat1
->split("1-10,20", fields
, 10, status
);
3081 REGEX_ASSERT(fields
[0]=="1");
3082 REGEX_ASSERT(fields
[1]=="-");
3083 REGEX_ASSERT(fields
[2]=="10");
3084 REGEX_ASSERT(fields
[3]==",");
3085 REGEX_ASSERT(fields
[4]=="20");
3090 // split of a UText based string, with library allocating output UTexts.
// The splits array starts out all NULL; five fields are expected back, and
// splits[5] is asserted to remain NULL afterwards.
3093 status
= U_ZERO_ERROR
;
3094 RegexMatcher
matcher(UnicodeString("(:)"), 0, status
);
3095 UnicodeString
stringToSplit("first:second:third");
3096 UText
*textToSplit
= utext_openUnicodeString(NULL
, &stringToSplit
, &status
);
3099 UText
*splits
[10] = {NULL
};
3100 int32_t numFields
= matcher
.split(textToSplit
, splits
, UPRV_LENGTHOF(splits
), status
);
3102 REGEX_ASSERT(numFields
== 5);
3103 REGEX_ASSERT_UTEXT_INVARIANT("first", splits
[0]);
3104 REGEX_ASSERT_UTEXT_INVARIANT(":", splits
[1]);
3105 REGEX_ASSERT_UTEXT_INVARIANT("second", splits
[2]);
3106 REGEX_ASSERT_UTEXT_INVARIANT(":", splits
[3]);
3107 REGEX_ASSERT_UTEXT_INVARIANT("third", splits
[4]);
3108 REGEX_ASSERT(splits
[5] == NULL
);
3110 for (int i
=0; i
<UPRV_LENGTHOF(splits
); i
++) {
3112 utext_close(splits
[i
]);
3116 utext_close(textToSplit
);
3121 // RegexPattern::pattern() and patternText()
// A default-constructed pattern is expected to report an empty pattern string; a
// compiled pattern is expected to report the text it was compiled from.
3123 pat1
= new RegexPattern();
3124 REGEX_ASSERT(pat1
->pattern() == "");
3125 REGEX_ASSERT_UTEXT_UTF8("", pat1
->patternText(status
));
3127 const char *helloWorldInvariant
= "(Hello, world)*";
3128 regextst_openUTF8FromInvariant(&re1
, helloWorldInvariant
, -1, &status
);
3129 pat1
= RegexPattern::compile(&re1
, pe
, status
);
3131 REGEX_ASSERT_UNISTR("(Hello, world)*", pat1
->pattern());
3132 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1
->patternText(status
));
3139 //---------------------------------------------------------------------------
3141 // Extended A more thorough check for features of regex patterns
3142 // The test cases are in a separate data file,
3143 // source/tests/testdata/regextst.txt
3144 // A description of the test data format is included in that file.
3146 //---------------------------------------------------------------------------
3149 RegexTest::getPath(char buffer
[2048], const char *filename
) {
3150 UErrorCode status
=U_ZERO_ERROR
;
3151 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
3152 if (U_FAILURE(status
)) {
3153 errln("ERROR: loadTestData() failed - %s", u_errorName(status
));
3157 strcpy(buffer
, testDataDirectory
);
3158 strcat(buffer
, filename
);
3162 void RegexTest::Extended() {
3164 const char *srcPath
;
3165 UErrorCode status
= U_ZERO_ERROR
;
3166 int32_t lineNum
= 0;
3169 // Open and read the test data file.
3171 srcPath
=getPath(tdd
, "regextst.txt");
3173 return; /* something went wrong, error already output */
3177 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, "utf-8", status
);
3178 if (U_FAILURE(status
)) {
3179 return; /* something went wrong, error already output */
3183 // Put the test data into a UnicodeString
3185 UnicodeString
testString(FALSE
, testData
, len
);
3187 RegexMatcher
quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status
);
3188 RegexMatcher
commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status
);
3189 RegexMatcher
flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status
);
3191 RegexMatcher
lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString
, 0, status
);
3192 UnicodeString testPattern
; // The pattern for test from the test file.
3193 UnicodeString testFlags
; // the flags for a test.
3194 UnicodeString matchString
; // The marked up string to be used as input
3196 if (U_FAILURE(status
)){
3197 dataerrln("Construct RegexMatcher() error - %s", u_errorName(status
));
3203 // Loop over the test data file, once per line.
3205 while (lineMat
.find()) {
3207 if (U_FAILURE(status
)) {
3208 errln("%s:%d: ICU Error \"%s\"", srcPath
, lineNum
, u_errorName(status
));
3211 status
= U_ZERO_ERROR
;
3212 UnicodeString testLine
= lineMat
.group(1, status
);
3213 if (testLine
.length() == 0) {
3218 // Parse the test line. Skip blank and comment only lines.
3219 // Separate out the three main fields - pattern, flags, target.
3222 commentMat
.reset(testLine
);
3223 if (commentMat
.lookingAt(status
)) {
3224 // This line is a comment, or blank.
3229 // Pull out the pattern field, remove it from the test file line.
3231 quotedStuffMat
.reset(testLine
);
3232 if (quotedStuffMat
.lookingAt(status
)) {
3233 testPattern
= quotedStuffMat
.group(2, status
);
3234 testLine
.remove(0, quotedStuffMat
.end(0, status
));
3236 errln("Bad pattern (missing quotes?) at %s:%d", srcPath
, lineNum
);
3242 // Pull out the flags from the test file line.
3244 flagsMat
.reset(testLine
);
3245 flagsMat
.lookingAt(status
); // Will always match, possibly an empty string.
3246 testFlags
= flagsMat
.group(1, status
);
3247 if (flagsMat
.group(2, status
).length() > 0) {
3248 errln("Bad Match flag at line %d. Scanning %c\n",
3249 lineNum
, flagsMat
.group(2, status
).charAt(0));
3252 testLine
.remove(0, flagsMat
.end(0, status
));
3255 // Pull out the match string, as a whole.
3256 // We'll process the <tags> later.
3258 quotedStuffMat
.reset(testLine
);
3259 if (quotedStuffMat
.lookingAt(status
)) {
3260 matchString
= quotedStuffMat
.group(2, status
);
3261 testLine
.remove(0, quotedStuffMat
.end(0, status
));
3263 errln("Bad match string at test file line %d", lineNum
);
3268 // The only thing left from the input line should be an optional trailing comment.
3270 commentMat
.reset(testLine
);
3271 if (commentMat
.lookingAt(status
) == FALSE
) {
3272 errln("Line %d: unexpected characters at end of test line.", lineNum
);
3279 regex_find(testPattern
, testFlags
, matchString
, srcPath
, lineNum
);
3288 //---------------------------------------------------------------------------
3290 // regex_find(pattern, flags, inputString, lineNumber)
3292 // Function to run a single test from the Extended (data driven) tests.
3293 // See file test/testdata/regextst.txt for a description of the
3294 // pattern and inputString fields, and the allowed flags.
3295 // lineNumber is the source line in regextst.txt of the test.
3297 //---------------------------------------------------------------------------
3300 // Set a value into a UVector at position specified by a decimal number in
//   a UnicodeString. This is a utility function needed by the actual test function, regex_find().
3303 static void set(UVector
&vec
, int32_t val
, UnicodeString index
) {
3304 UErrorCode status
=U_ZERO_ERROR
;
3306 for (int32_t i
=0; i
<index
.length(); i
++) {
3307 int32_t d
=u_charDigitValue(index
.charAt(i
));
3311 while (vec
.size()<idx
+1) {vec
.addElement(-1, status
);}
3312 vec
.setElementAt(val
, idx
);
3315 static void setInt(UVector
&vec
, int32_t val
, int32_t idx
) {
3316 UErrorCode status
=U_ZERO_ERROR
;
3317 while (vec
.size()<idx
+1) {vec
.addElement(-1, status
);}
3318 vec
.setElementAt(val
, idx
);
3321 static UBool
utextOffsetToNative(UText
*utext
, int32_t unistrOffset
, int32_t& nativeIndex
)
3323 UBool couldFind
= TRUE
;
3324 UTEXT_SETNATIVEINDEX(utext
, 0);
3326 while (i
< unistrOffset
) {
3327 UChar32 c
= UTEXT_NEXT32(utext
);
3328 if (c
!= U_SENTINEL
) {
3335 nativeIndex
= (int32_t)UTEXT_GETNATIVEINDEX(utext
);
3340 void RegexTest::regex_find(const UnicodeString
&pattern
,
3341 const UnicodeString
&flags
,
3342 const UnicodeString
&inputString
,
3343 const char *srcPath
,
3345 UnicodeString unEscapedInput
;
3346 UnicodeString deTaggedInput
;
3348 int32_t patternUTF8Length
, inputUTF8Length
;
3349 char *patternChars
= NULL
, *inputChars
= NULL
;
3350 UText patternText
= UTEXT_INITIALIZER
;
3351 UText inputText
= UTEXT_INITIALIZER
;
3352 UConverter
*UTF8Converter
= NULL
;
3354 UErrorCode status
= U_ZERO_ERROR
;
3356 RegexPattern
*parsePat
= NULL
;
3357 RegexMatcher
*parseMatcher
= NULL
;
3358 RegexPattern
*callerPattern
= NULL
, *UTF8Pattern
= NULL
;
3359 RegexMatcher
*matcher
= NULL
, *UTF8Matcher
= NULL
;
3360 UVector
groupStarts(status
);
3361 UVector
groupEnds(status
);
3362 UVector
groupStartsUTF8(status
);
3363 UVector
groupEndsUTF8(status
);
3364 UBool isMatch
= FALSE
, isUTF8Match
= FALSE
;
3365 UBool failed
= FALSE
;
3368 UBool useMatchesFunc
= FALSE
;
3369 UBool useLookingAtFunc
= FALSE
;
3370 int32_t regionStart
= -1;
3371 int32_t regionEnd
= -1;
3372 int32_t regionStartUTF8
= -1;
3373 int32_t regionEndUTF8
= -1;
3377 // Compile the caller's pattern
3379 uint32_t bflags
= 0;
3380 if (flags
.indexOf((UChar
)0x69) >= 0) { // 'i' flag
3381 bflags
|= UREGEX_CASE_INSENSITIVE
;
3383 if (flags
.indexOf((UChar
)0x78) >= 0) { // 'x' flag
3384 bflags
|= UREGEX_COMMENTS
;
3386 if (flags
.indexOf((UChar
)0x73) >= 0) { // 's' flag
3387 bflags
|= UREGEX_DOTALL
;
3389 if (flags
.indexOf((UChar
)0x6d) >= 0) { // 'm' flag
3390 bflags
|= UREGEX_MULTILINE
;
3393 if (flags
.indexOf((UChar
)0x65) >= 0) { // 'e' flag
3394 bflags
|= UREGEX_ERROR_ON_UNKNOWN_ESCAPES
;
3396 if (flags
.indexOf((UChar
)0x44) >= 0) { // 'D' flag
3397 bflags
|= UREGEX_UNIX_LINES
;
3399 if (flags
.indexOf((UChar
)0x51) >= 0) { // 'Q' flag
3400 bflags
|= UREGEX_LITERAL
;
3404 callerPattern
= RegexPattern::compile(pattern
, bflags
, pe
, status
);
3405 if (status
!= U_ZERO_ERROR
) {
3406 #if UCONFIG_NO_BREAK_ITERATION==1
3407 // 'v' test flag means that the test pattern should not compile if ICU was configured
3408 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3409 if (flags
.indexOf((UChar
)0x76) >= 0 /*'v'*/ && status
== U_UNSUPPORTED_ERROR
) {
3410 goto cleanupAndReturn
;
3413 if (flags
.indexOf((UChar
)0x45) >= 0) { // flags contain 'E'
3414 // Expected pattern compilation error.
3415 if (flags
.indexOf((UChar
)0x64) >= 0) { // flags contain 'd'
3416 logln("Pattern Compile returns \"%s\"", u_errorName(status
));
3418 goto cleanupAndReturn
;
3420 // Unexpected pattern compilation error.
3421 dataerrln("Line %d: error %s compiling pattern.", line
, u_errorName(status
));
3422 goto cleanupAndReturn
;
3426 UTF8Converter
= ucnv_open("UTF8", &status
);
3427 ucnv_setFromUCallBack(UTF8Converter
, UCNV_FROM_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &status
);
3429 patternUTF8Length
= pattern
.extract(NULL
, 0, UTF8Converter
, status
);
3430 status
= U_ZERO_ERROR
; // buffer overflow
3431 patternChars
= new char[patternUTF8Length
+1];
3432 pattern
.extract(patternChars
, patternUTF8Length
+1, UTF8Converter
, status
);
3433 utext_openUTF8(&patternText
, patternChars
, patternUTF8Length
, &status
);
3435 if (status
== U_ZERO_ERROR
) {
3436 UTF8Pattern
= RegexPattern::compile(&patternText
, bflags
, pe
, status
);
3438 if (status
!= U_ZERO_ERROR
) {
3439 #if UCONFIG_NO_BREAK_ITERATION==1
3440 // 'v' test flag means that the test pattern should not compile if ICU was configured
3441 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3442 if (flags
.indexOf((UChar
)0x76) >= 0 /*'v'*/ && status
== U_UNSUPPORTED_ERROR
) {
3443 goto cleanupAndReturn
;
3446 if (flags
.indexOf((UChar
)0x45) >= 0) { // flags contain 'E'
3447 // Expected pattern compilation error.
3448 if (flags
.indexOf((UChar
)0x64) >= 0) { // flags contain 'd'
3449 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status
));
3451 goto cleanupAndReturn
;
3453 // Unexpected pattern compilation error.
3454 errln("Line %d: error %s compiling pattern. (UTF8)", line
, u_errorName(status
));
3455 goto cleanupAndReturn
;
3460 if (UTF8Pattern
== NULL
) {
3461 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3462 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath
, line
);
3463 status
= U_ZERO_ERROR
;
3466 if (flags
.indexOf((UChar
)0x64) >= 0) { // 'd' flag
3467 callerPattern
->dumpPattern();
3470 if (flags
.indexOf((UChar
)0x45) >= 0) { // 'E' flag
3471 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath
, line
);
3472 goto cleanupAndReturn
;
3477 // Number of times find() should be called on the test string, default to 1
3480 for (i
=2; i
<=9; i
++) {
3481 if (flags
.indexOf((UChar
)(0x30 + i
)) >= 0) { // digit flag
3482 if (numFinds
!= 1) {
3483 errln("Line %d: more than one digit flag. Scanning %d.", line
, i
);
3484 goto cleanupAndReturn
;
3490 // 'M' flag. Use matches() instead of find()
3491 if (flags
.indexOf((UChar
)0x4d) >= 0) {
3492 useMatchesFunc
= TRUE
;
3494 if (flags
.indexOf((UChar
)0x4c) >= 0) {
3495 useLookingAtFunc
= TRUE
;
3499 // Find the tags in the input data, remove them, and record the group boundary
3502 parsePat
= RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe
, status
);
3503 REGEX_CHECK_STATUS_L(line
);
3505 unEscapedInput
= inputString
.unescape();
3506 parseMatcher
= parsePat
->matcher(unEscapedInput
, status
);
3507 REGEX_CHECK_STATUS_L(line
);
3508 while(parseMatcher
->find()) {
3509 parseMatcher
->appendReplacement(deTaggedInput
, "", status
);
3511 UnicodeString groupNum
= parseMatcher
->group(2, status
);
3512 if (groupNum
== "r") {
3513 // <r> or </r>, a region specification within the string
3514 if (parseMatcher
->group(1, status
) == "/") {
3515 regionEnd
= deTaggedInput
.length();
3517 regionStart
= deTaggedInput
.length();
3520 // <digits> or </digits>, a group match boundary tag.
3521 if (parseMatcher
->group(1, status
) == "/") {
3522 set(groupEnds
, deTaggedInput
.length(), groupNum
);
3524 set(groupStarts
, deTaggedInput
.length(), groupNum
);
3528 parseMatcher
->appendTail(deTaggedInput
);
3530 if (groupStarts
.size() != groupEnds
.size()) {
3531 errln("Error at line %d: mismatched <n> group tags in expected results.", line
);
3533 goto cleanupAndReturn
;
3535 if ((regionStart
>=0 || regionEnd
>=0) && (regionStart
<0 || regionStart
>regionEnd
)) {
3536 errln("mismatched <r> tags");
3538 goto cleanupAndReturn
;
3542 // Configure the matcher according to the flags specified with this test.
3544 matcher
= callerPattern
->matcher(deTaggedInput
, status
);
3545 REGEX_CHECK_STATUS_L(line
);
3546 if (flags
.indexOf((UChar
)0x74) >= 0) { // 't' trace flag
3547 matcher
->setTrace(TRUE
);
3550 if (UTF8Pattern
!= NULL
) {
3551 inputUTF8Length
= deTaggedInput
.extract(NULL
, 0, UTF8Converter
, status
);
3552 status
= U_ZERO_ERROR
; // buffer overflow
3553 inputChars
= new char[inputUTF8Length
+1];
3554 deTaggedInput
.extract(inputChars
, inputUTF8Length
+1, UTF8Converter
, status
);
3555 utext_openUTF8(&inputText
, inputChars
, inputUTF8Length
, &status
);
3557 if (status
== U_ZERO_ERROR
) {
3558 UTF8Matcher
= &UTF8Pattern
->matcher(status
)->reset(&inputText
);
3559 REGEX_CHECK_STATUS_L(line
);
3562 if (UTF8Matcher
== NULL
) {
3563 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3564 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath
, line
);
3565 status
= U_ZERO_ERROR
;
3570 // Generate native indices for UTF8 versions of region and capture group info
3572 if (UTF8Matcher
!= NULL
) {
3573 if (flags
.indexOf((UChar
)0x74) >= 0) { // 't' trace flag
3574 UTF8Matcher
->setTrace(TRUE
);
3576 if (regionStart
>=0) (void) utextOffsetToNative(&inputText
, regionStart
, regionStartUTF8
);
3577 if (regionEnd
>=0) (void) utextOffsetToNative(&inputText
, regionEnd
, regionEndUTF8
);
3579 // Fill out the native index UVector info.
3580 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3581 for (i
=0; i
<groupStarts
.size(); i
++) {
3582 int32_t start
= groupStarts
.elementAti(i
);
3583 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3586 if (!utextOffsetToNative(&inputText
, start
, startUTF8
)) {
3587 errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line
, i
, start
);
3589 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3591 setInt(groupStartsUTF8
, startUTF8
, i
);
3594 int32_t end
= groupEnds
.elementAti(i
);
3595 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3598 if (!utextOffsetToNative(&inputText
, end
, endUTF8
)) {
3599 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line
, i
, end
);
3601 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3603 setInt(groupEndsUTF8
, endUTF8
, i
);
3608 if (regionStart
>=0) {
3609 matcher
->region(regionStart
, regionEnd
, status
);
3610 REGEX_CHECK_STATUS_L(line
);
3611 if (UTF8Matcher
!= NULL
) {
3612 UTF8Matcher
->region(regionStartUTF8
, regionEndUTF8
, status
);
3613 REGEX_CHECK_STATUS_L(line
);
3616 if (flags
.indexOf((UChar
)0x61) >= 0) { // 'a' anchoring bounds flag
3617 matcher
->useAnchoringBounds(FALSE
);
3618 if (UTF8Matcher
!= NULL
) {
3619 UTF8Matcher
->useAnchoringBounds(FALSE
);
3622 if (flags
.indexOf((UChar
)0x62) >= 0) { // 'b' transparent bounds flag
3623 matcher
->useTransparentBounds(TRUE
);
3624 if (UTF8Matcher
!= NULL
) {
3625 UTF8Matcher
->useTransparentBounds(TRUE
);
3632 // Do a find on the de-tagged input using the caller's pattern
3633 // TODO: error on count>1 and not find().
3634 // error on both matches() and lookingAt().
3636 for (i
=0; i
<numFinds
; i
++) {
3637 if (useMatchesFunc
) {
3638 isMatch
= matcher
->matches(status
);
3639 if (UTF8Matcher
!= NULL
) {
3640 isUTF8Match
= UTF8Matcher
->matches(status
);
3642 } else if (useLookingAtFunc
) {
3643 isMatch
= matcher
->lookingAt(status
);
3644 if (UTF8Matcher
!= NULL
) {
3645 isUTF8Match
= UTF8Matcher
->lookingAt(status
);
3648 isMatch
= matcher
->find();
3649 if (UTF8Matcher
!= NULL
) {
3650 isUTF8Match
= UTF8Matcher
->find();
3654 matcher
->setTrace(FALSE
);
3656 UTF8Matcher
->setTrace(FALSE
);
3658 if (U_FAILURE(status
)) {
3659 errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status
));
3663 // Match up the groups from the find() with the groups from the tags
3666 // number of tags should match number of groups from find operation.
3667 // matcher->groupCount does not include group 0, the entire match, hence the +1.
3668 // G option in test means that capture group data is not available in the
3669 // expected results, so the check needs to be suppressed.
3670 if (isMatch
== FALSE
&& groupStarts
.size() != 0) {
3671 dataerrln("Error at line %d: Match expected, but none found.", line
);
3673 goto cleanupAndReturn
;
3674 } else if (UTF8Matcher
!= NULL
&& isUTF8Match
== FALSE
&& groupStarts
.size() != 0) {
3675 errln("Error at line %d: Match expected, but none found. (UTF8)", line
);
3677 goto cleanupAndReturn
;
3679 if (isMatch
&& groupStarts
.size() == 0) {
3680 errln("Error at line %d: No match expected, but one found at position %d.", line
, matcher
->start(status
));
3683 if (UTF8Matcher
&& isUTF8Match
&& groupStarts
.size() == 0) {
3684 errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line
, UTF8Matcher
->start(status
));
3688 if (flags
.indexOf((UChar
)0x47 /*G*/) >= 0) {
3689 // Only check for match / no match. Don't check capture groups.
3690 goto cleanupAndReturn
;
3693 REGEX_CHECK_STATUS_L(line
);
3694 for (i
=0; i
<=matcher
->groupCount(); i
++) {
3695 int32_t expectedStart
= (i
>= groupStarts
.size()? -1 : groupStarts
.elementAti(i
));
3696 int32_t expectedStartUTF8
= (i
>= groupStartsUTF8
.size()? -1 : groupStartsUTF8
.elementAti(i
));
3697 if (matcher
->start(i
, status
) != expectedStart
) {
3698 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
3699 line
, i
, expectedStart
, matcher
->start(i
, status
));
3701 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3702 } else if (UTF8Matcher
!= NULL
&& UTF8Matcher
->start(i
, status
) != expectedStartUTF8
) {
3703 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
3704 line
, i
, expectedStartUTF8
, UTF8Matcher
->start(i
, status
));
3706 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3709 int32_t expectedEnd
= (i
>= groupEnds
.size()? -1 : groupEnds
.elementAti(i
));
3710 int32_t expectedEndUTF8
= (i
>= groupEndsUTF8
.size()? -1 : groupEndsUTF8
.elementAti(i
));
3711 if (matcher
->end(i
, status
) != expectedEnd
) {
3712 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
3713 line
, i
, expectedEnd
, matcher
->end(i
, status
));
3715 // Error on end position; keep going; real error is probably yet to come as group
3716 // end positions work from end of the input data towards the front.
3717 } else if (UTF8Matcher
!= NULL
&& UTF8Matcher
->end(i
, status
) != expectedEndUTF8
) {
3718 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
3719 line
, i
, expectedEndUTF8
, UTF8Matcher
->end(i
, status
));
3721 // Error on end position; keep going; real error is probably yet to come as group
3722 // end positions work from end of the input data towards the front.
3725 if ( matcher
->groupCount()+1 < groupStarts
.size()) {
3726 errln("Error at line %d: Expected %d capture groups, found %d.",
3727 line
, groupStarts
.size()-1, matcher
->groupCount());
3730 else if (UTF8Matcher
!= NULL
&& UTF8Matcher
->groupCount()+1 < groupStarts
.size()) {
3731 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3732 line
, groupStarts
.size()-1, UTF8Matcher
->groupCount());
3736 if ((flags
.indexOf((UChar
)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3737 matcher
->requireEnd() == TRUE
) {
3738 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line
);
3740 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3741 UTF8Matcher
->requireEnd() == TRUE
) {
3742 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line
);
3746 if ((flags
.indexOf((UChar
)0x79) >= 0) && // 'y' flag: RequireEnd() == true
3747 matcher
->requireEnd() == FALSE
) {
3748 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line
);
3750 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x79) >= 0) && // 'Y' flag: RequireEnd() == false
3751 UTF8Matcher
->requireEnd() == FALSE
) {
3752 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line
);
3756 if ((flags
.indexOf((UChar
)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3757 matcher
->hitEnd() == TRUE
) {
3758 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line
);
3760 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3761 UTF8Matcher
->hitEnd() == TRUE
) {
3762 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line
);
3766 if ((flags
.indexOf((UChar
)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3767 matcher
->hitEnd() == FALSE
) {
3768 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line
);
3770 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3771 UTF8Matcher
->hitEnd() == FALSE
) {
3772 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line
);
3779 infoln((UnicodeString
)"\""+pattern
+(UnicodeString
)"\" "
3780 +flags
+(UnicodeString
)" \""+inputString
+(UnicodeString
)"\"");
3781 // callerPattern->dump();
3783 delete parseMatcher
;
3788 delete callerPattern
;
3790 utext_close(&inputText
);
3791 delete[] inputChars
;
3792 utext_close(&patternText
);
3793 delete[] patternChars
;
3794 ucnv_close(UTF8Converter
);
3800 //---------------------------------------------------------------------------
3802 // Errors Check for error handling in patterns.
3804 //---------------------------------------------------------------------------
3805 void RegexTest::Errors() {
3806 // \escape sequences that aren't implemented yet.
3807 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3809 // Missing close parentheses
3810 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN
);
3811 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN
);
3812 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN
);
3814 // Extra close paren
3815 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN
);
3816 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN
);
3817 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN
);
3819 // Look-ahead, Look-behind
3820 // TODO: add tests for unbounded length look-behinds.
3821 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX
); // illegal construct
3823 // Attempt to use non-default flags
3826 UErrorCode status
= U_ZERO_ERROR
;
3827 int32_t flags
= UREGEX_CANON_EQ
|
3828 UREGEX_COMMENTS
| UREGEX_DOTALL
|
3830 RegexPattern
*pat1
= RegexPattern::compile(".*", flags
, pe
, status
);
3831 REGEX_ASSERT(status
== U_REGEX_UNIMPLEMENTED
);
3836 // Quantifiers are allowed only after something that can be quantified.
3837 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX
);
3838 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX
);
3839 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX
);
3841 // Mal-formed {min,max} quantifiers
3842 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL
);
3843 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN
);
3844 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL
);
3845 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL
);
3846 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL
);
3847 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG
);
3848 REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG
); // Overflows int during scan
3849 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG
); // Overflows regex binary format
3850 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG
);
3853 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX
);
3855 // Invalid Back Reference \0
3856 // For ICU 3.8 and earlier
3857 // For ICU versions newer than 3.8, \0 introduces an octal escape.
3859 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE
);
3864 //-------------------------------------------------------------------------------
3866 // Read a text data file, convert it to UChars, and return the data
3867 // in one big UChar * buffer, which the caller must delete.
3869 //--------------------------------------------------------------------------------
3870 UChar
*RegexTest::ReadAndConvertFile(const char *fileName
, int32_t &ulen
,
3871 const char *defEncoding
, UErrorCode
&status
) {
3872 UChar
*retPtr
= NULL
;
3873 char *fileBuf
= NULL
;
3874 UConverter
* conv
= NULL
;
3878 if (U_FAILURE(status
)) {
3885 f
= fopen(fileName
, "rb");
3887 dataerrln("Error opening test data file %s\n", fileName
);
3888 status
= U_FILE_ACCESS_ERROR
;
3897 fseek( f
, 0, SEEK_END
);
3898 fileSize
= ftell(f
);
3899 fileBuf
= new char[fileSize
];
3900 fseek(f
, 0, SEEK_SET
);
3901 amt_read
= static_cast<int32_t>(fread(fileBuf
, 1, fileSize
, f
));
3902 if (amt_read
!= fileSize
|| fileSize
<= 0) {
3903 errln("Error reading test data file.");
3904 goto cleanUpAndReturn
;
3908 // Look for a Unicode Signature (BOM) on the data just read
3910 int32_t signatureLength
;
3911 const char * fileBufC
;
3912 const char* encoding
;
3915 encoding
= ucnv_detectUnicodeSignature(
3916 fileBuf
, fileSize
, &signatureLength
, &status
);
3917 if(encoding
!=NULL
){
3918 fileBufC
+= signatureLength
;
3919 fileSize
-= signatureLength
;
3921 encoding
= defEncoding
;
3922 if (strcmp(encoding
, "utf-8") == 0) {
3923 errln("file %s is missing its BOM", fileName
);
3928 // Open a converter to take the rule file to UTF-16
3930 conv
= ucnv_open(encoding
, &status
);
3931 if (U_FAILURE(status
)) {
3932 goto cleanUpAndReturn
;
3936 // Convert the rules to UChar.
3937 // Preflight first to determine required buffer size.
3939 ulen
= ucnv_toUChars(conv
,
3945 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
3946 // Buffer Overflow is expected from the preflight operation.
3947 status
= U_ZERO_ERROR
;
3949 retPtr
= new UChar
[ulen
+1];
3962 if (U_FAILURE(status
)) {
3963 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
3972 //-------------------------------------------------------------------------------
3974 // PerlTests - Run Perl's regular expression tests
3975 // The input file for this test is re_tests, the standard regular
3976 // expression test data distributed with the Perl source code.
3978 // Here is Perl's description of the test data file:
3980 // # The tests are in a separate file 't/op/re_tests'.
3981 // # Each line in that file is a separate test.
3982 // # There are five columns, separated by tabs.
3984 // # Column 1 contains the pattern, optionally enclosed in C<''>.
3985 // # Modifiers can be put after the closing C<'>.
3987 // # Column 2 contains the string to be matched.
3989 // # Column 3 contains the expected result:
3990 // # y expect a match
3991 // # n expect no match
3992 // # c expect an error
3993 // # B test exposes a known bug in Perl, should be skipped
3994 // # b test exposes a known bug in Perl, should be skipped if noamp
3996 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3998 // # Column 4 contains a string, usually C<$&>.
4000 // # Column 5 contains the expected result of double-quote
4001 // # interpolating that string after the match, or start of error message.
4003 // # Column 6, if present, contains a reason why the test is skipped.
4004 // # This is printed with "skipped", for harness to pick up.
4006 // # \n in the tests are interpolated, as are variables of the form ${\w+}.
4008 // # If you want to add a regular expression test that can't be expressed
4009 // # in this format, don't add it here: put it in op/pat.t instead.
4011 // For ICU, if field 3 contains an 'i', the test will be skipped.
//     The test exposes some known incompatibility between ICU and Perl regexps.
4013 // (The i is in addition to whatever was there before.)
4015 //-------------------------------------------------------------------------------
4016 void RegexTest::PerlTests() {
4018 const char *srcPath
;
4019 UErrorCode status
= U_ZERO_ERROR
;
4023 // Open and read the test data file.
4025 srcPath
=getPath(tdd
, "re_tests.txt");
4027 return; /* something went wrong, error already output */
4031 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, "iso-8859-1", status
);
4032 if (U_FAILURE(status
)) {
4033 return; /* something went wrong, error already output */
4037 // Put the test data into a UnicodeString
4039 UnicodeString
testDataString(FALSE
, testData
, len
);
4042 // Regex to break the input file into lines, and strip the new lines.
4043 // One line per match, capture group one is the desired data.
4045 RegexPattern
* linePat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe
, status
);
4046 if (U_FAILURE(status
)) {
4047 dataerrln("RegexPattern::compile() error");
4050 RegexMatcher
* lineMat
= linePat
->matcher(testDataString
, status
);
4053 // Regex to split a test file line into fields.
4054 // There are six fields, separated by tabs.
4056 RegexPattern
* fieldPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe
, status
);
4059 // Regex to identify test patterns with flag settings, and to separate them.
4060 // Test patterns with flags look like 'pattern'i
4061 // Test patterns without flags are not quoted: pattern
4062 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4064 RegexPattern
*flagPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe
, status
);
4065 RegexMatcher
* flagMat
= flagPat
->matcher(status
);
4068 // The Perl tests reference several perl-isms, which are evaluated/substituted
4069 // in the test data. Not being perl, this must be done explicitly. Here
4070 // are string constants and REs for these constructs.
4072 UnicodeString
nulnulSrc("${nulnul}");
4073 UnicodeString
nulnul("\\u0000\\u0000", -1, US_INV
);
4074 nulnul
= nulnul
.unescape();
4076 UnicodeString
ffffSrc("${ffff}");
4077 UnicodeString
ffff("\\uffff", -1, US_INV
);
4078 ffff
= ffff
.unescape();
4080 // regexp for $-[0], $+[2], etc.
4081 RegexPattern
*groupsPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe
, status
);
4082 RegexMatcher
*groupsMat
= groupsPat
->matcher(status
);
4084 // regexp for $0, $1, $2, etc.
4085 RegexPattern
*cgPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe
, status
);
4086 RegexMatcher
*cgMat
= cgPat
->matcher(status
);
4090 // Main Loop for the Perl Tests, runs once per line from the
4093 int32_t lineNum
= 0;
4094 int32_t skippedUnimplementedCount
= 0;
4095 while (lineMat
->find()) {
4099 // Get a line, break it into its fields, do the Perl
4100 // variable substitutions.
4102 UnicodeString line
= lineMat
->group(1, status
);
4103 UnicodeString fields
[7];
4104 fieldPat
->split(line
, fields
, 7, status
);
4106 flagMat
->reset(fields
[0]);
4107 flagMat
->matches(status
);
4108 UnicodeString pattern
= flagMat
->group(2, status
);
4109 pattern
.findAndReplace("${bang}", "!");
4110 pattern
.findAndReplace(nulnulSrc
, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4111 pattern
.findAndReplace(ffffSrc
, ffff
);
4114 // Identify patterns that include match flag settings,
4115 // split off the flags, remove the extra quotes.
4117 UnicodeString flagStr
= flagMat
->group(3, status
);
4118 if (U_FAILURE(status
)) {
4119 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
4123 const UChar UChar_c
= 0x63; // Char constants for the flag letters.
4124 const UChar UChar_i
= 0x69; // (Damn the lack of Unicode support in C)
4125 const UChar UChar_m
= 0x6d;
4126 const UChar UChar_x
= 0x78;
4127 const UChar UChar_y
= 0x79;
4128 if (flagStr
.indexOf(UChar_i
) != -1) {
4129 flags
|= UREGEX_CASE_INSENSITIVE
;
4131 if (flagStr
.indexOf(UChar_m
) != -1) {
4132 flags
|= UREGEX_MULTILINE
;
4134 if (flagStr
.indexOf(UChar_x
) != -1) {
4135 flags
|= UREGEX_COMMENTS
;
4139 // Compile the test pattern.
4141 status
= U_ZERO_ERROR
;
4142 RegexPattern
*testPat
= RegexPattern::compile(pattern
, flags
, pe
, status
);
4143 if (status
== U_REGEX_UNIMPLEMENTED
) {
4145 // Test of a feature that is planned for ICU, but not yet implemented.
4147 skippedUnimplementedCount
++;
4149 status
= U_ZERO_ERROR
;
4153 if (U_FAILURE(status
)) {
4154 // Some tests are supposed to generate errors.
4155 // Only report an error for tests that are supposed to succeed.
4156 if (fields
[2].indexOf(UChar_c
) == -1 && // Compilation is not supposed to fail AND
4157 fields
[2].indexOf(UChar_i
) == -1) // it's not an accepted ICU incompatibility
4159 errln("line %d: ICU Error \"%s\"\n", lineNum
, u_errorName(status
));
4161 status
= U_ZERO_ERROR
;
4166 if (fields
[2].indexOf(UChar_i
) >= 0) {
4167 // ICU should skip this test.
4172 if (fields
[2].indexOf(UChar_c
) >= 0) {
4173 // This pattern should have caused a compilation error, but didn't.
4174 errln("line %d: Expected a pattern compile error, got success.", lineNum
);
4180 // replace the Perl variables that appear in some of the
4181 // match data strings.
4183 UnicodeString matchString
= fields
[1];
4184 matchString
.findAndReplace(nulnulSrc
, nulnul
);
4185 matchString
.findAndReplace(ffffSrc
, ffff
);
4187 // Replace any \n in the match string with an actual new-line char.
4188 // Don't do full unescape, as this unescapes more than Perl does, which
4189 // causes other spurious failures in the tests.
4190 matchString
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4195 // Run the test, check for expected match/don't match result.
4197 RegexMatcher
*testMat
= testPat
->matcher(matchString
, status
);
4198 UBool found
= testMat
->find();
4199 UBool expected
= FALSE
;
4200 if (fields
[2].indexOf(UChar_y
) >=0) {
4203 if (expected
!= found
) {
4204 errln("line %d: Expected %smatch, got %smatch",
4205 lineNum
, expected
?"":"no ", found
?"":"no " );
4209 // Don't try to check expected results if there is no match.
4210 // (Some have stuff in the expected fields)
4218 // Interpret the Perl expression from the fourth field of the data file,
4219 // building up an ICU string from the results of the ICU match.
4220 // The Perl expression will contain references to the results of
4221 // a regex match, including the matched string, capture group strings,
4222 // group starting and ending indices, etc.
4224 UnicodeString resultString
;
4225 UnicodeString perlExpr
= fields
[3];
4226 #if SUPPORT_MUTATING_INPUT_STRING
4227 groupsMat
->reset(perlExpr
);
4228 cgMat
->reset(perlExpr
);
4231 while (perlExpr
.length() > 0) {
4232 #if !SUPPORT_MUTATING_INPUT_STRING
4233 // Preferred usage. Reset after any modification to input string.
4234 groupsMat
->reset(perlExpr
);
4235 cgMat
->reset(perlExpr
);
4238 if (perlExpr
.startsWith("$&")) {
4239 resultString
.append(testMat
->group(status
));
4240 perlExpr
.remove(0, 2);
4243 else if (groupsMat
->lookingAt(status
)) {
4245 UnicodeString digitString
= groupsMat
->group(2, status
);
4247 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4248 UnicodeString plusOrMinus
= groupsMat
->group(1, status
);
4249 int32_t matchPosition
;
4250 if (plusOrMinus
.compare("+") == 0) {
4251 matchPosition
= testMat
->end(groupNum
, status
);
4253 matchPosition
= testMat
->start(groupNum
, status
);
4255 if (matchPosition
!= -1) {
4256 ICU_Utility::appendNumber(resultString
, matchPosition
);
4258 perlExpr
.remove(0, groupsMat
->end(status
));
4261 else if (cgMat
->lookingAt(status
)) {
4263 UnicodeString digitString
= cgMat
->group(1, status
);
4265 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4266 if (U_SUCCESS(status
)) {
4267 resultString
.append(testMat
->group(groupNum
, status
));
4268 status
= U_ZERO_ERROR
;
4270 perlExpr
.remove(0, cgMat
->end(status
));
4273 else if (perlExpr
.startsWith("@-")) {
4275 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4277 resultString
.append(" ");
4279 ICU_Utility::appendNumber(resultString
, testMat
->start(i
, status
));
4281 perlExpr
.remove(0, 2);
4284 else if (perlExpr
.startsWith("@+")) {
4286 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4288 resultString
.append(" ");
4290 ICU_Utility::appendNumber(resultString
, testMat
->end(i
, status
));
4292 perlExpr
.remove(0, 2);
4295 else if (perlExpr
.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4296 // or as an escaped sequence (e.g. \n)
4297 if (perlExpr
.length() > 1) {
4298 perlExpr
.remove(0, 1); // Remove the '\', but only if not last char.
4300 UChar c
= perlExpr
.charAt(0);
4302 case 'n': c
= '\n'; break;
4303 // add any other escape sequences that show up in the test expected results.
4305 resultString
.append(c
);
4306 perlExpr
.remove(0, 1);
4310 // Any characters from the perl expression that we don't explicitly
4311 // recognize before here are assumed to be literals and copied
4312 // as-is to the expected results.
4313 resultString
.append(perlExpr
.charAt(0));
4314 perlExpr
.remove(0, 1);
4317 if (U_FAILURE(status
)) {
4318 errln("Line %d: ICU Error \"%s\"", lineNum
, u_errorName(status
));
4324 // Expected Results Compare
4326 UnicodeString
expectedS(fields
[4]);
4327 expectedS
.findAndReplace(nulnulSrc
, nulnul
);
4328 expectedS
.findAndReplace(ffffSrc
, ffff
);
4329 expectedS
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4332 if (expectedS
.compare(resultString
) != 0) {
4333 err("Line %d: Incorrect perl expression results.", lineNum
);
4334 infoln((UnicodeString
)"Expected \""+expectedS
+(UnicodeString
)"\"; got \""+resultString
+(UnicodeString
)"\"");
4342 // All done. Clean up allocated stuff.
4360 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount
);
4365 //-------------------------------------------------------------------------------
4367 // PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts
4368 // (instead of using UnicodeStrings) to test the alternate engine.
4369 // The input file for this test is re_tests, the standard regular
4370 // expression test data distributed with the Perl source code.
4371 // See PerlTests() for more information.
4373 //-------------------------------------------------------------------------------
// Runs the Perl re_tests.txt data with both the pattern and the match input
// handed to the regex engine as UTF-8 UTexts (see the utext_openUTF8() calls
// further down) instead of as UnicodeStrings.
4374 void RegexTest::PerlTestsUTF8() {
4376 const char *srcPath
;
4377 UErrorCode status
= U_ZERO_ERROR
;
// Converter used by the extract() calls below to turn each UnicodeString
// pattern / input into UTF-8 bytes.
4379 LocalUConverterPointer
UTF8Converter(ucnv_open("UTF-8", &status
));
// UTF-8 byte buffers plus the UTexts opened over them. Both buffers start out
// NULL with capacity 0 and are (re)allocated inside the main loop whenever
// extract() reports U_BUFFER_OVERFLOW_ERROR.
4380 UText patternText
= UTEXT_INITIALIZER
;
4381 char *patternChars
= NULL
;
4382 int32_t patternLength
;
4383 int32_t patternCapacity
= 0;
4384 UText inputText
= UTEXT_INITIALIZER
;
4385 char *inputChars
= NULL
;
4386 int32_t inputLength
;
4387 int32_t inputCapacity
= 0;
// Install the STOP from-Unicode callback on the converter; presumably this
// makes unconvertible characters surface as errors rather than being
// substituted -- confirm against the ucnv_err.h documentation.
4389 ucnv_setFromUCallBack(UTF8Converter
.getAlias(), UCNV_FROM_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &status
);
4392 // Open and read the test data file.
4394 srcPath
=getPath(tdd
, "re_tests.txt");
4396 return; /* something went wrong, error already output */
// The data file is read with the "iso-8859-1" encoding name and comes back
// as UChars; len is passed to ReadAndConvertFile and later used as the
// length of testData.
4400 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, "iso-8859-1", status
);
4401 if (U_FAILURE(status
)) {
4402 return; /* something went wrong, error already output */
4406 // Put the test data into a UnicodeString
4408 UnicodeString
testDataString(FALSE
, testData
, len
);
4411 // Regex to break the input file into lines, and strip the new lines.
4412 // One line per match, capture group one is the desired data.
4414 RegexPattern
* linePat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe
, status
);
4415 if (U_FAILURE(status
)) {
4416 dataerrln("RegexPattern::compile() error");
4419 RegexMatcher
* lineMat
= linePat
->matcher(testDataString
, status
);
4422 // Regex to split a test file line into fields.
4423 // There are six fields, separated by tabs.
4425 RegexPattern
* fieldPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe
, status
);
4428 // Regex to identify test patterns with flag settings, and to separate them.
4429 // Test patterns with flags look like 'pattern'i
4430 // Test patterns without flags are not quoted: pattern
4431 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4433 RegexPattern
*flagPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe
, status
);
4434 RegexMatcher
* flagMat
= flagPat
->matcher(status
);
4437 // The Perl tests reference several perl-isms, which are evaluated/substituted
4438 // in the test data. Not being perl, this must be done explicitly. Here
4439 // are string constants and REs for these constructs.
4441 UnicodeString
nulnulSrc("${nulnul}");
4442 UnicodeString
nulnul("\\u0000\\u0000", -1, US_INV
);
4443 nulnul
= nulnul
.unescape();
4445 UnicodeString
ffffSrc("${ffff}");
4446 UnicodeString
ffff("\\uffff", -1, US_INV
);
4447 ffff
= ffff
.unescape();
4449 // regexp for $-[0], $+[2], etc.
4450 RegexPattern
*groupsPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe
, status
);
4451 RegexMatcher
*groupsMat
= groupsPat
->matcher(status
);
4453 // regexp for $0, $1, $2, etc.
4454 RegexPattern
*cgPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe
, status
);
4455 RegexMatcher
*cgMat
= cgPat
->matcher(status
);
4459 // Main Loop for the Perl Tests, runs once per line from the
4462 int32_t lineNum
= 0;
4463 int32_t skippedUnimplementedCount
= 0;
4464 while (lineMat
->find()) {
4468 // Get a line, break it into its fields, do the Perl
4469 // variable substitutions.
4471 UnicodeString line
= lineMat
->group(1, status
);
4472 UnicodeString fields
[7];
4473 fieldPat
->split(line
, fields
, 7, status
);
4475 flagMat
->reset(fields
[0]);
4476 flagMat
->matches(status
);
4477 UnicodeString pattern
= flagMat
->group(2, status
);
4478 pattern
.findAndReplace("${bang}", "!");
4479 pattern
.findAndReplace(nulnulSrc
, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4480 pattern
.findAndReplace(ffffSrc
, ffff
);
4483 // Identify patterns that include match flag settings,
4484 // split off the flags, remove the extra quotes.
4486 UnicodeString flagStr
= flagMat
->group(3, status
);
4487 if (U_FAILURE(status
)) {
4488 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
4492 const UChar UChar_c
= 0x63; // Char constants for the flag letters.
4493 const UChar UChar_i
= 0x69; // (Damn the lack of Unicode support in C)
4494 const UChar UChar_m
= 0x6d;
4495 const UChar UChar_x
= 0x78;
4496 const UChar UChar_y
= 0x79;
4497 if (flagStr
.indexOf(UChar_i
) != -1) {
4498 flags
|= UREGEX_CASE_INSENSITIVE
;
4500 if (flagStr
.indexOf(UChar_m
) != -1) {
4501 flags
|= UREGEX_MULTILINE
;
4503 if (flagStr
.indexOf(UChar_x
) != -1) {
4504 flags
|= UREGEX_COMMENTS
;
4508 // Put the pattern in a UTF-8 UText
4510 status
= U_ZERO_ERROR
;
// First attempt converts into whatever buffer is currently allocated
// (initially NULL with capacity 0). The value returned by extract() is used
// below as the number of UTF-8 bytes required.
4511 patternLength
= pattern
.extract(patternChars
, patternCapacity
, UTF8Converter
.getAlias(), status
);
4512 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
// Buffer too small: clear the error, replace the buffer with one of the
// reported length plus one (presumably room for a terminating NUL), and
// convert again.
4513 status
= U_ZERO_ERROR
;
4514 delete[] patternChars
;
4515 patternCapacity
= patternLength
+ 1;
4516 patternChars
= new char[patternCapacity
];
4517 pattern
.extract(patternChars
, patternCapacity
, UTF8Converter
.getAlias(), status
);
// (Re)open the caller-owned UText over the converted bytes, passing the
// explicit byte length returned by the first extract() call.
4519 utext_openUTF8(&patternText
, patternChars
, patternLength
, &status
);
4522 // Compile the test pattern.
// Uses the RegexPattern::compile() overload that takes the pattern as a UText*.
4524 RegexPattern
*testPat
= RegexPattern::compile(&patternText
, flags
, pe
, status
);
4525 if (status
== U_REGEX_UNIMPLEMENTED
) {
4527 // Test of a feature that is planned for ICU, but not yet implemented.
4529 skippedUnimplementedCount
++;
4531 status
= U_ZERO_ERROR
;
4535 if (U_FAILURE(status
)) {
4536 // Some tests are supposed to generate errors.
4537 // Only report an error for tests that are supposed to succeed.
4538 if (fields
[2].indexOf(UChar_c
) == -1 && // Compilation is not supposed to fail AND
4539 fields
[2].indexOf(UChar_i
) == -1) // it's not an accepted ICU incompatibility
4541 errln("line %d: ICU Error \"%s\"\n", lineNum
, u_errorName(status
));
4543 status
= U_ZERO_ERROR
;
4548 if (fields
[2].indexOf(UChar_i
) >= 0) {
4549 // ICU should skip this test.
4554 if (fields
[2].indexOf(UChar_c
) >= 0) {
4555 // This pattern should have caused a compilation error, but didn't.
4556 errln("line %d: Expected a pattern compile error, got success.", lineNum
);
4563 // replace the Perl variables that appear in some of the
4564 // match data strings.
4566 UnicodeString matchString
= fields
[1];
4567 matchString
.findAndReplace(nulnulSrc
, nulnul
);
4568 matchString
.findAndReplace(ffffSrc
, ffff
);
4570 // Replace any \n in the match string with an actual new-line char.
4571 // Don't do full unescape, as this unescapes more than Perl does, which
4572 // causes other spurious failures in the tests.
4573 matchString
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4576 // Put the input in a UTF-8 UText
4578 status
= U_ZERO_ERROR
;
// Same convert / grow / convert-again sequence as used for the pattern:
// try the existing buffer first, and only reallocate on overflow.
4579 inputLength
= matchString
.extract(inputChars
, inputCapacity
, UTF8Converter
.getAlias(), status
);
4580 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
4581 status
= U_ZERO_ERROR
;
4582 delete[] inputChars
;
4583 inputCapacity
= inputLength
+ 1;
4584 inputChars
= new char[inputCapacity
];
4585 matchString
.extract(inputChars
, inputCapacity
, UTF8Converter
.getAlias(), status
);
// (Re)open the caller-owned UText over the converted input bytes.
4587 utext_openUTF8(&inputText
, inputChars
, inputLength
, &status
);
4590 // Run the test, check for expected match/don't match result.
// matcher(status) creates a matcher with no input; reset(&inputText) then
// attaches the UTF-8 UText. The address of reset()'s result is taken to get
// the RegexMatcher pointer back.
4592 RegexMatcher
*testMat
= &testPat
->matcher(status
)->reset(&inputText
);
4593 UBool found
= testMat
->find();
4594 UBool expected
= FALSE
;
4595 if (fields
[2].indexOf(UChar_y
) >=0) {
4598 if (expected
!= found
) {
4599 errln("line %d: Expected %smatch, got %smatch",
4600 lineNum
, expected
?"":"no ", found
?"":"no " );
4604 // Don't try to check expected results if there is no match.
4605 // (Some have stuff in the expected fields)
4613 // Interpret the Perl expression from the fourth field of the data file,
4614 // building up an ICU string from the results of the ICU match.
4615 // The Perl expression will contain references to the results of
4616 // a regex match, including the matched string, capture group strings,
4617 // group starting and ending indices, etc.
4619 UnicodeString resultString
;
4620 UnicodeString perlExpr
= fields
[3];
4622 while (perlExpr
.length() > 0) {
4623 groupsMat
->reset(perlExpr
);
4624 cgMat
->reset(perlExpr
);
4626 if (perlExpr
.startsWith("$&")) {
4627 resultString
.append(testMat
->group(status
));
4628 perlExpr
.remove(0, 2);
4631 else if (groupsMat
->lookingAt(status
)) {
4633 UnicodeString digitString
= groupsMat
->group(2, status
);
4635 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4636 UnicodeString plusOrMinus
= groupsMat
->group(1, status
);
4637 int32_t matchPosition
;
4638 if (plusOrMinus
.compare("+") == 0) {
4639 matchPosition
= testMat
->end(groupNum
, status
);
4641 matchPosition
= testMat
->start(groupNum
, status
);
4643 if (matchPosition
!= -1) {
4644 ICU_Utility::appendNumber(resultString
, matchPosition
);
4646 perlExpr
.remove(0, groupsMat
->end(status
));
4649 else if (cgMat
->lookingAt(status
)) {
4651 UnicodeString digitString
= cgMat
->group(1, status
);
4653 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4654 if (U_SUCCESS(status
)) {
4655 resultString
.append(testMat
->group(groupNum
, status
));
4656 status
= U_ZERO_ERROR
;
4658 perlExpr
.remove(0, cgMat
->end(status
));
4661 else if (perlExpr
.startsWith("@-")) {
4663 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4665 resultString
.append(" ");
4667 ICU_Utility::appendNumber(resultString
, testMat
->start(i
, status
));
4669 perlExpr
.remove(0, 2);
4672 else if (perlExpr
.startsWith("@+")) {
4674 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4676 resultString
.append(" ");
4678 ICU_Utility::appendNumber(resultString
, testMat
->end(i
, status
));
4680 perlExpr
.remove(0, 2);
4683 else if (perlExpr
.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4684 // or as an escaped sequence (e.g. \n)
4685 if (perlExpr
.length() > 1) {
4686 perlExpr
.remove(0, 1); // Remove the '\', but only if not last char.
4688 UChar c
= perlExpr
.charAt(0);
4690 case 'n': c
= '\n'; break;
4691 // add any other escape sequences that show up in the test expected results.
4693 resultString
.append(c
);
4694 perlExpr
.remove(0, 1);
4698 // Any characters from the perl expression that we don't explicitly
4699 // recognize before here are assumed to be literals and copied
4700 // as-is to the expected results.
4701 resultString
.append(perlExpr
.charAt(0));
4702 perlExpr
.remove(0, 1);
4705 if (U_FAILURE(status
)) {
4706 errln("Line %d: ICU Error \"%s\"", lineNum
, u_errorName(status
));
4712 // Expected Results Compare
4714 UnicodeString
expectedS(fields
[4]);
4715 expectedS
.findAndReplace(nulnulSrc
, nulnul
);
4716 expectedS
.findAndReplace(ffffSrc
, ffff
);
4717 expectedS
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4720 if (expectedS
.compare(resultString
) != 0) {
4721 err("Line %d: Incorrect perl expression results.", lineNum
);
4722 infoln((UnicodeString
)"Expected \""+expectedS
+(UnicodeString
)"\"; got \""+resultString
+(UnicodeString
)"\"");
4730 // All done. Clean up allocated stuff.
// Close the two UTexts before freeing the UTF-8 byte buffers they were
// opened over.
4747 utext_close(&patternText
);
4748 utext_close(&inputText
);
4750 delete [] patternChars
;
4751 delete [] inputChars
;
// Report how many test lines were skipped because compile() returned
// U_REGEX_UNIMPLEMENTED.
4754 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount
);
4759 //--------------------------------------------------------------
4761 // Bug6149 Verify limits to heap expansion for backtrack stack.
4762 // Use this pattern,
4763 // "(a?){1,8000000}"
4764 // Note: was an unbounded upperbounds, but that now has loop-breaking enabled.
4765 // This test is likely to be fragile, as further optimizations stop
4766 // more cases of pointless looping in the match engine.
4768 //---------------------------------------------------------------
// Matches the pattern "(a?){1,8000000}" against "xyz" and expects the
// attempt to end with U_REGEX_STACK_OVERFLOW rather than a match.
4769 void RegexTest::Bug6149() {
4770 UnicodeString
pattern("(a?){1,8000000}");
4771 UnicodeString
s("xyz");
4773 UErrorCode status
= U_ZERO_ERROR
;
4775 RegexMatcher
matcher(pattern
, s
, flags
, status
);
4776 UBool result
= false;
// REGEX_ASSERT_FAIL presumably checks that the expression leaves status equal
// to the given error code -- see the macro definition earlier in the file.
4777 REGEX_ASSERT_FAIL(result
=matcher
.matches(status
), U_REGEX_STACK_OVERFLOW
);
// The failed match attempt must not have reported a match.
4778 REGEX_ASSERT(result
== FALSE
);
4783 // Callbacks() Test the callback function.
4784 // When set, callbacks occur periodically during matching operations,
4785 // giving the application code the ability to abort the operation
4786 // before its normal completion.
// Context object handed to testCallBackFn() through the matcher's opaque
// callback-context pointer. The callback reads and updates its test,
// maxCalls, numCalls and lastSteps members.
4789 struct callBackContext
{
// Re-arm the context for a new match attempt: set the call limit to max and
// zero the call and step counters.
4794 void reset(int32_t max
) {maxCalls
=max
; numCalls
=0; lastSteps
=0;}
// Match callback used by Callbacks().
//   context - the callBackContext registered with setMatchCallback().
//   steps   - step count passed in by the matcher; expected to be exactly
//             one more than on the previous invocation.
// Returns non-zero while numCalls is still below maxCalls; Callbacks()
// expects U_REGEX_STOPPED_BY_CALLER once the limit has been hit.
4798 static UBool U_CALLCONV
4799 testCallBackFn(const void *context
, int32_t steps
) {
4800 callBackContext
*info
= (callBackContext
*)context
;
// Report a test error if the step counter did not advance by exactly one.
4801 if (info
->lastSteps
+1 != steps
) {
4802 info
->test
->errln("incorrect steps in callback. Expected %d, got %d\n", info
->lastSteps
+1, steps
);
4804 info
->lastSteps
= steps
;
4806 return (info
->numCalls
< info
->maxCalls
);
// Exercises RegexMatcher::setMatchCallback() / getMatchCallback():
//   1. with no callback set, the getter must overwrite its outputs with NULL;
//   2. with testCallBackFn installed, the getter must return it unchanged, and
//      matches() / find() must report U_REGEX_STOPPED_BY_CALLER when the
//      callback asks for the operation to stop.
4810 void RegexTest::Callbacks() {
4812 // Getter returns NULLs if no callback has been set
4814 // The variables that the getter will fill in.
4815 // Init to non-null values so that the action of the getter can be seen.
4816 const void *returnedContext
= &returnedContext
;
4817 URegexMatchCallback
*returnedFn
= &testCallBackFn
;
4819 UErrorCode status
= U_ZERO_ERROR
;
4820 RegexMatcher
matcher("x", 0, status
);
4822 matcher
.getMatchCallback(returnedFn
, returnedContext
, status
);
4824 REGEX_ASSERT(returnedFn
== NULL
);
4825 REGEX_ASSERT(returnedContext
== NULL
);
// Second scenario: install a real callback together with its context object.
4830 callBackContext cbInfo
= {this, 0, 0, 0};
4831 const void *returnedContext
;
4832 URegexMatchCallback
*returnedFn
;
4833 UErrorCode status
= U_ZERO_ERROR
;
4834 RegexMatcher
matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status
); // A pattern that can run long.
4836 matcher
.setMatchCallback(testCallBackFn
, &cbInfo
, status
);
// The getter must hand back exactly the function and context just installed.
4838 matcher
.getMatchCallback(returnedFn
, returnedContext
, status
);
4840 REGEX_ASSERT(returnedFn
== testCallBackFn
);
4841 REGEX_ASSERT(returnedContext
== &cbInfo
);
4843 // A short-running match shouldn't invoke the callback
4844 status
= U_ZERO_ERROR
;
4846 UnicodeString s
= "xxx";
4848 REGEX_ASSERT(matcher
.matches(status
));
4850 REGEX_ASSERT(cbInfo
.numCalls
== 0);
4852 // A medium-length match that runs long enough to invoke the
4853 // callback, but not so long that the callback aborts it.
4854 status
= U_ZERO_ERROR
;
4856 s
= "aaaaaaaaaaaaaaaaaaab";
4858 REGEX_ASSERT(matcher
.matches(status
)==FALSE
);
4860 REGEX_ASSERT(cbInfo
.numCalls
> 0);
4862 // A longer running match that the callback function will abort.
4863 status
= U_ZERO_ERROR
;
4865 s
= "aaaaaaaaaaaaaaaaaaaaaaab";
// An abort requested by the callback shows up as matches() returning FALSE
// with status set to U_REGEX_STOPPED_BY_CALLER.
4867 REGEX_ASSERT(matcher
.matches(status
)==FALSE
);
4868 REGEX_ASSERT(status
== U_REGEX_STOPPED_BY_CALLER
);
4869 REGEX_ASSERT(cbInfo
.numCalls
== 4);
4871 // A longer running find that the callback function will abort.
4872 status
= U_ZERO_ERROR
;
4874 s
= "aaaaaaaaaaaaaaaaaaaaaaab";
// Same expectation for find(): FALSE plus U_REGEX_STOPPED_BY_CALLER.
4876 REGEX_ASSERT(matcher
.find(status
)==FALSE
);
4877 REGEX_ASSERT(status
== U_REGEX_STOPPED_BY_CALLER
);
4878 REGEX_ASSERT(cbInfo
.numCalls
== 4);
4886 // FindProgressCallbacks() Test the find "progress" callback function.
4887 // When set, the find progress callback will be invoked during find operations
4888 // after each return from a match attempt, giving the application the opportunity
4889 // to terminate a long-running find operation before its normal completion.
// Context object handed to testProgressCallBackFn() through the matcher's
// opaque callback-context pointer. The callback and the test use its
// maxCalls, numCalls and lastIndex members.
4892 struct progressCallBackContext
{
// Re-arm the context for a new find(): set the call limit to max and zero
// the call counter and the last reported index.
4897 void reset(int32_t max
) {maxCalls
=max
; numCalls
=0;lastIndex
=0;}
4900 // call-back function for find().
4901 // Return TRUE to continue the find().
4902 // Return FALSE to stop the find().
//   context    - the progressCallBackContext registered with
//                setFindProgressCallback().
//   matchIndex - index reported by the matcher; saved in lastIndex so that
//                FindProgressCallbacks() can resume find() from it.
4904 static UBool U_CALLCONV
4905 testProgressCallBackFn(const void *context
, int64_t matchIndex
) {
4906 progressCallBackContext
*info
= (progressCallBackContext
*)context
;
4908 info
->lastIndex
= matchIndex
;
4909 // info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
// Keep going only while the call budget set by reset() has not been used up.
4910 return (info
->numCalls
< info
->maxCalls
);
// Exercises RegexMatcher::setFindProgressCallback() /
// getFindProgressCallback():
//   1. with no callback set, the getter must overwrite its outputs with NULL;
//   2. with testProgressCallBackFn installed, the getter must return it
//      unchanged, find() must report U_REGEX_STOPPED_BY_CALLER when the
//      callback asks it to stop, and an interrupted find() can be resumed
//      from the last index the callback saw.
4914 void RegexTest::FindProgressCallbacks() {
4916 // Getter returns NULLs if no callback has been set
4918 // The variables that the getter will fill in.
4919 // Init to non-null values so that the action of the getter can be seen.
4920 const void *returnedContext
= &returnedContext
;
4921 URegexFindProgressCallback
*returnedFn
= &testProgressCallBackFn
;
4923 UErrorCode status
= U_ZERO_ERROR
;
4924 RegexMatcher
matcher("x", 0, status
);
4926 matcher
.getFindProgressCallback(returnedFn
, returnedContext
, status
);
4928 REGEX_ASSERT(returnedFn
== NULL
);
4929 REGEX_ASSERT(returnedContext
== NULL
);
// Second scenario: install a real progress callback with its context object.
4934 progressCallBackContext cbInfo
= {this, 0, 0, 0};
4935 const void *returnedContext
;
4936 URegexFindProgressCallback
*returnedFn
;
4937 UErrorCode status
= U_ZERO_ERROR
;
4938 RegexMatcher
matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status
);
4940 matcher
.setFindProgressCallback(testProgressCallBackFn
, &cbInfo
, status
);
// The getter must hand back exactly the function and context just installed.
4942 matcher
.getFindProgressCallback(returnedFn
, returnedContext
, status
);
4944 REGEX_ASSERT(returnedFn
== testProgressCallBackFn
);
4945 REGEX_ASSERT(returnedContext
== &cbInfo
);
4947 // A find that matches on the initial position does NOT invoke the callback.
4948 status
= U_ZERO_ERROR
;
4950 UnicodeString s
= "aaxxx";
4953 matcher
.setTrace(TRUE
);
4955 REGEX_ASSERT(matcher
.find(0, status
));
4957 REGEX_ASSERT(cbInfo
.numCalls
== 0);
4959 // A medium running find() that causes matcher.find() to invoke our callback for each index,
4960 // but not so many times that we interrupt the operation.
4961 status
= U_ZERO_ERROR
;
4962 s
= "aaaaaaaaaaaaaaaaaaab";
4963 cbInfo
.reset(s
.length()); // Some upper limit for number of calls that is greater than size of our input string
4965 REGEX_ASSERT(matcher
.find(0, status
)==FALSE
);
4967 REGEX_ASSERT(cbInfo
.numCalls
> 0 && cbInfo
.numCalls
< 25);
4969 // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4970 status
= U_ZERO_ERROR
;
4971 UnicodeString s1
= "aaaaaaaaaaaaaaaaaaaaaaab";
4972 cbInfo
.reset(s1
.length() - 5); // Bail early somewhere near the end of input string
// The interrupted find() must return FALSE, set U_REGEX_STOPPED_BY_CALLER,
// and have invoked the callback exactly as many times as the limit allows.
4974 REGEX_ASSERT(matcher
.find(0, status
)==FALSE
);
4975 REGEX_ASSERT(status
== U_REGEX_STOPPED_BY_CALLER
);
4976 REGEX_ASSERT(cbInfo
.numCalls
== s1
.length() - 5);
4978 // Now a match that will succeed, but after an interruption
4979 status
= U_ZERO_ERROR
;
4980 UnicodeString s2
= "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4981 cbInfo
.reset(s2
.length() - 10); // Bail early somewhere near the end of input string
4983 REGEX_ASSERT(matcher
.find(0, status
)==FALSE
);
4984 REGEX_ASSERT(status
== U_REGEX_STOPPED_BY_CALLER
);
4985 // Now retry the match from where left off
4986 cbInfo
.maxCalls
= 100; // No callback limit
4987 status
= U_ZERO_ERROR
;
// Resume from the last index handed to the callback before the interruption.
4988 REGEX_ASSERT(matcher
.find(cbInfo
.lastIndex
, status
));
4996 //---------------------------------------------------------------------------
4998 // PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
4999 // UTexts. The pure-C implementation of UText
5000 // has no mutable backing stores, but we can
5001 // use UnicodeString here to test the functionality.
5003 //---------------------------------------------------------------------------
// Checks the uregex_* C API entry points that write their result into a
// caller-supplied ("pre-allocated") UText. bufferText, opened over the
// UnicodeString `buffer`, is the destination passed to every call.
5004 void RegexTest::PreAllocatedUTextCAPI () {
5005 UErrorCode status
= U_ZERO_ERROR
;
5006 URegularExpression
*re
;
5007 UText patternText
= UTEXT_INITIALIZER
;
5008 UnicodeString buffer
;
5009 UText bufferText
= UTEXT_INITIALIZER
;
5011 utext_openUnicodeString(&bufferText
, &buffer
, &status
);
5014 * getText() and getUText()
5017 UText text1
= UTEXT_INITIALIZER
;
5018 UText text2
= UTEXT_INITIALIZER
;
5019 UChar text2Chars
[20];
5022 status
= U_ZERO_ERROR
;
5023 regextst_openUTF8FromInvariant(&text1
, "abcccd", -1, &status
);
5024 regextst_openUTF8FromInvariant(&text2
, "abcccxd", -1, &status
);
// Bound the copy by the destination array text2Chars. The previous bound was
// derived from sizeof() of the UText struct text2, which is unrelated to the
// 20-element destination buffer.
5025 u_uastrncpy(text2Chars
, "abcccxd", UPRV_LENGTHOF(text2Chars
));
// text2 is re-opened here over the UChar copy of the same string.
5026 utext_openUChars(&text2
, text2Chars
, -1, &status
);
5028 regextst_openUTF8FromInvariant(&patternText
, "abc*d", -1, &status
);
5029 re
= uregex_openUText(&patternText
, 0, NULL
, &status
);
5031 /* First set a UText */
5032 uregex_setUText(re
, &text1
, &status
);
// uregex_getUText() must return the destination UText it was given, and that
// UText must compare equal to the input that was set.
5033 resultText
= uregex_getUText(re
, &bufferText
, &status
);
5035 REGEX_ASSERT(resultText
== &bufferText
);
5036 utext_setNativeIndex(resultText
, 0);
5037 utext_setNativeIndex(&text1
, 0);
5038 REGEX_ASSERT(testUTextEqual(resultText
, &text1
));
// Calling the getter a second time must give the same result.
5040 resultText
= uregex_getUText(re
, &bufferText
, &status
);
5042 REGEX_ASSERT(resultText
== &bufferText
);
5043 utext_setNativeIndex(resultText
, 0);
5044 utext_setNativeIndex(&text1
, 0);
5045 REGEX_ASSERT(testUTextEqual(resultText
, &text1
));
5047 /* Then set a UChar * */
// The explicit length 7 is the length of "abcccxd".
5048 uregex_setText(re
, text2Chars
, 7, &status
);
5049 resultText
= uregex_getUText(re
, &bufferText
, &status
);
5051 REGEX_ASSERT(resultText
== &bufferText
);
5052 utext_setNativeIndex(resultText
, 0);
5053 utext_setNativeIndex(&text2
, 0);
5054 REGEX_ASSERT(testUTextEqual(resultText
, &text2
));
5057 utext_close(&text1
);
5058 utext_close(&text2
);
// uregex_groupUText() section: find "abc(.*?)def" in text1 and fetch capture
// groups into the pre-allocated bufferText. The ruler comment under the
// string gives the character positions the asserts refer to.
5070 u_uastrncpy(text1
, "noise abc interior def, and this is off the end", UPRV_LENGTHOF(text1
));
5071 // 012345678901234567890123456789012345678901234567
5074 status
= U_ZERO_ERROR
;
5075 re
= uregex_openC("abc(.*?)def", 0, NULL
, &status
);
5078 uregex_setText(re
, text1
, -1, &status
);
5079 result
= uregex_find(re
, 0, &status
);
5080 REGEX_ASSERT(result
==TRUE
);
5082 /* Capture Group 0, the full match. Should succeed. "abc interior def" */
5083 status
= U_ZERO_ERROR
;
// The asserts expect the returned UText to be the supplied destination,
// spanning the whole 47-unit input, with its native index at the start of the
// group and `length` holding the group's extent.
5084 actual
= uregex_groupUText(re
, 0, &bufferText
, &length
, &status
);
5086 REGEX_ASSERT(actual
== &bufferText
);
5087 REGEX_ASSERT(utext_getNativeIndex(actual
) == 6);
5088 REGEX_ASSERT(length
== 16);
5089 REGEX_ASSERT(utext_nativeLength(actual
) == 47);
5091 /* Capture group #1. Should succeed, matching " interior ". */
5092 status
= U_ZERO_ERROR
;
5093 actual
= uregex_groupUText(re
, 1, &bufferText
, &length
, &status
);
5095 REGEX_ASSERT(actual
== &bufferText
);
5096 REGEX_ASSERT(utext_getNativeIndex(actual
) == 9); // position of " interior "
5097 REGEX_ASSERT(length
== 10);
5098 REGEX_ASSERT(utext_nativeLength(actual
) == 47);
5100 /* Capture group out of range. Error. */
5101 status
= U_ZERO_ERROR
;
// The pattern has only one capture group, so group 2 must fail with
// U_INDEX_OUTOFBOUNDS_ERROR; the destination UText is still what is returned.
5102 actual
= uregex_groupUText(re
, 2, &bufferText
, &length
, &status
);
5103 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
5104 REGEX_ASSERT(actual
== &bufferText
);
// uregex_replaceFirstUText() section: replace the first match of "x(.*?)x"
// and write the result into the pre-allocated bufferText.
5115 UText replText
= UTEXT_INITIALIZER
;
5117 status
= U_ZERO_ERROR
;
5118 utext_openUnicodeString(&bufferText
, &buffer
, &status
);
5120 status
= U_ZERO_ERROR
;
// Both copies are bounded by the full element count of their destination
// array. The text2 bound used to be halved, unlike the text1 call.
5121 u_uastrncpy(text1
, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1
));
5122 u_uastrncpy(text2
, "No match here.", UPRV_LENGTHOF(text2
));
5123 regextst_openUTF8FromInvariant(&replText
, "<$1>", -1, &status
);
5125 re
= uregex_openC("x(.*?)x", 0, NULL
, &status
);
5128 /* Normal case, with match */
5129 uregex_setText(re
, text1
, -1, &status
);
// Empty the destination first: replace its entire current contents with
// nothing.
5131 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5133 result
= uregex_replaceFirstUText(re
, &replText
, &bufferText
, &status
);
5135 REGEX_ASSERT(result
== &bufferText
);
5136 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result
);
5138 /* No match. Text should copy to output with no changes. */
5139 uregex_setText(re
, text2
, -1, &status
);
5140 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5141 result
= uregex_replaceFirstUText(re
, &replText
, &bufferText
, &status
);
5143 REGEX_ASSERT(result
== &bufferText
);
5144 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result
);
5146 /* Unicode escapes */
5147 uregex_setText(re
, text1
, -1, &status
);
// Replacement text mixes an escaped backslash, \u and \U escapes, a group
// reference, an escaped '$' and an escaped ordinary letter; the expected
// output is asserted below.
5148 regextst_openUTF8FromInvariant(&replText
, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status
);
5149 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5150 result
= uregex_replaceFirstUText(re
, &replText
, &bufferText
, &status
);
5152 REGEX_ASSERT(result
== &bufferText
);
5153 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result
);
5156 utext_close(&replText
);
// uregex_replaceAllUText() section: replace every match of "x(.*?)x" and
// write the result into the pre-allocated bufferText.
5166 UText replText
= UTEXT_INITIALIZER
;
5169 status
= U_ZERO_ERROR
;
// Bound each copy by the destination's element count via UPRV_LENGTHOF, as
// the other sections of this function do, rather than by sizeof()/2, which
// hard-codes the size of a UChar.
5170 u_uastrncpy(text1
, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1
));
5171 u_uastrncpy(text2
, "No match here.", UPRV_LENGTHOF(text2
));
5172 regextst_openUTF8FromInvariant(&replText
, "<$1>", -1, &status
);
5174 re
= uregex_openC("x(.*?)x", 0, NULL
, &status
);
5177 /* Normal case, with match */
5178 uregex_setText(re
, text1
, -1, &status
);
// Empty the destination first: replace its entire current contents with
// nothing.
5179 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5180 result
= uregex_replaceAllUText(re
, &replText
, &bufferText
, &status
);
5182 REGEX_ASSERT(result
== &bufferText
);
5183 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result
);
5185 /* No match. Text should copy to output with no changes. */
5186 uregex_setText(re
, text2
, -1, &status
);
5187 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5188 result
= uregex_replaceAllUText(re
, &replText
, &bufferText
, &status
);
5190 REGEX_ASSERT(result
== &bufferText
);
5191 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result
);
5194 utext_close(&replText
);
5199 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5200 * so we don't need to test it here.
// Release the function-level UTexts opened at the top of the test.
5203 utext_close(&bufferText
);
5204 utext_close(&patternText
);
5208 //--------------------------------------------------------------
5210 // NamedCapture Check basic named capture group functionality
5212 //--------------------------------------------------------------
// Exercises named capture groups: name-to-number lookup on a compiled pattern and
// on a copy of it, then replaceAll() with $n and ${name} references, first through
// the C++ RegexMatcher API and then through the plain C uregex_replaceAll() API.
5213 void RegexTest::NamedCapture() {
5214 UErrorCode status
= U_ZERO_ERROR
;
5215 RegexPattern
*pat
= RegexPattern::compile(UnicodeString(
5216 "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status
);
5218 int32_t group
= pat
->groupNumberFromName("five", -1, status
);
5220 REGEX_ASSERT(5 == group
);
5221 group
= pat
->groupNumberFromName("three", -1, status
);
5223 REGEX_ASSERT(3 == group
);
5225 status
= U_ZERO_ERROR
;
5226 group
= pat
->groupNumberFromName(UnicodeString("six"), status
);
5228 REGEX_ASSERT(6 == group
);
5230 status
= U_ZERO_ERROR
;
5231 group
= pat
->groupNumberFromName(UnicodeString("nosuch"), status
);
// An unknown name must be rejected. Checked with REGEX_ASSERT, like every other
// check in this function, so that a failure is reported through the test framework.
5232 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5234 status
= U_ZERO_ERROR
;
5236 // After copying a pattern, named capture should still work in the copy.
5237 RegexPattern
*copiedPat
= new RegexPattern(*pat
);
5238 REGEX_ASSERT(*copiedPat
== *pat
);
5239 delete pat
; pat
= NULL
; // Delete original, copy should have no references back to it.
5241 group
= copiedPat
->groupNumberFromName("five", -1, status
);
5243 REGEX_ASSERT(5 == group
);
5244 group
= copiedPat
->groupNumberFromName("three", -1, status
);
5246 REGEX_ASSERT(3 == group
);
5249 // ReplaceAll with named capture group.
5250 status
= U_ZERO_ERROR
;
5251 UnicodeString
text("Substitution of <<quotes>> for <<double brackets>>");
5252 RegexMatcher
*m
= new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text
, 0, status
);
5254 // m.pattern().dumpPattern();
5255 UnicodeString replacedText
= m
->replaceAll("'${mid}'", status
);
5257 REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText
);
5260 // ReplaceAll, allowed capture group numbers.
5261 text
= UnicodeString("abcmxyz");
5262 m
= new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text
, 0, status
);
5265 status
= U_ZERO_ERROR
;
5266 replacedText
= m
->replaceAll(UnicodeString("<$0>"), status
); // group 0, full match, is allowed.
5268 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText
);
5270 status
= U_ZERO_ERROR
;
5271 replacedText
= m
->replaceAll(UnicodeString("<$1>"), status
); // group 1 by number.
5273 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText
);
5275 status
= U_ZERO_ERROR
;
5276 replacedText
= m
->replaceAll(UnicodeString("<${one}>"), status
); // group 1 by name.
5278 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText
);
5280 status
= U_ZERO_ERROR
;
5281 replacedText
= m
->replaceAll(UnicodeString("<$2>"), status
); // group 2.
5283 REGEX_ASSERT(UnicodeString("a<x>z") == replacedText
);
5285 status
= U_ZERO_ERROR
;
5286 replacedText
= m
->replaceAll(UnicodeString("<$3>"), status
);
5288 REGEX_ASSERT(UnicodeString("a<y>z") == replacedText
);
5290 status
= U_ZERO_ERROR
;
5291 replacedText
= m
->replaceAll(UnicodeString("<$4>"), status
);
5292 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
5294 status
= U_ZERO_ERROR
;
5295 replacedText
= m
->replaceAll(UnicodeString("<$04>"), status
); // group 0, leading 0,
5296 REGEX_CHECK_STATUS
; // trailing out-of-range 4 passes through.
5297 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText
);
5299 status
= U_ZERO_ERROR
;
5300 replacedText
= m
->replaceAll(UnicodeString("<$000016>"), status
); // Consume leading zeroes. Don't consume digits
5301 REGEX_CHECK_STATUS
; // that push group num out of range.
5302 REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText
); // This is group 1.
5304 status
= U_ZERO_ERROR
;
5305 replacedText
= m
->replaceAll(UnicodeString("<$3$2$1${one}>"), status
);
5307 REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText
);
5309 status
= U_ZERO_ERROR
;
5310 replacedText
= m
->replaceAll(UnicodeString("$3$2$1${one}"), status
);
5312 REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText
);
5314 status
= U_ZERO_ERROR
;
5315 replacedText
= m
->replaceAll(UnicodeString("<${noSuchName}>"), status
);
5316 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5318 status
= U_ZERO_ERROR
;
5319 replacedText
= m
->replaceAll(UnicodeString("<${invalid-name}>"), status
);
5320 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5322 status
= U_ZERO_ERROR
;
5323 replacedText
= m
->replaceAll(UnicodeString("<${one"), status
);
5324 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5326 status
= U_ZERO_ERROR
;
5327 replacedText
= m
->replaceAll(UnicodeString("$not a capture group"), status
);
5328 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5332 // Repeat the above replaceAll() tests using the plain C API, which
5333 // has a separate implementation internally.
5334 // TODO: factor out the test data.
5336 status
= U_ZERO_ERROR
;
5337 URegularExpression
*re
= uregex_openC("..(?<one>m)(.)(.)", 0, NULL
, &status
);
5339 text
= UnicodeString("abcmxyz");
5340 uregex_setText(re
, text
.getBuffer(), text
.length(), &status
);
5343 UChar resultBuf
[100];
5344 int32_t resultLength
;
5347 status
= U_ZERO_ERROR
;
5348 repl
= UnicodeString("<$0>");
5349 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5351 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf
, resultLength
));
5353 status
= U_ZERO_ERROR
;
5354 repl
= UnicodeString("<$1>");
5355 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5357 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf
, resultLength
));
5359 status
= U_ZERO_ERROR
;
5360 repl
= UnicodeString("<${one}>");
5361 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5363 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf
, resultLength
));
5365 status
= U_ZERO_ERROR
;
5366 repl
= UnicodeString("<$2>");
5367 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5369 REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf
, resultLength
));
5371 status
= U_ZERO_ERROR
;
5372 repl
= UnicodeString("<$3>");
5373 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5375 REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf
, resultLength
));
5377 status
= U_ZERO_ERROR
;
5378 repl
= UnicodeString("<$4>");
5379 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5380 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
5382 status
= U_ZERO_ERROR
;
5383 repl
= UnicodeString("<$04>");
5384 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5386 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf
, resultLength
));
5388 status
= U_ZERO_ERROR
;
5389 repl
= UnicodeString("<$000016>");
5390 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5392 REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf
, resultLength
));
5394 status
= U_ZERO_ERROR
;
5395 repl
= UnicodeString("<$3$2$1${one}>");
5396 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5398 REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf
, resultLength
));
5400 status
= U_ZERO_ERROR
;
5401 repl
= UnicodeString("$3$2$1${one}");
5402 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5404 REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf
, resultLength
));
5406 status
= U_ZERO_ERROR
;
5407 repl
= UnicodeString("<${noSuchName}>");
5408 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5409 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5411 status
= U_ZERO_ERROR
;
5412 repl
= UnicodeString("<${invalid-name}>");
5413 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5414 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5416 status
= U_ZERO_ERROR
;
5417 repl
= UnicodeString("<${one");
5418 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5419 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5421 status
= U_ZERO_ERROR
;
5422 repl
= UnicodeString("$not a capture group");
5423 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5424 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5429 //--------------------------------------------------------------
5431 // NamedCaptureLimits Patterns with huge numbers of named capture groups.
5432 // The point is not so much what the exact limit is,
5433 // but that a largish number doesn't hit bad non-linear performance,
5434 // and that exceeding the limit fails cleanly.
5436 //--------------------------------------------------------------
// Builds a pattern out of empty named groups "(?<nnN>)" for N in [1, goodLimit),
// compiles it and looks every name up again, expecting group number N for name nnN.
// It then appends groups for N in [1, failLimit) and expects the compile to be
// rejected with U_REGEX_PATTERN_TOO_BIG.
5437 void RegexTest::NamedCaptureLimits() {
5439 logln("Skipping test. Runs in exhuastive mode only.");
5442 const int32_t goodLimit
= 1000000; // Pattern w this many groups builds successfully.
5443 const int32_t failLimit
= 10000000; // Pattern exceeds internal limits, fails to compile.
5445 UnicodeString pattern
;
// Append one empty named group per iteration.
5448 for (nn
=1; nn
<goodLimit
; nn
++) {
5449 sprintf(nnbuf
, "(?<nn%d>)", nn
);
5450 pattern
.append(UnicodeString(nnbuf
, -1, US_INV
));
5452 UErrorCode status
= U_ZERO_ERROR
;
5453 RegexPattern
*pat
= RegexPattern::compile(pattern
, 0, status
);
// Every generated name must map back to the number it was created with.
5455 for (nn
=1; nn
<goodLimit
; nn
++) {
5456 sprintf(nnbuf
, "nn%d", nn
);
5457 int32_t groupNum
= pat
->groupNumberFromName(nnbuf
, -1, status
);
5458 REGEX_ASSERT(nn
== groupNum
);
5459 if (nn
!= groupNum
) {
// Grow the pattern towards failLimit groups; compiling it is expected to fail cleanly.
5466 for (nn
=1; nn
<failLimit
; nn
++) {
5467 sprintf(nnbuf
, "(?<nn%d>)", nn
);
5468 pattern
.append(UnicodeString(nnbuf
, -1, US_INV
));
5470 status
= U_ZERO_ERROR
;
5471 pat
= RegexPattern::compile(pattern
, 0, status
);
5472 REGEX_ASSERT(status
== U_REGEX_PATTERN_TOO_BIG
);
5477 //--------------------------------------------------------------
5479 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
5481 //---------------------------------------------------------------
// Compiles pattern1 and then pattern2, matches each against the same input string,
// and expects find() to succeed with the match starting at offset 0 in both cases.
5482 void RegexTest::Bug7651() {
5483 UnicodeString
pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5484 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5485 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5486 UnicodeString
pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5487 UnicodeString
s("#ff @abcd This is test");
5488 RegexPattern
*REPattern
= NULL
;
5489 RegexMatcher
*REMatcher
= NULL
;
5490 UErrorCode status
= U_ZERO_ERROR
;
// First pass: pattern1.
5493 REPattern
= RegexPattern::compile(pattern1
, 0, pe
, status
);
5495 REMatcher
= REPattern
->matcher(s
, status
);
5497 REGEX_ASSERT(REMatcher
->find());
5498 REGEX_ASSERT(REMatcher
->start(status
) == 0);
5501 status
= U_ZERO_ERROR
;
// Second pass: pattern2, the one described above as exceeding the default stack depth.
5503 REPattern
= RegexPattern::compile(pattern2
, 0, pe
, status
);
5505 REMatcher
= REPattern
->matcher(s
, status
);
5507 REGEX_ASSERT(REMatcher
->find());
5508 REGEX_ASSERT(REMatcher
->start(status
) == 0);
5511 status
= U_ZERO_ERROR
;
// Calls RegexMatcher::group() with the status argument already holding a failure
// code. The call is expected to leave that status as it was and to return an
// empty string.
5514 void RegexTest::Bug7740() {
5515 UErrorCode status
= U_ZERO_ERROR
;
5516 UnicodeString pattern
= "(a)";
5517 UnicodeString text
= "abcdef";
5518 RegexMatcher
*m
= new RegexMatcher(pattern
, text
, 0, status
);
5520 REGEX_ASSERT(m
->lookingAt(status
));
// Deliberately enter group() with a failing status.
5522 status
= U_ILLEGAL_ARGUMENT_ERROR
;
5523 UnicodeString s
= m
->group(1, status
); // Bug 7740: segfault here.
5524 REGEX_ASSERT(status
== U_ILLEGAL_ARGUMENT_ERROR
);
5525 REGEX_ASSERT(s
== "");
5529 // Bug 8479: was crashing with a Bogus UnicodeString as input.
// Creates a matcher for "\Aboo\z" (DOTALL, case-insensitive), resets it to str,
// and expects matches() to report U_ILLEGAL_ARGUMENT_ERROR.
5531 void RegexTest::Bug8479() {
5532 UErrorCode status
= U_ZERO_ERROR
;
5534 RegexMatcher
* const pMatcher
= new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL
|UREGEX_CASE_INSENSITIVE
, status
);
5536 if (U_SUCCESS(status
))
5540 pMatcher
->reset(str
);
5541 status
= U_ZERO_ERROR
;
5542 pMatcher
->matches(status
);
5543 REGEX_ASSERT(status
== U_ILLEGAL_ARGUMENT_ERROR
);
// Splits "abc.def" with the pattern "." into an array of 10 strings and expects
// split() to report 8 fields.
5550 void RegexTest::Bug7029() {
5551 UErrorCode status
= U_ZERO_ERROR
;
5553 RegexMatcher
* const pMatcher
= new RegexMatcher(".", 0, status
);
5554 UnicodeString text
= "abc.def";
5555 UnicodeString splits
[10];
5557 int32_t numFields
= pMatcher
->split(text
, splits
, 10, status
);
5559 REGEX_ASSERT(numFields
== 8);
5564 // This test is checking for the existence of any supplemental characters that case-fold
5565 // to a bmp character.
5567 // At the time of this writing there are none. If any should appear in a subsequent release
5568 // of Unicode, the code in regular expressions compilation that determines the longest
5569 // possible match for a literal string will need to be enhanced.
5571 // See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5572 // for details on what to do in case of a failure of this test.
// Walks the set [[:CWCF:]&[\U00010000-\U0010FFFF]] by index and asserts, for each
// character c taken from it, that UnicodeString(c).foldCase() has length() >= 2.
// The body is compiled only when normalization is enabled.
5574 void RegexTest::Bug9283() {
5575 #if !UCONFIG_NO_NORMALIZATION
5576 UErrorCode status
= U_ZERO_ERROR
;
5577 UnicodeSet
supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status
);
5581 for (index
=0; ; index
++) {
5582 c
= supplementalsWithCaseFolding
.charAt(index
);
5586 UnicodeString cf
= UnicodeString(c
).foldCase();
5587 REGEX_ASSERT(cf
.length() >= 2);
5589 #endif /* #if !UCONFIG_NO_NORMALIZATION */
// Compares inv_next against INV_BUFSIZ. When inv_next has reached INV_BUFSIZ an
// error asking for a larger INV_BUFSIZ is reported; the logln() call prints the
// buffer size and the current usage.
5593 void RegexTest::CheckInvBufSize() {
5594 if(inv_next
>=INV_BUFSIZ
) {
5595 errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5596 __FILE__
, INV_BUFSIZ
, inv_next
);
5598 logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__
, INV_BUFSIZ
, inv_next
);
// Opens a regular expression and its input text from UText objects, then calls
// uregex_group() before any match has been attempted. The call is expected to set
// U_REGEX_INVALID_STATE and return a length of 0.
5603 void RegexTest::Bug10459() {
5604 UErrorCode status
= U_ZERO_ERROR
;
5605 UnicodeString
patternString("(txt)");
5606 UnicodeString
txtString("txt");
5608 UText
*utext_pat
= utext_openUnicodeString(NULL
, &patternString
, &status
);
5610 UText
*utext_txt
= utext_openUnicodeString(NULL
, &txtString
, &status
);
5613 URegularExpression
*icu_re
= uregex_openUText(utext_pat
, 0, NULL
, &status
);
5616 uregex_setUText(icu_re
, utext_txt
, &status
);
5619 // The bug was that calling uregex_group() before doing a matching operation
5620 // was causing a segfault. Only for Regular Expressions created from UText.
5621 // It should set an U_REGEX_INVALID_STATE.
5624 int32_t len
= uregex_group(icu_re
, 0, buf
, UPRV_LENGTHOF(buf
), &status
);
5625 REGEX_ASSERT(status
== U_REGEX_INVALID_STATE
);
5626 REGEX_ASSERT(len
== 0);
// Release the regular expression and both UText objects opened above.
5628 uregex_close(icu_re
);
5629 utext_close(utext_pat
);
5630 utext_close(utext_txt
);
// For every code point cp up to U+10FFFF, builds the set {cp}, closes it over
// case-insensitive matching, and for each string in the closure asks
// RegexCompile::findCaseInsensitiveStarters() for the starters of the string's
// first character. An error is reported when that starter set does not contain cp.
5633 void RegexTest::TestCaseInsensitiveStarters() {
5634 // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5635 // become stale because of new Unicode characters.
5636 // If it is stale, rerun the generation tool
5637 // svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
5638 // and replace the embedded data in i18n/regexcmp.cpp
5640 for (UChar32 cp
=0; cp
<=0x10ffff; cp
++) {
5641 if (!u_hasBinaryProperty(cp
, UCHAR_CASE_SENSITIVE
)) {
5644 UnicodeSet
s(cp
, cp
);
5645 s
.closeOver(USET_CASE_INSENSITIVE
);
5646 UnicodeSetIterator
setIter(s
);
5647 while (setIter
.next()) {
5648 if (!setIter
.isString()) {
5651 const UnicodeString
&str
= setIter
.getString();
5652 UChar32 firstChar
= str
.char32At(0);
5653 UnicodeSet starters
;
5654 RegexCompile::findCaseInsensitiveStarters(firstChar
, &starters
);
5655 if (!starters
.contains(cp
)) {
5656 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp
, firstChar
);
// Drives TestCase11049() with two patterns ("A|B|C" and "C"), each against a text
// ending in the escaped surrogate pair \ud800\udc00 (no match expected) and against
// a text whose last character matches (match expected).
5664 void RegexTest::TestBug11049() {
5665 // Original bug report: pattern with match start consisting of one of several individual characters,
5666 // and the text being matched ending with a supplementary character. find() would read past the
5667 // end of the input text when searching for potential match starting points.
5669 // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5670 // detect the bad read.
5672 TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE
, __LINE__
);
5673 TestCase11049("A|B|C", "string matches at end C", TRUE
, __LINE__
);
5675 // Test again with a pattern starting with a single character,
5676 // which takes a different code path than starting with an OR expression,
5677 // but with similar logic.
5678 TestCase11049("C", "a string \\ud800\\udc00", FALSE
, __LINE__
);
5679 TestCase11049("C", "string matches at end C", TRUE
, __LINE__
);
5682 // Run a single test case from TestBug11049(). Internal function.
//   pattern      Regular expression source; unescape() is applied before compiling.
//   data         Input text; unescape() is applied before matching.
//   expectMatch  Expected result of find() on the input.
//   lineNumber   Caller's line number, included in failure messages.
// The input is matched twice: once as UTF-16 held in a buffer of exactly
// dataString.length() UChars, and once converted to UTF-8.
5683 void RegexTest::TestCase11049(const char *pattern
, const char *data
, UBool expectMatch
, int32_t lineNumber
) {
5684 UErrorCode status
= U_ZERO_ERROR
;
5685 UnicodeString patternString
= UnicodeString(pattern
).unescape();
5686 LocalPointer
<RegexPattern
> compiledPat(RegexPattern::compile(patternString
, 0, status
));
5688 UnicodeString dataString
= UnicodeString(data
).unescape();
// Buffer sized to exactly the text length, with no room for a terminator.
5689 UChar
*exactBuffer
= new UChar
[dataString
.length()];
5690 dataString
.extract(exactBuffer
, dataString
.length(), status
);
5691 UText
*ut
= utext_openUChars(NULL
, exactBuffer
, dataString
.length(), &status
);
5693 LocalPointer
<RegexMatcher
> matcher(compiledPat
->matcher(status
));
5696 UBool result
= matcher
->find();
5697 if (result
!= expectMatch
) {
5698 errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5699 __FILE__
, lineNumber
, expectMatch
, result
, pattern
, data
);
5702 // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5703 // off-by-one on find() with match at the last code point.
5704 // The original char * data (invariant charset) is at least as long as the equivalent UTF-8
5705 // because string.unescape() will only shrink it.
5706 char * utf8Buffer
= new char[uprv_strlen(data
)+1];
5707 u_strToUTF8(utf8Buffer
, static_cast<int32_t>(uprv_strlen(data
)+1), NULL
, dataString
.getBuffer(), dataString
.length(), &status
);
// The existing UText is passed back in as the first argument of utext_openUTF8().
5709 ut
= utext_openUTF8(ut
, utf8Buffer
, -1, &status
);
5712 result
= matcher
->find();
5713 if (result
!= expectMatch
) {
5714 errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5715 __FILE__
, lineNumber
, expectMatch
, result
, pattern
, data
);
5717 delete [] utf8Buffer
;
5720 delete [] exactBuffer
;
// Compiles three oversized patterns and expects RegexPattern::compile() to report
// U_REGEX_PATTERN_TOO_BIG for each of them:
//   1. 8000000 repetitions of "()".
//   2. "(" followed by 20000000 repetitions of "A++" and then "){0}B++".
//   3. literal text appended until the pattern is at least 0x00ffffff long.
5724 void RegexTest::TestBug11371() {
5726 logln("Skipping test. Runs in exhuastive mode only.");
5729 UErrorCode status
= U_ZERO_ERROR
;
5730 UnicodeString patternString
;
5732 for (int i
=0; i
<8000000; i
++) {
5733 patternString
.append(UnicodeString("()"));
5735 LocalPointer
<RegexPattern
> compiledPat(RegexPattern::compile(patternString
, 0, status
));
5736 if (status
!= U_REGEX_PATTERN_TOO_BIG
) {
5737 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5738 __FILE__
, __LINE__
, u_errorName(status
));
5741 status
= U_ZERO_ERROR
;
5742 patternString
= "(";
5743 for (int i
=0; i
<20000000; i
++) {
5744 patternString
.append(UnicodeString("A++"));
5746 patternString
.append(UnicodeString("){0}B++"));
5747 LocalPointer
<RegexPattern
> compiledPat2(RegexPattern::compile(patternString
, 0, status
));
5748 if (status
!= U_REGEX_PATTERN_TOO_BIG
) {
5749 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5750 __FILE__
, __LINE__
, u_errorName(status
));
5753 // Pattern with too much string data, such that string indexes overflow operand data field size
5754 // in compiled instruction.
5755 status
= U_ZERO_ERROR
;
5757 while (patternString
.length() < 0x00ffffff) {
5758 patternString
.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5760 patternString
.append(UnicodeString("X? trailing string"));
5761 LocalPointer
<RegexPattern
> compiledPat3(RegexPattern::compile(patternString
, 0, status
));
5762 if (status
!= U_REGEX_PATTERN_TOO_BIG
) {
5763 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5764 __FILE__
, __LINE__
, u_errorName(status
));
// Matches "(A)|(B)" against "A" and checks what is reported for group 2, which
// does not take part in the match: first through uregex_group() in the C API,
// then through RegexMatcher::group() with a UText destination.
5768 void RegexTest::TestBug11480() {
5769 // C API, get capture group of a group that does not participate in the match.
5770 // (Returns a zero length string, with nul termination,
5771 // indistinguishable from a group with a zero length match.)
5773 UErrorCode status
= U_ZERO_ERROR
;
5774 URegularExpression
*re
= uregex_openC("(A)|(B)", 0, NULL
, &status
);
5776 UnicodeString text
= UNICODE_STRING_SIMPLE("A");
5777 uregex_setText(re
, text
.getBuffer(), text
.length(), &status
);
5779 REGEX_ASSERT(uregex_lookingAt(re
, 0, &status
));
// The first four elements hold the sentinel value 13. The group is written at
// buf+1, so the checks below verify that only buf[1] was overwritten (with a nul).
5780 UChar buf
[10] = {(UChar
)13, (UChar
)13, (UChar
)13, (UChar
)13};
5781 int32_t length
= uregex_group(re
, 2, buf
+1, UPRV_LENGTHOF(buf
)-1, &status
);
5782 REGEX_ASSERT(length
== 0);
5783 REGEX_ASSERT(buf
[0] == 13);
5784 REGEX_ASSERT(buf
[1] == 0);
5785 REGEX_ASSERT(buf
[2] == 13);
5788 // UText C++ API, length of match is 0 for non-participating matches.
5789 UText ut
= UTEXT_INITIALIZER
;
5790 utext_openUnicodeString(&ut
, &text
, &status
);
5791 RegexMatcher
matcher(UnicodeString("(A)|(B)"), 0, status
);
5794 REGEX_ASSERT(matcher
.lookingAt(0, status
));
5796 // UText C++ API, Capture group 1 matches "A", position 0, length 1.
// groupLen starts at a value group() would not produce, so the assertions below
// show that it was actually written.
5797 int64_t groupLen
= -666;
5798 UText group
= UTEXT_INITIALIZER
;
5799 matcher
.group(1, &group
, groupLen
, status
);
5801 REGEX_ASSERT(groupLen
== 1);
5802 REGEX_ASSERT(utext_getNativeIndex(&group
) == 0);
5804 // Capture group 2, the (B), does not participate in the match.
5805 matcher
.group(2, &group
, groupLen
, status
);
5807 REGEX_ASSERT(groupLen
== 0);
5808 REGEX_ASSERT(matcher
.start(2, status
) == -1);
// Applies setTimeLimit(5, ...) to matchers for deeply nested counted loops around
// an empty group and expects the status U_REGEX_TIME_OUT. Covered variants: the
// greedy pattern, the non-greedy pattern, and both matchers reset onto a UText
// opened over UTF-8 text.
5812 void RegexTest::TestBug12884() {
5813 // setTimeLimit() was not effective for empty sub-patterns with large {minimum counts}
5814 UnicodeString
pattern(u
"(((((((){120}){11}){11}){11}){80}){11}){4}");
5815 UnicodeString
text(u
"hello");
5816 UErrorCode status
= U_ZERO_ERROR
;
5817 RegexMatcher
m(pattern
, text
, 0, status
);
5819 m
.setTimeLimit(5, status
);
5821 REGEX_ASSERT(status
== U_REGEX_TIME_OUT
);
5823 // Non-greedy loops. They take a different code path during matching.
5824 UnicodeString
ngPattern(u
"(((((((){120}?){11}?){11}?){11}?){80}?){11}?){4}?");
5825 status
= U_ZERO_ERROR
;
5826 RegexMatcher
ngM(ngPattern
, text
, 0, status
);
5828 ngM
.setTimeLimit(5, status
);
5830 REGEX_ASSERT(status
== U_REGEX_TIME_OUT
);
5832 // UText, wrapping non-UTF-16 text, also takes a different execution path.
5833 const char *text8
= reinterpret_cast<const char*>(u8
"¿Qué es Unicode? Unicode proporciona un número único para cada"
5834 "carácter, sin importar la plataforma, sin importar el programa,"
5835 "sin importar el idioma.");
5836 status
= U_ZERO_ERROR
;
5837 LocalUTextPointer
ut(utext_openUTF8(NULL
, text8
, -1, &status
));
// Both matchers are reset onto the same UTF-8 backed UText.
5839 m
.reset(ut
.getAlias());
5841 REGEX_ASSERT(status
== U_REGEX_TIME_OUT
);
5843 status
= U_ZERO_ERROR
;
5844 ngM
.reset(ut
.getAlias());
5846 REGEX_ASSERT(status
== U_REGEX_TIME_OUT
);
5849 // Bug 13631. A find() of a pattern with a zero-length look-behind assertion
5850 // can cause a read past the end of the input text.
5851 // The failure is seen when running this test with Clang's Address Sanitizer.
// For each entry of the pats array (iteration stops at a null entry), builds a
// matcher for that pattern and calls find() repeatedly on a UText that wraps a
// single UChar 'a'.
5853 void RegexTest::TestBug13631() {
5854 const UChar
*pats
[] = { u
"(?<!^)",
5858 for (const UChar
**pat
=pats
; *pat
; ++pat
) {
5859 UErrorCode status
= U_ZERO_ERROR
;
5860 UnicodeString
upat(*pat
);
5861 RegexMatcher
matcher(upat
, 0, status
);
// Input of exactly one code unit, opened with an explicit length of 1.
5862 const UChar s
=u
'a';
5863 UText
*ut
= utext_openUChars(nullptr, &s
, 1, &status
);
5866 while (matcher
.find()) {
5872 // Bug 13632 Out of bounds memory reference if a replacement string ends with a '$',
5873 // where a following group specification would be expected.
5874 // Failure shows when running the test under Clang's Address Sanitizer.
// Calls uregex_replaceAll() with the two-element replacement { 'x', '$' }, passed
// with an explicit length of 2, and expects the status
// U_REGEX_INVALID_CAPTURE_GROUP_NAME.
5876 void RegexTest::TestBug13632() {
5877 UErrorCode status
= U_ZERO_ERROR
;
5878 URegularExpression
*re
= uregex_openC(" ", 0, nullptr, &status
);
5879 const char16_t *sourceString
= u
"Hello, world.";
5880 uregex_setText(re
, sourceString
, u_strlen(sourceString
), &status
);
5882 const int32_t destCap
= 20;
5883 char16_t dest
[destCap
] = {};
5884 const char16_t replacement
[] = {u
'x', u
'$'}; // Not nul terminated string.
5885 uregex_replaceAll(re
, replacement
, 2, dest
, destCap
, &status
);
5887 assertEquals("", U_REGEX_INVALID_CAPTURE_GROUP_NAME
, status
);
// Builds a pattern containing 50000 adjacent "\Q\E" pairs, opens it with
// uregex_open(), and then checks that uregex_find() on "abcxyz" succeeds with a
// match start of 3.
5891 void RegexTest::TestBug20359() {
5892 // The bug was stack overflow while parsing a pattern with a huge number of adjacent \Q\E
5893 // pairs. (Enter and exit pattern literal quote mode). Logic was correct.
5894 // Changed implementation to loop instead of recursing.
5896 UnicodeString pattern
;
5897 for (int i
=0; i
<50000; ++i
) {
5898 pattern
+= u
"\\Q\\E";
5902 UErrorCode status
= U_ZERO_ERROR
;
5903 LocalURegularExpressionPointer
re(uregex_open(pattern
.getBuffer(), pattern
.length(),
5904 0, nullptr, &status
));
5905 assertSuccess(WHERE
, status
);
5907 // We have passed the point where the bug crashed. The following is a small sanity
5908 // check that the pattern works, that all the \Q\E\Q\E... didn't cause other problems.
5910 uregex_setText(re
.getAlias(), u
"abcxyz", -1, &status
);
5911 assertSuccess(WHERE
, status
);
5912 assertTrue(WHERE
, uregex_find(re
.getAlias(), 0, &status
));
5913 assertEquals(WHERE
, 3, uregex_start(re
.getAlias(), 0, &status
));
5914 assertSuccess(WHERE
, status
);
// Part one: generates GROUP_COUNT group names ("name" followed by the index),
// compiles a pattern with one "(?<name>.)" group per name, and checks that the
// name at index i resolves to group number i+1.
// Part two: three scenarios, each compiling a pair of patterns that first compare
// unequal and are then expected to compare equal, followed by name lookups on both.
5918 void RegexTest::TestBug20863() {
5919 // Test that patterns with a large number of named capture groups work correctly.
5921 // The ticket was not for a bug per se, but to reduce memory usage by using lazy
5922 // construction of the map from capture names to numbers, and decreasing the
5923 // default size of the map.
5925 constexpr int GROUP_COUNT
= 2000;
5926 std::vector
<UnicodeString
> groupNames
;
5927 for (int32_t i
=0; i
<GROUP_COUNT
; ++i
) {
5929 name
.append(u
"name");
5930 name
.append(Int64ToUnicodeString(i
));
5931 groupNames
.push_back(name
);
5934 UnicodeString patternString
;
5935 for (UnicodeString name
: groupNames
) {
5936 patternString
.append(u
"(?<");
5937 patternString
.append(name
);
5938 patternString
.append(u
">.)");
5941 UErrorCode status
= U_ZERO_ERROR
;
5943 LocalPointer
<RegexPattern
> pattern(RegexPattern::compile(patternString
, pe
, status
), status
);
5944 if (!assertSuccess(WHERE
, status
)) {
5948 for (int32_t i
=0; i
<GROUP_COUNT
; ++i
) {
5949 int32_t group
= pattern
->groupNumberFromName(groupNames
[i
], status
);
5950 if (!assertSuccess(WHERE
, status
)) {
5953 assertEquals(WHERE
, i
+1, group
);
5954 // Note: group 0 is the overall match; group 1 is the first separate capture group.
5957 // Verify that assignment of patterns with various combinations of named capture work.
5958 // Lazy creation of the internal named capture map changed the implementation logic here.
// Scenario 1: afterwards "name" is expected to resolve to group 1 in both patterns.
5960 LocalPointer
<RegexPattern
> pat1(RegexPattern::compile(u
"abc", pe
, status
), status
);
5961 LocalPointer
<RegexPattern
> pat2(RegexPattern::compile(u
"a(?<name>b)c", pe
, status
), status
);
5962 assertSuccess(WHERE
, status
);
5963 assertFalse(WHERE
, *pat1
== *pat2
);
5965 assertTrue(WHERE
, *pat1
== *pat2
);
5966 assertEquals(WHERE
, 1, pat1
->groupNumberFromName(u
"name", status
));
5967 assertEquals(WHERE
, 1, pat2
->groupNumberFromName(u
"name", status
));
5968 assertSuccess(WHERE
, status
);
// Scenario 2: afterwards "name" is expected to be unknown to both patterns.
5972 LocalPointer
<RegexPattern
> pat1(RegexPattern::compile(u
"abc", pe
, status
), status
);
5973 LocalPointer
<RegexPattern
> pat2(RegexPattern::compile(u
"a(?<name>b)c", pe
, status
), status
);
5974 assertSuccess(WHERE
, status
);
5975 assertFalse(WHERE
, *pat1
== *pat2
);
5977 assertTrue(WHERE
, *pat1
== *pat2
);
5978 assertEquals(WHERE
, 0, pat1
->groupNumberFromName(u
"name", status
));
5979 assertEquals(WHERE
, U_REGEX_INVALID_CAPTURE_GROUP_NAME
, status
);
5980 status
= U_ZERO_ERROR
;
5981 assertEquals(WHERE
, 0, pat2
->groupNumberFromName(u
"name", status
));
5982 assertEquals(WHERE
, U_REGEX_INVALID_CAPTURE_GROUP_NAME
, status
);
5983 status
= U_ZERO_ERROR
;
// Scenario 3: afterwards both patterns are expected to know "name1" (group 1)
// and to reject "name2".
5987 LocalPointer
<RegexPattern
> pat1(RegexPattern::compile(u
"a(?<name1>b)c", pe
, status
), status
);
5988 LocalPointer
<RegexPattern
> pat2(RegexPattern::compile(u
"a(?<name2>b)c", pe
, status
), status
);
5989 assertSuccess(WHERE
, status
);
5990 assertFalse(WHERE
, *pat1
== *pat2
);
5992 assertTrue(WHERE
, *pat1
== *pat2
);
5993 assertEquals(WHERE
, 1, pat1
->groupNumberFromName(u
"name1", status
));
5994 assertSuccess(WHERE
, status
);
5995 assertEquals(WHERE
, 1, pat2
->groupNumberFromName(u
"name1", status
));
5996 assertSuccess(WHERE
, status
);
5997 assertEquals(WHERE
, 0, pat1
->groupNumberFromName(u
"name2", status
));
5998 assertEquals(WHERE
, U_REGEX_INVALID_CAPTURE_GROUP_NAME
, status
);
5999 status
= U_ZERO_ERROR
;
6000 assertEquals(WHERE
, 0, pat2
->groupNumberFromName(u
"name2", status
));
6001 assertEquals(WHERE
, U_REGEX_INVALID_CAPTURE_GROUP_NAME
, status
);
6002 status
= U_ZERO_ERROR
;
6008 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */