1 /********************************************************************
3 * Copyright (c) 2002-2015, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
10 // ICU Regular Expressions test, part of intltest.
16 PLEASE be careful about ASCII assumptions in this test.
17 This test is one of the worst repeat offenders.
18 If you have questions, contact someone on the ICU PMC
19 who has access to an EBCDIC system.
24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
26 #include "unicode/localpointer.h"
27 #include "unicode/regex.h"
28 #include "unicode/uchar.h"
29 #include "unicode/ucnv.h"
30 #include "unicode/uniset.h"
31 #include "unicode/uregex.h"
32 #include "unicode/usetiter.h"
33 #include "unicode/ustring.h"
45 #define SUPPORT_MUTATING_INPUT_STRING 0
47 //---------------------------------------------------------------------------
49 // Test class boilerplate
51 //---------------------------------------------------------------------------
// Constructor for the RegexTest suite class.
52 RegexTest::RegexTest()
// Destructor for the RegexTest suite class.
57 RegexTest::~RegexTest()
// Test dispatcher for the RegexTest suite.
//   index - selects the test; each case label below corresponds to one test.
//   exec  - when true, the suite banner is logged and the selected test method is invoked.
//   name  - out-parameter, set to the display name of the selected test.
//   The trailing char* parameter is unused (its name is commented out).
63 void RegexTest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* /*par*/ )
65 if (exec
) logln("TestSuite RegexTest: ");
// Each case assigns the reported test name and, when exec is set, runs the matching test method.
68 case 0: name
= "Basic";
71 case 1: name
= "API_Match";
72 if (exec
) API_Match();
74 case 2: name
= "API_Replace";
75 if (exec
) API_Replace();
77 case 3: name
= "API_Pattern";
78 if (exec
) API_Pattern();
// File-I/O dependent entries are only compiled when UCONFIG_NO_FILE_IO is not set.
81 #if !UCONFIG_NO_FILE_IO
88 case 5: name
= "Errors";
91 case 6: name
= "PerlTests";
92 if (exec
) PerlTests();
94 case 7: name
= "Callbacks";
95 if (exec
) Callbacks();
97 case 8: name
= "FindProgressCallbacks";
98 if (exec
) FindProgressCallbacks();
100 case 9: name
= "Bug 6149";
103 case 10: name
= "UTextBasic";
104 if (exec
) UTextBasic();
106 case 11: name
= "API_Match_UTF8";
107 if (exec
) API_Match_UTF8();
109 case 12: name
= "API_Replace_UTF8";
110 if (exec
) API_Replace_UTF8();
112 case 13: name
= "API_Pattern_UTF8";
113 if (exec
) API_Pattern_UTF8();
115 case 14: name
= "PerlTestsUTF8";
116 if (exec
) PerlTestsUTF8();
118 case 15: name
= "PreAllocatedUTextCAPI";
119 if (exec
) PreAllocatedUTextCAPI();
121 case 16: name
= "Bug 7651";
124 case 17: name
= "Bug 7740";
127 case 18: name
= "Bug 8479";
130 case 19: name
= "Bug 7029";
133 case 20: name
= "CheckInvBufSize";
134 if (exec
) CheckInvBufSize();
136 case 21: name
= "Bug 9283";
139 case 22: name
= "Bug10459";
140 if (exec
) Bug10459();
142 case 23: name
= "TestCaseInsensitiveStarters";
143 if (exec
) TestCaseInsensitiveStarters();
145 case 24: name
= "TestBug11049";
146 if (exec
) TestBug11049();
148 case 25: name
= "TestBug11371";
149 if (exec
) TestBug11371();
151 case 26: name
= "TestBug11480";
152 if (exec
) TestBug11480();
154 case 27: name
= "NamedCapture";
155 if (exec
) NamedCapture();
157 case 28: name
= "NamedCaptureLimits";
158 if (exec
) NamedCaptureLimits();
161 break; //needed to end loop
168 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
170 * @see utext_openUTF8
// Forward declaration; the definition appears further down in this file, after the
// assertion helpers that call it.
172 static UText
* regextst_openUTF8FromInvariant(UText
* ut
, const char *inv
, int64_t length
, UErrorCode
*status
);
174 //---------------------------------------------------------------------------
176 // Error Checking / Reporting macros used in all of the tests.
178 //---------------------------------------------------------------------------
// Renders the contents of a UText into a caller-supplied char buffer for use in
// diagnostic messages.
//   buf    - destination buffer.
//   bufLen - size of buf, in chars.
//   text   - UText to render. Its native index is saved on entry and restored on exit.
180 static void utextToPrintable(char *buf
, int32_t bufLen
, UText
*text
) {
// Remember the caller's iteration position, then start reading from the beginning.
181 int64_t oldIndex
= utext_getNativeIndex(text
);
182 utext_setNativeIndex(text
, 0);
184 UChar32 c
= utext_next32From(text
, 0);
// Iterate code points until the end of text or until the write pointer reaches the end of buf.
185 while ((c
!= U_SENTINEL
) && (bufPtr
< buf
+bufLen
)) {
// Code points in the range 0x20 <= c < 0x7e are tested for here; the sprintf below
// produces a "U+XXXX" form (presumably for the remaining code points - confirm).
186 if (0x000020<=c
&& c
<0x00007e) {
// NOTE(review): the loop guard only guarantees one free char, while "U+%04X" writes at
// least six chars plus a terminator - looks like this can overrun buf near its end; confirm.
190 sprintf(bufPtr
,"U+%04X", c
);
191 bufPtr
+= strlen(bufPtr
)-1;
197 c
= UTEXT_NEXT32(text
);
// EBCDIC builds only: pass the buffer contents through uprv_eastrncpy into a temporary
// allocation and copy the result back into buf.
// NOTE(review): the malloc result is not checked on the lines visible here - confirm.
200 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
201 char *ebuf
= (char*)malloc(bufLen
);
202 uprv_eastrncpy((unsigned char*)ebuf
, (const unsigned char*)buf
, bufLen
);
203 uprv_strncpy(buf
, ebuf
, bufLen
);
// Restore the caller's iteration position.
206 utext_setNativeIndex(text
, oldIndex
);
// File-static 1024-char scratch buffer written by RegexTest::extractToAssertBuf().
// Being a single shared buffer, its contents are overwritten on each use.
210 static char ASSERT_BUF
[1024];
// Converts a UnicodeString into a char string stored in the shared ASSERT_BUF so that it
// can be passed to printf-style failure messages.
//   message - the string to render.
// Placeholder text is stored for an empty input and for an empty prettify() result.
212 const char* RegexTest::extractToAssertBuf(const UnicodeString
& message
) {
213 if(message
.length()==0) {
214 strcpy(ASSERT_BUF
, "[[empty UnicodeString]]");
// Non-empty input: run it through IntlTest::prettify() first.
217 IntlTest::prettify(message
,buf
);
218 if(buf
.length()==0) {
219 strcpy(ASSERT_BUF
, "[[escape() returned 0 chars]]");
// Extract into ASSERT_BUF, leaving room for the terminator.
221 buf
.extract(0, 0x7FFFFFFF, ASSERT_BUF
, sizeof(ASSERT_BUF
)-1);
// If extraction produced nothing, fall back to appending an escaped form char by char.
222 if(ASSERT_BUF
[0]==0) {
224 for(int32_t i
=0;i
<buf
.length();i
++) {
// NOTE(review): this append has no visible bound against sizeof(ASSERT_BUF) - confirm
// that long strings cannot overrun the buffer.
226 sprintf(ASSERT_BUF
+strlen(ASSERT_BUF
),"\\u%02x",ch
);
// Force termination at the last element of the buffer.
231 ASSERT_BUF
[sizeof(ASSERT_BUF
)-1] = 0;
// Logs the printable form of a UText (via utextToPrintable into a 200-char local buffer),
// together with the file, line and the spelling of the macro argument.
235 #define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}
// Checks the in-scope variable `status`; on failure reports via dataerrln and returns
// from the enclosing (void) function.
237 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure. status=%s", \
238 __FILE__, __LINE__, u_errorName(status)); return;}}
// Reports a failure via errln when expr evaluates to FALSE. Does not return, so the
// enclosing test keeps running after a failed assertion.
240 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}
// Evaluates expr against a fresh block-local `status` (which hides any outer variable of
// that name within expr) and reports via dataerrln if the resulting status is not errcode.
242 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
243 if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \
244 __LINE__, u_errorName(errcode), u_errorName(status));};}
// Variant of the status check that also prints a caller-supplied line number.
// Unlike REGEX_CHECK_STATUS it does not return, and it prints status as an integer.
246 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
247 "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }}
// Variant of the assertion that also prints a caller-supplied line number and returns
// from the enclosing (void) function on failure.
249 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
250 errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
// Compares an invariant-character C string against a UnicodeString and reports a mismatch
// via errln; the actual value is rendered with extractToAssertBuf().
252 // expected: const char * , restricted to invariant characters.
253 // actual: const UnicodeString &
254 #define REGEX_ASSERT_UNISTR(expected, actual) { \
255 if (UnicodeString(expected, -1, US_INV) != (actual)) { \
256 errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \
257 __FILE__, __LINE__, expected, extractToAssertBuf(actual));};}
// Compares the contents of two UTexts, reading one code point at a time from each.
//   uta, utb - the texts to compare. Both have their native index reset to 0, so any
//              previous iteration position in either text is lost.
260 static UBool
testUTextEqual(UText
*uta
, UText
*utb
) {
263 utext_setNativeIndex(uta
, 0);
264 utext_setNativeIndex(utb
, 0);
// Fetch the next code point from each text in lock-step.
266 ca
= utext_next32(uta
);
267 cb
= utext_next32(utb
);
// Loop ends once the first text is exhausted (utext_next32 returned U_SENTINEL).
271 } while (ca
!= U_SENTINEL
);
277 * @param expected expected text in UTF-8 (not platform) codepage
// Asserts that a UText's contents equal an expected UTF-8 string, reporting a detailed
// failure (both values rendered printable, plus native lengths) via errln on mismatch.
//   expected - expected text, passed directly to utext_openUTF8 (NUL-terminated).
//   actual   - text under test; its native index is reset to 0 before comparing.
//   file, line - location of the calling assertion, used in the failure messages.
279 void RegexTest::assertUText(const char *expected
, UText
*actual
, const char *file
, int line
) {
280 UErrorCode status
= U_ZERO_ERROR
;
// Wrap the expected bytes in a stack-allocated UText.
281 UText expectedText
= UTEXT_INITIALIZER
;
282 utext_openUTF8(&expectedText
, expected
, -1, &status
);
283 if(U_FAILURE(status
)) {
// NOTE(review): strlen() yields size_t but is formatted with %d here - confirm/cast.
284 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file
, line
, u_errorName(status
), strlen(expected
));
// Sanity check: a non-empty expected string must not produce an empty UText.
287 if(utext_nativeLength(&expectedText
)==0 && (strlen(expected
)!=0)) {
// NOTE(review): same %d / size_t mismatch as above.
288 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file
, line
, strlen(expected
));
291 utext_setNativeIndex(actual
, 0);
292 if (!testUTextEqual(&expectedText
, actual
)) {
// Mismatch: render both texts into local buffers for the failure message.
293 char buf
[201 /*21*/];
294 char expectedBuf
[201];
295 utextToPrintable(buf
, sizeof(buf
)/sizeof(buf
[0]), actual
);
296 utextToPrintable(expectedBuf
, sizeof(expectedBuf
)/sizeof(expectedBuf
[0]), &expectedText
);
297 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file
, line
, expectedBuf
, (int)utext_nativeLength(&expectedText
), buf
, (int)utext_nativeLength(actual
));
// Release the UText opened over the expected string.
299 utext_close(&expectedText
);
302 * @param expected invariant (platform local text) input
// Same check as assertUText, except that the expected string is opened through
// regextst_openUTF8FromInvariant() instead of being passed straight to utext_openUTF8.
//   expected - expected text as an invariant-character C string.
//   actual   - text under test; its native index is reset to 0 before comparing.
//   file, line - location of the calling assertion, used in the failure messages.
305 void RegexTest::assertUTextInvariant(const char *expected
, UText
*actual
, const char *file
, int line
) {
306 UErrorCode status
= U_ZERO_ERROR
;
307 UText expectedText
= UTEXT_INITIALIZER
;
308 regextst_openUTF8FromInvariant(&expectedText
, expected
, -1, &status
);
309 if(U_FAILURE(status
)) {
// NOTE(review): strlen() yields size_t but is formatted with %d here - confirm/cast.
310 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file
, line
, u_errorName(status
), strlen(expected
));
313 utext_setNativeIndex(actual
, 0);
314 if (!testUTextEqual(&expectedText
, actual
)) {
// Mismatch: render both texts into local buffers for the failure message.
315 char buf
[201 /*21*/];
316 char expectedBuf
[201];
317 utextToPrintable(buf
, sizeof(buf
)/sizeof(buf
[0]), actual
);
318 utextToPrintable(expectedBuf
, sizeof(expectedBuf
)/sizeof(expectedBuf
[0]), &expectedText
);
319 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file
, line
, expectedBuf
, (int)utext_nativeLength(&expectedText
), buf
, (int)utext_nativeLength(actual
));
// Release the UText opened over the expected string.
321 utext_close(&expectedText
);
325 * Assumes utf-8 input
// Forwards to assertUText(), supplying the call site's __FILE__ and __LINE__.
327 #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
329 * Assumes Invariant input
// Forwards to assertUTextInvariant(), supplying the call site's __FILE__ and __LINE__.
331 #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
334 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
335 * passed into utext_openUTF8. An error will be given if
336 * INV_BUFSIZ is too small. It's only used on EBCDIC systems.
// Capacity, in chars, of the inv_buf conversion buffer declared below.
339 #define INV_BUFSIZ 2048 /* increase this if too small */
// Offset into inv_buf at which regextst_openUTF8FromInvariant() places its next string.
341 static int64_t inv_next
=0;
// The buffer itself only exists on non-ASCII charset families.
343 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
344 static char inv_buf
[INV_BUFSIZ
];
// Opens a UTF-8 UText over an invariant-character C string.
//   ut     - UText to (re)use, passed through to utext_openUTF8.
//   inv    - the invariant-character input.
//   length - length of inv, or -1 to have it computed with strlen().
//   status - ICU error code; set to U_MEMORY_ALLOCATION_ERROR when inv_buf is too small.
// On ASCII-family platforms the input is used in place. Otherwise it is copied through
// uprv_aestrncpy into the static inv_buf and the UText is opened over that copy.
347 static UText
* regextst_openUTF8FromInvariant(UText
*ut
, const char *inv
, int64_t length
, UErrorCode
*status
) {
348 if(length
==-1) length
=strlen(inv
);
349 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
// ASCII family: no conversion needed.
351 return utext_openUTF8(ut
, inv
, length
, status
);
// Non-ASCII family: make sure the string (plus one extra char) still fits in inv_buf.
// NOTE(review): the fprintf calls below format int64_t values (inv_next, length) with %d -
// looks like a format/argument mismatch; confirm.
353 if(inv_next
+length
+1>INV_BUFSIZ
) {
354 fprintf(stderr
, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
355 __FILE__
, __LINE__
, INV_BUFSIZ
, (inv_next
+length
+1));
356 *status
= U_MEMORY_ALLOCATION_ERROR
;
// Copy the input into the next free region of inv_buf.
360 unsigned char *buf
= (unsigned char*)inv_buf
+inv_next
;
361 uprv_aestrncpy(buf
, (const uint8_t*)inv
, length
);
365 fprintf(stderr
, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ
, inv_next
);
// Open the UText over the copy held in inv_buf.
368 return utext_openUTF8(ut
, (const char*)buf
, length
, status
);
373 //---------------------------------------------------------------------------
375 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests
376 // for the LookingAt() and Match() functions.
379 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
381 // The expected results are UBool - TRUE or FALSE.
382 // The input text is unescaped. The pattern is not.
385 //---------------------------------------------------------------------------
// Runs the same pattern/text pair through both doRegexLMTest() (UnicodeString input) and
// doRegexLMTestUTF8() (UTF-8 UText input), passing the call site's __LINE__ to each.
387 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
// Worker for REGEX_TESTLM, UnicodeString flavor: compiles a pattern, applies it to the
// unescaped input text, and checks lookingAt() and matches() against expected results.
//   pat     - pattern, invariant characters; not unescaped.
//   text    - input text, invariant characters; unescape() is applied before matching.
//   looking - expected result of lookingAt().
//   match   - expected result of matches().
//   line    - source line of the invoking macro, used in failure messages.
389 UBool
RegexTest::doRegexLMTest(const char *pat
, const char *text
, UBool looking
, UBool match
, int32_t line
) {
390 const UnicodeString
pattern(pat
, -1, US_INV
);
391 const UnicodeString
inputText(text
, -1, US_INV
);
392 UErrorCode status
= U_ZERO_ERROR
;
394 RegexPattern
*REPattern
= NULL
;
395 RegexMatcher
*REMatcher
= NULL
;
// Compile the pattern with no flags; a compile failure is reported via dataerrln.
398 UnicodeString
patString(pat
, -1, US_INV
);
399 REPattern
= RegexPattern::compile(patString
, 0, pe
, status
);
400 if (U_FAILURE(status
)) {
401 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
402 line
, u_errorName(status
));
// NOTE(review): dumps the compiled pattern only for a call from source line 376 - looks
// like a leftover debugging hook tied to a hard-coded line number; confirm.
405 if (line
==376) { REPattern
->dumpPattern();}
// Build the matcher over the unescaped input.
407 UnicodeString
inputString(inputText
);
408 UnicodeString unEscapedInput
= inputString
.unescape();
409 REMatcher
= REPattern
->matcher(unEscapedInput
, status
);
410 if (U_FAILURE(status
)) {
411 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
412 line
, u_errorName(status
));
// Check lookingAt(): first for an error status, then against the expected value.
417 actualmatch
= REMatcher
->lookingAt(status
);
418 if (U_FAILURE(status
)) {
419 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
420 line
, u_errorName(status
));
423 if (actualmatch
!= looking
) {
424 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line
);
// Check matches() the same way, starting from a clean status.
428 status
= U_ZERO_ERROR
;
429 actualmatch
= REMatcher
->matches(status
);
430 if (U_FAILURE(status
)) {
431 errln("RegexTest failure in matches() at line %d. Status = %s\n",
432 line
, u_errorName(status
));
435 if (actualmatch
!= match
) {
436 errln("RegexTest: wrong return from matches() at line %d.\n", line
);
// When the overall result flag is FALSE, dump the compiled pattern to aid diagnosis.
440 if (retVal
== FALSE
) {
441 REPattern
->dumpPattern();
// Worker for REGEX_TESTLM, UTF-8 UText flavor: same checks as doRegexLMTest, but the
// pattern and the input are both supplied to the regex engine as UTF-8 UTexts.
//   pat     - pattern, invariant characters; not unescaped.
//   text    - input text, invariant characters; unescape() is applied before conversion.
//   looking - expected result of lookingAt().
//   match   - expected result of matches().
//   line    - source line of the invoking macro, used in failure messages.
// Returns TRUE early (not counted as a failure) when the input cannot be converted to UTF-8.
450 UBool
RegexTest::doRegexLMTestUTF8(const char *pat
, const char *text
, UBool looking
, UBool match
, int32_t line
) {
451 UText pattern
= UTEXT_INITIALIZER
;
452 int32_t inputUTF8Length
;
453 char *textChars
= NULL
;
454 UText inputText
= UTEXT_INITIALIZER
;
455 UErrorCode status
= U_ZERO_ERROR
;
457 RegexPattern
*REPattern
= NULL
;
458 RegexMatcher
*REMatcher
= NULL
;
// Compile the pattern from a UTF-8 UText; a compile failure is reported via dataerrln.
461 regextst_openUTF8FromInvariant(&pattern
, pat
, -1, &status
);
462 REPattern
= RegexPattern::compile(&pattern
, 0, pe
, status
);
463 if (U_FAILURE(status
)) {
464 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
465 line
, u_errorName(status
));
// Unescape the input, then convert it to UTF-8 with a converter configured to stop
// (rather than substitute) on unconvertible input.
469 UnicodeString
inputString(text
, -1, US_INV
);
470 UnicodeString unEscapedInput
= inputString
.unescape();
471 LocalUConverterPointer
UTF8Converter(ucnv_open("UTF8", &status
));
472 ucnv_setFromUCallBack(UTF8Converter
.getAlias(), UCNV_FROM_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &status
);
// Preflight: extract into a zero-length buffer to learn the required UTF-8 length.
// U_BUFFER_OVERFLOW_ERROR is the expected status from this call.
474 inputUTF8Length
= unEscapedInput
.extract(NULL
, 0, UTF8Converter
.getAlias(), status
);
475 if (U_FAILURE(status
) && status
!= U_BUFFER_OVERFLOW_ERROR
) {
476 // UTF-8 does not allow unpaired surrogates, so this could actually happen
477 logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line
, u_errorName(status
));
478 return TRUE
; // not a failure of the Regex engine
// Real extraction into a heap buffer sized from the preflight (plus terminator), then
// open a UText over the converted bytes.
480 status
= U_ZERO_ERROR
; // buffer overflow
481 textChars
= new char[inputUTF8Length
+1];
482 unEscapedInput
.extract(textChars
, inputUTF8Length
+1, UTF8Converter
.getAlias(), status
);
483 utext_openUTF8(&inputText
, textChars
, inputUTF8Length
, &status
);
// Create a matcher with no input, then point it at the UTF-8 UText via reset().
485 REMatcher
= &REPattern
->matcher(status
)->reset(&inputText
);
486 if (U_FAILURE(status
)) {
487 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
488 line
, u_errorName(status
));
// Check lookingAt(): first for an error status, then against the expected value.
493 actualmatch
= REMatcher
->lookingAt(status
);
494 if (U_FAILURE(status
)) {
495 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
496 line
, u_errorName(status
));
499 if (actualmatch
!= looking
) {
500 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line
);
// Check matches() the same way, starting from a clean status.
504 status
= U_ZERO_ERROR
;
505 actualmatch
= REMatcher
->matches(status
);
506 if (U_FAILURE(status
)) {
507 errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
508 line
, u_errorName(status
));
511 if (actualmatch
!= match
) {
512 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line
);
// When the overall result flag is FALSE, dump the compiled pattern to aid diagnosis.
516 if (retVal
== FALSE
) {
517 REPattern
->dumpPattern();
// Close both stack-allocated UTexts.
522 utext_close(&inputText
);
523 utext_close(&pattern
);
530 //---------------------------------------------------------------------------
532 // REGEX_ERR Macro + invocation function to simplify writing
533 // regex tests for incorrect patterns
536 // REGEX_ERR("pattern", expected error line, column, expected status);
538 //---------------------------------------------------------------------------
// Forwards to regex_err(), appending the call site's __LINE__.
// Note that the expansion already ends with ';'.
539 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
// Worker for REGEX_ERR: compiles a pattern and verifies that compilation yields the
// expected status and, when an error occurred, the expected UParseError position.
// The check is performed twice: once from a UnicodeString, once from a UTF-8 UText.
//   pat            - the pattern to compile.
//   errLine/errCol - expected UParseError line and offset when compilation fails.
//   expectedStatus - the status compilation is expected to produce.
//   line           - source line of the invoking macro, used in failure messages.
541 void RegexTest::regex_err(const char *pat
, int32_t errLine
, int32_t errCol
,
542 UErrorCode expectedStatus
, int32_t line
) {
// NOTE(review): unlike doRegexLMTest, the UnicodeStrings in this function are constructed
// without US_INV - confirm this is intended on non-ASCII platforms.
543 UnicodeString
pattern(pat
);
545 UErrorCode status
= U_ZERO_ERROR
;
547 RegexPattern
*callerPattern
= NULL
;
550 // Compile the caller's pattern
552 UnicodeString
patString(pat
);
553 callerPattern
= RegexPattern::compile(patString
, 0, pe
, status
);
// The resulting status must be exactly the expected one (this includes U_ZERO_ERROR).
554 if (status
!= expectedStatus
) {
555 dataerrln("Line %d: unexpected error %s compiling pattern.", line
, u_errorName(status
));
// For any status other than U_ZERO_ERROR, also verify the reported error position.
557 if (status
!= U_ZERO_ERROR
) {
558 if (pe
.line
!= errLine
|| pe
.offset
!= errCol
) {
559 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
560 line
, errLine
, errCol
, pe
.line
, pe
.offset
);
565 delete callerPattern
;
568 // Compile again, using a UTF-8-based UText
570 UText patternText
= UTEXT_INITIALIZER
;
571 regextst_openUTF8FromInvariant(&patternText
, pat
, -1, &status
);
572 callerPattern
= RegexPattern::compile(&patternText
, 0, pe
, status
);
// Same two checks as for the UnicodeString compile above.
573 if (status
!= expectedStatus
) {
574 dataerrln("Line %d: unexpected error %s compiling pattern.", line
, u_errorName(status
));
576 if (status
!= U_ZERO_ERROR
) {
577 if (pe
.line
!= errLine
|| pe
.offset
!= errCol
) {
578 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
579 line
, errLine
, errCol
, pe
.line
, pe
.offset
);
// Release the second compiled pattern and the UText over the pattern string.
584 delete callerPattern
;
585 utext_close(&patternText
);
590 //---------------------------------------------------------------------------
592 // Basic Check for basic functionality of regex pattern matching.
593 // Avoid the use of REGEX_FIND test macro, which has
594 // substantial dependencies on basic Regex functionality.
596 //---------------------------------------------------------------------------
// Basic sanity tests for pattern matching, written directly with REGEX_TESTLM.
// Each REGEX_TESTLM(pattern, input, lookingAt-expected, matches-expected) line runs the
// pattern against the (unescaped) input in both UnicodeString and UTF-8 UText form.
597 void RegexTest::Basic() {
601 // Debug - slide failing test cases early
605 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
// NOTE(review): the statements below (dumpPattern, printf) look like debug scaffolding;
// whether they are compiled in depends on preprocessor lines not visible here - confirm.
607 UErrorCode status
= U_ZERO_ERROR
;
608 RegexPattern
*pattern
;
609 pattern
= RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE
, pe
, status
);
610 pattern
->dumpPattern();
611 RegexMatcher
*m
= pattern
->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status
);
612 UBool result
= m
->find();
613 printf("result = %d\n", result
);
614 // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
615 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
622 // Pattern with parentheses
624 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE
, FALSE
);
625 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE
, TRUE
);
626 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE
, FALSE
);
// '*' applied to a parenthesized group, and to a single char.
631 REGEX_TESTLM("st(abc)*ring", "string", TRUE
, TRUE
);
632 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE
, TRUE
);
633 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE
, TRUE
);
634 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE
, FALSE
);
635 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE
, FALSE
);
637 REGEX_TESTLM("a*", "", TRUE
, TRUE
);
638 REGEX_TESTLM("a*", "b", TRUE
, FALSE
);
// '.' (any char) patterns, alone and combined with '*'.
644 REGEX_TESTLM(".", "abc", TRUE
, FALSE
);
645 REGEX_TESTLM("...", "abc", TRUE
, TRUE
);
646 REGEX_TESTLM("....", "abc", FALSE
, FALSE
);
647 REGEX_TESTLM(".*", "abcxyz123", TRUE
, TRUE
);
648 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE
, FALSE
);
649 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE
, TRUE
);
650 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE
, TRUE
);
651 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE
, FALSE
);
654 // Patterns with * applied to chars at end of literal string
656 REGEX_TESTLM("abc*", "ab", TRUE
, TRUE
);
657 REGEX_TESTLM("abc*", "abccccc", TRUE
, TRUE
);
660 // Supplemental chars match as single chars, not a pair of surrogates.
662 REGEX_TESTLM(".", "\\U00011000", TRUE
, TRUE
);
663 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE
, TRUE
);
664 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE
, FALSE
);
668 // UnicodeSets in the pattern
670 REGEX_TESTLM("[1-6]", "1", TRUE
, TRUE
);
671 REGEX_TESTLM("[1-6]", "3", TRUE
, TRUE
);
672 REGEX_TESTLM("[1-6]", "7", FALSE
, FALSE
);
673 REGEX_TESTLM("a[1-6]", "a3", TRUE
, TRUE
);
// NOTE(review): this case is identical to the one immediately above - possibly a
// copy/paste duplicate; confirm whether a different input was intended.
674 REGEX_TESTLM("a[1-6]", "a3", TRUE
, TRUE
);
675 REGEX_TESTLM("a[1-6]b", "a3b", TRUE
, TRUE
);
677 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE
, TRUE
);
678 REGEX_TESTLM("a[0-9]*b", "abc", TRUE
, FALSE
);
679 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE
, TRUE
);
680 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE
, FALSE
); // note that * matches 0 occurrences.
681 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE
, TRUE
);
684 // OR operator in patterns
686 REGEX_TESTLM("(a|b)", "a", TRUE
, TRUE
);
687 REGEX_TESTLM("(a|b)", "b", TRUE
, TRUE
);
688 REGEX_TESTLM("(a|b)", "c", FALSE
, FALSE
);
689 REGEX_TESTLM("a|b", "b", TRUE
, TRUE
);
691 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE
, TRUE
);
692 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE
, FALSE
);
693 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE
, TRUE
);
694 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE
, TRUE
);
695 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE
, TRUE
);
696 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE
, FALSE
);
// '+' quantifier (one or more).
701 REGEX_TESTLM("ab+", "abbc", TRUE
, FALSE
);
702 REGEX_TESTLM("ab+c", "ac", FALSE
, FALSE
);
703 REGEX_TESTLM("b+", "", FALSE
, FALSE
);
704 REGEX_TESTLM("(abc|def)+", "defabc", TRUE
, TRUE
);
705 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE
, FALSE
);
706 REGEX_TESTLM(".+y", "zippity dooy", TRUE
, TRUE
);
// '?' quantifier (zero or one).
711 REGEX_TESTLM("ab?", "ab", TRUE
, TRUE
);
712 REGEX_TESTLM("ab?", "a", TRUE
, TRUE
);
713 REGEX_TESTLM("ab?", "ac", TRUE
, FALSE
);
714 REGEX_TESTLM("ab?", "abb", TRUE
, FALSE
);
715 REGEX_TESTLM("a(b|c)?d", "abd", TRUE
, TRUE
);
716 REGEX_TESTLM("a(b|c)?d", "acd", TRUE
, TRUE
);
717 REGEX_TESTLM("a(b|c)?d", "ad", TRUE
, TRUE
);
718 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE
, FALSE
);
719 REGEX_TESTLM("a(b|c)?d", "ab", FALSE
, FALSE
);
722 // Escape sequences that become single literal chars, handled internally
723 // by ICU's Unescape.
726 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
727 REGEX_TESTLM("\\a", "\\u0007", TRUE
, TRUE
); // BEL
728 REGEX_TESTLM("\\cL", "\\u000c", TRUE
, TRUE
); // Control-L
729 REGEX_TESTLM("\\e", "\\u001b", TRUE
, TRUE
); // Escape
730 REGEX_TESTLM("\\f", "\\u000c", TRUE
, TRUE
); // Form Feed
731 REGEX_TESTLM("\\n", "\\u000a", TRUE
, TRUE
); // new line
732 REGEX_TESTLM("\\r", "\\u000d", TRUE
, TRUE
); // CR
733 REGEX_TESTLM("\\t", "\\u0009", TRUE
, TRUE
); // Tab
734 REGEX_TESTLM("\\u1234", "\\u1234", TRUE
, TRUE
);
735 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE
, TRUE
);
737 REGEX_TESTLM(".*\\Ax", "xyz", TRUE
, FALSE
); // \A matches only at the beginning of input
738 REGEX_TESTLM(".*\\Ax", " xyz", FALSE
, FALSE
); // \A matches only at the beginning of input
740 // Escape of special chars in patterns
741 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE
, TRUE
);
745 //---------------------------------------------------------------------------
747 // UTextBasic Check for quirks that are specific to the UText
750 //---------------------------------------------------------------------------
// Checks that a RegexMatcher built from a UText pattern reports the expected input text
// after reset(UText*), including when it is reset with its own current input text.
751 void RegexTest::UTextBasic() {
// "abc" spelled as explicit byte values (presumably so the bytes are UTF-8 regardless of
// the compilation codepage - confirm).
752 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
753 UErrorCode status
= U_ZERO_ERROR
;
// Build the matcher from a UTF-8 UText pattern.
754 UText pattern
= UTEXT_INITIALIZER
;
755 utext_openUTF8(&pattern
, str_abc
, -1, &status
);
756 RegexMatcher
matcher(&pattern
, 0, status
);
// Supply the input as a second UTF-8 UText over the same bytes.
759 UText input
= UTEXT_INITIALIZER
;
760 utext_openUTF8(&input
, str_abc
, -1, &status
);
762 matcher
.reset(&input
);
764 REGEX_ASSERT_UTEXT_UTF8(str_abc
, matcher
.inputText());
// Reset the matcher using the UText it already holds; the input must be unchanged.
766 matcher
.reset(matcher
.inputText());
768 REGEX_ASSERT_UTEXT_UTF8(str_abc
, matcher
.inputText());
770 utext_close(&pattern
);
775 //---------------------------------------------------------------------------
777 // API_Match Test that the API for class RegexMatcher
778 // is present and nominally working, but excluding functions
779 // implementing replace operations.
781 //---------------------------------------------------------------------------
782 void RegexTest::API_Match() {
784 UErrorCode status
=U_ZERO_ERROR
;
788 // Debug - slide failing test cases early
797 // Simple pattern compilation
800 UnicodeString
re("abc");
802 pat2
= RegexPattern::compile(re
, flags
, pe
, status
);
805 UnicodeString inStr1
= "abcdef this is a test";
806 UnicodeString instr2
= "not abc";
807 UnicodeString empty
= "";
811 // Matcher creation and reset.
813 RegexMatcher
*m1
= pat2
->matcher(inStr1
, status
);
815 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
816 REGEX_ASSERT(m1
->input() == inStr1
);
818 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
819 REGEX_ASSERT(m1
->input() == instr2
);
821 REGEX_ASSERT(m1
->input() == inStr1
);
822 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
824 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
825 REGEX_ASSERT(m1
->input() == empty
);
826 REGEX_ASSERT(&m1
->pattern() == pat2
);
829 // reset(pos, status)
832 m1
->reset(4, status
);
834 REGEX_ASSERT(m1
->input() == inStr1
);
835 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
837 m1
->reset(-1, status
);
838 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
839 status
= U_ZERO_ERROR
;
841 m1
->reset(0, status
);
843 status
= U_ZERO_ERROR
;
845 int32_t len
= m1
->input().length();
846 m1
->reset(len
-1, status
);
848 status
= U_ZERO_ERROR
;
850 m1
->reset(len
, status
);
852 status
= U_ZERO_ERROR
;
854 m1
->reset(len
+1, status
);
855 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
856 status
= U_ZERO_ERROR
;
859 // match(pos, status)
862 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
864 REGEX_ASSERT(m1
->matches(3, status
) == FALSE
);
866 REGEX_ASSERT(m1
->matches(5, status
) == FALSE
);
867 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
868 REGEX_ASSERT(m1
->matches(-1, status
) == FALSE
);
869 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
871 // Match() at end of string should fail, but should not
873 status
= U_ZERO_ERROR
;
874 len
= m1
->input().length();
875 REGEX_ASSERT(m1
->matches(len
, status
) == FALSE
);
878 // Match beyond end of string should fail with an error.
879 status
= U_ZERO_ERROR
;
880 REGEX_ASSERT(m1
->matches(len
+1, status
) == FALSE
);
881 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
883 // Successful match at end of string.
885 status
= U_ZERO_ERROR
;
886 RegexMatcher
m("A?", 0, status
); // will match zero length string.
889 len
= inStr1
.length();
890 REGEX_ASSERT(m
.matches(len
, status
) == TRUE
);
893 REGEX_ASSERT(m
.matches(0, status
) == TRUE
);
899 // lookingAt(pos, status)
901 status
= U_ZERO_ERROR
;
902 m1
->reset(instr2
); // "not abc"
903 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
904 REGEX_ASSERT(m1
->lookingAt(5, status
) == FALSE
);
905 REGEX_ASSERT(m1
->lookingAt(3, status
) == FALSE
);
906 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
907 REGEX_ASSERT(m1
->lookingAt(-1, status
) == FALSE
);
908 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
909 status
= U_ZERO_ERROR
;
910 len
= m1
->input().length();
911 REGEX_ASSERT(m1
->lookingAt(len
, status
) == FALSE
);
913 REGEX_ASSERT(m1
->lookingAt(len
+1, status
) == FALSE
);
914 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
923 // RegexMatcher::start();
924 // RegexMatcher::end();
925 // RegexMatcher::groupCount();
930 UErrorCode status
=U_ZERO_ERROR
;
932 UnicodeString
re("01(23(45)67)(.*)");
933 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
935 UnicodeString data
= "0123456789";
937 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
939 REGEX_ASSERT(matcher
->lookingAt(status
) == TRUE
);
940 static const int32_t matchStarts
[] = {0, 2, 4, 8};
941 static const int32_t matchEnds
[] = {10, 8, 6, 10};
943 for (i
=0; i
<4; i
++) {
944 int32_t actualStart
= matcher
->start(i
, status
);
946 if (actualStart
!= matchStarts
[i
]) {
947 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
948 __LINE__
, i
, matchStarts
[i
], actualStart
);
950 int32_t actualEnd
= matcher
->end(i
, status
);
952 if (actualEnd
!= matchEnds
[i
]) {
953 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
954 __LINE__
, i
, matchEnds
[i
], actualEnd
);
958 REGEX_ASSERT(matcher
->start(0, status
) == matcher
->start(status
));
959 REGEX_ASSERT(matcher
->end(0, status
) == matcher
->end(status
));
961 REGEX_ASSERT_FAIL(matcher
->start(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
962 REGEX_ASSERT_FAIL(matcher
->start( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
964 REGEX_ASSERT_FAIL(matcher
->start( 0, status
), U_REGEX_INVALID_STATE
);
966 matcher
->lookingAt(status
);
967 REGEX_ASSERT(matcher
->group(status
) == "0123456789");
968 REGEX_ASSERT(matcher
->group(0, status
) == "0123456789");
969 REGEX_ASSERT(matcher
->group(1, status
) == "234567" );
970 REGEX_ASSERT(matcher
->group(2, status
) == "45" );
971 REGEX_ASSERT(matcher
->group(3, status
) == "89" );
973 REGEX_ASSERT_FAIL(matcher
->group(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
974 REGEX_ASSERT_FAIL(matcher
->group( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
976 REGEX_ASSERT_FAIL(matcher
->group( 0, status
), U_REGEX_INVALID_STATE
);
989 UErrorCode status
=U_ZERO_ERROR
;
991 UnicodeString
re("abc");
992 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
994 UnicodeString data
= ".abc..abc...abc..";
995 // 012345678901234567
997 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
999 REGEX_ASSERT(matcher
->find());
1000 REGEX_ASSERT(matcher
->start(status
) == 1);
1001 REGEX_ASSERT(matcher
->find());
1002 REGEX_ASSERT(matcher
->start(status
) == 6);
1003 REGEX_ASSERT(matcher
->find());
1004 REGEX_ASSERT(matcher
->start(status
) == 12);
1005 REGEX_ASSERT(matcher
->find() == FALSE
);
1006 REGEX_ASSERT(matcher
->find() == FALSE
);
1009 REGEX_ASSERT(matcher
->find());
1010 REGEX_ASSERT(matcher
->start(status
) == 1);
1012 REGEX_ASSERT(matcher
->find(0, status
));
1013 REGEX_ASSERT(matcher
->start(status
) == 1);
1014 REGEX_ASSERT(matcher
->find(1, status
));
1015 REGEX_ASSERT(matcher
->start(status
) == 1);
1016 REGEX_ASSERT(matcher
->find(2, status
));
1017 REGEX_ASSERT(matcher
->start(status
) == 6);
1018 REGEX_ASSERT(matcher
->find(12, status
));
1019 REGEX_ASSERT(matcher
->start(status
) == 12);
1020 REGEX_ASSERT(matcher
->find(13, status
) == FALSE
);
1021 REGEX_ASSERT(matcher
->find(16, status
) == FALSE
);
1022 REGEX_ASSERT(matcher
->find(17, status
) == FALSE
);
1023 REGEX_ASSERT_FAIL(matcher
->start(status
), U_REGEX_INVALID_STATE
);
1025 status
= U_ZERO_ERROR
;
1026 REGEX_ASSERT_FAIL(matcher
->find(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1027 status
= U_ZERO_ERROR
;
1028 REGEX_ASSERT_FAIL(matcher
->find(18, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1030 REGEX_ASSERT(matcher
->groupCount() == 0);
1038 // find, with \G in pattern (true if at the end of a previous match).
1043 UErrorCode status
=U_ZERO_ERROR
;
1045 UnicodeString
re(".*?(?:(\\Gabc)|(abc))", -1, US_INV
);
1046 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
1048 UnicodeString data
= ".abcabc.abc..";
1049 // 012345678901234567
1051 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
1053 REGEX_ASSERT(matcher
->find());
1054 REGEX_ASSERT(matcher
->start(status
) == 0);
1055 REGEX_ASSERT(matcher
->start(1, status
) == -1);
1056 REGEX_ASSERT(matcher
->start(2, status
) == 1);
1058 REGEX_ASSERT(matcher
->find());
1059 REGEX_ASSERT(matcher
->start(status
) == 4);
1060 REGEX_ASSERT(matcher
->start(1, status
) == 4);
1061 REGEX_ASSERT(matcher
->start(2, status
) == -1);
1069 // find with zero length matches, match position should bump ahead
1070 // to prevent loops.
1074 UErrorCode status
=U_ZERO_ERROR
;
1075 RegexMatcher
m("(?= ?)", 0, status
); // This pattern will zero-length matches anywhere,
1076 // using an always-true look-ahead.
1078 UnicodeString
s(" ");
1081 if (m
.find() == FALSE
) {
1084 REGEX_ASSERT(m
.start(status
) == i
);
1085 REGEX_ASSERT(m
.end(status
) == i
);
1089 // Check that the bump goes over surrogate pairs OK
1090 s
= UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1094 if (m
.find() == FALSE
) {
1097 REGEX_ASSERT(m
.start(status
) == i
);
1098 REGEX_ASSERT(m
.end(status
) == i
);
1100 REGEX_ASSERT(i
==10);
1103 // find() loop breaking test.
1104 // with pattern of /.?/, should see a series of one char matches, then a single
1105 // match of zero length at the end of the input string.
1107 UErrorCode status
=U_ZERO_ERROR
;
1108 RegexMatcher
m(".?", 0, status
);
1110 UnicodeString
s(" ");
1113 if (m
.find() == FALSE
) {
1116 REGEX_ASSERT(m
.start(status
) == i
);
1117 REGEX_ASSERT(m
.end(status
) == (i
<4 ? i
+1 : i
));
1124 // Matchers with no input string behave as if they had an empty input string.
1128 UErrorCode status
= U_ZERO_ERROR
;
1129 RegexMatcher
m(".?", 0, status
);
1131 REGEX_ASSERT(m
.find());
1132 REGEX_ASSERT(m
.start(status
) == 0);
1133 REGEX_ASSERT(m
.input() == "");
1136 UErrorCode status
= U_ZERO_ERROR
;
1137 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
1138 RegexMatcher
*m
= p
->matcher(status
);
1141 REGEX_ASSERT(m
->find() == FALSE
);
1142 REGEX_ASSERT(m
->input() == "");
1151 UErrorCode status
= U_ZERO_ERROR
;
1152 UnicodeString
testString("This is test data");
1153 RegexMatcher
m(".*", testString
, 0, status
);
1155 REGEX_ASSERT(m
.regionStart() == 0);
1156 REGEX_ASSERT(m
.regionEnd() == testString
.length());
1157 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1158 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1160 m
.region(2,4, status
);
1162 REGEX_ASSERT(m
.matches(status
));
1163 REGEX_ASSERT(m
.start(status
)==2);
1164 REGEX_ASSERT(m
.end(status
)==4);
1168 REGEX_ASSERT(m
.regionStart() == 0);
1169 REGEX_ASSERT(m
.regionEnd() == testString
.length());
1171 UnicodeString
shorterString("short");
1172 m
.reset(shorterString
);
1173 REGEX_ASSERT(m
.regionStart() == 0);
1174 REGEX_ASSERT(m
.regionEnd() == shorterString
.length());
1176 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1177 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(FALSE
));
1178 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
1179 REGEX_ASSERT(&m
== &m
.reset());
1180 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
1182 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(TRUE
));
1183 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1184 REGEX_ASSERT(&m
== &m
.reset());
1185 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1187 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1188 REGEX_ASSERT(&m
== &m
.useTransparentBounds(TRUE
));
1189 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
1190 REGEX_ASSERT(&m
== &m
.reset());
1191 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
1193 REGEX_ASSERT(&m
== &m
.useTransparentBounds(FALSE
));
1194 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1195 REGEX_ASSERT(&m
== &m
.reset());
1196 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1201 // hitEnd() and requireEnd()
1204 UErrorCode status
= U_ZERO_ERROR
;
1205 UnicodeString
testString("aabb");
1206 RegexMatcher
m1(".*", testString
, 0, status
);
1207 REGEX_ASSERT(m1
.lookingAt(status
) == TRUE
);
1208 REGEX_ASSERT(m1
.hitEnd() == TRUE
);
1209 REGEX_ASSERT(m1
.requireEnd() == FALSE
);
1212 status
= U_ZERO_ERROR
;
1213 RegexMatcher
m2("a*", testString
, 0, status
);
1214 REGEX_ASSERT(m2
.lookingAt(status
) == TRUE
);
1215 REGEX_ASSERT(m2
.hitEnd() == FALSE
);
1216 REGEX_ASSERT(m2
.requireEnd() == FALSE
);
1219 status
= U_ZERO_ERROR
;
1220 RegexMatcher
m3(".*$", testString
, 0, status
);
1221 REGEX_ASSERT(m3
.lookingAt(status
) == TRUE
);
1222 REGEX_ASSERT(m3
.hitEnd() == TRUE
);
1223 REGEX_ASSERT(m3
.requireEnd() == TRUE
);
1229 // Compilation error on reset with UChar *
1230 // These were a hazard that people were stumbling over with runtime errors.
1231 // Changed them to compiler errors by adding private methods that more closely
1232 // matched the incorrect use of the functions.
1236 UErrorCode status
= U_ZERO_ERROR
;
1237 UChar ucharString
[20];
1238 RegexMatcher
m(".", 0, status
);
1239 m
.reset(ucharString
); // should not compile.
1241 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
1242 RegexMatcher
*m2
= p
->matcher(ucharString
, status
); // should not compile.
1244 RegexMatcher
m3(".", ucharString
, 0, status
); // Should not compile
1250 // Note: These tests will need to be changed when the regexp engine is
1251 // able to detect and cut short the exponential time behavior on
1252 // this type of match.
1255 UErrorCode status
= U_ZERO_ERROR
;
1256 // Enough 'a's in the string to cause the match to time out.
1257 // (Each additional 'a' doubles the time)
1258 UnicodeString
testString("aaaaaaaaaaaaaaaaaaaaa");
1259 RegexMatcher
matcher("(a+)+b", testString
, 0, status
);
1261 REGEX_ASSERT(matcher
.getTimeLimit() == 0);
1262 matcher
.setTimeLimit(100, status
);
1263 REGEX_ASSERT(matcher
.getTimeLimit() == 100);
1264 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1265 REGEX_ASSERT(status
== U_REGEX_TIME_OUT
);
1268 UErrorCode status
= U_ZERO_ERROR
;
1269 // Few enough 'a's to slip in under the time limit.
1270 UnicodeString
testString("aaaaaaaaaaaaaaaaaa");
1271 RegexMatcher
matcher("(a+)+b", testString
, 0, status
);
1273 matcher
.setTimeLimit(100, status
);
1274 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1282 UErrorCode status
= U_ZERO_ERROR
;
1283 UnicodeString
testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
1285 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1286 // of the '+', and makes the stack frames larger.
1287 RegexMatcher
matcher("(A)+A$", testString
, 0, status
);
1289 // With the default stack, this match should fail to run
1290 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1291 REGEX_ASSERT(status
== U_REGEX_STACK_OVERFLOW
);
1293 // With unlimited stack, it should run
1294 status
= U_ZERO_ERROR
;
1295 matcher
.setStackLimit(0, status
);
1297 REGEX_ASSERT(matcher
.lookingAt(status
) == TRUE
);
1299 REGEX_ASSERT(matcher
.getStackLimit() == 0);
1301 // With a limited stack, the match should fail
1302 status
= U_ZERO_ERROR
;
1303 matcher
.setStackLimit(10000, status
);
1304 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1305 REGEX_ASSERT(status
== U_REGEX_STACK_OVERFLOW
);
1306 REGEX_ASSERT(matcher
.getStackLimit() == 10000);
1309 // A pattern that doesn't save state should work with
1310 // a minimal sized stack
1312 UErrorCode status
= U_ZERO_ERROR
;
1313 UnicodeString testString
= "abc";
1314 RegexMatcher
matcher("abc", testString
, 0, status
);
1316 matcher
.setStackLimit(30, status
);
1318 REGEX_ASSERT(matcher
.matches(status
) == TRUE
);
1320 REGEX_ASSERT(matcher
.getStackLimit() == 30);
1322 // Negative stack sizes should fail
1323 status
= U_ZERO_ERROR
;
1324 matcher
.setStackLimit(1000, status
);
1326 matcher
.setStackLimit(-1, status
);
1327 REGEX_ASSERT(status
== U_ILLEGAL_ARGUMENT_ERROR
);
1328 REGEX_ASSERT(matcher
.getStackLimit() == 1000);
1339 //---------------------------------------------------------------------------
1341 // API_Replace API test for class RegexMatcher, testing the
1342 // Replace family of functions.
1344 //---------------------------------------------------------------------------
1345 void RegexTest::API_Replace() {
// Exercises the replace-style operations of RegexMatcher that are used below:
// replaceFirst(), replaceAll(), appendReplacement() and appendTail().
// Every expectation is checked with REGEX_ASSERT / REGEX_ASSERT_FAIL.
1351 UErrorCode status
=U_ZERO_ERROR
;
// Pattern for the first group of checks: the literal text "abc".
1353 UnicodeString
re("abc");
1354 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
1356 UnicodeString data
= ".abc..abc...abc..";
1357 // 012345678901234567
1358 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
1361 // Plain vanilla matches.
// replaceFirst() is expected to change only the first "abc";
// replaceAll() is expected to change all three occurrences.
1364 dest
= matcher
->replaceFirst("yz", status
);
1366 REGEX_ASSERT(dest
== ".yz..abc...abc..");
1368 dest
= matcher
->replaceAll("yz", status
);
1370 REGEX_ASSERT(dest
== ".yz..yz...yz..");
1373 // Plain vanilla non-matches.
// The expected output is the same text as d2: nothing is substituted.
1375 UnicodeString d2
= ".abx..abx...abx..";
1377 dest
= matcher
->replaceFirst("yz", status
);
1379 REGEX_ASSERT(dest
== ".abx..abx...abx..");
1381 dest
= matcher
->replaceAll("yz", status
);
1383 REGEX_ASSERT(dest
== ".abx..abx...abx..");
1386 // Empty source string
1388 UnicodeString d3
= "";
1390 dest
= matcher
->replaceFirst("yz", status
);
1392 REGEX_ASSERT(dest
== "");
1394 dest
= matcher
->replaceAll("yz", status
);
1396 REGEX_ASSERT(dest
== "");
1399 // Empty substitution string
// With an empty replacement the matched text is simply removed.
1401 matcher
->reset(data
); // ".abc..abc...abc.."
1402 dest
= matcher
->replaceFirst("", status
);
1404 REGEX_ASSERT(dest
== "...abc...abc..");
1406 dest
= matcher
->replaceAll("", status
);
1408 REGEX_ASSERT(dest
== "........");
1411 // match whole string
1413 UnicodeString d4
= "abc";
1415 dest
= matcher
->replaceFirst("xyz", status
);
1417 REGEX_ASSERT(dest
== "xyz");
1419 dest
= matcher
->replaceAll("xyz", status
);
1421 REGEX_ASSERT(dest
== "xyz");
1424 // Capture Group, simple case
// Pattern "a(..)" applied to "abcdefg": group 1 captures "bc".
1426 UnicodeString
re2("a(..)");
1427 RegexPattern
*pat2
= RegexPattern::compile(re2
, flags
, pe
, status
);
1429 UnicodeString d5
= "abcdefg";
1430 RegexMatcher
*matcher2
= pat2
->matcher(d5
, status
);
// $1 in the replacement text stands for the text of capture group 1.
1432 dest
= matcher2
->replaceFirst("$1$1", status
);
1434 REGEX_ASSERT(dest
== "bcbcdefg");
// A backslash-escaped '$' is expected to be copied through literally,
// while the unescaped $1 is still replaced by group 1.
1436 dest
= matcher2
->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status
);
1438 REGEX_ASSERT(dest
== "The value of $1 is bc.defg");
// A '$' that is not followed by a group number must produce a failure status.
1440 dest
= matcher2
->replaceFirst("$ by itself, no group number $$$", status
);
1441 REGEX_ASSERT(U_FAILURE(status
));
1442 status
= U_ZERO_ERROR
;
// After unescape() the '$' is followed by the supplementary code point U+1D7CF.
// The expected result shows it being accepted as group number 1.
1444 UnicodeString replacement
= UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1445 replacement
= replacement
.unescape();
1446 dest
= matcher2
->replaceFirst(replacement
, status
);
1448 REGEX_ASSERT(dest
== "Supplemental Digit 1 bc.defg");
// pat2 has a single capture group, so a reference to group 5 must fail
// with U_INDEX_OUTOFBOUNDS_ERROR.
1450 REGEX_ASSERT_FAIL(matcher2
->replaceFirst("bad capture group number $5...",status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1454 // Replacement String with \u hex escapes
// The 4-digit hex escape in the replacement text is expected to yield 'C'.
1457 UnicodeString src
= "abc 1 abc 2 abc 3";
1458 UnicodeString substitute
= UNICODE_STRING_SIMPLE("--\\u0043--");
1459 matcher
->reset(src
);
1460 UnicodeString result
= matcher
->replaceAll(substitute
, status
);
1462 REGEX_ASSERT(result
== "--C-- 1 --C-- 2 --C-- 3");
// Same check with an 8-digit escape for code point 0x10000; the expected
// string is assembled by appending that code point as a UChar32.
1465 UnicodeString src
= "abc !";
1466 UnicodeString substitute
= UNICODE_STRING_SIMPLE("--\\U00010000--");
1467 matcher
->reset(src
);
1468 UnicodeString result
= matcher
->replaceAll(substitute
, status
);
1470 UnicodeString expected
= UnicodeString("--");
1471 expected
.append((UChar32
)0x10000);
1472 expected
.append("-- !");
1473 REGEX_ASSERT(result
== expected
);
1475 // TODO: need more thorough testing of capture substitutions.
// appendReplacement() / appendTail() checks, using the pattern "ss(.*?)ee".
1480 status
= U_ZERO_ERROR
;
1481 UnicodeString s
= "The matches start with ss and end with ee ss stuff ee fin";
1482 RegexMatcher
m("ss(.*?)ee", 0, status
);
1484 UnicodeString result
;
1486 // Multiple finds do NOT bump up the previous appendReplacement position.
1490 m
.appendReplacement(result
, "ooh", status
);
1492 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
1494 // After a reset into the interior of a string, appendReplacement still starts at beginning.
1495 status
= U_ZERO_ERROR
;
1497 m
.reset(10, status
);
1500 m
.appendReplacement(result
, "ooh", status
);
1502 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
1504 // find() at interior of string, appendReplacement still starts at beginning.
1505 status
= U_ZERO_ERROR
;
1510 m
.appendReplacement(result
, "ooh", status
);
1512 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
// appendTail() is expected to add the input text that follows the last match (" fin").
1514 m
.appendTail(result
);
1515 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh fin");
1526 //---------------------------------------------------------------------------
1528 // API_Pattern Test that the API for class RegexPattern is
1529 // present and nominally working.
1531 //---------------------------------------------------------------------------
1532 void RegexTest::API_Pattern() {
// Checks the RegexPattern API used below: default and copy construction,
// operator== / operator!=, compile() overloads and flags(), clone(), the static
// matches() helper, split(), pattern() and the class ID functions.
1533 RegexPattern pata
; // Test default constructor to not crash.
1536 REGEX_ASSERT(pata
== patb
);
1537 REGEX_ASSERT(pata
== pata
);
1539 UnicodeString
re1("abc[a-l][m-z]");
1540 UnicodeString
re2("def");
1541 UErrorCode status
= U_ZERO_ERROR
;
1544 RegexPattern
*pat1
= RegexPattern::compile(re1
, 0, pe
, status
);
1545 RegexPattern
*pat2
= RegexPattern::compile(re2
, 0, pe
, status
);
// A compiled pattern equals itself and differs from a default-constructed one.
1547 REGEX_ASSERT(*pat1
== *pat1
);
1548 REGEX_ASSERT(*pat1
!= pata
);
1552 REGEX_ASSERT(patb
== *pat1
);
// Copy construction must yield a pattern equal to its source.
1555 RegexPattern
patc(*pat1
);
1556 REGEX_ASSERT(patc
== *pat1
);
1557 REGEX_ASSERT(patb
== patc
);
// Note: pat1 and pat2 are pointers, so this assertion compares addresses,
// not the contents of the two patterns.
1558 REGEX_ASSERT(pat1
!= pat2
);
1560 REGEX_ASSERT(patb
!= patc
);
1561 REGEX_ASSERT(patb
== *pat2
);
1563 // Compile with no flags.
// The compile() overload without a flags argument is expected to produce a
// pattern equal to one compiled with flags == 0.
1564 RegexPattern
*pat1a
= RegexPattern::compile(re1
, pe
, status
);
1565 REGEX_ASSERT(*pat1a
== *pat1
);
1567 REGEX_ASSERT(pat1a
->flags() == 0);
1569 // Compile with different flags should be not equal
1570 RegexPattern
*pat1b
= RegexPattern::compile(re1
, UREGEX_CASE_INSENSITIVE
, pe
, status
);
1573 REGEX_ASSERT(*pat1b
!= *pat1a
);
1574 REGEX_ASSERT(pat1b
->flags() == UREGEX_CASE_INSENSITIVE
);
1575 REGEX_ASSERT(pat1a
->flags() == 0);
// clone() must return a pattern equal to the original.
1579 RegexPattern
*pat1c
= pat1
->clone();
1580 REGEX_ASSERT(*pat1c
== *pat1
);
1581 REGEX_ASSERT(*pat1c
!= *pat2
);
1590 // Verify that a matcher created from a cloned pattern works.
1594 UErrorCode status
= U_ZERO_ERROR
;
1595 RegexPattern
*pSource
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status
);
1596 RegexPattern
*pClone
= pSource
->clone();
1598 RegexMatcher
*mFromClone
= pClone
->matcher(status
);
// Successive find() calls should return the two words, then report no more matches.
1600 UnicodeString s
= "Hello World";
1601 mFromClone
->reset(s
);
1602 REGEX_ASSERT(mFromClone
->find() == TRUE
);
1603 REGEX_ASSERT(mFromClone
->group(status
) == "Hello");
1604 REGEX_ASSERT(mFromClone
->find() == TRUE
);
1605 REGEX_ASSERT(mFromClone
->group(status
) == "World");
1606 REGEX_ASSERT(mFromClone
->find() == FALSE
);
1612 // matches convenience API
// The expectations below show that the whole input has to match:
// ".*nput" succeeds, while "abc" and ".*u" do not.
1614 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe
, status
) == TRUE
);
1616 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe
, status
) == FALSE
);
1618 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe
, status
) == TRUE
);
1620 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe
, status
) == TRUE
);
1622 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe
, status
) == FALSE
);
// With status already holding a failure code, matches() must return FALSE
// and leave the incoming status value untouched.
1624 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1625 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe
, status
) == FALSE
);
1626 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
// split(): fields are separated by runs of spaces.
1632 status
= U_ZERO_ERROR
;
1633 pat1
= RegexPattern::compile(" +", pe
, status
);
1635 UnicodeString fields
[10];
1638 n
= pat1
->split("Now is the time", fields
, 10, status
);
1641 REGEX_ASSERT(fields
[0]=="Now");
1642 REGEX_ASSERT(fields
[1]=="is");
1643 REGEX_ASSERT(fields
[2]=="the");
1644 REGEX_ASSERT(fields
[3]=="time");
1645 REGEX_ASSERT(fields
[4]=="");
// With a capacity of 2, the last field is expected to hold the rest of the input.
1647 n
= pat1
->split("Now is the time", fields
, 2, status
);
1650 REGEX_ASSERT(fields
[0]=="Now");
1651 REGEX_ASSERT(fields
[1]=="is the time");
1652 REGEX_ASSERT(fields
[2]=="the"); // left over from previous test
// With a capacity of 1, the single field is expected to hold the whole input.
1655 status
= U_ZERO_ERROR
;
1656 n
= pat1
->split("Now is the time", fields
, 1, status
);
1659 REGEX_ASSERT(fields
[0]=="Now is the time");
1660 REGEX_ASSERT(fields
[1]=="*");
1661 status
= U_ZERO_ERROR
;
// Leading and trailing separators are expected to give empty first and last fields.
1663 n
= pat1
->split(" Now is the time ", fields
, 10, status
);
1666 REGEX_ASSERT(fields
[0]=="");
1667 REGEX_ASSERT(fields
[1]=="Now");
1668 REGEX_ASSERT(fields
[2]=="is");
1669 REGEX_ASSERT(fields
[3]=="the");
1670 REGEX_ASSERT(fields
[4]=="time");
1671 REGEX_ASSERT(fields
[5]=="");
1673 n
= pat1
->split(" ", fields
, 10, status
);
1676 REGEX_ASSERT(fields
[0]=="");
1677 REGEX_ASSERT(fields
[1]=="");
1680 n
= pat1
->split("", fields
, 10, status
);
1683 REGEX_ASSERT(fields
[0]=="foo");
1687 // split, with a pattern with (capture)
// The expectations show the text captured by the delimiter's group being
// returned as a field of its own between the surrounding text fields.
1688 pat1
= RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe
, status
);
1691 status
= U_ZERO_ERROR
;
1692 n
= pat1
->split("<a>Now is <b>the time<c>", fields
, 10, status
);
1695 REGEX_ASSERT(fields
[0]=="");
1696 REGEX_ASSERT(fields
[1]=="a");
1697 REGEX_ASSERT(fields
[2]=="Now is ");
1698 REGEX_ASSERT(fields
[3]=="b");
1699 REGEX_ASSERT(fields
[4]=="the time");
1700 REGEX_ASSERT(fields
[5]=="c");
1701 REGEX_ASSERT(fields
[6]=="");
1702 REGEX_ASSERT(status
==U_ZERO_ERROR
);
1704 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 10, status
);
1707 REGEX_ASSERT(fields
[0]==" ");
1708 REGEX_ASSERT(fields
[1]=="a");
1709 REGEX_ASSERT(fields
[2]=="Now is ");
1710 REGEX_ASSERT(fields
[3]=="b");
1711 REGEX_ASSERT(fields
[4]=="the time");
1712 REGEX_ASSERT(fields
[5]=="c");
1713 REGEX_ASSERT(fields
[6]=="");
// Same input with the destination capacity reduced to 6, 5 and 4 in turn.
1715 status
= U_ZERO_ERROR
;
1717 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 6, status
);
1720 REGEX_ASSERT(fields
[0]==" ");
1721 REGEX_ASSERT(fields
[1]=="a");
1722 REGEX_ASSERT(fields
[2]=="Now is ");
1723 REGEX_ASSERT(fields
[3]=="b");
1724 REGEX_ASSERT(fields
[4]=="the time");
1725 REGEX_ASSERT(fields
[5]==""); // All text following "<c>" field delimiter.
1726 REGEX_ASSERT(fields
[6]=="foo");
1728 status
= U_ZERO_ERROR
;
1730 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 5, status
);
1733 REGEX_ASSERT(fields
[0]==" ");
1734 REGEX_ASSERT(fields
[1]=="a");
1735 REGEX_ASSERT(fields
[2]=="Now is ");
1736 REGEX_ASSERT(fields
[3]=="b");
1737 REGEX_ASSERT(fields
[4]=="the time<c>");
1738 REGEX_ASSERT(fields
[5]=="foo");
1740 status
= U_ZERO_ERROR
;
1742 n
= pat1
->split(" <a>Now is <b>the time", fields
, 5, status
);
1745 REGEX_ASSERT(fields
[0]==" ");
1746 REGEX_ASSERT(fields
[1]=="a");
1747 REGEX_ASSERT(fields
[2]=="Now is ");
1748 REGEX_ASSERT(fields
[3]=="b");
1749 REGEX_ASSERT(fields
[4]=="the time");
1750 REGEX_ASSERT(fields
[5]=="foo");
1752 status
= U_ZERO_ERROR
;
1753 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 4, status
);
1756 REGEX_ASSERT(fields
[0]==" ");
1757 REGEX_ASSERT(fields
[1]=="a");
1758 REGEX_ASSERT(fields
[2]=="Now is ");
1759 REGEX_ASSERT(fields
[3]=="the time<c>");
1760 status
= U_ZERO_ERROR
;
// The whole separator is a capture group here, so the '-' and ','
// delimiters themselves are expected to come back as fields.
1763 pat1
= RegexPattern::compile("([-,])", pe
, status
);
1765 n
= pat1
->split("1-10,20", fields
, 10, status
);
1768 REGEX_ASSERT(fields
[0]=="1");
1769 REGEX_ASSERT(fields
[1]=="-");
1770 REGEX_ASSERT(fields
[2]=="10");
1771 REGEX_ASSERT(fields
[3]==",");
1772 REGEX_ASSERT(fields
[4]=="20");
1775 // Test split of string with empty trailing fields
1776 pat1
= RegexPattern::compile(",", pe
, status
);
1778 n
= pat1
->split("a,b,c,", fields
, 10, status
);
1781 REGEX_ASSERT(fields
[0]=="a");
1782 REGEX_ASSERT(fields
[1]=="b");
1783 REGEX_ASSERT(fields
[2]=="c");
1784 REGEX_ASSERT(fields
[3]=="");
1786 n
= pat1
->split("a,,,", fields
, 10, status
);
1789 REGEX_ASSERT(fields
[0]=="a");
1790 REGEX_ASSERT(fields
[1]=="");
1791 REGEX_ASSERT(fields
[2]=="");
1792 REGEX_ASSERT(fields
[3]=="");
1795 // Split Separator with zero length match.
1796 pat1
= RegexPattern::compile(":?", pe
, status
);
1798 n
= pat1
->split("abc", fields
, 10, status
);
1801 REGEX_ASSERT(fields
[0]=="");
1802 REGEX_ASSERT(fields
[1]=="a");
1803 REGEX_ASSERT(fields
[2]=="b");
1804 REGEX_ASSERT(fields
[3]=="c");
1805 REGEX_ASSERT(fields
[4]=="");
1810 // RegexPattern::pattern()
// A default-constructed pattern reports an empty pattern string;
// a compiled one reports the text it was compiled from.
1812 pat1
= new RegexPattern();
1813 REGEX_ASSERT(pat1
->pattern() == "");
1816 pat1
= RegexPattern::compile("(Hello, world)*", pe
, status
);
1818 REGEX_ASSERT(pat1
->pattern() == "(Hello, world)*");
1823 // classID functions
// The pattern and the matcher must each report their own class's static ID,
// and the two IDs must differ from one another.
1825 pat1
= RegexPattern::compile("(Hello, world)*", pe
, status
);
1827 REGEX_ASSERT(pat1
->getDynamicClassID() == RegexPattern::getStaticClassID());
1828 REGEX_ASSERT(pat1
->getDynamicClassID() != NULL
);
1829 UnicodeString
Hello("Hello, world.");
1830 RegexMatcher
*m
= pat1
->matcher(Hello
, status
);
1831 REGEX_ASSERT(pat1
->getDynamicClassID() != m
->getDynamicClassID());
1832 REGEX_ASSERT(m
->getDynamicClassID() == RegexMatcher::getStaticClassID());
1833 REGEX_ASSERT(m
->getDynamicClassID() != NULL
);
1839 //---------------------------------------------------------------------------
1841 // API_Match_UTF8 Test that the alternate engine for class RegexMatcher
1842 // is present and working, but excluding functions
1843 // implementing replace operations.
1845 //---------------------------------------------------------------------------
1846 void RegexTest::API_Match_UTF8() {
1848 UErrorCode status
=U_ZERO_ERROR
;
1852 // Debug - slide failing test cases early
1861 // Simple pattern compilation
1864 UText re
= UTEXT_INITIALIZER
;
1865 regextst_openUTF8FromInvariant(&re
, "abc", -1, &status
);
1866 REGEX_VERBOSE_TEXT(&re
);
1868 pat2
= RegexPattern::compile(&re
, flags
, pe
, status
);
1871 UText input1
= UTEXT_INITIALIZER
;
1872 UText input2
= UTEXT_INITIALIZER
;
1873 UText empty
= UTEXT_INITIALIZER
;
1874 regextst_openUTF8FromInvariant(&input1
, "abcdef this is a test", -1, &status
);
1875 REGEX_VERBOSE_TEXT(&input1
);
1876 regextst_openUTF8FromInvariant(&input2
, "not abc", -1, &status
);
1877 REGEX_VERBOSE_TEXT(&input2
);
1878 utext_openUChars(&empty
, NULL
, 0, &status
);
1880 int32_t input1Len
= strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1881 int32_t input2Len
= strlen("not abc");
1885 // Matcher creation and reset.
1887 RegexMatcher
*m1
= &pat2
->matcher(status
)->reset(&input1
);
1889 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
1890 const char str_abcdefthisisatest
[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1891 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest
, m1
->inputText());
1893 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
1894 const char str_notabc
[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1895 REGEX_ASSERT_UTEXT_UTF8(str_notabc
, m1
->inputText());
1897 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest
, m1
->inputText());
1898 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
1900 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
1901 REGEX_ASSERT(utext_nativeLength(&empty
) == 0);
1904 // reset(pos, status)
1907 m1
->reset(4, status
);
1909 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest
, m1
->inputText());
1910 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
1912 m1
->reset(-1, status
);
1913 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1914 status
= U_ZERO_ERROR
;
1916 m1
->reset(0, status
);
1918 status
= U_ZERO_ERROR
;
1920 m1
->reset(input1Len
-1, status
);
1922 status
= U_ZERO_ERROR
;
1924 m1
->reset(input1Len
, status
);
1926 status
= U_ZERO_ERROR
;
1928 m1
->reset(input1Len
+1, status
);
1929 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1930 status
= U_ZERO_ERROR
;
1933 // match(pos, status)
1936 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
1938 REGEX_ASSERT(m1
->matches(3, status
) == FALSE
);
1940 REGEX_ASSERT(m1
->matches(5, status
) == FALSE
);
1941 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
1942 REGEX_ASSERT(m1
->matches(-1, status
) == FALSE
);
1943 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1945 // Match() at end of string should fail, but should not
1947 status
= U_ZERO_ERROR
;
1948 REGEX_ASSERT(m1
->matches(input2Len
, status
) == FALSE
);
1951 // Match beyond end of string should fail with an error.
1952 status
= U_ZERO_ERROR
;
1953 REGEX_ASSERT(m1
->matches(input2Len
+1, status
) == FALSE
);
1954 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1956 // Successful match at end of string.
1958 status
= U_ZERO_ERROR
;
1959 RegexMatcher
m("A?", 0, status
); // will match zero length string.
1962 REGEX_ASSERT(m
.matches(input1Len
, status
) == TRUE
);
1965 REGEX_ASSERT(m
.matches(0, status
) == TRUE
);
1971 // lookingAt(pos, status)
1973 status
= U_ZERO_ERROR
;
1974 m1
->reset(&input2
); // "not abc"
1975 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
1976 REGEX_ASSERT(m1
->lookingAt(5, status
) == FALSE
);
1977 REGEX_ASSERT(m1
->lookingAt(3, status
) == FALSE
);
1978 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
1979 REGEX_ASSERT(m1
->lookingAt(-1, status
) == FALSE
);
1980 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1981 status
= U_ZERO_ERROR
;
1982 REGEX_ASSERT(m1
->lookingAt(input2Len
, status
) == FALSE
);
1984 REGEX_ASSERT(m1
->lookingAt(input2Len
+1, status
) == FALSE
);
1985 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1991 utext_close(&input1
);
1992 utext_close(&input2
);
1993 utext_close(&empty
);
1999 // RegexMatcher::start();
2000 // RegexMatcher::end();
2001 // RegexMatcher::groupCount();
2006 UErrorCode status
=U_ZERO_ERROR
;
2007 UText re
=UTEXT_INITIALIZER
;
2008 const char str_01234567_pat
[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
2009 utext_openUTF8(&re
, str_01234567_pat
, -1, &status
);
2011 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
2014 UText input
= UTEXT_INITIALIZER
;
2015 const char str_0123456789
[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2016 utext_openUTF8(&input
, str_0123456789
, -1, &status
);
2018 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&input
);
2020 REGEX_ASSERT(matcher
->lookingAt(status
) == TRUE
);
2021 static const int32_t matchStarts
[] = {0, 2, 4, 8};
2022 static const int32_t matchEnds
[] = {10, 8, 6, 10};
2024 for (i
=0; i
<4; i
++) {
2025 int32_t actualStart
= matcher
->start(i
, status
);
2027 if (actualStart
!= matchStarts
[i
]) {
2028 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n",
2029 __FILE__
, __LINE__
, i
, matchStarts
[i
], actualStart
);
2031 int32_t actualEnd
= matcher
->end(i
, status
);
2033 if (actualEnd
!= matchEnds
[i
]) {
2034 errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n",
2035 __FILE__
, __LINE__
, i
, matchEnds
[i
], actualEnd
);
2039 REGEX_ASSERT(matcher
->start(0, status
) == matcher
->start(status
));
2040 REGEX_ASSERT(matcher
->end(0, status
) == matcher
->end(status
));
2042 REGEX_ASSERT_FAIL(matcher
->start(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2043 REGEX_ASSERT_FAIL(matcher
->start( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2045 REGEX_ASSERT_FAIL(matcher
->start( 0, status
), U_REGEX_INVALID_STATE
);
2047 matcher
->lookingAt(status
);
2050 UText destText
= UTEXT_INITIALIZER
;
2051 utext_openUnicodeString(&destText
, &dest
, &status
);
2053 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2054 // Test shallow-clone API
2056 result
= matcher
->group((UText
*)NULL
, group_len
, status
);
2058 REGEX_ASSERT_UTEXT_UTF8(str_0123456789
, result
);
2059 utext_close(result
);
2060 result
= matcher
->group(0, &destText
, group_len
, status
);
2062 REGEX_ASSERT(result
== &destText
);
2063 REGEX_ASSERT_UTEXT_UTF8(str_0123456789
, result
);
2064 // destText is now immutable, reopen it
2065 utext_close(&destText
);
2066 utext_openUnicodeString(&destText
, &dest
, &status
);
2069 result
= matcher
->group(0, NULL
, length
, status
);
2071 REGEX_ASSERT_UTEXT_UTF8(str_0123456789
, result
);
2072 utext_close(result
);
2073 result
= matcher
->group(0, &destText
, length
, status
);
2075 REGEX_ASSERT(result
== &destText
);
2076 REGEX_ASSERT(utext_getNativeIndex(result
) == 0);
2077 REGEX_ASSERT(length
== 10);
2078 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2080 // Capture Group 1 == "234567"
2081 result
= matcher
->group(1, NULL
, length
, status
);
2083 REGEX_ASSERT(utext_getNativeIndex(result
) == 2);
2084 REGEX_ASSERT(length
== 6);
2085 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2086 utext_close(result
);
2088 result
= matcher
->group(1, &destText
, length
, status
);
2090 REGEX_ASSERT(result
== &destText
);
2091 REGEX_ASSERT(utext_getNativeIndex(result
) == 2);
2092 REGEX_ASSERT(length
== 6);
2093 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2094 utext_close(result
);
2096 // Capture Group 2 == "45"
2097 result
= matcher
->group(2, NULL
, length
, status
);
2099 REGEX_ASSERT(utext_getNativeIndex(result
) == 4);
2100 REGEX_ASSERT(length
== 2);
2101 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2102 utext_close(result
);
2104 result
= matcher
->group(2, &destText
, length
, status
);
2106 REGEX_ASSERT(result
== &destText
);
2107 REGEX_ASSERT(utext_getNativeIndex(result
) == 4);
2108 REGEX_ASSERT(length
== 2);
2109 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2110 utext_close(result
);
2112 // Capture Group 3 == "89"
2113 result
= matcher
->group(3, NULL
, length
, status
);
2115 REGEX_ASSERT(utext_getNativeIndex(result
) == 8);
2116 REGEX_ASSERT(length
== 2);
2117 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2118 utext_close(result
);
2120 result
= matcher
->group(3, &destText
, length
, status
);
2122 REGEX_ASSERT(result
== &destText
);
2123 REGEX_ASSERT(utext_getNativeIndex(result
) == 8);
2124 REGEX_ASSERT(length
== 2);
2125 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2126 utext_close(result
);
2128 // Capture Group number out of range.
2129 status
= U_ZERO_ERROR
;
2130 REGEX_ASSERT_FAIL(matcher
->group(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2131 status
= U_ZERO_ERROR
;
2132 REGEX_ASSERT_FAIL(matcher
->group( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2133 status
= U_ZERO_ERROR
;
2135 REGEX_ASSERT_FAIL(matcher
->group( 0, status
), U_REGEX_INVALID_STATE
);
2140 utext_close(&destText
);
2141 utext_close(&input
);
2151 UErrorCode status
=U_ZERO_ERROR
;
2152 UText re
=UTEXT_INITIALIZER
;
2153 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2154 utext_openUTF8(&re
, str_abc
, -1, &status
);
2156 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
2158 UText input
= UTEXT_INITIALIZER
;
2159 const char str_abcabcabc
[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2160 utext_openUTF8(&input
, str_abcabcabc
, -1, &status
);
2161 // 012345678901234567
2163 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&input
);
2165 REGEX_ASSERT(matcher
->find());
2166 REGEX_ASSERT(matcher
->start(status
) == 1);
2167 REGEX_ASSERT(matcher
->find());
2168 REGEX_ASSERT(matcher
->start(status
) == 6);
2169 REGEX_ASSERT(matcher
->find());
2170 REGEX_ASSERT(matcher
->start(status
) == 12);
2171 REGEX_ASSERT(matcher
->find() == FALSE
);
2172 REGEX_ASSERT(matcher
->find() == FALSE
);
2175 REGEX_ASSERT(matcher
->find());
2176 REGEX_ASSERT(matcher
->start(status
) == 1);
2178 REGEX_ASSERT(matcher
->find(0, status
));
2179 REGEX_ASSERT(matcher
->start(status
) == 1);
2180 REGEX_ASSERT(matcher
->find(1, status
));
2181 REGEX_ASSERT(matcher
->start(status
) == 1);
2182 REGEX_ASSERT(matcher
->find(2, status
));
2183 REGEX_ASSERT(matcher
->start(status
) == 6);
2184 REGEX_ASSERT(matcher
->find(12, status
));
2185 REGEX_ASSERT(matcher
->start(status
) == 12);
2186 REGEX_ASSERT(matcher
->find(13, status
) == FALSE
);
2187 REGEX_ASSERT(matcher
->find(16, status
) == FALSE
);
2188 REGEX_ASSERT(matcher
->find(17, status
) == FALSE
);
2189 REGEX_ASSERT_FAIL(matcher
->start(status
), U_REGEX_INVALID_STATE
);
2191 status
= U_ZERO_ERROR
;
2192 REGEX_ASSERT_FAIL(matcher
->find(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2193 status
= U_ZERO_ERROR
;
2194 REGEX_ASSERT_FAIL(matcher
->find(18, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2196 REGEX_ASSERT(matcher
->groupCount() == 0);
2201 utext_close(&input
);
2207 // find, with \G in pattern (true if at the end of a previous match).
2212 UErrorCode status
=U_ZERO_ERROR
;
2213 UText re
=UTEXT_INITIALIZER
;
2214 const char str_Gabcabc
[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2215 utext_openUTF8(&re
, str_Gabcabc
, -1, &status
);
2217 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
2220 UText input
= UTEXT_INITIALIZER
;
2221 const char str_abcabcabc
[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2222 utext_openUTF8(&input
, str_abcabcabc
, -1, &status
);
2223 // 012345678901234567
2225 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&input
);
2227 REGEX_ASSERT(matcher
->find());
2228 REGEX_ASSERT(matcher
->start(status
) == 0);
2229 REGEX_ASSERT(matcher
->start(1, status
) == -1);
2230 REGEX_ASSERT(matcher
->start(2, status
) == 1);
2232 REGEX_ASSERT(matcher
->find());
2233 REGEX_ASSERT(matcher
->start(status
) == 4);
2234 REGEX_ASSERT(matcher
->start(1, status
) == 4);
2235 REGEX_ASSERT(matcher
->start(2, status
) == -1);
2241 utext_close(&input
);
2246 // find with zero length matches, match position should bump ahead
2247 // to prevent loops.
2251 UErrorCode status
=U_ZERO_ERROR
;
2252 RegexMatcher
m("(?= ?)", 0, status
); // This pattern will zero-length matches anywhere,
2253 // using an always-true look-ahead.
2255 UText s
= UTEXT_INITIALIZER
;
2256 utext_openUTF8(&s
, " ", -1, &status
);
2259 if (m
.find() == FALSE
) {
2262 REGEX_ASSERT(m
.start(status
) == i
);
2263 REGEX_ASSERT(m
.end(status
) == i
);
2267 // Check that the bump goes over characters outside the BMP OK
2268 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2269 unsigned char aboveBMP
[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2270 utext_openUTF8(&s
, (char *)aboveBMP
, -1, &status
);
2273 if (m
.find() == FALSE
) {
2276 REGEX_ASSERT(m
.start(status
) == i
);
2277 REGEX_ASSERT(m
.end(status
) == i
);
2279 REGEX_ASSERT(i
==20);
2284 // find() loop breaking test.
2285 // with pattern of /.?/, should see a series of one char matches, then a single
2286 // match of zero length at the end of the input string.
2288 UErrorCode status
=U_ZERO_ERROR
;
2289 RegexMatcher
m(".?", 0, status
);
2291 UText s
= UTEXT_INITIALIZER
;
2292 utext_openUTF8(&s
, " ", -1, &status
);
2295 if (m
.find() == FALSE
) {
2298 REGEX_ASSERT(m
.start(status
) == i
);
2299 REGEX_ASSERT(m
.end(status
) == (i
<4 ? i
+1 : i
));
2308 // Matchers with no input string behave as if they had an empty input string.
2312 UErrorCode status
= U_ZERO_ERROR
;
2313 RegexMatcher
m(".?", 0, status
);
2315 REGEX_ASSERT(m
.find());
2316 REGEX_ASSERT(m
.start(status
) == 0);
2317 REGEX_ASSERT(m
.input() == "");
2320 UErrorCode status
= U_ZERO_ERROR
;
2321 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
2322 RegexMatcher
*m
= p
->matcher(status
);
2325 REGEX_ASSERT(m
->find() == FALSE
);
2326 REGEX_ASSERT(utext_nativeLength(m
->inputText()) == 0);
2335 UErrorCode status
= U_ZERO_ERROR
;
2336 UText testPattern
= UTEXT_INITIALIZER
;
2337 UText testText
= UTEXT_INITIALIZER
;
2338 regextst_openUTF8FromInvariant(&testPattern
, ".*", -1, &status
);
2339 REGEX_VERBOSE_TEXT(&testPattern
);
2340 regextst_openUTF8FromInvariant(&testText
, "This is test data", -1, &status
);
2341 REGEX_VERBOSE_TEXT(&testText
);
2343 RegexMatcher
m(&testPattern
, &testText
, 0, status
);
2345 REGEX_ASSERT(m
.regionStart() == 0);
2346 REGEX_ASSERT(m
.regionEnd() == (int32_t)strlen("This is test data"));
2347 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2348 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2350 m
.region(2,4, status
);
2352 REGEX_ASSERT(m
.matches(status
));
2353 REGEX_ASSERT(m
.start(status
)==2);
2354 REGEX_ASSERT(m
.end(status
)==4);
2358 REGEX_ASSERT(m
.regionStart() == 0);
2359 REGEX_ASSERT(m
.regionEnd() == (int32_t)strlen("This is test data"));
2361 regextst_openUTF8FromInvariant(&testText
, "short", -1, &status
);
2362 REGEX_VERBOSE_TEXT(&testText
);
2364 REGEX_ASSERT(m
.regionStart() == 0);
2365 REGEX_ASSERT(m
.regionEnd() == (int32_t)strlen("short"));
2367 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2368 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(FALSE
));
2369 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
2370 REGEX_ASSERT(&m
== &m
.reset());
2371 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
2373 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(TRUE
));
2374 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2375 REGEX_ASSERT(&m
== &m
.reset());
2376 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2378 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2379 REGEX_ASSERT(&m
== &m
.useTransparentBounds(TRUE
));
2380 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
2381 REGEX_ASSERT(&m
== &m
.reset());
2382 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
2384 REGEX_ASSERT(&m
== &m
.useTransparentBounds(FALSE
));
2385 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2386 REGEX_ASSERT(&m
== &m
.reset());
2387 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2389 utext_close(&testText
);
2390 utext_close(&testPattern
);
2394 // hitEnd() and requireEnd()
2397 UErrorCode status
= U_ZERO_ERROR
;
2398 UText testPattern
= UTEXT_INITIALIZER
;
2399 UText testText
= UTEXT_INITIALIZER
;
2400 const char str_
[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2401 const char str_aabb
[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2402 utext_openUTF8(&testPattern
, str_
, -1, &status
);
2403 utext_openUTF8(&testText
, str_aabb
, -1, &status
);
2405 RegexMatcher
m1(&testPattern
, &testText
, 0, status
);
2406 REGEX_ASSERT(m1
.lookingAt(status
) == TRUE
);
2407 REGEX_ASSERT(m1
.hitEnd() == TRUE
);
2408 REGEX_ASSERT(m1
.requireEnd() == FALSE
);
2411 status
= U_ZERO_ERROR
;
2412 const char str_a
[] = { 0x61, 0x2a, 0x00 }; /* a* */
2413 utext_openUTF8(&testPattern
, str_a
, -1, &status
);
2414 RegexMatcher
m2(&testPattern
, &testText
, 0, status
);
2415 REGEX_ASSERT(m2
.lookingAt(status
) == TRUE
);
2416 REGEX_ASSERT(m2
.hitEnd() == FALSE
);
2417 REGEX_ASSERT(m2
.requireEnd() == FALSE
);
2420 status
= U_ZERO_ERROR
;
2421 const char str_dotstardollar
[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2422 utext_openUTF8(&testPattern
, str_dotstardollar
, -1, &status
);
2423 RegexMatcher
m3(&testPattern
, &testText
, 0, status
);
2424 REGEX_ASSERT(m3
.lookingAt(status
) == TRUE
);
2425 REGEX_ASSERT(m3
.hitEnd() == TRUE
);
2426 REGEX_ASSERT(m3
.requireEnd() == TRUE
);
2429 utext_close(&testText
);
2430 utext_close(&testPattern
);
2435 //---------------------------------------------------------------------------
2437 // API_Replace_UTF8 API test for class RegexMatcher, testing the
2438 // Replace family of functions.
2440 //---------------------------------------------------------------------------
2441 void RegexTest::API_Replace_UTF8() {
2447 UErrorCode status
=U_ZERO_ERROR
;
2449 UText re
=UTEXT_INITIALIZER
;
2450 regextst_openUTF8FromInvariant(&re
, "abc", -1, &status
);
2451 REGEX_VERBOSE_TEXT(&re
);
2452 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
2455 char data
[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2456 // 012345678901234567
2457 UText dataText
= UTEXT_INITIALIZER
;
2458 utext_openUTF8(&dataText
, data
, -1, &status
);
2460 REGEX_VERBOSE_TEXT(&dataText
);
2461 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&dataText
);
2464 // Plain vanilla matches.
2467 UText destText
= UTEXT_INITIALIZER
;
2468 utext_openUnicodeString(&destText
, &dest
, &status
);
2471 UText replText
= UTEXT_INITIALIZER
;
2473 const char str_yz
[] = { 0x79, 0x7a, 0x00 }; /* yz */
2474 utext_openUTF8(&replText
, str_yz
, -1, &status
);
2475 REGEX_VERBOSE_TEXT(&replText
);
2476 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2478 const char str_yzabcabc
[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2479 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc
, result
);
2480 utext_close(result
);
2481 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2483 REGEX_ASSERT(result
== &destText
);
2484 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc
, result
);
2486 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2488 const char str_yzyzyz
[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2489 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz
, result
);
2490 utext_close(result
);
2492 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2493 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2495 REGEX_ASSERT(result
== &destText
);
2496 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz
, result
);
2499 // Plain vanilla non-matches.
2501 const char str_abxabxabx
[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2502 utext_openUTF8(&dataText
, str_abxabxabx
, -1, &status
);
2503 matcher
->reset(&dataText
);
2505 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2507 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2508 utext_close(result
);
2509 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2511 REGEX_ASSERT(result
== &destText
);
2512 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2514 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2516 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2517 utext_close(result
);
2518 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2519 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2521 REGEX_ASSERT(result
== &destText
);
2522 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2525 // Empty source string
2527 utext_openUTF8(&dataText
, NULL
, 0, &status
);
2528 matcher
->reset(&dataText
);
2530 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2532 REGEX_ASSERT_UTEXT_UTF8("", result
);
2533 utext_close(result
);
2534 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2536 REGEX_ASSERT(result
== &destText
);
2537 REGEX_ASSERT_UTEXT_UTF8("", result
);
2539 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2541 REGEX_ASSERT_UTEXT_UTF8("", result
);
2542 utext_close(result
);
2543 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2545 REGEX_ASSERT(result
== &destText
);
2546 REGEX_ASSERT_UTEXT_UTF8("", result
);
2549 // Empty substitution string
2551 utext_openUTF8(&dataText
, data
, -1, &status
); // ".abc..abc...abc.."
2552 matcher
->reset(&dataText
);
2554 utext_openUTF8(&replText
, NULL
, 0, &status
);
2555 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2557 const char str_abcabc
[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2558 REGEX_ASSERT_UTEXT_UTF8(str_abcabc
, result
);
2559 utext_close(result
);
2560 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2562 REGEX_ASSERT(result
== &destText
);
2563 REGEX_ASSERT_UTEXT_UTF8(str_abcabc
, result
);
2565 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2567 const char str_dots
[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2568 REGEX_ASSERT_UTEXT_UTF8(str_dots
, result
);
2569 utext_close(result
);
2570 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2571 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2573 REGEX_ASSERT(result
== &destText
);
2574 REGEX_ASSERT_UTEXT_UTF8(str_dots
, result
);
2577 // match whole string
2579 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2580 utext_openUTF8(&dataText
, str_abc
, -1, &status
);
2581 matcher
->reset(&dataText
);
2583 const char str_xyz
[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2584 utext_openUTF8(&replText
, str_xyz
, -1, &status
);
2585 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2587 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2588 utext_close(result
);
2589 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2590 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2592 REGEX_ASSERT(result
== &destText
);
2593 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2595 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2597 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2598 utext_close(result
);
2599 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2600 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2602 REGEX_ASSERT(result
== &destText
);
2603 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2606 // Capture Group, simple case
2608 const char str_add
[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2609 utext_openUTF8(&re
, str_add
, -1, &status
);
2610 RegexPattern
*pat2
= RegexPattern::compile(&re
, flags
, pe
, status
);
2613 const char str_abcdefg
[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2614 utext_openUTF8(&dataText
, str_abcdefg
, -1, &status
);
2615 RegexMatcher
*matcher2
= &pat2
->matcher(status
)->reset(&dataText
);
2618 const char str_11
[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2619 utext_openUTF8(&replText
, str_11
, -1, &status
);
2620 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2622 const char str_bcbcdefg
[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2623 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg
, result
);
2624 utext_close(result
);
2625 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2626 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2628 REGEX_ASSERT(result
== &destText
);
2629 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg
, result
);
2631 const char str_v
[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2632 utext_openUTF8(&replText
, str_v
, -1, &status
);
2633 REGEX_VERBOSE_TEXT(&replText
);
2634 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2636 const char str_Thevalueof1isbcdefg
[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2637 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg
, result
);
2638 utext_close(result
);
2639 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2640 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2642 REGEX_ASSERT(result
== &destText
);
2643 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg
, result
);
2645 const char str_byitselfnogroupnumber
[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2646 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2647 0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2648 utext_openUTF8(&replText
, str_byitselfnogroupnumber
, -1, &status
);
2649 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2651 const char str_byitselfnogroupnumberdefg
[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2652 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg
, result
);
2653 utext_close(result
);
2654 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2655 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2657 REGEX_ASSERT(result
== &destText
);
2658 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg
, result
);
2660 unsigned char supplDigitChars
[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2661 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2662 // 012345678901234567890123456
2663 supplDigitChars
[22] = 0xF0;
2664 supplDigitChars
[23] = 0x9D;
2665 supplDigitChars
[24] = 0x9F;
2666 supplDigitChars
[25] = 0x8F;
2667 utext_openUTF8(&replText
, (char *)supplDigitChars
, -1, &status
);
2669 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2671 const char str_SupplementalDigit1bcdefg
[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2672 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg
, result
);
2673 utext_close(result
);
2674 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2675 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2677 REGEX_ASSERT(result
== &destText
);
2678 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg
, result
);
2679 const char str_badcapturegroupnumber5
[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5..." */
2680 utext_openUTF8(&replText
, str_badcapturegroupnumber5
, -1, &status
);
2681 REGEX_ASSERT_FAIL((result
= matcher2
->replaceFirst(&replText
, NULL
, status
)), U_INDEX_OUTOFBOUNDS_ERROR
);
2682 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2683 utext_close(result
);
2684 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2685 REGEX_ASSERT_FAIL((result
= matcher2
->replaceFirst(&replText
, &destText
, status
)), U_INDEX_OUTOFBOUNDS_ERROR
);
2686 REGEX_ASSERT(result
== &destText
);
2687 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2690 // Replacement String with \u hex escapes
2693 const char str_abc1abc2abc3
[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2694 const char str_u0043
[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2695 utext_openUTF8(&dataText
, str_abc1abc2abc3
, -1, &status
);
2696 utext_openUTF8(&replText
, str_u0043
, -1, &status
);
2697 matcher
->reset(&dataText
);
2699 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2701 const char str_C1C2C3
[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2702 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3
, result
);
2703 utext_close(result
);
2704 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2705 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2707 REGEX_ASSERT(result
== &destText
);
2708 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3
, result
);
2711 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2712 utext_openUTF8(&dataText
, str_abc
, -1, &status
);
2713 const char str_U00010000
[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2714 utext_openUTF8(&replText
, str_U00010000
, -1, &status
);
2715 matcher
->reset(&dataText
);
2717 unsigned char expected
[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2724 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2726 REGEX_ASSERT_UTEXT_UTF8((char *)expected
, result
);
2727 utext_close(result
);
2728 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2729 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2731 REGEX_ASSERT(result
== &destText
);
2732 REGEX_ASSERT_UTEXT_UTF8((char *)expected
, result
);
2734 // TODO: need more thorough testing of capture substitutions.
2739 status
= U_ZERO_ERROR
;
2740 const char str_ssee
[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2741 const char str_blah
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2742 const char str_ooh
[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2743 utext_openUTF8(&re
, str_ssee
, -1, &status
);
2744 utext_openUTF8(&dataText
, str_blah
, -1, &status
);
2745 utext_openUTF8(&replText
, str_ooh
, -1, &status
);
2747 RegexMatcher
m(&re
, 0, status
);
2750 UnicodeString result
;
2751 UText resultText
= UTEXT_INITIALIZER
;
2752 utext_openUnicodeString(&resultText
, &result
, &status
);
2754 // Multiple finds do NOT bump up the previous appendReplacement position.
2758 m
.appendReplacement(&resultText
, &replText
, status
);
2760 const char str_blah2
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2761 REGEX_ASSERT_UTEXT_UTF8(str_blah2
, &resultText
);
2763 // After a reset into the interior of a string, appendReplacement still starts at beginning.
2764 status
= U_ZERO_ERROR
;
2766 utext_openUnicodeString(&resultText
, &result
, &status
);
2767 m
.reset(10, status
);
2770 m
.appendReplacement(&resultText
, &replText
, status
);
2772 const char str_blah3
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2773 REGEX_ASSERT_UTEXT_UTF8(str_blah3
, &resultText
);
2775 // find() at interior of string, appendReplacement still starts at beginning.
2776 status
= U_ZERO_ERROR
;
2778 utext_openUnicodeString(&resultText
, &result
, &status
);
2782 m
.appendReplacement(&resultText
, &replText
, status
);
2784 const char str_blah8
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2785 REGEX_ASSERT_UTEXT_UTF8(str_blah8
, &resultText
);
2787 m
.appendTail(&resultText
, status
);
2788 const char str_blah9
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2789 REGEX_ASSERT_UTEXT_UTF8(str_blah9
, &resultText
);
2791 utext_close(&resultText
);
2799 utext_close(&dataText
);
2800 utext_close(&replText
);
2801 utext_close(&destText
);
2806 //---------------------------------------------------------------------------
2808 // API_Pattern_UTF8 Test that the API for class RegexPattern is
2809 // present and nominally working.
2811 //---------------------------------------------------------------------------
2812 void RegexTest::API_Pattern_UTF8() {
2813 RegexPattern pata
; // Test default constructor to not crash.
2816 REGEX_ASSERT(pata
== patb
);
2817 REGEX_ASSERT(pata
== pata
);
2819 UText re1
= UTEXT_INITIALIZER
;
2820 UText re2
= UTEXT_INITIALIZER
;
2821 UErrorCode status
= U_ZERO_ERROR
;
2824 const char str_abcalmz
[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2825 const char str_def
[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2826 utext_openUTF8(&re1
, str_abcalmz
, -1, &status
);
2827 utext_openUTF8(&re2
, str_def
, -1, &status
);
2829 RegexPattern
*pat1
= RegexPattern::compile(&re1
, 0, pe
, status
);
2830 RegexPattern
*pat2
= RegexPattern::compile(&re2
, 0, pe
, status
);
2832 REGEX_ASSERT(*pat1
== *pat1
);
2833 REGEX_ASSERT(*pat1
!= pata
);
2837 REGEX_ASSERT(patb
== *pat1
);
2840 RegexPattern
patc(*pat1
);
2841 REGEX_ASSERT(patc
== *pat1
);
2842 REGEX_ASSERT(patb
== patc
);
2843 REGEX_ASSERT(pat1
!= pat2
);
2845 REGEX_ASSERT(patb
!= patc
);
2846 REGEX_ASSERT(patb
== *pat2
);
2848 // Compile with no flags.
2849 RegexPattern
*pat1a
= RegexPattern::compile(&re1
, pe
, status
);
2850 REGEX_ASSERT(*pat1a
== *pat1
);
2852 REGEX_ASSERT(pat1a
->flags() == 0);
2854 // Patterns compiled with different flags should not be equal
2855 RegexPattern
*pat1b
= RegexPattern::compile(&re1
, UREGEX_CASE_INSENSITIVE
, pe
, status
);
2858 REGEX_ASSERT(*pat1b
!= *pat1a
);
2859 REGEX_ASSERT(pat1b
->flags() == UREGEX_CASE_INSENSITIVE
);
2860 REGEX_ASSERT(pat1a
->flags() == 0);
2864 RegexPattern
*pat1c
= pat1
->clone();
2865 REGEX_ASSERT(*pat1c
== *pat1
);
2866 REGEX_ASSERT(*pat1c
!= *pat2
);
2878 // Verify that a matcher created from a cloned pattern works.
2882 UErrorCode status
= U_ZERO_ERROR
;
2883 UText pattern
= UTEXT_INITIALIZER
;
2884 const char str_pL
[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2885 utext_openUTF8(&pattern
, str_pL
, -1, &status
);
2887 RegexPattern
*pSource
= RegexPattern::compile(&pattern
, 0, status
);
2888 RegexPattern
*pClone
= pSource
->clone();
2890 RegexMatcher
*mFromClone
= pClone
->matcher(status
);
2893 UText input
= UTEXT_INITIALIZER
;
2894 const char str_HelloWorld
[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2895 utext_openUTF8(&input
, str_HelloWorld
, -1, &status
);
2896 mFromClone
->reset(&input
);
2897 REGEX_ASSERT(mFromClone
->find() == TRUE
);
2898 REGEX_ASSERT(mFromClone
->group(status
) == "Hello");
2899 REGEX_ASSERT(mFromClone
->find() == TRUE
);
2900 REGEX_ASSERT(mFromClone
->group(status
) == "World");
2901 REGEX_ASSERT(mFromClone
->find() == FALSE
);
2905 utext_close(&input
);
2906 utext_close(&pattern
);
2910 // matches convenience API
2913 UErrorCode status
= U_ZERO_ERROR
;
2914 UText pattern
= UTEXT_INITIALIZER
;
2915 UText input
= UTEXT_INITIALIZER
;
2917 const char str_randominput
[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2918 utext_openUTF8(&input
, str_randominput
, -1, &status
);
2920 const char str_dotstar
[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2921 utext_openUTF8(&pattern
, str_dotstar
, -1, &status
);
2922 REGEX_ASSERT(RegexPattern::matches(&pattern
, &input
, pe
, status
) == TRUE
);
2925 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2926 utext_openUTF8(&pattern
, str_abc
, -1, &status
);
2927 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe
, status
) == FALSE
);
2930 const char str_nput
[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2931 utext_openUTF8(&pattern
, str_nput
, -1, &status
);
2932 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe
, status
) == TRUE
);
2935 utext_openUTF8(&pattern
, str_randominput
, -1, &status
);
2936 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe
, status
) == TRUE
);
2939 const char str_u
[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2940 utext_openUTF8(&pattern
, str_u
, -1, &status
);
2941 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe
, status
) == FALSE
);
2944 utext_openUTF8(&input
, str_abc
, -1, &status
);
2945 utext_openUTF8(&pattern
, str_abc
, -1, &status
);
2946 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
2947 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe
, status
) == FALSE
);
2948 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
2950 utext_close(&input
);
2951 utext_close(&pattern
);
2958 status
= U_ZERO_ERROR
;
2959 const char str_spaceplus
[] = { 0x20, 0x2b, 0x00 }; /* + */
2960 utext_openUTF8(&re1
, str_spaceplus
, -1, &status
);
2961 pat1
= RegexPattern::compile(&re1
, pe
, status
);
2963 UnicodeString fields
[10];
2966 n
= pat1
->split("Now is the time", fields
, 10, status
);
2969 REGEX_ASSERT(fields
[0]=="Now");
2970 REGEX_ASSERT(fields
[1]=="is");
2971 REGEX_ASSERT(fields
[2]=="the");
2972 REGEX_ASSERT(fields
[3]=="time");
2973 REGEX_ASSERT(fields
[4]=="");
2975 n
= pat1
->split("Now is the time", fields
, 2, status
);
2978 REGEX_ASSERT(fields
[0]=="Now");
2979 REGEX_ASSERT(fields
[1]=="is the time");
2980 REGEX_ASSERT(fields
[2]=="the"); // left over from previous test
2983 status
= U_ZERO_ERROR
;
2984 n
= pat1
->split("Now is the time", fields
, 1, status
);
2987 REGEX_ASSERT(fields
[0]=="Now is the time");
2988 REGEX_ASSERT(fields
[1]=="*");
2989 status
= U_ZERO_ERROR
;
2991 n
= pat1
->split(" Now is the time ", fields
, 10, status
);
2994 REGEX_ASSERT(fields
[0]=="");
2995 REGEX_ASSERT(fields
[1]=="Now");
2996 REGEX_ASSERT(fields
[2]=="is");
2997 REGEX_ASSERT(fields
[3]=="the");
2998 REGEX_ASSERT(fields
[4]=="time");
2999 REGEX_ASSERT(fields
[5]=="");
3000 REGEX_ASSERT(fields
[6]=="");
3003 n
= pat1
->split(" ", fields
, 10, status
);
3006 REGEX_ASSERT(fields
[0]=="");
3007 REGEX_ASSERT(fields
[1]=="");
3008 REGEX_ASSERT(fields
[2]=="*");
3011 n
= pat1
->split("", fields
, 10, status
);
3014 REGEX_ASSERT(fields
[0]=="foo");
3018 // split, with a pattern with (capture)
3019 regextst_openUTF8FromInvariant(&re1
, "<(\\w*)>", -1, &status
);
3020 pat1
= RegexPattern::compile(&re1
, pe
, status
);
3023 status
= U_ZERO_ERROR
;
3024 fields
[6] = fields
[7] = "*";
3025 n
= pat1
->split("<a>Now is <b>the time<c>", fields
, 10, status
);
3028 REGEX_ASSERT(fields
[0]=="");
3029 REGEX_ASSERT(fields
[1]=="a");
3030 REGEX_ASSERT(fields
[2]=="Now is ");
3031 REGEX_ASSERT(fields
[3]=="b");
3032 REGEX_ASSERT(fields
[4]=="the time");
3033 REGEX_ASSERT(fields
[5]=="c");
3034 REGEX_ASSERT(fields
[6]=="");
3035 REGEX_ASSERT(fields
[7]=="*");
3036 REGEX_ASSERT(status
==U_ZERO_ERROR
);
3038 fields
[6] = fields
[7] = "*";
3039 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 10, status
);
3042 REGEX_ASSERT(fields
[0]==" ");
3043 REGEX_ASSERT(fields
[1]=="a");
3044 REGEX_ASSERT(fields
[2]=="Now is ");
3045 REGEX_ASSERT(fields
[3]=="b");
3046 REGEX_ASSERT(fields
[4]=="the time");
3047 REGEX_ASSERT(fields
[5]=="c");
3048 REGEX_ASSERT(fields
[6]=="");
3049 REGEX_ASSERT(fields
[7]=="*");
3051 status
= U_ZERO_ERROR
;
3053 n
= pat1
->split(" <a>Now is <b>the time<c> ", fields
, 6, status
);
3056 REGEX_ASSERT(fields
[0]==" ");
3057 REGEX_ASSERT(fields
[1]=="a");
3058 REGEX_ASSERT(fields
[2]=="Now is ");
3059 REGEX_ASSERT(fields
[3]=="b");
3060 REGEX_ASSERT(fields
[4]=="the time");
3061 REGEX_ASSERT(fields
[5]==" ");
3062 REGEX_ASSERT(fields
[6]=="foo");
3064 status
= U_ZERO_ERROR
;
3066 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 5, status
);
3069 REGEX_ASSERT(fields
[0]==" ");
3070 REGEX_ASSERT(fields
[1]=="a");
3071 REGEX_ASSERT(fields
[2]=="Now is ");
3072 REGEX_ASSERT(fields
[3]=="b");
3073 REGEX_ASSERT(fields
[4]=="the time<c>");
3074 REGEX_ASSERT(fields
[5]=="foo");
3076 status
= U_ZERO_ERROR
;
3078 n
= pat1
->split(" <a>Now is <b>the time", fields
, 5, status
);
3081 REGEX_ASSERT(fields
[0]==" ");
3082 REGEX_ASSERT(fields
[1]=="a");
3083 REGEX_ASSERT(fields
[2]=="Now is ");
3084 REGEX_ASSERT(fields
[3]=="b");
3085 REGEX_ASSERT(fields
[4]=="the time");
3086 REGEX_ASSERT(fields
[5]=="foo");
3088 status
= U_ZERO_ERROR
;
3089 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 4, status
);
3092 REGEX_ASSERT(fields
[0]==" ");
3093 REGEX_ASSERT(fields
[1]=="a");
3094 REGEX_ASSERT(fields
[2]=="Now is ");
3095 REGEX_ASSERT(fields
[3]=="the time<c>");
3096 status
= U_ZERO_ERROR
;
3099 regextst_openUTF8FromInvariant(&re1
, "([-,])", -1, &status
);
3100 pat1
= RegexPattern::compile(&re1
, pe
, status
);
3102 n
= pat1
->split("1-10,20", fields
, 10, status
);
3105 REGEX_ASSERT(fields
[0]=="1");
3106 REGEX_ASSERT(fields
[1]=="-");
3107 REGEX_ASSERT(fields
[2]=="10");
3108 REGEX_ASSERT(fields
[3]==",");
3109 REGEX_ASSERT(fields
[4]=="20");
3114 // split of a UText based string, with library allocating output UTexts.
3117 status
= U_ZERO_ERROR
;
3118 RegexMatcher
matcher(UnicodeString("(:)"), 0, status
);
3119 UnicodeString
stringToSplit("first:second:third");
3120 UText
*textToSplit
= utext_openUnicodeString(NULL
, &stringToSplit
, &status
);
3123 UText
*splits
[10] = {NULL
};
3124 int32_t numFields
= matcher
.split(textToSplit
, splits
, UPRV_LENGTHOF(splits
), status
);
3126 REGEX_ASSERT(numFields
== 5);
3127 REGEX_ASSERT_UTEXT_INVARIANT("first", splits
[0]);
3128 REGEX_ASSERT_UTEXT_INVARIANT(":", splits
[1]);
3129 REGEX_ASSERT_UTEXT_INVARIANT("second", splits
[2]);
3130 REGEX_ASSERT_UTEXT_INVARIANT(":", splits
[3]);
3131 REGEX_ASSERT_UTEXT_INVARIANT("third", splits
[4]);
3132 REGEX_ASSERT(splits
[5] == NULL
);
3134 for (int i
=0; i
<UPRV_LENGTHOF(splits
); i
++) {
3136 utext_close(splits
[i
]);
3140 utext_close(textToSplit
);
3145 // RegexPattern::pattern() and patternText()
3147 pat1
= new RegexPattern();
3148 REGEX_ASSERT(pat1
->pattern() == "");
3149 REGEX_ASSERT_UTEXT_UTF8("", pat1
->patternText(status
));
3151 const char *helloWorldInvariant
= "(Hello, world)*";
3152 regextst_openUTF8FromInvariant(&re1
, helloWorldInvariant
, -1, &status
);
3153 pat1
= RegexPattern::compile(&re1
, pe
, status
);
3155 REGEX_ASSERT_UNISTR("(Hello, world)*", pat1
->pattern());
3156 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1
->patternText(status
));
3163 //---------------------------------------------------------------------------
3165 // Extended A more thorough check for features of regex patterns
3166 // The test cases are in a separate data file,
3167 // source/tests/testdata/regextst.txt
3168 // A description of the test data format is included in that file.
3170 //---------------------------------------------------------------------------
3173 RegexTest::getPath(char buffer
[2048], const char *filename
) {
3174 UErrorCode status
=U_ZERO_ERROR
;
3175 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
3176 if (U_FAILURE(status
)) {
3177 errln("ERROR: loadTestData() failed - %s", u_errorName(status
));
3181 strcpy(buffer
, testDataDirectory
);
3182 strcat(buffer
, filename
);
3186 void RegexTest::Extended() {
3188 const char *srcPath
;
3189 UErrorCode status
= U_ZERO_ERROR
;
3190 int32_t lineNum
= 0;
3193 // Open and read the test data file.
3195 srcPath
=getPath(tdd
, "regextst.txt");
3197 return; /* something went wrong, error already output */
3201 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, "utf-8", status
);
3202 if (U_FAILURE(status
)) {
3203 return; /* something went wrong, error already output */
3207 // Put the test data into a UnicodeString
3209 UnicodeString
testString(FALSE
, testData
, len
);
3211 RegexMatcher
quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status
);
3212 RegexMatcher
commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status
);
3213 RegexMatcher
flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status
);
3215 RegexMatcher
lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString
, 0, status
);
3216 UnicodeString testPattern
; // The pattern for test from the test file.
3217 UnicodeString testFlags
; // the flags for a test.
3218 UnicodeString matchString
; // The marked up string to be used as input
3220 if (U_FAILURE(status
)){
3221 dataerrln("Construct RegexMatcher() error - %s", u_errorName(status
));
3227 // Loop over the test data file, once per line.
3229 while (lineMat
.find()) {
3231 if (U_FAILURE(status
)) {
3232 errln("%s:%d: ICU Error \"%s\"", srcPath
, lineNum
, u_errorName(status
));
3235 status
= U_ZERO_ERROR
;
3236 UnicodeString testLine
= lineMat
.group(1, status
);
3237 if (testLine
.length() == 0) {
3242 // Parse the test line. Skip blank and comment only lines.
3243 // Separate out the three main fields - pattern, flags, target.
3246 commentMat
.reset(testLine
);
3247 if (commentMat
.lookingAt(status
)) {
3248 // This line is a comment, or blank.
3253 // Pull out the pattern field, remove it from the test file line.
3255 quotedStuffMat
.reset(testLine
);
3256 if (quotedStuffMat
.lookingAt(status
)) {
3257 testPattern
= quotedStuffMat
.group(2, status
);
3258 testLine
.remove(0, quotedStuffMat
.end(0, status
));
3260 errln("Bad pattern (missing quotes?) at %s:%d", srcPath
, lineNum
);
3266 // Pull out the flags from the test file line.
3268 flagsMat
.reset(testLine
);
3269 flagsMat
.lookingAt(status
); // Will always match, possibly an empty string.
3270 testFlags
= flagsMat
.group(1, status
);
3271 if (flagsMat
.group(2, status
).length() > 0) {
3272 errln("Bad Match flag at line %d. Scanning %c\n",
3273 lineNum
, flagsMat
.group(2, status
).charAt(0));
3276 testLine
.remove(0, flagsMat
.end(0, status
));
3279 // Pull out the match string, as a whole.
3280 // We'll process the <tags> later.
3282 quotedStuffMat
.reset(testLine
);
3283 if (quotedStuffMat
.lookingAt(status
)) {
3284 matchString
= quotedStuffMat
.group(2, status
);
3285 testLine
.remove(0, quotedStuffMat
.end(0, status
));
3287 errln("Bad match string at test file line %d", lineNum
);
3292 // The only thing left from the input line should be an optional trailing comment.
3294 commentMat
.reset(testLine
);
3295 if (commentMat
.lookingAt(status
) == FALSE
) {
3296 errln("Line %d: unexpected characters at end of test line.", lineNum
);
3303 regex_find(testPattern
, testFlags
, matchString
, srcPath
, lineNum
);
3312 //---------------------------------------------------------------------------
3314 // regex_find(pattern, flags, inputString, lineNumber)
3316 // Function to run a single test from the Extended (data driven) tests.
3317 // See file test/testdata/regextst.txt for a description of the
3318 // pattern and inputString fields, and the allowed flags.
3319 // lineNumber is the source line in regextst.txt of the test.
3321 //---------------------------------------------------------------------------
3324 // Set a value into a UVector at position specified by a decimal number in
3325 // a UnicodeString. This is a utility function needed by the actual test function,
3327 static void set(UVector
&vec
, int32_t val
, UnicodeString index
) {
3328 UErrorCode status
=U_ZERO_ERROR
;
3330 for (int32_t i
=0; i
<index
.length(); i
++) {
3331 int32_t d
=u_charDigitValue(index
.charAt(i
));
3335 while (vec
.size()<idx
+1) {vec
.addElement(-1, status
);}
3336 vec
.setElementAt(val
, idx
);
3339 static void setInt(UVector
&vec
, int32_t val
, int32_t idx
) {
3340 UErrorCode status
=U_ZERO_ERROR
;
3341 while (vec
.size()<idx
+1) {vec
.addElement(-1, status
);}
3342 vec
.setElementAt(val
, idx
);
3345 static UBool
utextOffsetToNative(UText
*utext
, int32_t unistrOffset
, int32_t& nativeIndex
)
3347 UBool couldFind
= TRUE
;
3348 UTEXT_SETNATIVEINDEX(utext
, 0);
3350 while (i
< unistrOffset
) {
3351 UChar32 c
= UTEXT_NEXT32(utext
);
3352 if (c
!= U_SENTINEL
) {
3359 nativeIndex
= (int32_t)UTEXT_GETNATIVEINDEX(utext
);
3364 void RegexTest::regex_find(const UnicodeString
&pattern
,
3365 const UnicodeString
&flags
,
3366 const UnicodeString
&inputString
,
3367 const char *srcPath
,
3369 UnicodeString unEscapedInput
;
3370 UnicodeString deTaggedInput
;
3372 int32_t patternUTF8Length
, inputUTF8Length
;
3373 char *patternChars
= NULL
, *inputChars
= NULL
;
3374 UText patternText
= UTEXT_INITIALIZER
;
3375 UText inputText
= UTEXT_INITIALIZER
;
3376 UConverter
*UTF8Converter
= NULL
;
3378 UErrorCode status
= U_ZERO_ERROR
;
3380 RegexPattern
*parsePat
= NULL
;
3381 RegexMatcher
*parseMatcher
= NULL
;
3382 RegexPattern
*callerPattern
= NULL
, *UTF8Pattern
= NULL
;
3383 RegexMatcher
*matcher
= NULL
, *UTF8Matcher
= NULL
;
3384 UVector
groupStarts(status
);
3385 UVector
groupEnds(status
);
3386 UVector
groupStartsUTF8(status
);
3387 UVector
groupEndsUTF8(status
);
3388 UBool isMatch
= FALSE
, isUTF8Match
= FALSE
;
3389 UBool failed
= FALSE
;
3392 UBool useMatchesFunc
= FALSE
;
3393 UBool useLookingAtFunc
= FALSE
;
3394 int32_t regionStart
= -1;
3395 int32_t regionEnd
= -1;
3396 int32_t regionStartUTF8
= -1;
3397 int32_t regionEndUTF8
= -1;
3401 // Compile the caller's pattern
3403 uint32_t bflags
= 0;
3404 if (flags
.indexOf((UChar
)0x69) >= 0) { // 'i' flag
3405 bflags
|= UREGEX_CASE_INSENSITIVE
;
3407 if (flags
.indexOf((UChar
)0x78) >= 0) { // 'x' flag
3408 bflags
|= UREGEX_COMMENTS
;
3410 if (flags
.indexOf((UChar
)0x73) >= 0) { // 's' flag
3411 bflags
|= UREGEX_DOTALL
;
3413 if (flags
.indexOf((UChar
)0x6d) >= 0) { // 'm' flag
3414 bflags
|= UREGEX_MULTILINE
;
3417 if (flags
.indexOf((UChar
)0x65) >= 0) { // 'e' flag
3418 bflags
|= UREGEX_ERROR_ON_UNKNOWN_ESCAPES
;
3420 if (flags
.indexOf((UChar
)0x44) >= 0) { // 'D' flag
3421 bflags
|= UREGEX_UNIX_LINES
;
3423 if (flags
.indexOf((UChar
)0x51) >= 0) { // 'Q' flag
3424 bflags
|= UREGEX_LITERAL
;
3428 callerPattern
= RegexPattern::compile(pattern
, bflags
, pe
, status
);
3429 if (status
!= U_ZERO_ERROR
) {
3430 #if UCONFIG_NO_BREAK_ITERATION==1
3431 // 'v' test flag means that the test pattern should not compile if ICU was configured
3432 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3433 if (flags
.indexOf((UChar
)0x76) >= 0 /*'v'*/ && status
== U_UNSUPPORTED_ERROR
) {
3434 goto cleanupAndReturn
;
3437 if (flags
.indexOf((UChar
)0x45) >= 0) { // flags contain 'E'
3438 // Expected pattern compilation error.
3439 if (flags
.indexOf((UChar
)0x64) >= 0) { // flags contain 'd'
3440 logln("Pattern Compile returns \"%s\"", u_errorName(status
));
3442 goto cleanupAndReturn
;
3444 // Unexpected pattern compilation error.
3445 dataerrln("Line %d: error %s compiling pattern.", line
, u_errorName(status
));
3446 goto cleanupAndReturn
;
3450 UTF8Converter
= ucnv_open("UTF8", &status
);
3451 ucnv_setFromUCallBack(UTF8Converter
, UCNV_FROM_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &status
);
3453 patternUTF8Length
= pattern
.extract(NULL
, 0, UTF8Converter
, status
);
3454 status
= U_ZERO_ERROR
; // buffer overflow
3455 patternChars
= new char[patternUTF8Length
+1];
3456 pattern
.extract(patternChars
, patternUTF8Length
+1, UTF8Converter
, status
);
3457 utext_openUTF8(&patternText
, patternChars
, patternUTF8Length
, &status
);
3459 if (status
== U_ZERO_ERROR
) {
3460 UTF8Pattern
= RegexPattern::compile(&patternText
, bflags
, pe
, status
);
3462 if (status
!= U_ZERO_ERROR
) {
3463 #if UCONFIG_NO_BREAK_ITERATION==1
3464 // 'v' test flag means that the test pattern should not compile if ICU was configured
3465 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3466 if (flags
.indexOf((UChar
)0x76) >= 0 /*'v'*/ && status
== U_UNSUPPORTED_ERROR
) {
3467 goto cleanupAndReturn
;
3470 if (flags
.indexOf((UChar
)0x45) >= 0) { // flags contain 'E'
3471 // Expected pattern compilation error.
3472 if (flags
.indexOf((UChar
)0x64) >= 0) { // flags contain 'd'
3473 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status
));
3475 goto cleanupAndReturn
;
3477 // Unexpected pattern compilation error.
3478 errln("Line %d: error %s compiling pattern. (UTF8)", line
, u_errorName(status
));
3479 goto cleanupAndReturn
;
3484 if (UTF8Pattern
== NULL
) {
3485 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3486 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath
, line
);
3487 status
= U_ZERO_ERROR
;
3490 if (flags
.indexOf((UChar
)0x64) >= 0) { // 'd' flag
3491 callerPattern
->dumpPattern();
3494 if (flags
.indexOf((UChar
)0x45) >= 0) { // 'E' flag
3495 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath
, line
);
3496 goto cleanupAndReturn
;
3501 // Number of times find() should be called on the test string, default to 1
3504 for (i
=2; i
<=9; i
++) {
3505 if (flags
.indexOf((UChar
)(0x30 + i
)) >= 0) { // digit flag
3506 if (numFinds
!= 1) {
3507 errln("Line %d: more than one digit flag. Scanning %d.", line
, i
);
3508 goto cleanupAndReturn
;
3514 // 'M' flag. Use matches() instead of find()
3515 if (flags
.indexOf((UChar
)0x4d) >= 0) {
3516 useMatchesFunc
= TRUE
;
3518 if (flags
.indexOf((UChar
)0x4c) >= 0) {
3519 useLookingAtFunc
= TRUE
;
3523 // Find the tags in the input data, remove them, and record the group boundary
3526 parsePat
= RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe
, status
);
3527 REGEX_CHECK_STATUS_L(line
);
3529 unEscapedInput
= inputString
.unescape();
3530 parseMatcher
= parsePat
->matcher(unEscapedInput
, status
);
3531 REGEX_CHECK_STATUS_L(line
);
3532 while(parseMatcher
->find()) {
3533 parseMatcher
->appendReplacement(deTaggedInput
, "", status
);
3535 UnicodeString groupNum
= parseMatcher
->group(2, status
);
3536 if (groupNum
== "r") {
3537 // <r> or </r>, a region specification within the string
3538 if (parseMatcher
->group(1, status
) == "/") {
3539 regionEnd
= deTaggedInput
.length();
3541 regionStart
= deTaggedInput
.length();
3544 // <digits> or </digits>, a group match boundary tag.
3545 if (parseMatcher
->group(1, status
) == "/") {
3546 set(groupEnds
, deTaggedInput
.length(), groupNum
);
3548 set(groupStarts
, deTaggedInput
.length(), groupNum
);
3552 parseMatcher
->appendTail(deTaggedInput
);
3553 REGEX_ASSERT_L(groupStarts
.size() == groupEnds
.size(), line
);
3554 if ((regionStart
>=0 || regionEnd
>=0) && (regionStart
<0 || regionStart
>regionEnd
)) {
3555 errln("mismatched <r> tags");
3557 goto cleanupAndReturn
;
3561 // Configure the matcher according to the flags specified with this test.
3563 matcher
= callerPattern
->matcher(deTaggedInput
, status
);
3564 REGEX_CHECK_STATUS_L(line
);
3565 if (flags
.indexOf((UChar
)0x74) >= 0) { // 't' trace flag
3566 matcher
->setTrace(TRUE
);
3569 if (UTF8Pattern
!= NULL
) {
3570 inputUTF8Length
= deTaggedInput
.extract(NULL
, 0, UTF8Converter
, status
);
3571 status
= U_ZERO_ERROR
; // buffer overflow
3572 inputChars
= new char[inputUTF8Length
+1];
3573 deTaggedInput
.extract(inputChars
, inputUTF8Length
+1, UTF8Converter
, status
);
3574 utext_openUTF8(&inputText
, inputChars
, inputUTF8Length
, &status
);
3576 if (status
== U_ZERO_ERROR
) {
3577 UTF8Matcher
= &UTF8Pattern
->matcher(status
)->reset(&inputText
);
3578 REGEX_CHECK_STATUS_L(line
);
3581 if (UTF8Matcher
== NULL
) {
3582 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3583 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath
, line
);
3584 status
= U_ZERO_ERROR
;
3589 // Generate native indices for UTF8 versions of region and capture group info
3591 if (UTF8Matcher
!= NULL
) {
3592 if (regionStart
>=0) (void) utextOffsetToNative(&inputText
, regionStart
, regionStartUTF8
);
3593 if (regionEnd
>=0) (void) utextOffsetToNative(&inputText
, regionEnd
, regionEndUTF8
);
3595 // Fill out the native index UVector info.
3596 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3597 for (i
=0; i
<groupStarts
.size(); i
++) {
3598 int32_t start
= groupStarts
.elementAti(i
);
3599 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3602 if (!utextOffsetToNative(&inputText
, start
, startUTF8
)) {
3603 errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line
, i
, start
);
3605 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3607 setInt(groupStartsUTF8
, startUTF8
, i
);
3610 int32_t end
= groupEnds
.elementAti(i
);
3611 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3614 if (!utextOffsetToNative(&inputText
, end
, endUTF8
)) {
3615 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line
, i
, end
);
3617 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3619 setInt(groupEndsUTF8
, endUTF8
, i
);
3624 if (regionStart
>=0) {
3625 matcher
->region(regionStart
, regionEnd
, status
);
3626 REGEX_CHECK_STATUS_L(line
);
3627 if (UTF8Matcher
!= NULL
) {
3628 UTF8Matcher
->region(regionStartUTF8
, regionEndUTF8
, status
);
3629 REGEX_CHECK_STATUS_L(line
);
3632 if (flags
.indexOf((UChar
)0x61) >= 0) { // 'a' anchoring bounds flag
3633 matcher
->useAnchoringBounds(FALSE
);
3634 if (UTF8Matcher
!= NULL
) {
3635 UTF8Matcher
->useAnchoringBounds(FALSE
);
3638 if (flags
.indexOf((UChar
)0x62) >= 0) { // 'b' transparent bounds flag
3639 matcher
->useTransparentBounds(TRUE
);
3640 if (UTF8Matcher
!= NULL
) {
3641 UTF8Matcher
->useTransparentBounds(TRUE
);
3648 // Do a find on the de-tagged input using the caller's pattern
3649 // TODO: error on count>1 and not find().
3650 // error on both matches() and lookingAt().
3652 for (i
=0; i
<numFinds
; i
++) {
3653 if (useMatchesFunc
) {
3654 isMatch
= matcher
->matches(status
);
3655 if (UTF8Matcher
!= NULL
) {
3656 isUTF8Match
= UTF8Matcher
->matches(status
);
3658 } else if (useLookingAtFunc
) {
3659 isMatch
= matcher
->lookingAt(status
);
3660 if (UTF8Matcher
!= NULL
) {
3661 isUTF8Match
= UTF8Matcher
->lookingAt(status
);
3664 isMatch
= matcher
->find();
3665 if (UTF8Matcher
!= NULL
) {
3666 isUTF8Match
= UTF8Matcher
->find();
3670 matcher
->setTrace(FALSE
);
3671 if (U_FAILURE(status
)) {
3672 errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status
));
3676 // Match up the groups from the find() with the groups from the tags
3679 // number of tags should match number of groups from find operation.
3680 // matcher->groupCount does not include group 0, the entire match, hence the +1.
3681 // G option in test means that capture group data is not available in the
3682 // expected results, so the check needs to be suppressed.
3683 if (isMatch
== FALSE
&& groupStarts
.size() != 0) {
3684 dataerrln("Error at line %d: Match expected, but none found.", line
);
3686 goto cleanupAndReturn
;
3687 } else if (UTF8Matcher
!= NULL
&& isUTF8Match
== FALSE
&& groupStarts
.size() != 0) {
3688 errln("Error at line %d: Match expected, but none found. (UTF8)", line
);
3690 goto cleanupAndReturn
;
3693 if (flags
.indexOf((UChar
)0x47 /*G*/) >= 0) {
3694 // Only check for match / no match. Don't check capture groups.
3695 if (isMatch
&& groupStarts
.size() == 0) {
3696 errln("Error at line %d: No match expected, but one found.", line
);
3698 } else if (UTF8Matcher
!= NULL
&& isUTF8Match
&& groupStarts
.size() == 0) {
3699 errln("Error at line %d: No match expected, but one found. (UTF8)", line
);
3702 goto cleanupAndReturn
;
3705 REGEX_CHECK_STATUS_L(line
);
3706 for (i
=0; i
<=matcher
->groupCount(); i
++) {
3707 int32_t expectedStart
= (i
>= groupStarts
.size()? -1 : groupStarts
.elementAti(i
));
3708 int32_t expectedStartUTF8
= (i
>= groupStartsUTF8
.size()? -1 : groupStartsUTF8
.elementAti(i
));
3709 if (matcher
->start(i
, status
) != expectedStart
) {
3710 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
3711 line
, i
, expectedStart
, matcher
->start(i
, status
));
3713 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3714 } else if (UTF8Matcher
!= NULL
&& UTF8Matcher
->start(i
, status
) != expectedStartUTF8
) {
3715 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
3716 line
, i
, expectedStartUTF8
, UTF8Matcher
->start(i
, status
));
3718 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3721 int32_t expectedEnd
= (i
>= groupEnds
.size()? -1 : groupEnds
.elementAti(i
));
3722 int32_t expectedEndUTF8
= (i
>= groupEndsUTF8
.size()? -1 : groupEndsUTF8
.elementAti(i
));
3723 if (matcher
->end(i
, status
) != expectedEnd
) {
3724 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
3725 line
, i
, expectedEnd
, matcher
->end(i
, status
));
3727 // Error on end position; keep going; real error is probably yet to come as group
3728 // end positions work from end of the input data towards the front.
3729 } else if (UTF8Matcher
!= NULL
&& UTF8Matcher
->end(i
, status
) != expectedEndUTF8
) {
3730 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
3731 line
, i
, expectedEndUTF8
, UTF8Matcher
->end(i
, status
));
3733 // Error on end position; keep going; real error is probably yet to come as group
3734 // end positions work from end of the input data towards the front.
3737 if ( matcher
->groupCount()+1 < groupStarts
.size()) {
3738 errln("Error at line %d: Expected %d capture groups, found %d.",
3739 line
, groupStarts
.size()-1, matcher
->groupCount());
3742 else if (UTF8Matcher
!= NULL
&& UTF8Matcher
->groupCount()+1 < groupStarts
.size()) {
3743 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3744 line
, groupStarts
.size()-1, UTF8Matcher
->groupCount());
3748 if ((flags
.indexOf((UChar
)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3749 matcher
->requireEnd() == TRUE
) {
3750 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line
);
3752 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3753 UTF8Matcher
->requireEnd() == TRUE
) {
3754 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line
);
3758 if ((flags
.indexOf((UChar
)0x79) >= 0) && // 'y' flag: RequireEnd() == true
3759 matcher
->requireEnd() == FALSE
) {
3760 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line
);
3762 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x79) >= 0) && // 'Y' flag: RequireEnd() == false
3763 UTF8Matcher
->requireEnd() == FALSE
) {
3764 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line
);
3768 if ((flags
.indexOf((UChar
)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3769 matcher
->hitEnd() == TRUE
) {
3770 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line
);
3772 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3773 UTF8Matcher
->hitEnd() == TRUE
) {
3774 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line
);
3778 if ((flags
.indexOf((UChar
)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3779 matcher
->hitEnd() == FALSE
) {
3780 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line
);
3782 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3783 UTF8Matcher
->hitEnd() == FALSE
) {
3784 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line
);
3791 infoln((UnicodeString
)"\""+pattern
+(UnicodeString
)"\" "
3792 +flags
+(UnicodeString
)" \""+inputString
+(UnicodeString
)"\"");
3793 // callerPattern->dump();
3795 delete parseMatcher
;
3800 delete callerPattern
;
3802 utext_close(&inputText
);
3803 delete[] inputChars
;
3804 utext_close(&patternText
);
3805 delete[] patternChars
;
3806 ucnv_close(UTF8Converter
);
3812 //---------------------------------------------------------------------------
3814 // Errors Check for error handling in patterns.
3816 //---------------------------------------------------------------------------
3817 void RegexTest::Errors() {
3818 // \escape sequences that aren't implemented yet.
3819 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3821 // Missing close parentheses
3822 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN
);
3823 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN
);
3824 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN
);
3826 // Extra close paren
3827 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN
);
3828 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN
);
3829 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN
);
3831 // Look-ahead, Look-behind
3832 // TODO: add tests for unbounded length look-behinds.
3833 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX
); // illegal construct
3835 // Attempt to use non-default flags
3838 UErrorCode status
= U_ZERO_ERROR
;
3839 int32_t flags
= UREGEX_CANON_EQ
|
3840 UREGEX_COMMENTS
| UREGEX_DOTALL
|
3842 RegexPattern
*pat1
= RegexPattern::compile(".*", flags
, pe
, status
);
3843 REGEX_ASSERT(status
== U_REGEX_UNIMPLEMENTED
);
3848 // Quantifiers are allowed only after something that can be quantified.
3849 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX
);
3850 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX
);
3851 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX
);
3853 // Mal-formed {min,max} quantifiers
3854 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL
);
3855 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN
);
3856 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL
);
3857 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL
);
3858 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL
);
3859 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG
);
3860 REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG
); // Overflows int during scan
3861 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG
); // Overflows regex binary format
3862 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG
);
3865 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX
);
3867 // Invalid Back Reference \0
3868 // For ICU 3.8 and earlier
3869 // For ICU versions newer than 3.8, \0 introduces an octal escape.
3871 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE
);
3876 //-------------------------------------------------------------------------------
3878 // Read a text data file, convert it to UChars, and return the data
3879 // in one big UChar * buffer, which the caller must delete.
3881 //--------------------------------------------------------------------------------
3882 UChar
*RegexTest::ReadAndConvertFile(const char *fileName
, int32_t &ulen
,
3883 const char *defEncoding
, UErrorCode
&status
) {
3884 UChar
*retPtr
= NULL
;
3885 char *fileBuf
= NULL
;
3886 UConverter
* conv
= NULL
;
3890 if (U_FAILURE(status
)) {
3897 f
= fopen(fileName
, "rb");
3899 dataerrln("Error opening test data file %s\n", fileName
);
3900 status
= U_FILE_ACCESS_ERROR
;
3909 fseek( f
, 0, SEEK_END
);
3910 fileSize
= ftell(f
);
3911 fileBuf
= new char[fileSize
];
3912 fseek(f
, 0, SEEK_SET
);
3913 amt_read
= fread(fileBuf
, 1, fileSize
, f
);
3914 if (amt_read
!= fileSize
|| fileSize
<= 0) {
3915 errln("Error reading test data file.");
3916 goto cleanUpAndReturn
;
3920 // Look for a Unicode Signature (BOM) on the data just read
3922 int32_t signatureLength
;
3923 const char * fileBufC
;
3924 const char* encoding
;
3927 encoding
= ucnv_detectUnicodeSignature(
3928 fileBuf
, fileSize
, &signatureLength
, &status
);
3929 if(encoding
!=NULL
){
3930 fileBufC
+= signatureLength
;
3931 fileSize
-= signatureLength
;
3933 encoding
= defEncoding
;
3934 if (strcmp(encoding
, "utf-8") == 0) {
3935 errln("file %s is missing its BOM", fileName
);
3940 // Open a converter to take the rule file to UTF-16
3942 conv
= ucnv_open(encoding
, &status
);
3943 if (U_FAILURE(status
)) {
3944 goto cleanUpAndReturn
;
3948 // Convert the rules to UChar.
3949 // Preflight first to determine required buffer size.
3951 ulen
= ucnv_toUChars(conv
,
3957 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
3958 // Buffer Overflow is expected from the preflight operation.
3959 status
= U_ZERO_ERROR
;
3961 retPtr
= new UChar
[ulen
+1];
3974 if (U_FAILURE(status
)) {
3975 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
3984 //-------------------------------------------------------------------------------
3986 // PerlTests - Run Perl's regular expression tests
3987 // The input file for this test is re_tests, the standard regular
3988 // expression test data distributed with the Perl source code.
3990 // Here is Perl's description of the test data file:
3992 // # The tests are in a separate file 't/op/re_tests'.
3993 // # Each line in that file is a separate test.
3994 // # There are five columns, separated by tabs.
3996 // # Column 1 contains the pattern, optionally enclosed in C<''>.
3997 // # Modifiers can be put after the closing C<'>.
3999 // # Column 2 contains the string to be matched.
4001 // # Column 3 contains the expected result:
4002 // # y expect a match
4003 // # n expect no match
4004 // # c expect an error
4005 // # B test exposes a known bug in Perl, should be skipped
4006 // # b test exposes a known bug in Perl, should be skipped if noamp
4008 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
4010 // # Column 4 contains a string, usually C<$&>.
4012 // # Column 5 contains the expected result of double-quote
4013 // # interpolating that string after the match, or start of error message.
4015 // # Column 6, if present, contains a reason why the test is skipped.
4016 // # This is printed with "skipped", for harness to pick up.
4018 // # \n in the tests are interpolated, as are variables of the form ${\w+}.
4020 // # If you want to add a regular expression test that can't be expressed
4021 // # in this format, don't add it here: put it in op/pat.t instead.
4023 // For ICU, if field 3 contains an 'i', the test will be skipped.
4024 // Such a test exposes some known incompatibility between ICU and Perl regexps.
4025 // (The i is in addition to whatever was there before.)
4027 //-------------------------------------------------------------------------------
4028 void RegexTest::PerlTests() {
4030 const char *srcPath
;
4031 UErrorCode status
= U_ZERO_ERROR
;
4035 // Open and read the test data file.
4037 srcPath
=getPath(tdd
, "re_tests.txt");
4039 return; /* something went wrong, error already output */
4043 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, "iso-8859-1", status
);
4044 if (U_FAILURE(status
)) {
4045 return; /* something went wrong, error already output */
4049 // Put the test data into a UnicodeString
4051 UnicodeString
testDataString(FALSE
, testData
, len
);
4054 // Regex to break the input file into lines, and strip the new lines.
4055 // One line per match, capture group one is the desired data.
4057 RegexPattern
* linePat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe
, status
);
4058 if (U_FAILURE(status
)) {
4059 dataerrln("RegexPattern::compile() error");
4062 RegexMatcher
* lineMat
= linePat
->matcher(testDataString
, status
);
4065 // Regex to split a test file line into fields.
4066 // There are six fields, separated by tabs.
4068 RegexPattern
* fieldPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe
, status
);
4071 // Regex to identify test patterns with flag settings, and to separate them.
4072 // Test patterns with flags look like 'pattern'i
4073 // Test patterns without flags are not quoted: pattern
4074 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4076 RegexPattern
*flagPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe
, status
);
4077 RegexMatcher
* flagMat
= flagPat
->matcher(status
);
4080 // The Perl tests reference several perl-isms, which are evaluated/substituted
4081 // in the test data. Not being perl, this must be done explicitly. Here
4082 // are string constants and REs for these constructs.
4084 UnicodeString
nulnulSrc("${nulnul}");
4085 UnicodeString
nulnul("\\u0000\\u0000", -1, US_INV
);
4086 nulnul
= nulnul
.unescape();
4088 UnicodeString
ffffSrc("${ffff}");
4089 UnicodeString
ffff("\\uffff", -1, US_INV
);
4090 ffff
= ffff
.unescape();
4092 // regexp for $-[0], $+[2], etc.
4093 RegexPattern
*groupsPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe
, status
);
4094 RegexMatcher
*groupsMat
= groupsPat
->matcher(status
);
4096 // regexp for $0, $1, $2, etc.
4097 RegexPattern
*cgPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe
, status
);
4098 RegexMatcher
*cgMat
= cgPat
->matcher(status
);
4102 // Main Loop for the Perl Tests, runs once per line from the
4105 int32_t lineNum
= 0;
4106 int32_t skippedUnimplementedCount
= 0;
4107 while (lineMat
->find()) {
4111 // Get a line, break it into its fields, do the Perl
4112 // variable substitutions.
4114 UnicodeString line
= lineMat
->group(1, status
);
4115 UnicodeString fields
[7];
4116 fieldPat
->split(line
, fields
, 7, status
);
4118 flagMat
->reset(fields
[0]);
4119 flagMat
->matches(status
);
4120 UnicodeString pattern
= flagMat
->group(2, status
);
4121 pattern
.findAndReplace("${bang}", "!");
4122 pattern
.findAndReplace(nulnulSrc
, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4123 pattern
.findAndReplace(ffffSrc
, ffff
);
4126 // Identify patterns that include match flag settings,
4127 // split off the flags, remove the extra quotes.
4129 UnicodeString flagStr
= flagMat
->group(3, status
);
4130 if (U_FAILURE(status
)) {
4131 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
4135 const UChar UChar_c
= 0x63; // Char constants for the flag letters.
4136 const UChar UChar_i
= 0x69; // (Damn the lack of Unicode support in C)
4137 const UChar UChar_m
= 0x6d;
4138 const UChar UChar_x
= 0x78;
4139 const UChar UChar_y
= 0x79;
4140 if (flagStr
.indexOf(UChar_i
) != -1) {
4141 flags
|= UREGEX_CASE_INSENSITIVE
;
4143 if (flagStr
.indexOf(UChar_m
) != -1) {
4144 flags
|= UREGEX_MULTILINE
;
4146 if (flagStr
.indexOf(UChar_x
) != -1) {
4147 flags
|= UREGEX_COMMENTS
;
4151 // Compile the test pattern.
4153 status
= U_ZERO_ERROR
;
4154 RegexPattern
*testPat
= RegexPattern::compile(pattern
, flags
, pe
, status
);
4155 if (status
== U_REGEX_UNIMPLEMENTED
) {
4157 // Test of a feature that is planned for ICU, but not yet implemented.
4159 skippedUnimplementedCount
++;
4161 status
= U_ZERO_ERROR
;
4165 if (U_FAILURE(status
)) {
4166 // Some tests are supposed to generate errors.
4167 // Only report an error for tests that are supposed to succeed.
4168 if (fields
[2].indexOf(UChar_c
) == -1 && // Compilation is not supposed to fail AND
4169 fields
[2].indexOf(UChar_i
) == -1) // it's not an accepted ICU incompatibility
4171 errln("line %d: ICU Error \"%s\"\n", lineNum
, u_errorName(status
));
4173 status
= U_ZERO_ERROR
;
4178 if (fields
[2].indexOf(UChar_i
) >= 0) {
4179 // ICU should skip this test.
4184 if (fields
[2].indexOf(UChar_c
) >= 0) {
4185 // This pattern should have caused a compilation error, but didn't.
4186 errln("line %d: Expected a pattern compile error, got success.", lineNum
);
4192 // replace the Perl variables that appear in some of the
4193 // match data strings.
4195 UnicodeString matchString
= fields
[1];
4196 matchString
.findAndReplace(nulnulSrc
, nulnul
);
4197 matchString
.findAndReplace(ffffSrc
, ffff
);
4199 // Replace any \n in the match string with an actual new-line char.
4200 // Don't do full unescape, as this unescapes more than Perl does, which
4201 // causes other spurious failures in the tests.
4202 matchString
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4207 // Run the test, check for expected match/don't match result.
4209 RegexMatcher
*testMat
= testPat
->matcher(matchString
, status
);
4210 UBool found
= testMat
->find();
4211 UBool expected
= FALSE
;
4212 if (fields
[2].indexOf(UChar_y
) >=0) {
4215 if (expected
!= found
) {
4216 errln("line %d: Expected %smatch, got %smatch",
4217 lineNum
, expected
?"":"no ", found
?"":"no " );
4221 // Don't try to check expected results if there is no match.
4222 // (Some have stuff in the expected fields)
4230 // Interpret the Perl expression from the fourth field of the data file,
4231 // building up an ICU string from the results of the ICU match.
4232 // The Perl expression will contain references to the results of
4233 // a regex match, including the matched string, capture group strings,
4234 // group starting and ending indices, etc.
4236 UnicodeString resultString
;
4237 UnicodeString perlExpr
= fields
[3];
4238 #if SUPPORT_MUTATING_INPUT_STRING
4239 groupsMat
->reset(perlExpr
);
4240 cgMat
->reset(perlExpr
);
4243 while (perlExpr
.length() > 0) {
4244 #if !SUPPORT_MUTATING_INPUT_STRING
4245 // Preferred usage. Reset after any modification to input string.
4246 groupsMat
->reset(perlExpr
);
4247 cgMat
->reset(perlExpr
);
4250 if (perlExpr
.startsWith("$&")) {
4251 resultString
.append(testMat
->group(status
));
4252 perlExpr
.remove(0, 2);
4255 else if (groupsMat
->lookingAt(status
)) {
4257 UnicodeString digitString
= groupsMat
->group(2, status
);
4259 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4260 UnicodeString plusOrMinus
= groupsMat
->group(1, status
);
4261 int32_t matchPosition
;
4262 if (plusOrMinus
.compare("+") == 0) {
4263 matchPosition
= testMat
->end(groupNum
, status
);
4265 matchPosition
= testMat
->start(groupNum
, status
);
4267 if (matchPosition
!= -1) {
4268 ICU_Utility::appendNumber(resultString
, matchPosition
);
4270 perlExpr
.remove(0, groupsMat
->end(status
));
4273 else if (cgMat
->lookingAt(status
)) {
4275 UnicodeString digitString
= cgMat
->group(1, status
);
4277 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4278 if (U_SUCCESS(status
)) {
4279 resultString
.append(testMat
->group(groupNum
, status
));
4280 status
= U_ZERO_ERROR
;
4282 perlExpr
.remove(0, cgMat
->end(status
));
4285 else if (perlExpr
.startsWith("@-")) {
4287 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4289 resultString
.append(" ");
4291 ICU_Utility::appendNumber(resultString
, testMat
->start(i
, status
));
4293 perlExpr
.remove(0, 2);
4296 else if (perlExpr
.startsWith("@+")) {
4298 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4300 resultString
.append(" ");
4302 ICU_Utility::appendNumber(resultString
, testMat
->end(i
, status
));
4304 perlExpr
.remove(0, 2);
4307 else if (perlExpr
.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4308 // or as an escaped sequence (e.g. \n)
4309 if (perlExpr
.length() > 1) {
4310 perlExpr
.remove(0, 1); // Remove the '\', but only if not last char.
4312 UChar c
= perlExpr
.charAt(0);
4314 case 'n': c
= '\n'; break;
4315 // add any other escape sequences that show up in the test expected results.
4317 resultString
.append(c
);
4318 perlExpr
.remove(0, 1);
4322 // Any characters from the perl expression that we don't explicitly
4323 // recognize before here are assumed to be literals and copied
4324 // as-is to the expected results.
4325 resultString
.append(perlExpr
.charAt(0));
4326 perlExpr
.remove(0, 1);
4329 if (U_FAILURE(status
)) {
4330 errln("Line %d: ICU Error \"%s\"", lineNum
, u_errorName(status
));
4336 // Expected Results Compare
4338 UnicodeString
expectedS(fields
[4]);
4339 expectedS
.findAndReplace(nulnulSrc
, nulnul
);
4340 expectedS
.findAndReplace(ffffSrc
, ffff
);
4341 expectedS
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4344 if (expectedS
.compare(resultString
) != 0) {
4345 err("Line %d: Incorrect perl expression results.", lineNum
);
4346 infoln((UnicodeString
)"Expected \""+expectedS
+(UnicodeString
)"\"; got \""+resultString
+(UnicodeString
)"\"");
4354 // All done. Clean up allocated stuff.
4372 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount
);
4377 //-------------------------------------------------------------------------------
4379 // PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts
4380 // (instead of using UnicodeStrings) to test the alternate engine.
4381 // The input file for this test is re_tests, the standard regular
4382 // expression test data distributed with the Perl source code.
4383 // See PerlTests() for more information.
4385 //-------------------------------------------------------------------------------
// Runs the Perl regular-expression test data (re_tests.txt) through the
// UText-based regex API. Each data line is split on tabs into fields:
//   fields[0]  pattern, optionally quoted with trailing flag letters
//   fields[1]  input string to match against
//   fields[2]  expectation letters tested below: 'y' (match expected),
//              'c' (compile error expected), 'i' (accepted ICU incompatibility)
//   fields[3]  Perl result expression to evaluate against the match
//   fields[4]  expected text of that expression
// The pattern and the input are converted to UTF-8 and wrapped in UTexts
// before being compiled and matched.
// NOTE(review): several statements of this function (e.g. the declarations of
// tdd, len, pe, flags, t and i, and the closing braces) are not visible in this
// excerpt; the comments below describe only the visible code.
4386 void RegexTest::PerlTestsUTF8() {
4388 const char *srcPath
;
4389 UErrorCode status
= U_ZERO_ERROR
;
// Converter used to produce the UTF-8 form of each pattern and input string.
4391 LocalUConverterPointer
UTF8Converter(ucnv_open("UTF-8", &status
));
// Reusable UTexts and heap buffers (with their capacities) for the UTF-8
// pattern and input; the buffers are grown on demand inside the main loop and
// released at the end of the function.
4392 UText patternText
= UTEXT_INITIALIZER
;
4393 char *patternChars
= NULL
;
4394 int32_t patternLength
;
4395 int32_t patternCapacity
= 0;
4396 UText inputText
= UTEXT_INITIALIZER
;
4397 char *inputChars
= NULL
;
4398 int32_t inputLength
;
4399 int32_t inputCapacity
= 0;
// Install the STOP from-Unicode callback on the converter.
4401 ucnv_setFromUCallBack(UTF8Converter
.getAlias(), UCNV_FROM_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &status
);
4404 // Open and read the test data file.
4406 srcPath
=getPath(tdd
, "re_tests.txt");
// (The condition guarding this early return is on a line not visible here.)
4408 return; /* something went wrong, error already output */
// The data file is read as ISO-8859-1 and converted to UChars.
4412 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, "iso-8859-1", status
);
4413 if (U_FAILURE(status
)) {
4414 return; /* something went wrong, error already output */
4418 // Put the test data into a UnicodeString
4420 UnicodeString
testDataString(FALSE
, testData
, len
);
4423 // Regex to break the input file into lines, and strip the new lines.
4424 // One line per match, capture group one is the desired data.
4426 RegexPattern
* linePat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe
, status
);
4427 if (U_FAILURE(status
)) {
4428 dataerrln("RegexPattern::compile() error");
4431 RegexMatcher
* lineMat
= linePat
->matcher(testDataString
, status
);
4434 // Regex to split a test file line into fields.
4435 // There are six fields, separated by tabs.
4437 RegexPattern
* fieldPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe
, status
);
4440 // Regex to identify test patterns with flag settings, and to separate them.
4441 // Test patterns with flags look like 'pattern'i
4442 // Test patterns without flags are not quoted: pattern
4443 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4445 RegexPattern
*flagPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe
, status
);
4446 RegexMatcher
* flagMat
= flagPat
->matcher(status
);
4449 // The Perl tests reference several perl-isms, which are evaluated/substituted
4450 // in the test data. Not being perl, this must be done explicitly. Here
4451 // are string constants and REs for these constructs.
// "${nulnul}" in the data stands for two U+0000 characters.
4453 UnicodeString
nulnulSrc("${nulnul}");
4454 UnicodeString
nulnul("\\u0000\\u0000", -1, US_INV
);
4455 nulnul
= nulnul
.unescape();
// "${ffff}" in the data stands for U+FFFF.
4457 UnicodeString
ffffSrc("${ffff}");
4458 UnicodeString
ffff("\\uffff", -1, US_INV
);
4459 ffff
= ffff
.unescape();
4461 // regexp for $-[0], $+[2], etc.
4462 RegexPattern
*groupsPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe
, status
);
4463 RegexMatcher
*groupsMat
= groupsPat
->matcher(status
);
4465 // regexp for $0, $1, $2, etc.
4466 RegexPattern
*cgPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe
, status
);
4467 RegexMatcher
*cgMat
= cgPat
->matcher(status
);
4471 // Main Loop for the Perl Tests, runs once per line from the
4474 int32_t lineNum
= 0;
4475 int32_t skippedUnimplementedCount
= 0;
4476 while (lineMat
->find()) {
4480 // Get a line, break it into its fields, do the Perl
4481 // variable substitutions.
4483 UnicodeString line
= lineMat
->group(1, status
);
4484 UnicodeString fields
[7];
4485 fieldPat
->split(line
, fields
, 7, status
);
// Separate the pattern text (group 2) from any trailing flag letters (group 3).
4487 flagMat
->reset(fields
[0]);
4488 flagMat
->matches(status
);
4489 UnicodeString pattern
= flagMat
->group(2, status
);
// In the pattern, ${nulnul} is replaced by the escape text \u0000\u0000
// (not by the two NUL characters themselves), while ${ffff} is replaced by
// the actual U+FFFF character.
4490 pattern
.findAndReplace("${bang}", "!");
4491 pattern
.findAndReplace(nulnulSrc
, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4492 pattern
.findAndReplace(ffffSrc
, ffff
);
4495 // Identify patterns that include match flag settings,
4496 // split off the flags, remove the extra quotes.
4498 UnicodeString flagStr
= flagMat
->group(3, status
);
// NOTE(review): the message below mentions ucnv_toUChars, but the status
// checked here was last written by the split/matches/group calls above;
// confirm whether the wording is intentional.
4499 if (U_FAILURE(status
)) {
4500 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
4504 const UChar UChar_c
= 0x63; // Char constants for the flag letters.
4505 const UChar UChar_i
= 0x69; // (Damn the lack of Unicode support in C)
4506 const UChar UChar_m
= 0x6d;
4507 const UChar UChar_x
= 0x78;
4508 const UChar UChar_y
= 0x79;
// Translate the Perl flag letters i, m and x into ICU regex flag bits.
4509 if (flagStr
.indexOf(UChar_i
) != -1) {
4510 flags
|= UREGEX_CASE_INSENSITIVE
;
4512 if (flagStr
.indexOf(UChar_m
) != -1) {
4513 flags
|= UREGEX_MULTILINE
;
4515 if (flagStr
.indexOf(UChar_x
) != -1) {
4516 flags
|= UREGEX_COMMENTS
;
4520 // Put the pattern in a UTF-8 UText
// The first extract() reports the required length; if the current buffer is
// too small (U_BUFFER_OVERFLOW_ERROR) it is reallocated to patternLength + 1
// chars and the extraction is repeated.
4522 status
= U_ZERO_ERROR
;
4523 patternLength
= pattern
.extract(patternChars
, patternCapacity
, UTF8Converter
.getAlias(), status
);
4524 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
4525 status
= U_ZERO_ERROR
;
4526 delete[] patternChars
;
4527 patternCapacity
= patternLength
+ 1;
4528 patternChars
= new char[patternCapacity
];
4529 pattern
.extract(patternChars
, patternCapacity
, UTF8Converter
.getAlias(), status
);
4531 utext_openUTF8(&patternText
, patternChars
, patternLength
, &status
);
4534 // Compile the test pattern.
4536 RegexPattern
*testPat
= RegexPattern::compile(&patternText
, flags
, pe
, status
);
4537 if (status
== U_REGEX_UNIMPLEMENTED
) {
4539 // Test of a feature that is planned for ICU, but not yet implemented.
4541 skippedUnimplementedCount
++;
4543 status
= U_ZERO_ERROR
;
4547 if (U_FAILURE(status
)) {
4548 // Some tests are supposed to generate errors.
4549 // Only report an error for tests that are supposed to succeed.
4550 if (fields
[2].indexOf(UChar_c
) == -1 && // Compilation is not supposed to fail AND
4551 fields
[2].indexOf(UChar_i
) == -1) // it's not an accepted ICU incompatibility
4553 errln("line %d: ICU Error \"%s\"\n", lineNum
, u_errorName(status
));
4555 status
= U_ZERO_ERROR
;
4560 if (fields
[2].indexOf(UChar_i
) >= 0) {
4561 // ICU should skip this test.
4566 if (fields
[2].indexOf(UChar_c
) >= 0) {
4567 // This pattern should have caused a compilation error, but didn't.
4568 errln("line %d: Expected a pattern compile error, got success.", lineNum
);
4575 // replace the Perl variables that appear in some of the
4576 // match data strings.
4578 UnicodeString matchString
= fields
[1];
4579 matchString
.findAndReplace(nulnulSrc
, nulnul
);
4580 matchString
.findAndReplace(ffffSrc
, ffff
);
4582 // Replace any \n in the match string with an actual new-line char.
4583 // Don't do full unescape, as this unescapes more than Perl does, which
4584 // causes other spurious failures in the tests.
4585 matchString
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4588 // Put the input in a UTF-8 UText
// Same probe-then-grow scheme as for the pattern buffer above.
4590 status
= U_ZERO_ERROR
;
4591 inputLength
= matchString
.extract(inputChars
, inputCapacity
, UTF8Converter
.getAlias(), status
);
4592 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
4593 status
= U_ZERO_ERROR
;
4594 delete[] inputChars
;
4595 inputCapacity
= inputLength
+ 1;
4596 inputChars
= new char[inputCapacity
];
4597 matchString
.extract(inputChars
, inputCapacity
, UTF8Converter
.getAlias(), status
);
4599 utext_openUTF8(&inputText
, inputChars
, inputLength
, &status
);
4602 // Run the test, check for expected match/don't match result.
// The matcher is created without input and the UTF-8 UText is attached with
// reset(); the address of reset()'s result is kept as the matcher pointer.
4604 RegexMatcher
*testMat
= &testPat
->matcher(status
)->reset(&inputText
);
4605 UBool found
= testMat
->find();
4606 UBool expected
= FALSE
;
4607 if (fields
[2].indexOf(UChar_y
) >=0) {
4610 if (expected
!= found
) {
4611 errln("line %d: Expected %smatch, got %smatch",
4612 lineNum
, expected
?"":"no ", found
?"":"no " );
4616 // Don't try to check expected results if there is no match.
4617 // (Some have stuff in the expected fields)
4625 // Interpret the Perl expression from the fourth field of the data file,
4626 // building up an ICU string from the results of the ICU match.
4627 // The Perl expression will contain references to the results of
4628 // a regex match, including the matched string, capture group strings,
4629 // group starting and ending indices, etc.
4631 UnicodeString resultString
;
4632 UnicodeString perlExpr
= fields
[3];
// Consume perlExpr from the front, one construct per iteration. The helper
// matchers are reset on every pass because perlExpr is modified each time.
4634 while (perlExpr
.length() > 0) {
4635 groupsMat
->reset(perlExpr
);
4636 cgMat
->reset(perlExpr
);
// "$&": the whole matched text.
4638 if (perlExpr
.startsWith("$&")) {
4639 resultString
.append(testMat
->group(status
));
4640 perlExpr
.remove(0, 2);
// "$-[n]" / "$+[n]": start / end position of capture group n. A position of
// -1 appends nothing.
4643 else if (groupsMat
->lookingAt(status
)) {
4645 UnicodeString digitString
= groupsMat
->group(2, status
);
4647 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4648 UnicodeString plusOrMinus
= groupsMat
->group(1, status
);
4649 int32_t matchPosition
;
4650 if (plusOrMinus
.compare("+") == 0) {
4651 matchPosition
= testMat
->end(groupNum
, status
);
4653 matchPosition
= testMat
->start(groupNum
, status
);
4655 if (matchPosition
!= -1) {
4656 ICU_Utility::appendNumber(resultString
, matchPosition
);
4658 perlExpr
.remove(0, groupsMat
->end(status
));
// "$n": text of capture group n. status is cleared right after the group()
// call, so a failure from that call is not reported.
4661 else if (cgMat
->lookingAt(status
)) {
4663 UnicodeString digitString
= cgMat
->group(1, status
);
4665 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4666 if (U_SUCCESS(status
)) {
4667 resultString
.append(testMat
->group(groupNum
, status
));
4668 status
= U_ZERO_ERROR
;
4670 perlExpr
.remove(0, cgMat
->end(status
));
// "@-": start positions of group 0 through the last capture group.
4673 else if (perlExpr
.startsWith("@-")) {
4675 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4677 resultString
.append(" ");
4679 ICU_Utility::appendNumber(resultString
, testMat
->start(i
, status
));
4681 perlExpr
.remove(0, 2);
// "@+": end positions of group 0 through the last capture group.
4684 else if (perlExpr
.startsWith("@+")) {
4686 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4688 resultString
.append(" ");
4690 ICU_Utility::appendNumber(resultString
, testMat
->end(i
, status
));
4692 perlExpr
.remove(0, 2);
4695 else if (perlExpr
.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4696 // or as an escaped sequence (e.g. \n)
4697 if (perlExpr
.length() > 1) {
4698 perlExpr
.remove(0, 1); // Remove the '\', but only if not last char.
4700 UChar c
= perlExpr
.charAt(0);
4702 case 'n': c
= '\n'; break;
4703 // add any other escape sequences that show up in the test expected results.
4705 resultString
.append(c
);
4706 perlExpr
.remove(0, 1);
4710 // Any characters from the perl expression that we don't explicitly
4711 // recognize before here are assumed to be literals and copied
4712 // as-is to the expected results.
4713 resultString
.append(perlExpr
.charAt(0));
4714 perlExpr
.remove(0, 1);
4717 if (U_FAILURE(status
)) {
4718 errln("Line %d: ICU Error \"%s\"", lineNum
, u_errorName(status
));
4724 // Expected Results Compare
// The expected text gets the same ${nulnul}, ${ffff} and \n substitutions as
// the match input before being compared with the computed result.
4726 UnicodeString
expectedS(fields
[4]);
4727 expectedS
.findAndReplace(nulnulSrc
, nulnul
);
4728 expectedS
.findAndReplace(ffffSrc
, ffff
);
4729 expectedS
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4732 if (expectedS
.compare(resultString
) != 0) {
4733 err("Line %d: Incorrect perl expression results.", lineNum
);
4734 infoln((UnicodeString
)"Expected \""+expectedS
+(UnicodeString
)"\"; got \""+resultString
+(UnicodeString
)"\"");
4742 // All done. Clean up allocated stuff.
// Close the two reusable UTexts and free the UTF-8 buffers they pointed into.
4759 utext_close(&patternText
);
4760 utext_close(&inputText
);
4762 delete [] patternChars
;
4763 delete [] inputChars
;
4766 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount
);
4771 //--------------------------------------------------------------
4773 // Bug6149 Verify limits to heap expansion for backtrack stack.
4774 // Use this pattern,
4775 // "(a?){1,8000000}"
4776 // Note: the pattern originally had an unbounded upper bound, but that case now has loop-breaking enabled.
4777 // This test is likely to be fragile, as further optimizations stop
4778 // more cases of pointless looping in the match engine.
4780 //---------------------------------------------------------------
// Matches the pattern "(a?){1,8000000}" against the input "xyz" and checks
// that the attempt ends with U_REGEX_STACK_OVERFLOW and that matches()
// returned FALSE.
// NOTE(review): `flags` is declared on a line not visible in this excerpt.
4781 void RegexTest::Bug6149() {
4782 UnicodeString
pattern("(a?){1,8000000}");
4783 UnicodeString
s("xyz");
4785 UErrorCode status
= U_ZERO_ERROR
;
4787 RegexMatcher
matcher(pattern
, s
, flags
, status
);
4788 UBool result
= false;
// REGEX_ASSERT_FAIL is defined elsewhere in this file; presumably it verifies
// that `status` ends up equal to the given error code - confirm against the macro.
4789 REGEX_ASSERT_FAIL(result
=matcher
.matches(status
), U_REGEX_STACK_OVERFLOW
);
4790 REGEX_ASSERT(result
== FALSE
);
4795 // Callbacks() Test the callback function.
4796 // When set, callbacks occur periodically during matching operations,
4797 // giving the application code the ability to abort the operation
4798 // before its normal completion.
// Context object handed to testCallBackFn through the matcher's callback
// context pointer. The visible code uses the members test, maxCalls, numCalls
// and lastSteps; their declarations are on lines not visible in this excerpt.
4801 struct callBackContext
{
// Set a new call limit and clear the call counter and the last-seen step count.
4806 void reset(int32_t max
) {maxCalls
=max
; numCalls
=0; lastSteps
=0;};
// Match callback used by RegexTest::Callbacks().
//   context  pointer to a callBackContext
//   steps    step count supplied by the caller of the callback
// Reports an error through info->test if `steps` is not exactly one more than
// the value seen on the previous call, records `steps`, and returns whether
// numCalls is still below maxCalls.
// NOTE(review): the statement that advances numCalls is not visible in this
// excerpt; presumably it sits on the elided line just before the return.
4810 static UBool U_CALLCONV
4811 testCallBackFn(const void *context
, int32_t steps
) {
4812 callBackContext
*info
= (callBackContext
*)context
;
4813 if (info
->lastSteps
+1 != steps
) {
4814 info
->test
->errln("incorrect steps in callback. Expected %d, got %d\n", info
->lastSteps
+1, steps
);
4816 info
->lastSteps
= steps
;
4818 return (info
->numCalls
< info
->maxCalls
);
// Exercises RegexMatcher::setMatchCallback() / getMatchCallback():
//   1. the getter reports NULL for both function and context when nothing is set;
//   2. the getter returns what the setter installed;
//   3. short, medium and long-running matches invoke the callback zero times,
//      some times, and until the callback stops the operation
//      (U_REGEX_STOPPED_BY_CALLER), respectively.
// NOTE(review): the lines that reset cbInfo and attach the input string to the
// matcher between the sub-tests are not visible in this excerpt.
4822 void RegexTest::Callbacks() {
4824 // Getter returns NULLs if no callback has been set
4826 // The variables that the getter will fill in.
4827 // Init to non-null values so that the action of the getter can be seen.
4828 const void *returnedContext
= &returnedContext
;
4829 URegexMatchCallback
*returnedFn
= &testCallBackFn
;
4831 UErrorCode status
= U_ZERO_ERROR
;
4832 RegexMatcher
matcher("x", 0, status
);
4834 matcher
.getMatchCallback(returnedFn
, returnedContext
, status
);
4836 REGEX_ASSERT(returnedFn
== NULL
);
4837 REGEX_ASSERT(returnedContext
== NULL
);
// Second part: install a callback and verify that the getter hands back the
// same function and context pointers.
4842 callBackContext cbInfo
= {this, 0, 0, 0};
4843 const void *returnedContext
;
4844 URegexMatchCallback
*returnedFn
;
4845 UErrorCode status
= U_ZERO_ERROR
;
4846 RegexMatcher
matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status
); // A pattern that can run long.
4848 matcher
.setMatchCallback(testCallBackFn
, &cbInfo
, status
);
4850 matcher
.getMatchCallback(returnedFn
, returnedContext
, status
);
4852 REGEX_ASSERT(returnedFn
== testCallBackFn
);
4853 REGEX_ASSERT(returnedContext
== &cbInfo
);
4855 // A short-running match shouldn't invoke the callback
4856 status
= U_ZERO_ERROR
;
4858 UnicodeString s
= "xxx";
4860 REGEX_ASSERT(matcher
.matches(status
));
4862 REGEX_ASSERT(cbInfo
.numCalls
== 0);
4864 // A medium-length match that runs long enough to invoke the
4865 // callback, but not so long that the callback aborts it.
4866 status
= U_ZERO_ERROR
;
4868 s
= "aaaaaaaaaaaaaaaaaaab";
4870 REGEX_ASSERT(matcher
.matches(status
)==FALSE
);
4872 REGEX_ASSERT(cbInfo
.numCalls
> 0);
4874 // A longer running match that the callback function will abort.
4875 status
= U_ZERO_ERROR
;
4877 s
= "aaaaaaaaaaaaaaaaaaaaaaab";
4879 REGEX_ASSERT(matcher
.matches(status
)==FALSE
);
4880 REGEX_ASSERT(status
== U_REGEX_STOPPED_BY_CALLER
);
4881 REGEX_ASSERT(cbInfo
.numCalls
== 4);
4883 // A longer running find that the callback function will abort.
4884 status
= U_ZERO_ERROR
;
4886 s
= "aaaaaaaaaaaaaaaaaaaaaaab";
4888 REGEX_ASSERT(matcher
.find(status
)==FALSE
);
4889 REGEX_ASSERT(status
== U_REGEX_STOPPED_BY_CALLER
);
4890 REGEX_ASSERT(cbInfo
.numCalls
== 4);
4898 // FindProgressCallbacks() Test the find "progress" callback function.
4899 // When set, the find progress callback will be invoked during find operations
4900 // after each return from a match attempt, giving the application the opportunity
4901 // to terminate a long-running find operation before its normal completion.
// Context object handed to testProgressCallBackFn through the matcher's
// find-progress callback context pointer. The visible code uses the members
// maxCalls, numCalls and lastIndex; the member declarations are on lines not
// visible in this excerpt.
4904 struct progressCallBackContext
{
// Set a new call limit and clear the call counter and the last reported index.
4909 void reset(int32_t max
) {maxCalls
=max
; numCalls
=0;lastIndex
=0;};
4912 // call-back function for find().
4913 // Return TRUE to continue the find().
4914 // Return FALSE to stop the find().
// Find-progress callback used by RegexTest::FindProgressCallbacks().
//   context     pointer to a progressCallBackContext
//   matchIndex  index reported by the caller; stored in info->lastIndex so the
//               test can later resume a find() from that position
// Returns whether numCalls is still below maxCalls.
// NOTE(review): the statement that advances numCalls is not visible in this
// excerpt; presumably it sits on an elided line before the assignment below.
4916 static UBool U_CALLCONV
4917 testProgressCallBackFn(const void *context
, int64_t matchIndex
) {
4918 progressCallBackContext
*info
= (progressCallBackContext
*)context
;
4920 info
->lastIndex
= matchIndex
;
4921 // info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4922 return (info
->numCalls
< info
->maxCalls
);
// Exercises RegexMatcher::setFindProgressCallback() / getFindProgressCallback():
//   1. the getter reports NULL for both function and context when nothing is set;
//   2. the getter returns what the setter installed;
//   3. find() calls that match immediately, run to completion, get interrupted
//      by the callback (U_REGEX_STOPPED_BY_CALLER), and are resumed from the
//      last index the callback recorded.
// NOTE(review): the lines that attach the input strings to the matcher, and any
// preprocessor guards around the setTrace() call, are not visible in this excerpt.
4926 void RegexTest::FindProgressCallbacks() {
4928 // Getter returns NULLs if no callback has been set
4930 // The variables that the getter will fill in.
4931 // Init to non-null values so that the action of the getter can be seen.
4932 const void *returnedContext
= &returnedContext
;
4933 URegexFindProgressCallback
*returnedFn
= &testProgressCallBackFn
;
4935 UErrorCode status
= U_ZERO_ERROR
;
4936 RegexMatcher
matcher("x", 0, status
);
4938 matcher
.getFindProgressCallback(returnedFn
, returnedContext
, status
);
4940 REGEX_ASSERT(returnedFn
== NULL
);
4941 REGEX_ASSERT(returnedContext
== NULL
);
// Second part: install a callback and verify that the getter hands back the
// same function and context pointers.
4946 progressCallBackContext cbInfo
= {this, 0, 0, 0};
4947 const void *returnedContext
;
4948 URegexFindProgressCallback
*returnedFn
;
4949 UErrorCode status
= U_ZERO_ERROR
;
4950 RegexMatcher
matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status
);
4952 matcher
.setFindProgressCallback(testProgressCallBackFn
, &cbInfo
, status
);
4954 matcher
.getFindProgressCallback(returnedFn
, returnedContext
, status
);
4956 REGEX_ASSERT(returnedFn
== testProgressCallBackFn
);
4957 REGEX_ASSERT(returnedContext
== &cbInfo
);
4959 // A find that matches on the initial position does NOT invoke the callback.
4960 status
= U_ZERO_ERROR
;
4962 UnicodeString s
= "aaxxx";
4965 matcher
.setTrace(TRUE
);
4967 REGEX_ASSERT(matcher
.find(0, status
));
4969 REGEX_ASSERT(cbInfo
.numCalls
== 0);
4971 // A medium running find() that causes matcher.find() to invoke our callback for each index,
4972 // but not so many times that we interrupt the operation.
4973 status
= U_ZERO_ERROR
;
4974 s
= "aaaaaaaaaaaaaaaaaaab";
4975 cbInfo
.reset(s
.length()); // Some upper limit for number of calls that is greater than size of our input string
4977 REGEX_ASSERT(matcher
.find(0, status
)==FALSE
);
4979 REGEX_ASSERT(cbInfo
.numCalls
> 0 && cbInfo
.numCalls
< 25);
4981 // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4982 status
= U_ZERO_ERROR
;
4983 UnicodeString s1
= "aaaaaaaaaaaaaaaaaaaaaaab";
4984 cbInfo
.reset(s1
.length() - 5); // Bail early somewhere near the end of input string
4986 REGEX_ASSERT(matcher
.find(0, status
)==FALSE
);
4987 REGEX_ASSERT(status
== U_REGEX_STOPPED_BY_CALLER
);
4988 REGEX_ASSERT(cbInfo
.numCalls
== s1
.length() - 5);
4990 // Now a match that will succeed, but after an interruption
4991 status
= U_ZERO_ERROR
;
4992 UnicodeString s2
= "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4993 cbInfo
.reset(s2
.length() - 10); // Bail early somewhere near the end of input string
4995 REGEX_ASSERT(matcher
.find(0, status
)==FALSE
);
4996 REGEX_ASSERT(status
== U_REGEX_STOPPED_BY_CALLER
);
4997 // Now retry the match from where left off
// Raise the limit without clearing lastIndex, then resume the search at the
// index recorded by the final callback invocation.
4998 cbInfo
.maxCalls
= 100; // No callback limit
4999 status
= U_ZERO_ERROR
;
5000 REGEX_ASSERT(matcher
.find(cbInfo
.lastIndex
, status
));
5008 //---------------------------------------------------------------------------
5010 // PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
5011 // UTexts. The pure-C implementation of UText
5012 // has no mutable backing stores, but we can
5013 // use UnicodeString here to test the functionality.
5015 //---------------------------------------------------------------------------
// Exercises the uregex_* C API entry points that write their result into a
// caller-supplied (pre-allocated) destination UText:
//   - uregex_getUText()          after uregex_setUText() and uregex_setText()
//   - uregex_groupUText()        for group 0, group 1 and an out-of-range group
//   - uregex_replaceFirstUText() with a match, without a match, with escapes
//   - uregex_replaceAllUText()   with and without a match
// The destination is `bufferText`, a UText opened over the UnicodeString
// `buffer`.
// NOTE(review): several statements (scope braces, the declarations of
// resultText, actual, result, length and of the per-section text1/text2
// buffers, and the uregex_close calls) are not visible in this excerpt; the
// comments below describe only the visible code.
5016 void RegexTest::PreAllocatedUTextCAPI () {
5017 UErrorCode status
= U_ZERO_ERROR
;
5018 URegularExpression
*re
;
5019 UText patternText
= UTEXT_INITIALIZER
;
5020 UnicodeString buffer
;
5021 UText bufferText
= UTEXT_INITIALIZER
;
// Mutable destination UText backed by `buffer`.
5023 utext_openUnicodeString(&bufferText
, &buffer
, &status
);
5026 * getText() and getUText()
5029 UText text1
= UTEXT_INITIALIZER
;
5030 UText text2
= UTEXT_INITIALIZER
;
5031 UChar text2Chars
[20];
5034 status
= U_ZERO_ERROR
;
5035 regextst_openUTF8FromInvariant(&text1
, "abcccd", -1, &status
);
5036 regextst_openUTF8FromInvariant(&text2
, "abcccxd", -1, &status
);
// Bound the copy by the capacity of the destination array. The previous bound,
// sizeof(text2)/2, measured the UText struct `text2`, not `text2Chars`.
5037 u_uastrncpy(text2Chars
, "abcccxd", UPRV_LENGTHOF(text2Chars
));
// text2 is re-opened here over the UChar copy of the same string.
5038 utext_openUChars(&text2
, text2Chars
, -1, &status
);
5040 regextst_openUTF8FromInvariant(&patternText
, "abc*d", -1, &status
);
5041 re
= uregex_openUText(&patternText
, 0, NULL
, &status
);
5043 /* First set a UText */
5044 uregex_setUText(re
, &text1
, &status
);
5045 resultText
= uregex_getUText(re
, &bufferText
, &status
);
// The returned UText must be the supplied destination, and its content must
// equal the input that was set.
5047 REGEX_ASSERT(resultText
== &bufferText
);
5048 utext_setNativeIndex(resultText
, 0);
5049 utext_setNativeIndex(&text1
, 0);
5050 REGEX_ASSERT(testUTextEqual(resultText
, &text1
));
// A second call is expected to give the same result.
5052 resultText
= uregex_getUText(re
, &bufferText
, &status
);
5054 REGEX_ASSERT(resultText
== &bufferText
);
5055 utext_setNativeIndex(resultText
, 0);
5056 utext_setNativeIndex(&text1
, 0);
5057 REGEX_ASSERT(testUTextEqual(resultText
, &text1
));
5059 /* Then set a UChar * */
5060 uregex_setText(re
, text2Chars
, 7, &status
);
5061 resultText
= uregex_getUText(re
, &bufferText
, &status
);
5063 REGEX_ASSERT(resultText
== &bufferText
);
5064 utext_setNativeIndex(resultText
, 0);
5065 utext_setNativeIndex(&text2
, 0);
5066 REGEX_ASSERT(testUTextEqual(resultText
, &text2
));
5069 utext_close(&text1
);
5070 utext_close(&text2
);
// Capture-group section. From here on text1 (and later text2) name UChar
// buffers declared on lines not visible in this excerpt.
5082 u_uastrncpy(text1
, "noise abc interior def, and this is off the end", UPRV_LENGTHOF(text1
));
5083 // 012345678901234567890123456789012345678901234567
5086 status
= U_ZERO_ERROR
;
5087 re
= uregex_openC("abc(.*?)def", 0, NULL
, &status
);
5090 uregex_setText(re
, text1
, -1, &status
);
5091 result
= uregex_find(re
, 0, &status
);
5092 REGEX_ASSERT(result
==TRUE
);
5094 /* Capture Group 0, the full match. Should succeed. "abc interior def" */
5095 status
= U_ZERO_ERROR
;
5096 actual
= uregex_groupUText(re
, 0, &bufferText
, &length
, &status
);
// The asserts below expect the destination to cover the whole 47-unit input,
// positioned at the start of the group, with `length` holding the group length.
5098 REGEX_ASSERT(actual
== &bufferText
);
5099 REGEX_ASSERT(utext_getNativeIndex(actual
) == 6);
5100 REGEX_ASSERT(length
== 16);
5101 REGEX_ASSERT(utext_nativeLength(actual
) == 47);
5103 /* Capture group #1. Should succeed, matching " interior ". */
5104 status
= U_ZERO_ERROR
;
5105 actual
= uregex_groupUText(re
, 1, &bufferText
, &length
, &status
);
5107 REGEX_ASSERT(actual
== &bufferText
);
5108 REGEX_ASSERT(utext_getNativeIndex(actual
) == 9); // position of " interior "
5109 REGEX_ASSERT(length
== 10);
5110 REGEX_ASSERT(utext_nativeLength(actual
) == 47);
5112 /* Capture group out of range. Error. */
5113 status
= U_ZERO_ERROR
;
5114 actual
= uregex_groupUText(re
, 2, &bufferText
, &length
, &status
);
5115 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
5116 REGEX_ASSERT(actual
== &bufferText
);
// replaceFirst section.
5127 UText replText
= UTEXT_INITIALIZER
;
5129 status
= U_ZERO_ERROR
;
5130 utext_openUnicodeString(&bufferText
, &buffer
, &status
);
5132 status
= U_ZERO_ERROR
;
5133 u_uastrncpy(text1
, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1
));
// Bound by the full capacity of text2, consistent with the text1 copy above
// (the bound used to be UPRV_LENGTHOF(text2)/2).
5134 u_uastrncpy(text2
, "No match here.", UPRV_LENGTHOF(text2
));
5135 regextst_openUTF8FromInvariant(&replText
, "<$1>", -1, &status
);
5137 re
= uregex_openC("x(.*?)x", 0, NULL
, &status
);
5140 /* Normal case, with match */
5141 uregex_setText(re
, text1
, -1, &status
);
// Empty the destination before each replace call so the result can be
// compared against the full expected string.
5143 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5145 result
= uregex_replaceFirstUText(re
, &replText
, &bufferText
, &status
);
5147 REGEX_ASSERT(result
== &bufferText
);
5148 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result
);
5150 /* No match. Text should copy to output with no changes. */
5151 uregex_setText(re
, text2
, -1, &status
);
5152 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5153 result
= uregex_replaceFirstUText(re
, &replText
, &bufferText
, &status
);
5155 REGEX_ASSERT(result
== &bufferText
);
5156 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result
);
5158 /* Unicode escapes */
5159 uregex_setText(re
, text1
, -1, &status
);
5160 regextst_openUTF8FromInvariant(&replText
, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status
);
5161 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5162 result
= uregex_replaceFirstUText(re
, &replText
, &bufferText
, &status
);
5164 REGEX_ASSERT(result
== &bufferText
);
5165 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result
);
5168 utext_close(&replText
);
// replaceAll section.
5178 UText replText
= UTEXT_INITIALIZER
;
5181 status
= U_ZERO_ERROR
;
// Array-length bounds, consistent with the other sections (these used to be
// sizeof(text1)/2 and sizeof(text2)/2).
// NOTE(review): assumes text1 and text2 are UChar arrays in this scope, as in
// the sections above - their declarations are not visible here.
5182 u_uastrncpy(text1
, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1
));
5183 u_uastrncpy(text2
, "No match here.", UPRV_LENGTHOF(text2
));
5184 regextst_openUTF8FromInvariant(&replText
, "<$1>", -1, &status
);
5186 re
= uregex_openC("x(.*?)x", 0, NULL
, &status
);
5189 /* Normal case, with match */
5190 uregex_setText(re
, text1
, -1, &status
);
5191 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5192 result
= uregex_replaceAllUText(re
, &replText
, &bufferText
, &status
);
5194 REGEX_ASSERT(result
== &bufferText
);
5195 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result
);
5197 /* No match. Text should copy to output with no changes. */
5198 uregex_setText(re
, text2
, -1, &status
);
5199 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5200 result
= uregex_replaceAllUText(re
, &replText
, &bufferText
, &status
);
5202 REGEX_ASSERT(result
== &bufferText
);
5203 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result
);
5206 utext_close(&replText
);
5211 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5212 * so we don't need to test it here.
// Release the function-level UTexts.
5215 utext_close(&bufferText
);
5216 utext_close(&patternText
);
5220 //--------------------------------------------------------------
5222 // NamedCapture Check basic named capture group functionality
5224 //--------------------------------------------------------------
// NamedCapture: exercises named capture groups through three routes visible below:
//   - RegexPattern::groupNumberFromName(), on a compiled pattern and on a copy of it;
//   - RegexMatcher::replaceAll() with $n and ${name} references in the replacement text;
//   - uregex_replaceAll() (plain C API) with the same replacement strings.
// status is reset to U_ZERO_ERROR before each individual step.
5225 void RegexTest::NamedCapture() {
5226 UErrorCode status
= U_ZERO_ERROR
;
5227 RegexPattern
*pat
= RegexPattern::compile(UnicodeString(
5228 "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status
);
5230 int32_t group
= pat
->groupNumberFromName("five", -1, status
);
5232 REGEX_ASSERT(5 == group
);
5233 group
= pat
->groupNumberFromName("three", -1, status
);
5235 REGEX_ASSERT(3 == group
);
5237 status
= U_ZERO_ERROR
;
5238 group
= pat
->groupNumberFromName(UnicodeString("six"), status
);
5240 REGEX_ASSERT(6 == group
);
// A name the pattern does not define must report U_REGEX_INVALID_CAPTURE_GROUP_NAME.
// Checked with REGEX_ASSERT, like every other expectation in this function, so that a
// mismatch is reported as a test failure (this line previously used U_ASSERT, which is
// ICU's internal debug assertion rather than a test check).
5242 status
= U_ZERO_ERROR
;
5243 group
= pat
->groupNumberFromName(UnicodeString("nosuch"), status
);
5244 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5246 status
= U_ZERO_ERROR
;
5248 // After copying a pattern, named capture should still work in the copy.
5249 RegexPattern
*copiedPat
= new RegexPattern(*pat
);
5250 REGEX_ASSERT(*copiedPat
== *pat
);
5251 delete pat
; pat
= NULL
; // Delete original, copy should have no references back to it.
5253 group
= copiedPat
->groupNumberFromName("five", -1, status
);
5255 REGEX_ASSERT(5 == group
);
5256 group
= copiedPat
->groupNumberFromName("three", -1, status
);
5258 REGEX_ASSERT(3 == group
);
5261 // ReplaceAll with named capture group.
5262 status
= U_ZERO_ERROR
;
5263 UnicodeString
text("Substitution of <<quotes>> for <<double brackets>>");
5264 RegexMatcher
*m
= new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text
, 0, status
);
5266 // m.pattern().dumpPattern();
5267 UnicodeString replacedText
= m
->replaceAll("'${mid}'", status
);
5269 REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText
);
5272 // ReplaceAll, allowed capture group numbers.
// The pattern below has three capture groups: group 1 is named "one", groups 2 and 3 are unnamed.
5273 text
= UnicodeString("abcmxyz");
5274 m
= new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text
, 0, status
);
5277 status
= U_ZERO_ERROR
;
5278 replacedText
= m
->replaceAll(UnicodeString("<$0>"), status
); // group 0, full match, is allowed.
5280 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText
);
5282 status
= U_ZERO_ERROR
;
5283 replacedText
= m
->replaceAll(UnicodeString("<$1>"), status
); // group 1 by number.
5285 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText
);
5287 status
= U_ZERO_ERROR
;
5288 replacedText
= m
->replaceAll(UnicodeString("<${one}>"), status
); // group 1 by name.
5290 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText
);
5292 status
= U_ZERO_ERROR
;
5293 replacedText
= m
->replaceAll(UnicodeString("<$2>"), status
); // group 2.
5295 REGEX_ASSERT(UnicodeString("a<x>z") == replacedText
);
5297 status
= U_ZERO_ERROR
;
5298 replacedText
= m
->replaceAll(UnicodeString("<$3>"), status
);
5300 REGEX_ASSERT(UnicodeString("a<y>z") == replacedText
);
// Only three groups exist, so a reference to group 4 is expected to fail.
5302 status
= U_ZERO_ERROR
;
5303 replacedText
= m
->replaceAll(UnicodeString("<$4>"), status
);
5304 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
5306 status
= U_ZERO_ERROR
;
5307 replacedText
= m
->replaceAll(UnicodeString("<$04>"), status
); // group 0, leading 0,
5308 REGEX_CHECK_STATUS
; // trailing out-of-range 4 passes through.
5309 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText
);
5311 status
= U_ZERO_ERROR
;
5312 replacedText
= m
->replaceAll(UnicodeString("<$000016>"), status
); // Consume leading zeroes. Don't consume digits
5313 REGEX_CHECK_STATUS
; // that push group num out of range.
5314 REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText
); // This is group 1.
5316 status
= U_ZERO_ERROR
;
5317 replacedText
= m
->replaceAll(UnicodeString("<$3$2$1${one}>"), status
);
5319 REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText
);
5321 status
= U_ZERO_ERROR
;
5322 replacedText
= m
->replaceAll(UnicodeString("$3$2$1${one}"), status
);
5324 REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText
);
// Unknown, malformed or unterminated name references: each of the next four cases
// is expected to set U_REGEX_INVALID_CAPTURE_GROUP_NAME.
5326 status
= U_ZERO_ERROR
;
5327 replacedText
= m
->replaceAll(UnicodeString("<${noSuchName}>"), status
);
5328 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5330 status
= U_ZERO_ERROR
;
5331 replacedText
= m
->replaceAll(UnicodeString("<${invalid-name}>"), status
);
5332 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5334 status
= U_ZERO_ERROR
;
5335 replacedText
= m
->replaceAll(UnicodeString("<${one"), status
);
5336 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5338 status
= U_ZERO_ERROR
;
5339 replacedText
= m
->replaceAll(UnicodeString("$not a capture group"), status
);
5340 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5344 // Repeat the above replaceAll() tests using the plain C API, which
5345 // has a separate implementation internally.
5346 // TODO: factor out the test data.
5348 status
= U_ZERO_ERROR
;
5349 URegularExpression
*re
= uregex_openC("..(?<one>m)(.)(.)", 0, NULL
, &status
);
5351 text
= UnicodeString("abcmxyz");
5352 uregex_setText(re
, text
.getBuffer(), text
.length(), &status
);
// Output buffer for uregex_replaceAll(); resultLength receives the value it returns.
5355 UChar resultBuf
[100];
5356 int32_t resultLength
;
5359 status
= U_ZERO_ERROR
;
5360 repl
= UnicodeString("<$0>");
5361 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5363 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf
, resultLength
));
5365 status
= U_ZERO_ERROR
;
5366 repl
= UnicodeString("<$1>");
5367 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5369 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf
, resultLength
));
5371 status
= U_ZERO_ERROR
;
5372 repl
= UnicodeString("<${one}>");
5373 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5375 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf
, resultLength
));
5377 status
= U_ZERO_ERROR
;
5378 repl
= UnicodeString("<$2>");
5379 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5381 REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf
, resultLength
));
5383 status
= U_ZERO_ERROR
;
5384 repl
= UnicodeString("<$3>");
5385 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5387 REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf
, resultLength
));
5389 status
= U_ZERO_ERROR
;
5390 repl
= UnicodeString("<$4>");
5391 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5392 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
5394 status
= U_ZERO_ERROR
;
5395 repl
= UnicodeString("<$04>");
5396 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5398 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf
, resultLength
));
5400 status
= U_ZERO_ERROR
;
5401 repl
= UnicodeString("<$000016>");
5402 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5404 REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf
, resultLength
));
5406 status
= U_ZERO_ERROR
;
5407 repl
= UnicodeString("<$3$2$1${one}>");
5408 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5410 REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf
, resultLength
));
5412 status
= U_ZERO_ERROR
;
5413 repl
= UnicodeString("$3$2$1${one}");
5414 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5416 REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf
, resultLength
));
5418 status
= U_ZERO_ERROR
;
5419 repl
= UnicodeString("<${noSuchName}>");
5420 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5421 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5423 status
= U_ZERO_ERROR
;
5424 repl
= UnicodeString("<${invalid-name}>");
5425 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5426 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5428 status
= U_ZERO_ERROR
;
5429 repl
= UnicodeString("<${one");
5430 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5431 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5433 status
= U_ZERO_ERROR
;
5434 repl
= UnicodeString("$not a capture group");
5435 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5436 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5441 //--------------------------------------------------------------
5443 // NamedCaptureLimits Patterns with huge numbers of named capture groups.
5444 // The point is not so much what the exact limit is,
5445 // but that a largish number doesn't hit bad non-linear performance,
5446 // and that exceeding the limit fails cleanly.
5448 //--------------------------------------------------------------
// Stress test for patterns containing very large numbers of named capture groups.
// Visible steps: append "(?<nnN>)" groups for N below goodLimit, compile, and check that
// every name "nnN" maps back to group number N; then append groups for N below failLimit
// and check that compilation reports U_REGEX_PATTERN_TOO_BIG.
5449 void RegexTest::NamedCaptureLimits() {
// NOTE(review): the condition guarding this skip message is not visible in this listing.
5451 logln("Skipping test. Runs in exhuastive mode only.");
5454 const int32_t goodLimit
= 1000000; // Pattern w this many groups builds successfully.
5455 const int32_t failLimit
= 10000000; // Pattern exceeds internal limits, fails to compile.
5457 UnicodeString pattern
;
// nn and nnbuf are declared on lines that are not visible in this listing.
5460 for (nn
=1; nn
<goodLimit
; nn
++) {
5461 sprintf(nnbuf
, "(?<nn%d>)", nn
);
5462 pattern
.append(UnicodeString(nnbuf
, -1, US_INV
));
5464 UErrorCode status
= U_ZERO_ERROR
;
5465 RegexPattern
*pat
= RegexPattern::compile(pattern
, 0, status
);
// Every generated name must resolve to the group number used to build it.
5467 for (nn
=1; nn
<goodLimit
; nn
++) {
5468 sprintf(nnbuf
, "nn%d", nn
);
5469 int32_t groupNum
= pat
->groupNumberFromName(nnbuf
, -1, status
);
5470 REGEX_ASSERT(nn
== groupNum
);
5471 if (nn
!= groupNum
) {
// Second phase: grow the pattern past the size that is expected to compile.
5478 for (nn
=1; nn
<failLimit
; nn
++) {
5479 sprintf(nnbuf
, "(?<nn%d>)", nn
);
5480 pattern
.append(UnicodeString(nnbuf
, -1, US_INV
));
5482 status
= U_ZERO_ERROR
;
5483 pat
= RegexPattern::compile(pattern
, 0, status
);
5484 REGEX_ASSERT(status
== U_REGEX_PATTERN_TOO_BIG
);
5489 //--------------------------------------------------------------
5491 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
5493 //---------------------------------------------------------------
// Compiles two large alternation patterns and runs each against the same short input.
// For both patterns the first find() is expected to succeed with the match starting at offset 0.
5494 void RegexTest::Bug7651() {
5495 UnicodeString
pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5496 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5497 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5498 UnicodeString
pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5499 UnicodeString
s("#ff @abcd This is test");
5500 RegexPattern
*REPattern
= NULL
;
5501 RegexMatcher
*REMatcher
= NULL
;
5502 UErrorCode status
= U_ZERO_ERROR
;
// First pattern. pe is passed to compile() but is declared on a line not visible in this listing.
5505 REPattern
= RegexPattern::compile(pattern1
, 0, pe
, status
);
5507 REMatcher
= REPattern
->matcher(s
, status
);
5509 REGEX_ASSERT(REMatcher
->find());
5510 REGEX_ASSERT(REMatcher
->start(status
) == 0);
5513 status
= U_ZERO_ERROR
;
// Second pattern: same checks, reusing the REPattern and REMatcher variables.
5515 REPattern
= RegexPattern::compile(pattern2
, 0, pe
, status
);
5517 REMatcher
= REPattern
->matcher(s
, status
);
5519 REGEX_ASSERT(REMatcher
->find());
5520 REGEX_ASSERT(REMatcher
->start(status
) == 0);
5523 status
= U_ZERO_ERROR
;
// Regression test for bug 7740: group() is called while status already holds a failure code.
// The call is expected to leave status unchanged and to return an empty string.
5526 void RegexTest::Bug7740() {
5527 UErrorCode status
= U_ZERO_ERROR
;
5528 UnicodeString pattern
= "(a)";
5529 UnicodeString text
= "abcdef";
5530 RegexMatcher
*m
= new RegexMatcher(pattern
, text
, 0, status
);
5532 REGEX_ASSERT(m
->lookingAt(status
));
// Deliberately enter group() with a failure status already set.
5534 status
= U_ILLEGAL_ARGUMENT_ERROR
;
5535 UnicodeString s
= m
->group(1, status
); // Bug 7740: segfault here.
5536 REGEX_ASSERT(status
== U_ILLEGAL_ARGUMENT_ERROR
);
5537 REGEX_ASSERT(s
== "");
5541 // Bug 8479: was crashing with a Bogus UnicodeString as input.
// Regression test for bug 8479: after the matcher is reset onto str, matches() is expected
// to report U_ILLEGAL_ARGUMENT_ERROR.
// NOTE(review): the declaration and setup of str are not visible in this listing.
5543 void RegexTest::Bug8479() {
5544 UErrorCode status
= U_ZERO_ERROR
;
5546 RegexMatcher
* const pMatcher
= new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL
|UREGEX_CASE_INSENSITIVE
, status
);
// The remaining steps run only when the matcher was constructed successfully.
5548 if (U_SUCCESS(status
))
5552 pMatcher
->reset(str
);
5553 status
= U_ZERO_ERROR
;
5554 pMatcher
->matches(status
);
5555 REGEX_ASSERT(status
== U_ILLEGAL_ARGUMENT_ERROR
);
// Regression test for bug 7029: splits "abc.def" using the pattern "." into an array of
// 10 UnicodeStrings and expects split() to return 8.
5562 void RegexTest::Bug7029() {
5563 UErrorCode status
= U_ZERO_ERROR
;
5565 RegexMatcher
* const pMatcher
= new RegexMatcher(".", 0, status
);
5566 UnicodeString text
= "abc.def";
5567 UnicodeString splits
[10];
// The capacity passed to split() matches the size of the splits array.
5569 int32_t numFields
= pMatcher
->split(text
, splits
, 10, status
);
5571 REGEX_ASSERT(numFields
== 8);
5576 // This test is checking for the existence of any supplemental characters that case-fold
5577 // to a bmp character.
5579 // At the time of this writing there are none. If any should appear in a subsequent release
5580 // of Unicode, the code in regular expressions compilation that determines the longest
5581 // possible match for a literal string will need to be enhanced.
5583 // See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5584 // for details on what to do in case of a failure of this test.
// Walks the set of supplementary code points (U+10000..U+10FFFF) that have the CWCF property
// and checks that the case folding of each has a length() of at least 2.
// The body is compiled only when UCONFIG_NO_NORMALIZATION is not set.
5586 void RegexTest::Bug9283() {
5587 #if !UCONFIG_NO_NORMALIZATION
5588 UErrorCode status
= U_ZERO_ERROR
;
5589 UnicodeSet
supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status
);
// index and c are declared, and the loop is terminated, on lines not visible in this listing.
5593 for (index
=0; ; index
++) {
5594 c
= supplementalsWithCaseFolding
.charAt(index
);
5598 UnicodeString cf
= UnicodeString(c
).foldCase();
5599 REGEX_ASSERT(cf
.length() >= 2);
5601 #endif /* #if !UCONFIG_NO_NORMALIZATION */
// Compares inv_next against INV_BUFSIZ and reports an error when inv_next >= INV_BUFSIZ,
// telling the maintainer to raise the #define.
// NOTE(review): the line between the errln and logln calls is not visible in this listing;
// presumably the logln is the else branch - confirm.
5605 void RegexTest::CheckInvBufSize() {
5606 if(inv_next
>=INV_BUFSIZ
) {
5607 errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5608 __FILE__
, INV_BUFSIZ
, inv_next
);
5610 logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__
, INV_BUFSIZ
, inv_next
);
// Regression test for bug 10459: opens a URegularExpression from a UText pattern, sets UText
// input, and calls uregex_group() before any match has been attempted.
5615 void RegexTest::Bug10459() {
5616 UErrorCode status
= U_ZERO_ERROR
;
5617 UnicodeString
patternString("(txt)");
5618 UnicodeString
txtString("txt");
// Both the pattern and the input are wrapped in UTexts over the UnicodeStrings above.
5620 UText
*utext_pat
= utext_openUnicodeString(NULL
, &patternString
, &status
);
5622 UText
*utext_txt
= utext_openUnicodeString(NULL
, &txtString
, &status
);
5625 URegularExpression
*icu_re
= uregex_openUText(utext_pat
, 0, NULL
, &status
);
5628 uregex_setUText(icu_re
, utext_txt
, &status
);
5631 // The bug was that calling uregex_group() before doing a matching operation
5632 // was causing a segfault. Only for Regular Expressions created from UText.
5633 // It should set an U_REGEX_INVALID_STATE.
// buf is declared on a line not visible in this listing.
5636 int32_t len
= uregex_group(icu_re
, 0, buf
, UPRV_LENGTHOF(buf
), &status
);
5637 REGEX_ASSERT(status
== U_REGEX_INVALID_STATE
);
5638 REGEX_ASSERT(len
== 0);
// Release the regular expression and both UTexts.
5640 uregex_close(icu_re
);
5641 utext_close(utext_pat
);
5642 utext_close(utext_txt
);
// For every code point cp in 0..0x10ffff, takes the case-insensitive closure of {cp} and, for each
// string in that closure, checks that findCaseInsensitiveStarters() of the string's first code
// point yields a set containing cp. A missing cp is reported with errln.
5645 void RegexTest::TestCaseInsensitiveStarters() {
5646 // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5647 // become stale because of new Unicode characters.
5648 // If it is stale, rerun the generation tool
5649 // svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
5650 // and replace the embedded data in i18n/regexcmp.cpp
5652 for (UChar32 cp
=0; cp
<=0x10ffff; cp
++) {
// NOTE(review): the bodies of this check and of the isString() check below are not visible here.
5653 if (!u_hasBinaryProperty(cp
, UCHAR_CASE_SENSITIVE
)) {
5656 UnicodeSet
s(cp
, cp
);
5657 s
.closeOver(USET_CASE_INSENSITIVE
);
5658 UnicodeSetIterator
setIter(s
);
5659 while (setIter
.next()) {
5660 if (!setIter
.isString()) {
// firstChar is the first code point of a string in the closure of cp.
5663 const UnicodeString
&str
= setIter
.getString();
5664 UChar32 firstChar
= str
.char32At(0);
5665 UnicodeSet starters
;
5666 RegexCompile::findCaseInsensitiveStarters(firstChar
, &starters
);
5667 if (!starters
.contains(cp
)) {
5668 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp
, firstChar
);
// Driver for bug 11049: runs TestCase11049() on four pattern/text pairs.
// Each call passes __LINE__, which TestCase11049() echoes in its failure messages.
5676 void RegexTest::TestBug11049() {
5677 // Original bug report: pattern with match start consisting of one of several individual characters,
5678 // and the text being matched ending with a supplementary character. find() would read past the
5679 // end of the input text when searching for potential match starting points.
5681 // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5682 // detect the bad read.
5684 TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE
, __LINE__
);
5685 TestCase11049("A|B|C", "string matches at end C", TRUE
, __LINE__
);
5687 // Test again with a pattern starting with a single character,
5688 // which takes a different code path than starting with an OR expression,
5689 // but with similar logic.
5690 TestCase11049("C", "a string \\ud800\\udc00", FALSE
, __LINE__
);
5691 TestCase11049("C", "string matches at end C", TRUE
, __LINE__
);
5694 // Run a single test case from TestBug11049(). Internal function.
// Runs find() for one pattern/text pair, first over a UTF-16 buffer and then over UTF-8.
//   pattern, data - invariant-charset strings; both are passed through UnicodeString::unescape().
//   expectMatch   - the result find() is required to return.
//   lineNumber    - caller's line number, echoed in the failure messages.
// NOTE(review): the lines that attach the UText to the matcher are not visible in this listing.
5695 void RegexTest::TestCase11049(const char *pattern
, const char *data
, UBool expectMatch
, int32_t lineNumber
) {
5696 UErrorCode status
= U_ZERO_ERROR
;
5697 UnicodeString patternString
= UnicodeString(pattern
).unescape();
5698 LocalPointer
<RegexPattern
> compiledPat(RegexPattern::compile(patternString
, 0, status
));
// exactBuffer holds exactly dataString.length() UChars, with no spare room after the text.
5700 UnicodeString dataString
= UnicodeString(data
).unescape();
5701 UChar
*exactBuffer
= new UChar
[dataString
.length()];
5702 dataString
.extract(exactBuffer
, dataString
.length(), status
);
5703 UText
*ut
= utext_openUChars(NULL
, exactBuffer
, dataString
.length(), &status
);
5705 LocalPointer
<RegexMatcher
> matcher(compiledPat
->matcher(status
));
5708 UBool result
= matcher
->find();
5709 if (result
!= expectMatch
) {
5710 errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5711 __FILE__
, lineNumber
, expectMatch
, result
, pattern
, data
);
5714 // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5715 // off-by-one on find() with match at the last code point.
5716 // Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
5717 // because string.unescape() will only shrink it.
5718 char * utf8Buffer
= new char[uprv_strlen(data
)+1];
5719 u_strToUTF8(utf8Buffer
, uprv_strlen(data
)+1, NULL
, dataString
.getBuffer(), dataString
.length(), &status
);
// The existing UText object is passed back in as the first argument and reopened over the UTF-8 text.
5721 ut
= utext_openUTF8(ut
, utf8Buffer
, -1, &status
);
5724 result
= matcher
->find();
5725 if (result
!= expectMatch
) {
5726 errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5727 __FILE__
, lineNumber
, expectMatch
, result
, pattern
, data
);
// Free both heap buffers allocated above.
5729 delete [] utf8Buffer
;
5732 delete [] exactBuffer
;
// Builds three oversized patterns; each is expected to fail compilation with
// U_REGEX_PATTERN_TOO_BIG, and any other status is reported with errln:
//   1. 8,000,000 repetitions of "()";
//   2. "(" followed by 20,000,000 repetitions of "A++" and then "){0}B++";
//   3. literal text appended until the length reaches 0x00ffffff, then "X? trailing string".
5736 void RegexTest::TestBug11371() {
// NOTE(review): the condition guarding this skip message is not visible in this listing.
5738 logln("Skipping test. Runs in exhuastive mode only.");
5741 UErrorCode status
= U_ZERO_ERROR
;
5742 UnicodeString patternString
;
5744 for (int i
=0; i
<8000000; i
++) {
5745 patternString
.append(UnicodeString("()"));
5747 LocalPointer
<RegexPattern
> compiledPat(RegexPattern::compile(patternString
, 0, status
));
5748 if (status
!= U_REGEX_PATTERN_TOO_BIG
) {
5749 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5750 __FILE__
, __LINE__
, u_errorName(status
));
// Second pattern: patternString is restarted from "(".
5753 status
= U_ZERO_ERROR
;
5754 patternString
= "(";
5755 for (int i
=0; i
<20000000; i
++) {
5756 patternString
.append(UnicodeString("A++"));
5758 patternString
.append(UnicodeString("){0}B++"));
5759 LocalPointer
<RegexPattern
> compiledPat2(RegexPattern::compile(patternString
, 0, status
));
5760 if (status
!= U_REGEX_PATTERN_TOO_BIG
) {
5761 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5762 __FILE__
, __LINE__
, u_errorName(status
));
5765 // Pattern with too much string data, such that string indexes overflow operand data field size
5766 // in compiled instruction.
5767 status
= U_ZERO_ERROR
;
5769 while (patternString
.length() < 0x00ffffff) {
5770 patternString
.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5772 patternString
.append(UnicodeString("X? trailing string"));
5773 LocalPointer
<RegexPattern
> compiledPat3(RegexPattern::compile(patternString
, 0, status
));
5774 if (status
!= U_REGEX_PATTERN_TOO_BIG
) {
5775 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5776 __FILE__
, __LINE__
, u_errorName(status
));
// Regression test for bug 11480: with pattern "(A)|(B)" matched against "A", group 2 takes no
// part in the match; uregex_group() for it must return length 0 and write only a terminator.
5780 void RegexTest::TestBug11480() {
5781 // C API, get capture group of a group that does not participate in the match.
5782 // (Returns a zero length string, with nul termination,
5783 // indistinguishable from a group with a zero length match.)
5785 UErrorCode status
= U_ZERO_ERROR
;
5786 URegularExpression
*re
= uregex_openC("(A)|(B)", 0, NULL
, &status
);
5788 UnicodeString text
= UNICODE_STRING_SIMPLE("A");
5789 uregex_setText(re
, text
.getBuffer(), text
.length(), &status
);
5791 REGEX_ASSERT(uregex_lookingAt(re
, 0, &status
));
// The first four cells are pre-filled with 13 and the output starts at buf+1, so the asserts
// below can tell which cells were written: buf[0] and buf[2] must keep 13, buf[1] must become 0.
5792 UChar buf
[10] = {(UChar
)13, (UChar
)13, (UChar
)13, (UChar
)13};
5793 int32_t length
= uregex_group(re
, 2, buf
+1, UPRV_LENGTHOF(buf
)-1, &status
);
5794 REGEX_ASSERT(length
== 0);
5795 REGEX_ASSERT(buf
[0] == 13);
5796 REGEX_ASSERT(buf
[1] == 0);
5797 REGEX_ASSERT(buf
[2] == 13);
5802 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */