1 /********************************************************************
3 * Copyright (c) 2002-2016, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
10 // ICU Regular Expressions test, part of intltest.
16 PLEASE be careful about ASCII assumptions in this test.
17 This test is one of the worst repeat offenders.
18 If you have questions, contact someone on the ICU PMC
19 who has access to an EBCDIC system.
24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
30 #include "unicode/localpointer.h"
31 #include "unicode/regex.h"
32 #include "unicode/uchar.h"
33 #include "unicode/ucnv.h"
34 #include "unicode/uniset.h"
35 #include "unicode/uregex.h"
36 #include "unicode/usetiter.h"
37 #include "unicode/ustring.h"
38 #include "unicode/utext.h"
48 #define SUPPORT_MUTATING_INPUT_STRING 0
50 //---------------------------------------------------------------------------
52 // Test class boilerplate
54 //---------------------------------------------------------------------------
// Constructor for the regular-expression test suite object.
// NOTE(review): the body is not part of this listing.
55 RegexTest::RegexTest()
// Destructor for the regular-expression test suite object.
// NOTE(review): the body is not part of this listing.
60 RegexTest::~RegexTest()
// Test dispatcher for this suite.
//
// Each case assigns the display name of the test with that index to `name`
// and, when `exec` is true, calls the matching test function.  The last
// parameter is unnamed and unused.
//
// NOTE(review): in this listing cases 0, 5, 9, 16-19 and 21 show only the name
// assignment, and no case 4 is visible; confirm against the full file that the
// corresponding test calls are present.
66 void RegexTest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* /*par*/ )
// Announce the suite when tests are actually being executed.
68 if (exec
) logln("TestSuite RegexTest: ");
71 case 0: name
= "Basic";
74 case 1: name
= "API_Match";
75 if (exec
) API_Match();
77 case 2: name
= "API_Replace";
78 if (exec
) API_Replace();
80 case 3: name
= "API_Pattern";
81 if (exec
) API_Pattern();
// What follows this guard depends on file I/O being configured in.
84 #if !UCONFIG_NO_FILE_IO
91 case 5: name
= "Errors";
94 case 6: name
= "PerlTests";
95 if (exec
) PerlTests();
97 case 7: name
= "Callbacks";
98 if (exec
) Callbacks();
100 case 8: name
= "FindProgressCallbacks";
101 if (exec
) FindProgressCallbacks();
103 case 9: name
= "Bug 6149";
// Cases 10-15: UText / UTF-8 flavoured tests and the C API pre-allocation test.
106 case 10: name
= "UTextBasic";
107 if (exec
) UTextBasic();
109 case 11: name
= "API_Match_UTF8";
110 if (exec
) API_Match_UTF8();
112 case 12: name
= "API_Replace_UTF8";
113 if (exec
) API_Replace_UTF8();
115 case 13: name
= "API_Pattern_UTF8";
116 if (exec
) API_Pattern_UTF8();
118 case 14: name
= "PerlTestsUTF8";
119 if (exec
) PerlTestsUTF8();
121 case 15: name
= "PreAllocatedUTextCAPI";
122 if (exec
) PreAllocatedUTextCAPI();
// Cases 16 onward: mostly regression tests named after bug numbers.
124 case 16: name
= "Bug 7651";
127 case 17: name
= "Bug 7740";
130 case 18: name
= "Bug 8479";
133 case 19: name
= "Bug 7029";
136 case 20: name
= "CheckInvBufSize";
137 if (exec
) CheckInvBufSize();
139 case 21: name
= "Bug 9283";
142 case 22: name
= "Bug10459";
143 if (exec
) Bug10459();
145 case 23: name
= "TestCaseInsensitiveStarters";
146 if (exec
) TestCaseInsensitiveStarters();
148 case 24: name
= "TestBug11049";
149 if (exec
) TestBug11049();
151 case 25: name
= "TestBug11371";
152 if (exec
) TestBug11371();
154 case 26: name
= "TestBug11480";
155 if (exec
) TestBug11480();
157 case 27: name
= "NamedCapture";
158 if (exec
) NamedCapture();
160 case 28: name
= "NamedCaptureLimits";
161 if (exec
) NamedCaptureLimits();
164 break; //needed to end loop
171 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
173 * @see utext_openUTF8
175 static UText
* regextst_openUTF8FromInvariant(UText
* ut
, const char *inv
, int64_t length
, UErrorCode
*status
);
177 //---------------------------------------------------------------------------
179 // Error Checking / Reporting macros used in all of the tests.
181 //---------------------------------------------------------------------------
183 static void utextToPrintable(char *buf
, int32_t bufLen
, UText
*text
) {
184 int64_t oldIndex
= utext_getNativeIndex(text
);
185 utext_setNativeIndex(text
, 0);
187 UChar32 c
= utext_next32From(text
, 0);
188 while ((c
!= U_SENTINEL
) && (bufPtr
< buf
+bufLen
)) {
189 if (0x000020<=c
&& c
<0x00007e) {
193 sprintf(bufPtr
,"U+%04X", c
);
194 bufPtr
+= strlen(bufPtr
)-1;
200 c
= UTEXT_NEXT32(text
);
203 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
204 char *ebuf
= (char*)malloc(bufLen
);
205 uprv_eastrncpy((unsigned char*)ebuf
, (const unsigned char*)buf
, bufLen
);
206 uprv_strncpy(buf
, ebuf
, bufLen
);
209 utext_setNativeIndex(text
, oldIndex
);
213 static char ASSERT_BUF
[1024];
215 const char* RegexTest::extractToAssertBuf(const UnicodeString
& message
) {
216 if(message
.length()==0) {
217 strcpy(ASSERT_BUF
, "[[empty UnicodeString]]");
220 IntlTest::prettify(message
,buf
);
221 if(buf
.length()==0) {
222 strcpy(ASSERT_BUF
, "[[escape() returned 0 chars]]");
224 buf
.extract(0, 0x7FFFFFFF, ASSERT_BUF
, sizeof(ASSERT_BUF
)-1);
225 if(ASSERT_BUF
[0]==0) {
227 for(int32_t i
=0;i
<buf
.length();i
++) {
229 sprintf(ASSERT_BUF
+strlen(ASSERT_BUF
),"\\u%02x",ch
);
234 ASSERT_BUF
[sizeof(ASSERT_BUF
)-1] = 0;
// Log the printable form of a UText (via utextToPrintable into a 200-char
// local buffer) together with the file, line and the spelling of the argument.
238 #define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,UPRV_LENGTHOF(buf),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}
// If the variable `status` in the invoking scope holds a failure, report it
// with dataerrln and return from the enclosing function.
240 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure. status=%s", \
241 __FILE__, __LINE__, u_errorName(status)); return;}}
// Report an error if expr compares equal to FALSE.  Does not return.
243 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}
// Evaluate expr against a fresh local `status` (which shadows any outer one
// for the duration of the macro) and report if the result is not errcode.
245 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
246 if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \
247 __LINE__, u_errorName(errcode), u_errorName(status));};}
// Like REGEX_CHECK_STATUS, but also prints a caller-supplied line number and
// does not return.
249 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
250 "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }}
// Like REGEX_ASSERT, but also prints a caller-supplied line number and returns
// from the enclosing function on failure.
252 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
253 errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
// Compare an invariant-character C string with a UnicodeString and report a
// mismatch.  The actual value is rendered with extractToAssertBuf.
255 // expected: const char * , restricted to invariant characters.
256 // actual: const UnicodeString &
257 #define REGEX_ASSERT_UNISTR(expected, actual) { \
258 if (UnicodeString(expected, -1, US_INV) != (actual)) { \
259 errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \
260 __FILE__, __LINE__, expected, extractToAssertBuf(actual));};}
// Compare two UTexts code point by code point, starting from native index 0
// of each.  Both texts have their native index moved by this call.
// NOTE(review): the declarations of ca/cb, the per-iteration comparison and
// the return statement are not part of this listing; presumably the result is
// TRUE only when both texts reach U_SENTINEL together - confirm.
263 static UBool
testUTextEqual(UText
*uta
, UText
*utb
) {
// Rewind both texts so the comparison always covers them in full.
266 utext_setNativeIndex(uta
, 0);
267 utext_setNativeIndex(utb
, 0);
// Fetch the next code point from each side.
269 ca
= utext_next32(uta
);
270 cb
= utext_next32(utb
);
// Keep going until the first text is exhausted.
274 } while (ca
!= U_SENTINEL
);
280 * @param expected expected text in UTF-8 (not platform) codepage
282 void RegexTest::assertUText(const char *expected
, UText
*actual
, const char *file
, int line
) {
283 UErrorCode status
= U_ZERO_ERROR
;
284 UText expectedText
= UTEXT_INITIALIZER
;
285 utext_openUTF8(&expectedText
, expected
, -1, &status
);
286 if(U_FAILURE(status
)) {
287 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file
, line
, u_errorName(status
), strlen(expected
));
290 if(utext_nativeLength(&expectedText
)==0 && (strlen(expected
)!=0)) {
291 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file
, line
, strlen(expected
));
294 utext_setNativeIndex(actual
, 0);
295 if (!testUTextEqual(&expectedText
, actual
)) {
296 char buf
[201 /*21*/];
297 char expectedBuf
[201];
298 utextToPrintable(buf
, UPRV_LENGTHOF(buf
), actual
);
299 utextToPrintable(expectedBuf
, UPRV_LENGTHOF(expectedBuf
), &expectedText
);
300 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file
, line
, expectedBuf
, (int)utext_nativeLength(&expectedText
), buf
, (int)utext_nativeLength(actual
));
302 utext_close(&expectedText
);
305 * @param expected invariant (platform local text) input
308 void RegexTest::assertUTextInvariant(const char *expected
, UText
*actual
, const char *file
, int line
) {
309 UErrorCode status
= U_ZERO_ERROR
;
310 UText expectedText
= UTEXT_INITIALIZER
;
311 regextst_openUTF8FromInvariant(&expectedText
, expected
, -1, &status
);
312 if(U_FAILURE(status
)) {
313 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file
, line
, u_errorName(status
), strlen(expected
));
316 utext_setNativeIndex(actual
, 0);
317 if (!testUTextEqual(&expectedText
, actual
)) {
318 char buf
[201 /*21*/];
319 char expectedBuf
[201];
320 utextToPrintable(buf
, UPRV_LENGTHOF(buf
), actual
);
321 utextToPrintable(expectedBuf
, UPRV_LENGTHOF(expectedBuf
), &expectedText
);
322 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file
, line
, expectedBuf
, (int)utext_nativeLength(&expectedText
), buf
, (int)utext_nativeLength(actual
));
324 utext_close(&expectedText
);
328 * Assumes utf-8 input
330 #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
332 * Assumes Invariant input
334 #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
337 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
338 * passed into utext_openUTF8. An error will be given if
339 * INV_BUFSIZ is too small. It's only used on EBCDIC systems.
342 #define INV_BUFSIZ 2048 /* increase this if too small */
344 static int64_t inv_next
=0;
346 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
347 static char inv_buf
[INV_BUFSIZ
];
350 static UText
* regextst_openUTF8FromInvariant(UText
*ut
, const char *inv
, int64_t length
, UErrorCode
*status
) {
351 if(length
==-1) length
=strlen(inv
);
352 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
354 return utext_openUTF8(ut
, inv
, length
, status
);
356 if(inv_next
+length
+1>INV_BUFSIZ
) {
357 fprintf(stderr
, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
358 __FILE__
, __LINE__
, INV_BUFSIZ
, (inv_next
+length
+1));
359 *status
= U_MEMORY_ALLOCATION_ERROR
;
363 unsigned char *buf
= (unsigned char*)inv_buf
+inv_next
;
364 uprv_aestrncpy(buf
, (const uint8_t*)inv
, length
);
368 fprintf(stderr
, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ
, inv_next
);
371 return utext_openUTF8(ut
, (const char*)buf
, length
, status
);
376 //---------------------------------------------------------------------------
378 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests
379 // for the LookingAt() and Match() functions.
382 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
384 // The expected results are UBool - TRUE or FALSE.
385 // The input text is unescaped. The pattern is not.
388 //---------------------------------------------------------------------------
390 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
392 UBool
RegexTest::doRegexLMTest(const char *pat
, const char *text
, UBool looking
, UBool match
, int32_t line
) {
393 const UnicodeString
pattern(pat
, -1, US_INV
);
394 const UnicodeString
inputText(text
, -1, US_INV
);
395 UErrorCode status
= U_ZERO_ERROR
;
397 RegexPattern
*REPattern
= NULL
;
398 RegexMatcher
*REMatcher
= NULL
;
401 UnicodeString
patString(pat
, -1, US_INV
);
402 REPattern
= RegexPattern::compile(patString
, 0, pe
, status
);
403 if (U_FAILURE(status
)) {
404 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
405 line
, u_errorName(status
));
408 if (line
==376) { REPattern
->dumpPattern();}
410 UnicodeString
inputString(inputText
);
411 UnicodeString unEscapedInput
= inputString
.unescape();
412 REMatcher
= REPattern
->matcher(unEscapedInput
, status
);
413 if (U_FAILURE(status
)) {
414 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
415 line
, u_errorName(status
));
420 actualmatch
= REMatcher
->lookingAt(status
);
421 if (U_FAILURE(status
)) {
422 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
423 line
, u_errorName(status
));
426 if (actualmatch
!= looking
) {
427 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line
);
431 status
= U_ZERO_ERROR
;
432 actualmatch
= REMatcher
->matches(status
);
433 if (U_FAILURE(status
)) {
434 errln("RegexTest failure in matches() at line %d. Status = %s\n",
435 line
, u_errorName(status
));
438 if (actualmatch
!= match
) {
439 errln("RegexTest: wrong return from matches() at line %d.\n", line
);
443 if (retVal
== FALSE
) {
444 REPattern
->dumpPattern();
453 UBool
RegexTest::doRegexLMTestUTF8(const char *pat
, const char *text
, UBool looking
, UBool match
, int32_t line
) {
454 UText pattern
= UTEXT_INITIALIZER
;
455 int32_t inputUTF8Length
;
456 char *textChars
= NULL
;
457 UText inputText
= UTEXT_INITIALIZER
;
458 UErrorCode status
= U_ZERO_ERROR
;
460 RegexPattern
*REPattern
= NULL
;
461 RegexMatcher
*REMatcher
= NULL
;
464 regextst_openUTF8FromInvariant(&pattern
, pat
, -1, &status
);
465 REPattern
= RegexPattern::compile(&pattern
, 0, pe
, status
);
466 if (U_FAILURE(status
)) {
467 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
468 line
, u_errorName(status
));
472 UnicodeString
inputString(text
, -1, US_INV
);
473 UnicodeString unEscapedInput
= inputString
.unescape();
474 LocalUConverterPointer
UTF8Converter(ucnv_open("UTF8", &status
));
475 ucnv_setFromUCallBack(UTF8Converter
.getAlias(), UCNV_FROM_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &status
);
477 inputUTF8Length
= unEscapedInput
.extract(NULL
, 0, UTF8Converter
.getAlias(), status
);
478 if (U_FAILURE(status
) && status
!= U_BUFFER_OVERFLOW_ERROR
) {
479 // UTF-8 does not allow unpaired surrogates, so this could actually happen
480 logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line
, u_errorName(status
));
481 return TRUE
; // not a failure of the Regex engine
483 status
= U_ZERO_ERROR
; // buffer overflow
484 textChars
= new char[inputUTF8Length
+1];
485 unEscapedInput
.extract(textChars
, inputUTF8Length
+1, UTF8Converter
.getAlias(), status
);
486 utext_openUTF8(&inputText
, textChars
, inputUTF8Length
, &status
);
488 REMatcher
= &REPattern
->matcher(status
)->reset(&inputText
);
489 if (U_FAILURE(status
)) {
490 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
491 line
, u_errorName(status
));
496 actualmatch
= REMatcher
->lookingAt(status
);
497 if (U_FAILURE(status
)) {
498 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
499 line
, u_errorName(status
));
502 if (actualmatch
!= looking
) {
503 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line
);
507 status
= U_ZERO_ERROR
;
508 actualmatch
= REMatcher
->matches(status
);
509 if (U_FAILURE(status
)) {
510 errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
511 line
, u_errorName(status
));
514 if (actualmatch
!= match
) {
515 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line
);
519 if (retVal
== FALSE
) {
520 REPattern
->dumpPattern();
525 utext_close(&inputText
);
526 utext_close(&pattern
);
533 //---------------------------------------------------------------------------
535 // REGEX_ERR Macro + invocation function to simplify writing tests
536 // regex tests for incorrect patterns
539 // REGEX_ERR("pattern", expected error line, column, expected status);
541 //---------------------------------------------------------------------------
542 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
544 void RegexTest::regex_err(const char *pat
, int32_t errLine
, int32_t errCol
,
545 UErrorCode expectedStatus
, int32_t line
) {
546 UnicodeString
pattern(pat
);
548 UErrorCode status
= U_ZERO_ERROR
;
550 RegexPattern
*callerPattern
= NULL
;
553 // Compile the caller's pattern
555 UnicodeString
patString(pat
);
556 callerPattern
= RegexPattern::compile(patString
, 0, pe
, status
);
557 if (status
!= expectedStatus
) {
558 dataerrln("Line %d: unexpected error %s compiling pattern.", line
, u_errorName(status
));
560 if (status
!= U_ZERO_ERROR
) {
561 if (pe
.line
!= errLine
|| pe
.offset
!= errCol
) {
562 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
563 line
, errLine
, errCol
, pe
.line
, pe
.offset
);
568 delete callerPattern
;
571 // Compile again, using a UTF-8-based UText
573 UText patternText
= UTEXT_INITIALIZER
;
574 regextst_openUTF8FromInvariant(&patternText
, pat
, -1, &status
);
575 callerPattern
= RegexPattern::compile(&patternText
, 0, pe
, status
);
576 if (status
!= expectedStatus
) {
577 dataerrln("Line %d: unexpected error %s compiling pattern.", line
, u_errorName(status
));
579 if (status
!= U_ZERO_ERROR
) {
580 if (pe
.line
!= errLine
|| pe
.offset
!= errCol
) {
581 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
582 line
, errLine
, errCol
, pe
.line
, pe
.offset
);
587 delete callerPattern
;
588 utext_close(&patternText
);
593 //---------------------------------------------------------------------------
595 // Basic Check for basic functionality of regex pattern matching.
596 // Avoid the use of REGEX_FIND test macro, which has
597 // substantial dependencies on basic Regex functionality.
599 //---------------------------------------------------------------------------
// Basic smoke test of pattern matching.  Each REGEX_TESTLM line below gives a
// pattern, an input, and the expected results of lookingAt() and matches();
// the macro runs the check through both the UnicodeString and the UTF-8 paths.
600 void RegexTest::Basic() {
604 // Debug - slide failing test cases early
608 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
// NOTE(review): the snippet below prints through printf and no matching
// delete for pattern / m is visible; presumably it is a debugging aid that is
// compiled out by preprocessor lines not shown here - confirm.
610 UErrorCode status
= U_ZERO_ERROR
;
611 RegexPattern
*pattern
;
612 pattern
= RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE
, pe
, status
);
613 pattern
->dumpPattern();
614 RegexMatcher
*m
= pattern
->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status
);
615 UBool result
= m
->find();
616 printf("result = %d\n", result
);
617 // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
618 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
625 // Pattern with parentheses
627 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE
, FALSE
);
628 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE
, TRUE
);
629 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE
, FALSE
);
// The * quantifier, applied to a group and to a single character.
634 REGEX_TESTLM("st(abc)*ring", "string", TRUE
, TRUE
);
635 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE
, TRUE
);
636 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE
, TRUE
);
637 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE
, FALSE
);
638 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE
, FALSE
);
640 REGEX_TESTLM("a*", "", TRUE
, TRUE
);
641 REGEX_TESTLM("a*", "b", TRUE
, FALSE
);
// The . metacharacter, alone and combined with *.
647 REGEX_TESTLM(".", "abc", TRUE
, FALSE
);
648 REGEX_TESTLM("...", "abc", TRUE
, TRUE
);
649 REGEX_TESTLM("....", "abc", FALSE
, FALSE
);
650 REGEX_TESTLM(".*", "abcxyz123", TRUE
, TRUE
);
651 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE
, FALSE
);
652 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE
, TRUE
);
653 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE
, TRUE
);
654 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE
, FALSE
);
657 // Patterns with * applied to chars at end of literal string
659 REGEX_TESTLM("abc*", "ab", TRUE
, TRUE
);
660 REGEX_TESTLM("abc*", "abccccc", TRUE
, TRUE
);
663 // Supplemental chars match as single chars, not a pair of surrogates.
665 REGEX_TESTLM(".", "\\U00011000", TRUE
, TRUE
);
666 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE
, TRUE
);
667 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE
, FALSE
);
671 // UnicodeSets in the pattern
673 REGEX_TESTLM("[1-6]", "1", TRUE
, TRUE
);
674 REGEX_TESTLM("[1-6]", "3", TRUE
, TRUE
);
675 REGEX_TESTLM("[1-6]", "7", FALSE
, FALSE
);
676 REGEX_TESTLM("a[1-6]", "a3", TRUE
, TRUE
);
677 REGEX_TESTLM("a[1-6]", "a3", TRUE
, TRUE
);
678 REGEX_TESTLM("a[1-6]b", "a3b", TRUE
, TRUE
);
680 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE
, TRUE
);
681 REGEX_TESTLM("a[0-9]*b", "abc", TRUE
, FALSE
);
682 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE
, TRUE
);
683 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE
, FALSE
); // note that * matches 0 occurrences.
684 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE
, TRUE
);
687 // OR operator in patterns
689 REGEX_TESTLM("(a|b)", "a", TRUE
, TRUE
);
690 REGEX_TESTLM("(a|b)", "b", TRUE
, TRUE
);
691 REGEX_TESTLM("(a|b)", "c", FALSE
, FALSE
);
692 REGEX_TESTLM("a|b", "b", TRUE
, TRUE
);
694 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE
, TRUE
);
695 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE
, FALSE
);
696 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE
, TRUE
);
697 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE
, TRUE
);
698 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE
, TRUE
);
699 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE
, FALSE
);
// The + quantifier.
704 REGEX_TESTLM("ab+", "abbc", TRUE
, FALSE
);
705 REGEX_TESTLM("ab+c", "ac", FALSE
, FALSE
);
706 REGEX_TESTLM("b+", "", FALSE
, FALSE
);
707 REGEX_TESTLM("(abc|def)+", "defabc", TRUE
, TRUE
);
708 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE
, FALSE
);
709 REGEX_TESTLM(".+y", "zippity dooy", TRUE
, TRUE
);
// The ? quantifier.
714 REGEX_TESTLM("ab?", "ab", TRUE
, TRUE
);
715 REGEX_TESTLM("ab?", "a", TRUE
, TRUE
);
716 REGEX_TESTLM("ab?", "ac", TRUE
, FALSE
);
717 REGEX_TESTLM("ab?", "abb", TRUE
, FALSE
);
718 REGEX_TESTLM("a(b|c)?d", "abd", TRUE
, TRUE
);
719 REGEX_TESTLM("a(b|c)?d", "acd", TRUE
, TRUE
);
720 REGEX_TESTLM("a(b|c)?d", "ad", TRUE
, TRUE
);
721 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE
, FALSE
);
722 REGEX_TESTLM("a(b|c)?d", "ab", FALSE
, FALSE
);
725 // Escape sequences that become single literal chars, handled internally
726 // by ICU's Unescape.
729 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
730 REGEX_TESTLM("\\a", "\\u0007", TRUE
, TRUE
); // BEL
731 REGEX_TESTLM("\\cL", "\\u000c", TRUE
, TRUE
); // Control-L
732 REGEX_TESTLM("\\e", "\\u001b", TRUE
, TRUE
); // Escape
733 REGEX_TESTLM("\\f", "\\u000c", TRUE
, TRUE
); // Form Feed
734 REGEX_TESTLM("\\n", "\\u000a", TRUE
, TRUE
); // new line
735 REGEX_TESTLM("\\r", "\\u000d", TRUE
, TRUE
); // CR
736 REGEX_TESTLM("\\t", "\\u0009", TRUE
, TRUE
); // Tab
737 REGEX_TESTLM("\\u1234", "\\u1234", TRUE
, TRUE
);
738 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE
, TRUE
);
740 REGEX_TESTLM(".*\\Ax", "xyz", TRUE
, FALSE
); // \A matches only at the beginning of input
741 REGEX_TESTLM(".*\\Ax", " xyz", FALSE
, FALSE
); // \A matches only at the beginning of input
743 // Escape of special chars in patterns
744 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE
, TRUE
);
748 //---------------------------------------------------------------------------
750 // UTextBasic Check for quirks that are specific to the UText
753 //---------------------------------------------------------------------------
// Checks that a RegexMatcher built from a UText pattern reports the expected
// input text after reset(UText*), including when it is reset with its own
// inputText().
// NOTE(review): status checks and the cleanup of `input` are not part of this
// listing; confirm against the full file that `input` is closed as well.
754 void RegexTest::UTextBasic() {
// The text is spelled as byte values so that it is ASCII/UTF-8 "abc" on every platform.
755 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
756 UErrorCode status
= U_ZERO_ERROR
;
757 UText pattern
= UTEXT_INITIALIZER
;
758 utext_openUTF8(&pattern
, str_abc
, -1, &status
);
759 RegexMatcher
matcher(&pattern
, 0, status
);
// The same bytes serve as the input text.
762 UText input
= UTEXT_INITIALIZER
;
763 utext_openUTF8(&input
, str_abc
, -1, &status
);
765 matcher
.reset(&input
);
767 REGEX_ASSERT_UTEXT_UTF8(str_abc
, matcher
.inputText());
// Resetting a matcher with its own input text must leave that text intact.
769 matcher
.reset(matcher
.inputText());
771 REGEX_ASSERT_UTEXT_UTF8(str_abc
, matcher
.inputText());
773 utext_close(&pattern
);
778 //---------------------------------------------------------------------------
780 // API_Match Test that the API for class RegexMatcher
781 // is present and nominally working, but excluding functions
782 // implementing replace operations.
784 //---------------------------------------------------------------------------
785 void RegexTest::API_Match() {
787 UErrorCode status
=U_ZERO_ERROR
;
791 // Debug - slide failing test cases early
800 // Simple pattern compilation
803 UnicodeString
re("abc");
805 pat2
= RegexPattern::compile(re
, flags
, pe
, status
);
808 UnicodeString inStr1
= "abcdef this is a test";
809 UnicodeString instr2
= "not abc";
810 UnicodeString empty
= "";
814 // Matcher creation and reset.
816 RegexMatcher
*m1
= pat2
->matcher(inStr1
, status
);
818 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
819 REGEX_ASSERT(m1
->input() == inStr1
);
821 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
822 REGEX_ASSERT(m1
->input() == instr2
);
824 REGEX_ASSERT(m1
->input() == inStr1
);
825 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
827 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
828 REGEX_ASSERT(m1
->input() == empty
);
829 REGEX_ASSERT(&m1
->pattern() == pat2
);
832 // reset(pos, status)
835 m1
->reset(4, status
);
837 REGEX_ASSERT(m1
->input() == inStr1
);
838 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
840 m1
->reset(-1, status
);
841 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
842 status
= U_ZERO_ERROR
;
844 m1
->reset(0, status
);
846 status
= U_ZERO_ERROR
;
848 int32_t len
= m1
->input().length();
849 m1
->reset(len
-1, status
);
851 status
= U_ZERO_ERROR
;
853 m1
->reset(len
, status
);
855 status
= U_ZERO_ERROR
;
857 m1
->reset(len
+1, status
);
858 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
859 status
= U_ZERO_ERROR
;
862 // match(pos, status)
865 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
867 REGEX_ASSERT(m1
->matches(3, status
) == FALSE
);
869 REGEX_ASSERT(m1
->matches(5, status
) == FALSE
);
870 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
871 REGEX_ASSERT(m1
->matches(-1, status
) == FALSE
);
872 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
874 // Match() at end of string should fail, but should not
876 status
= U_ZERO_ERROR
;
877 len
= m1
->input().length();
878 REGEX_ASSERT(m1
->matches(len
, status
) == FALSE
);
881 // Match beyond end of string should fail with an error.
882 status
= U_ZERO_ERROR
;
883 REGEX_ASSERT(m1
->matches(len
+1, status
) == FALSE
);
884 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
886 // Successful match at end of string.
888 status
= U_ZERO_ERROR
;
889 RegexMatcher
m("A?", 0, status
); // will match zero length string.
892 len
= inStr1
.length();
893 REGEX_ASSERT(m
.matches(len
, status
) == TRUE
);
896 REGEX_ASSERT(m
.matches(0, status
) == TRUE
);
902 // lookingAt(pos, status)
904 status
= U_ZERO_ERROR
;
905 m1
->reset(instr2
); // "not abc"
906 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
907 REGEX_ASSERT(m1
->lookingAt(5, status
) == FALSE
);
908 REGEX_ASSERT(m1
->lookingAt(3, status
) == FALSE
);
909 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
910 REGEX_ASSERT(m1
->lookingAt(-1, status
) == FALSE
);
911 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
912 status
= U_ZERO_ERROR
;
913 len
= m1
->input().length();
914 REGEX_ASSERT(m1
->lookingAt(len
, status
) == FALSE
);
916 REGEX_ASSERT(m1
->lookingAt(len
+1, status
) == FALSE
);
917 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
926 // RegexMatcher::start();
927 // RegexMatcher::end();
928 // RegexMatcher::groupCount();
933 UErrorCode status
=U_ZERO_ERROR
;
935 UnicodeString
re("01(23(45)67)(.*)");
936 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
938 UnicodeString data
= "0123456789";
940 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
942 REGEX_ASSERT(matcher
->lookingAt(status
) == TRUE
);
943 static const int32_t matchStarts
[] = {0, 2, 4, 8};
944 static const int32_t matchEnds
[] = {10, 8, 6, 10};
946 for (i
=0; i
<4; i
++) {
947 int32_t actualStart
= matcher
->start(i
, status
);
949 if (actualStart
!= matchStarts
[i
]) {
950 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
951 __LINE__
, i
, matchStarts
[i
], actualStart
);
953 int32_t actualEnd
= matcher
->end(i
, status
);
955 if (actualEnd
!= matchEnds
[i
]) {
956 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
957 __LINE__
, i
, matchEnds
[i
], actualEnd
);
961 REGEX_ASSERT(matcher
->start(0, status
) == matcher
->start(status
));
962 REGEX_ASSERT(matcher
->end(0, status
) == matcher
->end(status
));
964 REGEX_ASSERT_FAIL(matcher
->start(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
965 REGEX_ASSERT_FAIL(matcher
->start( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
967 REGEX_ASSERT_FAIL(matcher
->start( 0, status
), U_REGEX_INVALID_STATE
);
969 matcher
->lookingAt(status
);
970 REGEX_ASSERT(matcher
->group(status
) == "0123456789");
971 REGEX_ASSERT(matcher
->group(0, status
) == "0123456789");
972 REGEX_ASSERT(matcher
->group(1, status
) == "234567" );
973 REGEX_ASSERT(matcher
->group(2, status
) == "45" );
974 REGEX_ASSERT(matcher
->group(3, status
) == "89" );
976 REGEX_ASSERT_FAIL(matcher
->group(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
977 REGEX_ASSERT_FAIL(matcher
->group( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
979 REGEX_ASSERT_FAIL(matcher
->group( 0, status
), U_REGEX_INVALID_STATE
);
992 UErrorCode status
=U_ZERO_ERROR
;
994 UnicodeString
re("abc");
995 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
997 UnicodeString data
= ".abc..abc...abc..";
998 // 012345678901234567
1000 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
1002 REGEX_ASSERT(matcher
->find());
1003 REGEX_ASSERT(matcher
->start(status
) == 1);
1004 REGEX_ASSERT(matcher
->find());
1005 REGEX_ASSERT(matcher
->start(status
) == 6);
1006 REGEX_ASSERT(matcher
->find());
1007 REGEX_ASSERT(matcher
->start(status
) == 12);
1008 REGEX_ASSERT(matcher
->find() == FALSE
);
1009 REGEX_ASSERT(matcher
->find() == FALSE
);
1012 REGEX_ASSERT(matcher
->find());
1013 REGEX_ASSERT(matcher
->start(status
) == 1);
1015 REGEX_ASSERT(matcher
->find(0, status
));
1016 REGEX_ASSERT(matcher
->start(status
) == 1);
1017 REGEX_ASSERT(matcher
->find(1, status
));
1018 REGEX_ASSERT(matcher
->start(status
) == 1);
1019 REGEX_ASSERT(matcher
->find(2, status
));
1020 REGEX_ASSERT(matcher
->start(status
) == 6);
1021 REGEX_ASSERT(matcher
->find(12, status
));
1022 REGEX_ASSERT(matcher
->start(status
) == 12);
1023 REGEX_ASSERT(matcher
->find(13, status
) == FALSE
);
1024 REGEX_ASSERT(matcher
->find(16, status
) == FALSE
);
1025 REGEX_ASSERT(matcher
->find(17, status
) == FALSE
);
1026 REGEX_ASSERT_FAIL(matcher
->start(status
), U_REGEX_INVALID_STATE
);
1028 status
= U_ZERO_ERROR
;
1029 REGEX_ASSERT_FAIL(matcher
->find(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1030 status
= U_ZERO_ERROR
;
1031 REGEX_ASSERT_FAIL(matcher
->find(18, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1033 REGEX_ASSERT(matcher
->groupCount() == 0);
1041 // find, with \G in pattern (true if at the end of a previous match).
1046 UErrorCode status
=U_ZERO_ERROR
;
1048 UnicodeString
re(".*?(?:(\\Gabc)|(abc))", -1, US_INV
);
1049 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
1051 UnicodeString data
= ".abcabc.abc..";
1052 // 012345678901234567
1054 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
1056 REGEX_ASSERT(matcher
->find());
1057 REGEX_ASSERT(matcher
->start(status
) == 0);
1058 REGEX_ASSERT(matcher
->start(1, status
) == -1);
1059 REGEX_ASSERT(matcher
->start(2, status
) == 1);
1061 REGEX_ASSERT(matcher
->find());
1062 REGEX_ASSERT(matcher
->start(status
) == 4);
1063 REGEX_ASSERT(matcher
->start(1, status
) == 4);
1064 REGEX_ASSERT(matcher
->start(2, status
) == -1);
1072 // find with zero length matches, match position should bump ahead
1073 // to prevent loops.
1077 UErrorCode status
=U_ZERO_ERROR
;
1078 RegexMatcher
m("(?= ?)", 0, status
); // This pattern will produce zero-length matches anywhere,
1079 // using an always-true look-ahead.
1081 UnicodeString
s(" ");
1084 if (m
.find() == FALSE
) {
1087 REGEX_ASSERT(m
.start(status
) == i
);
1088 REGEX_ASSERT(m
.end(status
) == i
);
1092 // Check that the bump goes over surrogate pairs OK
1093 s
= UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1097 if (m
.find() == FALSE
) {
1100 REGEX_ASSERT(m
.start(status
) == i
);
1101 REGEX_ASSERT(m
.end(status
) == i
);
1103 REGEX_ASSERT(i
==10);
1106 // find() loop breaking test.
1107 // with pattern of /.?/, should see a series of one char matches, then a single
1108 // match of zero length at the end of the input string.
1110 UErrorCode status
=U_ZERO_ERROR
;
1111 RegexMatcher
m(".?", 0, status
);
1113 UnicodeString
s(" ");
1116 if (m
.find() == FALSE
) {
1119 REGEX_ASSERT(m
.start(status
) == i
);
1120 REGEX_ASSERT(m
.end(status
) == (i
<4 ? i
+1 : i
));
1127 // Matchers with no input string behave as if they had an empty input string.
1131 UErrorCode status
= U_ZERO_ERROR
;
1132 RegexMatcher
m(".?", 0, status
);
1134 REGEX_ASSERT(m
.find());
1135 REGEX_ASSERT(m
.start(status
) == 0);
1136 REGEX_ASSERT(m
.input() == "");
1139 UErrorCode status
= U_ZERO_ERROR
;
1140 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
1141 RegexMatcher
*m
= p
->matcher(status
);
1144 REGEX_ASSERT(m
->find() == FALSE
);
1145 REGEX_ASSERT(m
->input() == "");
1154 UErrorCode status
= U_ZERO_ERROR
;
1155 UnicodeString
testString("This is test data");
1156 RegexMatcher
m(".*", testString
, 0, status
);
1158 REGEX_ASSERT(m
.regionStart() == 0);
1159 REGEX_ASSERT(m
.regionEnd() == testString
.length());
1160 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1161 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1163 m
.region(2,4, status
);
1165 REGEX_ASSERT(m
.matches(status
));
1166 REGEX_ASSERT(m
.start(status
)==2);
1167 REGEX_ASSERT(m
.end(status
)==4);
1171 REGEX_ASSERT(m
.regionStart() == 0);
1172 REGEX_ASSERT(m
.regionEnd() == testString
.length());
1174 UnicodeString
shorterString("short");
1175 m
.reset(shorterString
);
1176 REGEX_ASSERT(m
.regionStart() == 0);
1177 REGEX_ASSERT(m
.regionEnd() == shorterString
.length());
1179 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1180 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(FALSE
));
1181 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
1182 REGEX_ASSERT(&m
== &m
.reset());
1183 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
1185 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(TRUE
));
1186 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1187 REGEX_ASSERT(&m
== &m
.reset());
1188 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1190 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1191 REGEX_ASSERT(&m
== &m
.useTransparentBounds(TRUE
));
1192 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
1193 REGEX_ASSERT(&m
== &m
.reset());
1194 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
1196 REGEX_ASSERT(&m
== &m
.useTransparentBounds(FALSE
));
1197 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1198 REGEX_ASSERT(&m
== &m
.reset());
1199 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1204 // hitEnd() and requireEnd()
1207 UErrorCode status
= U_ZERO_ERROR
;
1208 UnicodeString
testString("aabb");
1209 RegexMatcher
m1(".*", testString
, 0, status
);
1210 REGEX_ASSERT(m1
.lookingAt(status
) == TRUE
);
1211 REGEX_ASSERT(m1
.hitEnd() == TRUE
);
1212 REGEX_ASSERT(m1
.requireEnd() == FALSE
);
1215 status
= U_ZERO_ERROR
;
1216 RegexMatcher
m2("a*", testString
, 0, status
);
1217 REGEX_ASSERT(m2
.lookingAt(status
) == TRUE
);
1218 REGEX_ASSERT(m2
.hitEnd() == FALSE
);
1219 REGEX_ASSERT(m2
.requireEnd() == FALSE
);
1222 status
= U_ZERO_ERROR
;
1223 RegexMatcher
m3(".*$", testString
, 0, status
);
1224 REGEX_ASSERT(m3
.lookingAt(status
) == TRUE
);
1225 REGEX_ASSERT(m3
.hitEnd() == TRUE
);
1226 REGEX_ASSERT(m3
.requireEnd() == TRUE
);
1232 // Compilation error on reset with UChar *
1233 // These were a hazard that people were stumbling over with runtime errors.
1234 // Changed them to compiler errors by adding private methods that more closely
1235 // matched the incorrect use of the functions.
1239 UErrorCode status
= U_ZERO_ERROR
;
1240 UChar ucharString
[20];
1241 RegexMatcher
m(".", 0, status
);
1242 m
.reset(ucharString
); // should not compile.
1244 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
1245 RegexMatcher
*m2
= p
->matcher(ucharString
, status
); // should not compile.
1247 RegexMatcher
m3(".", ucharString
, 0, status
); // Should not compile
1253 // Note: These tests will need to be changed when the regexp engine is
1254 // able to detect and cut short the exponential time behavior on
1255 // this type of match.
1258 UErrorCode status
= U_ZERO_ERROR
;
1259 // Enough 'a's in the string to cause the match to time out.
1260 // (Each additional 'a' doubles the time)
1261 UnicodeString
testString("aaaaaaaaaaaaaaaaaaaaa");
1262 RegexMatcher
matcher("(a+)+b", testString
, 0, status
);
1264 REGEX_ASSERT(matcher
.getTimeLimit() == 0);
1265 matcher
.setTimeLimit(100, status
);
1266 REGEX_ASSERT(matcher
.getTimeLimit() == 100);
1267 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1268 REGEX_ASSERT(status
== U_REGEX_TIME_OUT
);
1271 UErrorCode status
= U_ZERO_ERROR
;
1272 // Few enough 'a's to slip in under the time limit.
1273 UnicodeString
testString("aaaaaaaaaaaaaaaaaa");
1274 RegexMatcher
matcher("(a+)+b", testString
, 0, status
);
1276 matcher
.setTimeLimit(100, status
);
1277 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1285 UErrorCode status
= U_ZERO_ERROR
;
1286 UnicodeString
testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
1288 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1289 // of the '+', and makes the stack frames larger.
1290 RegexMatcher
matcher("(A)+A$", testString
, 0, status
);
1292 // With the default stack, this match should fail to run
1293 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1294 REGEX_ASSERT(status
== U_REGEX_STACK_OVERFLOW
);
1296 // With unlimited stack, it should run
1297 status
= U_ZERO_ERROR
;
1298 matcher
.setStackLimit(0, status
);
1300 REGEX_ASSERT(matcher
.lookingAt(status
) == TRUE
);
1302 REGEX_ASSERT(matcher
.getStackLimit() == 0);
1304 // With a limited stack, the match should fail
1305 status
= U_ZERO_ERROR
;
1306 matcher
.setStackLimit(10000, status
);
1307 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1308 REGEX_ASSERT(status
== U_REGEX_STACK_OVERFLOW
);
1309 REGEX_ASSERT(matcher
.getStackLimit() == 10000);
1312 // A pattern that doesn't save state should work with
1313 // a minimal sized stack
1315 UErrorCode status
= U_ZERO_ERROR
;
1316 UnicodeString testString
= "abc";
1317 RegexMatcher
matcher("abc", testString
, 0, status
);
1319 matcher
.setStackLimit(30, status
);
1321 REGEX_ASSERT(matcher
.matches(status
) == TRUE
);
1323 REGEX_ASSERT(matcher
.getStackLimit() == 30);
1325 // Negative stack sizes should fail
1326 status
= U_ZERO_ERROR
;
1327 matcher
.setStackLimit(1000, status
);
1329 matcher
.setStackLimit(-1, status
);
1330 REGEX_ASSERT(status
== U_ILLEGAL_ARGUMENT_ERROR
);
1331 REGEX_ASSERT(matcher
.getStackLimit() == 1000);
1342 //---------------------------------------------------------------------------
1344 // API_Replace API test for class RegexMatcher, testing the
1345 // Replace family of functions.
1347 //---------------------------------------------------------------------------
1348 void RegexTest::API_Replace() {
// Exercises the replace family of RegexMatcher functions: replaceFirst(),
// replaceAll(), appendReplacement() and appendTail(). Each result is compared
// against a literal expected string with REGEX_ASSERT.
1354 UErrorCode status
=U_ZERO_ERROR
;
1356 UnicodeString
re("abc");
1357 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
1359 UnicodeString data
= ".abc..abc...abc..";
1360 // 012345678901234567
1361 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
1364 // Plain vanilla matches: replaceFirst() is expected to substitute only the
// first "abc", replaceAll() every occurrence.
1367 dest
= matcher
->replaceFirst("yz", status
);
1369 REGEX_ASSERT(dest
== ".yz..abc...abc..");
1371 dest
= matcher
->replaceAll("yz", status
);
1373 REGEX_ASSERT(dest
== ".yz..yz...yz..");
1376 // Plain vanilla non-matches: the expected result is the input text, unchanged.
1378 UnicodeString d2
= ".abx..abx...abx..";
1380 dest
= matcher
->replaceFirst("yz", status
);
1382 REGEX_ASSERT(dest
== ".abx..abx...abx..");
1384 dest
= matcher
->replaceAll("yz", status
);
1386 REGEX_ASSERT(dest
== ".abx..abx...abx..");
1389 // Empty source string: both replace functions are expected to return "".
1391 UnicodeString d3
= "";
1393 dest
= matcher
->replaceFirst("yz", status
);
1395 REGEX_ASSERT(dest
== "");
1397 dest
= matcher
->replaceAll("yz", status
);
1399 REGEX_ASSERT(dest
== "");
1402 // Empty substitution string: the matched text is simply removed.
1404 matcher
->reset(data
); // ".abc..abc...abc.."
1405 dest
= matcher
->replaceFirst("", status
);
1407 REGEX_ASSERT(dest
== "...abc...abc..");
1409 dest
= matcher
->replaceAll("", status
);
1411 REGEX_ASSERT(dest
== "........");
1414 // Match the whole string: the entire input is replaced.
1416 UnicodeString d4
= "abc";
1418 dest
= matcher
->replaceFirst("xyz", status
);
1420 REGEX_ASSERT(dest
== "xyz");
1422 dest
= matcher
->replaceAll("xyz", status
);
1424 REGEX_ASSERT(dest
== "xyz");
1427 // Capture Group, simple case: "$1" in the replacement text stands for the
// text matched by capture group 1.
1429 UnicodeString
re2("a(..)");
1430 RegexPattern
*pat2
= RegexPattern::compile(re2
, flags
, pe
, status
);
1432 UnicodeString d5
= "abcdefg";
1433 RegexMatcher
*matcher2
= pat2
->matcher(d5
, status
);
1435 dest
= matcher2
->replaceFirst("$1$1", status
);
1437 REGEX_ASSERT(dest
== "bcbcdefg");
// A backslash-escaped "\$" is expected to be copied literally, while the
// unescaped "$1" is substituted with the group text.
1439 dest
= matcher2
->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status
);
1441 REGEX_ASSERT(dest
== "The value of $1 is bc.defg");
// A "$" that is not followed by a group number is expected to leave a failure
// code in status; status is cleared again afterwards.
1443 dest
= matcher2
->replaceFirst("$ by itself, no group number $$$", status
);
1444 REGEX_ASSERT(U_FAILURE(status
));
1445 status
= U_ZERO_ERROR
;
// "$" followed by U+1D7CF, a digit outside the BMP: the expected result below
// shows it being treated as a reference to group 1.
// NOTE(review): presumably U+1D7CF has the numeric value 1 -- confirm.
1447 UnicodeString replacement
= UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1448 replacement
= replacement
.unescape();
1449 dest
= matcher2
->replaceFirst(replacement
, status
);
1451 REGEX_ASSERT(dest
== "Supplemental Digit 1 bc.defg");
// "$5" names a capture group that the pattern "a(..)" does not have.
1453 REGEX_ASSERT_FAIL(matcher2
->replaceFirst("bad capture group number $5...",status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1457 // Replacement String with \u hex escapes (\u0043 is expected to yield 'C').
1460 UnicodeString src
= "abc 1 abc 2 abc 3";
1461 UnicodeString substitute
= UNICODE_STRING_SIMPLE("--\\u0043--");
1462 matcher
->reset(src
);
1463 UnicodeString result
= matcher
->replaceAll(substitute
, status
);
1465 REGEX_ASSERT(result
== "--C-- 1 --C-- 2 --C-- 3");
// Same idea with an eight-digit \U escape; the expected string is assembled
// by appending code point 0x10000 directly.
1468 UnicodeString src
= "abc !";
1469 UnicodeString substitute
= UNICODE_STRING_SIMPLE("--\\U00010000--");
1470 matcher
->reset(src
);
1471 UnicodeString result
= matcher
->replaceAll(substitute
, status
);
1473 UnicodeString expected
= UnicodeString("--");
1474 expected
.append((UChar32
)0x10000);
1475 expected
.append("-- !");
1476 REGEX_ASSERT(result
== expected
);
1478 // TODO: need more thorough testing of capture substitutions.
1483 status
= U_ZERO_ERROR
;
1484 UnicodeString s
= "The matches start with ss and end with ee ss stuff ee fin";
1485 RegexMatcher
m("ss(.*?)ee", 0, status
);
1487 UnicodeString result
;
1489 // Multiple finds do NOT bump up the previous appendReplacement position.
1493 m
.appendReplacement(result
, "ooh", status
);
1495 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
1497 // After a reset into the interior of a string, appendReplacement still starts at beginning.
1498 status
= U_ZERO_ERROR
;
1500 m
.reset(10, status
);
1503 m
.appendReplacement(result
, "ooh", status
);
1505 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
1507 // find() at interior of string, appendReplacement still starts at beginning.
1508 status
= U_ZERO_ERROR
;
1513 m
.appendReplacement(result
, "ooh", status
);
1515 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
// appendTail() is expected to add the input text that follows the last match.
1517 m
.appendTail(result
);
1518 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh fin");
1529 //---------------------------------------------------------------------------
1531 // API_Pattern Test that the API for class RegexPattern is
1532 // present and nominally working.
1534 //---------------------------------------------------------------------------
1535 void RegexTest::API_Pattern() {
// Checks the RegexPattern API: default construction, operator== / operator!=,
// copy construction, the compile() overloads and flags(), clone(), the static
// matches() convenience function, split(), pattern(), and the class-ID functions.
1536 RegexPattern pata
; // Test default constructor to not crash.
1539 REGEX_ASSERT(pata
== patb
);
1540 REGEX_ASSERT(pata
== pata
);
1542 UnicodeString
re1("abc[a-l][m-z]");
1543 UnicodeString
re2("def");
1544 UErrorCode status
= U_ZERO_ERROR
;
1547 RegexPattern
*pat1
= RegexPattern::compile(re1
, 0, pe
, status
);
1548 RegexPattern
*pat2
= RegexPattern::compile(re2
, 0, pe
, status
);
// A compiled pattern equals itself and differs from the default-constructed one.
1550 REGEX_ASSERT(*pat1
== *pat1
);
1551 REGEX_ASSERT(*pat1
!= pata
);
1555 REGEX_ASSERT(patb
== *pat1
);
// Copy construction is expected to produce a pattern equal to its source.
1558 RegexPattern
patc(*pat1
);
1559 REGEX_ASSERT(patc
== *pat1
);
1560 REGEX_ASSERT(patb
== patc
);
// NOTE(review): pat1 and pat2 are RegexPattern pointers, so the next assertion
// compares addresses rather than pattern contents -- confirm that is intended.
1561 REGEX_ASSERT(pat1
!= pat2
);
1563 REGEX_ASSERT(patb
!= patc
);
1564 REGEX_ASSERT(patb
== *pat2
);
1566 // Compile with no flags.
1567 RegexPattern
*pat1a
= RegexPattern::compile(re1
, pe
, status
);
1568 REGEX_ASSERT(*pat1a
== *pat1
);
1570 REGEX_ASSERT(pat1a
->flags() == 0);
1572 // Compile with different flags should be not equal
1573 RegexPattern
*pat1b
= RegexPattern::compile(re1
, UREGEX_CASE_INSENSITIVE
, pe
, status
);
1576 REGEX_ASSERT(*pat1b
!= *pat1a
);
1577 REGEX_ASSERT(pat1b
->flags() == UREGEX_CASE_INSENSITIVE
);
1578 REGEX_ASSERT(pat1a
->flags() == 0);
// clone() is expected to yield a pattern equal to the original.
1582 RegexPattern
*pat1c
= pat1
->clone();
1583 REGEX_ASSERT(*pat1c
== *pat1
);
1584 REGEX_ASSERT(*pat1c
!= *pat2
);
1593 // Verify that a matcher created from a cloned pattern works.
1597 UErrorCode status
= U_ZERO_ERROR
;
1598 RegexPattern
*pSource
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status
);
1599 RegexPattern
*pClone
= pSource
->clone();
1601 RegexMatcher
*mFromClone
= pClone
->matcher(status
);
1603 UnicodeString s
= "Hello World";
1604 mFromClone
->reset(s
);
1605 REGEX_ASSERT(mFromClone
->find() == TRUE
);
1606 REGEX_ASSERT(mFromClone
->group(status
) == "Hello");
1607 REGEX_ASSERT(mFromClone
->find() == TRUE
);
1608 REGEX_ASSERT(mFromClone
->group(status
) == "World");
1609 REGEX_ASSERT(mFromClone
->find() == FALSE
);
1615 // matches convenience API: the expected values below are TRUE only when the
// pattern matches the entire input string.
1617 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe
, status
) == TRUE
);
1619 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe
, status
) == FALSE
);
1621 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe
, status
) == TRUE
);
1623 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe
, status
) == TRUE
);
1625 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe
, status
) == FALSE
);
// Enter with a failure code already in status: matches() is expected to
// return FALSE and to leave status as it was.
1627 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1628 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe
, status
) == FALSE
);
1629 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
// split(): the pieces of the input are written into fields[]. Some assertions
// below look at slots beyond the pieces of the current call.
// NOTE(review): those slots presumably hold values stored earlier -- confirm.
1635 status
= U_ZERO_ERROR
;
1636 pat1
= RegexPattern::compile(" +", pe
, status
);
1638 UnicodeString fields
[10];
1641 n
= pat1
->split("Now is the time", fields
, 10, status
);
1644 REGEX_ASSERT(fields
[0]=="Now");
1645 REGEX_ASSERT(fields
[1]=="is");
1646 REGEX_ASSERT(fields
[2]=="the");
1647 REGEX_ASSERT(fields
[3]=="time");
1648 REGEX_ASSERT(fields
[4]=="");
// Capacity of 2: the last field is expected to receive all remaining input.
1650 n
= pat1
->split("Now is the time", fields
, 2, status
);
1653 REGEX_ASSERT(fields
[0]=="Now");
1654 REGEX_ASSERT(fields
[1]=="is the time");
1655 REGEX_ASSERT(fields
[2]=="the"); // left over from previous test
// Capacity of 1: the whole input is expected in fields[0].
1658 status
= U_ZERO_ERROR
;
1659 n
= pat1
->split("Now is the time", fields
, 1, status
);
1662 REGEX_ASSERT(fields
[0]=="Now is the time");
1663 REGEX_ASSERT(fields
[1]=="*");
1664 status
= U_ZERO_ERROR
;
// Input with delimiters at both ends: empty leading and trailing fields are expected.
1666 n
= pat1
->split(" Now is the time ", fields
, 10, status
);
1669 REGEX_ASSERT(fields
[0]=="");
1670 REGEX_ASSERT(fields
[1]=="Now");
1671 REGEX_ASSERT(fields
[2]=="is");
1672 REGEX_ASSERT(fields
[3]=="the");
1673 REGEX_ASSERT(fields
[4]=="time");
1674 REGEX_ASSERT(fields
[5]=="");
// Input consisting only of delimiter text.
1676 n
= pat1
->split(" ", fields
, 10, status
);
1679 REGEX_ASSERT(fields
[0]=="");
1680 REGEX_ASSERT(fields
[1]=="");
// Empty input: fields[0] is expected to keep a previously stored "foo".
1683 n
= pat1
->split("", fields
, 10, status
);
1686 REGEX_ASSERT(fields
[0]=="foo");
1690 // split, with a pattern with (capture): the expected fields show the captured
// delimiter text ("a", "b", "c") interleaved with the pieces.
1691 pat1
= RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe
, status
);
1694 status
= U_ZERO_ERROR
;
1695 n
= pat1
->split("<a>Now is <b>the time<c>", fields
, 10, status
);
1698 REGEX_ASSERT(fields
[0]=="");
1699 REGEX_ASSERT(fields
[1]=="a");
1700 REGEX_ASSERT(fields
[2]=="Now is ");
1701 REGEX_ASSERT(fields
[3]=="b");
1702 REGEX_ASSERT(fields
[4]=="the time");
1703 REGEX_ASSERT(fields
[5]=="c");
1704 REGEX_ASSERT(fields
[6]=="");
1705 REGEX_ASSERT(status
==U_ZERO_ERROR
);
1707 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 10, status
);
1710 REGEX_ASSERT(fields
[0]==" ");
1711 REGEX_ASSERT(fields
[1]=="a");
1712 REGEX_ASSERT(fields
[2]=="Now is ");
1713 REGEX_ASSERT(fields
[3]=="b");
1714 REGEX_ASSERT(fields
[4]=="the time");
1715 REGEX_ASSERT(fields
[5]=="c");
1716 REGEX_ASSERT(fields
[6]=="");
// The same input again with progressively smaller field capacities (6, 5, 5, 4).
1718 status
= U_ZERO_ERROR
;
1720 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 6, status
);
1723 REGEX_ASSERT(fields
[0]==" ");
1724 REGEX_ASSERT(fields
[1]=="a");
1725 REGEX_ASSERT(fields
[2]=="Now is ");
1726 REGEX_ASSERT(fields
[3]=="b");
1727 REGEX_ASSERT(fields
[4]=="the time");
1728 REGEX_ASSERT(fields
[5]==""); // All text following "<c>" field delimiter.
1729 REGEX_ASSERT(fields
[6]=="foo");
1731 status
= U_ZERO_ERROR
;
1733 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 5, status
);
1736 REGEX_ASSERT(fields
[0]==" ");
1737 REGEX_ASSERT(fields
[1]=="a");
1738 REGEX_ASSERT(fields
[2]=="Now is ");
1739 REGEX_ASSERT(fields
[3]=="b");
1740 REGEX_ASSERT(fields
[4]=="the time<c>");
1741 REGEX_ASSERT(fields
[5]=="foo");
1743 status
= U_ZERO_ERROR
;
1745 n
= pat1
->split(" <a>Now is <b>the time", fields
, 5, status
);
1748 REGEX_ASSERT(fields
[0]==" ");
1749 REGEX_ASSERT(fields
[1]=="a");
1750 REGEX_ASSERT(fields
[2]=="Now is ");
1751 REGEX_ASSERT(fields
[3]=="b");
1752 REGEX_ASSERT(fields
[4]=="the time");
1753 REGEX_ASSERT(fields
[5]=="foo");
1755 status
= U_ZERO_ERROR
;
1756 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 4, status
);
1759 REGEX_ASSERT(fields
[0]==" ");
1760 REGEX_ASSERT(fields
[1]=="a");
1761 REGEX_ASSERT(fields
[2]=="Now is ");
1762 REGEX_ASSERT(fields
[3]=="the time<c>");
1763 status
= U_ZERO_ERROR
;
// Single-character delimiters captured by a group are expected to appear as
// fields of their own.
1766 pat1
= RegexPattern::compile("([-,])", pe
, status
);
1768 n
= pat1
->split("1-10,20", fields
, 10, status
);
1771 REGEX_ASSERT(fields
[0]=="1");
1772 REGEX_ASSERT(fields
[1]=="-");
1773 REGEX_ASSERT(fields
[2]=="10");
1774 REGEX_ASSERT(fields
[3]==",");
1775 REGEX_ASSERT(fields
[4]=="20");
1778 // Test split of string with empty trailing fields
1779 pat1
= RegexPattern::compile(",", pe
, status
);
1781 n
= pat1
->split("a,b,c,", fields
, 10, status
);
1784 REGEX_ASSERT(fields
[0]=="a");
1785 REGEX_ASSERT(fields
[1]=="b");
1786 REGEX_ASSERT(fields
[2]=="c");
1787 REGEX_ASSERT(fields
[3]=="");
1789 n
= pat1
->split("a,,,", fields
, 10, status
);
1792 REGEX_ASSERT(fields
[0]=="a");
1793 REGEX_ASSERT(fields
[1]=="");
1794 REGEX_ASSERT(fields
[2]=="");
1795 REGEX_ASSERT(fields
[3]=="");
1798 // Split Separator with zero length match.
1799 pat1
= RegexPattern::compile(":?", pe
, status
);
1801 n
= pat1
->split("abc", fields
, 10, status
);
1804 REGEX_ASSERT(fields
[0]=="");
1805 REGEX_ASSERT(fields
[1]=="a");
1806 REGEX_ASSERT(fields
[2]=="b");
1807 REGEX_ASSERT(fields
[3]=="c");
1808 REGEX_ASSERT(fields
[4]=="");
1813 // RegexPattern::pattern(): expected to return the source text of the pattern,
// which is empty for a default-constructed RegexPattern.
1815 pat1
= new RegexPattern();
1816 REGEX_ASSERT(pat1
->pattern() == "");
1819 pat1
= RegexPattern::compile("(Hello, world)*", pe
, status
);
1821 REGEX_ASSERT(pat1
->pattern() == "(Hello, world)*");
1826 // classID functions: the dynamic ID of each object must equal its class's
// static ID, be non-NULL, and differ between RegexPattern and RegexMatcher.
1828 pat1
= RegexPattern::compile("(Hello, world)*", pe
, status
);
1830 REGEX_ASSERT(pat1
->getDynamicClassID() == RegexPattern::getStaticClassID());
1831 REGEX_ASSERT(pat1
->getDynamicClassID() != NULL
);
1832 UnicodeString
Hello("Hello, world.");
1833 RegexMatcher
*m
= pat1
->matcher(Hello
, status
);
1834 REGEX_ASSERT(pat1
->getDynamicClassID() != m
->getDynamicClassID());
1835 REGEX_ASSERT(m
->getDynamicClassID() == RegexMatcher::getStaticClassID());
1836 REGEX_ASSERT(m
->getDynamicClassID() != NULL
);
1842 //---------------------------------------------------------------------------
1844 // API_Match_UTF8 Test that the alternate engine for class RegexMatcher
1845 // is present and working, but excluding functions
1846 // implementing replace operations.
1848 //---------------------------------------------------------------------------
1849 void RegexTest::API_Match_UTF8() {
1851 UErrorCode status
=U_ZERO_ERROR
;
1855 // Debug - slide failing test cases early
1864 // Simple pattern compilation
1867 UText re
= UTEXT_INITIALIZER
;
1868 regextst_openUTF8FromInvariant(&re
, "abc", -1, &status
);
1869 REGEX_VERBOSE_TEXT(&re
);
1871 pat2
= RegexPattern::compile(&re
, flags
, pe
, status
);
1874 UText input1
= UTEXT_INITIALIZER
;
1875 UText input2
= UTEXT_INITIALIZER
;
1876 UText empty
= UTEXT_INITIALIZER
;
1877 regextst_openUTF8FromInvariant(&input1
, "abcdef this is a test", -1, &status
);
1878 REGEX_VERBOSE_TEXT(&input1
);
1879 regextst_openUTF8FromInvariant(&input2
, "not abc", -1, &status
);
1880 REGEX_VERBOSE_TEXT(&input2
);
1881 utext_openUChars(&empty
, NULL
, 0, &status
);
1883 int32_t input1Len
= strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1884 int32_t input2Len
= strlen("not abc");
1888 // Matcher creation and reset.
1890 RegexMatcher
*m1
= &pat2
->matcher(status
)->reset(&input1
);
1892 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
1893 const char str_abcdefthisisatest
[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1894 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest
, m1
->inputText());
1896 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
1897 const char str_notabc
[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1898 REGEX_ASSERT_UTEXT_UTF8(str_notabc
, m1
->inputText());
1900 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest
, m1
->inputText());
1901 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
1903 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
1904 REGEX_ASSERT(utext_nativeLength(&empty
) == 0);
1907 // reset(pos, status)
1910 m1
->reset(4, status
);
1912 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest
, m1
->inputText());
1913 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
1915 m1
->reset(-1, status
);
1916 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1917 status
= U_ZERO_ERROR
;
1919 m1
->reset(0, status
);
1921 status
= U_ZERO_ERROR
;
1923 m1
->reset(input1Len
-1, status
);
1925 status
= U_ZERO_ERROR
;
1927 m1
->reset(input1Len
, status
);
1929 status
= U_ZERO_ERROR
;
1931 m1
->reset(input1Len
+1, status
);
1932 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1933 status
= U_ZERO_ERROR
;
1936 // matches(pos, status)
1939 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
1941 REGEX_ASSERT(m1
->matches(3, status
) == FALSE
);
1943 REGEX_ASSERT(m1
->matches(5, status
) == FALSE
);
1944 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
1945 REGEX_ASSERT(m1
->matches(-1, status
) == FALSE
);
1946 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1948 // Match() at end of string should fail, but should not
1950 status
= U_ZERO_ERROR
;
1951 REGEX_ASSERT(m1
->matches(input2Len
, status
) == FALSE
);
1954 // Match beyond end of string should fail with an error.
1955 status
= U_ZERO_ERROR
;
1956 REGEX_ASSERT(m1
->matches(input2Len
+1, status
) == FALSE
);
1957 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1959 // Successful match at end of string.
1961 status
= U_ZERO_ERROR
;
1962 RegexMatcher
m("A?", 0, status
); // will match zero length string.
1965 REGEX_ASSERT(m
.matches(input1Len
, status
) == TRUE
);
1968 REGEX_ASSERT(m
.matches(0, status
) == TRUE
);
1974 // lookingAt(pos, status)
1976 status
= U_ZERO_ERROR
;
1977 m1
->reset(&input2
); // "not abc"
1978 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
1979 REGEX_ASSERT(m1
->lookingAt(5, status
) == FALSE
);
1980 REGEX_ASSERT(m1
->lookingAt(3, status
) == FALSE
);
1981 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
1982 REGEX_ASSERT(m1
->lookingAt(-1, status
) == FALSE
);
1983 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1984 status
= U_ZERO_ERROR
;
1985 REGEX_ASSERT(m1
->lookingAt(input2Len
, status
) == FALSE
);
1987 REGEX_ASSERT(m1
->lookingAt(input2Len
+1, status
) == FALSE
);
1988 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1994 utext_close(&input1
);
1995 utext_close(&input2
);
1996 utext_close(&empty
);
2002 // RegexMatcher::start();
2003 // RegexMatcher::end();
2004 // RegexMatcher::groupCount();
2009 UErrorCode status
=U_ZERO_ERROR
;
2010 UText re
=UTEXT_INITIALIZER
;
2011 const char str_01234567_pat
[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
2012 utext_openUTF8(&re
, str_01234567_pat
, -1, &status
);
2014 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
2017 UText input
= UTEXT_INITIALIZER
;
2018 const char str_0123456789
[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2019 utext_openUTF8(&input
, str_0123456789
, -1, &status
);
2021 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&input
);
2023 REGEX_ASSERT(matcher
->lookingAt(status
) == TRUE
);
2024 static const int32_t matchStarts
[] = {0, 2, 4, 8};
2025 static const int32_t matchEnds
[] = {10, 8, 6, 10};
2027 for (i
=0; i
<4; i
++) {
2028 int32_t actualStart
= matcher
->start(i
, status
);
2030 if (actualStart
!= matchStarts
[i
]) {
2031 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n",
2032 __FILE__
, __LINE__
, i
, matchStarts
[i
], actualStart
);
2034 int32_t actualEnd
= matcher
->end(i
, status
);
2036 if (actualEnd
!= matchEnds
[i
]) {
2037 errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n",
2038 __FILE__
, __LINE__
, i
, matchEnds
[i
], actualEnd
);
2042 REGEX_ASSERT(matcher
->start(0, status
) == matcher
->start(status
));
2043 REGEX_ASSERT(matcher
->end(0, status
) == matcher
->end(status
));
2045 REGEX_ASSERT_FAIL(matcher
->start(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2046 REGEX_ASSERT_FAIL(matcher
->start( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2048 REGEX_ASSERT_FAIL(matcher
->start( 0, status
), U_REGEX_INVALID_STATE
);
2050 matcher
->lookingAt(status
);
2053 UText destText
= UTEXT_INITIALIZER
;
2054 utext_openUnicodeString(&destText
, &dest
, &status
);
2056 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2057 // Test shallow-clone API
2059 result
= matcher
->group((UText
*)NULL
, group_len
, status
);
2061 REGEX_ASSERT_UTEXT_UTF8(str_0123456789
, result
);
2062 utext_close(result
);
2063 result
= matcher
->group(0, &destText
, group_len
, status
);
2065 REGEX_ASSERT(result
== &destText
);
2066 REGEX_ASSERT_UTEXT_UTF8(str_0123456789
, result
);
2067 // destText is now immutable, reopen it
2068 utext_close(&destText
);
2069 utext_openUnicodeString(&destText
, &dest
, &status
);
2072 result
= matcher
->group(0, NULL
, length
, status
);
2074 REGEX_ASSERT_UTEXT_UTF8(str_0123456789
, result
);
2075 utext_close(result
);
2076 result
= matcher
->group(0, &destText
, length
, status
);
2078 REGEX_ASSERT(result
== &destText
);
2079 REGEX_ASSERT(utext_getNativeIndex(result
) == 0);
2080 REGEX_ASSERT(length
== 10);
2081 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2083 // Capture Group 1 == "234567"
2084 result
= matcher
->group(1, NULL
, length
, status
);
2086 REGEX_ASSERT(utext_getNativeIndex(result
) == 2);
2087 REGEX_ASSERT(length
== 6);
2088 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2089 utext_close(result
);
2091 result
= matcher
->group(1, &destText
, length
, status
);
2093 REGEX_ASSERT(result
== &destText
);
2094 REGEX_ASSERT(utext_getNativeIndex(result
) == 2);
2095 REGEX_ASSERT(length
== 6);
2096 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2097 utext_close(result
);
2099 // Capture Group 2 == "45"
2100 result
= matcher
->group(2, NULL
, length
, status
);
2102 REGEX_ASSERT(utext_getNativeIndex(result
) == 4);
2103 REGEX_ASSERT(length
== 2);
2104 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2105 utext_close(result
);
2107 result
= matcher
->group(2, &destText
, length
, status
);
2109 REGEX_ASSERT(result
== &destText
);
2110 REGEX_ASSERT(utext_getNativeIndex(result
) == 4);
2111 REGEX_ASSERT(length
== 2);
2112 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2113 utext_close(result
);
2115 // Capture Group 3 == "89"
2116 result
= matcher
->group(3, NULL
, length
, status
);
2118 REGEX_ASSERT(utext_getNativeIndex(result
) == 8);
2119 REGEX_ASSERT(length
== 2);
2120 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2121 utext_close(result
);
2123 result
= matcher
->group(3, &destText
, length
, status
);
2125 REGEX_ASSERT(result
== &destText
);
2126 REGEX_ASSERT(utext_getNativeIndex(result
) == 8);
2127 REGEX_ASSERT(length
== 2);
2128 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2129 utext_close(result
);
2131 // Capture Group number out of range.
2132 status
= U_ZERO_ERROR
;
2133 REGEX_ASSERT_FAIL(matcher
->group(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2134 status
= U_ZERO_ERROR
;
2135 REGEX_ASSERT_FAIL(matcher
->group( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2136 status
= U_ZERO_ERROR
;
2138 REGEX_ASSERT_FAIL(matcher
->group( 0, status
), U_REGEX_INVALID_STATE
);
2143 utext_close(&destText
);
2144 utext_close(&input
);
2154 UErrorCode status
=U_ZERO_ERROR
;
2155 UText re
=UTEXT_INITIALIZER
;
2156 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2157 utext_openUTF8(&re
, str_abc
, -1, &status
);
2159 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
2161 UText input
= UTEXT_INITIALIZER
;
2162 const char str_abcabcabc
[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2163 utext_openUTF8(&input
, str_abcabcabc
, -1, &status
);
2164 // 012345678901234567
2166 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&input
);
2168 REGEX_ASSERT(matcher
->find());
2169 REGEX_ASSERT(matcher
->start(status
) == 1);
2170 REGEX_ASSERT(matcher
->find());
2171 REGEX_ASSERT(matcher
->start(status
) == 6);
2172 REGEX_ASSERT(matcher
->find());
2173 REGEX_ASSERT(matcher
->start(status
) == 12);
2174 REGEX_ASSERT(matcher
->find() == FALSE
);
2175 REGEX_ASSERT(matcher
->find() == FALSE
);
2178 REGEX_ASSERT(matcher
->find());
2179 REGEX_ASSERT(matcher
->start(status
) == 1);
2181 REGEX_ASSERT(matcher
->find(0, status
));
2182 REGEX_ASSERT(matcher
->start(status
) == 1);
2183 REGEX_ASSERT(matcher
->find(1, status
));
2184 REGEX_ASSERT(matcher
->start(status
) == 1);
2185 REGEX_ASSERT(matcher
->find(2, status
));
2186 REGEX_ASSERT(matcher
->start(status
) == 6);
2187 REGEX_ASSERT(matcher
->find(12, status
));
2188 REGEX_ASSERT(matcher
->start(status
) == 12);
2189 REGEX_ASSERT(matcher
->find(13, status
) == FALSE
);
2190 REGEX_ASSERT(matcher
->find(16, status
) == FALSE
);
2191 REGEX_ASSERT(matcher
->find(17, status
) == FALSE
);
2192 REGEX_ASSERT_FAIL(matcher
->start(status
), U_REGEX_INVALID_STATE
);
2194 status
= U_ZERO_ERROR
;
2195 REGEX_ASSERT_FAIL(matcher
->find(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2196 status
= U_ZERO_ERROR
;
2197 REGEX_ASSERT_FAIL(matcher
->find(18, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2199 REGEX_ASSERT(matcher
->groupCount() == 0);
2204 utext_close(&input
);
2210 // find, with \G in pattern (true if at the end of a previous match).
2215 UErrorCode status
=U_ZERO_ERROR
;
2216 UText re
=UTEXT_INITIALIZER
;
2217 const char str_Gabcabc
[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2218 utext_openUTF8(&re
, str_Gabcabc
, -1, &status
);
2220 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
2223 UText input
= UTEXT_INITIALIZER
;
2224 const char str_abcabcabc
[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2225 utext_openUTF8(&input
, str_abcabcabc
, -1, &status
);
2226 // 012345678901234567
2228 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&input
);
2230 REGEX_ASSERT(matcher
->find());
2231 REGEX_ASSERT(matcher
->start(status
) == 0);
2232 REGEX_ASSERT(matcher
->start(1, status
) == -1);
2233 REGEX_ASSERT(matcher
->start(2, status
) == 1);
2235 REGEX_ASSERT(matcher
->find());
2236 REGEX_ASSERT(matcher
->start(status
) == 4);
2237 REGEX_ASSERT(matcher
->start(1, status
) == 4);
2238 REGEX_ASSERT(matcher
->start(2, status
) == -1);
2244 utext_close(&input
);
2249 // find with zero length matches, match position should bump ahead
2250 // to prevent loops.
2254 UErrorCode status
=U_ZERO_ERROR
;
2255 RegexMatcher
m("(?= ?)", 0, status
); // This pattern will produce zero-length matches anywhere,
2256 // using an always-true look-ahead.
2258 UText s
= UTEXT_INITIALIZER
;
2259 utext_openUTF8(&s
, " ", -1, &status
);
2262 if (m
.find() == FALSE
) {
2265 REGEX_ASSERT(m
.start(status
) == i
);
2266 REGEX_ASSERT(m
.end(status
) == i
);
2270 // Check that the bump goes over characters outside the BMP OK
2271 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2272 unsigned char aboveBMP
[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2273 utext_openUTF8(&s
, (char *)aboveBMP
, -1, &status
);
2276 if (m
.find() == FALSE
) {
2279 REGEX_ASSERT(m
.start(status
) == i
);
2280 REGEX_ASSERT(m
.end(status
) == i
);
2282 REGEX_ASSERT(i
==20);
2287 // find() loop breaking test.
2288 // with pattern of /.?/, should see a series of one char matches, then a single
2289 // match of zero length at the end of the input string.
2291 UErrorCode status
=U_ZERO_ERROR
;
2292 RegexMatcher
m(".?", 0, status
);
2294 UText s
= UTEXT_INITIALIZER
;
2295 utext_openUTF8(&s
, " ", -1, &status
);
2298 if (m
.find() == FALSE
) {
2301 REGEX_ASSERT(m
.start(status
) == i
);
2302 REGEX_ASSERT(m
.end(status
) == (i
<4 ? i
+1 : i
));
2311 // Matchers with no input string behave as if they had an empty input string.
2315 UErrorCode status
= U_ZERO_ERROR
;
2316 RegexMatcher
m(".?", 0, status
);
2318 REGEX_ASSERT(m
.find());
2319 REGEX_ASSERT(m
.start(status
) == 0);
2320 REGEX_ASSERT(m
.input() == "");
2323 UErrorCode status
= U_ZERO_ERROR
;
2324 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
2325 RegexMatcher
*m
= p
->matcher(status
);
2328 REGEX_ASSERT(m
->find() == FALSE
);
2329 REGEX_ASSERT(utext_nativeLength(m
->inputText()) == 0);
2338 UErrorCode status
= U_ZERO_ERROR
;
2339 UText testPattern
= UTEXT_INITIALIZER
;
2340 UText testText
= UTEXT_INITIALIZER
;
2341 regextst_openUTF8FromInvariant(&testPattern
, ".*", -1, &status
);
2342 REGEX_VERBOSE_TEXT(&testPattern
);
2343 regextst_openUTF8FromInvariant(&testText
, "This is test data", -1, &status
);
2344 REGEX_VERBOSE_TEXT(&testText
);
2346 RegexMatcher
m(&testPattern
, &testText
, 0, status
);
2348 REGEX_ASSERT(m
.regionStart() == 0);
2349 REGEX_ASSERT(m
.regionEnd() == (int32_t)strlen("This is test data"));
2350 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2351 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2353 m
.region(2,4, status
);
2355 REGEX_ASSERT(m
.matches(status
));
2356 REGEX_ASSERT(m
.start(status
)==2);
2357 REGEX_ASSERT(m
.end(status
)==4);
2361 REGEX_ASSERT(m
.regionStart() == 0);
2362 REGEX_ASSERT(m
.regionEnd() == (int32_t)strlen("This is test data"));
2364 regextst_openUTF8FromInvariant(&testText
, "short", -1, &status
);
2365 REGEX_VERBOSE_TEXT(&testText
);
2367 REGEX_ASSERT(m
.regionStart() == 0);
2368 REGEX_ASSERT(m
.regionEnd() == (int32_t)strlen("short"));
2370 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2371 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(FALSE
));
2372 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
2373 REGEX_ASSERT(&m
== &m
.reset());
2374 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
2376 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(TRUE
));
2377 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2378 REGEX_ASSERT(&m
== &m
.reset());
2379 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2381 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2382 REGEX_ASSERT(&m
== &m
.useTransparentBounds(TRUE
));
2383 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
2384 REGEX_ASSERT(&m
== &m
.reset());
2385 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
2387 REGEX_ASSERT(&m
== &m
.useTransparentBounds(FALSE
));
2388 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2389 REGEX_ASSERT(&m
== &m
.reset());
2390 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2392 utext_close(&testText
);
2393 utext_close(&testPattern
);
2397 // hitEnd() and requireEnd()
2400 UErrorCode status
= U_ZERO_ERROR
;
2401 UText testPattern
= UTEXT_INITIALIZER
;
2402 UText testText
= UTEXT_INITIALIZER
;
2403 const char str_
[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2404 const char str_aabb
[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2405 utext_openUTF8(&testPattern
, str_
, -1, &status
);
2406 utext_openUTF8(&testText
, str_aabb
, -1, &status
);
2408 RegexMatcher
m1(&testPattern
, &testText
, 0, status
);
2409 REGEX_ASSERT(m1
.lookingAt(status
) == TRUE
);
2410 REGEX_ASSERT(m1
.hitEnd() == TRUE
);
2411 REGEX_ASSERT(m1
.requireEnd() == FALSE
);
2414 status
= U_ZERO_ERROR
;
2415 const char str_a
[] = { 0x61, 0x2a, 0x00 }; /* a* */
2416 utext_openUTF8(&testPattern
, str_a
, -1, &status
);
2417 RegexMatcher
m2(&testPattern
, &testText
, 0, status
);
2418 REGEX_ASSERT(m2
.lookingAt(status
) == TRUE
);
2419 REGEX_ASSERT(m2
.hitEnd() == FALSE
);
2420 REGEX_ASSERT(m2
.requireEnd() == FALSE
);
2423 status
= U_ZERO_ERROR
;
2424 const char str_dotstardollar
[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2425 utext_openUTF8(&testPattern
, str_dotstardollar
, -1, &status
);
2426 RegexMatcher
m3(&testPattern
, &testText
, 0, status
);
2427 REGEX_ASSERT(m3
.lookingAt(status
) == TRUE
);
2428 REGEX_ASSERT(m3
.hitEnd() == TRUE
);
2429 REGEX_ASSERT(m3
.requireEnd() == TRUE
);
2432 utext_close(&testText
);
2433 utext_close(&testPattern
);
2438 //---------------------------------------------------------------------------
2440 // API_Replace_UTF8 API test for class RegexMatcher, testing the
2441 // Replace family of functions.
2443 //---------------------------------------------------------------------------
// API_Replace_UTF8, opening section: compiles the literal pattern "abc" from a
// UText and binds a matcher to the UTF-8 input ".abc..abc...abc..".
// NOTE(review): `flags` and `pe` are used below but are declared on lines that are
// not visible in this excerpt.
2444 void RegexTest::API_Replace_UTF8() {
2450 UErrorCode status
=U_ZERO_ERROR
;
2452 UText re
=UTEXT_INITIALIZER
;
2453 regextst_openUTF8FromInvariant(&re
, "abc", -1, &status
);
2454 REGEX_VERBOSE_TEXT(&re
);
2455 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
// Input is spelled as explicit byte values rather than a string literal; the
// trailing comment shows the text and the ruler line below gives its offsets.
2458 char data
[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2459 // 012345678901234567
2460 UText dataText
= UTEXT_INITIALIZER
;
2461 utext_openUTF8(&dataText
, data
, -1, &status
);
2463 REGEX_VERBOSE_TEXT(&dataText
);
// The matcher is created from `pat` and immediately reset onto dataText.
2464 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&dataText
);
2467 // Plain vanilla matches.
// Replaces "abc" with "yz" in ".abc..abc...abc..". replaceFirst and replaceAll
// are each called twice: once with a NULL destination (the returned UText is
// closed by the test) and once with &destText as destination (the test asserts
// that the returned pointer is &destText).
// NOTE(review): `dest` and `result` are declared on lines not visible here.
2470 UText destText
= UTEXT_INITIALIZER
;
2471 utext_openUnicodeString(&destText
, &dest
, &status
);
2474 UText replText
= UTEXT_INITIALIZER
;
2476 const char str_yz
[] = { 0x79, 0x7a, 0x00 }; /* yz */
2477 utext_openUTF8(&replText
, str_yz
, -1, &status
);
2478 REGEX_VERBOSE_TEXT(&replText
);
2479 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
// Only the first "abc" is expected to be replaced.
2481 const char str_yzabcabc
[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2482 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc
, result
);
2483 utext_close(result
);
2484 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2486 REGEX_ASSERT(result
== &destText
);
2487 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc
, result
);
2489 result
= matcher
->replaceAll(&replText
, NULL
, status
);
// All three occurrences are expected to be replaced.
2491 const char str_yzyzyz
[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2492 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz
, result
);
2493 utext_close(result
);
// Empty destText (replace its whole native range with nothing) before reusing it
// as the destination.
2495 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2496 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2498 REGEX_ASSERT(result
== &destText
);
2499 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz
, result
);
2502 // Plain vanilla non-matches.
// The input ".abx..abx...abx.." is opened into dataText and the matcher is reset
// onto it. Every replaceFirst/replaceAll call below is expected to return text
// equal to the input, for both the NULL-destination and the &destText variants.
2504 const char str_abxabxabx
[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2505 utext_openUTF8(&dataText
, str_abxabxabx
, -1, &status
);
2506 matcher
->reset(&dataText
);
2508 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2510 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2511 utext_close(result
);
2512 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2514 REGEX_ASSERT(result
== &destText
);
2515 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2517 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2519 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2520 utext_close(result
);
// Empty destText before it is reused as the replaceAll destination.
2521 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2522 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2524 REGEX_ASSERT(result
== &destText
);
2525 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2528 // Empty source string
// dataText is reopened with a NULL string pointer and length 0, and the matcher
// is reset onto it. Every replaceFirst/replaceAll call below is expected to
// yield the empty string "".
2530 utext_openUTF8(&dataText
, NULL
, 0, &status
);
2531 matcher
->reset(&dataText
);
2533 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2535 REGEX_ASSERT_UTEXT_UTF8("", result
);
2536 utext_close(result
);
2537 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2539 REGEX_ASSERT(result
== &destText
);
2540 REGEX_ASSERT_UTEXT_UTF8("", result
);
2542 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2544 REGEX_ASSERT_UTEXT_UTF8("", result
);
2545 utext_close(result
);
2546 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2548 REGEX_ASSERT(result
== &destText
);
2549 REGEX_ASSERT_UTEXT_UTF8("", result
);
2552 // Empty substitution string
// The input is the original `data` again; the replacement text is reopened with
// a NULL string pointer and length 0. The expected results are the input with
// the first "abc" removed (replaceFirst) and with every "abc" removed
// (replaceAll).
2554 utext_openUTF8(&dataText
, data
, -1, &status
); // ".abc..abc...abc.."
2555 matcher
->reset(&dataText
);
2557 utext_openUTF8(&replText
, NULL
, 0, &status
);
2558 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2560 const char str_abcabc
[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2561 REGEX_ASSERT_UTEXT_UTF8(str_abcabc
, result
);
2562 utext_close(result
);
2563 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2565 REGEX_ASSERT(result
== &destText
);
2566 REGEX_ASSERT_UTEXT_UTF8(str_abcabc
, result
);
2568 result
= matcher
->replaceAll(&replText
, NULL
, status
);
// Eight dots: what remains of the input once all three "abc" runs are removed.
2570 const char str_dots
[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2571 REGEX_ASSERT_UTEXT_UTF8(str_dots
, result
);
2572 utext_close(result
);
// Empty destText before it is reused as the replaceAll destination.
2573 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2574 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2576 REGEX_ASSERT(result
== &destText
);
2577 REGEX_ASSERT_UTEXT_UTF8(str_dots
, result
);
2580 // match whole string
// The input is exactly "abc" and the replacement is "xyz", so every
// replaceFirst/replaceAll call below is expected to produce "xyz". destText is
// emptied with utext_replace before each call that writes into it.
2582 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2583 utext_openUTF8(&dataText
, str_abc
, -1, &status
);
2584 matcher
->reset(&dataText
);
2586 const char str_xyz
[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2587 utext_openUTF8(&replText
, str_xyz
, -1, &status
);
2588 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2590 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2591 utext_close(result
);
2592 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2593 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2595 REGEX_ASSERT(result
== &destText
);
2596 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2598 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2600 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2601 utext_close(result
);
2602 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2603 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2605 REGEX_ASSERT(result
== &destText
);
2606 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2609 // Capture Group, simple case
// A second pattern "a(..)" is compiled into pat2 and matched against "abcdefg"
// through matcher2. The replacement templates below exercise "$1" group
// references, backslash-escaped "$", a supplementary-plane digit used as the
// group number, and an out-of-range group number.
2611 const char str_add
[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2612 utext_openUTF8(&re
, str_add
, -1, &status
);
2613 RegexPattern
*pat2
= RegexPattern::compile(&re
, flags
, pe
, status
);
2616 const char str_abcdefg
[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2617 utext_openUTF8(&dataText
, str_abcdefg
, -1, &status
);
2618 RegexMatcher
*matcher2
= &pat2
->matcher(status
)->reset(&dataText
);
// "$1$1": the matched "abc" is expected to become group 1 ("bc") twice.
2621 const char str_11
[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2622 utext_openUTF8(&replText
, str_11
, -1, &status
);
2623 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2625 const char str_bcbcdefg
[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2626 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg
, result
);
2627 utext_close(result
);
2628 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2629 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2631 REGEX_ASSERT(result
== &destText
);
2632 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg
, result
);
// Template mixing an escaped "\$1" (expected to come out as a literal "$1") with
// a real "$1" reference (expected to come out as "bc").
2634 const char str_v
[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2635 utext_openUTF8(&replText
, str_v
, -1, &status
);
2636 REGEX_VERBOSE_TEXT(&replText
);
2637 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2639 const char str_Thevalueof1isbcdefg
[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2640 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg
, result
);
2641 utext_close(result
);
2642 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2643 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2645 REGEX_ASSERT(result
== &destText
);
2646 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg
, result
);
// Template containing only escaped "\$" with no group number; each is expected
// to come out as a plain "$".
2648 const char str_byitselfnogroupnumber
[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2649 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2650 0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2651 utext_openUTF8(&replText
, str_byitselfnogroupnumber
, -1, &status
);
2652 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2654 const char str_byitselfnogroupnumberdefg
[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2655 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg
, result
);
2656 utext_close(result
);
2657 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2658 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2660 REGEX_ASSERT(result
== &destText
);
2661 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg
, result
);
// Template whose group number is a supplementary-plane digit. The array starts
// with four 'x' placeholder bytes at indices 22..25, which are then overwritten
// with F0 9D 9F 8F (per the commented-out line: U+1D7CF, MATHEMATICAL BOLD DIGIT
// ONE). The expected output shows that "$" plus this digit is treated as "$1".
2663 unsigned char supplDigitChars
[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2664 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2665 // 012345678901234567890123456
2666 supplDigitChars
[22] = 0xF0;
2667 supplDigitChars
[23] = 0x9D;
2668 supplDigitChars
[24] = 0x9F;
2669 supplDigitChars
[25] = 0x8F;
2670 utext_openUTF8(&replText
, (char *)supplDigitChars
, -1, &status
);
2672 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2674 const char str_SupplementalDigit1bcdefg
[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2675 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg
, result
);
2676 utext_close(result
);
2677 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2678 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2680 REGEX_ASSERT(result
== &destText
);
2681 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg
, result
);
// "$5" refers to a group the pattern "a(..)" does not have; both replaceFirst
// variants are expected to fail with U_INDEX_OUTOFBOUNDS_ERROR. The content of
// the result is deliberately not checked (the assertions are commented out).
2682 const char str_badcapturegroupnumber5
[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5... */
2683 utext_openUTF8(&replText
, str_badcapturegroupnumber5
, -1, &status
);
2684 REGEX_ASSERT_FAIL((result
= matcher2
->replaceFirst(&replText
, NULL
, status
)), U_INDEX_OUTOFBOUNDS_ERROR
);
2685 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2686 utext_close(result
);
2687 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2688 REGEX_ASSERT_FAIL((result
= matcher2
->replaceFirst(&replText
, &destText
, status
)), U_INDEX_OUTOFBOUNDS_ERROR
);
2689 REGEX_ASSERT(result
== &destText
);
2690 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2693 // Replacement String with \u hex escapes
// Uses the `matcher` variable again, reset onto new input. The replacement text
// contains a backslash-u escape (and later a backslash-U escape) spelled out as
// ASCII characters; the expected output contains the character it denotes.
2696 const char str_abc1abc2abc3
[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2697 const char str_u0043
[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2698 utext_openUTF8(&dataText
, str_abc1abc2abc3
, -1, &status
);
2699 utext_openUTF8(&replText
, str_u0043
, -1, &status
);
2700 matcher
->reset(&dataText
);
2702 result
= matcher
->replaceAll(&replText
, NULL
, status
);
// The escape for U+0043 is expected to appear as "C" in the output.
2704 const char str_C1C2C3
[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2705 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3
, result
);
2706 utext_close(result
);
2707 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2708 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2710 REGEX_ASSERT(result
== &destText
);
2711 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3
, result
);
// Same check with an eight-digit backslash-U escape for a supplementary code
// point.
2714 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2715 utext_openUTF8(&dataText
, str_abc
, -1, &status
);
2716 const char str_U00010000
[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2717 utext_openUTF8(&replText
, str_U00010000
, -1, &status
);
2718 matcher
->reset(&dataText
);
// NOTE(review): `expected` holds four 'x' placeholder bytes here; presumably
// they are overwritten with the UTF-8 encoding of U+10000 on lines not visible
// in this excerpt -- confirm against the full file.
2720 unsigned char expected
[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2727 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2729 REGEX_ASSERT_UTEXT_UTF8((char *)expected
, result
);
2730 utext_close(result
);
2731 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2732 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2734 REGEX_ASSERT(result
== &destText
);
2735 REGEX_ASSERT_UTEXT_UTF8((char *)expected
, result
);
2737 // TODO: need more thorough testing of capture substitutions.
// appendReplacement()/appendTail() section. Pattern "ss(.*?)ee" is applied to
// "The matches start with ss and end with ee ss stuff ee fin" with replacement
// "ooh"; output is accumulated in resultText, a UText opened over the
// UnicodeString `result`.
// NOTE(review): the find()/reset calls that position the matcher before each
// appendReplacement are on lines not visible in this excerpt.
2742 status
= U_ZERO_ERROR
;
2743 const char str_ssee
[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2744 const char str_blah
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2745 const char str_ooh
[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2746 utext_openUTF8(&re
, str_ssee
, -1, &status
);
2747 utext_openUTF8(&dataText
, str_blah
, -1, &status
);
2748 utext_openUTF8(&replText
, str_ooh
, -1, &status
);
2750 RegexMatcher
m(&re
, 0, status
);
2753 UnicodeString result
;
2754 UText resultText
= UTEXT_INITIALIZER
;
2755 utext_openUnicodeString(&resultText
, &result
, &status
);
2757 // Multiple finds do NOT bump up the previous appendReplacement position.
2761 m
.appendReplacement(&resultText
, &replText
, status
);
2763 const char str_blah2
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2764 REGEX_ASSERT_UTEXT_UTF8(str_blah2
, &resultText
);
2766 // After a reset into the interior of a string, appendReplacement still starts at beginning.
2767 status
= U_ZERO_ERROR
;
2769 utext_openUnicodeString(&resultText
, &result
, &status
);
2770 m
.reset(10, status
);
2773 m
.appendReplacement(&resultText
, &replText
, status
);
2775 const char str_blah3
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2776 REGEX_ASSERT_UTEXT_UTF8(str_blah3
, &resultText
);
2778 // find() at interior of string, appendReplacement still starts at beginning.
2779 status
= U_ZERO_ERROR
;
2781 utext_openUnicodeString(&resultText
, &result
, &status
);
2785 m
.appendReplacement(&resultText
, &replText
, status
);
2787 const char str_blah8
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2788 REGEX_ASSERT_UTEXT_UTF8(str_blah8
, &resultText
);
// appendTail is expected to add the remaining " fin" after the last replacement.
2790 m
.appendTail(&resultText
, status
);
2791 const char str_blah9
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2792 REGEX_ASSERT_UTEXT_UTF8(str_blah9
, &resultText
);
2794 utext_close(&resultText
);
// Close the stack-allocated UTexts opened earlier in this test.
2802 utext_close(&dataText
);
2803 utext_close(&replText
);
2804 utext_close(&destText
);
2809 //---------------------------------------------------------------------------
2811 // API_Pattern_UTF8 Test that the API for class RegexPattern is
2812 // present and nominally working.
2814 //---------------------------------------------------------------------------
// API_Pattern_UTF8, opening section: checks RegexPattern default construction,
// operator== / operator!=, copy construction, compile() with and without flags,
// flags(), and clone(), using patterns supplied as UTF-8 UTexts.
// NOTE(review): `patb`, `pe`, and the assignments to `patb` between the
// assertions are on lines not visible in this excerpt.
2815 void RegexTest::API_Pattern_UTF8() {
2816 RegexPattern pata
; // Test default constructor to not crash.
2819 REGEX_ASSERT(pata
== patb
);
2820 REGEX_ASSERT(pata
== pata
);
2822 UText re1
= UTEXT_INITIALIZER
;
2823 UText re2
= UTEXT_INITIALIZER
;
2824 UErrorCode status
= U_ZERO_ERROR
;
2827 const char str_abcalmz
[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2828 const char str_def
[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2829 utext_openUTF8(&re1
, str_abcalmz
, -1, &status
);
2830 utext_openUTF8(&re2
, str_def
, -1, &status
);
2832 RegexPattern
*pat1
= RegexPattern::compile(&re1
, 0, pe
, status
);
2833 RegexPattern
*pat2
= RegexPattern::compile(&re2
, 0, pe
, status
);
// A compiled pattern equals itself and differs from a default-constructed one.
2835 REGEX_ASSERT(*pat1
== *pat1
);
2836 REGEX_ASSERT(*pat1
!= pata
);
2840 REGEX_ASSERT(patb
== *pat1
);
// Copy construction is expected to yield a pattern equal to its source.
2843 RegexPattern
patc(*pat1
);
2844 REGEX_ASSERT(patc
== *pat1
);
2845 REGEX_ASSERT(patb
== patc
);
// NOTE(review): pat1 and pat2 are pointers, so the next assertion compares
// pointer values rather than the patterns; `*pat1 != *pat2` may have been
// intended.
2846 REGEX_ASSERT(pat1
!= pat2
);
2848 REGEX_ASSERT(patb
!= patc
);
2849 REGEX_ASSERT(patb
== *pat2
);
2851 // Compile with no flags.
2852 RegexPattern
*pat1a
= RegexPattern::compile(&re1
, pe
, status
);
2853 REGEX_ASSERT(*pat1a
== *pat1
);
2855 REGEX_ASSERT(pat1a
->flags() == 0);
2857 // Compile with different flags should be not equal
2858 RegexPattern
*pat1b
= RegexPattern::compile(&re1
, UREGEX_CASE_INSENSITIVE
, pe
, status
);
2861 REGEX_ASSERT(*pat1b
!= *pat1a
);
2862 REGEX_ASSERT(pat1b
->flags() == UREGEX_CASE_INSENSITIVE
);
2863 REGEX_ASSERT(pat1a
->flags() == 0);
// clone() is expected to produce a pattern equal to its source and different
// from pat2.
2867 RegexPattern
*pat1c
= pat1
->clone();
2868 REGEX_ASSERT(*pat1c
== *pat1
);
2869 REGEX_ASSERT(*pat1c
!= *pat2
);
2881 // Verify that a matcher created from a cloned pattern works.
// The pattern \p{L}+ is compiled, cloned, and a matcher obtained from the clone
// is run over "Hello World": two matches ("Hello", "World") are expected,
// followed by a failed find().
2885 UErrorCode status
= U_ZERO_ERROR
;
2886 UText pattern
= UTEXT_INITIALIZER
;
2887 const char str_pL
[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2888 utext_openUTF8(&pattern
, str_pL
, -1, &status
);
2890 RegexPattern
*pSource
= RegexPattern::compile(&pattern
, 0, status
);
2891 RegexPattern
*pClone
= pSource
->clone();
2893 RegexMatcher
*mFromClone
= pClone
->matcher(status
);
2896 UText input
= UTEXT_INITIALIZER
;
2897 const char str_HelloWorld
[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2898 utext_openUTF8(&input
, str_HelloWorld
, -1, &status
);
2899 mFromClone
->reset(&input
);
2900 REGEX_ASSERT(mFromClone
->find() == TRUE
);
2901 REGEX_ASSERT(mFromClone
->group(status
) == "Hello");
2902 REGEX_ASSERT(mFromClone
->find() == TRUE
);
2903 REGEX_ASSERT(mFromClone
->group(status
) == "World");
2904 REGEX_ASSERT(mFromClone
->find() == FALSE
);
2908 utext_close(&input
);
2909 utext_close(&pattern
);
2913 // matches convenience API
// Exercises the static RegexPattern::matches() helper against the input
// "random input" with several patterns, then checks that a call made with a
// failing status returns FALSE and leaves that status unchanged.
// NOTE(review): only the first call passes the UTexts (&pattern, &input). The
// later calls pass string literals, so the UText `pattern` reopened before each
// of them is not what is being matched -- confirm whether the UText overload
// was intended.
2916 UErrorCode status
= U_ZERO_ERROR
;
2917 UText pattern
= UTEXT_INITIALIZER
;
2918 UText input
= UTEXT_INITIALIZER
;
2920 const char str_randominput
[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2921 utext_openUTF8(&input
, str_randominput
, -1, &status
);
2923 const char str_dotstar
[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2924 utext_openUTF8(&pattern
, str_dotstar
, -1, &status
);
2925 REGEX_ASSERT(RegexPattern::matches(&pattern
, &input
, pe
, status
) == TRUE
);
2928 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2929 utext_openUTF8(&pattern
, str_abc
, -1, &status
);
2930 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe
, status
) == FALSE
);
2933 const char str_nput
[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2934 utext_openUTF8(&pattern
, str_nput
, -1, &status
);
2935 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe
, status
) == TRUE
);
2938 utext_openUTF8(&pattern
, str_randominput
, -1, &status
);
2939 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe
, status
) == TRUE
);
2942 const char str_u
[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2943 utext_openUTF8(&pattern
, str_u
, -1, &status
);
2944 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe
, status
) == FALSE
);
// With status preset to an error code, matches() is expected to return FALSE
// even for an otherwise matching pattern/input pair, and to leave status as it
// was.
2947 utext_openUTF8(&input
, str_abc
, -1, &status
);
2948 utext_openUTF8(&pattern
, str_abc
, -1, &status
);
2949 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
2950 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe
, status
) == FALSE
);
2951 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
2953 utext_close(&input
);
2954 utext_close(&pattern
);
// split() section, delimiter pattern without capture groups. pat1 is recompiled
// from the bytes 0x20 0x2b (a space followed by '+') and used to split several
// inputs into the `fields` array with different capacities.
// NOTE(review): `n`, the checks on it, and the statements that preset some
// `fields` entries to marker values such as "*" or "foo" are on lines not
// visible in this excerpt.
2961 status
= U_ZERO_ERROR
;
2962 const char str_spaceplus
[] = { 0x20, 0x2b, 0x00 }; /* " +" : a space followed by '+' */
2963 utext_openUTF8(&re1
, str_spaceplus
, -1, &status
);
2964 pat1
= RegexPattern::compile(&re1
, pe
, status
);
2966 UnicodeString fields
[10];
// Capacity 10: four words are expected, and fields[4] is expected to be empty.
2969 n
= pat1
->split("Now is the time", fields
, 10, status
);
2972 REGEX_ASSERT(fields
[0]=="Now");
2973 REGEX_ASSERT(fields
[1]=="is");
2974 REGEX_ASSERT(fields
[2]=="the");
2975 REGEX_ASSERT(fields
[3]=="time");
2976 REGEX_ASSERT(fields
[4]=="");
// Capacity 2: the last permitted field is expected to hold the unsplit
// remainder; fields[2] keeps its value from the previous call.
2978 n
= pat1
->split("Now is the time", fields
, 2, status
);
2981 REGEX_ASSERT(fields
[0]=="Now");
2982 REGEX_ASSERT(fields
[1]=="is the time");
2983 REGEX_ASSERT(fields
[2]=="the"); // left over from previous test
// Capacity 1: the whole input is expected in fields[0].
2986 status
= U_ZERO_ERROR
;
2987 n
= pat1
->split("Now is the time", fields
, 1, status
);
2990 REGEX_ASSERT(fields
[0]=="Now is the time");
2991 REGEX_ASSERT(fields
[1]=="*");
2992 status
= U_ZERO_ERROR
;
// Input with leading and trailing delimiters: an empty first field is expected.
2994 n
= pat1
->split(" Now is the time ", fields
, 10, status
);
2997 REGEX_ASSERT(fields
[0]=="");
2998 REGEX_ASSERT(fields
[1]=="Now");
2999 REGEX_ASSERT(fields
[2]=="is");
3000 REGEX_ASSERT(fields
[3]=="the");
3001 REGEX_ASSERT(fields
[4]=="time");
3002 REGEX_ASSERT(fields
[5]=="");
3003 REGEX_ASSERT(fields
[6]=="");
// Input consisting only of a delimiter.
3006 n
= pat1
->split(" ", fields
, 10, status
);
3009 REGEX_ASSERT(fields
[0]=="");
3010 REGEX_ASSERT(fields
[1]=="");
3011 REGEX_ASSERT(fields
[2]=="*");
// Empty input: fields[0] is expected to keep the marker value "foo".
3014 n
= pat1
->split("", fields
, 10, status
);
3017 REGEX_ASSERT(fields
[0]=="foo");
3021 // split, with a pattern with (capture)
// pat1 is recompiled from "<(\\w*)>". The assertions below show that the text
// captured by the group (e.g. "a", "b", "c") is returned as a field of its own
// between the surrounding pieces of input. The same kind of input is then split
// with output capacities 10, 6, 5 and 4.
// NOTE(review): `n`, the checks on it, and the statements that preset marker
// values such as "foo" are on lines not visible in this excerpt.
3022 regextst_openUTF8FromInvariant(&re1
, "<(\\w*)>", -1, &status
);
3023 pat1
= RegexPattern::compile(&re1
, pe
, status
);
3026 status
= U_ZERO_ERROR
;
// Preset fields 6 and 7 to "*" so the assertions can tell which entries split()
// wrote.
3027 fields
[6] = fields
[7] = "*";
3028 n
= pat1
->split("<a>Now is <b>the time<c>", fields
, 10, status
);
3031 REGEX_ASSERT(fields
[0]=="");
3032 REGEX_ASSERT(fields
[1]=="a");
3033 REGEX_ASSERT(fields
[2]=="Now is ");
3034 REGEX_ASSERT(fields
[3]=="b");
3035 REGEX_ASSERT(fields
[4]=="the time");
3036 REGEX_ASSERT(fields
[5]=="c");
3037 REGEX_ASSERT(fields
[6]=="");
3038 REGEX_ASSERT(fields
[7]=="*");
3039 REGEX_ASSERT(status
==U_ZERO_ERROR
);
3041 fields
[6] = fields
[7] = "*";
3042 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 10, status
);
3045 REGEX_ASSERT(fields
[0]==" ");
3046 REGEX_ASSERT(fields
[1]=="a");
3047 REGEX_ASSERT(fields
[2]=="Now is ");
3048 REGEX_ASSERT(fields
[3]=="b");
3049 REGEX_ASSERT(fields
[4]=="the time");
3050 REGEX_ASSERT(fields
[5]=="c");
3051 REGEX_ASSERT(fields
[6]=="");
3052 REGEX_ASSERT(fields
[7]=="*");
// Capacity 6: fields[6] is expected to keep its marker value.
3054 status
= U_ZERO_ERROR
;
3056 n
= pat1
->split(" <a>Now is <b>the time<c> ", fields
, 6, status
);
3059 REGEX_ASSERT(fields
[0]==" ");
3060 REGEX_ASSERT(fields
[1]=="a");
3061 REGEX_ASSERT(fields
[2]=="Now is ");
3062 REGEX_ASSERT(fields
[3]=="b");
3063 REGEX_ASSERT(fields
[4]=="the time");
3064 REGEX_ASSERT(fields
[5]==" ");
3065 REGEX_ASSERT(fields
[6]=="foo");
// Capacity 5: the final field is expected to hold the unsplit remainder,
// including the "<c>" delimiter text.
3067 status
= U_ZERO_ERROR
;
3069 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 5, status
);
3072 REGEX_ASSERT(fields
[0]==" ");
3073 REGEX_ASSERT(fields
[1]=="a");
3074 REGEX_ASSERT(fields
[2]=="Now is ");
3075 REGEX_ASSERT(fields
[3]=="b");
3076 REGEX_ASSERT(fields
[4]=="the time<c>");
3077 REGEX_ASSERT(fields
[5]=="foo");
3079 status
= U_ZERO_ERROR
;
3081 n
= pat1
->split(" <a>Now is <b>the time", fields
, 5, status
);
3084 REGEX_ASSERT(fields
[0]==" ");
3085 REGEX_ASSERT(fields
[1]=="a");
3086 REGEX_ASSERT(fields
[2]=="Now is ");
3087 REGEX_ASSERT(fields
[3]=="b");
3088 REGEX_ASSERT(fields
[4]=="the time");
3089 REGEX_ASSERT(fields
[5]=="foo");
// Capacity 4: the expected fields show no separate entry for the "b" capture;
// fields[3] holds "the time<c>".
3091 status
= U_ZERO_ERROR
;
3092 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 4, status
);
3095 REGEX_ASSERT(fields
[0]==" ");
3096 REGEX_ASSERT(fields
[1]=="a");
3097 REGEX_ASSERT(fields
[2]=="Now is ");
3098 REGEX_ASSERT(fields
[3]=="the time<c>");
3099 status
= U_ZERO_ERROR
;
// Delimiter pattern that is itself a capture group: the delimiter characters
// "-" and "," are expected to appear as fields.
3102 regextst_openUTF8FromInvariant(&re1
, "([-,])", -1, &status
);
3103 pat1
= RegexPattern::compile(&re1
, pe
, status
);
3105 n
= pat1
->split("1-10,20", fields
, 10, status
);
3108 REGEX_ASSERT(fields
[0]=="1");
3109 REGEX_ASSERT(fields
[1]=="-");
3110 REGEX_ASSERT(fields
[2]=="10");
3111 REGEX_ASSERT(fields
[3]==",");
3112 REGEX_ASSERT(fields
[4]=="20");
3117 // split of a UText based string, with library allocating output UTexts.
// The `splits` array starts out all NULL and is passed to RegexMatcher::split().
// With the capturing delimiter "(:)" over "first:second:third", five fields are
// expected (three words plus two captured ":"), and splits[5] is expected to
// remain NULL.
3120 status
= U_ZERO_ERROR
;
3121 RegexMatcher
matcher(UnicodeString("(:)"), 0, status
);
3122 UnicodeString
stringToSplit("first:second:third");
3123 UText
*textToSplit
= utext_openUnicodeString(NULL
, &stringToSplit
, &status
);
3126 UText
*splits
[10] = {NULL
};
3127 int32_t numFields
= matcher
.split(textToSplit
, splits
, UPRV_LENGTHOF(splits
), status
);
3129 REGEX_ASSERT(numFields
== 5);
3130 REGEX_ASSERT_UTEXT_INVARIANT("first", splits
[0]);
3131 REGEX_ASSERT_UTEXT_INVARIANT(":", splits
[1]);
3132 REGEX_ASSERT_UTEXT_INVARIANT("second", splits
[2]);
3133 REGEX_ASSERT_UTEXT_INVARIANT(":", splits
[3]);
3134 REGEX_ASSERT_UTEXT_INVARIANT("third", splits
[4]);
3135 REGEX_ASSERT(splits
[5] == NULL
);
// Walk every slot of `splits` and close the UTexts stored there.
// NOTE(review): any guard around utext_close is on a line not visible here.
3137 for (int i
=0; i
<UPRV_LENGTHOF(splits
); i
++) {
3139 utext_close(splits
[i
]);
3143 utext_close(textToSplit
);
3148 // RegexPattern::pattern() and patternText()
3150 pat1
= new RegexPattern();
3151 REGEX_ASSERT(pat1
->pattern() == "");
3152 REGEX_ASSERT_UTEXT_UTF8("", pat1
->patternText(status
));
3154 const char *helloWorldInvariant
= "(Hello, world)*";
3155 regextst_openUTF8FromInvariant(&re1
, helloWorldInvariant
, -1, &status
);
3156 pat1
= RegexPattern::compile(&re1
, pe
, status
);
3158 REGEX_ASSERT_UNISTR("(Hello, world)*", pat1
->pattern());
3159 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1
->patternText(status
));
3166 //---------------------------------------------------------------------------
3168 // Extended A more thorough check for features of regex patterns
3169 // The test cases are in a separate data file,
3170 // source/tests/testdata/regextst.txt
3171 // A description of the test data format is included in that file.
3173 //---------------------------------------------------------------------------
3176 RegexTest::getPath(char buffer
[2048], const char *filename
) {
3177 UErrorCode status
=U_ZERO_ERROR
;
3178 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
3179 if (U_FAILURE(status
)) {
3180 errln("ERROR: loadTestData() failed - %s", u_errorName(status
));
3184 strcpy(buffer
, testDataDirectory
);
3185 strcat(buffer
, filename
);
3189 void RegexTest::Extended() {
3191 const char *srcPath
;
3192 UErrorCode status
= U_ZERO_ERROR
;
3193 int32_t lineNum
= 0;
3196 // Open and read the test data file.
3198 srcPath
=getPath(tdd
, "regextst.txt");
3200 return; /* something went wrong, error already output */
3204 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, "utf-8", status
);
3205 if (U_FAILURE(status
)) {
3206 return; /* something went wrong, error already output */
3210 // Put the test data into a UnicodeString
3212 UnicodeString
testString(FALSE
, testData
, len
);
3214 RegexMatcher
quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status
);
3215 RegexMatcher
commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status
);
3216 RegexMatcher
flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status
);
3218 RegexMatcher
lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString
, 0, status
);
3219 UnicodeString testPattern
; // The pattern for test from the test file.
3220 UnicodeString testFlags
; // the flags for a test.
3221 UnicodeString matchString
; // The marked up string to be used as input
3223 if (U_FAILURE(status
)){
3224 dataerrln("Construct RegexMatcher() error - %s", u_errorName(status
));
3230 // Loop over the test data file, once per line.
3232 while (lineMat
.find()) {
3234 if (U_FAILURE(status
)) {
3235 errln("%s:%d: ICU Error \"%s\"", srcPath
, lineNum
, u_errorName(status
));
3238 status
= U_ZERO_ERROR
;
3239 UnicodeString testLine
= lineMat
.group(1, status
);
3240 if (testLine
.length() == 0) {
3245 // Parse the test line. Skip blank and comment only lines.
3246 // Separate out the three main fields - pattern, flags, target.
3249 commentMat
.reset(testLine
);
3250 if (commentMat
.lookingAt(status
)) {
3251 // This line is a comment, or blank.
3256 // Pull out the pattern field, remove it from the test file line.
3258 quotedStuffMat
.reset(testLine
);
3259 if (quotedStuffMat
.lookingAt(status
)) {
3260 testPattern
= quotedStuffMat
.group(2, status
);
3261 testLine
.remove(0, quotedStuffMat
.end(0, status
));
3263 errln("Bad pattern (missing quotes?) at %s:%d", srcPath
, lineNum
);
3269 // Pull out the flags from the test file line.
3271 flagsMat
.reset(testLine
);
3272 flagsMat
.lookingAt(status
); // Will always match, possibly an empty string.
3273 testFlags
= flagsMat
.group(1, status
);
3274 if (flagsMat
.group(2, status
).length() > 0) {
3275 errln("Bad Match flag at line %d. Scanning %c\n",
3276 lineNum
, flagsMat
.group(2, status
).charAt(0));
3279 testLine
.remove(0, flagsMat
.end(0, status
));
3282 // Pull out the match string, as a whole.
3283 // We'll process the <tags> later.
3285 quotedStuffMat
.reset(testLine
);
3286 if (quotedStuffMat
.lookingAt(status
)) {
3287 matchString
= quotedStuffMat
.group(2, status
);
3288 testLine
.remove(0, quotedStuffMat
.end(0, status
));
3290 errln("Bad match string at test file line %d", lineNum
);
3295 // The only thing left from the input line should be an optional trailing comment.
3297 commentMat
.reset(testLine
);
3298 if (commentMat
.lookingAt(status
) == FALSE
) {
3299 errln("Line %d: unexpected characters at end of test line.", lineNum
);
3306 regex_find(testPattern
, testFlags
, matchString
, srcPath
, lineNum
);
3315 //---------------------------------------------------------------------------
3317 // regex_find(pattern, flags, inputString, lineNumber)
3319 // Function to run a single test from the Extended (data driven) tests.
3320 // See file test/testdata/regextst.txt for a description of the
3321 // pattern and inputString fields, and the allowed flags.
3322 // lineNumber is the source line in regextst.txt of the test.
3324 //---------------------------------------------------------------------------
3327 // Set a value into a UVector at position specified by a decimal number in
3328 // a UnicodeString. This is a utility function needed by the actual test function, regex_find().
3330 static void set(UVector
&vec
, int32_t val
, UnicodeString index
) {
3331 UErrorCode status
=U_ZERO_ERROR
;
3333 for (int32_t i
=0; i
<index
.length(); i
++) {
3334 int32_t d
=u_charDigitValue(index
.charAt(i
));
3338 while (vec
.size()<idx
+1) {vec
.addElement(-1, status
);}
3339 vec
.setElementAt(val
, idx
);
3342 static void setInt(UVector
&vec
, int32_t val
, int32_t idx
) {
3343 UErrorCode status
=U_ZERO_ERROR
;
3344 while (vec
.size()<idx
+1) {vec
.addElement(-1, status
);}
3345 vec
.setElementAt(val
, idx
);
3348 static UBool
utextOffsetToNative(UText
*utext
, int32_t unistrOffset
, int32_t& nativeIndex
)
3350 UBool couldFind
= TRUE
;
3351 UTEXT_SETNATIVEINDEX(utext
, 0);
3353 while (i
< unistrOffset
) {
3354 UChar32 c
= UTEXT_NEXT32(utext
);
3355 if (c
!= U_SENTINEL
) {
3362 nativeIndex
= (int32_t)UTEXT_GETNATIVEINDEX(utext
);
3367 void RegexTest::regex_find(const UnicodeString
&pattern
,
3368 const UnicodeString
&flags
,
3369 const UnicodeString
&inputString
,
3370 const char *srcPath
,
3372 UnicodeString unEscapedInput
;
3373 UnicodeString deTaggedInput
;
3375 int32_t patternUTF8Length
, inputUTF8Length
;
3376 char *patternChars
= NULL
, *inputChars
= NULL
;
3377 UText patternText
= UTEXT_INITIALIZER
;
3378 UText inputText
= UTEXT_INITIALIZER
;
3379 UConverter
*UTF8Converter
= NULL
;
3381 UErrorCode status
= U_ZERO_ERROR
;
3383 RegexPattern
*parsePat
= NULL
;
3384 RegexMatcher
*parseMatcher
= NULL
;
3385 RegexPattern
*callerPattern
= NULL
, *UTF8Pattern
= NULL
;
3386 RegexMatcher
*matcher
= NULL
, *UTF8Matcher
= NULL
;
3387 UVector
groupStarts(status
);
3388 UVector
groupEnds(status
);
3389 UVector
groupStartsUTF8(status
);
3390 UVector
groupEndsUTF8(status
);
3391 UBool isMatch
= FALSE
, isUTF8Match
= FALSE
;
3392 UBool failed
= FALSE
;
3395 UBool useMatchesFunc
= FALSE
;
3396 UBool useLookingAtFunc
= FALSE
;
3397 int32_t regionStart
= -1;
3398 int32_t regionEnd
= -1;
3399 int32_t regionStartUTF8
= -1;
3400 int32_t regionEndUTF8
= -1;
3404 // Compile the caller's pattern
3406 uint32_t bflags
= 0;
3407 if (flags
.indexOf((UChar
)0x69) >= 0) { // 'i' flag
3408 bflags
|= UREGEX_CASE_INSENSITIVE
;
3410 if (flags
.indexOf((UChar
)0x78) >= 0) { // 'x' flag
3411 bflags
|= UREGEX_COMMENTS
;
3413 if (flags
.indexOf((UChar
)0x73) >= 0) { // 's' flag
3414 bflags
|= UREGEX_DOTALL
;
3416 if (flags
.indexOf((UChar
)0x6d) >= 0) { // 'm' flag
3417 bflags
|= UREGEX_MULTILINE
;
3420 if (flags
.indexOf((UChar
)0x65) >= 0) { // 'e' flag
3421 bflags
|= UREGEX_ERROR_ON_UNKNOWN_ESCAPES
;
3423 if (flags
.indexOf((UChar
)0x44) >= 0) { // 'D' flag
3424 bflags
|= UREGEX_UNIX_LINES
;
3426 if (flags
.indexOf((UChar
)0x51) >= 0) { // 'Q' flag
3427 bflags
|= UREGEX_LITERAL
;
3431 callerPattern
= RegexPattern::compile(pattern
, bflags
, pe
, status
);
3432 if (status
!= U_ZERO_ERROR
) {
3433 #if UCONFIG_NO_BREAK_ITERATION==1
3434 // 'v' test flag means that the test pattern should not compile if ICU was configured
3435 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3436 if (flags
.indexOf((UChar
)0x76) >= 0 /*'v'*/ && status
== U_UNSUPPORTED_ERROR
) {
3437 goto cleanupAndReturn
;
3440 if (flags
.indexOf((UChar
)0x45) >= 0) { // flags contain 'E'
3441 // Expected pattern compilation error.
3442 if (flags
.indexOf((UChar
)0x64) >= 0) { // flags contain 'd'
3443 logln("Pattern Compile returns \"%s\"", u_errorName(status
));
3445 goto cleanupAndReturn
;
3447 // Unexpected pattern compilation error.
3448 dataerrln("Line %d: error %s compiling pattern.", line
, u_errorName(status
));
3449 goto cleanupAndReturn
;
3453 UTF8Converter
= ucnv_open("UTF8", &status
);
3454 ucnv_setFromUCallBack(UTF8Converter
, UCNV_FROM_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &status
);
3456 patternUTF8Length
= pattern
.extract(NULL
, 0, UTF8Converter
, status
);
3457 status
= U_ZERO_ERROR
; // buffer overflow
3458 patternChars
= new char[patternUTF8Length
+1];
3459 pattern
.extract(patternChars
, patternUTF8Length
+1, UTF8Converter
, status
);
3460 utext_openUTF8(&patternText
, patternChars
, patternUTF8Length
, &status
);
3462 if (status
== U_ZERO_ERROR
) {
3463 UTF8Pattern
= RegexPattern::compile(&patternText
, bflags
, pe
, status
);
3465 if (status
!= U_ZERO_ERROR
) {
3466 #if UCONFIG_NO_BREAK_ITERATION==1
3467 // 'v' test flag means that the test pattern should not compile if ICU was configured
3468 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3469 if (flags
.indexOf((UChar
)0x76) >= 0 /*'v'*/ && status
== U_UNSUPPORTED_ERROR
) {
3470 goto cleanupAndReturn
;
3473 if (flags
.indexOf((UChar
)0x45) >= 0) { // flags contain 'E'
3474 // Expected pattern compilation error.
3475 if (flags
.indexOf((UChar
)0x64) >= 0) { // flags contain 'd'
3476 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status
));
3478 goto cleanupAndReturn
;
3480 // Unexpected pattern compilation error.
3481 errln("Line %d: error %s compiling pattern. (UTF8)", line
, u_errorName(status
));
3482 goto cleanupAndReturn
;
3487 if (UTF8Pattern
== NULL
) {
3488 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3489 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath
, line
);
3490 status
= U_ZERO_ERROR
;
3493 if (flags
.indexOf((UChar
)0x64) >= 0) { // 'd' flag
3494 callerPattern
->dumpPattern();
3497 if (flags
.indexOf((UChar
)0x45) >= 0) { // 'E' flag
3498 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath
, line
);
3499 goto cleanupAndReturn
;
3504 // Number of times find() should be called on the test string, default to 1
3507 for (i
=2; i
<=9; i
++) {
3508 if (flags
.indexOf((UChar
)(0x30 + i
)) >= 0) { // digit flag
3509 if (numFinds
!= 1) {
3510 errln("Line %d: more than one digit flag. Scanning %d.", line
, i
);
3511 goto cleanupAndReturn
;
3517 // 'M' flag. Use matches() instead of find()
3518 if (flags
.indexOf((UChar
)0x4d) >= 0) {
3519 useMatchesFunc
= TRUE
;
3521 if (flags
.indexOf((UChar
)0x4c) >= 0) {
3522 useLookingAtFunc
= TRUE
;
3526 // Find the tags in the input data, remove them, and record the group boundary
3529 parsePat
= RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe
, status
);
3530 REGEX_CHECK_STATUS_L(line
);
3532 unEscapedInput
= inputString
.unescape();
3533 parseMatcher
= parsePat
->matcher(unEscapedInput
, status
);
3534 REGEX_CHECK_STATUS_L(line
);
3535 while(parseMatcher
->find()) {
3536 parseMatcher
->appendReplacement(deTaggedInput
, "", status
);
3538 UnicodeString groupNum
= parseMatcher
->group(2, status
);
3539 if (groupNum
== "r") {
3540 // <r> or </r>, a region specification within the string
3541 if (parseMatcher
->group(1, status
) == "/") {
3542 regionEnd
= deTaggedInput
.length();
3544 regionStart
= deTaggedInput
.length();
3547 // <digits> or </digits>, a group match boundary tag.
3548 if (parseMatcher
->group(1, status
) == "/") {
3549 set(groupEnds
, deTaggedInput
.length(), groupNum
);
3551 set(groupStarts
, deTaggedInput
.length(), groupNum
);
3555 parseMatcher
->appendTail(deTaggedInput
);
3556 REGEX_ASSERT_L(groupStarts
.size() == groupEnds
.size(), line
);
3557 if ((regionStart
>=0 || regionEnd
>=0) && (regionStart
<0 || regionStart
>regionEnd
)) {
3558 errln("mismatched <r> tags");
3560 goto cleanupAndReturn
;
3564 // Configure the matcher according to the flags specified with this test.
3566 matcher
= callerPattern
->matcher(deTaggedInput
, status
);
3567 REGEX_CHECK_STATUS_L(line
);
3568 if (flags
.indexOf((UChar
)0x74) >= 0) { // 't' trace flag
3569 matcher
->setTrace(TRUE
);
3572 if (UTF8Pattern
!= NULL
) {
3573 inputUTF8Length
= deTaggedInput
.extract(NULL
, 0, UTF8Converter
, status
);
3574 status
= U_ZERO_ERROR
; // buffer overflow
3575 inputChars
= new char[inputUTF8Length
+1];
3576 deTaggedInput
.extract(inputChars
, inputUTF8Length
+1, UTF8Converter
, status
);
3577 utext_openUTF8(&inputText
, inputChars
, inputUTF8Length
, &status
);
3579 if (status
== U_ZERO_ERROR
) {
3580 UTF8Matcher
= &UTF8Pattern
->matcher(status
)->reset(&inputText
);
3581 REGEX_CHECK_STATUS_L(line
);
3584 if (UTF8Matcher
== NULL
) {
3585 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3586 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath
, line
);
3587 status
= U_ZERO_ERROR
;
3592 // Generate native indices for UTF8 versions of region and capture group info
3594 if (UTF8Matcher
!= NULL
) {
3595 if (flags
.indexOf((UChar
)0x74) >= 0) { // 't' trace flag
3596 UTF8Matcher
->setTrace(TRUE
);
3598 if (regionStart
>=0) (void) utextOffsetToNative(&inputText
, regionStart
, regionStartUTF8
);
3599 if (regionEnd
>=0) (void) utextOffsetToNative(&inputText
, regionEnd
, regionEndUTF8
);
3601 // Fill out the native index UVector info.
3602 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3603 for (i
=0; i
<groupStarts
.size(); i
++) {
3604 int32_t start
= groupStarts
.elementAti(i
);
3605 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3608 if (!utextOffsetToNative(&inputText
, start
, startUTF8
)) {
3609 errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line
, i
, start
);
3611 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3613 setInt(groupStartsUTF8
, startUTF8
, i
);
3616 int32_t end
= groupEnds
.elementAti(i
);
3617 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3620 if (!utextOffsetToNative(&inputText
, end
, endUTF8
)) {
3621 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line
, i
, end
);
3623 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3625 setInt(groupEndsUTF8
, endUTF8
, i
);
3630 if (regionStart
>=0) {
3631 matcher
->region(regionStart
, regionEnd
, status
);
3632 REGEX_CHECK_STATUS_L(line
);
3633 if (UTF8Matcher
!= NULL
) {
3634 UTF8Matcher
->region(regionStartUTF8
, regionEndUTF8
, status
);
3635 REGEX_CHECK_STATUS_L(line
);
3638 if (flags
.indexOf((UChar
)0x61) >= 0) { // 'a' anchoring bounds flag
3639 matcher
->useAnchoringBounds(FALSE
);
3640 if (UTF8Matcher
!= NULL
) {
3641 UTF8Matcher
->useAnchoringBounds(FALSE
);
3644 if (flags
.indexOf((UChar
)0x62) >= 0) { // 'b' transparent bounds flag
3645 matcher
->useTransparentBounds(TRUE
);
3646 if (UTF8Matcher
!= NULL
) {
3647 UTF8Matcher
->useTransparentBounds(TRUE
);
3654 // Do a find on the de-tagged input using the caller's pattern
3655 // TODO: error on count>1 and not find().
3656 // error on both matches() and lookingAt().
3658 for (i
=0; i
<numFinds
; i
++) {
3659 if (useMatchesFunc
) {
3660 isMatch
= matcher
->matches(status
);
3661 if (UTF8Matcher
!= NULL
) {
3662 isUTF8Match
= UTF8Matcher
->matches(status
);
3664 } else if (useLookingAtFunc
) {
3665 isMatch
= matcher
->lookingAt(status
);
3666 if (UTF8Matcher
!= NULL
) {
3667 isUTF8Match
= UTF8Matcher
->lookingAt(status
);
3670 isMatch
= matcher
->find();
3671 if (UTF8Matcher
!= NULL
) {
3672 isUTF8Match
= UTF8Matcher
->find();
3676 matcher
->setTrace(FALSE
);
3678 UTF8Matcher
->setTrace(FALSE
);
3680 if (U_FAILURE(status
)) {
3681 errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status
));
3685 // Match up the groups from the find() with the groups from the tags
3688 // number of tags should match number of groups from find operation.
3689 // matcher->groupCount does not include group 0, the entire match, hence the +1.
3690 // G option in test means that capture group data is not available in the
3691 // expected results, so the check needs to be suppressed.
3692 if (isMatch
== FALSE
&& groupStarts
.size() != 0) {
3693 dataerrln("Error at line %d: Match expected, but none found.", line
);
3695 goto cleanupAndReturn
;
3696 } else if (UTF8Matcher
!= NULL
&& isUTF8Match
== FALSE
&& groupStarts
.size() != 0) {
3697 errln("Error at line %d: Match expected, but none found. (UTF8)", line
);
3699 goto cleanupAndReturn
;
3701 if (isMatch
&& groupStarts
.size() == 0) {
3702 errln("Error at line %d: No match expected, but one found at position %d.", line
, matcher
->start(status
));
3705 if (UTF8Matcher
&& isUTF8Match
&& groupStarts
.size() == 0) {
3706 errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line
, UTF8Matcher
->start(status
));
3710 if (flags
.indexOf((UChar
)0x47 /*G*/) >= 0) {
3711 // Only check for match / no match. Don't check capture groups.
3712 goto cleanupAndReturn
;
3715 REGEX_CHECK_STATUS_L(line
);
3716 for (i
=0; i
<=matcher
->groupCount(); i
++) {
3717 int32_t expectedStart
= (i
>= groupStarts
.size()? -1 : groupStarts
.elementAti(i
));
3718 int32_t expectedStartUTF8
= (i
>= groupStartsUTF8
.size()? -1 : groupStartsUTF8
.elementAti(i
));
3719 if (matcher
->start(i
, status
) != expectedStart
) {
3720 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
3721 line
, i
, expectedStart
, matcher
->start(i
, status
));
3723 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3724 } else if (UTF8Matcher
!= NULL
&& UTF8Matcher
->start(i
, status
) != expectedStartUTF8
) {
3725 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
3726 line
, i
, expectedStartUTF8
, UTF8Matcher
->start(i
, status
));
3728 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3731 int32_t expectedEnd
= (i
>= groupEnds
.size()? -1 : groupEnds
.elementAti(i
));
3732 int32_t expectedEndUTF8
= (i
>= groupEndsUTF8
.size()? -1 : groupEndsUTF8
.elementAti(i
));
3733 if (matcher
->end(i
, status
) != expectedEnd
) {
3734 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
3735 line
, i
, expectedEnd
, matcher
->end(i
, status
));
3737 // Error on end position; keep going; real error is probably yet to come as group
3738 // end positions work from end of the input data towards the front.
3739 } else if (UTF8Matcher
!= NULL
&& UTF8Matcher
->end(i
, status
) != expectedEndUTF8
) {
3740 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
3741 line
, i
, expectedEndUTF8
, UTF8Matcher
->end(i
, status
));
3743 // Error on end position; keep going; real error is probably yet to come as group
3744 // end positions work from end of the input data towards the front.
3747 if ( matcher
->groupCount()+1 < groupStarts
.size()) {
3748 errln("Error at line %d: Expected %d capture groups, found %d.",
3749 line
, groupStarts
.size()-1, matcher
->groupCount());
3752 else if (UTF8Matcher
!= NULL
&& UTF8Matcher
->groupCount()+1 < groupStarts
.size()) {
3753 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3754 line
, groupStarts
.size()-1, UTF8Matcher
->groupCount());
3758 if ((flags
.indexOf((UChar
)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3759 matcher
->requireEnd() == TRUE
) {
3760 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line
);
3762 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3763 UTF8Matcher
->requireEnd() == TRUE
) {
3764 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line
);
3768 if ((flags
.indexOf((UChar
)0x79) >= 0) && // 'y' flag: RequireEnd() == true
3769 matcher
->requireEnd() == FALSE
) {
3770 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line
);
3772 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x79) >= 0) && // 'Y' flag: RequireEnd() == false
3773 UTF8Matcher
->requireEnd() == FALSE
) {
3774 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line
);
3778 if ((flags
.indexOf((UChar
)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3779 matcher
->hitEnd() == TRUE
) {
3780 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line
);
3782 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3783 UTF8Matcher
->hitEnd() == TRUE
) {
3784 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line
);
3788 if ((flags
.indexOf((UChar
)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3789 matcher
->hitEnd() == FALSE
) {
3790 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line
);
3792 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3793 UTF8Matcher
->hitEnd() == FALSE
) {
3794 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line
);
3801 infoln((UnicodeString
)"\""+pattern
+(UnicodeString
)"\" "
3802 +flags
+(UnicodeString
)" \""+inputString
+(UnicodeString
)"\"");
3803 // callerPattern->dump();
3805 delete parseMatcher
;
3810 delete callerPattern
;
3812 utext_close(&inputText
);
3813 delete[] inputChars
;
3814 utext_close(&patternText
);
3815 delete[] patternChars
;
3816 ucnv_close(UTF8Converter
);
3822 //---------------------------------------------------------------------------
3824 // Errors Check for error handling in patterns.
3826 //---------------------------------------------------------------------------
// Checks that malformed patterns are rejected with the expected error.
// Each REGEX_ERR line names a pattern followed by two integers and a
// UErrorCode.
// NOTE(review): the macro is defined outside this chunk; the integers look
// like the expected error line and column ("abc\ndef(*2)" expects 2, 5) -
// confirm against the REGEX_ERR definition.
// The order of the checks is the order in which any failures get reported.
3827 void RegexTest::Errors() {
3828 // \escape sequences that aren't implemented yet.
3829 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3831 // Missing close parentheses
3832 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN
);
3833 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN
);
3834 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN
);
3836 // Extra close paren
3837 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN
);
3838 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN
);
3839 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN
);
3841 // Look-ahead, Look-behind
3842 // TODO: add tests for unbounded length look-behinds.
3843 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX
); // illegal construct
3845 // Attempt to use non-default flags
// This check calls RegexPattern::compile() directly instead of going through
// REGEX_ERR, because it passes a flags value that includes UREGEX_CANON_EQ;
// the compile is expected to report U_REGEX_UNIMPLEMENTED.
3848 UErrorCode status
= U_ZERO_ERROR
;
3849 int32_t flags
= UREGEX_CANON_EQ
|
3850 UREGEX_COMMENTS
| UREGEX_DOTALL
|
3852 RegexPattern
*pat1
= RegexPattern::compile(".*", flags
, pe
, status
);
3853 REGEX_ASSERT(status
== U_REGEX_UNIMPLEMENTED
);
3858 // Quantifiers are allowed only after something that can be quantified.
3859 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX
);
3860 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX
);
3861 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX
);
3863 // Mal-formed {min,max} quantifiers
3864 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL
);
3865 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN
);
3866 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL
);
3867 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL
);
3868 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL
);
3869 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG
);
3870 REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG
); // Overflows int during scan
3871 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG
); // Overflows regex binary format
3872 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG
);
// A quantifier as the very first character of the pattern.
3875 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX
);
3877 // Invalid Back Reference \0
3878 // For ICU 3.8 and earlier
3879 // For ICU versions newer than 3.8, \0 introduces an octal escape.
3881 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE
);
3886 //-------------------------------------------------------------------------------
3888 // Read a text data file, convert it to UChars, and return the data
3889 // in one big UChar * buffer, which the caller must delete.
3891 //--------------------------------------------------------------------------------
3892 UChar
*RegexTest::ReadAndConvertFile(const char *fileName
, int32_t &ulen
,
3893 const char *defEncoding
, UErrorCode
&status
) {
3894 UChar
*retPtr
= NULL
;
3895 char *fileBuf
= NULL
;
3896 UConverter
* conv
= NULL
;
3900 if (U_FAILURE(status
)) {
3907 f
= fopen(fileName
, "rb");
3909 dataerrln("Error opening test data file %s\n", fileName
);
3910 status
= U_FILE_ACCESS_ERROR
;
3919 fseek( f
, 0, SEEK_END
);
3920 fileSize
= ftell(f
);
3921 fileBuf
= new char[fileSize
];
3922 fseek(f
, 0, SEEK_SET
);
3923 amt_read
= fread(fileBuf
, 1, fileSize
, f
);
3924 if (amt_read
!= fileSize
|| fileSize
<= 0) {
3925 errln("Error reading test data file.");
3926 goto cleanUpAndReturn
;
3930 // Look for a Unicode Signature (BOM) on the data just read
3932 int32_t signatureLength
;
3933 const char * fileBufC
;
3934 const char* encoding
;
3937 encoding
= ucnv_detectUnicodeSignature(
3938 fileBuf
, fileSize
, &signatureLength
, &status
);
3939 if(encoding
!=NULL
){
3940 fileBufC
+= signatureLength
;
3941 fileSize
-= signatureLength
;
3943 encoding
= defEncoding
;
3944 if (strcmp(encoding
, "utf-8") == 0) {
3945 errln("file %s is missing its BOM", fileName
);
3950 // Open a converter to take the rule file to UTF-16
3952 conv
= ucnv_open(encoding
, &status
);
3953 if (U_FAILURE(status
)) {
3954 goto cleanUpAndReturn
;
3958 // Convert the rules to UChar.
3959 // Preflight first to determine required buffer size.
3961 ulen
= ucnv_toUChars(conv
,
3967 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
3968 // Buffer Overflow is expected from the preflight operation.
3969 status
= U_ZERO_ERROR
;
3971 retPtr
= new UChar
[ulen
+1];
3984 if (U_FAILURE(status
)) {
3985 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
3994 //-------------------------------------------------------------------------------
3996 // PerlTests - Run Perl's regular expression tests
3997 // The input file for this test is re_tests, the standard regular
3998 // expression test data distributed with the Perl source code.
4000 // Here is Perl's description of the test data file:
4002 // # The tests are in a separate file 't/op/re_tests'.
4003 // # Each line in that file is a separate test.
4004 // # There are five columns, separated by tabs.
4006 // # Column 1 contains the pattern, optionally enclosed in C<''>.
4007 // # Modifiers can be put after the closing C<'>.
4009 // # Column 2 contains the string to be matched.
4011 // # Column 3 contains the expected result:
4012 // # y expect a match
4013 // # n expect no match
4014 // # c expect an error
4015 // # B test exposes a known bug in Perl, should be skipped
4016 // # b test exposes a known bug in Perl, should be skipped if noamp
4018 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
4020 // # Column 4 contains a string, usually C<$&>.
4022 // # Column 5 contains the expected result of double-quote
4023 // # interpolating that string after the match, or start of error message.
4025 // # Column 6, if present, contains a reason why the test is skipped.
4026 // # This is printed with "skipped", for harness to pick up.
4028 // # \n in the tests are interpolated, as are variables of the form ${\w+}.
4030 // # If you want to add a regular expression test that can't be expressed
4031 // # in this format, don't add it here: put it in op/pat.t instead.
4033 // For ICU, if field 3 contains an 'i', the test will be skipped.
4034 // The test exposes some known incompatibility between ICU and Perl regexps.
4035 // (The i is in addition to whatever was there before.)
4037 //-------------------------------------------------------------------------------
4038 void RegexTest::PerlTests() {
4040 const char *srcPath
;
4041 UErrorCode status
= U_ZERO_ERROR
;
4045 // Open and read the test data file.
4047 srcPath
=getPath(tdd
, "re_tests.txt");
4049 return; /* something went wrong, error already output */
4053 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, "iso-8859-1", status
);
4054 if (U_FAILURE(status
)) {
4055 return; /* something went wrong, error already output */
4059 // Put the test data into a UnicodeString
4061 UnicodeString
testDataString(FALSE
, testData
, len
);
4064 // Regex to break the input file into lines, and strip the new lines.
4065 // One line per match, capture group one is the desired data.
4067 RegexPattern
* linePat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe
, status
);
4068 if (U_FAILURE(status
)) {
4069 dataerrln("RegexPattern::compile() error");
4072 RegexMatcher
* lineMat
= linePat
->matcher(testDataString
, status
);
4075 // Regex to split a test file line into fields.
4076 // There are six fields, separated by tabs.
4078 RegexPattern
* fieldPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe
, status
);
4081 // Regex to identify test patterns with flag settings, and to separate them.
4082 // Test patterns with flags look like 'pattern'i
4083 // Test patterns without flags are not quoted: pattern
4084 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4086 RegexPattern
*flagPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe
, status
);
4087 RegexMatcher
* flagMat
= flagPat
->matcher(status
);
4090 // The Perl tests reference several perl-isms, which are evaluated/substituted
4091 // in the test data. Not being perl, this must be done explicitly. Here
4092 // are string constants and REs for these constructs.
4094 UnicodeString
nulnulSrc("${nulnul}");
4095 UnicodeString
nulnul("\\u0000\\u0000", -1, US_INV
);
4096 nulnul
= nulnul
.unescape();
4098 UnicodeString
ffffSrc("${ffff}");
4099 UnicodeString
ffff("\\uffff", -1, US_INV
);
4100 ffff
= ffff
.unescape();
4102 // regexp for $-[0], $+[2], etc.
4103 RegexPattern
*groupsPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe
, status
);
4104 RegexMatcher
*groupsMat
= groupsPat
->matcher(status
);
4106 // regexp for $0, $1, $2, etc.
4107 RegexPattern
*cgPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe
, status
);
4108 RegexMatcher
*cgMat
= cgPat
->matcher(status
);
4112 // Main Loop for the Perl Tests, runs once per line from the
4115 int32_t lineNum
= 0;
4116 int32_t skippedUnimplementedCount
= 0;
4117 while (lineMat
->find()) {
4121 // Get a line, break it into its fields, do the Perl
4122 // variable substitutions.
4124 UnicodeString line
= lineMat
->group(1, status
);
4125 UnicodeString fields
[7];
4126 fieldPat
->split(line
, fields
, 7, status
);
4128 flagMat
->reset(fields
[0]);
4129 flagMat
->matches(status
);
4130 UnicodeString pattern
= flagMat
->group(2, status
);
4131 pattern
.findAndReplace("${bang}", "!");
4132 pattern
.findAndReplace(nulnulSrc
, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4133 pattern
.findAndReplace(ffffSrc
, ffff
);
4136 // Identify patterns that include match flag settings,
4137 // split off the flags, remove the extra quotes.
4139 UnicodeString flagStr
= flagMat
->group(3, status
);
4140 if (U_FAILURE(status
)) {
4141 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
4145 const UChar UChar_c
= 0x63; // Char constants for the flag letters.
4146 const UChar UChar_i
= 0x69; // (Damn the lack of Unicode support in C)
4147 const UChar UChar_m
= 0x6d;
4148 const UChar UChar_x
= 0x78;
4149 const UChar UChar_y
= 0x79;
4150 if (flagStr
.indexOf(UChar_i
) != -1) {
4151 flags
|= UREGEX_CASE_INSENSITIVE
;
4153 if (flagStr
.indexOf(UChar_m
) != -1) {
4154 flags
|= UREGEX_MULTILINE
;
4156 if (flagStr
.indexOf(UChar_x
) != -1) {
4157 flags
|= UREGEX_COMMENTS
;
4161 // Compile the test pattern.
4163 status
= U_ZERO_ERROR
;
4164 RegexPattern
*testPat
= RegexPattern::compile(pattern
, flags
, pe
, status
);
4165 if (status
== U_REGEX_UNIMPLEMENTED
) {
4167 // Test of a feature that is planned for ICU, but not yet implemented.
4169 skippedUnimplementedCount
++;
4171 status
= U_ZERO_ERROR
;
4175 if (U_FAILURE(status
)) {
4176 // Some tests are supposed to generate errors.
4177 // Only report an error for tests that are supposed to succeed.
4178 if (fields
[2].indexOf(UChar_c
) == -1 && // Compilation is not supposed to fail AND
4179 fields
[2].indexOf(UChar_i
) == -1) // it's not an accepted ICU incompatibility
4181 errln("line %d: ICU Error \"%s\"\n", lineNum
, u_errorName(status
));
4183 status
= U_ZERO_ERROR
;
4188 if (fields
[2].indexOf(UChar_i
) >= 0) {
4189 // ICU should skip this test.
4194 if (fields
[2].indexOf(UChar_c
) >= 0) {
4195 // This pattern should have caused a compilation error, but didn't/
4196 errln("line %d: Expected a pattern compile error, got success.", lineNum
);
4202 // replace the Perl variables that appear in some of the
4203 // match data strings.
4205 UnicodeString matchString
= fields
[1];
4206 matchString
.findAndReplace(nulnulSrc
, nulnul
);
4207 matchString
.findAndReplace(ffffSrc
, ffff
);
4209 // Replace any \n in the match string with an actual new-line char.
4210 // Don't do full unescape, as this unescapes more than Perl does, which
4211 // causes other spurious failures in the tests.
4212 matchString
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4217 // Run the test, check for expected match/don't match result.
4219 RegexMatcher
*testMat
= testPat
->matcher(matchString
, status
);
4220 UBool found
= testMat
->find();
4221 UBool expected
= FALSE
;
4222 if (fields
[2].indexOf(UChar_y
) >=0) {
4225 if (expected
!= found
) {
4226 errln("line %d: Expected %smatch, got %smatch",
4227 lineNum
, expected
?"":"no ", found
?"":"no " );
4231 // Don't try to check expected results if there is no match.
4232 // (Some have stuff in the expected fields)
4240 // Interpret the Perl expression from the fourth field of the data file,
4241 // building up an ICU string from the results of the ICU match.
4242 // The Perl expression will contain references to the results of
4243 // a regex match, including the matched string, capture group strings,
4244 // group starting and ending indicies, etc.
4246 UnicodeString resultString
;
4247 UnicodeString perlExpr
= fields
[3];
4248 #if SUPPORT_MUTATING_INPUT_STRING
4249 groupsMat
->reset(perlExpr
);
4250 cgMat
->reset(perlExpr
);
4253 while (perlExpr
.length() > 0) {
4254 #if !SUPPORT_MUTATING_INPUT_STRING
4255 // Perferred usage. Reset after any modification to input string.
4256 groupsMat
->reset(perlExpr
);
4257 cgMat
->reset(perlExpr
);
4260 if (perlExpr
.startsWith("$&")) {
4261 resultString
.append(testMat
->group(status
));
4262 perlExpr
.remove(0, 2);
4265 else if (groupsMat
->lookingAt(status
)) {
4267 UnicodeString digitString
= groupsMat
->group(2, status
);
4269 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4270 UnicodeString plusOrMinus
= groupsMat
->group(1, status
);
4271 int32_t matchPosition
;
4272 if (plusOrMinus
.compare("+") == 0) {
4273 matchPosition
= testMat
->end(groupNum
, status
);
4275 matchPosition
= testMat
->start(groupNum
, status
);
4277 if (matchPosition
!= -1) {
4278 ICU_Utility::appendNumber(resultString
, matchPosition
);
4280 perlExpr
.remove(0, groupsMat
->end(status
));
4283 else if (cgMat
->lookingAt(status
)) {
4285 UnicodeString digitString
= cgMat
->group(1, status
);
4287 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4288 if (U_SUCCESS(status
)) {
4289 resultString
.append(testMat
->group(groupNum
, status
));
4290 status
= U_ZERO_ERROR
;
4292 perlExpr
.remove(0, cgMat
->end(status
));
4295 else if (perlExpr
.startsWith("@-")) {
4297 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4299 resultString
.append(" ");
4301 ICU_Utility::appendNumber(resultString
, testMat
->start(i
, status
));
4303 perlExpr
.remove(0, 2);
4306 else if (perlExpr
.startsWith("@+")) {
4308 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4310 resultString
.append(" ");
4312 ICU_Utility::appendNumber(resultString
, testMat
->end(i
, status
));
4314 perlExpr
.remove(0, 2);
4317 else if (perlExpr
.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4318 // or as an escaped sequence (e.g. \n)
4319 if (perlExpr
.length() > 1) {
4320 perlExpr
.remove(0, 1); // Remove the '\', but only if not last char.
4322 UChar c
= perlExpr
.charAt(0);
4324 case 'n': c
= '\n'; break;
4325 // add any other escape sequences that show up in the test expected results.
4327 resultString
.append(c
);
4328 perlExpr
.remove(0, 1);
4332 // Any characters from the perl expression that we don't explicitly
4333 // recognize before here are assumed to be literals and copied
4334 // as-is to the expected results.
4335 resultString
.append(perlExpr
.charAt(0));
4336 perlExpr
.remove(0, 1);
4339 if (U_FAILURE(status
)) {
4340 errln("Line %d: ICU Error \"%s\"", lineNum
, u_errorName(status
));
4346 // Expected Results Compare
4348 UnicodeString
expectedS(fields
[4]);
4349 expectedS
.findAndReplace(nulnulSrc
, nulnul
);
4350 expectedS
.findAndReplace(ffffSrc
, ffff
);
4351 expectedS
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4354 if (expectedS
.compare(resultString
) != 0) {
4355 err("Line %d: Incorrect perl expression results.", lineNum
);
4356 infoln((UnicodeString
)"Expected \""+expectedS
+(UnicodeString
)"\"; got \""+resultString
+(UnicodeString
)"\"");
4364 // All done. Clean up allocated stuff.
4382 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount
);
4387 //-------------------------------------------------------------------------------
4389 // PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts
4390 // (instead of using UnicodeStrings) to test the alternate engine.
4391 // The input file for this test is re_tests, the standard regular
4392 // expression test data distributed with the Perl source code.
4393 // See PerlTests() for more information.
4395 //-------------------------------------------------------------------------------
// Runs the test cases from re_tests.txt through the UText code path: for each
// line of the data file the pattern and the input string are converted to
// UTF-8, wrapped in a UText, and then compiled / matched. The Perl result
// expression from the data file is interpreted against the ICU match and
// compared with the expected value. Problems are reported with errln()/err();
// the function returns early when the data file cannot be located or read.
4396 void RegexTest::PerlTestsUTF8() {
4398 const char *srcPath
;
4399 UErrorCode status
= U_ZERO_ERROR
;
// Converter used below to turn each pattern and input string into UTF-8.
4401 LocalUConverterPointer
UTF8Converter(ucnv_open("UTF-8", &status
));
4402 UText patternText
= UTEXT_INITIALIZER
;
4403 char *patternChars
= NULL
;
4404 int32_t patternLength
;
4405 int32_t patternCapacity
= 0;
4406 UText inputText
= UTEXT_INITIALIZER
;
4407 char *inputChars
= NULL
;
4408 int32_t inputLength
;
4409 int32_t inputCapacity
= 0;
// Install the STOP from-Unicode callback on the UTF-8 converter.
4411 ucnv_setFromUCallBack(UTF8Converter
.getAlias(), UCNV_FROM_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &status
);
4414 // Open and read the test data file.
4416 srcPath
=getPath(tdd
, "re_tests.txt");
4418 return; /* something went wrong, error already output */
// ReadAndConvertFile is given "iso-8859-1" as the file encoding; the data
// comes back as UChars, and len is used as its length below.
4422 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, "iso-8859-1", status
);
4423 if (U_FAILURE(status
)) {
4424 return; /* something went wrong, error already output */
4428 // Put the test data into a UnicodeString
4430 UnicodeString
testDataString(FALSE
, testData
, len
);
4433 // Regex to break the input file into lines, and strip the new lines.
4434 // One line per match, capture group one is the desired data.
4436 RegexPattern
* linePat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe
, status
);
4437 if (U_FAILURE(status
)) {
4438 dataerrln("RegexPattern::compile() error");
4441 RegexMatcher
* lineMat
= linePat
->matcher(testDataString
, status
);
4444 // Regex to split a test file line into fields.
4445 // There are six fields, separated by tabs.
4447 RegexPattern
* fieldPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe
, status
);
4450 // Regex to identify test patterns with flag settings, and to separate them.
4451 // Test patterns with flags look like 'pattern'i
4452 // Test patterns without flags are not quoted: pattern
4453 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4455 RegexPattern
*flagPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe
, status
);
4456 RegexMatcher
* flagMat
= flagPat
->matcher(status
);
4459 // The Perl tests reference several perl-isms, which are evaluated/substituted
4460 // in the test data. Not being perl, this must be done explicitly. Here
4461 // are string constants and REs for these constructs.
4463 UnicodeString
nulnulSrc("${nulnul}");
4464 UnicodeString
nulnul("\\u0000\\u0000", -1, US_INV
);
4465 nulnul
= nulnul
.unescape();
4467 UnicodeString
ffffSrc("${ffff}");
4468 UnicodeString
ffff("\\uffff", -1, US_INV
);
4469 ffff
= ffff
.unescape();
4471 // regexp for $-[0], $+[2], etc.
4472 RegexPattern
*groupsPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe
, status
);
4473 RegexMatcher
*groupsMat
= groupsPat
->matcher(status
);
4475 // regexp for $0, $1, $2, etc.
4476 RegexPattern
*cgPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe
, status
);
4477 RegexMatcher
*cgMat
= cgPat
->matcher(status
);
4481 // Main Loop for the Perl Tests, runs once per line from the
4484 int32_t lineNum
= 0;
4485 int32_t skippedUnimplementedCount
= 0;
4486 while (lineMat
->find()) {
4490 // Get a line, break it into its fields, do the Perl
4491 // variable substitutions.
4493 UnicodeString line
= lineMat
->group(1, status
);
// Field usage below: fields[0] = pattern (optionally quoted, with flags),
// fields[1] = input string, fields[2] = result letters (c / i / y are
// consulted), fields[3] = Perl result expression, fields[4] = expected value.
4494 UnicodeString fields
[7];
4495 fieldPat
->split(line
, fields
, 7, status
);
4497 flagMat
->reset(fields
[0]);
4498 flagMat
->matches(status
);
4499 UnicodeString pattern
= flagMat
->group(2, status
);
4500 pattern
.findAndReplace("${bang}", "!");
4501 pattern
.findAndReplace(nulnulSrc
, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4502 pattern
.findAndReplace(ffffSrc
, ffff
);
4505 // Identify patterns that include match flag settings,
4506 // split off the flags, remove the extra quotes.
4508 UnicodeString flagStr
= flagMat
->group(3, status
);
// status at this point reflects the regex group()/split()/matches() calls
// made for this line, so the report uses the same "line %d" form as the
// other ICU error reports in this loop.
4509 if (U_FAILURE(status
)) {
4510 errln("line %d: ICU Error \"%s\"\n", lineNum
, u_errorName(status
));
4514 const UChar UChar_c
= 0x63; // Char constants for the flag letters.
4515 const UChar UChar_i
= 0x69; // (Damn the lack of Unicode support in C)
4516 const UChar UChar_m
= 0x6d;
4517 const UChar UChar_x
= 0x78;
4518 const UChar UChar_y
= 0x79;
4519 if (flagStr
.indexOf(UChar_i
) != -1) {
4520 flags
|= UREGEX_CASE_INSENSITIVE
;
4522 if (flagStr
.indexOf(UChar_m
) != -1) {
4523 flags
|= UREGEX_MULTILINE
;
4525 if (flagStr
.indexOf(UChar_x
) != -1) {
4526 flags
|= UREGEX_COMMENTS
;
4530 // Put the pattern in a UTF-8 UText
4532 status
= U_ZERO_ERROR
;
// Two-pass conversion: the first extract() uses whatever buffer is left from
// the previous line; on U_BUFFER_OVERFLOW_ERROR the buffer is replaced by one
// of patternLength + 1 bytes and the conversion is repeated.
4533 patternLength
= pattern
.extract(patternChars
, patternCapacity
, UTF8Converter
.getAlias(), status
);
4534 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
4535 status
= U_ZERO_ERROR
;
4536 delete[] patternChars
;
4537 patternCapacity
= patternLength
+ 1;
4538 patternChars
= new char[patternCapacity
];
4539 pattern
.extract(patternChars
, patternCapacity
, UTF8Converter
.getAlias(), status
);
4541 utext_openUTF8(&patternText
, patternChars
, patternLength
, &status
);
4544 // Compile the test pattern.
4546 RegexPattern
*testPat
= RegexPattern::compile(&patternText
, flags
, pe
, status
);
4547 if (status
== U_REGEX_UNIMPLEMENTED
) {
4549 // Test of a feature that is planned for ICU, but not yet implemented.
4551 skippedUnimplementedCount
++;
4553 status
= U_ZERO_ERROR
;
4557 if (U_FAILURE(status
)) {
4558 // Some tests are supposed to generate errors.
4559 // Only report an error for tests that are supposed to succeed.
4560 if (fields
[2].indexOf(UChar_c
) == -1 && // Compilation is not supposed to fail AND
4561 fields
[2].indexOf(UChar_i
) == -1) // it's not an accepted ICU incompatibility
4563 errln("line %d: ICU Error \"%s\"\n", lineNum
, u_errorName(status
));
4565 status
= U_ZERO_ERROR
;
4570 if (fields
[2].indexOf(UChar_i
) >= 0) {
4571 // ICU should skip this test.
4576 if (fields
[2].indexOf(UChar_c
) >= 0) {
4577 // This pattern should have caused a compilation error, but didn't.
4578 errln("line %d: Expected a pattern compile error, got success.", lineNum
);
4585 // replace the Perl variables that appear in some of the
4586 // match data strings.
4588 UnicodeString matchString
= fields
[1];
4589 matchString
.findAndReplace(nulnulSrc
, nulnul
);
4590 matchString
.findAndReplace(ffffSrc
, ffff
);
4592 // Replace any \n in the match string with an actual new-line char.
4593 // Don't do full unescape, as this unescapes more than Perl does, which
4594 // causes other spurious failures in the tests.
4595 matchString
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4598 // Put the input in a UTF-8 UText
4600 status
= U_ZERO_ERROR
;
// Same two-pass conversion as used for the pattern above.
4601 inputLength
= matchString
.extract(inputChars
, inputCapacity
, UTF8Converter
.getAlias(), status
);
4602 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
4603 status
= U_ZERO_ERROR
;
4604 delete[] inputChars
;
4605 inputCapacity
= inputLength
+ 1;
4606 inputChars
= new char[inputCapacity
];
4607 matchString
.extract(inputChars
, inputCapacity
, UTF8Converter
.getAlias(), status
);
4609 utext_openUTF8(&inputText
, inputChars
, inputLength
, &status
);
4612 // Run the test, check for expected match/don't match result.
// The matcher is created without input and then pointed at the UText; the
// address of reset()'s result is what is kept in testMat.
4614 RegexMatcher
*testMat
= &testPat
->matcher(status
)->reset(&inputText
);
4615 UBool found
= testMat
->find();
4616 UBool expected
= FALSE
;
// A 'y' in fields[2] is the marker consulted for an expected match.
4617 if (fields
[2].indexOf(UChar_y
) >=0) {
4620 if (expected
!= found
) {
4621 errln("line %d: Expected %smatch, got %smatch",
4622 lineNum
, expected
?"":"no ", found
?"":"no " );
4626 // Don't try to check expected results if there is no match.
4627 // (Some have stuff in the expected fields)
4635 // Interpret the Perl expression from the fourth field of the data file,
4636 // building up an ICU string from the results of the ICU match.
4637 // The Perl expression will contain references to the results of
4638 // a regex match, including the matched string, capture group strings,
4639 // group starting and ending indices, etc.
4641 UnicodeString resultString
;
4642 UnicodeString perlExpr
= fields
[3];
4644 while (perlExpr
.length() > 0) {
// perlExpr is shortened as each construct is consumed, so both helper
// matchers are re-pointed at it on every pass.
4645 groupsMat
->reset(perlExpr
);
4646 cgMat
->reset(perlExpr
);
4648 if (perlExpr
.startsWith("$&")) {
4649 resultString
.append(testMat
->group(status
));
4650 perlExpr
.remove(0, 2);
4653 else if (groupsMat
->lookingAt(status
)) {
4655 UnicodeString digitString
= groupsMat
->group(2, status
);
4657 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4658 UnicodeString plusOrMinus
= groupsMat
->group(1, status
);
4659 int32_t matchPosition
;
4660 if (plusOrMinus
.compare("+") == 0) {
4661 matchPosition
= testMat
->end(groupNum
, status
);
4663 matchPosition
= testMat
->start(groupNum
, status
);
4665 if (matchPosition
!= -1) {
4666 ICU_Utility::appendNumber(resultString
, matchPosition
);
4668 perlExpr
.remove(0, groupsMat
->end(status
));
4671 else if (cgMat
->lookingAt(status
)) {
4673 UnicodeString digitString
= cgMat
->group(1, status
);
4675 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4676 if (U_SUCCESS(status
)) {
4677 resultString
.append(testMat
->group(groupNum
, status
));
4678 status
= U_ZERO_ERROR
;
4680 perlExpr
.remove(0, cgMat
->end(status
));
4683 else if (perlExpr
.startsWith("@-")) {
4685 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4687 resultString
.append(" ");
4689 ICU_Utility::appendNumber(resultString
, testMat
->start(i
, status
));
4691 perlExpr
.remove(0, 2);
4694 else if (perlExpr
.startsWith("@+")) {
4696 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4698 resultString
.append(" ");
4700 ICU_Utility::appendNumber(resultString
, testMat
->end(i
, status
));
4702 perlExpr
.remove(0, 2);
4705 else if (perlExpr
.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4706 // or as an escaped sequence (e.g. \n)
4707 if (perlExpr
.length() > 1) {
4708 perlExpr
.remove(0, 1); // Remove the '\', but only if not last char.
4710 UChar c
= perlExpr
.charAt(0);
4712 case 'n': c
= '\n'; break;
4713 // add any other escape sequences that show up in the test expected results.
4715 resultString
.append(c
);
4716 perlExpr
.remove(0, 1);
4720 // Any characters from the perl expression that we don't explicitly
4721 // recognize before here are assumed to be literals and copied
4722 // as-is to the expected results.
4723 resultString
.append(perlExpr
.charAt(0));
4724 perlExpr
.remove(0, 1);
4727 if (U_FAILURE(status
)) {
4728 errln("Line %d: ICU Error \"%s\"", lineNum
, u_errorName(status
));
4734 // Expected Results Compare
4736 UnicodeString
expectedS(fields
[4]);
4737 expectedS
.findAndReplace(nulnulSrc
, nulnul
);
4738 expectedS
.findAndReplace(ffffSrc
, ffff
);
4739 expectedS
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4742 if (expectedS
.compare(resultString
) != 0) {
4743 err("Line %d: Incorrect perl expression results.", lineNum
);
4744 infoln((UnicodeString
)"Expected \""+expectedS
+(UnicodeString
)"\"; got \""+resultString
+(UnicodeString
)"\"");
4752 // All done. Clean up allocated stuff.
// Close the two stack UTexts and release the reusable UTF-8 buffers.
4769 utext_close(&patternText
);
4770 utext_close(&inputText
);
4772 delete [] patternChars
;
4773 delete [] inputChars
;
4776 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount
);
4781 //--------------------------------------------------------------
4783 // Bug6149 Verify limits to heap expansion for backtrack stack.
4784 // Use this pattern,
4785 // "(a?){1,8000000}"
4786 // Note: was an unbounded upper bound, but that now has loop-breaking enabled.
4787 // This test is likely to be fragile, as further optimizations stop
4788 // more cases of pointless looping in the match engine.
4790 //---------------------------------------------------------------
// Builds a matcher for "(a?){1,8000000}" over the input "xyz" and runs
// matches() inside REGEX_ASSERT_FAIL with U_REGEX_STACK_OVERFLOW as the
// expected error (presumably the macro checks that status ends up with that
// code -- its definition is outside this section). The boolean result of
// matches() is additionally required to be FALSE.
4791 void RegexTest::Bug6149() {
4792 UnicodeString
pattern("(a?){1,8000000}");
4793 UnicodeString
s("xyz");
4795 UErrorCode status
= U_ZERO_ERROR
;
4797 RegexMatcher
matcher(pattern
, s
, flags
, status
);
// result starts out false so the final assertion is meaningful even if the
// assignment inside the macro argument is never evaluated.
4798 UBool result
= false;
4799 REGEX_ASSERT_FAIL(result
=matcher
.matches(status
), U_REGEX_STACK_OVERFLOW
);
4800 REGEX_ASSERT(result
== FALSE
);
4805 // Callbacks() Test the callback function.
4806 // When set, callbacks occur periodically during matching operations,
4807 // giving the application code the ability to abort the operation
4808 // before its normal completion.
// State shared between Callbacks() and testCallBackFn(); a pointer to it is
// passed as the callback's context argument. The members used in this file
// are test, maxCalls, numCalls and lastSteps.
4811 struct callBackContext
{
// Store 'max' in maxCalls and zero both numCalls and lastSteps.
4816 void reset(int32_t max
) {maxCalls
=max
; numCalls
=0; lastSteps
=0;};
// Match callback installed by Callbacks().
//   context  - really a callBackContext*, cast back below.
//   steps    - step count supplied by the caller of the callback.
// Reports an error through the owning test when 'steps' is not exactly one
// more than the value recorded on the previous call, then records 'steps'.
// Returns whether numCalls is still below maxCalls; Callbacks() expects
// U_REGEX_STOPPED_BY_CALLER once the limit is reached, so a FALSE return
// presumably asks the matcher to stop.
4820 static UBool U_CALLCONV
4821 testCallBackFn(const void *context
, int32_t steps
) {
4822 callBackContext
*info
= (callBackContext
*)context
;
// Successive callbacks are expected to arrive with consecutive step values.
4823 if (info
->lastSteps
+1 != steps
) {
4824 info
->test
->errln("incorrect steps in callback. Expected %d, got %d\n", info
->lastSteps
+1, steps
);
4826 info
->lastSteps
= steps
;
4828 return (info
->numCalls
< info
->maxCalls
);
// Exercises the RegexMatcher match-callback API in two parts:
//   1. On a matcher with no callback set, getMatchCallback() must overwrite
//      both output variables with NULL.
//   2. With testCallBackFn installed, the getter must return the same
//      function and context; the test then asserts on cbInfo.numCalls after
//      a short match, a medium match, and longer matches()/find() runs that
//      are expected to end with U_REGEX_STOPPED_BY_CALLER.
4832 void RegexTest::Callbacks() {
4834 // Getter returns NULLs if no callback has been set
4836 // The variables that the getter will fill in.
4837 // Init to non-null values so that the action of the getter can be seen.
4838 const void *returnedContext
= &returnedContext
;
4839 URegexMatchCallback
*returnedFn
= &testCallBackFn
;
4841 UErrorCode status
= U_ZERO_ERROR
;
4842 RegexMatcher
matcher("x", 0, status
);
4844 matcher
.getMatchCallback(returnedFn
, returnedContext
, status
);
4846 REGEX_ASSERT(returnedFn
== NULL
);
4847 REGEX_ASSERT(returnedContext
== NULL
);
// Second part: set a callback and observe it through the getter and through
// the call counter kept in cbInfo.
4852 callBackContext cbInfo
= {this, 0, 0, 0};
4853 const void *returnedContext
;
4854 URegexMatchCallback
*returnedFn
;
4855 UErrorCode status
= U_ZERO_ERROR
;
4856 RegexMatcher
matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status
); // A pattern that can run long.
4858 matcher
.setMatchCallback(testCallBackFn
, &cbInfo
, status
);
4860 matcher
.getMatchCallback(returnedFn
, returnedContext
, status
);
4862 REGEX_ASSERT(returnedFn
== testCallBackFn
);
4863 REGEX_ASSERT(returnedContext
== &cbInfo
);
4865 // A short-running match shouldn't invoke the callback
4866 status
= U_ZERO_ERROR
;
4868 UnicodeString s
= "xxx";
4870 REGEX_ASSERT(matcher
.matches(status
));
4872 REGEX_ASSERT(cbInfo
.numCalls
== 0);
4874 // A medium-length match that runs long enough to invoke the
4875 // callback, but not so long that the callback aborts it.
4876 status
= U_ZERO_ERROR
;
4878 s
= "aaaaaaaaaaaaaaaaaaab";
4880 REGEX_ASSERT(matcher
.matches(status
)==FALSE
);
4882 REGEX_ASSERT(cbInfo
.numCalls
> 0);
4884 // A longer running match that the callback function will abort.
4885 status
= U_ZERO_ERROR
;
4887 s
= "aaaaaaaaaaaaaaaaaaaaaaab";
4889 REGEX_ASSERT(matcher
.matches(status
)==FALSE
);
4890 REGEX_ASSERT(status
== U_REGEX_STOPPED_BY_CALLER
);
4891 REGEX_ASSERT(cbInfo
.numCalls
== 4);
4893 // A longer running find that the callback function will abort.
4894 status
= U_ZERO_ERROR
;
4896 s
= "aaaaaaaaaaaaaaaaaaaaaaab";
4898 REGEX_ASSERT(matcher
.find(status
)==FALSE
);
4899 REGEX_ASSERT(status
== U_REGEX_STOPPED_BY_CALLER
);
4900 REGEX_ASSERT(cbInfo
.numCalls
== 4);
4908 // FindProgressCallbacks() Test the find "progress" callback function.
4909 // When set, the find progress callback will be invoked during find operations
4910 // after each return from a match attempt, giving the application the opportunity
4911 // to terminate a long-running find operation before its normal completion.
// State shared between FindProgressCallbacks() and testProgressCallBackFn();
// a pointer to it is passed as the callback's context argument. The members
// used in this file are maxCalls, numCalls and lastIndex.
4914 struct progressCallBackContext
{
// Store 'max' in maxCalls and zero both numCalls and lastIndex.
4919 void reset(int32_t max
) {maxCalls
=max
; numCalls
=0;lastIndex
=0;};
4922 // call-back function for find().
4923 // Return TRUE to continue the find().
4924 // Return FALSE to stop the find().
//   context     - really a progressCallBackContext*, cast back below.
//   matchIndex  - index supplied by the caller; remembered in lastIndex so
//                 that FindProgressCallbacks() can resume a find from it.
// The return value is whether numCalls is still below maxCalls.
4926 static UBool U_CALLCONV
4927 testProgressCallBackFn(const void *context
, int64_t matchIndex
) {
4928 progressCallBackContext
*info
= (progressCallBackContext
*)context
;
4930 info
->lastIndex
= matchIndex
;
4931 // info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4932 return (info
->numCalls
< info
->maxCalls
);
// Exercises the RegexMatcher find-progress callback API in two parts:
//   1. On a matcher with no callback set, getFindProgressCallback() must
//      overwrite both output variables with NULL.
//   2. With testProgressCallBackFn installed, the getter must return the same
//      function and context; the test then asserts on cbInfo.numCalls and on
//      status for finds that match immediately, run to completion, are
//      stopped by the callback, and are resumed from cbInfo.lastIndex.
4936 void RegexTest::FindProgressCallbacks() {
4938 // Getter returns NULLs if no callback has been set
4940 // The variables that the getter will fill in.
4941 // Init to non-null values so that the action of the getter can be seen.
4942 const void *returnedContext
= &returnedContext
;
4943 URegexFindProgressCallback
*returnedFn
= &testProgressCallBackFn
;
4945 UErrorCode status
= U_ZERO_ERROR
;
4946 RegexMatcher
matcher("x", 0, status
);
4948 matcher
.getFindProgressCallback(returnedFn
, returnedContext
, status
);
4950 REGEX_ASSERT(returnedFn
== NULL
);
4951 REGEX_ASSERT(returnedContext
== NULL
);
// Second part: set a callback and observe it through the getter and through
// the counters kept in cbInfo.
4956 progressCallBackContext cbInfo
= {this, 0, 0, 0};
4957 const void *returnedContext
;
4958 URegexFindProgressCallback
*returnedFn
;
4959 UErrorCode status
= U_ZERO_ERROR
;
4960 RegexMatcher
matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status
);
4962 matcher
.setFindProgressCallback(testProgressCallBackFn
, &cbInfo
, status
);
4964 matcher
.getFindProgressCallback(returnedFn
, returnedContext
, status
);
4966 REGEX_ASSERT(returnedFn
== testProgressCallBackFn
);
4967 REGEX_ASSERT(returnedContext
== &cbInfo
);
4969 // A find that matches on the initial position does NOT invoke the callback.
4970 status
= U_ZERO_ERROR
;
4972 UnicodeString s
= "aaxxx";
// NOTE(review): any preprocessor guard around this trace call is not visible
// in this section -- confirm whether tracing is meant to be active.
4975 matcher
.setTrace(TRUE
);
4977 REGEX_ASSERT(matcher
.find(0, status
));
4979 REGEX_ASSERT(cbInfo
.numCalls
== 0);
4981 // A medium running find() that causes matcher.find() to invoke our callback for each index,
4982 // but not so many times that we interrupt the operation.
4983 status
= U_ZERO_ERROR
;
4984 s
= "aaaaaaaaaaaaaaaaaaab";
4985 cbInfo
.reset(s
.length()); // Some upper limit for number of calls that is greater than size of our input string
4987 REGEX_ASSERT(matcher
.find(0, status
)==FALSE
);
4989 REGEX_ASSERT(cbInfo
.numCalls
> 0 && cbInfo
.numCalls
< 25);
4991 // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4992 status
= U_ZERO_ERROR
;
4993 UnicodeString s1
= "aaaaaaaaaaaaaaaaaaaaaaab";
4994 cbInfo
.reset(s1
.length() - 5); // Bail early somewhere near the end of input string
4996 REGEX_ASSERT(matcher
.find(0, status
)==FALSE
);
4997 REGEX_ASSERT(status
== U_REGEX_STOPPED_BY_CALLER
);
4998 REGEX_ASSERT(cbInfo
.numCalls
== s1
.length() - 5);
5000 // Now a match that will succeed, but after an interruption
5001 status
= U_ZERO_ERROR
;
5002 UnicodeString s2
= "aaaaaaaaaaaaaa aaaaaaaaab xxx";
5003 cbInfo
.reset(s2
.length() - 10); // Bail early somewhere near the end of input string
5005 REGEX_ASSERT(matcher
.find(0, status
)==FALSE
);
5006 REGEX_ASSERT(status
== U_REGEX_STOPPED_BY_CALLER
);
5007 // Now retry the match from where left off
// lastIndex was recorded by the callback on its most recent invocation.
5008 cbInfo
.maxCalls
= 100; // No callback limit
5009 status
= U_ZERO_ERROR
;
5010 REGEX_ASSERT(matcher
.find(cbInfo
.lastIndex
, status
));
5018 //---------------------------------------------------------------------------
5020 // PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
5021 // UTexts. The pure-C implementation of UText
5022 // has no mutable backing stores, but we can
5023 // use UnicodeString here to test the functionality.
5025 //---------------------------------------------------------------------------
// Exercises the C regex API entry points that accept a caller-supplied
// destination UText (uregex_getUText, uregex_groupUText,
// uregex_replaceFirstUText, uregex_replaceAllUText). bufferText, a UText
// opened over the UnicodeString 'buffer', is passed as the destination in
// every section, and each call is checked to hand back that same UText.
5026 void RegexTest::PreAllocatedUTextCAPI () {
5027 UErrorCode status
= U_ZERO_ERROR
;
5028 URegularExpression
*re
;
5029 UText patternText
= UTEXT_INITIALIZER
;
5030 UnicodeString buffer
;
5031 UText bufferText
= UTEXT_INITIALIZER
;
5033 utext_openUnicodeString(&bufferText
, &buffer
, &status
);
5036 * getText() and getUText()
5039 UText text1
= UTEXT_INITIALIZER
;
5040 UText text2
= UTEXT_INITIALIZER
;
5041 UChar text2Chars
[20];
5044 status
= U_ZERO_ERROR
;
5045 regextst_openUTF8FromInvariant(&text1
, "abcccd", -1, &status
);
// NOTE(review): text2 is opened here and opened again over text2Chars two
// statements later; the second open is the one the assertions below rely on.
5046 regextst_openUTF8FromInvariant(&text2
, "abcccxd", -1, &status
);
// Bound the copy by the capacity of the destination array text2Chars, not by
// the size of the unrelated UText object text2.
5047 u_uastrncpy(text2Chars
, "abcccxd", UPRV_LENGTHOF(text2Chars
));
5048 utext_openUChars(&text2
, text2Chars
, -1, &status
);
5050 regextst_openUTF8FromInvariant(&patternText
, "abc*d", -1, &status
);
5051 re
= uregex_openUText(&patternText
, 0, NULL
, &status
);
5053 /* First set a UText */
5054 uregex_setUText(re
, &text1
, &status
);
5055 resultText
= uregex_getUText(re
, &bufferText
, &status
);
5057 REGEX_ASSERT(resultText
== &bufferText
);
// Rewind both texts before comparing their contents.
5058 utext_setNativeIndex(resultText
, 0);
5059 utext_setNativeIndex(&text1
, 0);
5060 REGEX_ASSERT(testUTextEqual(resultText
, &text1
));
5062 resultText
= uregex_getUText(re
, &bufferText
, &status
);
5064 REGEX_ASSERT(resultText
== &bufferText
);
5065 utext_setNativeIndex(resultText
, 0);
5066 utext_setNativeIndex(&text1
, 0);
5067 REGEX_ASSERT(testUTextEqual(resultText
, &text1
));
5069 /* Then set a UChar * */
5070 uregex_setText(re
, text2Chars
, 7, &status
);
5071 resultText
= uregex_getUText(re
, &bufferText
, &status
);
5073 REGEX_ASSERT(resultText
== &bufferText
);
5074 utext_setNativeIndex(resultText
, 0);
5075 utext_setNativeIndex(&text2
, 0);
5076 REGEX_ASSERT(testUTextEqual(resultText
, &text2
));
5079 utext_close(&text1
);
5080 utext_close(&text2
);
5092 u_uastrncpy(text1
, "noise abc interior def, and this is off the end", UPRV_LENGTHOF(text1
));
5093 // 012345678901234567890123456789012345678901234567
5096 status
= U_ZERO_ERROR
;
5097 re
= uregex_openC("abc(.*?)def", 0, NULL
, &status
);
5100 uregex_setText(re
, text1
, -1, &status
);
5101 result
= uregex_find(re
, 0, &status
);
5102 REGEX_ASSERT(result
==TRUE
);
5104 /* Capture Group 0, the full match. Should succeed. "abc interior def" */
5105 status
= U_ZERO_ERROR
;
5106 actual
= uregex_groupUText(re
, 0, &bufferText
, &length
, &status
);
5108 REGEX_ASSERT(actual
== &bufferText
);
5109 REGEX_ASSERT(utext_getNativeIndex(actual
) == 6);
5110 REGEX_ASSERT(length
== 16);
5111 REGEX_ASSERT(utext_nativeLength(actual
) == 47);
5113 /* Capture group #1. Should succeed, matching " interior ". */
5114 status
= U_ZERO_ERROR
;
5115 actual
= uregex_groupUText(re
, 1, &bufferText
, &length
, &status
);
5117 REGEX_ASSERT(actual
== &bufferText
);
5118 REGEX_ASSERT(utext_getNativeIndex(actual
) == 9); // position of " interior "
5119 REGEX_ASSERT(length
== 10);
5120 REGEX_ASSERT(utext_nativeLength(actual
) == 47);
5122 /* Capture group out of range. Error. */
5123 status
= U_ZERO_ERROR
;
5124 actual
= uregex_groupUText(re
, 2, &bufferText
, &length
, &status
);
5125 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
5126 REGEX_ASSERT(actual
== &bufferText
);
5137 UText replText
= UTEXT_INITIALIZER
;
5139 status
= U_ZERO_ERROR
;
5140 utext_openUnicodeString(&bufferText
, &buffer
, &status
);
5142 status
= U_ZERO_ERROR
;
5143 u_uastrncpy(text1
, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1
));
// Bound by the full capacity of text2, in the same way as the text1 copy
// above (text2's declaration is outside this view; presumably a UChar array).
5144 u_uastrncpy(text2
, "No match here.", UPRV_LENGTHOF(text2
));
5145 regextst_openUTF8FromInvariant(&replText
, "<$1>", -1, &status
);
5147 re
= uregex_openC("x(.*?)x", 0, NULL
, &status
);
5150 /* Normal case, with match */
5151 uregex_setText(re
, text1
, -1, &status
);
// Empty the destination before each replace call.
5153 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5155 result
= uregex_replaceFirstUText(re
, &replText
, &bufferText
, &status
);
5157 REGEX_ASSERT(result
== &bufferText
);
5158 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result
);
5160 /* No match. Text should copy to output with no changes. */
5161 uregex_setText(re
, text2
, -1, &status
);
5162 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5163 result
= uregex_replaceFirstUText(re
, &replText
, &bufferText
, &status
);
5165 REGEX_ASSERT(result
== &bufferText
);
5166 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result
);
5168 /* Unicode escapes */
5169 uregex_setText(re
, text1
, -1, &status
);
5170 regextst_openUTF8FromInvariant(&replText
, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status
);
5171 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5172 result
= uregex_replaceFirstUText(re
, &replText
, &bufferText
, &status
);
5174 REGEX_ASSERT(result
== &bufferText
);
5175 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result
);
5178 utext_close(&replText
);
5188 UText replText
= UTEXT_INITIALIZER
;
5191 status
= U_ZERO_ERROR
;
// UPRV_LENGTHOF gives the element count directly, as in the other sections
// of this test.
5192 u_uastrncpy(text1
, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1
));
5193 u_uastrncpy(text2
, "No match here.", UPRV_LENGTHOF(text2
));
5194 regextst_openUTF8FromInvariant(&replText
, "<$1>", -1, &status
);
5196 re
= uregex_openC("x(.*?)x", 0, NULL
, &status
);
5199 /* Normal case, with match */
5200 uregex_setText(re
, text1
, -1, &status
);
5201 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5202 result
= uregex_replaceAllUText(re
, &replText
, &bufferText
, &status
);
5204 REGEX_ASSERT(result
== &bufferText
);
5205 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result
);
5207 /* No match. Text should copy to output with no changes. */
5208 uregex_setText(re
, text2
, -1, &status
);
5209 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5210 result
= uregex_replaceAllUText(re
, &replText
, &bufferText
, &status
);
5212 REGEX_ASSERT(result
== &bufferText
);
5213 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result
);
5216 utext_close(&replText
);
5221 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5222 * so we don't need to test it here.
5225 utext_close(&bufferText
);
5226 utext_close(&patternText
);
5230 //--------------------------------------------------------------
5232 // NamedCapture Check basic named capture group functionality
5234 //--------------------------------------------------------------
5235 void RegexTest::NamedCapture() {
5236 UErrorCode status
= U_ZERO_ERROR
;
5237 RegexPattern
*pat
= RegexPattern::compile(UnicodeString(
5238 "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status
);
5240 int32_t group
= pat
->groupNumberFromName("five", -1, status
);
5242 REGEX_ASSERT(5 == group
);
5243 group
= pat
->groupNumberFromName("three", -1, status
);
5245 REGEX_ASSERT(3 == group
);
5247 status
= U_ZERO_ERROR
;
5248 group
= pat
->groupNumberFromName(UnicodeString("six"), status
);
5250 REGEX_ASSERT(6 == group
);
5252 status
= U_ZERO_ERROR
;
5253 group
= pat
->groupNumberFromName(UnicodeString("nosuch"), status
);
5254 U_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5256 status
= U_ZERO_ERROR
;
5258 // After copying a pattern, named capture should still work in the copy.
5259 RegexPattern
*copiedPat
= new RegexPattern(*pat
);
5260 REGEX_ASSERT(*copiedPat
== *pat
);
5261 delete pat
; pat
= NULL
; // Delete original, copy should have no references back to it.
5263 group
= copiedPat
->groupNumberFromName("five", -1, status
);
5265 REGEX_ASSERT(5 == group
);
5266 group
= copiedPat
->groupNumberFromName("three", -1, status
);
5268 REGEX_ASSERT(3 == group
);
5271 // ReplaceAll with named capture group.
5272 status
= U_ZERO_ERROR
;
5273 UnicodeString
text("Substitution of <<quotes>> for <<double brackets>>");
5274 RegexMatcher
*m
= new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text
, 0, status
);
5276 // m.pattern().dumpPattern();
5277 UnicodeString replacedText
= m
->replaceAll("'${mid}'", status
);
5279 REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText
);
5282 // ReplaceAll, allowed capture group numbers.
5283 text
= UnicodeString("abcmxyz");
5284 m
= new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text
, 0, status
);
5287 status
= U_ZERO_ERROR
;
5288 replacedText
= m
->replaceAll(UnicodeString("<$0>"), status
); // group 0, full match, is allowed.
5290 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText
);
5292 status
= U_ZERO_ERROR
;
5293 replacedText
= m
->replaceAll(UnicodeString("<$1>"), status
); // group 1 by number.
5295 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText
);
5297 status
= U_ZERO_ERROR
;
5298 replacedText
= m
->replaceAll(UnicodeString("<${one}>"), status
); // group 1 by name.
5300 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText
);
5302 status
= U_ZERO_ERROR
;
5303 replacedText
= m
->replaceAll(UnicodeString("<$2>"), status
); // group 2.
5305 REGEX_ASSERT(UnicodeString("a<x>z") == replacedText
);
5307 status
= U_ZERO_ERROR
;
5308 replacedText
= m
->replaceAll(UnicodeString("<$3>"), status
);
5310 REGEX_ASSERT(UnicodeString("a<y>z") == replacedText
);
5312 status
= U_ZERO_ERROR
;
5313 replacedText
= m
->replaceAll(UnicodeString("<$4>"), status
);
5314 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
5316 status
= U_ZERO_ERROR
;
5317 replacedText
= m
->replaceAll(UnicodeString("<$04>"), status
); // group 0, leading 0,
5318 REGEX_CHECK_STATUS
; // trailing out-of-range 4 passes through.
5319 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText
);
5321 status
= U_ZERO_ERROR
;
5322 replacedText
= m
->replaceAll(UnicodeString("<$000016>"), status
); // Consume leading zeroes. Don't consume digits
5323 REGEX_CHECK_STATUS
; // that push group num out of range.
5324 REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText
); // This is group 1.
5326 status
= U_ZERO_ERROR
;
5327 replacedText
= m
->replaceAll(UnicodeString("<$3$2$1${one}>"), status
);
5329 REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText
);
5331 status
= U_ZERO_ERROR
;
5332 replacedText
= m
->replaceAll(UnicodeString("$3$2$1${one}"), status
);
5334 REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText
);
5336 status
= U_ZERO_ERROR
;
5337 replacedText
= m
->replaceAll(UnicodeString("<${noSuchName}>"), status
);
5338 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5340 status
= U_ZERO_ERROR
;
5341 replacedText
= m
->replaceAll(UnicodeString("<${invalid-name}>"), status
);
5342 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5344 status
= U_ZERO_ERROR
;
5345 replacedText
= m
->replaceAll(UnicodeString("<${one"), status
);
5346 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5348 status
= U_ZERO_ERROR
;
5349 replacedText
= m
->replaceAll(UnicodeString("$not a capture group"), status
);
5350 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5354 // Repeat the above replaceAll() tests using the plain C API, which
5355 // has a separate implementation internally.
5356 // TODO: factor out the test data.
5358 status
= U_ZERO_ERROR
;
5359 URegularExpression
*re
= uregex_openC("..(?<one>m)(.)(.)", 0, NULL
, &status
);
5361 text
= UnicodeString("abcmxyz");
5362 uregex_setText(re
, text
.getBuffer(), text
.length(), &status
);
5365 UChar resultBuf
[100];
5366 int32_t resultLength
;
5369 status
= U_ZERO_ERROR
;
5370 repl
= UnicodeString("<$0>");
5371 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5373 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf
, resultLength
));
5375 status
= U_ZERO_ERROR
;
5376 repl
= UnicodeString("<$1>");
5377 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5379 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf
, resultLength
));
5381 status
= U_ZERO_ERROR
;
5382 repl
= UnicodeString("<${one}>");
5383 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5385 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf
, resultLength
));
5387 status
= U_ZERO_ERROR
;
5388 repl
= UnicodeString("<$2>");
5389 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5391 REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf
, resultLength
));
5393 status
= U_ZERO_ERROR
;
5394 repl
= UnicodeString("<$3>");
5395 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5397 REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf
, resultLength
));
5399 status
= U_ZERO_ERROR
;
5400 repl
= UnicodeString("<$4>");
5401 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5402 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
5404 status
= U_ZERO_ERROR
;
5405 repl
= UnicodeString("<$04>");
5406 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5408 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf
, resultLength
));
5410 status
= U_ZERO_ERROR
;
5411 repl
= UnicodeString("<$000016>");
5412 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5414 REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf
, resultLength
));
5416 status
= U_ZERO_ERROR
;
5417 repl
= UnicodeString("<$3$2$1${one}>");
5418 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5420 REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf
, resultLength
));
5422 status
= U_ZERO_ERROR
;
5423 repl
= UnicodeString("$3$2$1${one}");
5424 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5426 REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf
, resultLength
));
5428 status
= U_ZERO_ERROR
;
5429 repl
= UnicodeString("<${noSuchName}>");
5430 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5431 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5433 status
= U_ZERO_ERROR
;
5434 repl
= UnicodeString("<${invalid-name}>");
5435 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5436 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5438 status
= U_ZERO_ERROR
;
5439 repl
= UnicodeString("<${one");
5440 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5441 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5443 status
= U_ZERO_ERROR
;
5444 repl
= UnicodeString("$not a capture group");
5445 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5446 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5451 //--------------------------------------------------------------
5453 // NamedCaptureLimits Patterns with huge numbers of named capture groups.
5454 // The point is not so much what the exact limit is,
5455 // but that a largish number doesn't hit bad non-linear performance,
5456 // and that exceeding the limit fails cleanly.
5458 //--------------------------------------------------------------
5459 void RegexTest::NamedCaptureLimits() {
5461 logln("Skipping test. Runs in exhuastive mode only.");
5464 const int32_t goodLimit
= 1000000; // Pattern w this many groups builds successfully.
5465 const int32_t failLimit
= 10000000; // Pattern exceeds internal limits, fails to compile.
5467 UnicodeString pattern
;
5470 for (nn
=1; nn
<goodLimit
; nn
++) {
5471 sprintf(nnbuf
, "(?<nn%d>)", nn
);
5472 pattern
.append(UnicodeString(nnbuf
, -1, US_INV
));
5474 UErrorCode status
= U_ZERO_ERROR
;
5475 RegexPattern
*pat
= RegexPattern::compile(pattern
, 0, status
);
5477 for (nn
=1; nn
<goodLimit
; nn
++) {
5478 sprintf(nnbuf
, "nn%d", nn
);
5479 int32_t groupNum
= pat
->groupNumberFromName(nnbuf
, -1, status
);
5480 REGEX_ASSERT(nn
== groupNum
);
5481 if (nn
!= groupNum
) {
5488 for (nn
=1; nn
<failLimit
; nn
++) {
5489 sprintf(nnbuf
, "(?<nn%d>)", nn
);
5490 pattern
.append(UnicodeString(nnbuf
, -1, US_INV
));
5492 status
= U_ZERO_ERROR
;
5493 pat
= RegexPattern::compile(pattern
, 0, status
);
5494 REGEX_ASSERT(status
== U_REGEX_PATTERN_TOO_BIG
);
5499 //--------------------------------------------------------------
5501 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
5503 //---------------------------------------------------------------
5504 void RegexTest::Bug7651() {
5505 UnicodeString
pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5506 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5507 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5508 UnicodeString
pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5509 UnicodeString
s("#ff @abcd This is test");
5510 RegexPattern
*REPattern
= NULL
;
5511 RegexMatcher
*REMatcher
= NULL
;
5512 UErrorCode status
= U_ZERO_ERROR
;
5515 REPattern
= RegexPattern::compile(pattern1
, 0, pe
, status
);
5517 REMatcher
= REPattern
->matcher(s
, status
);
5519 REGEX_ASSERT(REMatcher
->find());
5520 REGEX_ASSERT(REMatcher
->start(status
) == 0);
5523 status
= U_ZERO_ERROR
;
5525 REPattern
= RegexPattern::compile(pattern2
, 0, pe
, status
);
5527 REMatcher
= REPattern
->matcher(s
, status
);
5529 REGEX_ASSERT(REMatcher
->find());
5530 REGEX_ASSERT(REMatcher
->start(status
) == 0);
5533 status
= U_ZERO_ERROR
;
5536 void RegexTest::Bug7740() {
5537 UErrorCode status
= U_ZERO_ERROR
;
5538 UnicodeString pattern
= "(a)";
5539 UnicodeString text
= "abcdef";
5540 RegexMatcher
*m
= new RegexMatcher(pattern
, text
, 0, status
);
5542 REGEX_ASSERT(m
->lookingAt(status
));
5544 status
= U_ILLEGAL_ARGUMENT_ERROR
;
5545 UnicodeString s
= m
->group(1, status
); // Bug 7740: segfault here.
5546 REGEX_ASSERT(status
== U_ILLEGAL_ARGUMENT_ERROR
);
5547 REGEX_ASSERT(s
== "");
5551 // Bug 8479: was crashing with a Bogus UnicodeString as input.
5553 void RegexTest::Bug8479() {
5554 UErrorCode status
= U_ZERO_ERROR
;
5556 RegexMatcher
* const pMatcher
= new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL
|UREGEX_CASE_INSENSITIVE
, status
);
5558 if (U_SUCCESS(status
))
5562 pMatcher
->reset(str
);
5563 status
= U_ZERO_ERROR
;
5564 pMatcher
->matches(status
);
5565 REGEX_ASSERT(status
== U_ILLEGAL_ARGUMENT_ERROR
);
5572 void RegexTest::Bug7029() {
5573 UErrorCode status
= U_ZERO_ERROR
;
5575 RegexMatcher
* const pMatcher
= new RegexMatcher(".", 0, status
);
5576 UnicodeString text
= "abc.def";
5577 UnicodeString splits
[10];
5579 int32_t numFields
= pMatcher
->split(text
, splits
, 10, status
);
5581 REGEX_ASSERT(numFields
== 8);
5586 // This test is checking for the existence of any supplemental characters that case-fold
5587 // to a bmp character.
5589 // At the time of this writing there are none. If any should appear in a subsequent release
5590 // of Unicode, the code in regular expressions compilation that determines the longest
5591 // possible match for a literal string will need to be enhanced.
5593 // See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5594 // for details on what to do in case of a failure of this test.
5596 void RegexTest::Bug9283() {
5597 #if !UCONFIG_NO_NORMALIZATION
5598 UErrorCode status
= U_ZERO_ERROR
;
5599 UnicodeSet
supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status
);
5603 for (index
=0; ; index
++) {
5604 c
= supplementalsWithCaseFolding
.charAt(index
);
5608 UnicodeString cf
= UnicodeString(c
).foldCase();
5609 REGEX_ASSERT(cf
.length() >= 2);
5611 #endif /* #if !UCONFIG_NO_NORMALIZATION */
5615 void RegexTest::CheckInvBufSize() {
5616 if(inv_next
>=INV_BUFSIZ
) {
5617 errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5618 __FILE__
, INV_BUFSIZ
, inv_next
);
5620 logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__
, INV_BUFSIZ
, inv_next
);
5625 void RegexTest::Bug10459() {
5626 UErrorCode status
= U_ZERO_ERROR
;
5627 UnicodeString
patternString("(txt)");
5628 UnicodeString
txtString("txt");
5630 UText
*utext_pat
= utext_openUnicodeString(NULL
, &patternString
, &status
);
5632 UText
*utext_txt
= utext_openUnicodeString(NULL
, &txtString
, &status
);
5635 URegularExpression
*icu_re
= uregex_openUText(utext_pat
, 0, NULL
, &status
);
5638 uregex_setUText(icu_re
, utext_txt
, &status
);
5641 // The bug was that calling uregex_group() before doing a matching operation
5642 // was causing a segfault. Only for Regular Expressions created from UText.
5643 // It should set an U_REGEX_INVALID_STATE.
5646 int32_t len
= uregex_group(icu_re
, 0, buf
, UPRV_LENGTHOF(buf
), &status
);
5647 REGEX_ASSERT(status
== U_REGEX_INVALID_STATE
);
5648 REGEX_ASSERT(len
== 0);
5650 uregex_close(icu_re
);
5651 utext_close(utext_pat
);
5652 utext_close(utext_txt
);
5655 void RegexTest::TestCaseInsensitiveStarters() {
5656 // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5657 // become stale because of new Unicode characters.
5658 // If it is stale, rerun the generation tool
5659 // svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
5660 // and replace the embedded data in i18n/regexcmp.cpp
5662 for (UChar32 cp
=0; cp
<=0x10ffff; cp
++) {
5663 if (!u_hasBinaryProperty(cp
, UCHAR_CASE_SENSITIVE
)) {
5666 UnicodeSet
s(cp
, cp
);
5667 s
.closeOver(USET_CASE_INSENSITIVE
);
5668 UnicodeSetIterator
setIter(s
);
5669 while (setIter
.next()) {
5670 if (!setIter
.isString()) {
5673 const UnicodeString
&str
= setIter
.getString();
5674 UChar32 firstChar
= str
.char32At(0);
5675 UnicodeSet starters
;
5676 RegexCompile::findCaseInsensitiveStarters(firstChar
, &starters
);
5677 if (!starters
.contains(cp
)) {
5678 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp
, firstChar
);
5686 void RegexTest::TestBug11049() {
5687 // Original bug report: pattern with match start consisting of one of several individual characters,
5688 // and the text being matched ending with a supplementary character. find() would read past the
5689 // end of the input text when searching for potential match starting points.
5691 // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5692 // detect the bad read.
5694 TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE
, __LINE__
);
5695 TestCase11049("A|B|C", "string matches at end C", TRUE
, __LINE__
);
5697 // Test again with a pattern starting with a single character,
5698 // which takes a different code path than starting with an OR expression,
5699 // but with similar logic.
5700 TestCase11049("C", "a string \\ud800\\udc00", FALSE
, __LINE__
);
5701 TestCase11049("C", "string matches at end C", TRUE
, __LINE__
);
5704 // Run a single test case from TestBug11049(). Internal function.
5705 void RegexTest::TestCase11049(const char *pattern
, const char *data
, UBool expectMatch
, int32_t lineNumber
) {
5706 UErrorCode status
= U_ZERO_ERROR
;
5707 UnicodeString patternString
= UnicodeString(pattern
).unescape();
5708 LocalPointer
<RegexPattern
> compiledPat(RegexPattern::compile(patternString
, 0, status
));
5710 UnicodeString dataString
= UnicodeString(data
).unescape();
5711 UChar
*exactBuffer
= new UChar
[dataString
.length()];
5712 dataString
.extract(exactBuffer
, dataString
.length(), status
);
5713 UText
*ut
= utext_openUChars(NULL
, exactBuffer
, dataString
.length(), &status
);
5715 LocalPointer
<RegexMatcher
> matcher(compiledPat
->matcher(status
));
5718 UBool result
= matcher
->find();
5719 if (result
!= expectMatch
) {
5720 errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5721 __FILE__
, lineNumber
, expectMatch
, result
, pattern
, data
);
5724 // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5725 // off-by-one on find() with match at the last code point.
5726 // Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
5727 // because string.unescape() will only shrink it.
5728 char * utf8Buffer
= new char[uprv_strlen(data
)+1];
5729 u_strToUTF8(utf8Buffer
, uprv_strlen(data
)+1, NULL
, dataString
.getBuffer(), dataString
.length(), &status
);
5731 ut
= utext_openUTF8(ut
, utf8Buffer
, -1, &status
);
5734 result
= matcher
->find();
5735 if (result
!= expectMatch
) {
5736 errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5737 __FILE__
, lineNumber
, expectMatch
, result
, pattern
, data
);
5739 delete [] utf8Buffer
;
5742 delete [] exactBuffer
;
5746 void RegexTest::TestBug11371() {
5748 logln("Skipping test. Runs in exhuastive mode only.");
5751 UErrorCode status
= U_ZERO_ERROR
;
5752 UnicodeString patternString
;
5754 for (int i
=0; i
<8000000; i
++) {
5755 patternString
.append(UnicodeString("()"));
5757 LocalPointer
<RegexPattern
> compiledPat(RegexPattern::compile(patternString
, 0, status
));
5758 if (status
!= U_REGEX_PATTERN_TOO_BIG
) {
5759 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5760 __FILE__
, __LINE__
, u_errorName(status
));
5763 status
= U_ZERO_ERROR
;
5764 patternString
= "(";
5765 for (int i
=0; i
<20000000; i
++) {
5766 patternString
.append(UnicodeString("A++"));
5768 patternString
.append(UnicodeString("){0}B++"));
5769 LocalPointer
<RegexPattern
> compiledPat2(RegexPattern::compile(patternString
, 0, status
));
5770 if (status
!= U_REGEX_PATTERN_TOO_BIG
) {
5771 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5772 __FILE__
, __LINE__
, u_errorName(status
));
5775 // Pattern with too much string data, such that string indexes overflow operand data field size
5776 // in compiled instruction.
5777 status
= U_ZERO_ERROR
;
5779 while (patternString
.length() < 0x00ffffff) {
5780 patternString
.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5782 patternString
.append(UnicodeString("X? trailing string"));
5783 LocalPointer
<RegexPattern
> compiledPat3(RegexPattern::compile(patternString
, 0, status
));
5784 if (status
!= U_REGEX_PATTERN_TOO_BIG
) {
5785 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5786 __FILE__
, __LINE__
, u_errorName(status
));
5790 void RegexTest::TestBug11480() {
5791 // C API, get capture group of a group that does not participate in the match.
5792 // (Returns a zero length string, with nul termination,
5793 // indistinguishable from a group with a zero length match.)
5795 UErrorCode status
= U_ZERO_ERROR
;
5796 URegularExpression
*re
= uregex_openC("(A)|(B)", 0, NULL
, &status
);
5798 UnicodeString text
= UNICODE_STRING_SIMPLE("A");
5799 uregex_setText(re
, text
.getBuffer(), text
.length(), &status
);
5801 REGEX_ASSERT(uregex_lookingAt(re
, 0, &status
));
5802 UChar buf
[10] = {(UChar
)13, (UChar
)13, (UChar
)13, (UChar
)13};
5803 int32_t length
= uregex_group(re
, 2, buf
+1, UPRV_LENGTHOF(buf
)-1, &status
);
5804 REGEX_ASSERT(length
== 0);
5805 REGEX_ASSERT(buf
[0] == 13);
5806 REGEX_ASSERT(buf
[1] == 0);
5807 REGEX_ASSERT(buf
[2] == 13);
5810 // UText C++ API, length of match is 0 for non-participating matches.
5811 UText ut
= UTEXT_INITIALIZER
;
5812 utext_openUnicodeString(&ut
, &text
, &status
);
5813 RegexMatcher
matcher(UnicodeString("(A)|(B)"), 0, status
);
5816 REGEX_ASSERT(matcher
.lookingAt(0, status
));
5818 // UText C++ API, Capture group 1 matches "A", position 0, length 1.
5819 int64_t groupLen
= -666;
5820 UText group
= UTEXT_INITIALIZER
;
5821 matcher
.group(1, &group
, groupLen
, status
);
5823 REGEX_ASSERT(groupLen
== 1);
5824 REGEX_ASSERT(utext_getNativeIndex(&group
) == 0);
5826 // Capture group 2, the (B), does not participate in the match.
5827 matcher
.group(2, &group
, groupLen
, status
);
5829 REGEX_ASSERT(groupLen
== 0);
5830 REGEX_ASSERT(matcher
.start(2, status
) == -1);
5835 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */