1 /********************************************************************
3 * Copyright (c) 2002-2014, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
10 // ICU Regular Expressions test, part of intltest.
16 PLEASE be careful about ASCII assumptions in this test.
17 This test is one of the worst repeat offenders.
18 If you have questions, contact someone on the ICU PMC
19 who has access to an EBCDIC system.
24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
26 #include "unicode/regex.h"
27 #include "unicode/uchar.h"
28 #include "unicode/ucnv.h"
29 #include "unicode/uniset.h"
30 #include "unicode/uregex.h"
31 #include "unicode/ustring.h"
41 #define SUPPORT_MUTATING_INPUT_STRING 0
43 //---------------------------------------------------------------------------
45 // Test class boilerplate
47 //---------------------------------------------------------------------------
48 RegexTest::RegexTest()
53 RegexTest::~RegexTest()
// Test dispatcher for the RegexTest suite: assigns the test name for the given
// index and, when exec is true, runs the corresponding test method.
// The last parameter is unnamed (commented out as /*par*/) and is not used.
// NOTE(review): this excerpt omits lines of the function (the switch header, the
// break statements, the case for index 4 and any default branch are not visible),
// so the comments here describe only what is shown.
59 void RegexTest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* /*par*/ )
// Suite banner, logged only when tests are actually being executed.
61 if (exec
) logln("TestSuite RegexTest: ");
// NOTE(review): several cases below ("Basic", "Errors", the "Bug NNNN" entries)
// show only the name assignment; their exec calls are not visible here.
64 case 0: name
= "Basic";
67 case 1: name
= "API_Match";
68 if (exec
) API_Match();
70 case 2: name
= "API_Replace";
71 if (exec
) API_Replace();
73 case 3: name
= "API_Pattern";
74 if (exec
) API_Pattern();
// NOTE(review): the code guarded by this #if (presumably the index 4 case) is
// not visible in this excerpt.
77 #if !UCONFIG_NO_FILE_IO
84 case 5: name
= "Errors";
87 case 6: name
= "PerlTests";
88 if (exec
) PerlTests();
90 case 7: name
= "Callbacks";
91 if (exec
) Callbacks();
93 case 8: name
= "FindProgressCallbacks";
94 if (exec
) FindProgressCallbacks();
96 case 9: name
= "Bug 6149";
99 case 10: name
= "UTextBasic";
100 if (exec
) UTextBasic();
102 case 11: name
= "API_Match_UTF8";
103 if (exec
) API_Match_UTF8();
105 case 12: name
= "API_Replace_UTF8";
106 if (exec
) API_Replace_UTF8();
108 case 13: name
= "API_Pattern_UTF8";
109 if (exec
) API_Pattern_UTF8();
111 case 14: name
= "PerlTestsUTF8";
112 if (exec
) PerlTestsUTF8();
114 case 15: name
= "PreAllocatedUTextCAPI";
115 if (exec
) PreAllocatedUTextCAPI();
117 case 16: name
= "Bug 7651";
120 case 17: name
= "Bug 7740";
123 case 18: name
= "Bug 8479";
126 case 19: name
= "Bug 7029";
129 case 20: name
= "CheckInvBufSize";
130 if (exec
) CheckInvBufSize();
132 case 21: name
= "Bug 9283";
135 case 22: name
= "Bug10459";
136 if (exec
) Bug10459();
140 break; //needed to end loop
147 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
149 * @see utext_openUTF8
// Forward declaration so the assertion helpers below can call this function
// before its definition later in the file.
// Parameters: the UText to open, the invariant-character source string, its
// length (the definition treats -1 as "use strlen"), and the error-code
// out-parameter.
151 static UText
* regextst_openUTF8FromInvariant(UText
* ut
, const char *inv
, int64_t length
, UErrorCode
*status
);
153 //---------------------------------------------------------------------------
155 // Error Checking / Reporting macros used in all of the tests.
157 //---------------------------------------------------------------------------
// Renders the contents of a UText into a caller-supplied char buffer so that it
// can be shown in diagnostic messages.
//   buf     destination buffer
//   bufLen  capacity of buf, in chars
//   text    the UText to render; its native index is saved on entry and
//           restored before returning
// NOTE(review): several lines of this function are missing from this excerpt
// (the declaration of bufPtr, the per-character stores, the terminator write and
// the closing #endif), so only the visible statements are described.
159 static void utextToPrintable(char *buf
, int32_t bufLen
, UText
*text
) {
// Remember the caller's iteration position.
160 int64_t oldIndex
= utext_getNativeIndex(text
);
161 utext_setNativeIndex(text
, 0);
163 UChar32 c
= utext_next32From(text
, 0);
// Walk the code points until the text is exhausted or the write pointer reaches
// the end of the buffer.
164 while ((c
!= U_SENTINEL
) && (bufPtr
< buf
+bufLen
)) {
// Code points in [0x20, 0x7e); note that 0x7e itself is excluded by the test.
165 if (0x000020<=c
&& c
<0x00007e) {
// NOTE(review): this sprintf writes at bufPtr with no visible check against
// buf+bufLen. Whether this path is compiled in cannot be told from this excerpt;
// confirm before relying on it.
169 sprintf(bufPtr
,"U+%04X", c
);
170 bufPtr
+= strlen(bufPtr
)-1;
176 c
= UTEXT_NEXT32(text
);
// EBCDIC platforms only: pass the buffer through uprv_eastrncpy into a
// temporary and copy it back (presumably an ASCII-to-EBCDIC conversion so the
// text prints correctly; confirm against uinvchar.h).
// NOTE(review): the free() matching this malloc is not visible in this excerpt.
179 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
180 char *ebuf
= (char*)malloc(bufLen
);
181 uprv_eastrncpy((unsigned char*)ebuf
, (const unsigned char*)buf
, bufLen
);
182 uprv_strncpy(buf
, ebuf
, bufLen
);
// Put the caller's iteration position back.
185 utext_setNativeIndex(text
, oldIndex
);
// File-static scratch buffer that extractToAssertBuf() fills; every call
// overwrites whatever an earlier call left in it.
189 static char ASSERT_BUF
[1024];
// Produces a printable C string for a UnicodeString, for use in assertion
// failure messages. The text is written into ASSERT_BUF.
// NOTE(review): the declarations of buf and ch and the return statement are not
// visible in this excerpt.
191 const char* RegexTest::extractToAssertBuf(const UnicodeString
& message
) {
// Empty input gets a fixed placeholder instead of an empty string.
192 if(message
.length()==0) {
193 strcpy(ASSERT_BUF
, "[[empty UnicodeString]]");
// Otherwise run the message through IntlTest::prettify into buf.
196 IntlTest::prettify(message
,buf
);
197 if(buf
.length()==0) {
198 strcpy(ASSERT_BUF
, "[[escape() returned 0 chars]]");
// Extract into ASSERT_BUF, leaving the final byte of the buffer untouched.
200 buf
.extract(0, 0x7FFFFFFF, ASSERT_BUF
, sizeof(ASSERT_BUF
)-1);
// Fallback when the extraction produced an empty C string: append an escape
// sequence for each element of buf.
201 if(ASSERT_BUF
[0]==0) {
203 for(int32_t i
=0;i
<buf
.length();i
++) {
// NOTE(review): each sprintf appends at the current end of ASSERT_BUF with no
// visible check against sizeof(ASSERT_BUF); a long buf could overrun it.
205 sprintf(ASSERT_BUF
+strlen(ASSERT_BUF
),"\\u%02x",ch
);
// Force NUL termination in the last byte of the buffer.
210 ASSERT_BUF
[sizeof(ASSERT_BUF
)-1] = 0;
// Number of elements in a true array, as int32_t.
214 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
// Logs the printable form of a UText (via utextToPrintable, 200-char buffer)
// together with the file, line and the expression text of the argument.
216 #define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}
// If the in-scope variable `status` holds a failure, reports it with
// dataerrln and returns from the enclosing (void) function.
218 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure. status=%s", \
219 __FILE__, __LINE__, u_errorName(status)); return;}}
// Reports an error when expr compares equal to FALSE; does not return, so the
// test continues after a failed assertion.
221 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}
// Evaluates expr with a fresh local `status` (declared inside the macro's own
// block, so it hides any `status` in the caller's scope) and reports an error
// if that local status does not end up equal to errcode.
223 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
224 if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \
225 __LINE__, u_errorName(errcode), u_errorName(status));};}
// Variant of REGEX_CHECK_STATUS that also reports a caller-supplied line number.
// Unlike REGEX_CHECK_STATUS it prints the numeric status and does not return.
227 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
228 "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }}
// Variant of REGEX_ASSERT that also reports a caller-supplied line number.
// Unlike REGEX_ASSERT it returns from the enclosing function on failure.
230 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
231 errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
// Reports an error when ustr==inv is false; the message shows ustr rendered
// through extractToAssertBuf() and inv as passed.
233 #define REGEX_ASSERT_UNISTR(ustr,inv) {if (!(ustr==inv)) {errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s,%s) failed \n", __FILE__, __LINE__, extractToAssertBuf(ustr),inv);};}
// Compares the contents of two UTexts by iterating both from native index 0,
// one code point at a time, until the first text yields U_SENTINEL.
// Both texts are left repositioned; the visible code does not restore their
// original indexes.
// NOTE(review): the declarations of ca/cb, the per-step comparison and the
// return statements are not visible in this excerpt.
236 static UBool
testUTextEqual(UText
*uta
, UText
*utb
) {
// Rewind both inputs so the comparison always covers the whole text.
239 utext_setNativeIndex(uta
, 0);
240 utext_setNativeIndex(utb
, 0);
242 ca
= utext_next32(uta
);
243 cb
= utext_next32(utb
);
247 } while (ca
!= U_SENTINEL
);
253 * @param expected expected text in UTF-8 (not platform) codepage
// Checks that the UText `actual` holds the same text as the UTF-8 C string
// `expected`, and reports an error tagged with file:line when it does not.
//   expected  NUL-terminated UTF-8 text
//   actual    the UText under test; its native index is reset to 0
//   file/line source location of the calling assertion, used in messages
// NOTE(review): closing braces and any early returns after the error reports
// are not visible in this excerpt.
255 void RegexTest::assertUText(const char *expected
, UText
*actual
, const char *file
, int line
) {
256 UErrorCode status
= U_ZERO_ERROR
;
// Wrap the expected string in a stack-initialized UText for comparison.
257 UText expectedText
= UTEXT_INITIALIZER
;
258 utext_openUTF8(&expectedText
, expected
, -1, &status
);
// NOTE(review): the format uses %d for strlen(expected), which is a size_t;
// the argument type does not match the conversion specifier.
259 if(U_FAILURE(status
)) {
260 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file
, line
, u_errorName(status
), strlen(expected
));
// Sanity check: a non-empty expected string must not open as a zero-length
// UText. NOTE(review): same %d / size_t mismatch as above.
263 if(utext_nativeLength(&expectedText
)==0 && (strlen(expected
)!=0)) {
264 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file
, line
, strlen(expected
));
267 utext_setNativeIndex(actual
, 0);
// On mismatch, render both texts into local buffers and report them along
// with their native lengths.
268 if (!testUTextEqual(&expectedText
, actual
)) {
269 char buf
[201 /*21*/];
270 char expectedBuf
[201];
271 utextToPrintable(buf
, sizeof(buf
)/sizeof(buf
[0]), actual
);
272 utextToPrintable(expectedBuf
, sizeof(expectedBuf
)/sizeof(expectedBuf
[0]), &expectedText
);
273 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file
, line
, expectedBuf
, (int)utext_nativeLength(&expectedText
), buf
, (int)utext_nativeLength(actual
));
// Release the UText opened over `expected`.
275 utext_close(&expectedText
);
278 * @param expected invariant (platform local text) input
// Same check as assertUText, except that `expected` is given in the platform's
// invariant characters and is opened through regextst_openUTF8FromInvariant()
// rather than directly with utext_openUTF8().
//   expected  NUL-terminated invariant-character text
//   actual    the UText under test; its native index is reset to 0
//   file/line source location of the calling assertion, used in messages
// NOTE(review): closing braces and any early return after the first error
// report are not visible in this excerpt.
281 void RegexTest::assertUTextInvariant(const char *expected
, UText
*actual
, const char *file
, int line
) {
282 UErrorCode status
= U_ZERO_ERROR
;
283 UText expectedText
= UTEXT_INITIALIZER
;
284 regextst_openUTF8FromInvariant(&expectedText
, expected
, -1, &status
);
// NOTE(review): the format uses %d for strlen(expected), which is a size_t;
// the argument type does not match the conversion specifier.
285 if(U_FAILURE(status
)) {
286 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file
, line
, u_errorName(status
), strlen(expected
));
289 utext_setNativeIndex(actual
, 0);
// On mismatch, render both texts into local buffers and report them along
// with their native lengths.
290 if (!testUTextEqual(&expectedText
, actual
)) {
291 char buf
[201 /*21*/];
292 char expectedBuf
[201];
293 utextToPrintable(buf
, sizeof(buf
)/sizeof(buf
[0]), actual
);
294 utextToPrintable(expectedBuf
, sizeof(expectedBuf
)/sizeof(expectedBuf
[0]), &expectedText
);
295 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file
, line
, expectedBuf
, (int)utext_nativeLength(&expectedText
), buf
, (int)utext_nativeLength(actual
));
// Release the UText opened over `expected`.
297 utext_close(&expectedText
);
301 * Assumes utf-8 input
// Convenience wrapper: calls assertUText() with the caller's file and line.
303 #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
305 * Assumes Invariant input
// Convenience wrapper: calls assertUTextInvariant() with the caller's file and line.
307 #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
310 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
311 * passed into utext_openUTF8. An error will be given if
312 * INV_BUFSIZ is too small. It's only used on EBCDIC systems.
// Capacity, in bytes, of the static conversion buffer inv_buf.
315 #define INV_BUFSIZ 2048 /* increase this if too small */
// Offset within inv_buf at which regextst_openUTF8FromInvariant() places its
// next converted string.
// NOTE(review): the statement that advances this offset is not visible in this
// excerpt.
317 static int64_t inv_next
=0;
// The buffer itself exists only on non-ASCII platforms.
// NOTE(review): the matching #endif is not visible in this excerpt.
319 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
320 static char inv_buf
[INV_BUFSIZ
];
// Opens `ut` as a UTF-8 UText over a string written in invariant characters.
//   ut      the UText to open
//   inv     source text in the compilation codepage
//   length  number of bytes in inv, or -1 to use strlen(inv)
//   status  ICU error code, passed through to utext_openUTF8()
// On ASCII platforms the string is handed to utext_openUTF8() unchanged. On
// other platforms it is first copied through uprv_aestrncpy() into the static
// buffer inv_buf, and the UText is opened over that copy.
// NOTE(review): the #else/#endif lines, the early return after the size error
// and the update of inv_next are not visible in this excerpt.
323 static UText
* regextst_openUTF8FromInvariant(UText
*ut
, const char *inv
, int64_t length
, UErrorCode
*status
) {
324 if(length
==-1) length
=strlen(inv
);
325 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
327 return utext_openUTF8(ut
, inv
, length
, status
);
// Non-ASCII path: make sure length+1 bytes still fit after inv_next.
// NOTE(review): the last two fprintf arguments are paired with %d although
// inv_next+length+1 has type int64_t; the specifier does not match.
329 if(inv_next
+length
+1>INV_BUFSIZ
) {
330 fprintf(stderr
, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
331 __FILE__
, __LINE__
, INV_BUFSIZ
, (inv_next
+length
+1));
332 *status
= U_MEMORY_ALLOCATION_ERROR
;
// Convert into the next free region of inv_buf (uprv_aestrncpy is presumably
// the EBCDIC-to-ASCII copy; confirm against uinvchar.h).
336 unsigned char *buf
= (unsigned char*)inv_buf
+inv_next
;
337 uprv_aestrncpy(buf
, (const uint8_t*)inv
, length
);
// NOTE(review): %d is paired with inv_next, which is an int64_t.
341 fprintf(stderr
, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ
, inv_next
);
344 return utext_openUTF8(ut
, (const char*)buf
, length
, status
);
349 //---------------------------------------------------------------------------
351 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests
352 // for the LookingAt() and Match() functions.
355 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
357 // The expected results are UBool - TRUE or FALSE.
358 // The input text is unescaped. The pattern is not.
361 //---------------------------------------------------------------------------
// Runs the same lookingAt()/matches() expectation twice: once through the
// UnicodeString path (doRegexLMTest) and once through the UTF-8 UText path
// (doRegexLMTestUTF8), passing the caller's line number to both.
363 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
// Compiles a pattern, applies it to an input string and checks the results of
// lookingAt() and matches() against the expected values.
//   pat      pattern source, invariant characters; not unescaped
//   text     input text, invariant characters; unescaped before matching
//   looking  expected result of lookingAt()
//   match    expected result of matches()
//   line     source line of the calling test, used in failure messages
// NOTE(review): the declarations of pe, actualmatch and retVal, the statements
// that set retVal, the cleanup of REPattern/REMatcher and the return statement
// are not visible in this excerpt.
365 UBool
RegexTest::doRegexLMTest(const char *pat
, const char *text
, UBool looking
, UBool match
, int32_t line
) {
// NOTE(review): `pattern` is not referenced again in the visible lines; the
// compile below uses a separately built patString.
366 const UnicodeString
pattern(pat
, -1, US_INV
);
367 const UnicodeString
inputText(text
, -1, US_INV
);
368 UErrorCode status
= U_ZERO_ERROR
;
370 RegexPattern
*REPattern
= NULL
;
371 RegexMatcher
*REMatcher
= NULL
;
374 UnicodeString
patString(pat
, -1, US_INV
);
375 REPattern
= RegexPattern::compile(patString
, 0, pe
, status
);
376 if (U_FAILURE(status
)) {
377 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
378 line
, u_errorName(status
));
// NOTE(review): comparison against a hard-coded caller line number; this looks
// like a debugging leftover that dumps the pattern for one specific call site.
381 if (line
==376) { REPattern
->dumpPattern();}
// Resolve escape sequences in the input text before matching.
383 UnicodeString
inputString(inputText
);
384 UnicodeString unEscapedInput
= inputString
.unescape();
385 REMatcher
= REPattern
->matcher(unEscapedInput
, status
);
386 if (U_FAILURE(status
)) {
387 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
388 line
, u_errorName(status
));
// First expectation: lookingAt().
393 actualmatch
= REMatcher
->lookingAt(status
);
394 if (U_FAILURE(status
)) {
395 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
396 line
, u_errorName(status
));
399 if (actualmatch
!= looking
) {
400 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line
);
// Second expectation: matches(), starting from a clean status.
404 status
= U_ZERO_ERROR
;
405 actualmatch
= REMatcher
->matches(status
);
406 if (U_FAILURE(status
)) {
407 errln("RegexTest failure in matches() at line %d. Status = %s\n",
408 line
, u_errorName(status
));
411 if (actualmatch
!= match
) {
412 errln("RegexTest: wrong return from matches() at line %d.\n", line
);
// When the test failed, dump the compiled pattern to help diagnosis.
416 if (retVal
== FALSE
) {
417 REPattern
->dumpPattern();
// UTF-8 / UText counterpart of doRegexLMTest(): compiles the pattern from a
// UTF-8 UText, converts the unescaped input to UTF-8, matches over a UText and
// checks lookingAt() and matches() against the expected values.
//   pat      pattern source, invariant characters; not unescaped
//   text     input text, invariant characters; unescaped before matching
//   looking  expected result of lookingAt()
//   match    expected result of matches()
//   line     source line of the calling test, used in failure messages
// Returns TRUE without testing when the input cannot be converted to UTF-8.
// NOTE(review): the declarations of pe, actualmatch and retVal, the statements
// that set retVal, the cleanup of REPattern/REMatcher/textChars and the final
// return are not visible in this excerpt.
426 UBool
RegexTest::doRegexLMTestUTF8(const char *pat
, const char *text
, UBool looking
, UBool match
, int32_t line
) {
427 UText pattern
= UTEXT_INITIALIZER
;
428 int32_t inputUTF8Length
;
429 char *textChars
= NULL
;
430 UText inputText
= UTEXT_INITIALIZER
;
431 UErrorCode status
= U_ZERO_ERROR
;
433 RegexPattern
*REPattern
= NULL
;
434 RegexMatcher
*REMatcher
= NULL
;
// Compile the pattern from a UTF-8 UText.
437 regextst_openUTF8FromInvariant(&pattern
, pat
, -1, &status
);
438 REPattern
= RegexPattern::compile(&pattern
, 0, pe
, status
);
439 if (U_FAILURE(status
)) {
440 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
441 line
, u_errorName(status
));
// Unescape the input, then convert it to UTF-8 with a converter whose
// from-Unicode callback is set to STOP (presumably so that unconvertible input
// is reported as an error rather than substituted).
445 UnicodeString
inputString(text
, -1, US_INV
);
446 UnicodeString unEscapedInput
= inputString
.unescape();
447 LocalUConverterPointer
UTF8Converter(ucnv_open("UTF8", &status
));
448 ucnv_setFromUCallBack(UTF8Converter
.getAlias(), UCNV_FROM_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &status
);
// Preflight call with a NULL/zero-size destination to learn the required
// length; U_BUFFER_OVERFLOW_ERROR is the expected outcome here.
450 inputUTF8Length
= unEscapedInput
.extract(NULL
, 0, UTF8Converter
.getAlias(), status
);
451 if (U_FAILURE(status
) && status
!= U_BUFFER_OVERFLOW_ERROR
) {
452 // UTF-8 does not allow unpaired surrogates, so this could actually happen
453 logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line
, u_errorName(status
));
454 return TRUE
; // not a failure of the Regex engine
456 status
= U_ZERO_ERROR
; // buffer overflow
// Allocate room for the converted text plus a terminator and convert for real.
// NOTE(review): the delete[] matching this new[] is not visible in this excerpt.
457 textChars
= new char[inputUTF8Length
+1];
458 unEscapedInput
.extract(textChars
, inputUTF8Length
+1, UTF8Converter
.getAlias(), status
);
459 utext_openUTF8(&inputText
, textChars
, inputUTF8Length
, &status
);
// Create a matcher with no input and point it at the UText; reset() returns a
// reference, whose address is kept.
// NOTE(review): the result of matcher(status) is dereferenced before status is
// checked; confirm it cannot be NULL on failure.
461 REMatcher
= &REPattern
->matcher(status
)->reset(&inputText
);
462 if (U_FAILURE(status
)) {
463 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
464 line
, u_errorName(status
));
// First expectation: lookingAt().
469 actualmatch
= REMatcher
->lookingAt(status
);
470 if (U_FAILURE(status
)) {
471 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
472 line
, u_errorName(status
));
475 if (actualmatch
!= looking
) {
476 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line
);
// Second expectation: matches(), starting from a clean status.
480 status
= U_ZERO_ERROR
;
481 actualmatch
= REMatcher
->matches(status
);
482 if (U_FAILURE(status
)) {
483 errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
484 line
, u_errorName(status
));
487 if (actualmatch
!= match
) {
488 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line
);
// When the test failed, dump the compiled pattern to help diagnosis.
492 if (retVal
== FALSE
) {
493 REPattern
->dumpPattern();
// Release both UTexts opened by this function.
498 utext_close(&inputText
);
499 utext_close(&pattern
);
506 //---------------------------------------------------------------------------
508 // REGEX_ERR Macro + invocation function to simplify writing
509 // regex tests for incorrect patterns
512 // REGEX_ERR("pattern", expected error line, column, expected status);
514 //---------------------------------------------------------------------------
// Forwards to regex_err(), adding the caller's line number. Note that the
// expansion already ends with a semicolon.
515 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
// Compiles a pattern and checks that compilation ends with the expected status;
// when the status is not U_ZERO_ERROR, also checks the line and offset reported
// in the UParseError. The check is performed twice: once compiling from a
// UnicodeString and once from a UTF-8 UText.
//   pat             pattern source
//   errLine/errCol  expected UParseError line and offset
//   expectedStatus  expected UErrorCode from RegexPattern::compile()
//   line            source line of the calling test, used in messages
// NOTE(review): the declaration of pe and the closing braces are not visible
// in this excerpt.
517 void RegexTest::regex_err(const char *pat
, int32_t errLine
, int32_t errCol
,
518 UErrorCode expectedStatus
, int32_t line
) {
// NOTE(review): `pattern` is not referenced again in the visible lines.
519 UnicodeString
pattern(pat
);
521 UErrorCode status
= U_ZERO_ERROR
;
523 RegexPattern
*callerPattern
= NULL
;
526 // Compile the caller's pattern
528 UnicodeString
patString(pat
);
529 callerPattern
= RegexPattern::compile(patString
, 0, pe
, status
);
530 if (status
!= expectedStatus
) {
531 dataerrln("Line %d: unexpected error %s compiling pattern.", line
, u_errorName(status
));
// An error was reported: verify where the parser says it occurred.
533 if (status
!= U_ZERO_ERROR
) {
534 if (pe
.line
!= errLine
|| pe
.offset
!= errCol
) {
535 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
536 line
, errLine
, errCol
, pe
.line
, pe
.offset
);
541 delete callerPattern
;
544 // Compile again, using a UTF-8-based UText
// NOTE(review): no reset of `status` is visible between the two compiles. If
// the first compile left a failure code in it, the calls below receive an
// already-failed status; confirm that the UTF-8 path is really exercised for
// error patterns.
546 UText patternText
= UTEXT_INITIALIZER
;
547 regextst_openUTF8FromInvariant(&patternText
, pat
, -1, &status
);
548 callerPattern
= RegexPattern::compile(&patternText
, 0, pe
, status
);
549 if (status
!= expectedStatus
) {
550 dataerrln("Line %d: unexpected error %s compiling pattern.", line
, u_errorName(status
));
552 if (status
!= U_ZERO_ERROR
) {
553 if (pe
.line
!= errLine
|| pe
.offset
!= errCol
) {
554 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
555 line
, errLine
, errCol
, pe
.line
, pe
.offset
);
// Release the second compiled pattern and the UText it was built from.
560 delete callerPattern
;
561 utext_close(&patternText
);
566 //---------------------------------------------------------------------------
568 // Basic Check for basic functionality of regex pattern matching.
569 // Avoid the use of REGEX_FIND test macro, which has
570 // substantial dependencies on basic Regex functionality.
572 //---------------------------------------------------------------------------
// Basic sanity tests for pattern matching. Each REGEX_TESTLM line gives a
// pattern, an input text, the expected lookingAt() result and the expected
// matches() result, and is run through both the UnicodeString and the UTF-8
// code paths.
573 void RegexTest::Basic() {
577 // Debug - slide failing test cases early
581 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
// NOTE(review): the statements from here to the printf look like scratch debug
// code. In the visible lines status is not checked before pattern is
// dereferenced, and neither pattern nor m is deleted. Any preprocessor guard
// around this section and the declaration of pe are not visible in this excerpt.
583 UErrorCode status
= U_ZERO_ERROR
;
584 RegexPattern
*pattern
;
585 pattern
= RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE
, pe
, status
);
586 pattern
->dumpPattern();
587 RegexMatcher
*m
= pattern
->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status
);
588 UBool result
= m
->find();
589 printf("result = %d\n", result
);
590 // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
591 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
598 // Pattern with parentheses
600 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE
, FALSE
);
601 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE
, TRUE
);
602 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE
, FALSE
);
// Star quantifier applied to a group and to a single character.
607 REGEX_TESTLM("st(abc)*ring", "string", TRUE
, TRUE
);
608 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE
, TRUE
);
609 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE
, TRUE
);
610 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE
, FALSE
);
611 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE
, FALSE
);
613 REGEX_TESTLM("a*", "", TRUE
, TRUE
);
614 REGEX_TESTLM("a*", "b", TRUE
, FALSE
);
// Dot (any character) patterns.
620 REGEX_TESTLM(".", "abc", TRUE
, FALSE
);
621 REGEX_TESTLM("...", "abc", TRUE
, TRUE
);
622 REGEX_TESTLM("....", "abc", FALSE
, FALSE
);
623 REGEX_TESTLM(".*", "abcxyz123", TRUE
, TRUE
);
624 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE
, FALSE
);
625 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE
, TRUE
);
626 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE
, TRUE
);
627 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE
, FALSE
);
630 // Patterns with * applied to chars at end of literal string
632 REGEX_TESTLM("abc*", "ab", TRUE
, TRUE
);
633 REGEX_TESTLM("abc*", "abccccc", TRUE
, TRUE
);
636 // Supplemental chars match as single chars, not a pair of surrogates.
638 REGEX_TESTLM(".", "\\U00011000", TRUE
, TRUE
);
639 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE
, TRUE
);
640 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE
, FALSE
);
644 // UnicodeSets in the pattern
646 REGEX_TESTLM("[1-6]", "1", TRUE
, TRUE
);
647 REGEX_TESTLM("[1-6]", "3", TRUE
, TRUE
);
648 REGEX_TESTLM("[1-6]", "7", FALSE
, FALSE
);
649 REGEX_TESTLM("a[1-6]", "a3", TRUE
, TRUE
);
// NOTE(review): this test is identical to the one immediately above it.
650 REGEX_TESTLM("a[1-6]", "a3", TRUE
, TRUE
);
651 REGEX_TESTLM("a[1-6]b", "a3b", TRUE
, TRUE
);
653 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE
, TRUE
);
654 REGEX_TESTLM("a[0-9]*b", "abc", TRUE
, FALSE
);
655 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE
, TRUE
);
656 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE
, FALSE
); // note that * matches 0 occurrences.
657 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE
, TRUE
);
660 // OR operator in patterns
662 REGEX_TESTLM("(a|b)", "a", TRUE
, TRUE
);
663 REGEX_TESTLM("(a|b)", "b", TRUE
, TRUE
);
664 REGEX_TESTLM("(a|b)", "c", FALSE
, FALSE
);
665 REGEX_TESTLM("a|b", "b", TRUE
, TRUE
);
667 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE
, TRUE
);
668 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE
, FALSE
);
669 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE
, TRUE
);
670 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE
, TRUE
);
671 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE
, TRUE
);
672 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE
, FALSE
);
// Plus quantifier (one or more).
677 REGEX_TESTLM("ab+", "abbc", TRUE
, FALSE
);
678 REGEX_TESTLM("ab+c", "ac", FALSE
, FALSE
);
679 REGEX_TESTLM("b+", "", FALSE
, FALSE
);
680 REGEX_TESTLM("(abc|def)+", "defabc", TRUE
, TRUE
);
681 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE
, FALSE
);
682 REGEX_TESTLM(".+y", "zippity dooy", TRUE
, TRUE
);
// Question-mark quantifier (zero or one).
687 REGEX_TESTLM("ab?", "ab", TRUE
, TRUE
);
688 REGEX_TESTLM("ab?", "a", TRUE
, TRUE
);
689 REGEX_TESTLM("ab?", "ac", TRUE
, FALSE
);
690 REGEX_TESTLM("ab?", "abb", TRUE
, FALSE
);
691 REGEX_TESTLM("a(b|c)?d", "abd", TRUE
, TRUE
);
692 REGEX_TESTLM("a(b|c)?d", "acd", TRUE
, TRUE
);
693 REGEX_TESTLM("a(b|c)?d", "ad", TRUE
, TRUE
);
694 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE
, FALSE
);
695 REGEX_TESTLM("a(b|c)?d", "ab", FALSE
, FALSE
);
698 // Escape sequences that become single literal chars, handled internally
699 // by ICU's Unescape.
702 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
703 REGEX_TESTLM("\\a", "\\u0007", TRUE
, TRUE
); // BEL
704 REGEX_TESTLM("\\cL", "\\u000c", TRUE
, TRUE
); // Control-L
705 REGEX_TESTLM("\\e", "\\u001b", TRUE
, TRUE
); // Escape
706 REGEX_TESTLM("\\f", "\\u000c", TRUE
, TRUE
); // Form Feed
707 REGEX_TESTLM("\\n", "\\u000a", TRUE
, TRUE
); // new line
708 REGEX_TESTLM("\\r", "\\u000d", TRUE
, TRUE
); // CR
709 REGEX_TESTLM("\\t", "\\u0009", TRUE
, TRUE
); // Tab
710 REGEX_TESTLM("\\u1234", "\\u1234", TRUE
, TRUE
);
711 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE
, TRUE
);
713 REGEX_TESTLM(".*\\Ax", "xyz", TRUE
, FALSE
); // \A matches only at the beginning of input
714 REGEX_TESTLM(".*\\Ax", " xyz", FALSE
, FALSE
); // \A matches only at the beginning of input
716 // Escape of special chars in patterns
717 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE
, TRUE
);
721 //---------------------------------------------------------------------------
723 // UTextBasic Check for quirks that are specific to the UText
726 //---------------------------------------------------------------------------
// Checks UText-specific behaviour of RegexMatcher: a matcher built from a UText
// pattern reports the expected input text after reset(UText*), and still does
// after being reset with its own inputText().
// NOTE(review): status checks and the close of `input` are not visible in this
// excerpt.
727 void RegexTest::UTextBasic() {
// The string is spelled as numeric byte values rather than a character literal.
728 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
729 UErrorCode status
= U_ZERO_ERROR
;
730 UText pattern
= UTEXT_INITIALIZER
;
731 utext_openUTF8(&pattern
, str_abc
, -1, &status
);
732 RegexMatcher
matcher(&pattern
, 0, status
);
735 UText input
= UTEXT_INITIALIZER
;
736 utext_openUTF8(&input
, str_abc
, -1, &status
);
738 matcher
.reset(&input
);
740 REGEX_ASSERT_UTEXT_UTF8(str_abc
, matcher
.inputText());
// Resetting the matcher with its own input text must leave that text intact.
742 matcher
.reset(matcher
.inputText());
744 REGEX_ASSERT_UTEXT_UTF8(str_abc
, matcher
.inputText());
746 utext_close(&pattern
);
751 //---------------------------------------------------------------------------
753 // API_Match Test that the API for class RegexMatcher
754 // is present and nominally working, but excluding functions
755 // implementing replace operations.
757 //---------------------------------------------------------------------------
758 void RegexTest::API_Match() {
760 UErrorCode status
=U_ZERO_ERROR
;
764 // Debug - slide failing test cases early
773 // Simple pattern compilation
776 UnicodeString
re("abc");
778 pat2
= RegexPattern::compile(re
, flags
, pe
, status
);
781 UnicodeString inStr1
= "abcdef this is a test";
782 UnicodeString instr2
= "not abc";
783 UnicodeString empty
= "";
787 // Matcher creation and reset.
789 RegexMatcher
*m1
= pat2
->matcher(inStr1
, status
);
791 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
792 REGEX_ASSERT(m1
->input() == inStr1
);
794 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
795 REGEX_ASSERT(m1
->input() == instr2
);
797 REGEX_ASSERT(m1
->input() == inStr1
);
798 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
800 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
801 REGEX_ASSERT(m1
->input() == empty
);
802 REGEX_ASSERT(&m1
->pattern() == pat2
);
805 // reset(pos, status)
808 m1
->reset(4, status
);
810 REGEX_ASSERT(m1
->input() == inStr1
);
811 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
813 m1
->reset(-1, status
);
814 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
815 status
= U_ZERO_ERROR
;
817 m1
->reset(0, status
);
819 status
= U_ZERO_ERROR
;
821 int32_t len
= m1
->input().length();
822 m1
->reset(len
-1, status
);
824 status
= U_ZERO_ERROR
;
826 m1
->reset(len
, status
);
828 status
= U_ZERO_ERROR
;
830 m1
->reset(len
+1, status
);
831 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
832 status
= U_ZERO_ERROR
;
835 // match(pos, status)
838 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
840 REGEX_ASSERT(m1
->matches(3, status
) == FALSE
);
842 REGEX_ASSERT(m1
->matches(5, status
) == FALSE
);
843 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
844 REGEX_ASSERT(m1
->matches(-1, status
) == FALSE
);
845 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
847 // Match() at end of string should fail, but should not
849 status
= U_ZERO_ERROR
;
850 len
= m1
->input().length();
851 REGEX_ASSERT(m1
->matches(len
, status
) == FALSE
);
854 // Match beyond end of string should fail with an error.
855 status
= U_ZERO_ERROR
;
856 REGEX_ASSERT(m1
->matches(len
+1, status
) == FALSE
);
857 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
859 // Successful match at end of string.
861 status
= U_ZERO_ERROR
;
862 RegexMatcher
m("A?", 0, status
); // will match zero length string.
865 len
= inStr1
.length();
866 REGEX_ASSERT(m
.matches(len
, status
) == TRUE
);
869 REGEX_ASSERT(m
.matches(0, status
) == TRUE
);
875 // lookingAt(pos, status)
877 status
= U_ZERO_ERROR
;
878 m1
->reset(instr2
); // "not abc"
879 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
880 REGEX_ASSERT(m1
->lookingAt(5, status
) == FALSE
);
881 REGEX_ASSERT(m1
->lookingAt(3, status
) == FALSE
);
882 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
883 REGEX_ASSERT(m1
->lookingAt(-1, status
) == FALSE
);
884 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
885 status
= U_ZERO_ERROR
;
886 len
= m1
->input().length();
887 REGEX_ASSERT(m1
->lookingAt(len
, status
) == FALSE
);
889 REGEX_ASSERT(m1
->lookingAt(len
+1, status
) == FALSE
);
890 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
899 // RegexMatcher::start();
900 // RegexMatcher::end();
901 // RegexMatcher::groupCount();
906 UErrorCode status
=U_ZERO_ERROR
;
908 UnicodeString
re("01(23(45)67)(.*)");
909 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
911 UnicodeString data
= "0123456789";
913 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
915 REGEX_ASSERT(matcher
->lookingAt(status
) == TRUE
);
916 static const int32_t matchStarts
[] = {0, 2, 4, 8};
917 static const int32_t matchEnds
[] = {10, 8, 6, 10};
919 for (i
=0; i
<4; i
++) {
920 int32_t actualStart
= matcher
->start(i
, status
);
922 if (actualStart
!= matchStarts
[i
]) {
923 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
924 __LINE__
, i
, matchStarts
[i
], actualStart
);
926 int32_t actualEnd
= matcher
->end(i
, status
);
928 if (actualEnd
!= matchEnds
[i
]) {
929 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
930 __LINE__
, i
, matchEnds
[i
], actualEnd
);
934 REGEX_ASSERT(matcher
->start(0, status
) == matcher
->start(status
));
935 REGEX_ASSERT(matcher
->end(0, status
) == matcher
->end(status
));
937 REGEX_ASSERT_FAIL(matcher
->start(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
938 REGEX_ASSERT_FAIL(matcher
->start( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
940 REGEX_ASSERT_FAIL(matcher
->start( 0, status
), U_REGEX_INVALID_STATE
);
942 matcher
->lookingAt(status
);
943 REGEX_ASSERT(matcher
->group(status
) == "0123456789");
944 REGEX_ASSERT(matcher
->group(0, status
) == "0123456789");
945 REGEX_ASSERT(matcher
->group(1, status
) == "234567" );
946 REGEX_ASSERT(matcher
->group(2, status
) == "45" );
947 REGEX_ASSERT(matcher
->group(3, status
) == "89" );
949 REGEX_ASSERT_FAIL(matcher
->group(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
950 REGEX_ASSERT_FAIL(matcher
->group( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
952 REGEX_ASSERT_FAIL(matcher
->group( 0, status
), U_REGEX_INVALID_STATE
);
965 UErrorCode status
=U_ZERO_ERROR
;
967 UnicodeString
re("abc");
968 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
970 UnicodeString data
= ".abc..abc...abc..";
971 // 012345678901234567
973 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
975 REGEX_ASSERT(matcher
->find());
976 REGEX_ASSERT(matcher
->start(status
) == 1);
977 REGEX_ASSERT(matcher
->find());
978 REGEX_ASSERT(matcher
->start(status
) == 6);
979 REGEX_ASSERT(matcher
->find());
980 REGEX_ASSERT(matcher
->start(status
) == 12);
981 REGEX_ASSERT(matcher
->find() == FALSE
);
982 REGEX_ASSERT(matcher
->find() == FALSE
);
985 REGEX_ASSERT(matcher
->find());
986 REGEX_ASSERT(matcher
->start(status
) == 1);
988 REGEX_ASSERT(matcher
->find(0, status
));
989 REGEX_ASSERT(matcher
->start(status
) == 1);
990 REGEX_ASSERT(matcher
->find(1, status
));
991 REGEX_ASSERT(matcher
->start(status
) == 1);
992 REGEX_ASSERT(matcher
->find(2, status
));
993 REGEX_ASSERT(matcher
->start(status
) == 6);
994 REGEX_ASSERT(matcher
->find(12, status
));
995 REGEX_ASSERT(matcher
->start(status
) == 12);
996 REGEX_ASSERT(matcher
->find(13, status
) == FALSE
);
997 REGEX_ASSERT(matcher
->find(16, status
) == FALSE
);
998 REGEX_ASSERT(matcher
->find(17, status
) == FALSE
);
999 REGEX_ASSERT_FAIL(matcher
->start(status
), U_REGEX_INVALID_STATE
);
1001 status
= U_ZERO_ERROR
;
1002 REGEX_ASSERT_FAIL(matcher
->find(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1003 status
= U_ZERO_ERROR
;
1004 REGEX_ASSERT_FAIL(matcher
->find(18, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1006 REGEX_ASSERT(matcher
->groupCount() == 0);
1014 // find, with \G in pattern (true if at the end of a previous match).
1019 UErrorCode status
=U_ZERO_ERROR
;
1021 UnicodeString
re(".*?(?:(\\Gabc)|(abc))", -1, US_INV
);
1022 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
1024 UnicodeString data
= ".abcabc.abc..";
1025 // 012345678901234567
1027 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
1029 REGEX_ASSERT(matcher
->find());
1030 REGEX_ASSERT(matcher
->start(status
) == 0);
1031 REGEX_ASSERT(matcher
->start(1, status
) == -1);
1032 REGEX_ASSERT(matcher
->start(2, status
) == 1);
1034 REGEX_ASSERT(matcher
->find());
1035 REGEX_ASSERT(matcher
->start(status
) == 4);
1036 REGEX_ASSERT(matcher
->start(1, status
) == 4);
1037 REGEX_ASSERT(matcher
->start(2, status
) == -1);
1045 // find with zero length matches, match position should bump ahead
1046 // to prevent loops.
1050 UErrorCode status
=U_ZERO_ERROR
;
1051 RegexMatcher
m("(?= ?)", 0, status
); // This pattern will produce zero-length matches anywhere,
1052 // using an always-true look-ahead.
1054 UnicodeString
s(" ");
1057 if (m
.find() == FALSE
) {
1060 REGEX_ASSERT(m
.start(status
) == i
);
1061 REGEX_ASSERT(m
.end(status
) == i
);
1065 // Check that the bump goes over surrogate pairs OK
1066 s
= UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1070 if (m
.find() == FALSE
) {
1073 REGEX_ASSERT(m
.start(status
) == i
);
1074 REGEX_ASSERT(m
.end(status
) == i
);
1076 REGEX_ASSERT(i
==10);
1079 // find() loop breaking test.
1080 // with pattern of /.?/, should see a series of one char matches, then a single
1081 // match of zero length at the end of the input string.
1083 UErrorCode status
=U_ZERO_ERROR
;
1084 RegexMatcher
m(".?", 0, status
);
1086 UnicodeString
s(" ");
1089 if (m
.find() == FALSE
) {
1092 REGEX_ASSERT(m
.start(status
) == i
);
1093 REGEX_ASSERT(m
.end(status
) == (i
<4 ? i
+1 : i
));
1100 // Matchers with no input string behave as if they had an empty input string.
1104 UErrorCode status
= U_ZERO_ERROR
;
1105 RegexMatcher
m(".?", 0, status
);
1107 REGEX_ASSERT(m
.find());
1108 REGEX_ASSERT(m
.start(status
) == 0);
1109 REGEX_ASSERT(m
.input() == "");
1112 UErrorCode status
= U_ZERO_ERROR
;
1113 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
1114 RegexMatcher
*m
= p
->matcher(status
);
1117 REGEX_ASSERT(m
->find() == FALSE
);
1118 REGEX_ASSERT(m
->input() == "");
1127 UErrorCode status
= U_ZERO_ERROR
;
1128 UnicodeString
testString("This is test data");
1129 RegexMatcher
m(".*", testString
, 0, status
);
1131 REGEX_ASSERT(m
.regionStart() == 0);
1132 REGEX_ASSERT(m
.regionEnd() == testString
.length());
1133 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1134 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1136 m
.region(2,4, status
);
1138 REGEX_ASSERT(m
.matches(status
));
1139 REGEX_ASSERT(m
.start(status
)==2);
1140 REGEX_ASSERT(m
.end(status
)==4);
1144 REGEX_ASSERT(m
.regionStart() == 0);
1145 REGEX_ASSERT(m
.regionEnd() == testString
.length());
1147 UnicodeString
shorterString("short");
1148 m
.reset(shorterString
);
1149 REGEX_ASSERT(m
.regionStart() == 0);
1150 REGEX_ASSERT(m
.regionEnd() == shorterString
.length());
1152 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1153 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(FALSE
));
1154 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
1155 REGEX_ASSERT(&m
== &m
.reset());
1156 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
1158 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(TRUE
));
1159 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1160 REGEX_ASSERT(&m
== &m
.reset());
1161 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1163 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1164 REGEX_ASSERT(&m
== &m
.useTransparentBounds(TRUE
));
1165 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
1166 REGEX_ASSERT(&m
== &m
.reset());
1167 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
1169 REGEX_ASSERT(&m
== &m
.useTransparentBounds(FALSE
));
1170 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1171 REGEX_ASSERT(&m
== &m
.reset());
1172 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1177 // hitEnd() and requireEnd()
1180 UErrorCode status
= U_ZERO_ERROR
;
1181 UnicodeString
testString("aabb");
1182 RegexMatcher
m1(".*", testString
, 0, status
);
1183 REGEX_ASSERT(m1
.lookingAt(status
) == TRUE
);
1184 REGEX_ASSERT(m1
.hitEnd() == TRUE
);
1185 REGEX_ASSERT(m1
.requireEnd() == FALSE
);
1188 status
= U_ZERO_ERROR
;
1189 RegexMatcher
m2("a*", testString
, 0, status
);
1190 REGEX_ASSERT(m2
.lookingAt(status
) == TRUE
);
1191 REGEX_ASSERT(m2
.hitEnd() == FALSE
);
1192 REGEX_ASSERT(m2
.requireEnd() == FALSE
);
1195 status
= U_ZERO_ERROR
;
1196 RegexMatcher
m3(".*$", testString
, 0, status
);
1197 REGEX_ASSERT(m3
.lookingAt(status
) == TRUE
);
1198 REGEX_ASSERT(m3
.hitEnd() == TRUE
);
1199 REGEX_ASSERT(m3
.requireEnd() == TRUE
);
1205 // Compilation error on reset with UChar *
1206 // These were a hazard that people were stumbling over with runtime errors.
1207 // Changed them to compiler errors by adding private methods that more closely
1208 // matched the incorrect use of the functions.
1212 UErrorCode status
= U_ZERO_ERROR
;
1213 UChar ucharString
[20];
1214 RegexMatcher
m(".", 0, status
);
1215 m
.reset(ucharString
); // should not compile.
1217 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
1218 RegexMatcher
*m2
= p
->matcher(ucharString
, status
); // should not compile.
1220 RegexMatcher
m3(".", ucharString
, 0, status
); // Should not compile
1226 // Note: These tests will need to be changed when the regexp engine is
1227 // able to detect and cut short the exponential time behavior on
1228 // this type of match.
1231 UErrorCode status
= U_ZERO_ERROR
;
1232 // Enough 'a's in the string to cause the match to time out.
1233 // (Each additional 'a' doubles the time)
1234 UnicodeString
testString("aaaaaaaaaaaaaaaaaaaaa");
1235 RegexMatcher
matcher("(a+)+b", testString
, 0, status
);
1237 REGEX_ASSERT(matcher
.getTimeLimit() == 0);
1238 matcher
.setTimeLimit(100, status
);
1239 REGEX_ASSERT(matcher
.getTimeLimit() == 100);
1240 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1241 REGEX_ASSERT(status
== U_REGEX_TIME_OUT
);
1244 UErrorCode status
= U_ZERO_ERROR
;
1245 // Few enough 'a's to slip in under the time limit.
1246 UnicodeString
testString("aaaaaaaaaaaaaaaaaa");
1247 RegexMatcher
matcher("(a+)+b", testString
, 0, status
);
1249 matcher
.setTimeLimit(100, status
);
1250 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1258 UErrorCode status
= U_ZERO_ERROR
;
1259 UnicodeString
testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
1261 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1262 // of the '+', and makes the stack frames larger.
1263 RegexMatcher
matcher("(A)+A$", testString
, 0, status
);
1265 // With the default stack, this match should fail to run
1266 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1267 REGEX_ASSERT(status
== U_REGEX_STACK_OVERFLOW
);
1269 // With unlimited stack, it should run
1270 status
= U_ZERO_ERROR
;
1271 matcher
.setStackLimit(0, status
);
1273 REGEX_ASSERT(matcher
.lookingAt(status
) == TRUE
);
1275 REGEX_ASSERT(matcher
.getStackLimit() == 0);
1277 // With a limited stack, the match should fail
1278 status
= U_ZERO_ERROR
;
1279 matcher
.setStackLimit(10000, status
);
1280 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1281 REGEX_ASSERT(status
== U_REGEX_STACK_OVERFLOW
);
1282 REGEX_ASSERT(matcher
.getStackLimit() == 10000);
1285 // A pattern that doesn't save state should work with
1286 // a minimal sized stack
1288 UErrorCode status
= U_ZERO_ERROR
;
1289 UnicodeString testString
= "abc";
1290 RegexMatcher
matcher("abc", testString
, 0, status
);
1292 matcher
.setStackLimit(30, status
);
1294 REGEX_ASSERT(matcher
.matches(status
) == TRUE
);
1296 REGEX_ASSERT(matcher
.getStackLimit() == 30);
1298 // Negative stack sizes should fail
1299 status
= U_ZERO_ERROR
;
1300 matcher
.setStackLimit(1000, status
);
1302 matcher
.setStackLimit(-1, status
);
1303 REGEX_ASSERT(status
== U_ILLEGAL_ARGUMENT_ERROR
);
1304 REGEX_ASSERT(matcher
.getStackLimit() == 1000);
1315 //---------------------------------------------------------------------------
1317 // API_Replace API test for class RegexMatcher, testing the
1318 // Replace family of functions.
1320 //---------------------------------------------------------------------------
1321 void RegexTest::API_Replace() {
// Exercises RegexMatcher::replaceFirst() and replaceAll() with literal,
// empty and capture-group replacement text, then appendReplacement() and
// appendTail(). Expectations are checked with REGEX_ASSERT and
// REGEX_ASSERT_FAIL.
// NOTE(review): flags, pe and dest are used below but their declarations
// are not visible in this view of the function.
1327 UErrorCode status
=U_ZERO_ERROR
;
1329 UnicodeString
re("abc");
1330 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
1332 UnicodeString data
= ".abc..abc...abc..";
1333 // 012345678901234567
1334 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
1337 // Plain vanilla matches.
1340 dest
= matcher
->replaceFirst("yz", status
);
1342 REGEX_ASSERT(dest
== ".yz..abc...abc..");
1344 dest
= matcher
->replaceAll("yz", status
);
1346 REGEX_ASSERT(dest
== ".yz..yz...yz..");
1349 // Plain vanilla non-matches.
// NOTE(review): d2 is declared here, but the statement that points the
// matcher at it is not visible in this view; the asserts expect the input
// text to come back unchanged. The same applies to d3 and d4 below.
1351 UnicodeString d2
= ".abx..abx...abx..";
1353 dest
= matcher
->replaceFirst("yz", status
);
1355 REGEX_ASSERT(dest
== ".abx..abx...abx..");
1357 dest
= matcher
->replaceAll("yz", status
);
1359 REGEX_ASSERT(dest
== ".abx..abx...abx..");
1362 // Empty source string
1364 UnicodeString d3
= "";
1366 dest
= matcher
->replaceFirst("yz", status
);
1368 REGEX_ASSERT(dest
== "");
1370 dest
= matcher
->replaceAll("yz", status
);
1372 REGEX_ASSERT(dest
== "");
1375 // Empty substitution string
1377 matcher
->reset(data
); // ".abc..abc...abc.."
1378 dest
= matcher
->replaceFirst("", status
);
1380 REGEX_ASSERT(dest
== "...abc...abc..");
1382 dest
= matcher
->replaceAll("", status
);
1384 REGEX_ASSERT(dest
== "........");
1387 // match whole string
1389 UnicodeString d4
= "abc";
1391 dest
= matcher
->replaceFirst("xyz", status
);
1393 REGEX_ASSERT(dest
== "xyz");
1395 dest
= matcher
->replaceAll("xyz", status
);
1397 REGEX_ASSERT(dest
== "xyz");
1400 // Capture Group, simple case
1402 UnicodeString
re2("a(..)");
1403 RegexPattern
*pat2
= RegexPattern::compile(re2
, flags
, pe
, status
);
1405 UnicodeString d5
= "abcdefg";
1406 RegexMatcher
*matcher2
= pat2
->matcher(d5
, status
);
// $1 in the replacement expands to capture group 1, which is bc for this
// pattern and input, so the match abc becomes bcbc.
1408 dest
= matcher2
->replaceFirst("$1$1", status
);
1410 REGEX_ASSERT(dest
== "bcbcdefg");
// A backslash-escaped dollar sign is expected to come through literally,
// while the unescaped $1 that follows is still substituted.
1412 dest
= matcher2
->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status
);
1414 REGEX_ASSERT(dest
== "The value of $1 is bc.defg");
// A dollar sign that is not followed by a group number is expected to be
// copied to the output unchanged.
1416 dest
= matcher2
->replaceFirst("$ by itself, no group number $$$", status
);
1418 REGEX_ASSERT(dest
== "$ by itself, no group number $$$defg");
// After unescape(), the dollar sign is followed by the supplementary code
// point U+1D7CF; the expected result shows it being read as group number 1.
1420 UnicodeString replacement
= UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1421 replacement
= replacement
.unescape();
1422 dest
= matcher2
->replaceFirst(replacement
, status
);
1424 REGEX_ASSERT(dest
== "Supplemental Digit 1 bc.defg");
// The pattern has a single capture group, so a reference to group 5 is
// expected to fail with U_INDEX_OUTOFBOUNDS_ERROR.
1426 REGEX_ASSERT_FAIL(matcher2
->replaceFirst("bad capture group number $5...",status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1430 // Replacement String with \u hex escapes
// The four-digit escape for U+0043 in the replacement text is expected to
// appear in the output as the letter C.
1433 UnicodeString src
= "abc 1 abc 2 abc 3";
1434 UnicodeString substitute
= UNICODE_STRING_SIMPLE("--\\u0043--");
1435 matcher
->reset(src
);
1436 UnicodeString result
= matcher
->replaceAll(substitute
, status
);
1438 REGEX_ASSERT(result
== "--C-- 1 --C-- 2 --C-- 3");
// Same check with the eight-digit escape form; the expected string is built
// by appending the code point 0x10000 directly.
// NOTE(review): src, substitute and result are declared a second time, so
// the enclosing scopes are presumably on lines not shown in this view.
1441 UnicodeString src
= "abc !";
1442 UnicodeString substitute
= UNICODE_STRING_SIMPLE("--\\U00010000--");
1443 matcher
->reset(src
);
1444 UnicodeString result
= matcher
->replaceAll(substitute
, status
);
1446 UnicodeString expected
= UnicodeString("--");
1447 expected
.append((UChar32
)0x10000);
1448 expected
.append("-- !");
1449 REGEX_ASSERT(result
== expected
);
1451 // TODO: need more thorough testing of capture substitutions.
// appendReplacement() / appendTail() checks.
// NOTE(review): the statements that attach s to the matcher, call find() and
// clear result between steps are not visible in this view.
1456 status
= U_ZERO_ERROR
;
1457 UnicodeString s
= "The matches start with ss and end with ee ss stuff ee fin";
1458 RegexMatcher
m("ss(.*?)ee", 0, status
);
1460 UnicodeString result
;
1462 // Multiple finds do NOT bump up the previous appendReplacement position.
1466 m
.appendReplacement(result
, "ooh", status
);
1468 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
1470 // After a reset into the interior of a string, appendReplacement still starts at beginning.
1471 status
= U_ZERO_ERROR
;
1473 m
.reset(10, status
);
1476 m
.appendReplacement(result
, "ooh", status
);
1478 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
1480 // find() at interior of string, appendReplacement still starts at beginning.
1481 status
= U_ZERO_ERROR
;
1486 m
.appendReplacement(result
, "ooh", status
);
1488 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
// appendTail() is expected to add the text that follows the last match.
1490 m
.appendTail(result
);
1491 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh fin");
1502 //---------------------------------------------------------------------------
1504 // API_Pattern Test that the API for class RegexPattern is
1505 // present and nominally working.
1507 //---------------------------------------------------------------------------
1508 void RegexTest::API_Pattern() {
// Covers RegexPattern default/copy construction and equality, compile()
// with and without flags, clone(), the static matches() helper, split(),
// pattern() and the class ID accessors.
// NOTE(review): patb, pe and n are used below but their declarations are
// not visible in this view of the function.
1509 RegexPattern pata
; // Test default constructor to not crash.
1512 REGEX_ASSERT(pata
== patb
);
1513 REGEX_ASSERT(pata
== pata
);
1515 UnicodeString
re1("abc[a-l][m-z]");
1516 UnicodeString
re2("def");
1517 UErrorCode status
= U_ZERO_ERROR
;
1520 RegexPattern
*pat1
= RegexPattern::compile(re1
, 0, pe
, status
);
1521 RegexPattern
*pat2
= RegexPattern::compile(re2
, 0, pe
, status
);
1523 REGEX_ASSERT(*pat1
== *pat1
);
1524 REGEX_ASSERT(*pat1
!= pata
);
// NOTE(review): patb is expected to equal *pat1 from here on, so an
// assignment into patb presumably sits on a line not shown in this view.
1528 REGEX_ASSERT(patb
== *pat1
);
// Copy construction must yield a pattern equal to its source.
1531 RegexPattern
patc(*pat1
);
1532 REGEX_ASSERT(patc
== *pat1
);
1533 REGEX_ASSERT(patb
== patc
);
// NOTE(review): pat1 and pat2 are pointers, so this compares addresses
// rather than patterns; *pat1 != *pat2 may have been intended - confirm.
1534 REGEX_ASSERT(pat1
!= pat2
);
// NOTE(review): patb now has to differ from patc and equal *pat2, so a
// reassignment of patb presumably sits on a line not shown in this view.
1536 REGEX_ASSERT(patb
!= patc
);
1537 REGEX_ASSERT(patb
== *pat2
);
1539 // Compile with no flags.
1540 RegexPattern
*pat1a
= RegexPattern::compile(re1
, pe
, status
);
1541 REGEX_ASSERT(*pat1a
== *pat1
);
1543 REGEX_ASSERT(pat1a
->flags() == 0);
1545 // Compile with different flags should be not equal
1546 RegexPattern
*pat1b
= RegexPattern::compile(re1
, UREGEX_CASE_INSENSITIVE
, pe
, status
);
1549 REGEX_ASSERT(*pat1b
!= *pat1a
);
1550 REGEX_ASSERT(pat1b
->flags() == UREGEX_CASE_INSENSITIVE
);
1551 REGEX_ASSERT(pat1a
->flags() == 0);
// clone() must produce a pattern equal to its source and unequal to an
// unrelated pattern.
1555 RegexPattern
*pat1c
= pat1
->clone();
1556 REGEX_ASSERT(*pat1c
== *pat1
);
1557 REGEX_ASSERT(*pat1c
!= *pat2
);
1566 // Verify that a matcher created from a cloned pattern works.
1570 UErrorCode status
= U_ZERO_ERROR
;
1571 RegexPattern
*pSource
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status
);
1572 RegexPattern
*pClone
= pSource
->clone();
1574 RegexMatcher
*mFromClone
= pClone
->matcher(status
);
1576 UnicodeString s
= "Hello World";
1577 mFromClone
->reset(s
);
1578 REGEX_ASSERT(mFromClone
->find() == TRUE
);
1579 REGEX_ASSERT(mFromClone
->group(status
) == "Hello");
1580 REGEX_ASSERT(mFromClone
->find() == TRUE
);
1581 REGEX_ASSERT(mFromClone
->group(status
) == "World");
1582 REGEX_ASSERT(mFromClone
->find() == FALSE
);
1588 // matches convenience API
// The expectations below show matches() requiring the whole input to match:
// the pattern abc and the pattern ending in u are both expected to fail.
1590 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe
, status
) == TRUE
);
1592 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe
, status
) == FALSE
);
1594 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe
, status
) == TRUE
);
1596 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe
, status
) == TRUE
);
1598 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe
, status
) == FALSE
);
// With a failure code already in status on entry, matches() is expected to
// return FALSE and leave status unchanged.
1600 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1601 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe
, status
) == FALSE
);
1602 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
// split() checks. The separator pattern here is one or more spaces.
// NOTE(review): the asserts on the return value n are not visible in this
// view; only the field contents are checked below.
1608 status
= U_ZERO_ERROR
;
1609 pat1
= RegexPattern::compile(" +", pe
, status
);
1611 UnicodeString fields
[10];
1614 n
= pat1
->split("Now is the time", fields
, 10, status
);
1617 REGEX_ASSERT(fields
[0]=="Now");
1618 REGEX_ASSERT(fields
[1]=="is");
1619 REGEX_ASSERT(fields
[2]=="the");
1620 REGEX_ASSERT(fields
[3]=="time");
1621 REGEX_ASSERT(fields
[4]=="");
// Capacity 2: the last available field is expected to receive the whole
// unsplit remainder of the input.
1623 n
= pat1
->split("Now is the time", fields
, 2, status
);
1626 REGEX_ASSERT(fields
[0]=="Now");
1627 REGEX_ASSERT(fields
[1]=="is the time");
1628 REGEX_ASSERT(fields
[2]=="the"); // left over from previous test
// Capacity 1: the single field is expected to hold the entire input.
// NOTE(review): fields[1] is presumably preset to a sentinel on a line not
// shown here; the check confirms split() did not write past the capacity.
1631 status
= U_ZERO_ERROR
;
1632 n
= pat1
->split("Now is the time", fields
, 1, status
);
1635 REGEX_ASSERT(fields
[0]=="Now is the time");
1636 REGEX_ASSERT(fields
[1]=="*");
1637 status
= U_ZERO_ERROR
;
// Leading and trailing separators are expected to yield empty first and
// last fields.
1639 n
= pat1
->split(" Now is the time ", fields
, 10, status
);
1642 REGEX_ASSERT(fields
[0]=="");
1643 REGEX_ASSERT(fields
[1]=="Now");
1644 REGEX_ASSERT(fields
[2]=="is");
1645 REGEX_ASSERT(fields
[3]=="the");
1646 REGEX_ASSERT(fields
[4]=="time");
1647 REGEX_ASSERT(fields
[5]=="");
1649 n
= pat1
->split(" ", fields
, 10, status
);
1652 REGEX_ASSERT(fields
[0]=="");
1653 REGEX_ASSERT(fields
[1]=="");
// Empty input: fields[0] is expected to keep its previous content.
// NOTE(review): the line that presets fields[0] is not visible in this view.
1656 n
= pat1
->split("", fields
, 10, status
);
1659 REGEX_ASSERT(fields
[0]=="foo");
1663 // split, with a pattern with (capture)
// The expectations show the text of the capture group being emitted as an
// extra field between the pieces of split input.
1664 pat1
= RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe
, status
);
1667 status
= U_ZERO_ERROR
;
1668 n
= pat1
->split("<a>Now is <b>the time<c>", fields
, 10, status
);
1671 REGEX_ASSERT(fields
[0]=="");
1672 REGEX_ASSERT(fields
[1]=="a");
1673 REGEX_ASSERT(fields
[2]=="Now is ");
1674 REGEX_ASSERT(fields
[3]=="b");
1675 REGEX_ASSERT(fields
[4]=="the time");
1676 REGEX_ASSERT(fields
[5]=="c");
1677 REGEX_ASSERT(fields
[6]=="");
1678 REGEX_ASSERT(status
==U_ZERO_ERROR
);
1680 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 10, status
);
1683 REGEX_ASSERT(fields
[0]==" ");
1684 REGEX_ASSERT(fields
[1]=="a");
1685 REGEX_ASSERT(fields
[2]=="Now is ");
1686 REGEX_ASSERT(fields
[3]=="b");
1687 REGEX_ASSERT(fields
[4]=="the time");
1688 REGEX_ASSERT(fields
[5]=="c");
1689 REGEX_ASSERT(fields
[6]=="");
// The same input is now split with progressively smaller capacities (6, 5
// and 4) to check how the final field and the entries past it are treated.
// NOTE(review): the lines that preset the sentinel entries checked below
// are not visible in this view.
1691 status
= U_ZERO_ERROR
;
1693 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 6, status
);
1696 REGEX_ASSERT(fields
[0]==" ");
1697 REGEX_ASSERT(fields
[1]=="a");
1698 REGEX_ASSERT(fields
[2]=="Now is ");
1699 REGEX_ASSERT(fields
[3]=="b");
1700 REGEX_ASSERT(fields
[4]=="the time");
1701 REGEX_ASSERT(fields
[5]==""); // All text following "<c>" field delimiter.
1702 REGEX_ASSERT(fields
[6]=="foo");
1704 status
= U_ZERO_ERROR
;
1706 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 5, status
);
1709 REGEX_ASSERT(fields
[0]==" ");
1710 REGEX_ASSERT(fields
[1]=="a");
1711 REGEX_ASSERT(fields
[2]=="Now is ");
1712 REGEX_ASSERT(fields
[3]=="b");
1713 REGEX_ASSERT(fields
[4]=="the time<c>");
1714 REGEX_ASSERT(fields
[5]=="foo");
1716 status
= U_ZERO_ERROR
;
1718 n
= pat1
->split(" <a>Now is <b>the time", fields
, 5, status
);
1721 REGEX_ASSERT(fields
[0]==" ");
1722 REGEX_ASSERT(fields
[1]=="a");
1723 REGEX_ASSERT(fields
[2]=="Now is ");
1724 REGEX_ASSERT(fields
[3]=="b");
1725 REGEX_ASSERT(fields
[4]=="the time");
1726 REGEX_ASSERT(fields
[5]=="foo");
1728 status
= U_ZERO_ERROR
;
1729 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 4, status
);
1732 REGEX_ASSERT(fields
[0]==" ");
1733 REGEX_ASSERT(fields
[1]=="a");
1734 REGEX_ASSERT(fields
[2]=="Now is ");
1735 REGEX_ASSERT(fields
[3]=="the time<c>");
1736 status
= U_ZERO_ERROR
;
// A capturing separator that matches either a hyphen or a comma: the
// delimiters themselves are expected to appear as fields.
1739 pat1
= RegexPattern::compile("([-,])", pe
, status
);
1741 n
= pat1
->split("1-10,20", fields
, 10, status
);
1744 REGEX_ASSERT(fields
[0]=="1");
1745 REGEX_ASSERT(fields
[1]=="-");
1746 REGEX_ASSERT(fields
[2]=="10");
1747 REGEX_ASSERT(fields
[3]==",");
1748 REGEX_ASSERT(fields
[4]=="20");
1751 // Test split of string with empty trailing fields
1752 pat1
= RegexPattern::compile(",", pe
, status
);
1754 n
= pat1
->split("a,b,c,", fields
, 10, status
);
1757 REGEX_ASSERT(fields
[0]=="a");
1758 REGEX_ASSERT(fields
[1]=="b");
1759 REGEX_ASSERT(fields
[2]=="c");
1760 REGEX_ASSERT(fields
[3]=="");
1762 n
= pat1
->split("a,,,", fields
, 10, status
);
1765 REGEX_ASSERT(fields
[0]=="a");
1766 REGEX_ASSERT(fields
[1]=="");
1767 REGEX_ASSERT(fields
[2]=="");
1768 REGEX_ASSERT(fields
[3]=="");
1771 // Split Separator with zero length match.
1772 pat1
= RegexPattern::compile(":?", pe
, status
);
1774 n
= pat1
->split("abc", fields
, 10, status
);
1777 REGEX_ASSERT(fields
[0]=="");
1778 REGEX_ASSERT(fields
[1]=="a");
1779 REGEX_ASSERT(fields
[2]=="b");
1780 REGEX_ASSERT(fields
[3]=="c");
1781 REGEX_ASSERT(fields
[4]=="");
1786 // RegexPattern::pattern()
// A default-constructed pattern is expected to report an empty pattern
// string; a compiled one reports the text it was compiled from.
// NOTE(review): pat1 is reassigned repeatedly in this function; any delete
// of the previous object is not visible in this view.
1788 pat1
= new RegexPattern();
1789 REGEX_ASSERT(pat1
->pattern() == "");
1792 pat1
= RegexPattern::compile("(Hello, world)*", pe
, status
);
1794 REGEX_ASSERT(pat1
->pattern() == "(Hello, world)*");
1799 // classID functions
// The dynamic class ID must equal the static ID of the object's own class,
// be non-NULL, and differ between RegexPattern and RegexMatcher.
1801 pat1
= RegexPattern::compile("(Hello, world)*", pe
, status
);
1803 REGEX_ASSERT(pat1
->getDynamicClassID() == RegexPattern::getStaticClassID());
1804 REGEX_ASSERT(pat1
->getDynamicClassID() != NULL
);
1805 UnicodeString
Hello("Hello, world.");
1806 RegexMatcher
*m
= pat1
->matcher(Hello
, status
);
1807 REGEX_ASSERT(pat1
->getDynamicClassID() != m
->getDynamicClassID());
1808 REGEX_ASSERT(m
->getDynamicClassID() == RegexMatcher::getStaticClassID());
1809 REGEX_ASSERT(m
->getDynamicClassID() != NULL
);
1815 //---------------------------------------------------------------------------
1817 // API_Match_UTF8 Test that the alternate engine for class RegexMatcher
1818 // is present and working, but excluding functions
1819 // implementing replace operations.
1821 //---------------------------------------------------------------------------
1822 void RegexTest::API_Match_UTF8() {
1824 UErrorCode status
=U_ZERO_ERROR
;
1828 // Debug - slide failing test cases early
1837 // Simple pattern compilation
1840 UText re
= UTEXT_INITIALIZER
;
1841 regextst_openUTF8FromInvariant(&re
, "abc", -1, &status
);
1842 REGEX_VERBOSE_TEXT(&re
);
1844 pat2
= RegexPattern::compile(&re
, flags
, pe
, status
);
1847 UText input1
= UTEXT_INITIALIZER
;
1848 UText input2
= UTEXT_INITIALIZER
;
1849 UText empty
= UTEXT_INITIALIZER
;
1850 regextst_openUTF8FromInvariant(&input1
, "abcdef this is a test", -1, &status
);
1851 REGEX_VERBOSE_TEXT(&input1
);
1852 regextst_openUTF8FromInvariant(&input2
, "not abc", -1, &status
);
1853 REGEX_VERBOSE_TEXT(&input2
);
1854 utext_openUChars(&empty
, NULL
, 0, &status
);
1856 int32_t input1Len
= strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1857 int32_t input2Len
= strlen("not abc");
1861 // Matcher creation and reset.
1863 RegexMatcher
*m1
= &pat2
->matcher(status
)->reset(&input1
);
1865 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
1866 const char str_abcdefthisisatest
[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1867 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest
, m1
->inputText());
1869 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
1870 const char str_notabc
[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1871 REGEX_ASSERT_UTEXT_UTF8(str_notabc
, m1
->inputText());
1873 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest
, m1
->inputText());
1874 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
1876 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
1877 REGEX_ASSERT(utext_nativeLength(&empty
) == 0);
1880 // reset(pos, status)
1883 m1
->reset(4, status
);
1885 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest
, m1
->inputText());
1886 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
1888 m1
->reset(-1, status
);
1889 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1890 status
= U_ZERO_ERROR
;
1892 m1
->reset(0, status
);
1894 status
= U_ZERO_ERROR
;
1896 m1
->reset(input1Len
-1, status
);
1898 status
= U_ZERO_ERROR
;
1900 m1
->reset(input1Len
, status
);
1902 status
= U_ZERO_ERROR
;
1904 m1
->reset(input1Len
+1, status
);
1905 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1906 status
= U_ZERO_ERROR
;
1909 // match(pos, status)
1912 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
1914 REGEX_ASSERT(m1
->matches(3, status
) == FALSE
);
1916 REGEX_ASSERT(m1
->matches(5, status
) == FALSE
);
1917 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
1918 REGEX_ASSERT(m1
->matches(-1, status
) == FALSE
);
1919 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1921 // Match() at end of string should fail, but should not
1923 status
= U_ZERO_ERROR
;
1924 REGEX_ASSERT(m1
->matches(input2Len
, status
) == FALSE
);
1927 // Match beyond end of string should fail with an error.
1928 status
= U_ZERO_ERROR
;
1929 REGEX_ASSERT(m1
->matches(input2Len
+1, status
) == FALSE
);
1930 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1932 // Successful match at end of string.
1934 status
= U_ZERO_ERROR
;
1935 RegexMatcher
m("A?", 0, status
); // will match zero length string.
1938 REGEX_ASSERT(m
.matches(input1Len
, status
) == TRUE
);
1941 REGEX_ASSERT(m
.matches(0, status
) == TRUE
);
1947 // lookingAt(pos, status)
1949 status
= U_ZERO_ERROR
;
1950 m1
->reset(&input2
); // "not abc"
1951 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
1952 REGEX_ASSERT(m1
->lookingAt(5, status
) == FALSE
);
1953 REGEX_ASSERT(m1
->lookingAt(3, status
) == FALSE
);
1954 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
1955 REGEX_ASSERT(m1
->lookingAt(-1, status
) == FALSE
);
1956 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1957 status
= U_ZERO_ERROR
;
1958 REGEX_ASSERT(m1
->lookingAt(input2Len
, status
) == FALSE
);
1960 REGEX_ASSERT(m1
->lookingAt(input2Len
+1, status
) == FALSE
);
1961 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1967 utext_close(&input1
);
1968 utext_close(&input2
);
1969 utext_close(&empty
);
1975 // RegexMatcher::start();
1976 // RegexMatcher::end();
1977 // RegexMatcher::groupCount();
1982 UErrorCode status
=U_ZERO_ERROR
;
1983 UText re
=UTEXT_INITIALIZER
;
1984 const char str_01234567_pat
[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1985 utext_openUTF8(&re
, str_01234567_pat
, -1, &status
);
1987 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
1990 UText input
= UTEXT_INITIALIZER
;
1991 const char str_0123456789
[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1992 utext_openUTF8(&input
, str_0123456789
, -1, &status
);
1994 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&input
);
1996 REGEX_ASSERT(matcher
->lookingAt(status
) == TRUE
);
1997 static const int32_t matchStarts
[] = {0, 2, 4, 8};
1998 static const int32_t matchEnds
[] = {10, 8, 6, 10};
2000 for (i
=0; i
<4; i
++) {
2001 int32_t actualStart
= matcher
->start(i
, status
);
2003 if (actualStart
!= matchStarts
[i
]) {
2004 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n",
2005 __FILE__
, __LINE__
, i
, matchStarts
[i
], actualStart
);
2007 int32_t actualEnd
= matcher
->end(i
, status
);
2009 if (actualEnd
!= matchEnds
[i
]) {
2010 errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n",
2011 __FILE__
, __LINE__
, i
, matchEnds
[i
], actualEnd
);
2015 REGEX_ASSERT(matcher
->start(0, status
) == matcher
->start(status
));
2016 REGEX_ASSERT(matcher
->end(0, status
) == matcher
->end(status
));
2018 REGEX_ASSERT_FAIL(matcher
->start(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2019 REGEX_ASSERT_FAIL(matcher
->start( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2021 REGEX_ASSERT_FAIL(matcher
->start( 0, status
), U_REGEX_INVALID_STATE
);
2023 matcher
->lookingAt(status
);
2026 UText destText
= UTEXT_INITIALIZER
;
2027 utext_openUnicodeString(&destText
, &dest
, &status
);
2029 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2030 // Test shallow-clone API
2032 result
= matcher
->group((UText
*)NULL
, group_len
, status
);
2034 REGEX_ASSERT_UTEXT_UTF8(str_0123456789
, result
);
2035 utext_close(result
);
2036 result
= matcher
->group(0, &destText
, group_len
, status
);
2038 REGEX_ASSERT(result
== &destText
);
2039 REGEX_ASSERT_UTEXT_UTF8(str_0123456789
, result
);
2040 // destText is now immutable, reopen it
2041 utext_close(&destText
);
2042 utext_openUnicodeString(&destText
, &dest
, &status
);
2044 result
= matcher
->group(0, NULL
, status
);
2046 REGEX_ASSERT_UTEXT_UTF8(str_0123456789
, result
);
2047 utext_close(result
);
2048 result
= matcher
->group(0, &destText
, status
);
2050 REGEX_ASSERT(result
== &destText
);
2051 REGEX_ASSERT_UTEXT_UTF8(str_0123456789
, result
);
2053 result
= matcher
->group(1, NULL
, status
);
2055 const char str_234567
[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */
2056 REGEX_ASSERT_UTEXT_UTF8(str_234567
, result
);
2057 utext_close(result
);
2058 result
= matcher
->group(1, &destText
, status
);
2060 REGEX_ASSERT(result
== &destText
);
2061 REGEX_ASSERT_UTEXT_UTF8(str_234567
, result
);
2063 result
= matcher
->group(2, NULL
, status
);
2065 const char str_45
[] = { 0x34, 0x35, 0x00 }; /* 45 */
2066 REGEX_ASSERT_UTEXT_UTF8(str_45
, result
);
2067 utext_close(result
);
2068 result
= matcher
->group(2, &destText
, status
);
2070 REGEX_ASSERT(result
== &destText
);
2071 REGEX_ASSERT_UTEXT_UTF8(str_45
, result
);
2073 result
= matcher
->group(3, NULL
, status
);
2075 const char str_89
[] = { 0x38, 0x39, 0x00 }; /* 89 */
2076 REGEX_ASSERT_UTEXT_UTF8(str_89
, result
);
2077 utext_close(result
);
2078 result
= matcher
->group(3, &destText
, status
);
2080 REGEX_ASSERT(result
== &destText
);
2081 REGEX_ASSERT_UTEXT_UTF8(str_89
, result
);
2083 REGEX_ASSERT_FAIL(matcher
->group(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2084 REGEX_ASSERT_FAIL(matcher
->group( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2086 REGEX_ASSERT_FAIL(matcher
->group( 0, status
), U_REGEX_INVALID_STATE
);
2091 utext_close(&destText
);
2092 utext_close(&input
);
2102 UErrorCode status
=U_ZERO_ERROR
;
2103 UText re
=UTEXT_INITIALIZER
;
2104 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2105 utext_openUTF8(&re
, str_abc
, -1, &status
);
2107 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
2109 UText input
= UTEXT_INITIALIZER
;
2110 const char str_abcabcabc
[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2111 utext_openUTF8(&input
, str_abcabcabc
, -1, &status
);
2112 // 012345678901234567
2114 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&input
);
2116 REGEX_ASSERT(matcher
->find());
2117 REGEX_ASSERT(matcher
->start(status
) == 1);
2118 REGEX_ASSERT(matcher
->find());
2119 REGEX_ASSERT(matcher
->start(status
) == 6);
2120 REGEX_ASSERT(matcher
->find());
2121 REGEX_ASSERT(matcher
->start(status
) == 12);
2122 REGEX_ASSERT(matcher
->find() == FALSE
);
2123 REGEX_ASSERT(matcher
->find() == FALSE
);
2126 REGEX_ASSERT(matcher
->find());
2127 REGEX_ASSERT(matcher
->start(status
) == 1);
2129 REGEX_ASSERT(matcher
->find(0, status
));
2130 REGEX_ASSERT(matcher
->start(status
) == 1);
2131 REGEX_ASSERT(matcher
->find(1, status
));
2132 REGEX_ASSERT(matcher
->start(status
) == 1);
2133 REGEX_ASSERT(matcher
->find(2, status
));
2134 REGEX_ASSERT(matcher
->start(status
) == 6);
2135 REGEX_ASSERT(matcher
->find(12, status
));
2136 REGEX_ASSERT(matcher
->start(status
) == 12);
2137 REGEX_ASSERT(matcher
->find(13, status
) == FALSE
);
2138 REGEX_ASSERT(matcher
->find(16, status
) == FALSE
);
2139 REGEX_ASSERT(matcher
->find(17, status
) == FALSE
);
2140 REGEX_ASSERT_FAIL(matcher
->start(status
), U_REGEX_INVALID_STATE
);
2142 status
= U_ZERO_ERROR
;
2143 REGEX_ASSERT_FAIL(matcher
->find(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2144 status
= U_ZERO_ERROR
;
2145 REGEX_ASSERT_FAIL(matcher
->find(18, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2147 REGEX_ASSERT(matcher
->groupCount() == 0);
2152 utext_close(&input
);
2158 // find, with \G in pattern (true if at the end of a previous match).
2163 UErrorCode status
=U_ZERO_ERROR
;
2164 UText re
=UTEXT_INITIALIZER
;
2165 const char str_Gabcabc
[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2166 utext_openUTF8(&re
, str_Gabcabc
, -1, &status
);
2168 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
2171 UText input
= UTEXT_INITIALIZER
;
2172 const char str_abcabcabc
[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2173 utext_openUTF8(&input
, str_abcabcabc
, -1, &status
);
2174 // 012345678901234567
2176 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&input
);
2178 REGEX_ASSERT(matcher
->find());
2179 REGEX_ASSERT(matcher
->start(status
) == 0);
2180 REGEX_ASSERT(matcher
->start(1, status
) == -1);
2181 REGEX_ASSERT(matcher
->start(2, status
) == 1);
2183 REGEX_ASSERT(matcher
->find());
2184 REGEX_ASSERT(matcher
->start(status
) == 4);
2185 REGEX_ASSERT(matcher
->start(1, status
) == 4);
2186 REGEX_ASSERT(matcher
->start(2, status
) == -1);
2192 utext_close(&input
);
2197 // find with zero length matches, match position should bump ahead
2198 // to prevent loops.
2202 UErrorCode status
=U_ZERO_ERROR
;
2203 RegexMatcher
m("(?= ?)", 0, status
); // This pattern will produce zero-length matches anywhere,
2204 // using an always-true look-ahead.
2206 UText s
= UTEXT_INITIALIZER
;
2207 utext_openUTF8(&s
, " ", -1, &status
);
2210 if (m
.find() == FALSE
) {
2213 REGEX_ASSERT(m
.start(status
) == i
);
2214 REGEX_ASSERT(m
.end(status
) == i
);
2218 // Check that the bump goes over characters outside the BMP OK
2219 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2220 unsigned char aboveBMP
[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2221 utext_openUTF8(&s
, (char *)aboveBMP
, -1, &status
);
2224 if (m
.find() == FALSE
) {
2227 REGEX_ASSERT(m
.start(status
) == i
);
2228 REGEX_ASSERT(m
.end(status
) == i
);
2230 REGEX_ASSERT(i
==20);
2235 // find() loop breaking test.
2236 // with pattern of /.?/, should see a series of one char matches, then a single
2237 // match of zero length at the end of the input string.
2239 UErrorCode status
=U_ZERO_ERROR
;
2240 RegexMatcher
m(".?", 0, status
);
2242 UText s
= UTEXT_INITIALIZER
;
2243 utext_openUTF8(&s
, " ", -1, &status
);
2246 if (m
.find() == FALSE
) {
2249 REGEX_ASSERT(m
.start(status
) == i
);
2250 REGEX_ASSERT(m
.end(status
) == (i
<4 ? i
+1 : i
));
2259 // Matchers with no input string behave as if they had an empty input string.
2263 UErrorCode status
= U_ZERO_ERROR
;
2264 RegexMatcher
m(".?", 0, status
);
2266 REGEX_ASSERT(m
.find());
2267 REGEX_ASSERT(m
.start(status
) == 0);
2268 REGEX_ASSERT(m
.input() == "");
2271 UErrorCode status
= U_ZERO_ERROR
;
2272 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
2273 RegexMatcher
*m
= p
->matcher(status
);
2276 REGEX_ASSERT(m
->find() == FALSE
);
2277 REGEX_ASSERT(utext_nativeLength(m
->inputText()) == 0);
2286 UErrorCode status
= U_ZERO_ERROR
;
2287 UText testPattern
= UTEXT_INITIALIZER
;
2288 UText testText
= UTEXT_INITIALIZER
;
2289 regextst_openUTF8FromInvariant(&testPattern
, ".*", -1, &status
);
2290 REGEX_VERBOSE_TEXT(&testPattern
);
2291 regextst_openUTF8FromInvariant(&testText
, "This is test data", -1, &status
);
2292 REGEX_VERBOSE_TEXT(&testText
);
2294 RegexMatcher
m(&testPattern
, &testText
, 0, status
);
2296 REGEX_ASSERT(m
.regionStart() == 0);
2297 REGEX_ASSERT(m
.regionEnd() == (int32_t)strlen("This is test data"));
2298 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2299 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2301 m
.region(2,4, status
);
2303 REGEX_ASSERT(m
.matches(status
));
2304 REGEX_ASSERT(m
.start(status
)==2);
2305 REGEX_ASSERT(m
.end(status
)==4);
2309 REGEX_ASSERT(m
.regionStart() == 0);
2310 REGEX_ASSERT(m
.regionEnd() == (int32_t)strlen("This is test data"));
2312 regextst_openUTF8FromInvariant(&testText
, "short", -1, &status
);
2313 REGEX_VERBOSE_TEXT(&testText
);
2315 REGEX_ASSERT(m
.regionStart() == 0);
2316 REGEX_ASSERT(m
.regionEnd() == (int32_t)strlen("short"));
2318 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2319 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(FALSE
));
2320 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
2321 REGEX_ASSERT(&m
== &m
.reset());
2322 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
2324 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(TRUE
));
2325 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2326 REGEX_ASSERT(&m
== &m
.reset());
2327 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2329 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2330 REGEX_ASSERT(&m
== &m
.useTransparentBounds(TRUE
));
2331 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
2332 REGEX_ASSERT(&m
== &m
.reset());
2333 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
2335 REGEX_ASSERT(&m
== &m
.useTransparentBounds(FALSE
));
2336 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2337 REGEX_ASSERT(&m
== &m
.reset());
2338 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2340 utext_close(&testText
);
2341 utext_close(&testPattern
);
2345 // hitEnd() and requireEnd()
2348 UErrorCode status
= U_ZERO_ERROR
;
2349 UText testPattern
= UTEXT_INITIALIZER
;
2350 UText testText
= UTEXT_INITIALIZER
;
2351 const char str_
[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2352 const char str_aabb
[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2353 utext_openUTF8(&testPattern
, str_
, -1, &status
);
2354 utext_openUTF8(&testText
, str_aabb
, -1, &status
);
2356 RegexMatcher
m1(&testPattern
, &testText
, 0, status
);
2357 REGEX_ASSERT(m1
.lookingAt(status
) == TRUE
);
2358 REGEX_ASSERT(m1
.hitEnd() == TRUE
);
2359 REGEX_ASSERT(m1
.requireEnd() == FALSE
);
2362 status
= U_ZERO_ERROR
;
2363 const char str_a
[] = { 0x61, 0x2a, 0x00 }; /* a* */
2364 utext_openUTF8(&testPattern
, str_a
, -1, &status
);
2365 RegexMatcher
m2(&testPattern
, &testText
, 0, status
);
2366 REGEX_ASSERT(m2
.lookingAt(status
) == TRUE
);
2367 REGEX_ASSERT(m2
.hitEnd() == FALSE
);
2368 REGEX_ASSERT(m2
.requireEnd() == FALSE
);
2371 status
= U_ZERO_ERROR
;
2372 const char str_dotstardollar
[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2373 utext_openUTF8(&testPattern
, str_dotstardollar
, -1, &status
);
2374 RegexMatcher
m3(&testPattern
, &testText
, 0, status
);
2375 REGEX_ASSERT(m3
.lookingAt(status
) == TRUE
);
2376 REGEX_ASSERT(m3
.hitEnd() == TRUE
);
2377 REGEX_ASSERT(m3
.requireEnd() == TRUE
);
2380 utext_close(&testText
);
2381 utext_close(&testPattern
);
2386 //---------------------------------------------------------------------------
2388 // API_Replace_UTF8 API test for class RegexMatcher, testing the
2389 // Replace family of functions.
2391 //---------------------------------------------------------------------------
2392 void RegexTest::API_Replace_UTF8() {
2398 UErrorCode status
=U_ZERO_ERROR
;
2400 UText re
=UTEXT_INITIALIZER
;
2401 regextst_openUTF8FromInvariant(&re
, "abc", -1, &status
);
2402 REGEX_VERBOSE_TEXT(&re
);
2403 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
2406 char data
[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2407 // 012345678901234567
2408 UText dataText
= UTEXT_INITIALIZER
;
2409 utext_openUTF8(&dataText
, data
, -1, &status
);
2411 REGEX_VERBOSE_TEXT(&dataText
);
2412 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&dataText
);
2415 // Plain vanilla matches.
2418 UText destText
= UTEXT_INITIALIZER
;
2419 utext_openUnicodeString(&destText
, &dest
, &status
);
2422 UText replText
= UTEXT_INITIALIZER
;
2424 const char str_yz
[] = { 0x79, 0x7a, 0x00 }; /* yz */
2425 utext_openUTF8(&replText
, str_yz
, -1, &status
);
2426 REGEX_VERBOSE_TEXT(&replText
);
2427 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2429 const char str_yzabcabc
[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2430 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc
, result
);
2431 utext_close(result
);
2432 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2434 REGEX_ASSERT(result
== &destText
);
2435 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc
, result
);
2437 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2439 const char str_yzyzyz
[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2440 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz
, result
);
2441 utext_close(result
);
2443 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2444 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2446 REGEX_ASSERT(result
== &destText
);
2447 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz
, result
);
2450 // Plain vanilla non-matches.
2452 const char str_abxabxabx
[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2453 utext_openUTF8(&dataText
, str_abxabxabx
, -1, &status
);
2454 matcher
->reset(&dataText
);
2456 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2458 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2459 utext_close(result
);
2460 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2462 REGEX_ASSERT(result
== &destText
);
2463 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2465 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2467 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2468 utext_close(result
);
2469 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2470 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2472 REGEX_ASSERT(result
== &destText
);
2473 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2476 // Empty source string
2478 utext_openUTF8(&dataText
, NULL
, 0, &status
);
2479 matcher
->reset(&dataText
);
2481 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2483 REGEX_ASSERT_UTEXT_UTF8("", result
);
2484 utext_close(result
);
2485 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2487 REGEX_ASSERT(result
== &destText
);
2488 REGEX_ASSERT_UTEXT_UTF8("", result
);
2490 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2492 REGEX_ASSERT_UTEXT_UTF8("", result
);
2493 utext_close(result
);
2494 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2496 REGEX_ASSERT(result
== &destText
);
2497 REGEX_ASSERT_UTEXT_UTF8("", result
);
2500 // Empty substitution string
2502 utext_openUTF8(&dataText
, data
, -1, &status
); // ".abc..abc...abc.."
2503 matcher
->reset(&dataText
);
2505 utext_openUTF8(&replText
, NULL
, 0, &status
);
2506 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2508 const char str_abcabc
[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2509 REGEX_ASSERT_UTEXT_UTF8(str_abcabc
, result
);
2510 utext_close(result
);
2511 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2513 REGEX_ASSERT(result
== &destText
);
2514 REGEX_ASSERT_UTEXT_UTF8(str_abcabc
, result
);
2516 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2518 const char str_dots
[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2519 REGEX_ASSERT_UTEXT_UTF8(str_dots
, result
);
2520 utext_close(result
);
2521 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2522 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2524 REGEX_ASSERT(result
== &destText
);
2525 REGEX_ASSERT_UTEXT_UTF8(str_dots
, result
);
2528 // match whole string
2530 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2531 utext_openUTF8(&dataText
, str_abc
, -1, &status
);
2532 matcher
->reset(&dataText
);
2534 const char str_xyz
[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2535 utext_openUTF8(&replText
, str_xyz
, -1, &status
);
2536 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2538 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2539 utext_close(result
);
2540 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2541 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2543 REGEX_ASSERT(result
== &destText
);
2544 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2546 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2548 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2549 utext_close(result
);
2550 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2551 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2553 REGEX_ASSERT(result
== &destText
);
2554 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2557 // Capture Group, simple case
2559 const char str_add
[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2560 utext_openUTF8(&re
, str_add
, -1, &status
);
2561 RegexPattern
*pat2
= RegexPattern::compile(&re
, flags
, pe
, status
);
2564 const char str_abcdefg
[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2565 utext_openUTF8(&dataText
, str_abcdefg
, -1, &status
);
2566 RegexMatcher
*matcher2
= &pat2
->matcher(status
)->reset(&dataText
);
2569 const char str_11
[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2570 utext_openUTF8(&replText
, str_11
, -1, &status
);
2571 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2573 const char str_bcbcdefg
[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2574 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg
, result
);
2575 utext_close(result
);
2576 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2577 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2579 REGEX_ASSERT(result
== &destText
);
2580 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg
, result
);
2582 const char str_v
[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2583 utext_openUTF8(&replText
, str_v
, -1, &status
);
2584 REGEX_VERBOSE_TEXT(&replText
);
2585 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2587 const char str_Thevalueof1isbcdefg
[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2588 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg
, result
);
2589 utext_close(result
);
2590 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2591 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2593 REGEX_ASSERT(result
== &destText
);
2594 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg
, result
);
2596 const char str_byitselfnogroupnumber
[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x00 }; /* $ by itself, no group number $$$ */
2597 utext_openUTF8(&replText
, str_byitselfnogroupnumber
, -1, &status
);
2598 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2600 const char str_byitselfnogroupnumberdefg
[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2601 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg
, result
);
2602 utext_close(result
);
2603 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2604 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2606 REGEX_ASSERT(result
== &destText
);
2607 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg
, result
);
2609 unsigned char supplDigitChars
[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2610 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2611 // 012345678901234567890123456
2612 supplDigitChars
[22] = 0xF0;
2613 supplDigitChars
[23] = 0x9D;
2614 supplDigitChars
[24] = 0x9F;
2615 supplDigitChars
[25] = 0x8F;
2616 utext_openUTF8(&replText
, (char *)supplDigitChars
, -1, &status
);
2618 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2620 const char str_SupplementalDigit1bcdefg
[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2621 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg
, result
);
2622 utext_close(result
);
2623 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2624 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2626 REGEX_ASSERT(result
== &destText
);
2627 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg
, result
);
2628 const char str_badcapturegroupnumber5
[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5..." */
2629 utext_openUTF8(&replText
, str_badcapturegroupnumber5
, -1, &status
);
2630 REGEX_ASSERT_FAIL((result
= matcher2
->replaceFirst(&replText
, NULL
, status
)), U_INDEX_OUTOFBOUNDS_ERROR
);
2631 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2632 utext_close(result
);
2633 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2634 REGEX_ASSERT_FAIL((result
= matcher2
->replaceFirst(&replText
, &destText
, status
)), U_INDEX_OUTOFBOUNDS_ERROR
);
2635 REGEX_ASSERT(result
== &destText
);
2636 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2639 // Replacement String with \u hex escapes
2642 const char str_abc1abc2abc3
[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2643 const char str_u0043
[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2644 utext_openUTF8(&dataText
, str_abc1abc2abc3
, -1, &status
);
2645 utext_openUTF8(&replText
, str_u0043
, -1, &status
);
2646 matcher
->reset(&dataText
);
2648 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2650 const char str_C1C2C3
[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2651 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3
, result
);
2652 utext_close(result
);
2653 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2654 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2656 REGEX_ASSERT(result
== &destText
);
2657 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3
, result
);
2660 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2661 utext_openUTF8(&dataText
, str_abc
, -1, &status
);
2662 const char str_U00010000
[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2663 utext_openUTF8(&replText
, str_U00010000
, -1, &status
);
2664 matcher
->reset(&dataText
);
2666 unsigned char expected
[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2673 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2675 REGEX_ASSERT_UTEXT_UTF8((char *)expected
, result
);
2676 utext_close(result
);
2677 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2678 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2680 REGEX_ASSERT(result
== &destText
);
2681 REGEX_ASSERT_UTEXT_UTF8((char *)expected
, result
);
2683 // TODO: need more thorough testing of capture substitutions.
2688 status
= U_ZERO_ERROR
;
2689 const char str_ssee
[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2690 const char str_blah
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2691 const char str_ooh
[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2692 utext_openUTF8(&re
, str_ssee
, -1, &status
);
2693 utext_openUTF8(&dataText
, str_blah
, -1, &status
);
2694 utext_openUTF8(&replText
, str_ooh
, -1, &status
);
2696 RegexMatcher
m(&re
, 0, status
);
2699 UnicodeString result
;
2700 UText resultText
= UTEXT_INITIALIZER
;
2701 utext_openUnicodeString(&resultText
, &result
, &status
);
2703 // Multiple finds do NOT bump up the previous appendReplacement position.
2707 m
.appendReplacement(&resultText
, &replText
, status
);
2709 const char str_blah2
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2710 REGEX_ASSERT_UTEXT_UTF8(str_blah2
, &resultText
);
2712 // After a reset into the interior of a string, appendReplacement still starts at beginning.
2713 status
= U_ZERO_ERROR
;
2715 utext_openUnicodeString(&resultText
, &result
, &status
);
2716 m
.reset(10, status
);
2719 m
.appendReplacement(&resultText
, &replText
, status
);
2721 const char str_blah3
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2722 REGEX_ASSERT_UTEXT_UTF8(str_blah3
, &resultText
);
2724 // find() at interior of string, appendReplacement still starts at beginning.
2725 status
= U_ZERO_ERROR
;
2727 utext_openUnicodeString(&resultText
, &result
, &status
);
2731 m
.appendReplacement(&resultText
, &replText
, status
);
2733 const char str_blah8
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2734 REGEX_ASSERT_UTEXT_UTF8(str_blah8
, &resultText
);
2736 m
.appendTail(&resultText
, status
);
2737 const char str_blah9
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2738 REGEX_ASSERT_UTEXT_UTF8(str_blah9
, &resultText
);
2740 utext_close(&resultText
);
2748 utext_close(&dataText
);
2749 utext_close(&replText
);
2750 utext_close(&destText
);
2755 //---------------------------------------------------------------------------
2757 // API_Pattern_UTF8 Test that the API for class RegexPattern is
2758 // present and nominally working.
2760 //---------------------------------------------------------------------------
2761 void RegexTest::API_Pattern_UTF8() {
2762 RegexPattern pata
; // Test default constructor to not crash.
2765 REGEX_ASSERT(pata
== patb
);
2766 REGEX_ASSERT(pata
== pata
);
2768 UText re1
= UTEXT_INITIALIZER
;
2769 UText re2
= UTEXT_INITIALIZER
;
2770 UErrorCode status
= U_ZERO_ERROR
;
2773 const char str_abcalmz
[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2774 const char str_def
[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2775 utext_openUTF8(&re1
, str_abcalmz
, -1, &status
);
2776 utext_openUTF8(&re2
, str_def
, -1, &status
);
2778 RegexPattern
*pat1
= RegexPattern::compile(&re1
, 0, pe
, status
);
2779 RegexPattern
*pat2
= RegexPattern::compile(&re2
, 0, pe
, status
);
2781 REGEX_ASSERT(*pat1
== *pat1
);
2782 REGEX_ASSERT(*pat1
!= pata
);
2786 REGEX_ASSERT(patb
== *pat1
);
2789 RegexPattern
patc(*pat1
);
2790 REGEX_ASSERT(patc
== *pat1
);
2791 REGEX_ASSERT(patb
== patc
);
2792 REGEX_ASSERT(pat1
!= pat2
);
2794 REGEX_ASSERT(patb
!= patc
);
2795 REGEX_ASSERT(patb
== *pat2
);
2797 // Compile with no flags.
2798 RegexPattern
*pat1a
= RegexPattern::compile(&re1
, pe
, status
);
2799 REGEX_ASSERT(*pat1a
== *pat1
);
2801 REGEX_ASSERT(pat1a
->flags() == 0);
2803 // Compile with different flags should be not equal
2804 RegexPattern
*pat1b
= RegexPattern::compile(&re1
, UREGEX_CASE_INSENSITIVE
, pe
, status
);
2807 REGEX_ASSERT(*pat1b
!= *pat1a
);
2808 REGEX_ASSERT(pat1b
->flags() == UREGEX_CASE_INSENSITIVE
);
2809 REGEX_ASSERT(pat1a
->flags() == 0);
2813 RegexPattern
*pat1c
= pat1
->clone();
2814 REGEX_ASSERT(*pat1c
== *pat1
);
2815 REGEX_ASSERT(*pat1c
!= *pat2
);
2827 // Verify that a matcher created from a cloned pattern works.
2831 UErrorCode status
= U_ZERO_ERROR
;
2832 UText pattern
= UTEXT_INITIALIZER
;
2833 const char str_pL
[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2834 utext_openUTF8(&pattern
, str_pL
, -1, &status
);
2836 RegexPattern
*pSource
= RegexPattern::compile(&pattern
, 0, status
);
2837 RegexPattern
*pClone
= pSource
->clone();
2839 RegexMatcher
*mFromClone
= pClone
->matcher(status
);
2842 UText input
= UTEXT_INITIALIZER
;
2843 const char str_HelloWorld
[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2844 utext_openUTF8(&input
, str_HelloWorld
, -1, &status
);
2845 mFromClone
->reset(&input
);
2846 REGEX_ASSERT(mFromClone
->find() == TRUE
);
2847 REGEX_ASSERT(mFromClone
->group(status
) == "Hello");
2848 REGEX_ASSERT(mFromClone
->find() == TRUE
);
2849 REGEX_ASSERT(mFromClone
->group(status
) == "World");
2850 REGEX_ASSERT(mFromClone
->find() == FALSE
);
2854 utext_close(&input
);
2855 utext_close(&pattern
);
2859 // matches convenience API
2862 UErrorCode status
= U_ZERO_ERROR
;
2863 UText pattern
= UTEXT_INITIALIZER
;
2864 UText input
= UTEXT_INITIALIZER
;
2866 const char str_randominput
[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2867 utext_openUTF8(&input
, str_randominput
, -1, &status
);
2869 const char str_dotstar
[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2870 utext_openUTF8(&pattern
, str_dotstar
, -1, &status
);
2871 REGEX_ASSERT(RegexPattern::matches(&pattern
, &input
, pe
, status
) == TRUE
);
2874 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2875 utext_openUTF8(&pattern
, str_abc
, -1, &status
);
2876 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe
, status
) == FALSE
);
2879 const char str_nput
[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2880 utext_openUTF8(&pattern
, str_nput
, -1, &status
);
2881 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe
, status
) == TRUE
);
2884 utext_openUTF8(&pattern
, str_randominput
, -1, &status
);
2885 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe
, status
) == TRUE
);
2888 const char str_u
[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2889 utext_openUTF8(&pattern
, str_u
, -1, &status
);
2890 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe
, status
) == FALSE
);
2893 utext_openUTF8(&input
, str_abc
, -1, &status
);
2894 utext_openUTF8(&pattern
, str_abc
, -1, &status
);
2895 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
2896 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe
, status
) == FALSE
);
2897 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
2899 utext_close(&input
);
2900 utext_close(&pattern
);
2907 status
= U_ZERO_ERROR
;
2908 const char str_spaceplus
[] = { 0x20, 0x2b, 0x00 }; /* + */
2909 utext_openUTF8(&re1
, str_spaceplus
, -1, &status
);
2910 pat1
= RegexPattern::compile(&re1
, pe
, status
);
2912 UnicodeString fields
[10];
2915 n
= pat1
->split("Now is the time", fields
, 10, status
);
2918 REGEX_ASSERT(fields
[0]=="Now");
2919 REGEX_ASSERT(fields
[1]=="is");
2920 REGEX_ASSERT(fields
[2]=="the");
2921 REGEX_ASSERT(fields
[3]=="time");
2922 REGEX_ASSERT(fields
[4]=="");
2924 n
= pat1
->split("Now is the time", fields
, 2, status
);
2927 REGEX_ASSERT(fields
[0]=="Now");
2928 REGEX_ASSERT(fields
[1]=="is the time");
2929 REGEX_ASSERT(fields
[2]=="the"); // left over from previous test
2932 status
= U_ZERO_ERROR
;
2933 n
= pat1
->split("Now is the time", fields
, 1, status
);
2936 REGEX_ASSERT(fields
[0]=="Now is the time");
2937 REGEX_ASSERT(fields
[1]=="*");
2938 status
= U_ZERO_ERROR
;
2940 n
= pat1
->split(" Now is the time ", fields
, 10, status
);
2943 REGEX_ASSERT(fields
[0]=="");
2944 REGEX_ASSERT(fields
[1]=="Now");
2945 REGEX_ASSERT(fields
[2]=="is");
2946 REGEX_ASSERT(fields
[3]=="the");
2947 REGEX_ASSERT(fields
[4]=="time");
2948 REGEX_ASSERT(fields
[5]=="");
2949 REGEX_ASSERT(fields
[6]=="");
2952 n
= pat1
->split(" ", fields
, 10, status
);
2955 REGEX_ASSERT(fields
[0]=="");
2956 REGEX_ASSERT(fields
[1]=="");
2957 REGEX_ASSERT(fields
[2]=="*");
2960 n
= pat1
->split("", fields
, 10, status
);
2963 REGEX_ASSERT(fields
[0]=="foo");
2967 // split, with a pattern with (capture)
2968 regextst_openUTF8FromInvariant(&re1
, "<(\\w*)>", -1, &status
);
2969 pat1
= RegexPattern::compile(&re1
, pe
, status
);
2972 status
= U_ZERO_ERROR
;
2973 fields
[6] = fields
[7] = "*";
2974 n
= pat1
->split("<a>Now is <b>the time<c>", fields
, 10, status
);
2977 REGEX_ASSERT(fields
[0]=="");
2978 REGEX_ASSERT(fields
[1]=="a");
2979 REGEX_ASSERT(fields
[2]=="Now is ");
2980 REGEX_ASSERT(fields
[3]=="b");
2981 REGEX_ASSERT(fields
[4]=="the time");
2982 REGEX_ASSERT(fields
[5]=="c");
2983 REGEX_ASSERT(fields
[6]=="");
2984 REGEX_ASSERT(fields
[7]=="*");
2985 REGEX_ASSERT(status
==U_ZERO_ERROR
);
2987 fields
[6] = fields
[7] = "*";
2988 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 10, status
);
2991 REGEX_ASSERT(fields
[0]==" ");
2992 REGEX_ASSERT(fields
[1]=="a");
2993 REGEX_ASSERT(fields
[2]=="Now is ");
2994 REGEX_ASSERT(fields
[3]=="b");
2995 REGEX_ASSERT(fields
[4]=="the time");
2996 REGEX_ASSERT(fields
[5]=="c");
2997 REGEX_ASSERT(fields
[6]=="");
2998 REGEX_ASSERT(fields
[7]=="*");
3000 status
= U_ZERO_ERROR
;
3002 n
= pat1
->split(" <a>Now is <b>the time<c> ", fields
, 6, status
);
3005 REGEX_ASSERT(fields
[0]==" ");
3006 REGEX_ASSERT(fields
[1]=="a");
3007 REGEX_ASSERT(fields
[2]=="Now is ");
3008 REGEX_ASSERT(fields
[3]=="b");
3009 REGEX_ASSERT(fields
[4]=="the time");
3010 REGEX_ASSERT(fields
[5]==" ");
3011 REGEX_ASSERT(fields
[6]=="foo");
3013 status
= U_ZERO_ERROR
;
3015 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 5, status
);
3018 REGEX_ASSERT(fields
[0]==" ");
3019 REGEX_ASSERT(fields
[1]=="a");
3020 REGEX_ASSERT(fields
[2]=="Now is ");
3021 REGEX_ASSERT(fields
[3]=="b");
3022 REGEX_ASSERT(fields
[4]=="the time<c>");
3023 REGEX_ASSERT(fields
[5]=="foo");
3025 status
= U_ZERO_ERROR
;
3027 n
= pat1
->split(" <a>Now is <b>the time", fields
, 5, status
);
3030 REGEX_ASSERT(fields
[0]==" ");
3031 REGEX_ASSERT(fields
[1]=="a");
3032 REGEX_ASSERT(fields
[2]=="Now is ");
3033 REGEX_ASSERT(fields
[3]=="b");
3034 REGEX_ASSERT(fields
[4]=="the time");
3035 REGEX_ASSERT(fields
[5]=="foo");
3037 status
= U_ZERO_ERROR
;
3038 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 4, status
);
3041 REGEX_ASSERT(fields
[0]==" ");
3042 REGEX_ASSERT(fields
[1]=="a");
3043 REGEX_ASSERT(fields
[2]=="Now is ");
3044 REGEX_ASSERT(fields
[3]=="the time<c>");
3045 status
= U_ZERO_ERROR
;
3048 regextst_openUTF8FromInvariant(&re1
, "([-,])", -1, &status
);
3049 pat1
= RegexPattern::compile(&re1
, pe
, status
);
3051 n
= pat1
->split("1-10,20", fields
, 10, status
);
3054 REGEX_ASSERT(fields
[0]=="1");
3055 REGEX_ASSERT(fields
[1]=="-");
3056 REGEX_ASSERT(fields
[2]=="10");
3057 REGEX_ASSERT(fields
[3]==",");
3058 REGEX_ASSERT(fields
[4]=="20");
3063 // RegexPattern::pattern() and patternText()
3065 pat1
= new RegexPattern();
3066 REGEX_ASSERT(pat1
->pattern() == "");
3067 REGEX_ASSERT_UTEXT_UTF8("", pat1
->patternText(status
));
3069 const char *helloWorldInvariant
= "(Hello, world)*";
3070 regextst_openUTF8FromInvariant(&re1
, helloWorldInvariant
, -1, &status
);
3071 pat1
= RegexPattern::compile(&re1
, pe
, status
);
3073 REGEX_ASSERT_UNISTR(pat1
->pattern(),"(Hello, world)*");
3074 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1
->patternText(status
));
3081 //---------------------------------------------------------------------------
3083 // Extended A more thorough check for features of regex patterns
3084 // The test cases are in a separate data file,
3085 // source/tests/testdata/regextst.txt
3086 // A description of the test data format is included in that file.
3088 //---------------------------------------------------------------------------
// getPath: composes the location of a test data file inside the caller's buffer.
//   buffer   - destination array (declared as char[2048]); it receives the
//              source test data directory followed by filename.
//   filename - name appended verbatim after the directory string.
// If IntlTest::getSourceTestData() reports a failure, the error is logged via
// errln(); what is returned in that case is not visible in this excerpt.
// NOTE(review): strcpy/strcat below do not check the combined length against
// the 2048-byte buffer - confirm callers only pass short file names.
3091 RegexTest::getPath(char buffer
[2048], const char *filename
) {
3092 UErrorCode status
=U_ZERO_ERROR
;
// Ask the test framework for the directory that holds the source test data.
3093 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
3094 if (U_FAILURE(status
)) {
3095 errln("ERROR: loadTestData() failed - %s", u_errorName(status
));
// Directory first, then the file name, concatenated in place in buffer.
3099 strcpy(buffer
, testDataDirectory
);
3100 strcat(buffer
, filename
);
3104 void RegexTest::Extended() {
3106 const char *srcPath
;
3107 UErrorCode status
= U_ZERO_ERROR
;
3108 int32_t lineNum
= 0;
3111 // Open and read the test data file.
3113 srcPath
=getPath(tdd
, "regextst.txt");
3115 return; /* something went wrong, error already output */
3119 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, "utf-8", status
);
3120 if (U_FAILURE(status
)) {
3121 return; /* something went wrong, error already output */
3125 // Put the test data into a UnicodeString
3127 UnicodeString
testString(FALSE
, testData
, len
);
3129 RegexMatcher
quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status
);
3130 RegexMatcher
commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status
);
3131 RegexMatcher
flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status
);
3133 RegexMatcher
lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString
, 0, status
);
3134 UnicodeString testPattern
; // The pattern for test from the test file.
3135 UnicodeString testFlags
; // the flags for a test.
3136 UnicodeString matchString
; // The marked up string to be used as input
3138 if (U_FAILURE(status
)){
3139 dataerrln("Construct RegexMatcher() error - %s", u_errorName(status
));
3145 // Loop over the test data file, once per line.
3147 while (lineMat
.find()) {
3149 if (U_FAILURE(status
)) {
3150 errln("%s:%d: ICU Error \"%s\"", srcPath
, lineNum
, u_errorName(status
));
3153 status
= U_ZERO_ERROR
;
3154 UnicodeString testLine
= lineMat
.group(1, status
);
3155 if (testLine
.length() == 0) {
3160 // Parse the test line. Skip blank and comment only lines.
3161 // Separate out the three main fields - pattern, flags, target.
3164 commentMat
.reset(testLine
);
3165 if (commentMat
.lookingAt(status
)) {
3166 // This line is a comment, or blank.
3171 // Pull out the pattern field, remove it from the test file line.
3173 quotedStuffMat
.reset(testLine
);
3174 if (quotedStuffMat
.lookingAt(status
)) {
3175 testPattern
= quotedStuffMat
.group(2, status
);
3176 testLine
.remove(0, quotedStuffMat
.end(0, status
));
3178 errln("Bad pattern (missing quotes?) at %s:%d", srcPath
, lineNum
);
3184 // Pull out the flags from the test file line.
3186 flagsMat
.reset(testLine
);
3187 flagsMat
.lookingAt(status
); // Will always match, possibly an empty string.
3188 testFlags
= flagsMat
.group(1, status
);
3189 if (flagsMat
.group(2, status
).length() > 0) {
3190 errln("Bad Match flag at line %d. Scanning %c\n",
3191 lineNum
, flagsMat
.group(2, status
).charAt(0));
3194 testLine
.remove(0, flagsMat
.end(0, status
));
3197 // Pull out the match string, as a whole.
3198 // We'll process the <tags> later.
3200 quotedStuffMat
.reset(testLine
);
3201 if (quotedStuffMat
.lookingAt(status
)) {
3202 matchString
= quotedStuffMat
.group(2, status
);
3203 testLine
.remove(0, quotedStuffMat
.end(0, status
));
3205 errln("Bad match string at test file line %d", lineNum
);
3210 // The only thing left from the input line should be an optional trailing comment.
3212 commentMat
.reset(testLine
);
3213 if (commentMat
.lookingAt(status
) == FALSE
) {
3214 errln("Line %d: unexpected characters at end of test line.", lineNum
);
3221 regex_find(testPattern
, testFlags
, matchString
, srcPath
, lineNum
);
3230 //---------------------------------------------------------------------------
3232 // regex_find(pattern, flags, inputString, lineNumber)
3234 // Function to run a single test from the Extended (data driven) tests.
3235 // See file test/testdata/regextst.txt for a description of the
3236 // pattern and inputString fields, and the allowed flags.
3237 // lineNumber is the source line in regextst.txt of the test.
3239 //---------------------------------------------------------------------------
3242 // Set a value into a UVector at position specified by a decimal number in
3243 // a UnicodeString. This is a utility function needed by the actual test function, following.
3245 static void set(UVector
&vec
, int32_t val
, UnicodeString index
) {
3246 UErrorCode status
=U_ZERO_ERROR
;
3248 for (int32_t i
=0; i
<index
.length(); i
++) {
3249 int32_t d
=u_charDigitValue(index
.charAt(i
));
3253 while (vec
.size()<idx
+1) {vec
.addElement(-1, status
);}
3254 vec
.setElementAt(val
, idx
);
3257 static void setInt(UVector
&vec
, int32_t val
, int32_t idx
) {
3258 UErrorCode status
=U_ZERO_ERROR
;
3259 while (vec
.size()<idx
+1) {vec
.addElement(-1, status
);}
3260 vec
.setElementAt(val
, idx
);
3263 static UBool
utextOffsetToNative(UText
*utext
, int32_t unistrOffset
, int32_t& nativeIndex
)
3265 UBool couldFind
= TRUE
;
3266 UTEXT_SETNATIVEINDEX(utext
, 0);
3268 while (i
< unistrOffset
) {
3269 UChar32 c
= UTEXT_NEXT32(utext
);
3270 if (c
!= U_SENTINEL
) {
3277 nativeIndex
= (int32_t)UTEXT_GETNATIVEINDEX(utext
);
3282 void RegexTest::regex_find(const UnicodeString
&pattern
,
3283 const UnicodeString
&flags
,
3284 const UnicodeString
&inputString
,
3285 const char *srcPath
,
3287 UnicodeString unEscapedInput
;
3288 UnicodeString deTaggedInput
;
3290 int32_t patternUTF8Length
, inputUTF8Length
;
3291 char *patternChars
= NULL
, *inputChars
= NULL
;
3292 UText patternText
= UTEXT_INITIALIZER
;
3293 UText inputText
= UTEXT_INITIALIZER
;
3294 UConverter
*UTF8Converter
= NULL
;
3296 UErrorCode status
= U_ZERO_ERROR
;
3298 RegexPattern
*parsePat
= NULL
;
3299 RegexMatcher
*parseMatcher
= NULL
;
3300 RegexPattern
*callerPattern
= NULL
, *UTF8Pattern
= NULL
;
3301 RegexMatcher
*matcher
= NULL
, *UTF8Matcher
= NULL
;
3302 UVector
groupStarts(status
);
3303 UVector
groupEnds(status
);
3304 UVector
groupStartsUTF8(status
);
3305 UVector
groupEndsUTF8(status
);
3306 UBool isMatch
= FALSE
, isUTF8Match
= FALSE
;
3307 UBool failed
= FALSE
;
3310 UBool useMatchesFunc
= FALSE
;
3311 UBool useLookingAtFunc
= FALSE
;
3312 int32_t regionStart
= -1;
3313 int32_t regionEnd
= -1;
3314 int32_t regionStartUTF8
= -1;
3315 int32_t regionEndUTF8
= -1;
3319 // Compile the caller's pattern
3321 uint32_t bflags
= 0;
3322 if (flags
.indexOf((UChar
)0x69) >= 0) { // 'i' flag
3323 bflags
|= UREGEX_CASE_INSENSITIVE
;
3325 if (flags
.indexOf((UChar
)0x78) >= 0) { // 'x' flag
3326 bflags
|= UREGEX_COMMENTS
;
3328 if (flags
.indexOf((UChar
)0x73) >= 0) { // 's' flag
3329 bflags
|= UREGEX_DOTALL
;
3331 if (flags
.indexOf((UChar
)0x6d) >= 0) { // 'm' flag
3332 bflags
|= UREGEX_MULTILINE
;
3335 if (flags
.indexOf((UChar
)0x65) >= 0) { // 'e' flag
3336 bflags
|= UREGEX_ERROR_ON_UNKNOWN_ESCAPES
;
3338 if (flags
.indexOf((UChar
)0x44) >= 0) { // 'D' flag
3339 bflags
|= UREGEX_UNIX_LINES
;
3341 if (flags
.indexOf((UChar
)0x51) >= 0) { // 'Q' flag
3342 bflags
|= UREGEX_LITERAL
;
3346 callerPattern
= RegexPattern::compile(pattern
, bflags
, pe
, status
);
3347 if (status
!= U_ZERO_ERROR
) {
3348 #if UCONFIG_NO_BREAK_ITERATION==1
3349 // 'v' test flag means that the test pattern should not compile if ICU was configured
3350 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3351 if (flags
.indexOf((UChar
)0x76) >= 0 /*'v'*/ && status
== U_UNSUPPORTED_ERROR
) {
3352 goto cleanupAndReturn
;
3355 if (flags
.indexOf((UChar
)0x45) >= 0) { // flags contain 'E'
3356 // Expected pattern compilation error.
3357 if (flags
.indexOf((UChar
)0x64) >= 0) { // flags contain 'd'
3358 logln("Pattern Compile returns \"%s\"", u_errorName(status
));
3360 goto cleanupAndReturn
;
3362 // Unexpected pattern compilation error.
3363 dataerrln("Line %d: error %s compiling pattern.", line
, u_errorName(status
));
3364 goto cleanupAndReturn
;
3368 UTF8Converter
= ucnv_open("UTF8", &status
);
3369 ucnv_setFromUCallBack(UTF8Converter
, UCNV_FROM_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &status
);
3371 patternUTF8Length
= pattern
.extract(NULL
, 0, UTF8Converter
, status
);
3372 status
= U_ZERO_ERROR
; // buffer overflow
3373 patternChars
= new char[patternUTF8Length
+1];
3374 pattern
.extract(patternChars
, patternUTF8Length
+1, UTF8Converter
, status
);
3375 utext_openUTF8(&patternText
, patternChars
, patternUTF8Length
, &status
);
3377 if (status
== U_ZERO_ERROR
) {
3378 UTF8Pattern
= RegexPattern::compile(&patternText
, bflags
, pe
, status
);
3380 if (status
!= U_ZERO_ERROR
) {
3381 #if UCONFIG_NO_BREAK_ITERATION==1
3382 // 'v' test flag means that the test pattern should not compile if ICU was configured
3383 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3384 if (flags
.indexOf((UChar
)0x76) >= 0 /*'v'*/ && status
== U_UNSUPPORTED_ERROR
) {
3385 goto cleanupAndReturn
;
3388 if (flags
.indexOf((UChar
)0x45) >= 0) { // flags contain 'E'
3389 // Expected pattern compilation error.
3390 if (flags
.indexOf((UChar
)0x64) >= 0) { // flags contain 'd'
3391 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status
));
3393 goto cleanupAndReturn
;
3395 // Unexpected pattern compilation error.
3396 errln("Line %d: error %s compiling pattern. (UTF8)", line
, u_errorName(status
));
3397 goto cleanupAndReturn
;
3402 if (UTF8Pattern
== NULL
) {
3403 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3404 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath
, line
);
3405 status
= U_ZERO_ERROR
;
3408 if (flags
.indexOf((UChar
)0x64) >= 0) { // 'd' flag
3409 callerPattern
->dumpPattern();
3412 if (flags
.indexOf((UChar
)0x45) >= 0) { // 'E' flag
3413 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath
, line
);
3414 goto cleanupAndReturn
;
3419 // Number of times find() should be called on the test string, default to 1
3422 for (i
=2; i
<=9; i
++) {
3423 if (flags
.indexOf((UChar
)(0x30 + i
)) >= 0) { // digit flag
3424 if (numFinds
!= 1) {
3425 errln("Line %d: more than one digit flag. Scanning %d.", line
, i
);
3426 goto cleanupAndReturn
;
3432 // 'M' flag. Use matches() instead of find()
3433 if (flags
.indexOf((UChar
)0x4d) >= 0) {
3434 useMatchesFunc
= TRUE
;
3436 if (flags
.indexOf((UChar
)0x4c) >= 0) {
3437 useLookingAtFunc
= TRUE
;
3441 // Find the tags in the input data, remove them, and record the group boundary
3444 parsePat
= RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe
, status
);
3445 REGEX_CHECK_STATUS_L(line
);
3447 unEscapedInput
= inputString
.unescape();
3448 parseMatcher
= parsePat
->matcher(unEscapedInput
, status
);
3449 REGEX_CHECK_STATUS_L(line
);
3450 while(parseMatcher
->find()) {
3451 parseMatcher
->appendReplacement(deTaggedInput
, "", status
);
3453 UnicodeString groupNum
= parseMatcher
->group(2, status
);
3454 if (groupNum
== "r") {
3455 // <r> or </r>, a region specification within the string
3456 if (parseMatcher
->group(1, status
) == "/") {
3457 regionEnd
= deTaggedInput
.length();
3459 regionStart
= deTaggedInput
.length();
3462 // <digits> or </digits>, a group match boundary tag.
3463 if (parseMatcher
->group(1, status
) == "/") {
3464 set(groupEnds
, deTaggedInput
.length(), groupNum
);
3466 set(groupStarts
, deTaggedInput
.length(), groupNum
);
3470 parseMatcher
->appendTail(deTaggedInput
);
3471 REGEX_ASSERT_L(groupStarts
.size() == groupEnds
.size(), line
);
3472 if ((regionStart
>=0 || regionEnd
>=0) && (regionStart
<0 || regionStart
>regionEnd
)) {
3473 errln("mismatched <r> tags");
3475 goto cleanupAndReturn
;
3479 // Configure the matcher according to the flags specified with this test.
3481 matcher
= callerPattern
->matcher(deTaggedInput
, status
);
3482 REGEX_CHECK_STATUS_L(line
);
3483 if (flags
.indexOf((UChar
)0x74) >= 0) { // 't' trace flag
3484 matcher
->setTrace(TRUE
);
3487 if (UTF8Pattern
!= NULL
) {
3488 inputUTF8Length
= deTaggedInput
.extract(NULL
, 0, UTF8Converter
, status
);
3489 status
= U_ZERO_ERROR
; // buffer overflow
3490 inputChars
= new char[inputUTF8Length
+1];
3491 deTaggedInput
.extract(inputChars
, inputUTF8Length
+1, UTF8Converter
, status
);
3492 utext_openUTF8(&inputText
, inputChars
, inputUTF8Length
, &status
);
3494 if (status
== U_ZERO_ERROR
) {
3495 UTF8Matcher
= &UTF8Pattern
->matcher(status
)->reset(&inputText
);
3496 REGEX_CHECK_STATUS_L(line
);
3499 if (UTF8Matcher
== NULL
) {
3500 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3501 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath
, line
);
3502 status
= U_ZERO_ERROR
;
3507 // Generate native indices for UTF8 versions of region and capture group info
3509 if (UTF8Matcher
!= NULL
) {
3510 if (regionStart
>=0) (void) utextOffsetToNative(&inputText
, regionStart
, regionStartUTF8
);
3511 if (regionEnd
>=0) (void) utextOffsetToNative(&inputText
, regionEnd
, regionEndUTF8
);
3513 // Fill out the native index UVector info.
3514 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3515 for (i
=0; i
<groupStarts
.size(); i
++) {
3516 int32_t start
= groupStarts
.elementAti(i
);
3517 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3520 if (!utextOffsetToNative(&inputText
, start
, startUTF8
)) {
3521 errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line
, i
, start
);
3523 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3525 setInt(groupStartsUTF8
, startUTF8
, i
);
3528 int32_t end
= groupEnds
.elementAti(i
);
3529 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3532 if (!utextOffsetToNative(&inputText
, end
, endUTF8
)) {
3533 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line
, i
, end
);
3535 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3537 setInt(groupEndsUTF8
, endUTF8
, i
);
3542 if (regionStart
>=0) {
3543 matcher
->region(regionStart
, regionEnd
, status
);
3544 REGEX_CHECK_STATUS_L(line
);
3545 if (UTF8Matcher
!= NULL
) {
3546 UTF8Matcher
->region(regionStartUTF8
, regionEndUTF8
, status
);
3547 REGEX_CHECK_STATUS_L(line
);
3550 if (flags
.indexOf((UChar
)0x61) >= 0) { // 'a' anchoring bounds flag
3551 matcher
->useAnchoringBounds(FALSE
);
3552 if (UTF8Matcher
!= NULL
) {
3553 UTF8Matcher
->useAnchoringBounds(FALSE
);
3556 if (flags
.indexOf((UChar
)0x62) >= 0) { // 'b' transparent bounds flag
3557 matcher
->useTransparentBounds(TRUE
);
3558 if (UTF8Matcher
!= NULL
) {
3559 UTF8Matcher
->useTransparentBounds(TRUE
);
3566 // Do a find on the de-tagged input using the caller's pattern
3567 // TODO: error on count>1 and not find().
3568 // error on both matches() and lookingAt().
3570 for (i
=0; i
<numFinds
; i
++) {
3571 if (useMatchesFunc
) {
3572 isMatch
= matcher
->matches(status
);
3573 if (UTF8Matcher
!= NULL
) {
3574 isUTF8Match
= UTF8Matcher
->matches(status
);
3576 } else if (useLookingAtFunc
) {
3577 isMatch
= matcher
->lookingAt(status
);
3578 if (UTF8Matcher
!= NULL
) {
3579 isUTF8Match
= UTF8Matcher
->lookingAt(status
);
3582 isMatch
= matcher
->find();
3583 if (UTF8Matcher
!= NULL
) {
3584 isUTF8Match
= UTF8Matcher
->find();
3588 matcher
->setTrace(FALSE
);
3589 if (U_FAILURE(status
)) {
3590 errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status
));
3594 // Match up the groups from the find() with the groups from the tags
3597 // number of tags should match number of groups from find operation.
3598 // matcher->groupCount does not include group 0, the entire match, hence the +1.
3599 // G option in test means that capture group data is not available in the
3600 // expected results, so the check needs to be suppressed.
3601 if (isMatch
== FALSE
&& groupStarts
.size() != 0) {
3602 dataerrln("Error at line %d: Match expected, but none found.", line
);
3604 goto cleanupAndReturn
;
3605 } else if (UTF8Matcher
!= NULL
&& isUTF8Match
== FALSE
&& groupStarts
.size() != 0) {
3606 errln("Error at line %d: Match expected, but none found. (UTF8)", line
);
3608 goto cleanupAndReturn
;
3611 if (flags
.indexOf((UChar
)0x47 /*G*/) >= 0) {
3612 // Only check for match / no match. Don't check capture groups.
3613 if (isMatch
&& groupStarts
.size() == 0) {
3614 errln("Error at line %d: No match expected, but one found.", line
);
3616 } else if (UTF8Matcher
!= NULL
&& isUTF8Match
&& groupStarts
.size() == 0) {
3617 errln("Error at line %d: No match expected, but one found. (UTF8)", line
);
3620 goto cleanupAndReturn
;
3623 REGEX_CHECK_STATUS_L(line
);
3624 for (i
=0; i
<=matcher
->groupCount(); i
++) {
3625 int32_t expectedStart
= (i
>= groupStarts
.size()? -1 : groupStarts
.elementAti(i
));
3626 int32_t expectedStartUTF8
= (i
>= groupStartsUTF8
.size()? -1 : groupStartsUTF8
.elementAti(i
));
3627 if (matcher
->start(i
, status
) != expectedStart
) {
3628 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
3629 line
, i
, expectedStart
, matcher
->start(i
, status
));
3631 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3632 } else if (UTF8Matcher
!= NULL
&& UTF8Matcher
->start(i
, status
) != expectedStartUTF8
) {
3633 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
3634 line
, i
, expectedStartUTF8
, UTF8Matcher
->start(i
, status
));
3636 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3639 int32_t expectedEnd
= (i
>= groupEnds
.size()? -1 : groupEnds
.elementAti(i
));
3640 int32_t expectedEndUTF8
= (i
>= groupEndsUTF8
.size()? -1 : groupEndsUTF8
.elementAti(i
));
3641 if (matcher
->end(i
, status
) != expectedEnd
) {
3642 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
3643 line
, i
, expectedEnd
, matcher
->end(i
, status
));
3645 // Error on end position; keep going; real error is probably yet to come as group
3646 // end positions work from end of the input data towards the front.
3647 } else if (UTF8Matcher
!= NULL
&& UTF8Matcher
->end(i
, status
) != expectedEndUTF8
) {
3648 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
3649 line
, i
, expectedEndUTF8
, UTF8Matcher
->end(i
, status
));
3651 // Error on end position; keep going; real error is probably yet to come as group
3652 // end positions work from end of the input data towards the front.
3655 if ( matcher
->groupCount()+1 < groupStarts
.size()) {
3656 errln("Error at line %d: Expected %d capture groups, found %d.",
3657 line
, groupStarts
.size()-1, matcher
->groupCount());
3660 else if (UTF8Matcher
!= NULL
&& UTF8Matcher
->groupCount()+1 < groupStarts
.size()) {
3661 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3662 line
, groupStarts
.size()-1, UTF8Matcher
->groupCount());
3666 if ((flags
.indexOf((UChar
)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3667 matcher
->requireEnd() == TRUE
) {
3668 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line
);
3670 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3671 UTF8Matcher
->requireEnd() == TRUE
) {
3672 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line
);
3676 if ((flags
.indexOf((UChar
)0x79) >= 0) && // 'y' flag: RequireEnd() == true
3677 matcher
->requireEnd() == FALSE
) {
3678 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line
);
3680 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x79) >= 0) && // 'Y' flag: RequireEnd() == false
3681 UTF8Matcher
->requireEnd() == FALSE
) {
3682 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line
);
3686 if ((flags
.indexOf((UChar
)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3687 matcher
->hitEnd() == TRUE
) {
3688 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line
);
3690 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3691 UTF8Matcher
->hitEnd() == TRUE
) {
3692 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line
);
3696 if ((flags
.indexOf((UChar
)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3697 matcher
->hitEnd() == FALSE
) {
3698 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line
);
3700 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3701 UTF8Matcher
->hitEnd() == FALSE
) {
3702 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line
);
3709 infoln((UnicodeString
)"\""+pattern
+(UnicodeString
)"\" "
3710 +flags
+(UnicodeString
)" \""+inputString
+(UnicodeString
)"\"");
3711 // callerPattern->dump();
3713 delete parseMatcher
;
3718 delete callerPattern
;
3720 utext_close(&inputText
);
3721 delete[] inputChars
;
3722 utext_close(&patternText
);
3723 delete[] patternChars
;
3724 ucnv_close(UTF8Converter
);
3730 //---------------------------------------------------------------------------
3732 // Errors Check for error handling in patterns.
3734 //---------------------------------------------------------------------------
3735 void RegexTest::Errors() {
3736 // \escape sequences that aren't implemented yet.
3737 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3739 // Missing close parentheses
3740 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN
);
3741 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN
);
3742 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN
);
3744 // Extra close paren
3745 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN
);
3746 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN
);
3747 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN
);
3749 // Look-ahead, Look-behind
3750 // TODO: add tests for unbounded length look-behinds.
3751 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX
); // illegal construct
3753 // Attempt to use non-default flags
3756 UErrorCode status
= U_ZERO_ERROR
;
3757 int32_t flags
= UREGEX_CANON_EQ
|
3758 UREGEX_COMMENTS
| UREGEX_DOTALL
|
3760 RegexPattern
*pat1
= RegexPattern::compile(".*", flags
, pe
, status
);
3761 REGEX_ASSERT(status
== U_REGEX_UNIMPLEMENTED
);
3766 // Quantifiers are allowed only after something that can be quantified.
3767 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX
);
3768 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX
);
3769 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX
);
3771 // Mal-formed {min,max} quantifiers
3772 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL
);
3773 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN
);
3774 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL
);
3775 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL
);
3776 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL
);
3777 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG
);
3778 REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG
); // Overflows int during scan
3779 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG
); // Overflows regex binary format
3780 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG
);
3783 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX
);
3785 // Invalid Back Reference \0
3786 // For ICU 3.8 and earlier
3787 // For ICU versions newer than 3.8, \0 introduces an octal escape.
3789 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE
);
3794 //-------------------------------------------------------------------------------
3796 // Read a text data file, convert it to UChars, and return the data
3797 // in one big UChar * buffer, which the caller must delete.
3799 //--------------------------------------------------------------------------------
3800 UChar
*RegexTest::ReadAndConvertFile(const char *fileName
, int32_t &ulen
,
3801 const char *defEncoding
, UErrorCode
&status
) {
3802 UChar
*retPtr
= NULL
;
3803 char *fileBuf
= NULL
;
3804 UConverter
* conv
= NULL
;
3808 if (U_FAILURE(status
)) {
3815 f
= fopen(fileName
, "rb");
3817 dataerrln("Error opening test data file %s\n", fileName
);
3818 status
= U_FILE_ACCESS_ERROR
;
3827 fseek( f
, 0, SEEK_END
);
3828 fileSize
= ftell(f
);
3829 fileBuf
= new char[fileSize
];
3830 fseek(f
, 0, SEEK_SET
);
3831 amt_read
= fread(fileBuf
, 1, fileSize
, f
);
3832 if (amt_read
!= fileSize
|| fileSize
<= 0) {
3833 errln("Error reading test data file.");
3834 goto cleanUpAndReturn
;
3838 // Look for a Unicode Signature (BOM) on the data just read
3840 int32_t signatureLength
;
3841 const char * fileBufC
;
3842 const char* encoding
;
3845 encoding
= ucnv_detectUnicodeSignature(
3846 fileBuf
, fileSize
, &signatureLength
, &status
);
3847 if(encoding
!=NULL
){
3848 fileBufC
+= signatureLength
;
3849 fileSize
-= signatureLength
;
3851 encoding
= defEncoding
;
3852 if (strcmp(encoding
, "utf-8") == 0) {
3853 errln("file %s is missing its BOM", fileName
);
3858 // Open a converter to take the rule file to UTF-16
3860 conv
= ucnv_open(encoding
, &status
);
3861 if (U_FAILURE(status
)) {
3862 goto cleanUpAndReturn
;
3866 // Convert the rules to UChar.
3867 // Preflight first to determine required buffer size.
3869 ulen
= ucnv_toUChars(conv
,
3875 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
3876 // Buffer Overflow is expected from the preflight operation.
3877 status
= U_ZERO_ERROR
;
3879 retPtr
= new UChar
[ulen
+1];
3892 if (U_FAILURE(status
)) {
3893 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
3902 //-------------------------------------------------------------------------------
3904 // PerlTests - Run Perl's regular expression tests
3905 // The input file for this test is re_tests, the standard regular
3906 // expression test data distributed with the Perl source code.
3908 // Here is Perl's description of the test data file:
3910 // # The tests are in a separate file 't/op/re_tests'.
3911 // # Each line in that file is a separate test.
3912 // # There are five columns, separated by tabs.
3914 // # Column 1 contains the pattern, optionally enclosed in C<''>.
3915 // # Modifiers can be put after the closing C<'>.
3917 // # Column 2 contains the string to be matched.
3919 // # Column 3 contains the expected result:
3920 // # y expect a match
3921 // # n expect no match
3922 // # c expect an error
3923 // # B test exposes a known bug in Perl, should be skipped
3924 // # b test exposes a known bug in Perl, should be skipped if noamp
3926 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3928 // # Column 4 contains a string, usually C<$&>.
3930 // # Column 5 contains the expected result of double-quote
3931 // # interpolating that string after the match, or start of error message.
3933 // # Column 6, if present, contains a reason why the test is skipped.
3934 // # This is printed with "skipped", for harness to pick up.
3936 // # \n in the tests are interpolated, as are variables of the form ${\w+}.
3938 // # If you want to add a regular expression test that can't be expressed
3939 // # in this format, don't add it here: put it in op/pat.t instead.
3941 // For ICU, if field 3 contains an 'i', the test will be skipped.
3942 // The test exposes some known incompatibility between ICU and Perl regexps.
3943 // (The i is in addition to whatever was there before.)
3945 //-------------------------------------------------------------------------------
3946 void RegexTest::PerlTests() {
3948 const char *srcPath
;
3949 UErrorCode status
= U_ZERO_ERROR
;
3953 // Open and read the test data file.
3955 srcPath
=getPath(tdd
, "re_tests.txt");
3957 return; /* something went wrong, error already output */
3961 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, "iso-8859-1", status
);
3962 if (U_FAILURE(status
)) {
3963 return; /* something went wrong, error already output */
3967 // Put the test data into a UnicodeString
3969 UnicodeString
testDataString(FALSE
, testData
, len
);
3972 // Regex to break the input file into lines, and strip the new lines.
3973 // One line per match, capture group one is the desired data.
3975 RegexPattern
* linePat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe
, status
);
3976 if (U_FAILURE(status
)) {
3977 dataerrln("RegexPattern::compile() error");
3980 RegexMatcher
* lineMat
= linePat
->matcher(testDataString
, status
);
3983 // Regex to split a test file line into fields.
3984 // There are six fields, separated by tabs.
3986 RegexPattern
* fieldPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe
, status
);
3989 // Regex to identify test patterns with flag settings, and to separate them.
3990 // Test patterns with flags look like 'pattern'i
3991 // Test patterns without flags are not quoted: pattern
3992 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
3994 RegexPattern
*flagPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe
, status
);
3995 RegexMatcher
* flagMat
= flagPat
->matcher(status
);
3998 // The Perl tests reference several perl-isms, which are evaluated/substituted
3999 // in the test data. Not being perl, this must be done explicitly. Here
4000 // are string constants and REs for these constructs.
4002 UnicodeString
nulnulSrc("${nulnul}");
4003 UnicodeString
nulnul("\\u0000\\u0000", -1, US_INV
);
4004 nulnul
= nulnul
.unescape();
4006 UnicodeString
ffffSrc("${ffff}");
4007 UnicodeString
ffff("\\uffff", -1, US_INV
);
4008 ffff
= ffff
.unescape();
4010 // regexp for $-[0], $+[2], etc.
4011 RegexPattern
*groupsPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe
, status
);
4012 RegexMatcher
*groupsMat
= groupsPat
->matcher(status
);
4014 // regexp for $0, $1, $2, etc.
4015 RegexPattern
*cgPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe
, status
);
4016 RegexMatcher
*cgMat
= cgPat
->matcher(status
);
4020 // Main Loop for the Perl Tests, runs once per line from the
4023 int32_t lineNum
= 0;
4024 int32_t skippedUnimplementedCount
= 0;
4025 while (lineMat
->find()) {
4029 // Get a line, break it into its fields, do the Perl
4030 // variable substitutions.
4032 UnicodeString line
= lineMat
->group(1, status
);
4033 UnicodeString fields
[7];
4034 fieldPat
->split(line
, fields
, 7, status
);
4036 flagMat
->reset(fields
[0]);
4037 flagMat
->matches(status
);
4038 UnicodeString pattern
= flagMat
->group(2, status
);
4039 pattern
.findAndReplace("${bang}", "!");
4040 pattern
.findAndReplace(nulnulSrc
, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4041 pattern
.findAndReplace(ffffSrc
, ffff
);
4044 // Identify patterns that include match flag settings,
4045 // split off the flags, remove the extra quotes.
4047 UnicodeString flagStr
= flagMat
->group(3, status
);
4048 if (U_FAILURE(status
)) {
4049 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
4053 const UChar UChar_c
= 0x63; // Char constants for the flag letters.
4054 const UChar UChar_i
= 0x69; // (Damn the lack of Unicode support in C)
4055 const UChar UChar_m
= 0x6d;
4056 const UChar UChar_x
= 0x78;
4057 const UChar UChar_y
= 0x79;
4058 if (flagStr
.indexOf(UChar_i
) != -1) {
4059 flags
|= UREGEX_CASE_INSENSITIVE
;
4061 if (flagStr
.indexOf(UChar_m
) != -1) {
4062 flags
|= UREGEX_MULTILINE
;
4064 if (flagStr
.indexOf(UChar_x
) != -1) {
4065 flags
|= UREGEX_COMMENTS
;
4069 // Compile the test pattern.
4071 status
= U_ZERO_ERROR
;
4072 RegexPattern
*testPat
= RegexPattern::compile(pattern
, flags
, pe
, status
);
4073 if (status
== U_REGEX_UNIMPLEMENTED
) {
4075 // Test of a feature that is planned for ICU, but not yet implemented.
4077 skippedUnimplementedCount
++;
4079 status
= U_ZERO_ERROR
;
4083 if (U_FAILURE(status
)) {
4084 // Some tests are supposed to generate errors.
4085 // Only report an error for tests that are supposed to succeed.
4086 if (fields
[2].indexOf(UChar_c
) == -1 && // Compilation is not supposed to fail AND
4087 fields
[2].indexOf(UChar_i
) == -1) // it's not an accepted ICU incompatibility
4089 errln("line %d: ICU Error \"%s\"\n", lineNum
, u_errorName(status
));
4091 status
= U_ZERO_ERROR
;
4096 if (fields
[2].indexOf(UChar_i
) >= 0) {
4097 // ICU should skip this test.
4102 if (fields
[2].indexOf(UChar_c
) >= 0) {
4103 // This pattern should have caused a compilation error, but didn't/
4104 errln("line %d: Expected a pattern compile error, got success.", lineNum
);
4110 // replace the Perl variables that appear in some of the
4111 // match data strings.
4113 UnicodeString matchString
= fields
[1];
4114 matchString
.findAndReplace(nulnulSrc
, nulnul
);
4115 matchString
.findAndReplace(ffffSrc
, ffff
);
4117 // Replace any \n in the match string with an actual new-line char.
4118 // Don't do full unescape, as this unescapes more than Perl does, which
4119 // causes other spurious failures in the tests.
4120 matchString
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4125 // Run the test, check for expected match/don't match result.
4127 RegexMatcher
*testMat
= testPat
->matcher(matchString
, status
);
4128 UBool found
= testMat
->find();
4129 UBool expected
= FALSE
;
4130 if (fields
[2].indexOf(UChar_y
) >=0) {
4133 if (expected
!= found
) {
4134 errln("line %d: Expected %smatch, got %smatch",
4135 lineNum
, expected
?"":"no ", found
?"":"no " );
4139 // Don't try to check expected results if there is no match.
4140 // (Some have stuff in the expected fields)
4148 // Interpret the Perl expression from the fourth field of the data file,
4149 // building up an ICU string from the results of the ICU match.
4150 // The Perl expression will contain references to the results of
4151 // a regex match, including the matched string, capture group strings,
4152 // group starting and ending indicies, etc.
4154 UnicodeString resultString
;
4155 UnicodeString perlExpr
= fields
[3];
4156 #if SUPPORT_MUTATING_INPUT_STRING
4157 groupsMat
->reset(perlExpr
);
4158 cgMat
->reset(perlExpr
);
4161 while (perlExpr
.length() > 0) {
4162 #if !SUPPORT_MUTATING_INPUT_STRING
4163 // Perferred usage. Reset after any modification to input string.
4164 groupsMat
->reset(perlExpr
);
4165 cgMat
->reset(perlExpr
);
4168 if (perlExpr
.startsWith("$&")) {
4169 resultString
.append(testMat
->group(status
));
4170 perlExpr
.remove(0, 2);
4173 else if (groupsMat
->lookingAt(status
)) {
4175 UnicodeString digitString
= groupsMat
->group(2, status
);
4177 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4178 UnicodeString plusOrMinus
= groupsMat
->group(1, status
);
4179 int32_t matchPosition
;
4180 if (plusOrMinus
.compare("+") == 0) {
4181 matchPosition
= testMat
->end(groupNum
, status
);
4183 matchPosition
= testMat
->start(groupNum
, status
);
4185 if (matchPosition
!= -1) {
4186 ICU_Utility::appendNumber(resultString
, matchPosition
);
4188 perlExpr
.remove(0, groupsMat
->end(status
));
4191 else if (cgMat
->lookingAt(status
)) {
4193 UnicodeString digitString
= cgMat
->group(1, status
);
4195 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4196 if (U_SUCCESS(status
)) {
4197 resultString
.append(testMat
->group(groupNum
, status
));
4198 status
= U_ZERO_ERROR
;
4200 perlExpr
.remove(0, cgMat
->end(status
));
4203 else if (perlExpr
.startsWith("@-")) {
4205 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4207 resultString
.append(" ");
4209 ICU_Utility::appendNumber(resultString
, testMat
->start(i
, status
));
4211 perlExpr
.remove(0, 2);
4214 else if (perlExpr
.startsWith("@+")) {
4216 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4218 resultString
.append(" ");
4220 ICU_Utility::appendNumber(resultString
, testMat
->end(i
, status
));
4222 perlExpr
.remove(0, 2);
4225 else if (perlExpr
.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4226 // or as an escaped sequence (e.g. \n)
4227 if (perlExpr
.length() > 1) {
4228 perlExpr
.remove(0, 1); // Remove the '\', but only if not last char.
4230 UChar c
= perlExpr
.charAt(0);
4232 case 'n': c
= '\n'; break;
4233 // add any other escape sequences that show up in the test expected results.
4235 resultString
.append(c
);
4236 perlExpr
.remove(0, 1);
4240 // Any characters from the perl expression that we don't explicitly
4241 // recognize before here are assumed to be literals and copied
4242 // as-is to the expected results.
4243 resultString
.append(perlExpr
.charAt(0));
4244 perlExpr
.remove(0, 1);
4247 if (U_FAILURE(status
)) {
4248 errln("Line %d: ICU Error \"%s\"", lineNum
, u_errorName(status
));
4254 // Expected Results Compare
4256 UnicodeString
expectedS(fields
[4]);
4257 expectedS
.findAndReplace(nulnulSrc
, nulnul
);
4258 expectedS
.findAndReplace(ffffSrc
, ffff
);
4259 expectedS
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4262 if (expectedS
.compare(resultString
) != 0) {
4263 err("Line %d: Incorrect perl expression results.", lineNum
);
4264 infoln((UnicodeString
)"Expected \""+expectedS
+(UnicodeString
)"\"; got \""+resultString
+(UnicodeString
)"\"");
4272 // All done. Clean up allocated stuff.
4290 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount
);
4295 //-------------------------------------------------------------------------------
4297 // PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts
4298 // (instead of using UnicodeStrings) to test the alternate engine.
4299 // The input file for this test is re_tests, the standard regular
4300 // expression test data distributed with the Perl source code.
4301 // See PerlTests() for more information.
4303 //-------------------------------------------------------------------------------
4304 void RegexTest::PerlTestsUTF8() {
4306 const char *srcPath
;
4307 UErrorCode status
= U_ZERO_ERROR
;
4309 LocalUConverterPointer
UTF8Converter(ucnv_open("UTF-8", &status
));
4310 UText patternText
= UTEXT_INITIALIZER
;
4311 char *patternChars
= NULL
;
4312 int32_t patternLength
;
4313 int32_t patternCapacity
= 0;
4314 UText inputText
= UTEXT_INITIALIZER
;
4315 char *inputChars
= NULL
;
4316 int32_t inputLength
;
4317 int32_t inputCapacity
= 0;
4319 ucnv_setFromUCallBack(UTF8Converter
.getAlias(), UCNV_FROM_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &status
);
4322 // Open and read the test data file.
4324 srcPath
=getPath(tdd
, "re_tests.txt");
4326 return; /* something went wrong, error already output */
4330 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, "iso-8859-1", status
);
4331 if (U_FAILURE(status
)) {
4332 return; /* something went wrong, error already output */
4336 // Put the test data into a UnicodeString
4338 UnicodeString
testDataString(FALSE
, testData
, len
);
4341 // Regex to break the input file into lines, and strip the new lines.
4342 // One line per match, capture group one is the desired data.
4344 RegexPattern
* linePat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe
, status
);
4345 if (U_FAILURE(status
)) {
4346 dataerrln("RegexPattern::compile() error");
4349 RegexMatcher
* lineMat
= linePat
->matcher(testDataString
, status
);
4352 // Regex to split a test file line into fields.
4353 // There are six fields, separated by tabs.
4355 RegexPattern
* fieldPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe
, status
);
4358 // Regex to identify test patterns with flag settings, and to separate them.
4359 // Test patterns with flags look like 'pattern'i
4360 // Test patterns without flags are not quoted: pattern
4361 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4363 RegexPattern
*flagPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe
, status
);
4364 RegexMatcher
* flagMat
= flagPat
->matcher(status
);
4367 // The Perl tests reference several perl-isms, which are evaluated/substituted
4368 // in the test data. Not being perl, this must be done explicitly. Here
4369 // are string constants and REs for these constructs.
4371 UnicodeString
nulnulSrc("${nulnul}");
4372 UnicodeString
nulnul("\\u0000\\u0000", -1, US_INV
);
4373 nulnul
= nulnul
.unescape();
4375 UnicodeString
ffffSrc("${ffff}");
4376 UnicodeString
ffff("\\uffff", -1, US_INV
);
4377 ffff
= ffff
.unescape();
4379 // regexp for $-[0], $+[2], etc.
4380 RegexPattern
*groupsPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe
, status
);
4381 RegexMatcher
*groupsMat
= groupsPat
->matcher(status
);
4383 // regexp for $0, $1, $2, etc.
4384 RegexPattern
*cgPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe
, status
);
4385 RegexMatcher
*cgMat
= cgPat
->matcher(status
);
4389 // Main Loop for the Perl Tests, runs once per line from the
4392 int32_t lineNum
= 0;
4393 int32_t skippedUnimplementedCount
= 0;
4394 while (lineMat
->find()) {
4398 // Get a line, break it into its fields, do the Perl
4399 // variable substitutions.
4401 UnicodeString line
= lineMat
->group(1, status
);
4402 UnicodeString fields
[7];
4403 fieldPat
->split(line
, fields
, 7, status
);
4405 flagMat
->reset(fields
[0]);
4406 flagMat
->matches(status
);
4407 UnicodeString pattern
= flagMat
->group(2, status
);
4408 pattern
.findAndReplace("${bang}", "!");
4409 pattern
.findAndReplace(nulnulSrc
, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4410 pattern
.findAndReplace(ffffSrc
, ffff
);
4413 // Identify patterns that include match flag settings,
4414 // split off the flags, remove the extra quotes.
4416 UnicodeString flagStr
= flagMat
->group(3, status
);
4417 if (U_FAILURE(status
)) {
4418 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
4422 const UChar UChar_c
= 0x63; // Char constants for the flag letters.
4423 const UChar UChar_i
= 0x69; // (Damn the lack of Unicode support in C)
4424 const UChar UChar_m
= 0x6d;
4425 const UChar UChar_x
= 0x78;
4426 const UChar UChar_y
= 0x79;
4427 if (flagStr
.indexOf(UChar_i
) != -1) {
4428 flags
|= UREGEX_CASE_INSENSITIVE
;
4430 if (flagStr
.indexOf(UChar_m
) != -1) {
4431 flags
|= UREGEX_MULTILINE
;
4433 if (flagStr
.indexOf(UChar_x
) != -1) {
4434 flags
|= UREGEX_COMMENTS
;
4438 // Put the pattern in a UTF-8 UText
4440 status
= U_ZERO_ERROR
;
4441 patternLength
= pattern
.extract(patternChars
, patternCapacity
, UTF8Converter
.getAlias(), status
);
4442 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
4443 status
= U_ZERO_ERROR
;
4444 delete[] patternChars
;
4445 patternCapacity
= patternLength
+ 1;
4446 patternChars
= new char[patternCapacity
];
4447 pattern
.extract(patternChars
, patternCapacity
, UTF8Converter
.getAlias(), status
);
4449 utext_openUTF8(&patternText
, patternChars
, patternLength
, &status
);
4452 // Compile the test pattern.
4454 RegexPattern
*testPat
= RegexPattern::compile(&patternText
, flags
, pe
, status
);
4455 if (status
== U_REGEX_UNIMPLEMENTED
) {
4457 // Test of a feature that is planned for ICU, but not yet implemented.
4459 skippedUnimplementedCount
++;
4461 status
= U_ZERO_ERROR
;
4465 if (U_FAILURE(status
)) {
4466 // Some tests are supposed to generate errors.
4467 // Only report an error for tests that are supposed to succeed.
4468 if (fields
[2].indexOf(UChar_c
) == -1 && // Compilation is not supposed to fail AND
4469 fields
[2].indexOf(UChar_i
) == -1) // it's not an accepted ICU incompatibility
4471 errln("line %d: ICU Error \"%s\"\n", lineNum
, u_errorName(status
));
4473 status
= U_ZERO_ERROR
;
4478 if (fields
[2].indexOf(UChar_i
) >= 0) {
4479 // ICU should skip this test.
4484 if (fields
[2].indexOf(UChar_c
) >= 0) {
4485 // This pattern should have caused a compilation error, but didn't/
4486 errln("line %d: Expected a pattern compile error, got success.", lineNum
);
4493 // replace the Perl variables that appear in some of the
4494 // match data strings.
4496 UnicodeString matchString
= fields
[1];
4497 matchString
.findAndReplace(nulnulSrc
, nulnul
);
4498 matchString
.findAndReplace(ffffSrc
, ffff
);
4500 // Replace any \n in the match string with an actual new-line char.
4501 // Don't do full unescape, as this unescapes more than Perl does, which
4502 // causes other spurious failures in the tests.
4503 matchString
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4506 // Put the input in a UTF-8 UText
4508 status
= U_ZERO_ERROR
;
4509 inputLength
= matchString
.extract(inputChars
, inputCapacity
, UTF8Converter
.getAlias(), status
);
4510 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
4511 status
= U_ZERO_ERROR
;
4512 delete[] inputChars
;
4513 inputCapacity
= inputLength
+ 1;
4514 inputChars
= new char[inputCapacity
];
4515 matchString
.extract(inputChars
, inputCapacity
, UTF8Converter
.getAlias(), status
);
4517 utext_openUTF8(&inputText
, inputChars
, inputLength
, &status
);
4520 // Run the test, check for expected match/don't match result.
4522 RegexMatcher
*testMat
= &testPat
->matcher(status
)->reset(&inputText
);
4523 UBool found
= testMat
->find();
4524 UBool expected
= FALSE
;
4525 if (fields
[2].indexOf(UChar_y
) >=0) {
4528 if (expected
!= found
) {
4529 errln("line %d: Expected %smatch, got %smatch",
4530 lineNum
, expected
?"":"no ", found
?"":"no " );
4534 // Don't try to check expected results if there is no match.
4535 // (Some have stuff in the expected fields)
4543 // Interpret the Perl expression from the fourth field of the data file,
4544 // building up an ICU string from the results of the ICU match.
4545 // The Perl expression will contain references to the results of
4546 // a regex match, including the matched string, capture group strings,
4547 // group starting and ending indicies, etc.
4549 UnicodeString resultString
;
4550 UnicodeString perlExpr
= fields
[3];
4552 while (perlExpr
.length() > 0) {
4553 groupsMat
->reset(perlExpr
);
4554 cgMat
->reset(perlExpr
);
4556 if (perlExpr
.startsWith("$&")) {
4557 resultString
.append(testMat
->group(status
));
4558 perlExpr
.remove(0, 2);
4561 else if (groupsMat
->lookingAt(status
)) {
4563 UnicodeString digitString
= groupsMat
->group(2, status
);
4565 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4566 UnicodeString plusOrMinus
= groupsMat
->group(1, status
);
4567 int32_t matchPosition
;
4568 if (plusOrMinus
.compare("+") == 0) {
4569 matchPosition
= testMat
->end(groupNum
, status
);
4571 matchPosition
= testMat
->start(groupNum
, status
);
4573 if (matchPosition
!= -1) {
4574 ICU_Utility::appendNumber(resultString
, matchPosition
);
4576 perlExpr
.remove(0, groupsMat
->end(status
));
4579 else if (cgMat
->lookingAt(status
)) {
4581 UnicodeString digitString
= cgMat
->group(1, status
);
4583 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4584 if (U_SUCCESS(status
)) {
4585 resultString
.append(testMat
->group(groupNum
, status
));
4586 status
= U_ZERO_ERROR
;
4588 perlExpr
.remove(0, cgMat
->end(status
));
4591 else if (perlExpr
.startsWith("@-")) {
4593 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4595 resultString
.append(" ");
4597 ICU_Utility::appendNumber(resultString
, testMat
->start(i
, status
));
4599 perlExpr
.remove(0, 2);
4602 else if (perlExpr
.startsWith("@+")) {
4604 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4606 resultString
.append(" ");
4608 ICU_Utility::appendNumber(resultString
, testMat
->end(i
, status
));
4610 perlExpr
.remove(0, 2);
4613 else if (perlExpr
.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4614 // or as an escaped sequence (e.g. \n)
4615 if (perlExpr
.length() > 1) {
4616 perlExpr
.remove(0, 1); // Remove the '\', but only if not last char.
4618 UChar c
= perlExpr
.charAt(0);
4620 case 'n': c
= '\n'; break;
4621 // add any other escape sequences that show up in the test expected results.
4623 resultString
.append(c
);
4624 perlExpr
.remove(0, 1);
4628 // Any characters from the perl expression that we don't explicitly
4629 // recognize before here are assumed to be literals and copied
4630 // as-is to the expected results.
4631 resultString
.append(perlExpr
.charAt(0));
4632 perlExpr
.remove(0, 1);
4635 if (U_FAILURE(status
)) {
4636 errln("Line %d: ICU Error \"%s\"", lineNum
, u_errorName(status
));
4642 // Expected Results Compare
4644 UnicodeString
expectedS(fields
[4]);
4645 expectedS
.findAndReplace(nulnulSrc
, nulnul
);
4646 expectedS
.findAndReplace(ffffSrc
, ffff
);
4647 expectedS
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4650 if (expectedS
.compare(resultString
) != 0) {
4651 err("Line %d: Incorrect perl expression results.", lineNum
);
4652 infoln((UnicodeString
)"Expected \""+expectedS
+(UnicodeString
)"\"; got \""+resultString
+(UnicodeString
)"\"");
4660 // All done. Clean up allocated stuff.
4677 utext_close(&patternText
);
4678 utext_close(&inputText
);
4680 delete [] patternChars
;
4681 delete [] inputChars
;
4684 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount
);
4689 //--------------------------------------------------------------
4691 // Bug6149 Verify limits to heap expansion for backtrack stack.
4692 // Use this pattern,
4693 // "(a?){1,8000000}"
4694 // Note: was an unbounded upperbounds, but that now has loop-breaking enabled.
4695 // This test is likely to be fragile, as further optimizations stop
4696 // more cases of pointless looping in the match engine.
4698 //---------------------------------------------------------------
4699 void RegexTest::Bug6149() {
4700 UnicodeString
pattern("(a?){1,8000000}");
4701 UnicodeString
s("xyz");
4703 UErrorCode status
= U_ZERO_ERROR
;
4705 RegexMatcher
matcher(pattern
, s
, flags
, status
);
4706 UBool result
= false;
4707 REGEX_ASSERT_FAIL(result
=matcher
.matches(status
), U_REGEX_STACK_OVERFLOW
);
4708 REGEX_ASSERT(result
== FALSE
);
4713 // Callbacks() Test the callback function.
4714 // When set, callbacks occur periodically during matching operations,
4715 // giving the application code the ability to abort the operation
4716 // before it's normal completion.
4719 struct callBackContext
{
4724 void reset(int32_t max
) {maxCalls
=max
; numCalls
=0; lastSteps
=0;};
4728 static UBool U_CALLCONV
4729 testCallBackFn(const void *context
, int32_t steps
) {
4730 callBackContext
*info
= (callBackContext
*)context
;
4731 if (info
->lastSteps
+1 != steps
) {
4732 info
->test
->errln("incorrect steps in callback. Expected %d, got %d\n", info
->lastSteps
+1, steps
);
4734 info
->lastSteps
= steps
;
4736 return (info
->numCalls
< info
->maxCalls
);
4740 void RegexTest::Callbacks() {
4742 // Getter returns NULLs if no callback has been set
4744 // The variables that the getter will fill in.
4745 // Init to non-null values so that the action of the getter can be seen.
4746 const void *returnedContext
= &returnedContext
;
4747 URegexMatchCallback
*returnedFn
= &testCallBackFn
;
4749 UErrorCode status
= U_ZERO_ERROR
;
4750 RegexMatcher
matcher("x", 0, status
);
4752 matcher
.getMatchCallback(returnedFn
, returnedContext
, status
);
4754 REGEX_ASSERT(returnedFn
== NULL
);
4755 REGEX_ASSERT(returnedContext
== NULL
);
4760 callBackContext cbInfo
= {this, 0, 0, 0};
4761 const void *returnedContext
;
4762 URegexMatchCallback
*returnedFn
;
4763 UErrorCode status
= U_ZERO_ERROR
;
4764 RegexMatcher
matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status
); // A pattern that can run long.
4766 matcher
.setMatchCallback(testCallBackFn
, &cbInfo
, status
);
4768 matcher
.getMatchCallback(returnedFn
, returnedContext
, status
);
4770 REGEX_ASSERT(returnedFn
== testCallBackFn
);
4771 REGEX_ASSERT(returnedContext
== &cbInfo
);
4773 // A short-running match shouldn't invoke the callback
4774 status
= U_ZERO_ERROR
;
4776 UnicodeString s
= "xxx";
4778 REGEX_ASSERT(matcher
.matches(status
));
4780 REGEX_ASSERT(cbInfo
.numCalls
== 0);
4782 // A medium-length match that runs long enough to invoke the
4783 // callback, but not so long that the callback aborts it.
4784 status
= U_ZERO_ERROR
;
4786 s
= "aaaaaaaaaaaaaaaaaaab";
4788 REGEX_ASSERT(matcher
.matches(status
)==FALSE
);
4790 REGEX_ASSERT(cbInfo
.numCalls
> 0);
4792 // A longer running match that the callback function will abort.
4793 status
= U_ZERO_ERROR
;
4795 s
= "aaaaaaaaaaaaaaaaaaaaaaab";
4797 REGEX_ASSERT(matcher
.matches(status
)==FALSE
);
4798 REGEX_ASSERT(status
== U_REGEX_STOPPED_BY_CALLER
);
4799 REGEX_ASSERT(cbInfo
.numCalls
== 4);
4807 // FindProgressCallbacks() Test the find "progress" callback function.
4808 // When set, the find progress callback will be invoked during a find operations
4809 // after each return from a match attempt, giving the application the opportunity
4810 // to terminate a long-running find operation before it's normal completion.
4813 struct progressCallBackContext
{
4818 void reset(int32_t max
) {maxCalls
=max
; numCalls
=0;lastIndex
=0;};
4822 static UBool U_CALLCONV
4823 testProgressCallBackFn(const void *context
, int64_t matchIndex
) {
4824 progressCallBackContext
*info
= (progressCallBackContext
*)context
;
4826 info
->lastIndex
= matchIndex
;
4827 // info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4828 return (info
->numCalls
< info
->maxCalls
);
4832 void RegexTest::FindProgressCallbacks() {
4834 // Getter returns NULLs if no callback has been set
4836 // The variables that the getter will fill in.
4837 // Init to non-null values so that the action of the getter can be seen.
4838 const void *returnedContext
= &returnedContext
;
4839 URegexFindProgressCallback
*returnedFn
= &testProgressCallBackFn
;
4841 UErrorCode status
= U_ZERO_ERROR
;
4842 RegexMatcher
matcher("x", 0, status
);
4844 matcher
.getFindProgressCallback(returnedFn
, returnedContext
, status
);
4846 REGEX_ASSERT(returnedFn
== NULL
);
4847 REGEX_ASSERT(returnedContext
== NULL
);
4852 progressCallBackContext cbInfo
= {this, 0, 0, 0};
4853 const void *returnedContext
;
4854 URegexFindProgressCallback
*returnedFn
;
4855 UErrorCode status
= U_ZERO_ERROR
;
4856 RegexMatcher
matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status
); // A pattern that can run long.
4858 matcher
.setFindProgressCallback(testProgressCallBackFn
, &cbInfo
, status
);
4860 matcher
.getFindProgressCallback(returnedFn
, returnedContext
, status
);
4862 REGEX_ASSERT(returnedFn
== testProgressCallBackFn
);
4863 REGEX_ASSERT(returnedContext
== &cbInfo
);
4865 // A short-running match should NOT invoke the callback.
4866 status
= U_ZERO_ERROR
;
4868 UnicodeString s
= "abxxx";
4871 matcher
.setTrace(TRUE
);
4873 REGEX_ASSERT(matcher
.find(0, status
));
4875 REGEX_ASSERT(cbInfo
.numCalls
== 0);
4877 // A medium running match that causes matcher.find() to invoke our callback for each index.
4878 status
= U_ZERO_ERROR
;
4879 s
= "aaaaaaaaaaaaaaaaaaab";
4880 cbInfo
.reset(s
.length()); // Some upper limit for number of calls that is greater than size of our input string
4882 REGEX_ASSERT(matcher
.find(0, status
)==FALSE
);
4884 REGEX_ASSERT(cbInfo
.numCalls
> 0 && cbInfo
.numCalls
< 25);
4886 // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4887 status
= U_ZERO_ERROR
;
4888 UnicodeString s1
= "aaaaaaaaaaaaaaaaaaaaaaab";
4889 cbInfo
.reset(s1
.length() - 5); // Bail early somewhere near the end of input string
4891 REGEX_ASSERT(matcher
.find(0, status
)==FALSE
);
4893 REGEX_ASSERT(cbInfo
.numCalls
== s1
.length() - 5);
4896 // Now a match that will succeed, but after an interruption
4897 status
= U_ZERO_ERROR
;
4898 UnicodeString s2
= "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4899 cbInfo
.reset(s2
.length() - 10); // Bail early somewhere near the end of input string
4901 REGEX_ASSERT(matcher
.find(0, status
)==FALSE
);
4903 // Now retry the match from where left off
4904 cbInfo
.maxCalls
= 100; // No callback limit
4905 REGEX_ASSERT(matcher
.find(cbInfo
.lastIndex
, status
));
4914 //---------------------------------------------------------------------------
4916 // PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
4917 // UTexts. The pure-C implementation of UText
4918 // has no mutable backing stores, but we can
4919 // use UnicodeString here to test the functionality.
4921 //---------------------------------------------------------------------------
4922 void RegexTest::PreAllocatedUTextCAPI () {
4923 UErrorCode status
= U_ZERO_ERROR
;
4924 URegularExpression
*re
;
4925 UText patternText
= UTEXT_INITIALIZER
;
4926 UnicodeString buffer
;
4927 UText bufferText
= UTEXT_INITIALIZER
;
4929 utext_openUnicodeString(&bufferText
, &buffer
, &status
);
4932 * getText() and getUText()
4935 UText text1
= UTEXT_INITIALIZER
;
4936 UText text2
= UTEXT_INITIALIZER
;
4937 UChar text2Chars
[20];
4940 status
= U_ZERO_ERROR
;
4941 regextst_openUTF8FromInvariant(&text1
, "abcccd", -1, &status
);
4942 regextst_openUTF8FromInvariant(&text2
, "abcccxd", -1, &status
);
4943 u_uastrncpy(text2Chars
, "abcccxd", sizeof(text2
)/2);
4944 utext_openUChars(&text2
, text2Chars
, -1, &status
);
4946 regextst_openUTF8FromInvariant(&patternText
, "abc*d", -1, &status
);
4947 re
= uregex_openUText(&patternText
, 0, NULL
, &status
);
4949 /* First set a UText */
4950 uregex_setUText(re
, &text1
, &status
);
4951 resultText
= uregex_getUText(re
, &bufferText
, &status
);
4953 REGEX_ASSERT(resultText
== &bufferText
);
4954 utext_setNativeIndex(resultText
, 0);
4955 utext_setNativeIndex(&text1
, 0);
4956 REGEX_ASSERT(testUTextEqual(resultText
, &text1
));
4958 resultText
= uregex_getUText(re
, &bufferText
, &status
);
4960 REGEX_ASSERT(resultText
== &bufferText
);
4961 utext_setNativeIndex(resultText
, 0);
4962 utext_setNativeIndex(&text1
, 0);
4963 REGEX_ASSERT(testUTextEqual(resultText
, &text1
));
4965 /* Then set a UChar * */
4966 uregex_setText(re
, text2Chars
, 7, &status
);
4967 resultText
= uregex_getUText(re
, &bufferText
, &status
);
4969 REGEX_ASSERT(resultText
== &bufferText
);
4970 utext_setNativeIndex(resultText
, 0);
4971 utext_setNativeIndex(&text2
, 0);
4972 REGEX_ASSERT(testUTextEqual(resultText
, &text2
));
4975 utext_close(&text1
);
4976 utext_close(&text2
);
4986 u_uastrncpy(text1
, "noise abc interior def, and this is off the end", sizeof(text1
)/2);
4988 status
= U_ZERO_ERROR
;
4989 re
= uregex_openC("abc(.*?)def", 0, NULL
, &status
);
4992 uregex_setText(re
, text1
, -1, &status
);
4993 result
= uregex_find(re
, 0, &status
);
4994 REGEX_ASSERT(result
==TRUE
);
4996 /* Capture Group 0, the full match. Should succeed. */
4997 status
= U_ZERO_ERROR
;
4998 actual
= uregex_groupUTextDeep(re
, 0, &bufferText
, &status
);
5000 REGEX_ASSERT(actual
== &bufferText
);
5001 REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual
);
5003 /* Capture group #1. Should succeed. */
5004 status
= U_ZERO_ERROR
;
5005 actual
= uregex_groupUTextDeep(re
, 1, &bufferText
, &status
);
5007 REGEX_ASSERT(actual
== &bufferText
);
5008 REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual
);
5010 /* Capture group out of range. Error. */
5011 status
= U_ZERO_ERROR
;
5012 actual
= uregex_groupUTextDeep(re
, 2, &bufferText
, &status
);
5013 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
5014 REGEX_ASSERT(actual
== &bufferText
);
5026 UText replText
= UTEXT_INITIALIZER
;
5029 status
= U_ZERO_ERROR
;
5030 u_uastrncpy(text1
, "Replace xaax x1x x...x.", sizeof(text1
)/2);
5031 u_uastrncpy(text2
, "No match here.", sizeof(text2
)/2);
5032 regextst_openUTF8FromInvariant(&replText
, "<$1>", -1, &status
);
5034 re
= uregex_openC("x(.*?)x", 0, NULL
, &status
);
5037 /* Normal case, with match */
5038 uregex_setText(re
, text1
, -1, &status
);
5039 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5040 result
= uregex_replaceFirstUText(re
, &replText
, &bufferText
, &status
);
5042 REGEX_ASSERT(result
== &bufferText
);
5043 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result
);
5045 /* No match. Text should copy to output with no changes. */
5046 uregex_setText(re
, text2
, -1, &status
);
5047 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5048 result
= uregex_replaceFirstUText(re
, &replText
, &bufferText
, &status
);
5050 REGEX_ASSERT(result
== &bufferText
);
5051 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result
);
5053 /* Unicode escapes */
5054 uregex_setText(re
, text1
, -1, &status
);
5055 regextst_openUTF8FromInvariant(&replText
, "\\\\\\u0041$1\\U00000042$\\a", -1, &status
);
5056 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5057 result
= uregex_replaceFirstUText(re
, &replText
, &bufferText
, &status
);
5059 REGEX_ASSERT(result
== &bufferText
);
5060 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result
);
5063 utext_close(&replText
);
5073 UText replText
= UTEXT_INITIALIZER
;
5076 status
= U_ZERO_ERROR
;
5077 u_uastrncpy(text1
, "Replace xaax x1x x...x.", sizeof(text1
)/2);
5078 u_uastrncpy(text2
, "No match here.", sizeof(text2
)/2);
5079 regextst_openUTF8FromInvariant(&replText
, "<$1>", -1, &status
);
5081 re
= uregex_openC("x(.*?)x", 0, NULL
, &status
);
5084 /* Normal case, with match */
5085 uregex_setText(re
, text1
, -1, &status
);
5086 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5087 result
= uregex_replaceAllUText(re
, &replText
, &bufferText
, &status
);
5089 REGEX_ASSERT(result
== &bufferText
);
5090 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result
);
5092 /* No match. Text should copy to output with no changes. */
5093 uregex_setText(re
, text2
, -1, &status
);
5094 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5095 result
= uregex_replaceAllUText(re
, &replText
, &bufferText
, &status
);
5097 REGEX_ASSERT(result
== &bufferText
);
5098 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result
);
5101 utext_close(&replText
);
5106 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5107 * so we don't need to test it here.
5110 utext_close(&bufferText
);
5111 utext_close(&patternText
);
5114 //--------------------------------------------------------------
5116 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
5118 //---------------------------------------------------------------
5119 void RegexTest::Bug7651() {
5120 UnicodeString
pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5121 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5122 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5123 UnicodeString
pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5124 UnicodeString
s("#ff @abcd This is test");
5125 RegexPattern
*REPattern
= NULL
;
5126 RegexMatcher
*REMatcher
= NULL
;
5127 UErrorCode status
= U_ZERO_ERROR
;
5130 REPattern
= RegexPattern::compile(pattern1
, 0, pe
, status
);
5132 REMatcher
= REPattern
->matcher(s
, status
);
5134 REGEX_ASSERT(REMatcher
->find());
5135 REGEX_ASSERT(REMatcher
->start(status
) == 0);
5138 status
= U_ZERO_ERROR
;
5140 REPattern
= RegexPattern::compile(pattern2
, 0, pe
, status
);
5142 REMatcher
= REPattern
->matcher(s
, status
);
5144 REGEX_ASSERT(REMatcher
->find());
5145 REGEX_ASSERT(REMatcher
->start(status
) == 0);
5148 status
= U_ZERO_ERROR
;
5151 void RegexTest::Bug7740() {
5152 UErrorCode status
= U_ZERO_ERROR
;
5153 UnicodeString pattern
= "(a)";
5154 UnicodeString text
= "abcdef";
5155 RegexMatcher
*m
= new RegexMatcher(pattern
, text
, 0, status
);
5157 REGEX_ASSERT(m
->lookingAt(status
));
5159 status
= U_ILLEGAL_ARGUMENT_ERROR
;
5160 UnicodeString s
= m
->group(1, status
); // Bug 7740: segfault here.
5161 REGEX_ASSERT(status
== U_ILLEGAL_ARGUMENT_ERROR
);
5162 REGEX_ASSERT(s
== "");
5166 // Bug 8479: was crashing whith a Bogus UnicodeString as input.
5168 void RegexTest::Bug8479() {
5169 UErrorCode status
= U_ZERO_ERROR
;
5171 RegexMatcher
* const pMatcher
= new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL
|UREGEX_CASE_INSENSITIVE
, status
);
5173 if (U_SUCCESS(status
))
5177 pMatcher
->reset(str
);
5178 status
= U_ZERO_ERROR
;
5179 pMatcher
->matches(status
);
5180 REGEX_ASSERT(status
== U_ILLEGAL_ARGUMENT_ERROR
);
// Regression test for Bug 7029: splits the text "abc.def" with the pattern "."
// into an array of 10 destination strings and asserts that split() reports
// 8 fields.
5187 void RegexTest::Bug7029() {
5188 UErrorCode status
= U_ZERO_ERROR
;
// Pattern "." with no flags; the input text is supplied later via split().
5190 RegexMatcher
* const pMatcher
= new RegexMatcher(".", 0, status
);
5191 UnicodeString text
= "abc.def";
// Destination array; its capacity (10) is passed to split() below.
5192 UnicodeString splits
[10];
5194 int32_t numFields
= pMatcher
->split(text
, splits
, 10, status
);
// Presumably each of the 7 characters of the text acts as a delimiter,
// giving 8 fields - TODO confirm against the split() documentation.
5196 REGEX_ASSERT(numFields
== 8);
5201 // This test is checking for the existence of any supplemental characters that case-fold
5202 // to a bmp character.
5204 // At the time of this writing there are none. If any should appear in a subsequent release
5205 // of Unicode, the code in regular expressions compilation that determines the longest
5206 // possible match for a literal string will need to be enhanced.
5208 // See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5209 // for details on what to do in case of a failure of this test.
// Builds the set of code points in U+10000..U+10FFFF that also have the
// CWCF property, then walks the set by index and asserts that the case
// folding of each member has length() >= 2.
// The whole body is compiled only when UCONFIG_NO_NORMALIZATION is not set.
5211 void RegexTest::Bug9283() {
5212 #if !UCONFIG_NO_NORMALIZATION
5213 UErrorCode status
= U_ZERO_ERROR
;
// Intersection of the [:CWCF:] property set with the supplementary range.
5214 UnicodeSet
supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status
);
// NOTE(review): the declarations of `index` and `c`, and the loop's exit
// condition, are on lines not visible in this view.
5218 for (index
=0; ; index
++) {
5219 c
= supplementalsWithCaseFolding
.charAt(index
);
// Fold the single character and require the result to be at least two
// units long (presumably UTF-16 code units, i.e. not a lone BMP
// character - confirm against UnicodeString::length()).
5223 UnicodeString cf
= UnicodeString(c
).foldCase();
5224 REGEX_ASSERT(cf
.length() >= 2);
5226 #endif /* #if !UCONFIG_NO_NORMALIZATION */
// Compares inv_next against INV_BUFSIZ and reports the result: an error
// asking for INV_BUFSIZ to be increased when inv_next has reached or
// exceeded it, and a log line with the current size and usage.
5230 void RegexTest::CheckInvBufSize() {
5231 if(inv_next
>=INV_BUFSIZ
) {
// Failure path: the message carries the file name, the current
// INV_BUFSIZ and the value of inv_next.
5232 errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5233 __FILE__
, INV_BUFSIZ
, inv_next
);
// NOTE(review): presumably this is the else branch of the check above;
// the line separating the two calls is not visible in this view.
5235 logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__
, INV_BUFSIZ
, inv_next
);
// Regression test for Bug 10459: calls uregex_group() on a regular expression
// created from a UText before any match operation has been performed, and
// asserts that the call reports U_REGEX_INVALID_STATE and returns length 0.
5240 void RegexTest::Bug10459() {
5241 UErrorCode status
= U_ZERO_ERROR
;
5242 UnicodeString
patternString("(txt)");
5243 UnicodeString
txtString("txt");
// Wrap the pattern string and the input string as UText objects.
5245 UText
*utext_pat
= utext_openUnicodeString(NULL
, &patternString
, &status
);
5247 UText
*utext_txt
= utext_openUnicodeString(NULL
, &txtString
, &status
);
// Open the regular expression from the UText pattern (flags 0, NULL for
// the third argument).
5250 URegularExpression
*icu_re
= uregex_openUText(utext_pat
, 0, NULL
, &status
);
// Attach the input text; no find/match call is made before uregex_group().
5253 uregex_setUText(icu_re
, utext_txt
, &status
);
5256 // The bug was that calling uregex_group() before doing a matching operation
5257 // was causing a segfault. Only for Regular Expressions created from UText.
5258 // It should set an U_REGEX_INVALID_STATE.
// NOTE(review): `buf` is declared on a line not visible in this view;
// LENGTHOF(buf) presumably yields its element count - confirm.
5261 int32_t len
= uregex_group(icu_re
, 0, buf
, LENGTHOF(buf
), &status
);
5262 REGEX_ASSERT(status
== U_REGEX_INVALID_STATE
);
5263 REGEX_ASSERT(len
== 0);
// Release the regular expression and both UText wrappers.
5265 uregex_close(icu_re
);
5266 utext_close(utext_pat
);
5267 utext_close(utext_txt
);
5270 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */