1 /********************************************************************
3 * Copyright (c) 2002-2012, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
10 // ICU Regular Expressions test, part of intltest.
16 PLEASE be careful about ASCII assumptions in this test.
17 This test is one of the worst repeat offenders.
18 If you have questions, contact someone on the ICU PMC
19 who has access to an EBCDIC system.
24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
26 #include "unicode/regex.h"
27 #include "unicode/uchar.h"
28 #include "unicode/ucnv.h"
29 #include "unicode/uniset.h"
30 #include "unicode/ustring.h"
40 #define SUPPORT_MUTATING_INPUT_STRING 0
42 //---------------------------------------------------------------------------
44 // Test class boilerplate
46 //---------------------------------------------------------------------------
47 RegexTest::RegexTest()
52 RegexTest::~RegexTest()
58 void RegexTest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* /*par*/ )
60 if (exec
) logln("TestSuite RegexTest: ");
63 case 0: name
= "Basic";
66 case 1: name
= "API_Match";
67 if (exec
) API_Match();
69 case 2: name
= "API_Replace";
70 if (exec
) API_Replace();
72 case 3: name
= "API_Pattern";
73 if (exec
) API_Pattern();
76 #if !UCONFIG_NO_FILE_IO
83 case 5: name
= "Errors";
86 case 6: name
= "PerlTests";
87 if (exec
) PerlTests();
89 case 7: name
= "Callbacks";
90 if (exec
) Callbacks();
92 case 8: name
= "FindProgressCallbacks";
93 if (exec
) FindProgressCallbacks();
95 case 9: name
= "Bug 6149";
98 case 10: name
= "UTextBasic";
99 if (exec
) UTextBasic();
101 case 11: name
= "API_Match_UTF8";
102 if (exec
) API_Match_UTF8();
104 case 12: name
= "API_Replace_UTF8";
105 if (exec
) API_Replace_UTF8();
107 case 13: name
= "API_Pattern_UTF8";
108 if (exec
) API_Pattern_UTF8();
110 case 14: name
= "PerlTestsUTF8";
111 if (exec
) PerlTestsUTF8();
113 case 15: name
= "PreAllocatedUTextCAPI";
114 if (exec
) PreAllocatedUTextCAPI();
116 case 16: name
= "Bug 7651";
119 case 17: name
= "Bug 7740";
122 case 18: name
= "Bug 8479";
125 case 19: name
= "Bug 7029";
128 case 20: name
= "CheckInvBufSize";
129 if (exec
) CheckInvBufSize();
131 case 21: name
= "Bug 9283";
136 break; //needed to end loop
143 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
145 * @see utext_openUTF8
147 static UText
* regextst_openUTF8FromInvariant(UText
* ut
, const char *inv
, int64_t length
, UErrorCode
*status
);
149 //---------------------------------------------------------------------------
151 // Error Checking / Reporting macros used in all of the tests.
153 //---------------------------------------------------------------------------
155 static void utextToPrintable(char *buf
, int32_t bufLen
, UText
*text
) {
156 int64_t oldIndex
= utext_getNativeIndex(text
);
157 utext_setNativeIndex(text
, 0);
159 UChar32 c
= utext_next32From(text
, 0);
160 while ((c
!= U_SENTINEL
) && (bufPtr
< buf
+bufLen
)) {
161 if (0x000020<=c
&& c
<0x00007e) {
165 sprintf(bufPtr
,"U+%04X", c
);
166 bufPtr
+= strlen(bufPtr
)-1;
172 c
= UTEXT_NEXT32(text
);
175 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
176 char *ebuf
= (char*)malloc(bufLen
);
177 uprv_eastrncpy((unsigned char*)ebuf
, (const unsigned char*)buf
, bufLen
);
178 uprv_strncpy(buf
, ebuf
, bufLen
);
181 utext_setNativeIndex(text
, oldIndex
);
185 static char ASSERT_BUF
[1024];
187 const char* RegexTest::extractToAssertBuf(const UnicodeString
& message
) {
188 if(message
.length()==0) {
189 strcpy(ASSERT_BUF
, "[[empty UnicodeString]]");
192 IntlTest::prettify(message
,buf
);
193 if(buf
.length()==0) {
194 strcpy(ASSERT_BUF
, "[[escape() returned 0 chars]]");
196 buf
.extract(0, 0x7FFFFFFF, ASSERT_BUF
, sizeof(ASSERT_BUF
)-1);
197 if(ASSERT_BUF
[0]==0) {
199 for(int32_t i
=0;i
<buf
.length();i
++) {
201 sprintf(ASSERT_BUF
+strlen(ASSERT_BUF
),"\\u%02x",ch
);
206 ASSERT_BUF
[sizeof(ASSERT_BUF
)-1] = 0;
211 #define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}
213 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure. status=%s", \
214 __FILE__, __LINE__, u_errorName(status)); return;}}
216 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}
218 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
219 if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \
220 __LINE__, u_errorName(errcode), u_errorName(status));};}
222 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
223 "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }}
225 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
226 errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
228 #define REGEX_ASSERT_UNISTR(ustr,inv) {if (!(ustr==inv)) {errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s,%s) failed \n", __FILE__, __LINE__, extractToAssertBuf(ustr),inv);};}
231 static UBool
testUTextEqual(UText
*uta
, UText
*utb
) {
234 utext_setNativeIndex(uta
, 0);
235 utext_setNativeIndex(utb
, 0);
237 ca
= utext_next32(uta
);
238 cb
= utext_next32(utb
);
242 } while (ca
!= U_SENTINEL
);
248 * @param expected expected text in UTF-8 (not platform) codepage
250 void RegexTest::assertUText(const char *expected
, UText
*actual
, const char *file
, int line
) {
251 UErrorCode status
= U_ZERO_ERROR
;
252 UText expectedText
= UTEXT_INITIALIZER
;
253 utext_openUTF8(&expectedText
, expected
, -1, &status
);
254 if(U_FAILURE(status
)) {
255 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file
, line
, u_errorName(status
), strlen(expected
));
258 if(utext_nativeLength(&expectedText
)==0 && (strlen(expected
)!=0)) {
259 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file
, line
, strlen(expected
));
262 utext_setNativeIndex(actual
, 0);
263 if (!testUTextEqual(&expectedText
, actual
)) {
264 char buf
[201 /*21*/];
265 char expectedBuf
[201];
266 utextToPrintable(buf
, sizeof(buf
)/sizeof(buf
[0]), actual
);
267 utextToPrintable(expectedBuf
, sizeof(expectedBuf
)/sizeof(expectedBuf
[0]), &expectedText
);
268 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file
, line
, expectedBuf
, (int)utext_nativeLength(&expectedText
), buf
, (int)utext_nativeLength(actual
));
270 utext_close(&expectedText
);
273 * @param expected invariant (platform local text) input
276 void RegexTest::assertUTextInvariant(const char *expected
, UText
*actual
, const char *file
, int line
) {
277 UErrorCode status
= U_ZERO_ERROR
;
278 UText expectedText
= UTEXT_INITIALIZER
;
279 regextst_openUTF8FromInvariant(&expectedText
, expected
, -1, &status
);
280 if(U_FAILURE(status
)) {
281 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file
, line
, u_errorName(status
), strlen(expected
));
284 utext_setNativeIndex(actual
, 0);
285 if (!testUTextEqual(&expectedText
, actual
)) {
286 char buf
[201 /*21*/];
287 char expectedBuf
[201];
288 utextToPrintable(buf
, sizeof(buf
)/sizeof(buf
[0]), actual
);
289 utextToPrintable(expectedBuf
, sizeof(expectedBuf
)/sizeof(expectedBuf
[0]), &expectedText
);
290 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file
, line
, expectedBuf
, (int)utext_nativeLength(&expectedText
), buf
, (int)utext_nativeLength(actual
));
292 utext_close(&expectedText
);
296 * Assumes utf-8 input
298 #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
300 * Assumes Invariant input
302 #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
305 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
306 * passed into utext_openUTF8. An error will be given if
307 * INV_BUFSIZ is too small. It's only used on EBCDIC systems.
310 #define INV_BUFSIZ 2048 /* increase this if too small */
312 static int64_t inv_next
=0;
314 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
315 static char inv_buf
[INV_BUFSIZ
];
318 static UText
* regextst_openUTF8FromInvariant(UText
*ut
, const char *inv
, int64_t length
, UErrorCode
*status
) {
319 if(length
==-1) length
=strlen(inv
);
320 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
322 return utext_openUTF8(ut
, inv
, length
, status
);
324 if(inv_next
+length
+1>INV_BUFSIZ
) {
325 fprintf(stderr
, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
326 __FILE__
, __LINE__
, INV_BUFSIZ
, (inv_next
+length
+1));
327 *status
= U_MEMORY_ALLOCATION_ERROR
;
331 unsigned char *buf
= (unsigned char*)inv_buf
+inv_next
;
332 uprv_aestrncpy(buf
, (const uint8_t*)inv
, length
);
336 fprintf(stderr
, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ
, inv_next
);
339 return utext_openUTF8(ut
, (const char*)buf
, length
, status
);
344 //---------------------------------------------------------------------------
346 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests
347 // for the LookingAt() and Match() functions.
350 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
352 // The expected results are UBool - TRUE or FALSE.
353 // The input text is unescaped. The pattern is not.
356 //---------------------------------------------------------------------------
358 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
360 UBool
RegexTest::doRegexLMTest(const char *pat
, const char *text
, UBool looking
, UBool match
, int32_t line
) {
361 const UnicodeString
pattern(pat
, -1, US_INV
);
362 const UnicodeString
inputText(text
, -1, US_INV
);
363 UErrorCode status
= U_ZERO_ERROR
;
365 RegexPattern
*REPattern
= NULL
;
366 RegexMatcher
*REMatcher
= NULL
;
369 UnicodeString
patString(pat
, -1, US_INV
);
370 REPattern
= RegexPattern::compile(patString
, 0, pe
, status
);
371 if (U_FAILURE(status
)) {
372 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
373 line
, u_errorName(status
));
376 if (line
==376) { RegexPatternDump(REPattern
);}
378 UnicodeString
inputString(inputText
);
379 UnicodeString unEscapedInput
= inputString
.unescape();
380 REMatcher
= REPattern
->matcher(unEscapedInput
, status
);
381 if (U_FAILURE(status
)) {
382 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
383 line
, u_errorName(status
));
388 actualmatch
= REMatcher
->lookingAt(status
);
389 if (U_FAILURE(status
)) {
390 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
391 line
, u_errorName(status
));
394 if (actualmatch
!= looking
) {
395 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line
);
399 status
= U_ZERO_ERROR
;
400 actualmatch
= REMatcher
->matches(status
);
401 if (U_FAILURE(status
)) {
402 errln("RegexTest failure in matches() at line %d. Status = %s\n",
403 line
, u_errorName(status
));
406 if (actualmatch
!= match
) {
407 errln("RegexTest: wrong return from matches() at line %d.\n", line
);
411 if (retVal
== FALSE
) {
412 RegexPatternDump(REPattern
);
421 UBool
RegexTest::doRegexLMTestUTF8(const char *pat
, const char *text
, UBool looking
, UBool match
, int32_t line
) {
422 UText pattern
= UTEXT_INITIALIZER
;
423 int32_t inputUTF8Length
;
424 char *textChars
= NULL
;
425 UText inputText
= UTEXT_INITIALIZER
;
426 UErrorCode status
= U_ZERO_ERROR
;
428 RegexPattern
*REPattern
= NULL
;
429 RegexMatcher
*REMatcher
= NULL
;
432 regextst_openUTF8FromInvariant(&pattern
, pat
, -1, &status
);
433 REPattern
= RegexPattern::compile(&pattern
, 0, pe
, status
);
434 if (U_FAILURE(status
)) {
435 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
436 line
, u_errorName(status
));
440 UnicodeString
inputString(text
, -1, US_INV
);
441 UnicodeString unEscapedInput
= inputString
.unescape();
442 LocalUConverterPointer
UTF8Converter(ucnv_open("UTF8", &status
));
443 ucnv_setFromUCallBack(UTF8Converter
.getAlias(), UCNV_FROM_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &status
);
445 inputUTF8Length
= unEscapedInput
.extract(NULL
, 0, UTF8Converter
.getAlias(), status
);
446 if (U_FAILURE(status
) && status
!= U_BUFFER_OVERFLOW_ERROR
) {
447 // UTF-8 does not allow unpaired surrogates, so this could actually happen
448 logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line
, u_errorName(status
));
449 return TRUE
; // not a failure of the Regex engine
451 status
= U_ZERO_ERROR
; // buffer overflow
452 textChars
= new char[inputUTF8Length
+1];
453 unEscapedInput
.extract(textChars
, inputUTF8Length
+1, UTF8Converter
.getAlias(), status
);
454 utext_openUTF8(&inputText
, textChars
, inputUTF8Length
, &status
);
456 REMatcher
= &REPattern
->matcher(status
)->reset(&inputText
);
457 if (U_FAILURE(status
)) {
458 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
459 line
, u_errorName(status
));
464 actualmatch
= REMatcher
->lookingAt(status
);
465 if (U_FAILURE(status
)) {
466 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
467 line
, u_errorName(status
));
470 if (actualmatch
!= looking
) {
471 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line
);
475 status
= U_ZERO_ERROR
;
476 actualmatch
= REMatcher
->matches(status
);
477 if (U_FAILURE(status
)) {
478 errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
479 line
, u_errorName(status
));
482 if (actualmatch
!= match
) {
483 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line
);
487 if (retVal
== FALSE
) {
488 RegexPatternDump(REPattern
);
493 utext_close(&inputText
);
494 utext_close(&pattern
);
501 //---------------------------------------------------------------------------
503 // REGEX_ERR Macro + invocation function to simplify writing
504 // regex tests for incorrect patterns
507 // REGEX_ERR("pattern", expected error line, column, expected status);
509 //---------------------------------------------------------------------------
510 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
512 void RegexTest::regex_err(const char *pat
, int32_t errLine
, int32_t errCol
,
513 UErrorCode expectedStatus
, int32_t line
) {
514 UnicodeString
pattern(pat
);
516 UErrorCode status
= U_ZERO_ERROR
;
518 RegexPattern
*callerPattern
= NULL
;
521 // Compile the caller's pattern
523 UnicodeString
patString(pat
);
524 callerPattern
= RegexPattern::compile(patString
, 0, pe
, status
);
525 if (status
!= expectedStatus
) {
526 dataerrln("Line %d: unexpected error %s compiling pattern.", line
, u_errorName(status
));
528 if (status
!= U_ZERO_ERROR
) {
529 if (pe
.line
!= errLine
|| pe
.offset
!= errCol
) {
530 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
531 line
, errLine
, errCol
, pe
.line
, pe
.offset
);
536 delete callerPattern
;
539 // Compile again, using a UTF-8-based UText
541 UText patternText
= UTEXT_INITIALIZER
;
542 regextst_openUTF8FromInvariant(&patternText
, pat
, -1, &status
);
543 callerPattern
= RegexPattern::compile(&patternText
, 0, pe
, status
);
544 if (status
!= expectedStatus
) {
545 dataerrln("Line %d: unexpected error %s compiling pattern.", line
, u_errorName(status
));
547 if (status
!= U_ZERO_ERROR
) {
548 if (pe
.line
!= errLine
|| pe
.offset
!= errCol
) {
549 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
550 line
, errLine
, errCol
, pe
.line
, pe
.offset
);
555 delete callerPattern
;
556 utext_close(&patternText
);
561 //---------------------------------------------------------------------------
563 // Basic Check for basic functionality of regex pattern matching.
564 // Avoid the use of REGEX_FIND test macro, which has
565 // substantial dependencies on basic Regex functionality.
567 //---------------------------------------------------------------------------
568 void RegexTest::Basic() {
572 // Debug - slide failing test cases early
576 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
578 UErrorCode status
= U_ZERO_ERROR
;
579 RegexPattern
*pattern
;
580 pattern
= RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE
, pe
, status
);
581 RegexPatternDump(pattern
);
582 RegexMatcher
*m
= pattern
->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status
);
583 UBool result
= m
->find();
584 printf("result = %d\n", result
);
585 // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
586 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
593 // Pattern with parentheses
595 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE
, FALSE
);
596 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE
, TRUE
);
597 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE
, FALSE
);
602 REGEX_TESTLM("st(abc)*ring", "string", TRUE
, TRUE
);
603 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE
, TRUE
);
604 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE
, TRUE
);
605 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE
, FALSE
);
606 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE
, FALSE
);
608 REGEX_TESTLM("a*", "", TRUE
, TRUE
);
609 REGEX_TESTLM("a*", "b", TRUE
, FALSE
);
615 REGEX_TESTLM(".", "abc", TRUE
, FALSE
);
616 REGEX_TESTLM("...", "abc", TRUE
, TRUE
);
617 REGEX_TESTLM("....", "abc", FALSE
, FALSE
);
618 REGEX_TESTLM(".*", "abcxyz123", TRUE
, TRUE
);
619 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE
, FALSE
);
620 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE
, TRUE
);
621 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE
, TRUE
);
622 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE
, FALSE
);
625 // Patterns with * applied to chars at end of literal string
627 REGEX_TESTLM("abc*", "ab", TRUE
, TRUE
);
628 REGEX_TESTLM("abc*", "abccccc", TRUE
, TRUE
);
631 // Supplemental chars match as single chars, not a pair of surrogates.
633 REGEX_TESTLM(".", "\\U00011000", TRUE
, TRUE
);
634 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE
, TRUE
);
635 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE
, FALSE
);
639 // UnicodeSets in the pattern
641 REGEX_TESTLM("[1-6]", "1", TRUE
, TRUE
);
642 REGEX_TESTLM("[1-6]", "3", TRUE
, TRUE
);
643 REGEX_TESTLM("[1-6]", "7", FALSE
, FALSE
);
644 REGEX_TESTLM("a[1-6]", "a3", TRUE
, TRUE
);
645 REGEX_TESTLM("a[1-6]", "a3", TRUE
, TRUE
);
646 REGEX_TESTLM("a[1-6]b", "a3b", TRUE
, TRUE
);
648 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE
, TRUE
);
649 REGEX_TESTLM("a[0-9]*b", "abc", TRUE
, FALSE
);
650 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE
, TRUE
);
651 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE
, FALSE
); // note that * matches 0 occurrences.
652 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE
, TRUE
);
655 // OR operator in patterns
657 REGEX_TESTLM("(a|b)", "a", TRUE
, TRUE
);
658 REGEX_TESTLM("(a|b)", "b", TRUE
, TRUE
);
659 REGEX_TESTLM("(a|b)", "c", FALSE
, FALSE
);
660 REGEX_TESTLM("a|b", "b", TRUE
, TRUE
);
662 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE
, TRUE
);
663 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE
, FALSE
);
664 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE
, TRUE
);
665 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE
, TRUE
);
666 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE
, TRUE
);
667 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE
, FALSE
);
672 REGEX_TESTLM("ab+", "abbc", TRUE
, FALSE
);
673 REGEX_TESTLM("ab+c", "ac", FALSE
, FALSE
);
674 REGEX_TESTLM("b+", "", FALSE
, FALSE
);
675 REGEX_TESTLM("(abc|def)+", "defabc", TRUE
, TRUE
);
676 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE
, FALSE
);
677 REGEX_TESTLM(".+y", "zippity dooy", TRUE
, TRUE
);
682 REGEX_TESTLM("ab?", "ab", TRUE
, TRUE
);
683 REGEX_TESTLM("ab?", "a", TRUE
, TRUE
);
684 REGEX_TESTLM("ab?", "ac", TRUE
, FALSE
);
685 REGEX_TESTLM("ab?", "abb", TRUE
, FALSE
);
686 REGEX_TESTLM("a(b|c)?d", "abd", TRUE
, TRUE
);
687 REGEX_TESTLM("a(b|c)?d", "acd", TRUE
, TRUE
);
688 REGEX_TESTLM("a(b|c)?d", "ad", TRUE
, TRUE
);
689 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE
, FALSE
);
690 REGEX_TESTLM("a(b|c)?d", "ab", FALSE
, FALSE
);
693 // Escape sequences that become single literal chars, handled internally
694 // by ICU's Unescape.
697 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
698 REGEX_TESTLM("\\a", "\\u0007", TRUE
, TRUE
); // BEL
699 REGEX_TESTLM("\\cL", "\\u000c", TRUE
, TRUE
); // Control-L
700 REGEX_TESTLM("\\e", "\\u001b", TRUE
, TRUE
); // Escape
701 REGEX_TESTLM("\\f", "\\u000c", TRUE
, TRUE
); // Form Feed
702 REGEX_TESTLM("\\n", "\\u000a", TRUE
, TRUE
); // new line
703 REGEX_TESTLM("\\r", "\\u000d", TRUE
, TRUE
); // CR
704 REGEX_TESTLM("\\t", "\\u0009", TRUE
, TRUE
); // Tab
705 REGEX_TESTLM("\\u1234", "\\u1234", TRUE
, TRUE
);
706 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE
, TRUE
);
708 REGEX_TESTLM(".*\\Ax", "xyz", TRUE
, FALSE
); // \A matches only at the beginning of input
709 REGEX_TESTLM(".*\\Ax", " xyz", FALSE
, FALSE
); // \A matches only at the beginning of input
711 // Escape of special chars in patterns
712 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE
, TRUE
);
716 //---------------------------------------------------------------------------
718 // UTextBasic Check for quirks that are specific to the UText
721 //---------------------------------------------------------------------------
722 void RegexTest::UTextBasic() {
723 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
724 UErrorCode status
= U_ZERO_ERROR
;
725 UText pattern
= UTEXT_INITIALIZER
;
726 utext_openUTF8(&pattern
, str_abc
, -1, &status
);
727 RegexMatcher
matcher(&pattern
, 0, status
);
730 UText input
= UTEXT_INITIALIZER
;
731 utext_openUTF8(&input
, str_abc
, -1, &status
);
733 matcher
.reset(&input
);
735 REGEX_ASSERT_UTEXT_UTF8(str_abc
, matcher
.inputText());
737 matcher
.reset(matcher
.inputText());
739 REGEX_ASSERT_UTEXT_UTF8(str_abc
, matcher
.inputText());
741 utext_close(&pattern
);
746 //---------------------------------------------------------------------------
748 // API_Match Test that the API for class RegexMatcher
749 // is present and nominally working, but excluding functions
750 // implementing replace operations.
752 //---------------------------------------------------------------------------
753 void RegexTest::API_Match() {
755 UErrorCode status
=U_ZERO_ERROR
;
759 // Debug - slide failing test cases early
768 // Simple pattern compilation
771 UnicodeString
re("abc");
773 pat2
= RegexPattern::compile(re
, flags
, pe
, status
);
776 UnicodeString inStr1
= "abcdef this is a test";
777 UnicodeString instr2
= "not abc";
778 UnicodeString empty
= "";
782 // Matcher creation and reset.
784 RegexMatcher
*m1
= pat2
->matcher(inStr1
, status
);
786 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
787 REGEX_ASSERT(m1
->input() == inStr1
);
789 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
790 REGEX_ASSERT(m1
->input() == instr2
);
792 REGEX_ASSERT(m1
->input() == inStr1
);
793 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
795 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
796 REGEX_ASSERT(m1
->input() == empty
);
797 REGEX_ASSERT(&m1
->pattern() == pat2
);
800 // reset(pos, status)
803 m1
->reset(4, status
);
805 REGEX_ASSERT(m1
->input() == inStr1
);
806 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
808 m1
->reset(-1, status
);
809 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
810 status
= U_ZERO_ERROR
;
812 m1
->reset(0, status
);
814 status
= U_ZERO_ERROR
;
816 int32_t len
= m1
->input().length();
817 m1
->reset(len
-1, status
);
819 status
= U_ZERO_ERROR
;
821 m1
->reset(len
, status
);
823 status
= U_ZERO_ERROR
;
825 m1
->reset(len
+1, status
);
826 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
827 status
= U_ZERO_ERROR
;
830 // match(pos, status)
833 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
835 REGEX_ASSERT(m1
->matches(3, status
) == FALSE
);
837 REGEX_ASSERT(m1
->matches(5, status
) == FALSE
);
838 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
839 REGEX_ASSERT(m1
->matches(-1, status
) == FALSE
);
840 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
842 // Match() at end of string should fail, but should not
844 status
= U_ZERO_ERROR
;
845 len
= m1
->input().length();
846 REGEX_ASSERT(m1
->matches(len
, status
) == FALSE
);
849 // Match beyond end of string should fail with an error.
850 status
= U_ZERO_ERROR
;
851 REGEX_ASSERT(m1
->matches(len
+1, status
) == FALSE
);
852 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
854 // Successful match at end of string.
856 status
= U_ZERO_ERROR
;
857 RegexMatcher
m("A?", 0, status
); // will match zero length string.
860 len
= inStr1
.length();
861 REGEX_ASSERT(m
.matches(len
, status
) == TRUE
);
864 REGEX_ASSERT(m
.matches(0, status
) == TRUE
);
870 // lookingAt(pos, status)
872 status
= U_ZERO_ERROR
;
873 m1
->reset(instr2
); // "not abc"
874 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
875 REGEX_ASSERT(m1
->lookingAt(5, status
) == FALSE
);
876 REGEX_ASSERT(m1
->lookingAt(3, status
) == FALSE
);
877 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
878 REGEX_ASSERT(m1
->lookingAt(-1, status
) == FALSE
);
879 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
880 status
= U_ZERO_ERROR
;
881 len
= m1
->input().length();
882 REGEX_ASSERT(m1
->lookingAt(len
, status
) == FALSE
);
884 REGEX_ASSERT(m1
->lookingAt(len
+1, status
) == FALSE
);
885 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
894 // RegexMatcher::start();
895 // RegexMatcher::end();
896 // RegexMatcher::groupCount();
901 UErrorCode status
=U_ZERO_ERROR
;
903 UnicodeString
re("01(23(45)67)(.*)");
904 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
906 UnicodeString data
= "0123456789";
908 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
910 REGEX_ASSERT(matcher
->lookingAt(status
) == TRUE
);
911 static const int32_t matchStarts
[] = {0, 2, 4, 8};
912 static const int32_t matchEnds
[] = {10, 8, 6, 10};
914 for (i
=0; i
<4; i
++) {
915 int32_t actualStart
= matcher
->start(i
, status
);
917 if (actualStart
!= matchStarts
[i
]) {
918 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
919 __LINE__
, i
, matchStarts
[i
], actualStart
);
921 int32_t actualEnd
= matcher
->end(i
, status
);
923 if (actualEnd
!= matchEnds
[i
]) {
924 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
925 __LINE__
, i
, matchEnds
[i
], actualEnd
);
929 REGEX_ASSERT(matcher
->start(0, status
) == matcher
->start(status
));
930 REGEX_ASSERT(matcher
->end(0, status
) == matcher
->end(status
));
932 REGEX_ASSERT_FAIL(matcher
->start(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
933 REGEX_ASSERT_FAIL(matcher
->start( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
935 REGEX_ASSERT_FAIL(matcher
->start( 0, status
), U_REGEX_INVALID_STATE
);
937 matcher
->lookingAt(status
);
938 REGEX_ASSERT(matcher
->group(status
) == "0123456789");
939 REGEX_ASSERT(matcher
->group(0, status
) == "0123456789");
940 REGEX_ASSERT(matcher
->group(1, status
) == "234567" );
941 REGEX_ASSERT(matcher
->group(2, status
) == "45" );
942 REGEX_ASSERT(matcher
->group(3, status
) == "89" );
944 REGEX_ASSERT_FAIL(matcher
->group(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
945 REGEX_ASSERT_FAIL(matcher
->group( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
947 REGEX_ASSERT_FAIL(matcher
->group( 0, status
), U_REGEX_INVALID_STATE
);
960 UErrorCode status
=U_ZERO_ERROR
;
962 UnicodeString
re("abc");
963 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
965 UnicodeString data
= ".abc..abc...abc..";
966 // 012345678901234567
968 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
970 REGEX_ASSERT(matcher
->find());
971 REGEX_ASSERT(matcher
->start(status
) == 1);
972 REGEX_ASSERT(matcher
->find());
973 REGEX_ASSERT(matcher
->start(status
) == 6);
974 REGEX_ASSERT(matcher
->find());
975 REGEX_ASSERT(matcher
->start(status
) == 12);
976 REGEX_ASSERT(matcher
->find() == FALSE
);
977 REGEX_ASSERT(matcher
->find() == FALSE
);
980 REGEX_ASSERT(matcher
->find());
981 REGEX_ASSERT(matcher
->start(status
) == 1);
983 REGEX_ASSERT(matcher
->find(0, status
));
984 REGEX_ASSERT(matcher
->start(status
) == 1);
985 REGEX_ASSERT(matcher
->find(1, status
));
986 REGEX_ASSERT(matcher
->start(status
) == 1);
987 REGEX_ASSERT(matcher
->find(2, status
));
988 REGEX_ASSERT(matcher
->start(status
) == 6);
989 REGEX_ASSERT(matcher
->find(12, status
));
990 REGEX_ASSERT(matcher
->start(status
) == 12);
991 REGEX_ASSERT(matcher
->find(13, status
) == FALSE
);
992 REGEX_ASSERT(matcher
->find(16, status
) == FALSE
);
993 REGEX_ASSERT(matcher
->find(17, status
) == FALSE
);
994 REGEX_ASSERT_FAIL(matcher
->start(status
), U_REGEX_INVALID_STATE
);
996 status
= U_ZERO_ERROR
;
997 REGEX_ASSERT_FAIL(matcher
->find(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
998 status
= U_ZERO_ERROR
;
999 REGEX_ASSERT_FAIL(matcher
->find(18, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1001 REGEX_ASSERT(matcher
->groupCount() == 0);
1009 // find, with \G in pattern (true if at the end of a previous match).
1014 UErrorCode status
=U_ZERO_ERROR
;
1016 UnicodeString
re(".*?(?:(\\Gabc)|(abc))", -1, US_INV
);
1017 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
1019 UnicodeString data
= ".abcabc.abc..";
1020 // 012345678901234567
1022 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
1024 REGEX_ASSERT(matcher
->find());
1025 REGEX_ASSERT(matcher
->start(status
) == 0);
1026 REGEX_ASSERT(matcher
->start(1, status
) == -1);
1027 REGEX_ASSERT(matcher
->start(2, status
) == 1);
1029 REGEX_ASSERT(matcher
->find());
1030 REGEX_ASSERT(matcher
->start(status
) == 4);
1031 REGEX_ASSERT(matcher
->start(1, status
) == 4);
1032 REGEX_ASSERT(matcher
->start(2, status
) == -1);
1040 // find with zero length matches, match position should bump ahead
1041 // to prevent loops.
1045 UErrorCode status
=U_ZERO_ERROR
;
1046 RegexMatcher
m("(?= ?)", 0, status
); // This pattern will produce zero-length matches anywhere,
1047 // using an always-true look-ahead.
1049 UnicodeString
s(" ");
1052 if (m
.find() == FALSE
) {
1055 REGEX_ASSERT(m
.start(status
) == i
);
1056 REGEX_ASSERT(m
.end(status
) == i
);
1060 // Check that the bump goes over surrogate pairs OK
1061 s
= UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1065 if (m
.find() == FALSE
) {
1068 REGEX_ASSERT(m
.start(status
) == i
);
1069 REGEX_ASSERT(m
.end(status
) == i
);
1071 REGEX_ASSERT(i
==10);
1074 // find() loop breaking test.
1075 // with pattern of /.?/, should see a series of one char matches, then a single
1076 // match of zero length at the end of the input string.
1078 UErrorCode status
=U_ZERO_ERROR
;
1079 RegexMatcher
m(".?", 0, status
);
1081 UnicodeString
s(" ");
1084 if (m
.find() == FALSE
) {
1087 REGEX_ASSERT(m
.start(status
) == i
);
1088 REGEX_ASSERT(m
.end(status
) == (i
<4 ? i
+1 : i
));
1095 // Matchers with no input string behave as if they had an empty input string.
1099 UErrorCode status
= U_ZERO_ERROR
;
1100 RegexMatcher
m(".?", 0, status
);
1102 REGEX_ASSERT(m
.find());
1103 REGEX_ASSERT(m
.start(status
) == 0);
1104 REGEX_ASSERT(m
.input() == "");
1107 UErrorCode status
= U_ZERO_ERROR
;
1108 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
1109 RegexMatcher
*m
= p
->matcher(status
);
1112 REGEX_ASSERT(m
->find() == FALSE
);
1113 REGEX_ASSERT(m
->input() == "");
1122 UErrorCode status
= U_ZERO_ERROR
;
1123 UnicodeString
testString("This is test data");
1124 RegexMatcher
m(".*", testString
, 0, status
);
1126 REGEX_ASSERT(m
.regionStart() == 0);
1127 REGEX_ASSERT(m
.regionEnd() == testString
.length());
1128 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1129 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1131 m
.region(2,4, status
);
1133 REGEX_ASSERT(m
.matches(status
));
1134 REGEX_ASSERT(m
.start(status
)==2);
1135 REGEX_ASSERT(m
.end(status
)==4);
1139 REGEX_ASSERT(m
.regionStart() == 0);
1140 REGEX_ASSERT(m
.regionEnd() == testString
.length());
1142 UnicodeString
shorterString("short");
1143 m
.reset(shorterString
);
1144 REGEX_ASSERT(m
.regionStart() == 0);
1145 REGEX_ASSERT(m
.regionEnd() == shorterString
.length());
1147 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1148 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(FALSE
));
1149 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
1150 REGEX_ASSERT(&m
== &m
.reset());
1151 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
1153 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(TRUE
));
1154 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1155 REGEX_ASSERT(&m
== &m
.reset());
1156 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1158 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1159 REGEX_ASSERT(&m
== &m
.useTransparentBounds(TRUE
));
1160 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
1161 REGEX_ASSERT(&m
== &m
.reset());
1162 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
1164 REGEX_ASSERT(&m
== &m
.useTransparentBounds(FALSE
));
1165 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1166 REGEX_ASSERT(&m
== &m
.reset());
1167 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1172 // hitEnd() and requireEnd()
1175 UErrorCode status
= U_ZERO_ERROR
;
1176 UnicodeString
testString("aabb");
1177 RegexMatcher
m1(".*", testString
, 0, status
);
1178 REGEX_ASSERT(m1
.lookingAt(status
) == TRUE
);
1179 REGEX_ASSERT(m1
.hitEnd() == TRUE
);
1180 REGEX_ASSERT(m1
.requireEnd() == FALSE
);
1183 status
= U_ZERO_ERROR
;
1184 RegexMatcher
m2("a*", testString
, 0, status
);
1185 REGEX_ASSERT(m2
.lookingAt(status
) == TRUE
);
1186 REGEX_ASSERT(m2
.hitEnd() == FALSE
);
1187 REGEX_ASSERT(m2
.requireEnd() == FALSE
);
1190 status
= U_ZERO_ERROR
;
1191 RegexMatcher
m3(".*$", testString
, 0, status
);
1192 REGEX_ASSERT(m3
.lookingAt(status
) == TRUE
);
1193 REGEX_ASSERT(m3
.hitEnd() == TRUE
);
1194 REGEX_ASSERT(m3
.requireEnd() == TRUE
);
1200 // Compilation error on reset with UChar *
1201 // These were a hazard that people were stumbling over with runtime errors.
1202 // Changed them to compiler errors by adding private methods that more closely
1203 // matched the incorrect use of the functions.
1207 UErrorCode status
= U_ZERO_ERROR
;
1208 UChar ucharString
[20];
1209 RegexMatcher
m(".", 0, status
);
1210 m
.reset(ucharString
); // should not compile.
1212 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
1213 RegexMatcher
*m2
= p
->matcher(ucharString
, status
); // should not compile.
1215 RegexMatcher
m3(".", ucharString
, 0, status
); // Should not compile
1221 // Note: These tests will need to be changed when the regexp engine is
1222 // able to detect and cut short the exponential time behavior on
1223 // this type of match.
1226 UErrorCode status
= U_ZERO_ERROR
;
1227 // Enough 'a's in the string to cause the match to time out.
1228 // (Each additional 'a' doubles the time)
1229 UnicodeString
testString("aaaaaaaaaaaaaaaaaaaaa");
1230 RegexMatcher
matcher("(a+)+b", testString
, 0, status
);
1232 REGEX_ASSERT(matcher
.getTimeLimit() == 0);
1233 matcher
.setTimeLimit(100, status
);
1234 REGEX_ASSERT(matcher
.getTimeLimit() == 100);
1235 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1236 REGEX_ASSERT(status
== U_REGEX_TIME_OUT
);
1239 UErrorCode status
= U_ZERO_ERROR
;
1240 // Few enough 'a's to slip in under the time limit.
1241 UnicodeString
testString("aaaaaaaaaaaaaaaaaa");
1242 RegexMatcher
matcher("(a+)+b", testString
, 0, status
);
1244 matcher
.setTimeLimit(100, status
);
1245 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1253 UErrorCode status
= U_ZERO_ERROR
;
1254 UnicodeString
testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
1256 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1257 // of the '+', and makes the stack frames larger.
1258 RegexMatcher
matcher("(A)+A$", testString
, 0, status
);
1260 // With the default stack, this match should fail to run
1261 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1262 REGEX_ASSERT(status
== U_REGEX_STACK_OVERFLOW
);
1264 // With unlimited stack, it should run
1265 status
= U_ZERO_ERROR
;
1266 matcher
.setStackLimit(0, status
);
1268 REGEX_ASSERT(matcher
.lookingAt(status
) == TRUE
);
1270 REGEX_ASSERT(matcher
.getStackLimit() == 0);
1272 // With a limited stack, the match should fail
1273 status
= U_ZERO_ERROR
;
1274 matcher
.setStackLimit(10000, status
);
1275 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1276 REGEX_ASSERT(status
== U_REGEX_STACK_OVERFLOW
);
1277 REGEX_ASSERT(matcher
.getStackLimit() == 10000);
1280 // A pattern that doesn't save state should work with
1281 // a minimal sized stack
1283 UErrorCode status
= U_ZERO_ERROR
;
1284 UnicodeString testString
= "abc";
1285 RegexMatcher
matcher("abc", testString
, 0, status
);
1287 matcher
.setStackLimit(30, status
);
1289 REGEX_ASSERT(matcher
.matches(status
) == TRUE
);
1291 REGEX_ASSERT(matcher
.getStackLimit() == 30);
1293 // Negative stack sizes should fail
1294 status
= U_ZERO_ERROR
;
1295 matcher
.setStackLimit(1000, status
);
1297 matcher
.setStackLimit(-1, status
);
1298 REGEX_ASSERT(status
== U_ILLEGAL_ARGUMENT_ERROR
);
1299 REGEX_ASSERT(matcher
.getStackLimit() == 1000);
1310 //---------------------------------------------------------------------------
1312 // API_Replace API test for class RegexMatcher, testing the
1313 // Replace family of functions.
1315 //---------------------------------------------------------------------------
1316 void RegexTest::API_Replace() {
1322 UErrorCode status
=U_ZERO_ERROR
;
1324 UnicodeString
re("abc");
1325 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
1327 UnicodeString data
= ".abc..abc...abc..";
1328 // 012345678901234567
1329 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
1332 // Plain vanilla matches.
1335 dest
= matcher
->replaceFirst("yz", status
);
1337 REGEX_ASSERT(dest
== ".yz..abc...abc..");
1339 dest
= matcher
->replaceAll("yz", status
);
1341 REGEX_ASSERT(dest
== ".yz..yz...yz..");
1344 // Plain vanilla non-matches.
1346 UnicodeString d2
= ".abx..abx...abx..";
1348 dest
= matcher
->replaceFirst("yz", status
);
1350 REGEX_ASSERT(dest
== ".abx..abx...abx..");
1352 dest
= matcher
->replaceAll("yz", status
);
1354 REGEX_ASSERT(dest
== ".abx..abx...abx..");
1357 // Empty source string
1359 UnicodeString d3
= "";
1361 dest
= matcher
->replaceFirst("yz", status
);
1363 REGEX_ASSERT(dest
== "");
1365 dest
= matcher
->replaceAll("yz", status
);
1367 REGEX_ASSERT(dest
== "");
1370 // Empty substitution string
1372 matcher
->reset(data
); // ".abc..abc...abc.."
1373 dest
= matcher
->replaceFirst("", status
);
1375 REGEX_ASSERT(dest
== "...abc...abc..");
1377 dest
= matcher
->replaceAll("", status
);
1379 REGEX_ASSERT(dest
== "........");
1382 // match whole string
1384 UnicodeString d4
= "abc";
1386 dest
= matcher
->replaceFirst("xyz", status
);
1388 REGEX_ASSERT(dest
== "xyz");
1390 dest
= matcher
->replaceAll("xyz", status
);
1392 REGEX_ASSERT(dest
== "xyz");
1395 // Capture Group, simple case
1397 UnicodeString
re2("a(..)");
1398 RegexPattern
*pat2
= RegexPattern::compile(re2
, flags
, pe
, status
);
1400 UnicodeString d5
= "abcdefg";
1401 RegexMatcher
*matcher2
= pat2
->matcher(d5
, status
);
1403 dest
= matcher2
->replaceFirst("$1$1", status
);
1405 REGEX_ASSERT(dest
== "bcbcdefg");
1407 dest
= matcher2
->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status
);
1409 REGEX_ASSERT(dest
== "The value of $1 is bc.defg");
1411 dest
= matcher2
->replaceFirst("$ by itself, no group number $$$", status
);
1413 REGEX_ASSERT(dest
== "$ by itself, no group number $$$defg");
1415 UnicodeString replacement
= UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1416 replacement
= replacement
.unescape();
1417 dest
= matcher2
->replaceFirst(replacement
, status
);
1419 REGEX_ASSERT(dest
== "Supplemental Digit 1 bc.defg");
1421 REGEX_ASSERT_FAIL(matcher2
->replaceFirst("bad capture group number $5...",status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1425 // Replacement String with \u hex escapes
1428 UnicodeString src
= "abc 1 abc 2 abc 3";
1429 UnicodeString substitute
= UNICODE_STRING_SIMPLE("--\\u0043--");
1430 matcher
->reset(src
);
1431 UnicodeString result
= matcher
->replaceAll(substitute
, status
);
1433 REGEX_ASSERT(result
== "--C-- 1 --C-- 2 --C-- 3");
1436 UnicodeString src
= "abc !";
1437 UnicodeString substitute
= UNICODE_STRING_SIMPLE("--\\U00010000--");
1438 matcher
->reset(src
);
1439 UnicodeString result
= matcher
->replaceAll(substitute
, status
);
1441 UnicodeString expected
= UnicodeString("--");
1442 expected
.append((UChar32
)0x10000);
1443 expected
.append("-- !");
1444 REGEX_ASSERT(result
== expected
);
1446 // TODO: need more thorough testing of capture substitutions.
1451 status
= U_ZERO_ERROR
;
1452 UnicodeString s
= "The matches start with ss and end with ee ss stuff ee fin";
1453 RegexMatcher
m("ss(.*?)ee", 0, status
);
1455 UnicodeString result
;
1457 // Multiple finds do NOT bump up the previous appendReplacement position.
1461 m
.appendReplacement(result
, "ooh", status
);
1463 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
1465 // After a reset into the interior of a string, appendReplacement still starts at beginning.
1466 status
= U_ZERO_ERROR
;
1468 m
.reset(10, status
);
1471 m
.appendReplacement(result
, "ooh", status
);
1473 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
1475 // find() at interior of string, appendReplacement still starts at beginning.
1476 status
= U_ZERO_ERROR
;
1481 m
.appendReplacement(result
, "ooh", status
);
1483 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
1485 m
.appendTail(result
);
1486 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh fin");
1497 //---------------------------------------------------------------------------
1499 // API_Pattern Test that the API for class RegexPattern is
1500 // present and nominally working.
1502 //---------------------------------------------------------------------------
1503 void RegexTest::API_Pattern() {
1504 RegexPattern pata
; // Test default constructor to not crash.
1507 REGEX_ASSERT(pata
== patb
);
1508 REGEX_ASSERT(pata
== pata
);
1510 UnicodeString
re1("abc[a-l][m-z]");
1511 UnicodeString
re2("def");
1512 UErrorCode status
= U_ZERO_ERROR
;
1515 RegexPattern
*pat1
= RegexPattern::compile(re1
, 0, pe
, status
);
1516 RegexPattern
*pat2
= RegexPattern::compile(re2
, 0, pe
, status
);
1518 REGEX_ASSERT(*pat1
== *pat1
);
1519 REGEX_ASSERT(*pat1
!= pata
);
1523 REGEX_ASSERT(patb
== *pat1
);
1526 RegexPattern
patc(*pat1
);
1527 REGEX_ASSERT(patc
== *pat1
);
1528 REGEX_ASSERT(patb
== patc
);
1529 REGEX_ASSERT(pat1
!= pat2
);
1531 REGEX_ASSERT(patb
!= patc
);
1532 REGEX_ASSERT(patb
== *pat2
);
1534 // Compile with no flags.
1535 RegexPattern
*pat1a
= RegexPattern::compile(re1
, pe
, status
);
1536 REGEX_ASSERT(*pat1a
== *pat1
);
1538 REGEX_ASSERT(pat1a
->flags() == 0);
1540 // Compile with different flags should be not equal
1541 RegexPattern
*pat1b
= RegexPattern::compile(re1
, UREGEX_CASE_INSENSITIVE
, pe
, status
);
1544 REGEX_ASSERT(*pat1b
!= *pat1a
);
1545 REGEX_ASSERT(pat1b
->flags() == UREGEX_CASE_INSENSITIVE
);
1546 REGEX_ASSERT(pat1a
->flags() == 0);
1550 RegexPattern
*pat1c
= pat1
->clone();
1551 REGEX_ASSERT(*pat1c
== *pat1
);
1552 REGEX_ASSERT(*pat1c
!= *pat2
);
1561 // Verify that a matcher created from a cloned pattern works.
1565 UErrorCode status
= U_ZERO_ERROR
;
1566 RegexPattern
*pSource
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status
);
1567 RegexPattern
*pClone
= pSource
->clone();
1569 RegexMatcher
*mFromClone
= pClone
->matcher(status
);
1571 UnicodeString s
= "Hello World";
1572 mFromClone
->reset(s
);
1573 REGEX_ASSERT(mFromClone
->find() == TRUE
);
1574 REGEX_ASSERT(mFromClone
->group(status
) == "Hello");
1575 REGEX_ASSERT(mFromClone
->find() == TRUE
);
1576 REGEX_ASSERT(mFromClone
->group(status
) == "World");
1577 REGEX_ASSERT(mFromClone
->find() == FALSE
);
1583 // matches convenience API
1585 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe
, status
) == TRUE
);
1587 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe
, status
) == FALSE
);
1589 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe
, status
) == TRUE
);
1591 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe
, status
) == TRUE
);
1593 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe
, status
) == FALSE
);
1595 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1596 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe
, status
) == FALSE
);
1597 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1603 status
= U_ZERO_ERROR
;
1604 pat1
= RegexPattern::compile(" +", pe
, status
);
1606 UnicodeString fields
[10];
1609 n
= pat1
->split("Now is the time", fields
, 10, status
);
1612 REGEX_ASSERT(fields
[0]=="Now");
1613 REGEX_ASSERT(fields
[1]=="is");
1614 REGEX_ASSERT(fields
[2]=="the");
1615 REGEX_ASSERT(fields
[3]=="time");
1616 REGEX_ASSERT(fields
[4]=="");
1618 n
= pat1
->split("Now is the time", fields
, 2, status
);
1621 REGEX_ASSERT(fields
[0]=="Now");
1622 REGEX_ASSERT(fields
[1]=="is the time");
1623 REGEX_ASSERT(fields
[2]=="the"); // left over from previous test
1626 status
= U_ZERO_ERROR
;
1627 n
= pat1
->split("Now is the time", fields
, 1, status
);
1630 REGEX_ASSERT(fields
[0]=="Now is the time");
1631 REGEX_ASSERT(fields
[1]=="*");
1632 status
= U_ZERO_ERROR
;
1634 n
= pat1
->split(" Now is the time ", fields
, 10, status
);
1637 REGEX_ASSERT(fields
[0]=="");
1638 REGEX_ASSERT(fields
[1]=="Now");
1639 REGEX_ASSERT(fields
[2]=="is");
1640 REGEX_ASSERT(fields
[3]=="the");
1641 REGEX_ASSERT(fields
[4]=="time");
1642 REGEX_ASSERT(fields
[5]=="");
1644 n
= pat1
->split(" ", fields
, 10, status
);
1647 REGEX_ASSERT(fields
[0]=="");
1648 REGEX_ASSERT(fields
[1]=="");
1651 n
= pat1
->split("", fields
, 10, status
);
1654 REGEX_ASSERT(fields
[0]=="foo");
1658 // split, with a pattern with (capture)
1659 pat1
= RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe
, status
);
1662 status
= U_ZERO_ERROR
;
1663 n
= pat1
->split("<a>Now is <b>the time<c>", fields
, 10, status
);
1666 REGEX_ASSERT(fields
[0]=="");
1667 REGEX_ASSERT(fields
[1]=="a");
1668 REGEX_ASSERT(fields
[2]=="Now is ");
1669 REGEX_ASSERT(fields
[3]=="b");
1670 REGEX_ASSERT(fields
[4]=="the time");
1671 REGEX_ASSERT(fields
[5]=="c");
1672 REGEX_ASSERT(fields
[6]=="");
1673 REGEX_ASSERT(status
==U_ZERO_ERROR
);
1675 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 10, status
);
1678 REGEX_ASSERT(fields
[0]==" ");
1679 REGEX_ASSERT(fields
[1]=="a");
1680 REGEX_ASSERT(fields
[2]=="Now is ");
1681 REGEX_ASSERT(fields
[3]=="b");
1682 REGEX_ASSERT(fields
[4]=="the time");
1683 REGEX_ASSERT(fields
[5]=="c");
1684 REGEX_ASSERT(fields
[6]=="");
1686 status
= U_ZERO_ERROR
;
1688 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 6, status
);
1691 REGEX_ASSERT(fields
[0]==" ");
1692 REGEX_ASSERT(fields
[1]=="a");
1693 REGEX_ASSERT(fields
[2]=="Now is ");
1694 REGEX_ASSERT(fields
[3]=="b");
1695 REGEX_ASSERT(fields
[4]=="the time");
1696 REGEX_ASSERT(fields
[5]==""); // All text following "<c>" field delimiter.
1697 REGEX_ASSERT(fields
[6]=="foo");
1699 status
= U_ZERO_ERROR
;
1701 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 5, status
);
1704 REGEX_ASSERT(fields
[0]==" ");
1705 REGEX_ASSERT(fields
[1]=="a");
1706 REGEX_ASSERT(fields
[2]=="Now is ");
1707 REGEX_ASSERT(fields
[3]=="b");
1708 REGEX_ASSERT(fields
[4]=="the time<c>");
1709 REGEX_ASSERT(fields
[5]=="foo");
1711 status
= U_ZERO_ERROR
;
1713 n
= pat1
->split(" <a>Now is <b>the time", fields
, 5, status
);
1716 REGEX_ASSERT(fields
[0]==" ");
1717 REGEX_ASSERT(fields
[1]=="a");
1718 REGEX_ASSERT(fields
[2]=="Now is ");
1719 REGEX_ASSERT(fields
[3]=="b");
1720 REGEX_ASSERT(fields
[4]=="the time");
1721 REGEX_ASSERT(fields
[5]=="foo");
1723 status
= U_ZERO_ERROR
;
1724 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 4, status
);
1727 REGEX_ASSERT(fields
[0]==" ");
1728 REGEX_ASSERT(fields
[1]=="a");
1729 REGEX_ASSERT(fields
[2]=="Now is ");
1730 REGEX_ASSERT(fields
[3]=="the time<c>");
1731 status
= U_ZERO_ERROR
;
1734 pat1
= RegexPattern::compile("([-,])", pe
, status
);
1736 n
= pat1
->split("1-10,20", fields
, 10, status
);
1739 REGEX_ASSERT(fields
[0]=="1");
1740 REGEX_ASSERT(fields
[1]=="-");
1741 REGEX_ASSERT(fields
[2]=="10");
1742 REGEX_ASSERT(fields
[3]==",");
1743 REGEX_ASSERT(fields
[4]=="20");
1746 // Test split of string with empty trailing fields
1747 pat1
= RegexPattern::compile(",", pe
, status
);
1749 n
= pat1
->split("a,b,c,", fields
, 10, status
);
1752 REGEX_ASSERT(fields
[0]=="a");
1753 REGEX_ASSERT(fields
[1]=="b");
1754 REGEX_ASSERT(fields
[2]=="c");
1755 REGEX_ASSERT(fields
[3]=="");
1757 n
= pat1
->split("a,,,", fields
, 10, status
);
1760 REGEX_ASSERT(fields
[0]=="a");
1761 REGEX_ASSERT(fields
[1]=="");
1762 REGEX_ASSERT(fields
[2]=="");
1763 REGEX_ASSERT(fields
[3]=="");
1766 // Split Separator with zero length match.
1767 pat1
= RegexPattern::compile(":?", pe
, status
);
1769 n
= pat1
->split("abc", fields
, 10, status
);
1772 REGEX_ASSERT(fields
[0]=="");
1773 REGEX_ASSERT(fields
[1]=="a");
1774 REGEX_ASSERT(fields
[2]=="b");
1775 REGEX_ASSERT(fields
[3]=="c");
1776 REGEX_ASSERT(fields
[4]=="");
1781 // RegexPattern::pattern()
1783 pat1
= new RegexPattern();
1784 REGEX_ASSERT(pat1
->pattern() == "");
1787 pat1
= RegexPattern::compile("(Hello, world)*", pe
, status
);
1789 REGEX_ASSERT(pat1
->pattern() == "(Hello, world)*");
1794 // classID functions
1796 pat1
= RegexPattern::compile("(Hello, world)*", pe
, status
);
1798 REGEX_ASSERT(pat1
->getDynamicClassID() == RegexPattern::getStaticClassID());
1799 REGEX_ASSERT(pat1
->getDynamicClassID() != NULL
);
1800 UnicodeString
Hello("Hello, world.");
1801 RegexMatcher
*m
= pat1
->matcher(Hello
, status
);
1802 REGEX_ASSERT(pat1
->getDynamicClassID() != m
->getDynamicClassID());
1803 REGEX_ASSERT(m
->getDynamicClassID() == RegexMatcher::getStaticClassID());
1804 REGEX_ASSERT(m
->getDynamicClassID() != NULL
);
1810 //---------------------------------------------------------------------------
1812 // API_Match_UTF8 Test that the alternate engine for class RegexMatcher
1813 // is present and working, but excluding functions
1814 // implementing replace operations.
1816 //---------------------------------------------------------------------------
1817 void RegexTest::API_Match_UTF8() {
1819 UErrorCode status
=U_ZERO_ERROR
;
1823 // Debug - slide failing test cases early
1832 // Simple pattern compilation
1835 UText re
= UTEXT_INITIALIZER
;
1836 regextst_openUTF8FromInvariant(&re
, "abc", -1, &status
);
1837 REGEX_VERBOSE_TEXT(&re
);
1839 pat2
= RegexPattern::compile(&re
, flags
, pe
, status
);
1842 UText input1
= UTEXT_INITIALIZER
;
1843 UText input2
= UTEXT_INITIALIZER
;
1844 UText empty
= UTEXT_INITIALIZER
;
1845 regextst_openUTF8FromInvariant(&input1
, "abcdef this is a test", -1, &status
);
1846 REGEX_VERBOSE_TEXT(&input1
);
1847 regextst_openUTF8FromInvariant(&input2
, "not abc", -1, &status
);
1848 REGEX_VERBOSE_TEXT(&input2
);
1849 utext_openUChars(&empty
, NULL
, 0, &status
);
1851 int32_t input1Len
= strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1852 int32_t input2Len
= strlen("not abc");
1856 // Matcher creation and reset.
1858 RegexMatcher
*m1
= &pat2
->matcher(status
)->reset(&input1
);
1860 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
1861 const char str_abcdefthisisatest
[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1862 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest
, m1
->inputText());
1864 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
1865 const char str_notabc
[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1866 REGEX_ASSERT_UTEXT_UTF8(str_notabc
, m1
->inputText());
1868 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest
, m1
->inputText());
1869 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
1871 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
1872 REGEX_ASSERT(utext_nativeLength(&empty
) == 0);
1875 // reset(pos, status)
1878 m1
->reset(4, status
);
1880 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest
, m1
->inputText());
1881 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
1883 m1
->reset(-1, status
);
1884 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1885 status
= U_ZERO_ERROR
;
1887 m1
->reset(0, status
);
1889 status
= U_ZERO_ERROR
;
1891 m1
->reset(input1Len
-1, status
);
1893 status
= U_ZERO_ERROR
;
1895 m1
->reset(input1Len
, status
);
1897 status
= U_ZERO_ERROR
;
1899 m1
->reset(input1Len
+1, status
);
1900 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1901 status
= U_ZERO_ERROR
;
1904 // match(pos, status)
1907 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
1909 REGEX_ASSERT(m1
->matches(3, status
) == FALSE
);
1911 REGEX_ASSERT(m1
->matches(5, status
) == FALSE
);
1912 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
1913 REGEX_ASSERT(m1
->matches(-1, status
) == FALSE
);
1914 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1916 // Match() at end of string should fail, but should not
1918 status
= U_ZERO_ERROR
;
1919 REGEX_ASSERT(m1
->matches(input2Len
, status
) == FALSE
);
1922 // Match beyond end of string should fail with an error.
1923 status
= U_ZERO_ERROR
;
1924 REGEX_ASSERT(m1
->matches(input2Len
+1, status
) == FALSE
);
1925 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1927 // Successful match at end of string.
1929 status
= U_ZERO_ERROR
;
1930 RegexMatcher
m("A?", 0, status
); // will match zero length string.
1933 REGEX_ASSERT(m
.matches(input1Len
, status
) == TRUE
);
1936 REGEX_ASSERT(m
.matches(0, status
) == TRUE
);
1942 // lookingAt(pos, status)
1944 status
= U_ZERO_ERROR
;
1945 m1
->reset(&input2
); // "not abc"
1946 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
1947 REGEX_ASSERT(m1
->lookingAt(5, status
) == FALSE
);
1948 REGEX_ASSERT(m1
->lookingAt(3, status
) == FALSE
);
1949 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
1950 REGEX_ASSERT(m1
->lookingAt(-1, status
) == FALSE
);
1951 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1952 status
= U_ZERO_ERROR
;
1953 REGEX_ASSERT(m1
->lookingAt(input2Len
, status
) == FALSE
);
1955 REGEX_ASSERT(m1
->lookingAt(input2Len
+1, status
) == FALSE
);
1956 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1962 utext_close(&input1
);
1963 utext_close(&input2
);
1964 utext_close(&empty
);
1970 // RegexMatcher::start();
1971 // RegexMatcher::end();
1972 // RegexMatcher::groupCount();
1977 UErrorCode status
=U_ZERO_ERROR
;
1978 UText re
=UTEXT_INITIALIZER
;
1979 const char str_01234567_pat
[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1980 utext_openUTF8(&re
, str_01234567_pat
, -1, &status
);
1982 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
1985 UText input
= UTEXT_INITIALIZER
;
1986 const char str_0123456789
[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1987 utext_openUTF8(&input
, str_0123456789
, -1, &status
);
1989 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&input
);
1991 REGEX_ASSERT(matcher
->lookingAt(status
) == TRUE
);
1992 static const int32_t matchStarts
[] = {0, 2, 4, 8};
1993 static const int32_t matchEnds
[] = {10, 8, 6, 10};
1995 for (i
=0; i
<4; i
++) {
1996 int32_t actualStart
= matcher
->start(i
, status
);
1998 if (actualStart
!= matchStarts
[i
]) {
1999 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n",
2000 __FILE__
, __LINE__
, i
, matchStarts
[i
], actualStart
);
2002 int32_t actualEnd
= matcher
->end(i
, status
);
2004 if (actualEnd
!= matchEnds
[i
]) {
2005 errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n",
2006 __FILE__
, __LINE__
, i
, matchEnds
[i
], actualEnd
);
2010 REGEX_ASSERT(matcher
->start(0, status
) == matcher
->start(status
));
2011 REGEX_ASSERT(matcher
->end(0, status
) == matcher
->end(status
));
2013 REGEX_ASSERT_FAIL(matcher
->start(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2014 REGEX_ASSERT_FAIL(matcher
->start( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2016 REGEX_ASSERT_FAIL(matcher
->start( 0, status
), U_REGEX_INVALID_STATE
);
2018 matcher
->lookingAt(status
);
2021 UText destText
= UTEXT_INITIALIZER
;
2022 utext_openUnicodeString(&destText
, &dest
, &status
);
2024 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2025 // Test shallow-clone API
2027 result
= matcher
->group((UText
*)NULL
, group_len
, status
);
2029 REGEX_ASSERT_UTEXT_UTF8(str_0123456789
, result
);
2030 utext_close(result
);
2031 result
= matcher
->group(0, &destText
, group_len
, status
);
2033 REGEX_ASSERT(result
== &destText
);
2034 REGEX_ASSERT_UTEXT_UTF8(str_0123456789
, result
);
2035 // destText is now immutable, reopen it
2036 utext_close(&destText
);
2037 utext_openUnicodeString(&destText
, &dest
, &status
);
2039 result
= matcher
->group(0, NULL
, status
);
2041 REGEX_ASSERT_UTEXT_UTF8(str_0123456789
, result
);
2042 utext_close(result
);
2043 result
= matcher
->group(0, &destText
, status
);
2045 REGEX_ASSERT(result
== &destText
);
2046 REGEX_ASSERT_UTEXT_UTF8(str_0123456789
, result
);
2048 result
= matcher
->group(1, NULL
, status
);
2050 const char str_234567
[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */
2051 REGEX_ASSERT_UTEXT_UTF8(str_234567
, result
);
2052 utext_close(result
);
2053 result
= matcher
->group(1, &destText
, status
);
2055 REGEX_ASSERT(result
== &destText
);
2056 REGEX_ASSERT_UTEXT_UTF8(str_234567
, result
);
2058 result
= matcher
->group(2, NULL
, status
);
2060 const char str_45
[] = { 0x34, 0x35, 0x00 }; /* 45 */
2061 REGEX_ASSERT_UTEXT_UTF8(str_45
, result
);
2062 utext_close(result
);
2063 result
= matcher
->group(2, &destText
, status
);
2065 REGEX_ASSERT(result
== &destText
);
2066 REGEX_ASSERT_UTEXT_UTF8(str_45
, result
);
2068 result
= matcher
->group(3, NULL
, status
);
2070 const char str_89
[] = { 0x38, 0x39, 0x00 }; /* 89 */
2071 REGEX_ASSERT_UTEXT_UTF8(str_89
, result
);
2072 utext_close(result
);
2073 result
= matcher
->group(3, &destText
, status
);
2075 REGEX_ASSERT(result
== &destText
);
2076 REGEX_ASSERT_UTEXT_UTF8(str_89
, result
);
2078 REGEX_ASSERT_FAIL(matcher
->group(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2079 REGEX_ASSERT_FAIL(matcher
->group( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2081 REGEX_ASSERT_FAIL(matcher
->group( 0, status
), U_REGEX_INVALID_STATE
);
2086 utext_close(&destText
);
2087 utext_close(&input
);
2097 UErrorCode status
=U_ZERO_ERROR
;
2098 UText re
=UTEXT_INITIALIZER
;
2099 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2100 utext_openUTF8(&re
, str_abc
, -1, &status
);
2102 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
2104 UText input
= UTEXT_INITIALIZER
;
2105 const char str_abcabcabc
[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2106 utext_openUTF8(&input
, str_abcabcabc
, -1, &status
);
2107 // 012345678901234567
2109 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&input
);
2111 REGEX_ASSERT(matcher
->find());
2112 REGEX_ASSERT(matcher
->start(status
) == 1);
2113 REGEX_ASSERT(matcher
->find());
2114 REGEX_ASSERT(matcher
->start(status
) == 6);
2115 REGEX_ASSERT(matcher
->find());
2116 REGEX_ASSERT(matcher
->start(status
) == 12);
2117 REGEX_ASSERT(matcher
->find() == FALSE
);
2118 REGEX_ASSERT(matcher
->find() == FALSE
);
2121 REGEX_ASSERT(matcher
->find());
2122 REGEX_ASSERT(matcher
->start(status
) == 1);
2124 REGEX_ASSERT(matcher
->find(0, status
));
2125 REGEX_ASSERT(matcher
->start(status
) == 1);
2126 REGEX_ASSERT(matcher
->find(1, status
));
2127 REGEX_ASSERT(matcher
->start(status
) == 1);
2128 REGEX_ASSERT(matcher
->find(2, status
));
2129 REGEX_ASSERT(matcher
->start(status
) == 6);
2130 REGEX_ASSERT(matcher
->find(12, status
));
2131 REGEX_ASSERT(matcher
->start(status
) == 12);
2132 REGEX_ASSERT(matcher
->find(13, status
) == FALSE
);
2133 REGEX_ASSERT(matcher
->find(16, status
) == FALSE
);
2134 REGEX_ASSERT(matcher
->find(17, status
) == FALSE
);
2135 REGEX_ASSERT_FAIL(matcher
->start(status
), U_REGEX_INVALID_STATE
);
2137 status
= U_ZERO_ERROR
;
2138 REGEX_ASSERT_FAIL(matcher
->find(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2139 status
= U_ZERO_ERROR
;
2140 REGEX_ASSERT_FAIL(matcher
->find(18, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2142 REGEX_ASSERT(matcher
->groupCount() == 0);
2147 utext_close(&input
);
2153 // find, with \G in pattern (true if at the end of a previous match).
2158 UErrorCode status
=U_ZERO_ERROR
;
2159 UText re
=UTEXT_INITIALIZER
;
2160 const char str_Gabcabc
[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2161 utext_openUTF8(&re
, str_Gabcabc
, -1, &status
);
2163 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
2166 UText input
= UTEXT_INITIALIZER
;
2167 const char str_abcabcabc
[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2168 utext_openUTF8(&input
, str_abcabcabc
, -1, &status
);
2169 // 012345678901234567
2171 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&input
);
2173 REGEX_ASSERT(matcher
->find());
2174 REGEX_ASSERT(matcher
->start(status
) == 0);
2175 REGEX_ASSERT(matcher
->start(1, status
) == -1);
2176 REGEX_ASSERT(matcher
->start(2, status
) == 1);
2178 REGEX_ASSERT(matcher
->find());
2179 REGEX_ASSERT(matcher
->start(status
) == 4);
2180 REGEX_ASSERT(matcher
->start(1, status
) == 4);
2181 REGEX_ASSERT(matcher
->start(2, status
) == -1);
2187 utext_close(&input
);
2192 // find with zero length matches, match position should bump ahead
2193 // to prevent loops.
2197 UErrorCode status
=U_ZERO_ERROR
;
2198 RegexMatcher
m("(?= ?)", 0, status
); // This pattern will produce zero-length matches anywhere,
2199 // using an always-true look-ahead.
2201 UText s
= UTEXT_INITIALIZER
;
2202 utext_openUTF8(&s
, " ", -1, &status
);
2205 if (m
.find() == FALSE
) {
2208 REGEX_ASSERT(m
.start(status
) == i
);
2209 REGEX_ASSERT(m
.end(status
) == i
);
2213 // Check that the bump goes over characters outside the BMP OK
2214 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2215 unsigned char aboveBMP
[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2216 utext_openUTF8(&s
, (char *)aboveBMP
, -1, &status
);
2219 if (m
.find() == FALSE
) {
2222 REGEX_ASSERT(m
.start(status
) == i
);
2223 REGEX_ASSERT(m
.end(status
) == i
);
2225 REGEX_ASSERT(i
==20);
2230 // find() loop breaking test.
2231 // with pattern of /.?/, should see a series of one char matches, then a single
2232 // match of zero length at the end of the input string.
2234 UErrorCode status
=U_ZERO_ERROR
;
2235 RegexMatcher
m(".?", 0, status
);
2237 UText s
= UTEXT_INITIALIZER
;
2238 utext_openUTF8(&s
, " ", -1, &status
);
2241 if (m
.find() == FALSE
) {
2244 REGEX_ASSERT(m
.start(status
) == i
);
2245 REGEX_ASSERT(m
.end(status
) == (i
<4 ? i
+1 : i
));
2254 // Matchers with no input string behave as if they had an empty input string.
2258 UErrorCode status
= U_ZERO_ERROR
;
2259 RegexMatcher
m(".?", 0, status
);
2261 REGEX_ASSERT(m
.find());
2262 REGEX_ASSERT(m
.start(status
) == 0);
2263 REGEX_ASSERT(m
.input() == "");
2266 UErrorCode status
= U_ZERO_ERROR
;
2267 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
2268 RegexMatcher
*m
= p
->matcher(status
);
2271 REGEX_ASSERT(m
->find() == FALSE
);
2272 REGEX_ASSERT(utext_nativeLength(m
->inputText()) == 0);
2281 UErrorCode status
= U_ZERO_ERROR
;
2282 UText testPattern
= UTEXT_INITIALIZER
;
2283 UText testText
= UTEXT_INITIALIZER
;
2284 regextst_openUTF8FromInvariant(&testPattern
, ".*", -1, &status
);
2285 REGEX_VERBOSE_TEXT(&testPattern
);
2286 regextst_openUTF8FromInvariant(&testText
, "This is test data", -1, &status
);
2287 REGEX_VERBOSE_TEXT(&testText
);
2289 RegexMatcher
m(&testPattern
, &testText
, 0, status
);
2291 REGEX_ASSERT(m
.regionStart() == 0);
2292 REGEX_ASSERT(m
.regionEnd() == (int32_t)strlen("This is test data"));
2293 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2294 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2296 m
.region(2,4, status
);
2298 REGEX_ASSERT(m
.matches(status
));
2299 REGEX_ASSERT(m
.start(status
)==2);
2300 REGEX_ASSERT(m
.end(status
)==4);
2304 REGEX_ASSERT(m
.regionStart() == 0);
2305 REGEX_ASSERT(m
.regionEnd() == (int32_t)strlen("This is test data"));
2307 regextst_openUTF8FromInvariant(&testText
, "short", -1, &status
);
2308 REGEX_VERBOSE_TEXT(&testText
);
2310 REGEX_ASSERT(m
.regionStart() == 0);
2311 REGEX_ASSERT(m
.regionEnd() == (int32_t)strlen("short"));
2313 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2314 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(FALSE
));
2315 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
2316 REGEX_ASSERT(&m
== &m
.reset());
2317 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
2319 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(TRUE
));
2320 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2321 REGEX_ASSERT(&m
== &m
.reset());
2322 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2324 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2325 REGEX_ASSERT(&m
== &m
.useTransparentBounds(TRUE
));
2326 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
2327 REGEX_ASSERT(&m
== &m
.reset());
2328 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
2330 REGEX_ASSERT(&m
== &m
.useTransparentBounds(FALSE
));
2331 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2332 REGEX_ASSERT(&m
== &m
.reset());
2333 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2335 utext_close(&testText
);
2336 utext_close(&testPattern
);
2340 // hitEnd() and requireEnd()
2343 UErrorCode status
= U_ZERO_ERROR
;
2344 UText testPattern
= UTEXT_INITIALIZER
;
2345 UText testText
= UTEXT_INITIALIZER
;
2346 const char str_
[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2347 const char str_aabb
[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2348 utext_openUTF8(&testPattern
, str_
, -1, &status
);
2349 utext_openUTF8(&testText
, str_aabb
, -1, &status
);
2351 RegexMatcher
m1(&testPattern
, &testText
, 0, status
);
2352 REGEX_ASSERT(m1
.lookingAt(status
) == TRUE
);
2353 REGEX_ASSERT(m1
.hitEnd() == TRUE
);
2354 REGEX_ASSERT(m1
.requireEnd() == FALSE
);
2357 status
= U_ZERO_ERROR
;
2358 const char str_a
[] = { 0x61, 0x2a, 0x00 }; /* a* */
2359 utext_openUTF8(&testPattern
, str_a
, -1, &status
);
2360 RegexMatcher
m2(&testPattern
, &testText
, 0, status
);
2361 REGEX_ASSERT(m2
.lookingAt(status
) == TRUE
);
2362 REGEX_ASSERT(m2
.hitEnd() == FALSE
);
2363 REGEX_ASSERT(m2
.requireEnd() == FALSE
);
2366 status
= U_ZERO_ERROR
;
2367 const char str_dotstardollar
[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2368 utext_openUTF8(&testPattern
, str_dotstardollar
, -1, &status
);
2369 RegexMatcher
m3(&testPattern
, &testText
, 0, status
);
2370 REGEX_ASSERT(m3
.lookingAt(status
) == TRUE
);
2371 REGEX_ASSERT(m3
.hitEnd() == TRUE
);
2372 REGEX_ASSERT(m3
.requireEnd() == TRUE
);
2375 utext_close(&testText
);
2376 utext_close(&testPattern
);
2381 //---------------------------------------------------------------------------
2383 // API_Replace_UTF8 API test for class RegexMatcher, testing the
2384 // Replace family of functions.
2386 //---------------------------------------------------------------------------
2387 void RegexTest::API_Replace_UTF8() {
2393 UErrorCode status
=U_ZERO_ERROR
;
2395 UText re
=UTEXT_INITIALIZER
;
2396 regextst_openUTF8FromInvariant(&re
, "abc", -1, &status
);
2397 REGEX_VERBOSE_TEXT(&re
);
2398 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
2401 char data
[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2402 // 012345678901234567
2403 UText dataText
= UTEXT_INITIALIZER
;
2404 utext_openUTF8(&dataText
, data
, -1, &status
);
2406 REGEX_VERBOSE_TEXT(&dataText
);
2407 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&dataText
);
2410 // Plain vanilla matches.
2413 UText destText
= UTEXT_INITIALIZER
;
2414 utext_openUnicodeString(&destText
, &dest
, &status
);
2417 UText replText
= UTEXT_INITIALIZER
;
2419 const char str_yz
[] = { 0x79, 0x7a, 0x00 }; /* yz */
2420 utext_openUTF8(&replText
, str_yz
, -1, &status
);
2421 REGEX_VERBOSE_TEXT(&replText
);
2422 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2424 const char str_yzabcabc
[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2425 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc
, result
);
2426 utext_close(result
);
2427 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2429 REGEX_ASSERT(result
== &destText
);
2430 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc
, result
);
2432 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2434 const char str_yzyzyz
[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2435 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz
, result
);
2436 utext_close(result
);
2438 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2439 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2441 REGEX_ASSERT(result
== &destText
);
2442 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz
, result
);
2445 // Plain vanilla non-matches.
2447 const char str_abxabxabx
[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2448 utext_openUTF8(&dataText
, str_abxabxabx
, -1, &status
);
2449 matcher
->reset(&dataText
);
2451 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2453 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2454 utext_close(result
);
2455 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2457 REGEX_ASSERT(result
== &destText
);
2458 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2460 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2462 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2463 utext_close(result
);
2464 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2465 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2467 REGEX_ASSERT(result
== &destText
);
2468 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2471 // Empty source string
2473 utext_openUTF8(&dataText
, NULL
, 0, &status
);
2474 matcher
->reset(&dataText
);
2476 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2478 REGEX_ASSERT_UTEXT_UTF8("", result
);
2479 utext_close(result
);
2480 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2482 REGEX_ASSERT(result
== &destText
);
2483 REGEX_ASSERT_UTEXT_UTF8("", result
);
2485 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2487 REGEX_ASSERT_UTEXT_UTF8("", result
);
2488 utext_close(result
);
2489 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2491 REGEX_ASSERT(result
== &destText
);
2492 REGEX_ASSERT_UTEXT_UTF8("", result
);
2495 // Empty substitution string
2497 utext_openUTF8(&dataText
, data
, -1, &status
); // ".abc..abc...abc.."
2498 matcher
->reset(&dataText
);
2500 utext_openUTF8(&replText
, NULL
, 0, &status
);
2501 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2503 const char str_abcabc
[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2504 REGEX_ASSERT_UTEXT_UTF8(str_abcabc
, result
);
2505 utext_close(result
);
2506 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2508 REGEX_ASSERT(result
== &destText
);
2509 REGEX_ASSERT_UTEXT_UTF8(str_abcabc
, result
);
2511 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2513 const char str_dots
[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2514 REGEX_ASSERT_UTEXT_UTF8(str_dots
, result
);
2515 utext_close(result
);
2516 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2517 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2519 REGEX_ASSERT(result
== &destText
);
2520 REGEX_ASSERT_UTEXT_UTF8(str_dots
, result
);
2523 // match whole string
2525 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2526 utext_openUTF8(&dataText
, str_abc
, -1, &status
);
2527 matcher
->reset(&dataText
);
2529 const char str_xyz
[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2530 utext_openUTF8(&replText
, str_xyz
, -1, &status
);
2531 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2533 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2534 utext_close(result
);
2535 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2536 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2538 REGEX_ASSERT(result
== &destText
);
2539 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2541 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2543 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2544 utext_close(result
);
2545 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2546 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2548 REGEX_ASSERT(result
== &destText
);
2549 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2552 // Capture Group, simple case
2554 const char str_add
[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2555 utext_openUTF8(&re
, str_add
, -1, &status
);
2556 RegexPattern
*pat2
= RegexPattern::compile(&re
, flags
, pe
, status
);
2559 const char str_abcdefg
[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2560 utext_openUTF8(&dataText
, str_abcdefg
, -1, &status
);
2561 RegexMatcher
*matcher2
= &pat2
->matcher(status
)->reset(&dataText
);
2564 const char str_11
[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2565 utext_openUTF8(&replText
, str_11
, -1, &status
);
2566 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2568 const char str_bcbcdefg
[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2569 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg
, result
);
2570 utext_close(result
);
2571 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2572 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2574 REGEX_ASSERT(result
== &destText
);
2575 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg
, result
);
2577 const char str_v
[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2578 utext_openUTF8(&replText
, str_v
, -1, &status
);
2579 REGEX_VERBOSE_TEXT(&replText
);
2580 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2582 const char str_Thevalueof1isbcdefg
[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2583 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg
, result
);
2584 utext_close(result
);
2585 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2586 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2588 REGEX_ASSERT(result
== &destText
);
2589 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg
, result
);
2591 const char str_byitselfnogroupnumber
[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x00 }; /* $ by itself, no group number $$$ */
2592 utext_openUTF8(&replText
, str_byitselfnogroupnumber
, -1, &status
);
2593 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2595 const char str_byitselfnogroupnumberdefg
[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2596 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg
, result
);
2597 utext_close(result
);
2598 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2599 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2601 REGEX_ASSERT(result
== &destText
);
2602 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg
, result
);
2604 unsigned char supplDigitChars
[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2605 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2606 // 012345678901234567890123456
2607 supplDigitChars
[22] = 0xF0;
2608 supplDigitChars
[23] = 0x9D;
2609 supplDigitChars
[24] = 0x9F;
2610 supplDigitChars
[25] = 0x8F;
2611 utext_openUTF8(&replText
, (char *)supplDigitChars
, -1, &status
);
2613 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2615 const char str_SupplementalDigit1bcdefg
[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2616 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg
, result
);
2617 utext_close(result
);
2618 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2619 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2621 REGEX_ASSERT(result
== &destText
);
2622 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg
, result
);
2623 const char str_badcapturegroupnumber5
[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5... */
2624 utext_openUTF8(&replText
, str_badcapturegroupnumber5
, -1, &status
);
2625 REGEX_ASSERT_FAIL((result
= matcher2
->replaceFirst(&replText
, NULL
, status
)), U_INDEX_OUTOFBOUNDS_ERROR
);
2626 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2627 utext_close(result
);
2628 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2629 REGEX_ASSERT_FAIL((result
= matcher2
->replaceFirst(&replText
, &destText
, status
)), U_INDEX_OUTOFBOUNDS_ERROR
);
2630 REGEX_ASSERT(result
== &destText
);
2631 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2634 // Replacement String with \u hex escapes
2637 const char str_abc1abc2abc3
[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2638 const char str_u0043
[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2639 utext_openUTF8(&dataText
, str_abc1abc2abc3
, -1, &status
);
2640 utext_openUTF8(&replText
, str_u0043
, -1, &status
);
2641 matcher
->reset(&dataText
);
2643 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2645 const char str_C1C2C3
[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2646 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3
, result
);
2647 utext_close(result
);
2648 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2649 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2651 REGEX_ASSERT(result
== &destText
);
2652 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3
, result
);
2655 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2656 utext_openUTF8(&dataText
, str_abc
, -1, &status
);
2657 const char str_U00010000
[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2658 utext_openUTF8(&replText
, str_U00010000
, -1, &status
);
2659 matcher
->reset(&dataText
);
2661 unsigned char expected
[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2668 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2670 REGEX_ASSERT_UTEXT_UTF8((char *)expected
, result
);
2671 utext_close(result
);
2672 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2673 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2675 REGEX_ASSERT(result
== &destText
);
2676 REGEX_ASSERT_UTEXT_UTF8((char *)expected
, result
);
2678 // TODO: need more thorough testing of capture substitutions.
2683 status
= U_ZERO_ERROR
;
2684 const char str_ssee
[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2685 const char str_blah
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2686 const char str_ooh
[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2687 utext_openUTF8(&re
, str_ssee
, -1, &status
);
2688 utext_openUTF8(&dataText
, str_blah
, -1, &status
);
2689 utext_openUTF8(&replText
, str_ooh
, -1, &status
);
2691 RegexMatcher
m(&re
, 0, status
);
2694 UnicodeString result
;
2695 UText resultText
= UTEXT_INITIALIZER
;
2696 utext_openUnicodeString(&resultText
, &result
, &status
);
2698 // Multiple finds do NOT bump up the previous appendReplacement position.
2702 m
.appendReplacement(&resultText
, &replText
, status
);
2704 const char str_blah2
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2705 REGEX_ASSERT_UTEXT_UTF8(str_blah2
, &resultText
);
2707 // After a reset into the interior of a string, appendReplacement still starts at beginning.
2708 status
= U_ZERO_ERROR
;
2710 utext_openUnicodeString(&resultText
, &result
, &status
);
2711 m
.reset(10, status
);
2714 m
.appendReplacement(&resultText
, &replText
, status
);
2716 const char str_blah3
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2717 REGEX_ASSERT_UTEXT_UTF8(str_blah3
, &resultText
);
2719 // find() at interior of string, appendReplacement still starts at beginning.
2720 status
= U_ZERO_ERROR
;
2722 utext_openUnicodeString(&resultText
, &result
, &status
);
2726 m
.appendReplacement(&resultText
, &replText
, status
);
2728 const char str_blah8
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2729 REGEX_ASSERT_UTEXT_UTF8(str_blah8
, &resultText
);
2731 m
.appendTail(&resultText
, status
);
2732 const char str_blah9
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2733 REGEX_ASSERT_UTEXT_UTF8(str_blah9
, &resultText
);
2735 utext_close(&resultText
);
2743 utext_close(&dataText
);
2744 utext_close(&replText
);
2745 utext_close(&destText
);
2750 //---------------------------------------------------------------------------
2752 // API_Pattern_UTF8 Test that the API for class RegexPattern is
2753 // present and nominally working.
2755 //---------------------------------------------------------------------------
2756 void RegexTest::API_Pattern_UTF8() {
2757 RegexPattern pata
; // Test default constructor to not crash.
2760 REGEX_ASSERT(pata
== patb
);
2761 REGEX_ASSERT(pata
== pata
);
2763 UText re1
= UTEXT_INITIALIZER
;
2764 UText re2
= UTEXT_INITIALIZER
;
2765 UErrorCode status
= U_ZERO_ERROR
;
2768 const char str_abcalmz
[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2769 const char str_def
[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2770 utext_openUTF8(&re1
, str_abcalmz
, -1, &status
);
2771 utext_openUTF8(&re2
, str_def
, -1, &status
);
2773 RegexPattern
*pat1
= RegexPattern::compile(&re1
, 0, pe
, status
);
2774 RegexPattern
*pat2
= RegexPattern::compile(&re2
, 0, pe
, status
);
2776 REGEX_ASSERT(*pat1
== *pat1
);
2777 REGEX_ASSERT(*pat1
!= pata
);
2781 REGEX_ASSERT(patb
== *pat1
);
2784 RegexPattern
patc(*pat1
);
2785 REGEX_ASSERT(patc
== *pat1
);
2786 REGEX_ASSERT(patb
== patc
);
2787 REGEX_ASSERT(pat1
!= pat2
);
2789 REGEX_ASSERT(patb
!= patc
);
2790 REGEX_ASSERT(patb
== *pat2
);
2792 // Compile with no flags.
2793 RegexPattern
*pat1a
= RegexPattern::compile(&re1
, pe
, status
);
2794 REGEX_ASSERT(*pat1a
== *pat1
);
2796 REGEX_ASSERT(pat1a
->flags() == 0);
2798 // Patterns compiled with different flags should not be equal
2799 RegexPattern
*pat1b
= RegexPattern::compile(&re1
, UREGEX_CASE_INSENSITIVE
, pe
, status
);
2802 REGEX_ASSERT(*pat1b
!= *pat1a
);
2803 REGEX_ASSERT(pat1b
->flags() == UREGEX_CASE_INSENSITIVE
);
2804 REGEX_ASSERT(pat1a
->flags() == 0);
2808 RegexPattern
*pat1c
= pat1
->clone();
2809 REGEX_ASSERT(*pat1c
== *pat1
);
2810 REGEX_ASSERT(*pat1c
!= *pat2
);
2822 // Verify that a matcher created from a cloned pattern works.
2826 UErrorCode status
= U_ZERO_ERROR
;
2827 UText pattern
= UTEXT_INITIALIZER
;
2828 const char str_pL
[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2829 utext_openUTF8(&pattern
, str_pL
, -1, &status
);
2831 RegexPattern
*pSource
= RegexPattern::compile(&pattern
, 0, status
);
2832 RegexPattern
*pClone
= pSource
->clone();
2834 RegexMatcher
*mFromClone
= pClone
->matcher(status
);
2837 UText input
= UTEXT_INITIALIZER
;
2838 const char str_HelloWorld
[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2839 utext_openUTF8(&input
, str_HelloWorld
, -1, &status
);
2840 mFromClone
->reset(&input
);
2841 REGEX_ASSERT(mFromClone
->find() == TRUE
);
2842 REGEX_ASSERT(mFromClone
->group(status
) == "Hello");
2843 REGEX_ASSERT(mFromClone
->find() == TRUE
);
2844 REGEX_ASSERT(mFromClone
->group(status
) == "World");
2845 REGEX_ASSERT(mFromClone
->find() == FALSE
);
2849 utext_close(&input
);
2850 utext_close(&pattern
);
2854 // matches convenience API
2857 UErrorCode status
= U_ZERO_ERROR
;
2858 UText pattern
= UTEXT_INITIALIZER
;
2859 UText input
= UTEXT_INITIALIZER
;
2861 const char str_randominput
[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2862 utext_openUTF8(&input
, str_randominput
, -1, &status
);
2864 const char str_dotstar
[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2865 utext_openUTF8(&pattern
, str_dotstar
, -1, &status
);
2866 REGEX_ASSERT(RegexPattern::matches(&pattern
, &input
, pe
, status
) == TRUE
);
2869 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2870 utext_openUTF8(&pattern
, str_abc
, -1, &status
);
2871 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe
, status
) == FALSE
);
2874 const char str_nput
[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2875 utext_openUTF8(&pattern
, str_nput
, -1, &status
);
2876 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe
, status
) == TRUE
);
2879 utext_openUTF8(&pattern
, str_randominput
, -1, &status
);
2880 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe
, status
) == TRUE
);
2883 const char str_u
[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2884 utext_openUTF8(&pattern
, str_u
, -1, &status
);
2885 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe
, status
) == FALSE
);
2888 utext_openUTF8(&input
, str_abc
, -1, &status
);
2889 utext_openUTF8(&pattern
, str_abc
, -1, &status
);
2890 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
2891 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe
, status
) == FALSE
);
2892 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
2894 utext_close(&input
);
2895 utext_close(&pattern
);
2902 status
= U_ZERO_ERROR
;
2903 const char str_spaceplus
[] = { 0x20, 0x2b, 0x00 }; /* + */
2904 utext_openUTF8(&re1
, str_spaceplus
, -1, &status
);
2905 pat1
= RegexPattern::compile(&re1
, pe
, status
);
2907 UnicodeString fields
[10];
2910 n
= pat1
->split("Now is the time", fields
, 10, status
);
2913 REGEX_ASSERT(fields
[0]=="Now");
2914 REGEX_ASSERT(fields
[1]=="is");
2915 REGEX_ASSERT(fields
[2]=="the");
2916 REGEX_ASSERT(fields
[3]=="time");
2917 REGEX_ASSERT(fields
[4]=="");
2919 n
= pat1
->split("Now is the time", fields
, 2, status
);
2922 REGEX_ASSERT(fields
[0]=="Now");
2923 REGEX_ASSERT(fields
[1]=="is the time");
2924 REGEX_ASSERT(fields
[2]=="the"); // left over from previous test
2927 status
= U_ZERO_ERROR
;
2928 n
= pat1
->split("Now is the time", fields
, 1, status
);
2931 REGEX_ASSERT(fields
[0]=="Now is the time");
2932 REGEX_ASSERT(fields
[1]=="*");
2933 status
= U_ZERO_ERROR
;
2935 n
= pat1
->split(" Now is the time ", fields
, 10, status
);
2938 REGEX_ASSERT(fields
[0]=="");
2939 REGEX_ASSERT(fields
[1]=="Now");
2940 REGEX_ASSERT(fields
[2]=="is");
2941 REGEX_ASSERT(fields
[3]=="the");
2942 REGEX_ASSERT(fields
[4]=="time");
2943 REGEX_ASSERT(fields
[5]=="");
2944 REGEX_ASSERT(fields
[6]=="");
2947 n
= pat1
->split(" ", fields
, 10, status
);
2950 REGEX_ASSERT(fields
[0]=="");
2951 REGEX_ASSERT(fields
[1]=="");
2952 REGEX_ASSERT(fields
[2]=="*");
2955 n
= pat1
->split("", fields
, 10, status
);
2958 REGEX_ASSERT(fields
[0]=="foo");
2962 // split, with a pattern with (capture)
2963 regextst_openUTF8FromInvariant(&re1
, "<(\\w*)>", -1, &status
);
2964 pat1
= RegexPattern::compile(&re1
, pe
, status
);
2967 status
= U_ZERO_ERROR
;
2968 fields
[6] = fields
[7] = "*";
2969 n
= pat1
->split("<a>Now is <b>the time<c>", fields
, 10, status
);
2972 REGEX_ASSERT(fields
[0]=="");
2973 REGEX_ASSERT(fields
[1]=="a");
2974 REGEX_ASSERT(fields
[2]=="Now is ");
2975 REGEX_ASSERT(fields
[3]=="b");
2976 REGEX_ASSERT(fields
[4]=="the time");
2977 REGEX_ASSERT(fields
[5]=="c");
2978 REGEX_ASSERT(fields
[6]=="");
2979 REGEX_ASSERT(fields
[7]=="*");
2980 REGEX_ASSERT(status
==U_ZERO_ERROR
);
2982 fields
[6] = fields
[7] = "*";
2983 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 10, status
);
2986 REGEX_ASSERT(fields
[0]==" ");
2987 REGEX_ASSERT(fields
[1]=="a");
2988 REGEX_ASSERT(fields
[2]=="Now is ");
2989 REGEX_ASSERT(fields
[3]=="b");
2990 REGEX_ASSERT(fields
[4]=="the time");
2991 REGEX_ASSERT(fields
[5]=="c");
2992 REGEX_ASSERT(fields
[6]=="");
2993 REGEX_ASSERT(fields
[7]=="*");
2995 status
= U_ZERO_ERROR
;
2997 n
= pat1
->split(" <a>Now is <b>the time<c> ", fields
, 6, status
);
3000 REGEX_ASSERT(fields
[0]==" ");
3001 REGEX_ASSERT(fields
[1]=="a");
3002 REGEX_ASSERT(fields
[2]=="Now is ");
3003 REGEX_ASSERT(fields
[3]=="b");
3004 REGEX_ASSERT(fields
[4]=="the time");
3005 REGEX_ASSERT(fields
[5]==" ");
3006 REGEX_ASSERT(fields
[6]=="foo");
3008 status
= U_ZERO_ERROR
;
3010 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 5, status
);
3013 REGEX_ASSERT(fields
[0]==" ");
3014 REGEX_ASSERT(fields
[1]=="a");
3015 REGEX_ASSERT(fields
[2]=="Now is ");
3016 REGEX_ASSERT(fields
[3]=="b");
3017 REGEX_ASSERT(fields
[4]=="the time<c>");
3018 REGEX_ASSERT(fields
[5]=="foo");
3020 status
= U_ZERO_ERROR
;
3022 n
= pat1
->split(" <a>Now is <b>the time", fields
, 5, status
);
3025 REGEX_ASSERT(fields
[0]==" ");
3026 REGEX_ASSERT(fields
[1]=="a");
3027 REGEX_ASSERT(fields
[2]=="Now is ");
3028 REGEX_ASSERT(fields
[3]=="b");
3029 REGEX_ASSERT(fields
[4]=="the time");
3030 REGEX_ASSERT(fields
[5]=="foo");
3032 status
= U_ZERO_ERROR
;
3033 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 4, status
);
3036 REGEX_ASSERT(fields
[0]==" ");
3037 REGEX_ASSERT(fields
[1]=="a");
3038 REGEX_ASSERT(fields
[2]=="Now is ");
3039 REGEX_ASSERT(fields
[3]=="the time<c>");
3040 status
= U_ZERO_ERROR
;
3043 regextst_openUTF8FromInvariant(&re1
, "([-,])", -1, &status
);
3044 pat1
= RegexPattern::compile(&re1
, pe
, status
);
3046 n
= pat1
->split("1-10,20", fields
, 10, status
);
3049 REGEX_ASSERT(fields
[0]=="1");
3050 REGEX_ASSERT(fields
[1]=="-");
3051 REGEX_ASSERT(fields
[2]=="10");
3052 REGEX_ASSERT(fields
[3]==",");
3053 REGEX_ASSERT(fields
[4]=="20");
3058 // RegexPattern::pattern() and patternText()
3060 pat1
= new RegexPattern();
3061 REGEX_ASSERT(pat1
->pattern() == "");
3062 REGEX_ASSERT_UTEXT_UTF8("", pat1
->patternText(status
));
3064 const char *helloWorldInvariant
= "(Hello, world)*";
3065 regextst_openUTF8FromInvariant(&re1
, helloWorldInvariant
, -1, &status
);
3066 pat1
= RegexPattern::compile(&re1
, pe
, status
);
3068 REGEX_ASSERT_UNISTR(pat1
->pattern(),"(Hello, world)*");
3069 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1
->patternText(status
));
3076 //---------------------------------------------------------------------------
3078 // Extended A more thorough check for features of regex patterns
3079 // The test cases are in a separate data file,
3080 // source/tests/testdata/regextst.txt
3081 // A description of the test data format is included in that file.
3083 //---------------------------------------------------------------------------
3086 RegexTest::getPath(char buffer
[2048], const char *filename
) {
3087 UErrorCode status
=U_ZERO_ERROR
;
3088 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
3089 if (U_FAILURE(status
)) {
3090 errln("ERROR: loadTestData() failed - %s", u_errorName(status
));
3094 strcpy(buffer
, testDataDirectory
);
3095 strcat(buffer
, filename
);
// Extended: driver for the data-driven regex tests.
// Reads regextst.txt, walks it one line at a time, splits each non-comment line into
// a quoted pattern, a flags field and a quoted match string, and passes those three
// fields (plus the file path and line number, for messages) to regex_find().
3099 void RegexTest::Extended() {
3101 const char *srcPath
;
3102 UErrorCode status
= U_ZERO_ERROR
;
3103 int32_t lineNum
= 0;
3106 // Open and read the test data file.
3108 srcPath
=getPath(tdd
, "regextst.txt");
// NOTE(review): the condition guarding this return is not visible in this excerpt;
// presumably it tests srcPath for NULL - confirm.
3110 return; /* something went wrong, error already output */
// Read the whole file as UChars; "utf-8" is passed as the default encoding.
3114 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, "utf-8", status
);
3115 if (U_FAILURE(status
)) {
3116 return; /* something went wrong, error already output */
3120 // Put the test data into a UnicodeString
// The (FALSE, testData, len) form wraps the existing buffer of len UChars.
3122 UnicodeString
testString(FALSE
, testData
, len
);
// Matchers used to take one test line apart:
//   quotedStuffMat - a field delimited by ', " or /; group 1 is the delimiter,
//                    group 2 the text between the delimiters.
//   commentMat     - optional white space and an optional #comment up to end of line.
//   flagsMat       - group 1 collects the recognised flag characters, group 2 whatever
//                    letters follow them (reported below as a bad flag).
3124 RegexMatcher
quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status
);
3125 RegexMatcher
commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status
);
3126 RegexMatcher
flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status
);
// Splits the file into lines; group 1 is the line text without its \r?\n terminator.
3128 RegexMatcher
lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString
, 0, status
);
3129 UnicodeString testPattern
; // The pattern for test from the test file.
3130 UnicodeString testFlags
; // the flags for a test.
3131 UnicodeString matchString
; // The marked up string to be used as input
// A failure in any of the matcher constructors above is reported here.
3133 if (U_FAILURE(status
)){
3134 dataerrln("Construct RegexMatcher() error.");
3140 // Loop over the test data file, once per line.
3142 while (lineMat
.find()) {
// Report any ICU error still held in status, then clear it so the line handled in
// this iteration starts from a clean state.
3144 if (U_FAILURE(status
)) {
3145 errln("%s:%d: ICU Error \"%s\"", srcPath
, lineNum
, u_errorName(status
));
3148 status
= U_ZERO_ERROR
;
3149 UnicodeString testLine
= lineMat
.group(1, status
);
3150 if (testLine
.length() == 0) {
3155 // Parse the test line. Skip blank and comment only lines.
3156 // Separate out the three main fields - pattern, flags, target.
3159 commentMat
.reset(testLine
);
3160 if (commentMat
.lookingAt(status
)) {
3161 // This line is a comment, or blank.
3166 // Pull out the pattern field, remove it from the test file line.
3168 quotedStuffMat
.reset(testLine
);
3169 if (quotedStuffMat
.lookingAt(status
)) {
3170 testPattern
= quotedStuffMat
.group(2, status
);
// Drop everything up to the end of the matched field so the next field starts the line.
3171 testLine
.remove(0, quotedStuffMat
.end(0, status
));
3173 errln("Bad pattern (missing quotes?) at %s:%d", srcPath
, lineNum
);
3179 // Pull out the flags from the test file line.
3181 flagsMat
.reset(testLine
);
3182 flagsMat
.lookingAt(status
); // Will always match, possibly an empty string.
3183 testFlags
= flagsMat
.group(1, status
);
// Any letters captured by group 2 are not valid flags.
3184 if (flagsMat
.group(2, status
).length() > 0) {
3185 errln("Bad Match flag at line %d. Scanning %c\n",
3186 lineNum
, flagsMat
.group(2, status
).charAt(0));
3189 testLine
.remove(0, flagsMat
.end(0, status
));
3192 // Pull out the match string, as a whole.
3193 // We'll process the <tags> later.
3195 quotedStuffMat
.reset(testLine
);
3196 if (quotedStuffMat
.lookingAt(status
)) {
3197 matchString
= quotedStuffMat
.group(2, status
);
3198 testLine
.remove(0, quotedStuffMat
.end(0, status
));
3200 errln("Bad match string at test file line %d", lineNum
);
3205 // The only thing left from the input line should be an optional trailing comment.
3207 commentMat
.reset(testLine
);
3208 if (commentMat
.lookingAt(status
) == FALSE
) {
3209 errln("Line %d: unexpected characters at end of test line.", lineNum
);
// All three fields were parsed: run the test described by this line.
3216 regex_find(testPattern
, testFlags
, matchString
, srcPath
, lineNum
);
3225 //---------------------------------------------------------------------------
3227 // regex_find(pattern, flags, inputString, lineNumber)
3229 // Function to run a single test from the Extended (data driven) tests.
3230 // See file test/testdata/regextst.txt for a description of the
3231 // pattern and inputString fields, and the allowed flags.
3232 // lineNumber is the source line in regextst.txt of the test.
3234 //---------------------------------------------------------------------------
3237 // Set a value into a UVector at position specified by a decimal number in
3238 // a UnicodeString. This is a utility function needed by the actual test function,
// Stores val in vec at the slot whose number is written as decimal digits in `index`,
// first appending -1 entries until that slot exists.
// NOTE(review): the declaration of idx and the statements that build it from the
// digit values d are not visible in this excerpt - confirm how non-digit characters
// are handled.
3240 static void set(UVector
&vec
, int32_t val
, UnicodeString index
) {
3241 UErrorCode status
=U_ZERO_ERROR
;
// Visit each character of `index` and obtain its decimal digit value.
3243 for (int32_t i
=0; i
<index
.length(); i
++) {
3244 int32_t d
=u_charDigitValue(index
.charAt(i
));
// Pad with -1 until position idx is present, then overwrite that position with val.
// The status written by addElement() is not examined.
3248 while (vec
.size()<idx
+1) {vec
.addElement(-1, status
);}
3249 vec
.setElementAt(val
, idx
);
3252 static void setInt(UVector
&vec
, int32_t val
, int32_t idx
) {
3253 UErrorCode status
=U_ZERO_ERROR
;
3254 while (vec
.size()<idx
+1) {vec
.addElement(-1, status
);}
3255 vec
.setElementAt(val
, idx
);
// Walks utext forward from native index 0, reading one code point per UTEXT_NEXT32
// call while i < unistrOffset, and then stores the UText's current native index in
// nativeIndex.
// NOTE(review): the declaration of i, the statement that advances it, the handling of
// U_SENTINEL and the return statement are not visible in this excerpt; presumably
// couldFind is the returned value and is cleared when the text ends early - confirm.
3258 static UBool
utextOffsetToNative(UText
*utext
, int32_t unistrOffset
, int32_t& nativeIndex
)
3260 UBool couldFind
= TRUE
;
// Always start counting from the beginning of the text.
3261 UTEXT_SETNATIVEINDEX(utext
, 0);
3263 while (i
< unistrOffset
) {
3264 UChar32 c
= UTEXT_NEXT32(utext
);
3265 if (c
!= U_SENTINEL
) {
// The result is narrowed to int32_t.
3272 nativeIndex
= (int32_t)UTEXT_GETNATIVEINDEX(utext
);
// Runs a single data-driven test.
//
//   pattern      The regular expression to compile.
//   flags        Test flag characters; they select compile options, the match function
//                to call, and which extra checks to perform (see the checks below).
//   inputString  The marked-up input.  It is unescaped, then its <n>, </n>, <r> and
//                </r> tags are removed and their positions recorded as the expected
//                group and region boundaries.
//   srcPath      Path of the test data file, used in messages.
//   line         Line number of the test in that file, used in messages.
//                (Its declaration is not visible in this excerpt.)
//
// Every step is done twice: once on UnicodeString input and once on a UTF-8 UText.
// Problems are reported through errln()/dataerrln(); all exits go through the
// cleanupAndReturn label.
3277 void RegexTest::regex_find(const UnicodeString
&pattern
,
3278 const UnicodeString
&flags
,
3279 const UnicodeString
&inputString
,
3280 const char *srcPath
,
// unEscapedInput: inputString after unescape().  deTaggedInput: the same text with
// the <...> tags removed; this is what actually gets matched.
3282 UnicodeString unEscapedInput
;
3283 UnicodeString deTaggedInput
;
// UTF-8 copies of the pattern and of the de-tagged input, the UTexts opened over
// them, and the converter that produces them.
3285 int32_t patternUTF8Length
, inputUTF8Length
;
3286 char *patternChars
= NULL
, *inputChars
= NULL
;
3287 UText patternText
= UTEXT_INITIALIZER
;
3288 UText inputText
= UTEXT_INITIALIZER
;
3289 UConverter
*UTF8Converter
= NULL
;
3291 UErrorCode status
= U_ZERO_ERROR
;
3293 RegexPattern
*parsePat
= NULL
;
3294 RegexMatcher
*parseMatcher
= NULL
;
3295 RegexPattern
*callerPattern
= NULL
, *UTF8Pattern
= NULL
;
3296 RegexMatcher
*matcher
= NULL
, *UTF8Matcher
= NULL
;
// Expected start/end position of each capture group, indexed by group number.
// The *UTF8 vectors hold the corresponding native indexes in inputText.
3297 UVector
groupStarts(status
);
3298 UVector
groupEnds(status
);
3299 UVector
groupStartsUTF8(status
);
3300 UVector
groupEndsUTF8(status
);
3301 UBool isMatch
= FALSE
, isUTF8Match
= FALSE
;
3302 UBool failed
= FALSE
;
3305 UBool useMatchesFunc
= FALSE
;
3306 UBool useLookingAtFunc
= FALSE
;
// -1 means that no <r> / </r> tag was seen.
3307 int32_t regionStart
= -1;
3308 int32_t regionEnd
= -1;
3309 int32_t regionStartUTF8
= -1;
3310 int32_t regionEndUTF8
= -1;
3314 // Compile the caller's pattern
// Each flag letter below turns on the matching UREGEX_ compile option.  The letters
// are written as hex code units rather than character literals.
3316 uint32_t bflags
= 0;
3317 if (flags
.indexOf((UChar
)0x69) >= 0) { // 'i' flag
3318 bflags
|= UREGEX_CASE_INSENSITIVE
;
3320 if (flags
.indexOf((UChar
)0x78) >= 0) { // 'x' flag
3321 bflags
|= UREGEX_COMMENTS
;
3323 if (flags
.indexOf((UChar
)0x73) >= 0) { // 's' flag
3324 bflags
|= UREGEX_DOTALL
;
3326 if (flags
.indexOf((UChar
)0x6d) >= 0) { // 'm' flag
3327 bflags
|= UREGEX_MULTILINE
;
3330 if (flags
.indexOf((UChar
)0x65) >= 0) { // 'e' flag
3331 bflags
|= UREGEX_ERROR_ON_UNKNOWN_ESCAPES
;
3333 if (flags
.indexOf((UChar
)0x44) >= 0) { // 'D' flag
3334 bflags
|= UREGEX_UNIX_LINES
;
3336 if (flags
.indexOf((UChar
)0x51) >= 0) { // 'Q' flag
3337 bflags
|= UREGEX_LITERAL
;
3341 callerPattern
= RegexPattern::compile(pattern
, bflags
, pe
, status
);
// The comparison is against U_ZERO_ERROR itself, so any other status value,
// not only failures, takes this branch.
3342 if (status
!= U_ZERO_ERROR
) {
3343 #if UCONFIG_NO_BREAK_ITERATION==1
3344 // 'v' test flag means that the test pattern should not compile if ICU was configured
3345 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3346 if (flags
.indexOf((UChar
)0x76) >= 0 /*'v'*/ && status
== U_UNSUPPORTED_ERROR
) {
3347 goto cleanupAndReturn
;
3350 if (flags
.indexOf((UChar
)0x45) >= 0) { // flags contain 'E'
3351 // Expected pattern compilation error.
3352 if (flags
.indexOf((UChar
)0x64) >= 0) { // flags contain 'd'
3353 logln("Pattern Compile returns \"%s\"", u_errorName(status
));
3355 goto cleanupAndReturn
;
3357 // Unexpected pattern compilation error.
3358 dataerrln("Line %d: error %s compiling pattern.", line
, u_errorName(status
));
3359 goto cleanupAndReturn
;
// Second compile, this time from a UTF-8 UText.  The converter is set to stop on
// text it cannot convert.
3363 UTF8Converter
= ucnv_open("UTF8", &status
);
3364 ucnv_setFromUCallBack(UTF8Converter
, UCNV_FROM_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &status
);
// Preflight with a NULL buffer to learn the UTF-8 length, then convert for real into
// a buffer of that length plus one.
3366 patternUTF8Length
= pattern
.extract(NULL
, 0, UTF8Converter
, status
);
3367 status
= U_ZERO_ERROR
; // buffer overflow
3368 patternChars
= new char[patternUTF8Length
+1];
3369 pattern
.extract(patternChars
, patternUTF8Length
+1, UTF8Converter
, status
);
3370 utext_openUTF8(&patternText
, patternChars
, patternUTF8Length
, &status
);
// The UTF-8 pattern is only compiled when conversion and utext_openUTF8 left status
// at U_ZERO_ERROR; otherwise UTF8Pattern stays NULL and the UTF-8 checks are skipped.
3372 if (status
== U_ZERO_ERROR
) {
3373 UTF8Pattern
= RegexPattern::compile(&patternText
, bflags
, pe
, status
);
3375 if (status
!= U_ZERO_ERROR
) {
3376 #if UCONFIG_NO_BREAK_ITERATION==1
3377 // 'v' test flag means that the test pattern should not compile if ICU was configured
3378 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3379 if (flags
.indexOf((UChar
)0x76) >= 0 /*'v'*/ && status
== U_UNSUPPORTED_ERROR
) {
3380 goto cleanupAndReturn
;
3383 if (flags
.indexOf((UChar
)0x45) >= 0) { // flags contain 'E'
3384 // Expected pattern compilation error.
3385 if (flags
.indexOf((UChar
)0x64) >= 0) { // flags contain 'd'
3386 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status
));
3388 goto cleanupAndReturn
;
3390 // Unexpected pattern compilation error.
3391 errln("Line %d: error %s compiling pattern. (UTF8)", line
, u_errorName(status
));
3392 goto cleanupAndReturn
;
3397 if (UTF8Pattern
== NULL
) {
3398 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3399 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath
, line
);
3400 status
= U_ZERO_ERROR
;
// 'd' asks for a dump of the compiled pattern.
3403 if (flags
.indexOf((UChar
)0x64) >= 0) { // 'd' flag
3404 RegexPatternDump(callerPattern
);
// Reaching this point means compilation succeeded; with 'E' that is itself the failure.
3407 if (flags
.indexOf((UChar
)0x45) >= 0) { // 'E' flag
3408 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath
, line
);
3409 goto cleanupAndReturn
;
3414 // Number of times find() should be called on the test string, default to 1
// A digit flag 2..9 sets that count; at most one digit flag is allowed.
3417 for (i
=2; i
<=9; i
++) {
3418 if (flags
.indexOf((UChar
)(0x30 + i
)) >= 0) { // digit flag
3419 if (numFinds
!= 1) {
3420 errln("Line %d: more than one digit flag. Scanning %d.", line
, i
);
3421 goto cleanupAndReturn
;
3427 // 'M' flag. Use matches() instead of find()
3428 if (flags
.indexOf((UChar
)0x4d) >= 0) {
3429 useMatchesFunc
= TRUE
;
// 'L' flag (0x4c): use lookingAt() instead of find().
3431 if (flags
.indexOf((UChar
)0x4c) >= 0) {
3432 useLookingAtFunc
= TRUE
;
3436 // Find the tags in the input data, remove them, and record the group boundary
// parsePat matches <n>, </n>, <r> and </r>: group 1 is "/" for a closing tag,
// group 2 is "r" or the decimal group number.
3439 parsePat
= RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe
, status
);
3440 REGEX_CHECK_STATUS_L(line
);
3442 unEscapedInput
= inputString
.unescape();
3443 parseMatcher
= parsePat
->matcher(unEscapedInput
, status
);
3444 REGEX_CHECK_STATUS_L(line
);
3445 while(parseMatcher
->find()) {
// Copy the text before the tag into deTaggedInput and replace the tag with nothing,
// so deTaggedInput.length() is the tag's position in the tag-free text.
3446 parseMatcher
->appendReplacement(deTaggedInput
, "", status
);
3448 UnicodeString groupNum
= parseMatcher
->group(2, status
);
3449 if (groupNum
== "r") {
3450 // <r> or </r>, a region specification within the string
3451 if (parseMatcher
->group(1, status
) == "/") {
3452 regionEnd
= deTaggedInput
.length();
3454 regionStart
= deTaggedInput
.length();
3457 // <digits> or </digits>, a group match boundary tag.
3458 if (parseMatcher
->group(1, status
) == "/") {
3459 set(groupEnds
, deTaggedInput
.length(), groupNum
);
3461 set(groupStarts
, deTaggedInput
.length(), groupNum
);
// Copy whatever follows the last tag.
3465 parseMatcher
->appendTail(deTaggedInput
);
3466 REGEX_ASSERT_L(groupStarts
.size() == groupEnds
.size(), line
);
// If either region tag was seen, both must be present and in order.
3467 if ((regionStart
>=0 || regionEnd
>=0) && (regionStart
<0 || regionStart
>regionEnd
)) {
3468 errln("mismatched <r> tags");
3470 goto cleanupAndReturn
;
3474 // Configure the matcher according to the flags specified with this test.
3476 matcher
= callerPattern
->matcher(deTaggedInput
, status
);
3477 REGEX_CHECK_STATUS_L(line
);
3478 if (flags
.indexOf((UChar
)0x74) >= 0) { // 't' trace flag
3479 matcher
->setTrace(TRUE
);
// Build the UTF-8 matcher the same way the UTF-8 pattern was built: preflight the
// length, convert, open a UText over the bytes.
3482 if (UTF8Pattern
!= NULL
) {
3483 inputUTF8Length
= deTaggedInput
.extract(NULL
, 0, UTF8Converter
, status
);
3484 status
= U_ZERO_ERROR
; // buffer overflow
3485 inputChars
= new char[inputUTF8Length
+1];
3486 deTaggedInput
.extract(inputChars
, inputUTF8Length
+1, UTF8Converter
, status
);
3487 utext_openUTF8(&inputText
, inputChars
, inputUTF8Length
, &status
);
3489 if (status
== U_ZERO_ERROR
) {
3490 UTF8Matcher
= &UTF8Pattern
->matcher(status
)->reset(&inputText
);
3491 REGEX_CHECK_STATUS_L(line
);
3494 if (UTF8Matcher
== NULL
) {
3495 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3496 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath
, line
);
3497 status
= U_ZERO_ERROR
;
3502 // Generate native indices for UTF8 versions of region and capture group info
3504 if (UTF8Matcher
!= NULL
) {
// The return value is deliberately discarded for the region bounds.
3505 if (regionStart
>=0) (void) utextOffsetToNative(&inputText
, regionStart
, regionStartUTF8
);
3506 if (regionEnd
>=0) (void) utextOffsetToNative(&inputText
, regionEnd
, regionEndUTF8
);
3508 // Fill out the native index UVector info.
3509 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3510 for (i
=0; i
<groupStarts
.size(); i
++) {
3511 int32_t start
= groupStarts
.elementAti(i
);
3512 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3515 if (!utextOffsetToNative(&inputText
, start
, startUTF8
)) {
3516 errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line
, i
, start
);
3518 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3520 setInt(groupStartsUTF8
, startUTF8
, i
);
3523 int32_t end
= groupEnds
.elementAti(i
);
3524 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3527 if (!utextOffsetToNative(&inputText
, end
, endUTF8
)) {
3528 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line
, i
, end
);
3530 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3532 setInt(groupEndsUTF8
, endUTF8
, i
);
// Apply the region from the <r> tags to both matchers.
3537 if (regionStart
>=0) {
3538 matcher
->region(regionStart
, regionEnd
, status
);
3539 REGEX_CHECK_STATUS_L(line
);
3540 if (UTF8Matcher
!= NULL
) {
3541 UTF8Matcher
->region(regionStartUTF8
, regionEndUTF8
, status
);
3542 REGEX_CHECK_STATUS_L(line
);
// 'a' turns anchoring bounds off.
3545 if (flags
.indexOf((UChar
)0x61) >= 0) { // 'a' anchoring bounds flag
3546 matcher
->useAnchoringBounds(FALSE
);
3547 if (UTF8Matcher
!= NULL
) {
3548 UTF8Matcher
->useAnchoringBounds(FALSE
);
// 'b' turns transparent bounds on.
3551 if (flags
.indexOf((UChar
)0x62) >= 0) { // 'b' transparent bounds flag
3552 matcher
->useTransparentBounds(TRUE
);
3553 if (UTF8Matcher
!= NULL
) {
3554 UTF8Matcher
->useTransparentBounds(TRUE
);
3561 // Do a find on the de-tagged input using the caller's pattern
3562 // TODO: error on count>1 and not find().
3563 // error on both matches() and lookingAt().
// The chosen match function is called numFinds times; only the result of the last
// call is kept in isMatch / isUTF8Match.
3565 for (i
=0; i
<numFinds
; i
++) {
3566 if (useMatchesFunc
) {
3567 isMatch
= matcher
->matches(status
);
3568 if (UTF8Matcher
!= NULL
) {
3569 isUTF8Match
= UTF8Matcher
->matches(status
);
3571 } else if (useLookingAtFunc
) {
3572 isMatch
= matcher
->lookingAt(status
);
3573 if (UTF8Matcher
!= NULL
) {
3574 isUTF8Match
= UTF8Matcher
->lookingAt(status
);
3577 isMatch
= matcher
->find();
3578 if (UTF8Matcher
!= NULL
) {
3579 isUTF8Match
= UTF8Matcher
->find();
3583 matcher
->setTrace(FALSE
);
3586 // Match up the groups from the find() with the groups from the tags
3589 // number of tags should match number of groups from find operation.
3590 // matcher->groupCount does not include group 0, the entire match, hence the +1.
3591 // G option in test means that capture group data is not available in the
3592 // expected results, so the check needs to be suppressed.
// A non-empty groupStarts means the test data contained group tags, i.e. a match
// was expected.
3593 if (isMatch
== FALSE
&& groupStarts
.size() != 0) {
3594 dataerrln("Error at line %d: Match expected, but none found.", line
);
3596 goto cleanupAndReturn
;
3597 } else if (UTF8Matcher
!= NULL
&& isUTF8Match
== FALSE
&& groupStarts
.size() != 0) {
3598 errln("Error at line %d: Match expected, but none found. (UTF8)", line
);
3600 goto cleanupAndReturn
;
3603 if (flags
.indexOf((UChar
)0x47 /*G*/) >= 0) {
3604 // Only check for match / no match. Don't check capture groups.
3605 if (isMatch
&& groupStarts
.size() == 0) {
3606 errln("Error at line %d: No match expected, but one found.", line
);
3608 } else if (UTF8Matcher
!= NULL
&& isUTF8Match
&& groupStarts
.size() == 0) {
3609 errln("Error at line %d: No match expected, but one found. (UTF8)", line
);
3612 goto cleanupAndReturn
;
3615 REGEX_CHECK_STATUS_L(line
);
// Compare every group reported by the matcher (0 = whole match) with the expected
// boundaries.  A group with no recorded tag is expected to report -1.
3616 for (i
=0; i
<=matcher
->groupCount(); i
++) {
3617 int32_t expectedStart
= (i
>= groupStarts
.size()? -1 : groupStarts
.elementAti(i
));
3618 int32_t expectedStartUTF8
= (i
>= groupStartsUTF8
.size()? -1 : groupStartsUTF8
.elementAti(i
));
3619 if (matcher
->start(i
, status
) != expectedStart
) {
3620 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
3621 line
, i
, expectedStart
, matcher
->start(i
, status
));
3623 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3624 } else if (UTF8Matcher
!= NULL
&& UTF8Matcher
->start(i
, status
) != expectedStartUTF8
) {
3625 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
3626 line
, i
, expectedStartUTF8
, UTF8Matcher
->start(i
, status
));
3628 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3631 int32_t expectedEnd
= (i
>= groupEnds
.size()? -1 : groupEnds
.elementAti(i
));
3632 int32_t expectedEndUTF8
= (i
>= groupEndsUTF8
.size()? -1 : groupEndsUTF8
.elementAti(i
));
3633 if (matcher
->end(i
, status
) != expectedEnd
) {
3634 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
3635 line
, i
, expectedEnd
, matcher
->end(i
, status
));
3637 // Error on end position; keep going; real error is probably yet to come as group
3638 // end positions work from end of the input data towards the front.
3639 } else if (UTF8Matcher
!= NULL
&& UTF8Matcher
->end(i
, status
) != expectedEndUTF8
) {
3640 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
3641 line
, i
, expectedEndUTF8
, UTF8Matcher
->end(i
, status
));
3643 // Error on end position; keep going; real error is probably yet to come as group
3644 // end positions work from end of the input data towards the front.
// The test data must not reference more groups than the pattern defines.
3647 if ( matcher
->groupCount()+1 < groupStarts
.size()) {
3648 errln("Error at line %d: Expected %d capture groups, found %d.",
3649 line
, groupStarts
.size()-1, matcher
->groupCount());
3652 else if (UTF8Matcher
!= NULL
&& UTF8Matcher
->groupCount()+1 < groupStarts
.size()) {
3653 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3654 line
, groupStarts
.size()-1, UTF8Matcher
->groupCount());
// The four checks below verify requireEnd() and hitEnd() against the Y/y and Z/z flags.
3658 if ((flags
.indexOf((UChar
)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3659 matcher
->requireEnd() == TRUE
) {
3660 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line
);
3662 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3663 UTF8Matcher
->requireEnd() == TRUE
) {
3664 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line
);
3668 if ((flags
.indexOf((UChar
)0x79) >= 0) && // 'y' flag: RequireEnd() == true
3669 matcher
->requireEnd() == FALSE
) {
3670 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line
);
3672 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x79) >= 0) && // 'y' flag: RequireEnd() == true
3673 UTF8Matcher
->requireEnd() == FALSE
) {
3674 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line
);
3678 if ((flags
.indexOf((UChar
)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3679 matcher
->hitEnd() == TRUE
) {
3680 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line
);
3682 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3683 UTF8Matcher
->hitEnd() == TRUE
) {
3684 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line
);
3688 if ((flags
.indexOf((UChar
)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3689 matcher
->hitEnd() == FALSE
) {
3690 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line
);
3692 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3693 UTF8Matcher
->hitEnd() == FALSE
) {
3694 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line
);
// Echo the test (pattern, flags, original input) to the informational log.
// NOTE(review): the condition guarding this call is not visible in this excerpt;
// presumably it depends on `failed` - confirm.
3701 infoln((UnicodeString
)"\""+pattern
+(UnicodeString
)"\" "
3702 +flags
+(UnicodeString
)" \""+inputString
+(UnicodeString
)"\"");
3703 // callerPattern->dump();
// Release the objects created above.  Each UText is closed before the character
// buffer it was opened over is freed.
3705 delete parseMatcher
;
3710 delete callerPattern
;
3712 utext_close(&inputText
);
3713 delete[] inputChars
;
3714 utext_close(&patternText
);
3715 delete[] patternChars
;
3716 ucnv_close(UTF8Converter
);
3722 //---------------------------------------------------------------------------
3724 // Errors Check for error handling in patterns.
3726 //---------------------------------------------------------------------------
// Errors: feeds malformed patterns to the regex compiler and checks the error reported.
// NOTE(review): REGEX_ERR is defined outside this excerpt; its arguments appear to be
// (pattern, expected error line, expected error offset, expected UErrorCode) - confirm
// against the macro definition.
3727 void RegexTest::Errors() {
3728 // \escape sequences that aren't implemented yet.
3729 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3731 // Missing close parentheses
3732 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN
);
3733 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN
);
3734 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN
);
3736 // Extra close paren
3737 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN
);
3738 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN
);
3739 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN
);
3741 // Look-ahead, Look-behind
3742 // TODO: add tests for unbounded length look-behinds.
3743 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX
); // illegal construct
3745 // Attempt to use non-default flags
// Compiling with a flag set that includes UREGEX_CANON_EQ is expected to be rejected
// with U_REGEX_UNIMPLEMENTED.
3748 UErrorCode status
= U_ZERO_ERROR
;
3749 int32_t flags
= UREGEX_CANON_EQ
|
3750 UREGEX_COMMENTS
| UREGEX_DOTALL
|
3752 RegexPattern
*pat1
= RegexPattern::compile(".*", flags
, pe
, status
);
3753 REGEX_ASSERT(status
== U_REGEX_UNIMPLEMENTED
);
3758 // Quantifiers are allowed only after something that can be quantified.
3759 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX
);
// The pattern contains a newline, so this error is expected on line 2.
3760 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX
);
3761 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX
);
3763 // Mal-formed {min,max} quantifiers
3764 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL
);
3765 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN
);
3766 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL
);
3767 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL
);
3768 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL
);
3769 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG
);
3770 REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG
); // Overflows int during scan
3771 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG
); // Overflows regex binary format
3772 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG
);
// A quantifier at the very start of the pattern has nothing to apply to.
3775 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX
);
3777 // Invalid Back Reference \0
3778 // For ICU 3.8 and earlier
3779 // For ICU versions newer than 3.8, \0 introduces an octal escape.
3781 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE
);
3786 //-------------------------------------------------------------------------------
3788 // Read a text data file, convert it to UChars, and return the data
3789 // in one big UChar * buffer, which the caller must delete.
3791 //--------------------------------------------------------------------------------
// Reads fileName in binary mode and converts its contents to UChars.
//
//   fileName     Path of the file to read.
//   ulen         Output: receives the value returned by ucnv_toUChars().
//   defEncoding  Converter name used when the file has no Unicode signature (BOM).
//   status       ICU error code; set to U_FILE_ACCESS_ERROR when the file cannot be
//                opened, and checked on entry and after the converter calls.
//
// NOTE(review): several declarations, the second conversion call, the cleanup section
// and the return statement are not visible in this excerpt.
3792 UChar
*RegexTest::ReadAndConvertFile(const char *fileName
, int32_t &ulen
,
3793 const char *defEncoding
, UErrorCode
&status
) {
3794 UChar
*retPtr
= NULL
;
3795 char *fileBuf
= NULL
;
3796 UConverter
* conv
= NULL
;
3800 if (U_FAILURE(status
)) {
3807 f
= fopen(fileName
, "rb");
3809 dataerrln("Error opening test data file %s\n", fileName
);
3810 status
= U_FILE_ACCESS_ERROR
;
// Find the file size by seeking to the end, then rewind and read the whole file.
// NOTE(review): fileBuf is allocated before fileSize is validated; the
// fileSize <= 0 test only happens after the fread() below, so a failed ftell()
// would reach `new char[fileSize]` with a negative size.
3819 fseek( f
, 0, SEEK_END
);
3820 fileSize
= ftell(f
);
3821 fileBuf
= new char[fileSize
];
3822 fseek(f
, 0, SEEK_SET
);
3823 amt_read
= fread(fileBuf
, 1, fileSize
, f
);
3824 if (amt_read
!= fileSize
|| fileSize
<= 0) {
3825 errln("Error reading test data file.");
3826 goto cleanUpAndReturn
;
3830 // Look for a Unicode Signature (BOM) on the data just read
3832 int32_t signatureLength
;
3833 const char * fileBufC
;
3834 const char* encoding
;
3837 encoding
= ucnv_detectUnicodeSignature(
3838 fileBuf
, fileSize
, &signatureLength
, &status
);
// A signature was found: step over it and use the encoding it names.
3839 if(encoding
!=NULL
){
3840 fileBufC
+= signatureLength
;
3841 fileSize
-= signatureLength
;
// No signature: fall back to the caller's default encoding.  A file read as "utf-8"
// is required to carry a BOM, so its absence is reported.
3843 encoding
= defEncoding
;
3844 if (strcmp(encoding
, "utf-8") == 0) {
3845 errln("file %s is missing its BOM", fileName
);
3850 // Open a converter to take the rule file to UTF-16
3852 conv
= ucnv_open(encoding
, &status
);
3853 if (U_FAILURE(status
)) {
3854 goto cleanUpAndReturn
;
3858 // Convert the rules to UChar.
3859 // Preflight first to determine required buffer size.
3861 ulen
= ucnv_toUChars(conv
,
3867 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
3868 // Buffer Overflow is expected from the preflight operation.
3869 status
= U_ZERO_ERROR
;
// Allocate room for ulen UChars plus one extra.
3871 retPtr
= new UChar
[ulen
+1];
3884 if (U_FAILURE(status
)) {
3885 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
3894 //-------------------------------------------------------------------------------
3896 // PerlTests - Run Perl's regular expression tests
3897 // The input file for this test is re_tests, the standard regular
3898 // expression test data distributed with the Perl source code.
3900 // Here is Perl's description of the test data file:
3902 // # The tests are in a separate file 't/op/re_tests'.
3903 // # Each line in that file is a separate test.
3904 // # There are five columns, separated by tabs.
3906 // # Column 1 contains the pattern, optionally enclosed in C<''>.
3907 // # Modifiers can be put after the closing C<'>.
3909 // # Column 2 contains the string to be matched.
3911 // # Column 3 contains the expected result:
3912 // # y expect a match
3913 // # n expect no match
3914 // # c expect an error
3915 // # B test exposes a known bug in Perl, should be skipped
3916 // # b test exposes a known bug in Perl, should be skipped if noamp
3918 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3920 // # Column 4 contains a string, usually C<$&>.
3922 // # Column 5 contains the expected result of double-quote
3923 // # interpolating that string after the match, or start of error message.
3925 // # Column 6, if present, contains a reason why the test is skipped.
3926 // # This is printed with "skipped", for harness to pick up.
3928 // # \n in the tests are interpolated, as are variables of the form ${\w+}.
3930 // # If you want to add a regular expression test that can't be expressed
3931 // # in this format, don't add it here: put it in op/pat.t instead.
3933 // For ICU, if field 3 contains an 'i', the test will be skipped.
3934 // The test exposes some known incompatibility between ICU and Perl regexps.
3935 // (The i is in addition to whatever was there before.)
3937 //-------------------------------------------------------------------------------
3938 void RegexTest::PerlTests() {
3940 const char *srcPath
;
3941 UErrorCode status
= U_ZERO_ERROR
;
3945 // Open and read the test data file.
3947 srcPath
=getPath(tdd
, "re_tests.txt");
3949 return; /* something went wrong, error already output */
3953 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, "iso-8859-1", status
);
3954 if (U_FAILURE(status
)) {
3955 return; /* something went wrong, error already output */
3959 // Put the test data into a UnicodeString
3961 UnicodeString
testDataString(FALSE
, testData
, len
);
3964 // Regex to break the input file into lines, and strip the new lines.
3965 // One line per match, capture group one is the desired data.
3967 RegexPattern
* linePat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe
, status
);
3968 if (U_FAILURE(status
)) {
3969 dataerrln("RegexPattern::compile() error");
3972 RegexMatcher
* lineMat
= linePat
->matcher(testDataString
, status
);
3975 // Regex to split a test file line into fields.
3976 // There are six fields, separated by tabs.
3978 RegexPattern
* fieldPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe
, status
);
3981 // Regex to identify test patterns with flag settings, and to separate them.
3982 // Test patterns with flags look like 'pattern'i
3983 // Test patterns without flags are not quoted: pattern
3984 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
3986 RegexPattern
*flagPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe
, status
);
3987 RegexMatcher
* flagMat
= flagPat
->matcher(status
);
3990 // The Perl tests reference several perl-isms, which are evaluated/substituted
3991 // in the test data. Not being perl, this must be done explicitly. Here
3992 // are string constants and REs for these constructs.
3994 UnicodeString
nulnulSrc("${nulnul}");
3995 UnicodeString
nulnul("\\u0000\\u0000", -1, US_INV
);
3996 nulnul
= nulnul
.unescape();
3998 UnicodeString
ffffSrc("${ffff}");
3999 UnicodeString
ffff("\\uffff", -1, US_INV
);
4000 ffff
= ffff
.unescape();
4002 // regexp for $-[0], $+[2], etc.
4003 RegexPattern
*groupsPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe
, status
);
4004 RegexMatcher
*groupsMat
= groupsPat
->matcher(status
);
4006 // regexp for $0, $1, $2, etc.
4007 RegexPattern
*cgPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe
, status
);
4008 RegexMatcher
*cgMat
= cgPat
->matcher(status
);
4012 // Main Loop for the Perl Tests, runs once per line from the
4015 int32_t lineNum
= 0;
4016 int32_t skippedUnimplementedCount
= 0;
4017 while (lineMat
->find()) {
4021 // Get a line, break it into its fields, do the Perl
4022 // variable substitutions.
4024 UnicodeString line
= lineMat
->group(1, status
);
4025 UnicodeString fields
[7];
4026 fieldPat
->split(line
, fields
, 7, status
);
4028 flagMat
->reset(fields
[0]);
4029 flagMat
->matches(status
);
4030 UnicodeString pattern
= flagMat
->group(2, status
);
4031 pattern
.findAndReplace("${bang}", "!");
4032 pattern
.findAndReplace(nulnulSrc
, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4033 pattern
.findAndReplace(ffffSrc
, ffff
);
4036 // Identify patterns that include match flag settings,
4037 // split off the flags, remove the extra quotes.
4039 UnicodeString flagStr
= flagMat
->group(3, status
);
4040 if (U_FAILURE(status
)) {
4041 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
4045 const UChar UChar_c
= 0x63; // Char constants for the flag letters.
4046 const UChar UChar_i
= 0x69; // (Damn the lack of Unicode support in C)
4047 const UChar UChar_m
= 0x6d;
4048 const UChar UChar_x
= 0x78;
4049 const UChar UChar_y
= 0x79;
4050 if (flagStr
.indexOf(UChar_i
) != -1) {
4051 flags
|= UREGEX_CASE_INSENSITIVE
;
4053 if (flagStr
.indexOf(UChar_m
) != -1) {
4054 flags
|= UREGEX_MULTILINE
;
4056 if (flagStr
.indexOf(UChar_x
) != -1) {
4057 flags
|= UREGEX_COMMENTS
;
4061 // Compile the test pattern.
4063 status
= U_ZERO_ERROR
;
4064 RegexPattern
*testPat
= RegexPattern::compile(pattern
, flags
, pe
, status
);
4065 if (status
== U_REGEX_UNIMPLEMENTED
) {
4067 // Test of a feature that is planned for ICU, but not yet implemented.
4069 skippedUnimplementedCount
++;
4071 status
= U_ZERO_ERROR
;
4075 if (U_FAILURE(status
)) {
4076 // Some tests are supposed to generate errors.
4077 // Only report an error for tests that are supposed to succeed.
4078 if (fields
[2].indexOf(UChar_c
) == -1 && // Compilation is not supposed to fail AND
4079 fields
[2].indexOf(UChar_i
) == -1) // it's not an accepted ICU incompatibility
4081 errln("line %d: ICU Error \"%s\"\n", lineNum
, u_errorName(status
));
4083 status
= U_ZERO_ERROR
;
4088 if (fields
[2].indexOf(UChar_i
) >= 0) {
4089 // ICU should skip this test.
4094 if (fields
[2].indexOf(UChar_c
) >= 0) {
4095 // This pattern should have caused a compilation error, but didn't.
4096 errln("line %d: Expected a pattern compile error, got success.", lineNum
);
4102 // replace the Perl variables that appear in some of the
4103 // match data strings.
4105 UnicodeString matchString
= fields
[1];
4106 matchString
.findAndReplace(nulnulSrc
, nulnul
);
4107 matchString
.findAndReplace(ffffSrc
, ffff
);
4109 // Replace any \n in the match string with an actual new-line char.
4110 // Don't do full unescape, as this unescapes more than Perl does, which
4111 // causes other spurious failures in the tests.
4112 matchString
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4117 // Run the test, check for expected match/don't match result.
4119 RegexMatcher
*testMat
= testPat
->matcher(matchString
, status
);
4120 UBool found
= testMat
->find();
4121 UBool expected
= FALSE
;
4122 if (fields
[2].indexOf(UChar_y
) >=0) {
4125 if (expected
!= found
) {
4126 errln("line %d: Expected %smatch, got %smatch",
4127 lineNum
, expected
?"":"no ", found
?"":"no " );
4131 // Don't try to check expected results if there is no match.
4132 // (Some have stuff in the expected fields)
4140 // Interpret the Perl expression from the fourth field of the data file,
4141 // building up an ICU string from the results of the ICU match.
4142 // The Perl expression will contain references to the results of
4143 // a regex match, including the matched string, capture group strings,
4144 // group starting and ending indicies, etc.
4146 UnicodeString resultString
;
4147 UnicodeString perlExpr
= fields
[3];
4148 #if SUPPORT_MUTATING_INPUT_STRING
4149 groupsMat
->reset(perlExpr
);
4150 cgMat
->reset(perlExpr
);
4153 while (perlExpr
.length() > 0) {
4154 #if !SUPPORT_MUTATING_INPUT_STRING
4155 // Perferred usage. Reset after any modification to input string.
4156 groupsMat
->reset(perlExpr
);
4157 cgMat
->reset(perlExpr
);
4160 if (perlExpr
.startsWith("$&")) {
4161 resultString
.append(testMat
->group(status
));
4162 perlExpr
.remove(0, 2);
4165 else if (groupsMat
->lookingAt(status
)) {
4167 UnicodeString digitString
= groupsMat
->group(2, status
);
4169 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4170 UnicodeString plusOrMinus
= groupsMat
->group(1, status
);
4171 int32_t matchPosition
;
4172 if (plusOrMinus
.compare("+") == 0) {
4173 matchPosition
= testMat
->end(groupNum
, status
);
4175 matchPosition
= testMat
->start(groupNum
, status
);
4177 if (matchPosition
!= -1) {
4178 ICU_Utility::appendNumber(resultString
, matchPosition
);
4180 perlExpr
.remove(0, groupsMat
->end(status
));
4183 else if (cgMat
->lookingAt(status
)) {
4185 UnicodeString digitString
= cgMat
->group(1, status
);
4187 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4188 if (U_SUCCESS(status
)) {
4189 resultString
.append(testMat
->group(groupNum
, status
));
4190 status
= U_ZERO_ERROR
;
4192 perlExpr
.remove(0, cgMat
->end(status
));
4195 else if (perlExpr
.startsWith("@-")) {
4197 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4199 resultString
.append(" ");
4201 ICU_Utility::appendNumber(resultString
, testMat
->start(i
, status
));
4203 perlExpr
.remove(0, 2);
4206 else if (perlExpr
.startsWith("@+")) {
4208 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4210 resultString
.append(" ");
4212 ICU_Utility::appendNumber(resultString
, testMat
->end(i
, status
));
4214 perlExpr
.remove(0, 2);
4217 else if (perlExpr
.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4218 // or as an escaped sequence (e.g. \n)
4219 if (perlExpr
.length() > 1) {
4220 perlExpr
.remove(0, 1); // Remove the '\', but only if not last char.
4222 UChar c
= perlExpr
.charAt(0);
4224 case 'n': c
= '\n'; break;
4225 // add any other escape sequences that show up in the test expected results.
4227 resultString
.append(c
);
4228 perlExpr
.remove(0, 1);
4232 // Any characters from the perl expression that we don't explicitly
4233 // recognize before here are assumed to be literals and copied
4234 // as-is to the expected results.
4235 resultString
.append(perlExpr
.charAt(0));
4236 perlExpr
.remove(0, 1);
4239 if (U_FAILURE(status
)) {
4240 errln("Line %d: ICU Error \"%s\"", lineNum
, u_errorName(status
));
4246 // Expected Results Compare
4248 UnicodeString
expectedS(fields
[4]);
4249 expectedS
.findAndReplace(nulnulSrc
, nulnul
);
4250 expectedS
.findAndReplace(ffffSrc
, ffff
);
4251 expectedS
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4254 if (expectedS
.compare(resultString
) != 0) {
4255 err("Line %d: Incorrect perl expression results.", lineNum
);
4256 infoln((UnicodeString
)"Expected \""+expectedS
+(UnicodeString
)"\"; got \""+resultString
+(UnicodeString
)"\"");
4264 // All done. Clean up allocated stuff.
4282 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount
);
4287 //-------------------------------------------------------------------------------
4289 // PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts
4290 // (instead of using UnicodeStrings) to test the alternate engine.
4291 // The input file for this test is re_tests, the standard regular
4292 // expression test data distributed with the Perl source code.
4293 // See PerlTests() for more information.
4295 //-------------------------------------------------------------------------------
// PerlTestsUTF8: runs the Perl regular-expression test data in re_tests.txt
// with both the pattern and the match input supplied as UTF-8 UTexts.
// The data file is read as iso-8859-1, split into lines and tab-separated
// fields, and each pattern / input string is converted to UTF-8 through
// UTF8Converter into a char buffer that is reused across test lines.
4296 void RegexTest::PerlTestsUTF8() {
4298 const char *srcPath
;
4299 UErrorCode status
= U_ZERO_ERROR
;
4301 LocalUConverterPointer
UTF8Converter(ucnv_open("UTF-8", &status
));
4302 UText patternText
= UTEXT_INITIALIZER
;
4303 char *patternChars
= NULL
;
4304 int32_t patternLength
;
4305 int32_t patternCapacity
= 0;
4306 UText inputText
= UTEXT_INITIALIZER
;
4307 char *inputChars
= NULL
;
4308 int32_t inputLength
;
4309 int32_t inputCapacity
= 0;
// Install UCNV_FROM_U_CALLBACK_STOP as the from-Unicode callback of the UTF-8 converter.
4311 ucnv_setFromUCallBack(UTF8Converter
.getAlias(), UCNV_FROM_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &status
);
4314 // Open and read the test data file.
4316 srcPath
=getPath(tdd
, "re_tests.txt");
4318 return; /* something went wrong, error already output */
4322 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, "iso-8859-1", status
);
4323 if (U_FAILURE(status
)) {
4324 return; /* something went wrong, error already output */
4328 // Put the test data into a UnicodeString
4330 UnicodeString
testDataString(FALSE
, testData
, len
);
4333 // Regex to break the input file into lines, and strip the new lines.
4334 // One line per match, capture group one is the desired data.
4336 RegexPattern
* linePat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe
, status
);
4337 if (U_FAILURE(status
)) {
4338 dataerrln("RegexPattern::compile() error");
4341 RegexMatcher
* lineMat
= linePat
->matcher(testDataString
, status
);
4344 // Regex to split a test file line into fields.
4345 // There are six fields, separated by tabs.
4347 RegexPattern
* fieldPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe
, status
);
4350 // Regex to identify test patterns with flag settings, and to separate them.
4351 // Test patterns with flags look like 'pattern'i
4352 // Test patterns without flags are not quoted: pattern
4353 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4355 RegexPattern
*flagPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe
, status
);
4356 RegexMatcher
* flagMat
= flagPat
->matcher(status
);
4359 // The Perl tests reference several perl-isms, which are evaluated/substituted
4360 // in the test data. Not being perl, this must be done explicitly. Here
4361 // are string constants and REs for these constructs.
4363 UnicodeString
nulnulSrc("${nulnul}");
4364 UnicodeString
nulnul("\\u0000\\u0000", -1, US_INV
);
4365 nulnul
= nulnul
.unescape();
4367 UnicodeString
ffffSrc("${ffff}");
4368 UnicodeString
ffff("\\uffff", -1, US_INV
);
4369 ffff
= ffff
.unescape();
4371 // regexp for $-[0], $+[2], etc.
4372 RegexPattern
*groupsPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe
, status
);
4373 RegexMatcher
*groupsMat
= groupsPat
->matcher(status
);
4375 // regexp for $0, $1, $2, etc.
4376 RegexPattern
*cgPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe
, status
);
4377 RegexMatcher
*cgMat
= cgPat
->matcher(status
);
4381 // Main Loop for the Perl Tests, runs once per line from the
4384 int32_t lineNum
= 0;
4385 int32_t skippedUnimplementedCount
= 0;
4386 while (lineMat
->find()) {
4390 // Get a line, break it into its fields, do the Perl
4391 // variable substitutions.
4393 UnicodeString line
= lineMat
->group(1, status
);
4394 UnicodeString fields
[7];
4395 fieldPat
->split(line
, fields
, 7, status
);
4397 flagMat
->reset(fields
[0]);
4398 flagMat
->matches(status
);
4399 UnicodeString pattern
= flagMat
->group(2, status
);
4400 pattern
.findAndReplace("${bang}", "!");
4401 pattern
.findAndReplace(nulnulSrc
, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4402 pattern
.findAndReplace(ffffSrc
, ffff
);
4405 // Identify patterns that include match flag settings,
4406 // split off the flags, remove the extra quotes.
4408 UnicodeString flagStr
= flagMat
->group(3, status
);
// NOTE(review): the message below names ucnv_toUChars, but the status tested
// here was last passed to the split()/flagMat calls just above -- confirm
// whether the message text is a leftover.
4409 if (U_FAILURE(status
)) {
4410 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
4414 const UChar UChar_c
= 0x63; // Char constants for the flag letters.
4415 const UChar UChar_i
= 0x69; // (Damn the lack of Unicode support in C)
4416 const UChar UChar_m
= 0x6d;
4417 const UChar UChar_x
= 0x78;
4418 const UChar UChar_y
= 0x79;
4419 if (flagStr
.indexOf(UChar_i
) != -1) {
4420 flags
|= UREGEX_CASE_INSENSITIVE
;
4422 if (flagStr
.indexOf(UChar_m
) != -1) {
4423 flags
|= UREGEX_MULTILINE
;
4425 if (flagStr
.indexOf(UChar_x
) != -1) {
4426 flags
|= UREGEX_COMMENTS
;
4430 // Put the pattern in a UTF-8 UText
4432 status
= U_ZERO_ERROR
;
// First try to extract into the buffer left over from earlier lines.
4433 patternLength
= pattern
.extract(patternChars
, patternCapacity
, UTF8Converter
.getAlias(), status
);
// On overflow, replace the buffer with one of patternLength + 1 chars and extract again.
4434 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
4435 status
= U_ZERO_ERROR
;
4436 delete[] patternChars
;
4437 patternCapacity
= patternLength
+ 1;
4438 patternChars
= new char[patternCapacity
];
4439 pattern
.extract(patternChars
, patternCapacity
, UTF8Converter
.getAlias(), status
);
4441 utext_openUTF8(&patternText
, patternChars
, patternLength
, &status
);
4444 // Compile the test pattern.
4446 RegexPattern
*testPat
= RegexPattern::compile(&patternText
, flags
, pe
, status
);
4447 if (status
== U_REGEX_UNIMPLEMENTED
) {
4449 // Test of a feature that is planned for ICU, but not yet implemented.
4451 skippedUnimplementedCount
++;
4453 status
= U_ZERO_ERROR
;
4457 if (U_FAILURE(status
)) {
4458 // Some tests are supposed to generate errors.
4459 // Only report an error for tests that are supposed to succeed.
4460 if (fields
[2].indexOf(UChar_c
) == -1 && // Compilation is not supposed to fail AND
4461 fields
[2].indexOf(UChar_i
) == -1) // it's not an accepted ICU incompatibility
4463 errln("line %d: ICU Error \"%s\"\n", lineNum
, u_errorName(status
));
4465 status
= U_ZERO_ERROR
;
4470 if (fields
[2].indexOf(UChar_i
) >= 0) {
4471 // ICU should skip this test.
4476 if (fields
[2].indexOf(UChar_c
) >= 0) {
4477 // This pattern should have caused a compilation error, but didn't.
4478 errln("line %d: Expected a pattern compile error, got success.", lineNum
);
4485 // replace the Perl variables that appear in some of the
4486 // match data strings.
4488 UnicodeString matchString
= fields
[1];
4489 matchString
.findAndReplace(nulnulSrc
, nulnul
);
4490 matchString
.findAndReplace(ffffSrc
, ffff
);
4492 // Replace any \n in the match string with an actual new-line char.
4493 // Don't do full unescape, as this unescapes more than Perl does, which
4494 // causes other spurious failures in the tests.
4495 matchString
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4498 // Put the input in a UTF-8 UText
4500 status
= U_ZERO_ERROR
;
// Same extract / grow-on-overflow / extract-again sequence as used for the pattern.
4501 inputLength
= matchString
.extract(inputChars
, inputCapacity
, UTF8Converter
.getAlias(), status
);
4502 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
4503 status
= U_ZERO_ERROR
;
4504 delete[] inputChars
;
4505 inputCapacity
= inputLength
+ 1;
4506 inputChars
= new char[inputCapacity
];
4507 matchString
.extract(inputChars
, inputCapacity
, UTF8Converter
.getAlias(), status
);
4509 utext_openUTF8(&inputText
, inputChars
, inputLength
, &status
);
4512 // Run the test, check for expected match/don't match result.
4514 RegexMatcher
*testMat
= &testPat
->matcher(status
)->reset(&inputText
);
4515 UBool found
= testMat
->find();
4516 UBool expected
= FALSE
;
4517 if (fields
[2].indexOf(UChar_y
) >=0) {
4520 if (expected
!= found
) {
4521 errln("line %d: Expected %smatch, got %smatch",
4522 lineNum
, expected
?"":"no ", found
?"":"no " );
4526 // Don't try to check expected results if there is no match.
4527 // (Some have stuff in the expected fields)
4535 // Interpret the Perl expression from the fourth field of the data file,
4536 // building up an ICU string from the results of the ICU match.
4537 // The Perl expression will contain references to the results of
4538 // a regex match, including the matched string, capture group strings,
4539 // group starting and ending indices, etc.
4541 UnicodeString resultString
;
4542 UnicodeString perlExpr
= fields
[3];
// Each pass consumes a prefix of perlExpr; the helper matchers are reset
// to the shortened string at the top of every pass.
4544 while (perlExpr
.length() > 0) {
4545 groupsMat
->reset(perlExpr
);
4546 cgMat
->reset(perlExpr
);
4548 if (perlExpr
.startsWith("$&")) {
4549 resultString
.append(testMat
->group(status
));
4550 perlExpr
.remove(0, 2);
4553 else if (groupsMat
->lookingAt(status
)) {
4555 UnicodeString digitString
= groupsMat
->group(2, status
);
4557 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4558 UnicodeString plusOrMinus
= groupsMat
->group(1, status
);
4559 int32_t matchPosition
;
4560 if (plusOrMinus
.compare("+") == 0) {
4561 matchPosition
= testMat
->end(groupNum
, status
);
4563 matchPosition
= testMat
->start(groupNum
, status
);
4565 if (matchPosition
!= -1) {
4566 ICU_Utility::appendNumber(resultString
, matchPosition
);
4568 perlExpr
.remove(0, groupsMat
->end(status
));
4571 else if (cgMat
->lookingAt(status
)) {
4573 UnicodeString digitString
= cgMat
->group(1, status
);
4575 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4576 if (U_SUCCESS(status
)) {
4577 resultString
.append(testMat
->group(groupNum
, status
));
4578 status
= U_ZERO_ERROR
;
4580 perlExpr
.remove(0, cgMat
->end(status
));
4583 else if (perlExpr
.startsWith("@-")) {
4585 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4587 resultString
.append(" ");
4589 ICU_Utility::appendNumber(resultString
, testMat
->start(i
, status
));
4591 perlExpr
.remove(0, 2);
4594 else if (perlExpr
.startsWith("@+")) {
4596 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4598 resultString
.append(" ");
4600 ICU_Utility::appendNumber(resultString
, testMat
->end(i
, status
));
4602 perlExpr
.remove(0, 2);
4605 else if (perlExpr
.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4606 // or as an escaped sequence (e.g. \n)
4607 if (perlExpr
.length() > 1) {
4608 perlExpr
.remove(0, 1); // Remove the '\', but only if not last char.
4610 UChar c
= perlExpr
.charAt(0);
4612 case 'n': c
= '\n'; break;
4613 // add any other escape sequences that show up in the test expected results.
4615 resultString
.append(c
);
4616 perlExpr
.remove(0, 1);
4620 // Any characters from the perl expression that we don't explicitly
4621 // recognize before here are assumed to be literals and copied
4622 // as-is to the expected results.
4623 resultString
.append(perlExpr
.charAt(0));
4624 perlExpr
.remove(0, 1);
4627 if (U_FAILURE(status
)) {
4628 errln("Line %d: ICU Error \"%s\"", lineNum
, u_errorName(status
));
4634 // Expected Results Compare
4636 UnicodeString
expectedS(fields
[4]);
4637 expectedS
.findAndReplace(nulnulSrc
, nulnul
);
4638 expectedS
.findAndReplace(ffffSrc
, ffff
);
4639 expectedS
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4642 if (expectedS
.compare(resultString
) != 0) {
4643 err("Line %d: Incorrect perl expression results.", lineNum
);
4644 infoln((UnicodeString
)"Expected \""+expectedS
+(UnicodeString
)"\"; got \""+resultString
+(UnicodeString
)"\"");
4652 // All done. Clean up allocated stuff.
4669 utext_close(&patternText
);
4670 utext_close(&inputText
);
4672 delete [] patternChars
;
4673 delete [] inputChars
;
4676 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount
);
4681 //--------------------------------------------------------------
4683 // Bug6149 Verify limits to heap expansion for backtrack stack.
4684 // Use this pattern,
4686 // The zero-length match will repeat forever.
4687 // (That this goes into a loop is another bug)
4689 //---------------------------------------------------------------
// Matches the pattern "(a?){1,}" against "xyz" and asserts that matches()
// reports U_REGEX_STACK_OVERFLOW and that the stored result is FALSE.
4690 void RegexTest::Bug6149() {
4691 UnicodeString
pattern("(a?){1,}");
4692 UnicodeString
s("xyz");
4694 UErrorCode status
= U_ZERO_ERROR
;
4696 RegexMatcher
matcher(pattern
, s
, flags
, status
);
4697 UBool result
= false;
// The assignment happens inside the macro argument; result is checked separately below.
4698 REGEX_ASSERT_FAIL(result
=matcher
.matches(status
), U_REGEX_STACK_OVERFLOW
);
4699 REGEX_ASSERT(result
== FALSE
);
4704 // Callbacks() Test the callback function.
4705 // When set, callbacks occur periodically during matching operations,
4706 // giving the application code the ability to abort the operation
4707 // before its normal completion.
// State shared between Callbacks() and testCallBackFn(); a pointer to it is
// passed as the callback context.
4710 struct callBackContext
{
// reset(): store the new call limit in maxCalls and zero numCalls and lastSteps.
4715 void reset(int32_t max
) {maxCalls
=max
; numCalls
=0; lastSteps
=0;};
// Match callback used by Callbacks().
//   context  pointer to a callBackContext
//   steps    step count supplied by the caller; must equal lastSteps + 1,
//            otherwise an error is reported through info->test
// Records steps in lastSteps and returns whether numCalls is still below maxCalls.
4719 static UBool U_CALLCONV
4720 testCallBackFn(const void *context
, int32_t steps
) {
4721 callBackContext
*info
= (callBackContext
*)context
;
4722 if (info
->lastSteps
+1 != steps
) {
4723 info
->test
->errln("incorrect steps in callback. Expected %d, got %d\n", info
->lastSteps
+1, steps
);
4725 info
->lastSteps
= steps
;
4727 return (info
->numCalls
< info
->maxCalls
);
// Tests the RegexMatcher match callback: first that getMatchCallback() hands
// back NULL for both the function and the context when nothing has been set,
// then that a callback set with setMatchCallback() is returned by the getter,
// and finally how cbInfo.numCalls and status come out for three inputs of
// increasing matching cost.
4731 void RegexTest::Callbacks() {
4733 // Getter returns NULLs if no callback has been set
4735 // The variables that the getter will fill in.
4736 // Init to non-null values so that the action of the getter can be seen.
4737 const void *returnedContext
= &returnedContext
;
4738 URegexMatchCallback
*returnedFn
= &testCallBackFn
;
4740 UErrorCode status
= U_ZERO_ERROR
;
4741 RegexMatcher
matcher("x", 0, status
);
4743 matcher
.getMatchCallback(returnedFn
, returnedContext
, status
);
4745 REGEX_ASSERT(returnedFn
== NULL
);
4746 REGEX_ASSERT(returnedContext
== NULL
);
4751 callBackContext cbInfo
= {this, 0, 0, 0};
4752 const void *returnedContext
;
4753 URegexMatchCallback
*returnedFn
;
4754 UErrorCode status
= U_ZERO_ERROR
;
4755 RegexMatcher
matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status
); // A pattern that can run long.
4757 matcher
.setMatchCallback(testCallBackFn
, &cbInfo
, status
);
// Read the callback back and check it is the function/context pair just set.
4759 matcher
.getMatchCallback(returnedFn
, returnedContext
, status
);
4761 REGEX_ASSERT(returnedFn
== testCallBackFn
);
4762 REGEX_ASSERT(returnedContext
== &cbInfo
);
4764 // A short-running match shouldn't invoke the callback
4765 status
= U_ZERO_ERROR
;
4767 UnicodeString s
= "xxx";
4769 REGEX_ASSERT(matcher
.matches(status
));
4771 REGEX_ASSERT(cbInfo
.numCalls
== 0);
4773 // A medium-length match that runs long enough to invoke the
4774 // callback, but not so long that the callback aborts it.
4775 status
= U_ZERO_ERROR
;
4777 s
= "aaaaaaaaaaaaaaaaaaab";
4779 REGEX_ASSERT(matcher
.matches(status
)==FALSE
);
4781 REGEX_ASSERT(cbInfo
.numCalls
> 0);
4783 // A longer running match that the callback function will abort.
4784 status
= U_ZERO_ERROR
;
4786 s
= "aaaaaaaaaaaaaaaaaaaaaaab";
4788 REGEX_ASSERT(matcher
.matches(status
)==FALSE
);
4789 REGEX_ASSERT(status
== U_REGEX_STOPPED_BY_CALLER
);
4790 REGEX_ASSERT(cbInfo
.numCalls
== 4);
4798 // FindProgressCallbacks() Test the find "progress" callback function.
4799 // When set, the find progress callback will be invoked during find operations
4800 // after each return from a match attempt, giving the application the opportunity
4801 // to terminate a long-running find operation before its normal completion.
// State shared between FindProgressCallbacks() and testProgressCallBackFn();
// a pointer to it is passed as the callback context.
4804 struct progressCallBackContext
{
// reset(): store the new call limit in maxCalls and zero numCalls and lastIndex.
4809 void reset(int32_t max
) {maxCalls
=max
; numCalls
=0;lastIndex
=0;};
// Find-progress callback used by FindProgressCallbacks().
//   context     pointer to a progressCallBackContext
//   matchIndex  index supplied by the caller; saved in lastIndex so the test
//               can later resume find() from it
// Returns whether numCalls is still below maxCalls.
4813 static UBool U_CALLCONV
4814 testProgressCallBackFn(const void *context
, int64_t matchIndex
) {
4815 progressCallBackContext
*info
= (progressCallBackContext
*)context
;
4817 info
->lastIndex
= matchIndex
;
4818 // info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4819 return (info
->numCalls
< info
->maxCalls
);
// Tests the RegexMatcher find-progress callback: the getter returns NULLs
// before any callback is set, returns the installed function/context after
// setFindProgressCallback(), and find() is then run on inputs of increasing
// cost while cbInfo.numCalls is checked. The last case retries find() from
// cbInfo.lastIndex after an interrupted attempt.
4823 void RegexTest::FindProgressCallbacks() {
4825 // Getter returns NULLs if no callback has been set
4827 // The variables that the getter will fill in.
4828 // Init to non-null values so that the action of the getter can be seen.
4829 const void *returnedContext
= &returnedContext
;
4830 URegexFindProgressCallback
*returnedFn
= &testProgressCallBackFn
;
4832 UErrorCode status
= U_ZERO_ERROR
;
4833 RegexMatcher
matcher("x", 0, status
);
4835 matcher
.getFindProgressCallback(returnedFn
, returnedContext
, status
);
4837 REGEX_ASSERT(returnedFn
== NULL
);
4838 REGEX_ASSERT(returnedContext
== NULL
);
4843 progressCallBackContext cbInfo
= {this, 0, 0, 0};
4844 const void *returnedContext
;
4845 URegexFindProgressCallback
*returnedFn
;
4846 UErrorCode status
= U_ZERO_ERROR
;
4847 RegexMatcher
matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status
); // A pattern that can run long.
4849 matcher
.setFindProgressCallback(testProgressCallBackFn
, &cbInfo
, status
);
// Read the callback back and check it is the function/context pair just set.
4851 matcher
.getFindProgressCallback(returnedFn
, returnedContext
, status
);
4853 REGEX_ASSERT(returnedFn
== testProgressCallBackFn
);
4854 REGEX_ASSERT(returnedContext
== &cbInfo
);
4856 // A short-running match should NOT invoke the callback.
4857 status
= U_ZERO_ERROR
;
4859 UnicodeString s
= "abxxx";
4862 matcher
.setTrace(TRUE
);
4864 REGEX_ASSERT(matcher
.find(0, status
));
4866 REGEX_ASSERT(cbInfo
.numCalls
== 0);
4868 // A medium running match that causes matcher.find() to invoke our callback for each index.
4869 status
= U_ZERO_ERROR
;
4870 s
= "aaaaaaaaaaaaaaaaaaab";
4871 cbInfo
.reset(s
.length()); // Some upper limit for number of calls that is greater than size of our input string
4873 REGEX_ASSERT(matcher
.find(0, status
)==FALSE
);
4875 REGEX_ASSERT(cbInfo
.numCalls
> 0 && cbInfo
.numCalls
< 25);
4877 // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4878 status
= U_ZERO_ERROR
;
4879 UnicodeString s1
= "aaaaaaaaaaaaaaaaaaaaaaab";
4880 cbInfo
.reset(s1
.length() - 5); // Bail early somewhere near the end of input string
4882 REGEX_ASSERT(matcher
.find(0, status
)==FALSE
);
4884 REGEX_ASSERT(cbInfo
.numCalls
== s1
.length() - 5);
4887 // Now a match that will succeed, but after an interruption
4888 status
= U_ZERO_ERROR
;
4889 UnicodeString s2
= "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4890 cbInfo
.reset(s2
.length() - 10); // Bail early somewhere near the end of input string
4892 REGEX_ASSERT(matcher
.find(0, status
)==FALSE
);
4894 // Now retry the match from where left off
4895 cbInfo
.maxCalls
= 100; // No callback limit
4896 REGEX_ASSERT(matcher
.find(cbInfo
.lastIndex
, status
));
4905 //---------------------------------------------------------------------------
4907 // PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
4908 // UTexts. The pure-C implementation of UText
4909 // has no mutable backing stores, but we can
4910 // use UnicodeString here to test the functionality.
4912 //---------------------------------------------------------------------------
// Exercises the uregex_* C functions that accept a caller-supplied destination
// UText: uregex_getUText, uregex_groupUTextDeep, uregex_replaceFirstUText and
// uregex_replaceAllUText. The destination is always bufferText, a UText opened
// over the UnicodeString `buffer`, and every call is asserted to return
// &bufferText.
4913 void RegexTest::PreAllocatedUTextCAPI () {
4914 UErrorCode status
= U_ZERO_ERROR
;
4915 URegularExpression
*re
;
4916 UText patternText
= UTEXT_INITIALIZER
;
4917 UnicodeString buffer
;
4918 UText bufferText
= UTEXT_INITIALIZER
;
4920 utext_openUnicodeString(&bufferText
, &buffer
, &status
);
4923 * getText() and getUText()
4926 UText text1
= UTEXT_INITIALIZER
;
4927 UText text2
= UTEXT_INITIALIZER
;
4928 UChar text2Chars
[20];
4931 status
= U_ZERO_ERROR
;
4932 regextst_openUTF8FromInvariant(&text1
, "abcccd", -1, &status
);
4933 regextst_openUTF8FromInvariant(&text2
, "abcccxd", -1, &status
);
// NOTE(review): sizeof(text2) is the size of the UText declared above, not of
// the 20-element text2Chars array being written -- confirm the intended bound.
4934 u_uastrncpy(text2Chars
, "abcccxd", sizeof(text2
)/2);
4935 utext_openUChars(&text2
, text2Chars
, -1, &status
);
4937 regextst_openUTF8FromInvariant(&patternText
, "abc*d", -1, &status
);
4938 re
= uregex_openUText(&patternText
, 0, NULL
, &status
);
4940 /* First set a UText */
4941 uregex_setUText(re
, &text1
, &status
);
4942 resultText
= uregex_getUText(re
, &bufferText
, &status
);
4944 REGEX_ASSERT(resultText
== &bufferText
);
4945 utext_setNativeIndex(resultText
, 0);
4946 utext_setNativeIndex(&text1
, 0);
4947 REGEX_ASSERT(testUTextEqual(resultText
, &text1
));
// Same query a second time, with the same expectations.
4949 resultText
= uregex_getUText(re
, &bufferText
, &status
);
4951 REGEX_ASSERT(resultText
== &bufferText
);
4952 utext_setNativeIndex(resultText
, 0);
4953 utext_setNativeIndex(&text1
, 0);
4954 REGEX_ASSERT(testUTextEqual(resultText
, &text1
));
4956 /* Then set a UChar * */
4957 uregex_setText(re
, text2Chars
, 7, &status
);
4958 resultText
= uregex_getUText(re
, &bufferText
, &status
);
4960 REGEX_ASSERT(resultText
== &bufferText
);
4961 utext_setNativeIndex(resultText
, 0);
4962 utext_setNativeIndex(&text2
, 0);
4963 REGEX_ASSERT(testUTextEqual(resultText
, &text2
));
4966 utext_close(&text1
);
4967 utext_close(&text2
);
4977 u_uastrncpy(text1
, "noise abc interior def, and this is off the end", sizeof(text1
)/2);
4979 status
= U_ZERO_ERROR
;
4980 re
= uregex_openC("abc(.*?)def", 0, NULL
, &status
);
4983 uregex_setText(re
, text1
, -1, &status
);
4984 result
= uregex_find(re
, 0, &status
);
4985 REGEX_ASSERT(result
==TRUE
);
4987 /* Capture Group 0, the full match. Should succeed. */
4988 status
= U_ZERO_ERROR
;
4989 actual
= uregex_groupUTextDeep(re
, 0, &bufferText
, &status
);
4991 REGEX_ASSERT(actual
== &bufferText
);
4992 REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual
);
4994 /* Capture group #1. Should succeed. */
4995 status
= U_ZERO_ERROR
;
4996 actual
= uregex_groupUTextDeep(re
, 1, &bufferText
, &status
);
4998 REGEX_ASSERT(actual
== &bufferText
);
4999 REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual
);
5001 /* Capture group out of range. Error. */
5002 status
= U_ZERO_ERROR
;
5003 actual
= uregex_groupUTextDeep(re
, 2, &bufferText
, &status
);
5004 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
// The destination pointer is still expected back even though the call failed.
5005 REGEX_ASSERT(actual
== &bufferText
);
5017 UText replText
= UTEXT_INITIALIZER
;
5020 status
= U_ZERO_ERROR
;
5021 u_uastrncpy(text1
, "Replace xaax x1x x...x.", sizeof(text1
)/2);
5022 u_uastrncpy(text2
, "No match here.", sizeof(text2
)/2);
5023 regextst_openUTF8FromInvariant(&replText
, "<$1>", -1, &status
);
5025 re
= uregex_openC("x(.*?)x", 0, NULL
, &status
);
5028 /* Normal case, with match */
5029 uregex_setText(re
, text1
, -1, &status
);
// Replace the whole current content of bufferText with nothing before reusing it as the destination.
5030 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5031 result
= uregex_replaceFirstUText(re
, &replText
, &bufferText
, &status
);
5033 REGEX_ASSERT(result
== &bufferText
);
5034 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result
);
5036 /* No match. Text should copy to output with no changes. */
5037 uregex_setText(re
, text2
, -1, &status
);
5038 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5039 result
= uregex_replaceFirstUText(re
, &replText
, &bufferText
, &status
);
5041 REGEX_ASSERT(result
== &bufferText
);
5042 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result
);
5044 /* Unicode escapes */
5045 uregex_setText(re
, text1
, -1, &status
);
5046 regextst_openUTF8FromInvariant(&replText
, "\\\\\\u0041$1\\U00000042$\\a", -1, &status
);
5047 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5048 result
= uregex_replaceFirstUText(re
, &replText
, &bufferText
, &status
);
5050 REGEX_ASSERT(result
== &bufferText
);
5051 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result
);
5054 utext_close(&replText
);
5064 UText replText
= UTEXT_INITIALIZER
;
5067 status
= U_ZERO_ERROR
;
5068 u_uastrncpy(text1
, "Replace xaax x1x x...x.", sizeof(text1
)/2);
5069 u_uastrncpy(text2
, "No match here.", sizeof(text2
)/2);
5070 regextst_openUTF8FromInvariant(&replText
, "<$1>", -1, &status
);
5072 re
= uregex_openC("x(.*?)x", 0, NULL
, &status
);
5075 /* Normal case, with match */
5076 uregex_setText(re
, text1
, -1, &status
);
5077 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5078 result
= uregex_replaceAllUText(re
, &replText
, &bufferText
, &status
);
5080 REGEX_ASSERT(result
== &bufferText
);
5081 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result
);
5083 /* No match. Text should copy to output with no changes. */
5084 uregex_setText(re
, text2
, -1, &status
);
5085 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5086 result
= uregex_replaceAllUText(re
, &replText
, &bufferText
, &status
);
5088 REGEX_ASSERT(result
== &bufferText
);
5089 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result
);
5092 utext_close(&replText
);
5097 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5098 * so we don't need to test it here.
5101 utext_close(&bufferText
);
5102 utext_close(&patternText
);
5105 //--------------------------------------------------------------
5107 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
5109 //---------------------------------------------------------------
// Compiles pattern1 and then pattern2, builds a matcher over s for each, and
// asserts that find() succeeds with the match starting at index 0.
5110 void RegexTest::Bug7651() {
5111 UnicodeString
pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5112 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5113 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5114 UnicodeString
pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5115 UnicodeString
s("#ff @abcd This is test");
5116 RegexPattern
*REPattern
= NULL
;
5117 RegexMatcher
*REMatcher
= NULL
;
5118 UErrorCode status
= U_ZERO_ERROR
;
// First pass: pattern1.
5121 REPattern
= RegexPattern::compile(pattern1
, 0, pe
, status
);
5123 REMatcher
= REPattern
->matcher(s
, status
);
5125 REGEX_ASSERT(REMatcher
->find());
5126 REGEX_ASSERT(REMatcher
->start(status
) == 0);
5129 status
= U_ZERO_ERROR
;
// Second pass: pattern2, with the same expectations.
5131 REPattern
= RegexPattern::compile(pattern2
, 0, pe
, status
);
5133 REMatcher
= REPattern
->matcher(s
, status
);
5135 REGEX_ASSERT(REMatcher
->find());
5136 REGEX_ASSERT(REMatcher
->start(status
) == 0);
5139 status
= U_ZERO_ERROR
;
// Calls group(1, status) with status already set to U_ILLEGAL_ARGUMENT_ERROR
// and asserts that the status is unchanged afterwards and that the returned
// string is empty.
5142 void RegexTest::Bug7740() {
5143 UErrorCode status
= U_ZERO_ERROR
;
5144 UnicodeString pattern
= "(a)";
5145 UnicodeString text
= "abcdef";
5146 RegexMatcher
*m
= new RegexMatcher(pattern
, text
, 0, status
);
5148 REGEX_ASSERT(m
->lookingAt(status
));
// Deliberately enter group() with a failure status already in place.
5150 status
= U_ILLEGAL_ARGUMENT_ERROR
;
5151 UnicodeString s
= m
->group(1, status
); // Bug 7740: segfault here.
5152 REGEX_ASSERT(status
== U_ILLEGAL_ARGUMENT_ERROR
);
5153 REGEX_ASSERT(s
== "");
5157 // Bug 8479: was crashing with a bogus UnicodeString as input.
// Resets a matcher for "\\Aboo\\z" to the input `str` and asserts that
// matches() then reports U_ILLEGAL_ARGUMENT_ERROR.
// NOTE(review): the declaration of str is not among the lines shown here;
// presumably it is a bogus UnicodeString, going by the bug title -- confirm.
5159 void RegexTest::Bug8479() {
5160 UErrorCode status
= U_ZERO_ERROR
;
5162 RegexMatcher
* const pMatcher
= new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL
|UREGEX_CASE_INSENSITIVE
, status
);
5164 if (U_SUCCESS(status
))
5168 pMatcher
->reset(str
);
5169 status
= U_ZERO_ERROR
;
5170 pMatcher
->matches(status
);
5171 REGEX_ASSERT(status
== U_ILLEGAL_ARGUMENT_ERROR
);
// Splits "abc.def" with a matcher for the pattern "." into an array of
// 10 UnicodeStrings and asserts that split() reports 8 fields.
5178 void RegexTest::Bug7029() {
5179 UErrorCode status
= U_ZERO_ERROR
;
5181 RegexMatcher
* const pMatcher
= new RegexMatcher(".", 0, status
);
5182 UnicodeString text
= "abc.def";
5183 UnicodeString splits
[10];
5185 int32_t numFields
= pMatcher
->split(text
, splits
, 10, status
);
5187 REGEX_ASSERT(numFields
== 8);
5192 // This test is checking for the existence of any supplemental characters that case-fold
5193 // to a bmp character.
5195 // At the time of this writing there are none. If any should appear in a subsequent release
5196 // of Unicode, the code in regular expressions compilation that determines the longest
5197 // possible match for a literal string will need to be enhanced.
5199 // See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5200 // for details on what to do in case of a failure of this test.
// Builds the set [[:CWCF:]&[\U00010000-\U0010FFFF]] and, for each character c
// read from it by index, asserts that the case folding of c has a length of
// at least 2.
5202 void RegexTest::Bug9283() {
5203 UErrorCode status
= U_ZERO_ERROR
;
5204 UnicodeSet
supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status
);
// The for statement itself has no end condition.
5208 for (index
=0; ; index
++) {
5209 c
= supplementalsWithCaseFolding
.charAt(index
);
5213 UnicodeString cf
= UnicodeString(c
).foldCase();
5214 REGEX_ASSERT(cf
.length() >= 2);
// Reports an error when inv_next has reached or passed INV_BUFSIZ, naming the
// size that would be needed; the logln call records INV_BUFSIZ and the
// current usage.
5219 void RegexTest::CheckInvBufSize() {
5220 if(inv_next
>=INV_BUFSIZ
) {
5221 errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5222 __FILE__
, INV_BUFSIZ
, inv_next
);
5224 logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__
, INV_BUFSIZ
, inv_next
);
5228 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */