1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
5 * Copyright (c) 2002-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
12 // ICU Regular Expressions test, part of intltest.
18 PLEASE be careful about ASCII assumptions in this test.
19 This test is one of the worst repeat offenders.
20 If you have questions, contact someone on the ICU PMC
21 who has access to an EBCDIC system.
26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
32 #include "unicode/localpointer.h"
33 #include "unicode/regex.h"
34 #include "unicode/uchar.h"
35 #include "unicode/ucnv.h"
36 #include "unicode/uniset.h"
37 #include "unicode/uregex.h"
38 #include "unicode/usetiter.h"
39 #include "unicode/ustring.h"
40 #include "unicode/utext.h"
41 #include "unicode/utf16.h"
51 #define SUPPORT_MUTATING_INPUT_STRING 0
53 //---------------------------------------------------------------------------
55 // Test class boilerplate
57 //---------------------------------------------------------------------------
58 RegexTest::RegexTest()
63 RegexTest::~RegexTest()
// Test-suite dispatcher for RegexTest.
// When `exec` is set it logs the suite banner; each test case is then listed
// through TESTCASE_AUTO (macro defined outside this file section).
// The last parameter is deliberately unnamed (commented out as /*par*/).
// NOTE(review): this listing is missing lines of the original function
// (original line numbers 70, 72, 73, 79 and everything after 107 are absent),
// so the opening brace, any begin/end bookkeeping macros and the #endif that
// pairs with the #if below are not visible here -- confirm against the full file.
69 void RegexTest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* /*par*/ )
71 if (exec
) logln("TestSuite RegexTest: ");
74 TESTCASE_AUTO(API_Match
);
75 TESTCASE_AUTO(API_Replace
);
76 TESTCASE_AUTO(API_Pattern
);
// Guarded by the file-I/O configuration switch; the extent of the guarded
// region is not visible in this listing.
77 #if !UCONFIG_NO_FILE_IO
78 TESTCASE_AUTO(Extended
);
80 TESTCASE_AUTO(Errors
);
81 TESTCASE_AUTO(PerlTests
);
82 TESTCASE_AUTO(Callbacks
);
83 TESTCASE_AUTO(FindProgressCallbacks
);
84 TESTCASE_AUTO(Bug6149
);
85 TESTCASE_AUTO(UTextBasic
);
86 TESTCASE_AUTO(API_Match_UTF8
);
87 TESTCASE_AUTO(API_Replace_UTF8
);
88 TESTCASE_AUTO(API_Pattern_UTF8
);
89 TESTCASE_AUTO(PerlTestsUTF8
);
90 TESTCASE_AUTO(PreAllocatedUTextCAPI
);
91 TESTCASE_AUTO(Bug7651
);
92 TESTCASE_AUTO(Bug7740
);
93 TESTCASE_AUTO(Bug8479
);
94 TESTCASE_AUTO(Bug7029
);
95 TESTCASE_AUTO(CheckInvBufSize
);
96 TESTCASE_AUTO(Bug9283
);
97 TESTCASE_AUTO(Bug10459
);
98 TESTCASE_AUTO(TestCaseInsensitiveStarters
);
99 TESTCASE_AUTO(TestBug11049
);
100 TESTCASE_AUTO(TestBug11371
);
101 TESTCASE_AUTO(TestBug11480
);
102 TESTCASE_AUTO(NamedCapture
);
103 TESTCASE_AUTO(NamedCaptureLimits
);
104 TESTCASE_AUTO(TestBug12884
);
105 TESTCASE_AUTO(TestBug13631
);
106 TESTCASE_AUTO(TestBug13632
);
107 TESTCASE_AUTO(TestBug20359
);
113 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
115 * @see utext_openUTF8
117 static UText
* regextst_openUTF8FromInvariant(UText
* ut
, const char *inv
, int64_t length
, UErrorCode
*status
);
119 //---------------------------------------------------------------------------
121 // Error Checking / Reporting macros used in all of the tests.
123 //---------------------------------------------------------------------------
// Writes a printable rendering of `text` into `buf` (capacity `bufLen`).
// The UText's native index is saved on entry and restored on exit, so the
// caller's iteration position is not disturbed.
// Code points are read from index 0 until U_SENTINEL is returned or the write
// pointer reaches buf+bufLen.
// NOTE(review): several original lines are absent from this listing (128,
// 132-134, 137-141, 143-144, 149-150), including the declaration of `bufPtr`,
// the bodies of both branches and the terminator write -- confirm against the
// full file before changing anything here.
125 static void utextToPrintable(char *buf
, int32_t bufLen
, UText
*text
) {
126 int64_t oldIndex
= utext_getNativeIndex(text
);
127 utext_setNativeIndex(text
, 0);
129 UChar32 c
= utext_next32From(text
, 0);
130 while ((c
!= U_SENTINEL
) && (bufPtr
< buf
+bufLen
)) {
// Code points in [0x20, 0x7e) are tested for here; what is stored for them is
// on a line not visible in this listing.
131 if (0x000020<=c
&& c
<0x00007e) {
// NOTE(review): this sprintf is not limited by bufLen. If this path is compiled
// in (the surrounding preprocessor lines are not visible), it can write past
// the end of `buf` -- confirm.
135 sprintf(bufPtr
,"U+%04X", c
);
136 bufPtr
+= strlen(bufPtr
)-1;
142 c
= UTEXT_NEXT32(text
);
// EBCDIC builds only: the buffer is passed through uprv_eastrncpy into a
// temporary and copied back into `buf`.
// NOTE(review): no release of `ebuf` is visible in this listing -- confirm.
145 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
146 char *ebuf
= (char*)malloc(bufLen
);
147 uprv_eastrncpy((unsigned char*)ebuf
, (const unsigned char*)buf
, bufLen
);
148 uprv_strncpy(buf
, ebuf
, bufLen
);
// Restore the position the caller had before this function moved the index.
151 utext_setNativeIndex(text
, oldIndex
);
155 static char ASSERT_BUF
[1024];
// Renders `message` into the file-static ASSERT_BUF for use in failure
// messages.
//   - empty input                  -> the literal "[[empty UnicodeString]]"
//   - prettify() yields nothing    -> "[[escape() returned 0 chars]]"
//   - otherwise the prettified text is extracted into ASSERT_BUF; if that
//     leaves the buffer empty, each code unit is appended as a \uXX escape.
// The last byte of ASSERT_BUF is always forced to NUL before leaving.
// Fix: the escape loop used an unbounded sprintf, so a long string could
// write past the end of the 1024-byte ASSERT_BUF; it is now bounded by the
// space that remains in the buffer.
// NOTE(review): the declarations of `buf` and `ch`, the else branches and the
// return statement are on lines absent from this listing.
157 const char* RegexTest::extractToAssertBuf(const UnicodeString
& message
) {
158 if(message
.length()==0) {
159 strcpy(ASSERT_BUF
, "[[empty UnicodeString]]");
162 IntlTest::prettify(message
,buf
);
163 if(buf
.length()==0) {
164 strcpy(ASSERT_BUF
, "[[escape() returned 0 chars]]");
166 buf
.extract(0, 0x7FFFFFFF, ASSERT_BUF
, sizeof(ASSERT_BUF
)-1);
167 if(ASSERT_BUF
[0]==0) {
169 for(int32_t i
=0;i
<buf
.length();i
++) {
// Bounded append: once the buffer is full snprintf only rewrites the NUL.
171 snprintf(ASSERT_BUF
+strlen(ASSERT_BUF
), sizeof(ASSERT_BUF)-strlen(ASSERT_BUF),"\\u%02x",ch
);
// Belt and braces: guarantee termination whatever path was taken above.
176 ASSERT_BUF
[sizeof(ASSERT_BUF
)-1] = 0;
// Logs a printable rendering (at most 200 bytes, via utextToPrintable) of the
// UText `text`, tagged with file, line and the argument's spelling.
180 #define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,UPRV_LENGTHOF(buf),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}
// If the in-scope `status` is a failure: report through dataerrln and RETURN
// from the enclosing function (so it is only usable in void functions).
182 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure. status=%s", \
183 __FILE__, __LINE__, u_errorName(status)); return;}}
// Reports an error when `expr` compares equal to FALSE; does not return.
185 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}
// Evaluates `expr` with a fresh local `status` (which shadows any outer
// `status` inside the macro's braces) and reports if it does not end up equal
// to `errcode`.
187 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
188 if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \
189 __LINE__, u_errorName(errcode), u_errorName(status));};}
// Like REGEX_CHECK_STATUS but also prints the caller-supplied `line`, prints
// the numeric status value, and does NOT return.
191 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
192 "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }}
// Like REGEX_ASSERT but also prints the caller-supplied `line` and RETURNS
// from the enclosing function on failure.
194 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
195 errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
// Compares an invariant-character C string against a UnicodeString and
// reports a mismatch; the actual value is rendered via extractToAssertBuf.
197 // expected: const char * , restricted to invariant characters.
198 // actual: const UnicodeString &
199 #define REGEX_ASSERT_UNISTR(expected, actual) { \
200 if (UnicodeString(expected, -1, US_INV) != (actual)) { \
201 errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \
202 __FILE__, __LINE__, expected, extractToAssertBuf(actual));};}
// Walks two UTexts in parallel, one code point at a time, starting from
// native index 0 in both. Both inputs have their native index moved by this
// function. The walk continues until `uta` yields U_SENTINEL.
// NOTE(review): the declarations of `ca`/`cb`, the comparison inside the loop
// and the return statements are on lines absent from this listing; presumably
// it returns whether every code point pair matched -- confirm.
205 static UBool
testUTextEqual(UText
*uta
, UText
*utb
) {
208 utext_setNativeIndex(uta
, 0);
209 utext_setNativeIndex(utb
, 0);
211 ca
= utext_next32(uta
);
212 cb
= utext_next32(utb
);
216 } while (ca
!= U_SENTINEL
);
222 * @param expected expected text in UTF-8 (not platform) codepage
// Asserts that the UText `actual` holds exactly the UTF-8 text `expected`.
//   expected  NUL-terminated UTF-8 bytes (not platform codepage)
//   actual    UText under test; its native index is reset to 0 here
//   file/line call site, used only to label failure messages
// A temporary UText is opened over `expected`; on a mismatch both sides are
// rendered with utextToPrintable (200 bytes + NUL each) and reported via errln.
// Fix: strlen() returns size_t but was passed to a "%d" conversion, which is
// undefined on platforms where size_t is wider than int. The values are now
// cast to int, matching the (int)utext_nativeLength(...) casts further down.
// NOTE(review): the statements that follow each early errln (original lines
// 230-231 and 234-235) are absent from this listing; if they return, check
// that `expectedText` is closed on the second of those paths.
224 void RegexTest::assertUText(const char *expected
, UText
*actual
, const char *file
, int line
) {
225 UErrorCode status
= U_ZERO_ERROR
;
226 UText expectedText
= UTEXT_INITIALIZER
;
227 utext_openUTF8(&expectedText
, expected
, -1, &status
);
228 if(U_FAILURE(status
)) {
229 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file
, line
, u_errorName(status
), (int)strlen(expected
));
// A non-empty input that opens as a zero-length UText is itself a failure.
232 if(utext_nativeLength(&expectedText
)==0 && (strlen(expected
)!=0)) {
233 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file
, line
, (int)strlen(expected
));
236 utext_setNativeIndex(actual
, 0);
237 if (!testUTextEqual(&expectedText
, actual
)) {
238 char buf
[201 /*21*/];
239 char expectedBuf
[201];
240 utextToPrintable(buf
, UPRV_LENGTHOF(buf
), actual
);
241 utextToPrintable(expectedBuf
, UPRV_LENGTHOF(expectedBuf
), &expectedText
);
242 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file
, line
, expectedBuf
, (int)utext_nativeLength(&expectedText
), buf
, (int)utext_nativeLength(actual
));
244 utext_close(&expectedText
);
247 * @param expected invariant (platform local text) input
// Asserts that the UText `actual` holds the invariant-character text
// `expected`.
//   expected  NUL-terminated string in the platform's invariant characters;
//             it is opened through regextst_openUTF8FromInvariant
//   actual    UText under test; its native index is reset to 0 here
//   file/line call site, used only to label failure messages
// On a mismatch both sides are rendered with utextToPrintable and reported.
// Fix: strlen() returns size_t but was passed to a "%d" conversion; it is now
// cast to int, matching the (int)utext_nativeLength(...) casts further down.
// NOTE(review): the statements that follow the early errln (original lines
// 256-257) are absent from this listing.
250 void RegexTest::assertUTextInvariant(const char *expected
, UText
*actual
, const char *file
, int line
) {
251 UErrorCode status
= U_ZERO_ERROR
;
252 UText expectedText
= UTEXT_INITIALIZER
;
253 regextst_openUTF8FromInvariant(&expectedText
, expected
, -1, &status
);
254 if(U_FAILURE(status
)) {
255 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file
, line
, u_errorName(status
), (int)strlen(expected
));
258 utext_setNativeIndex(actual
, 0);
259 if (!testUTextEqual(&expectedText
, actual
)) {
260 char buf
[201 /*21*/];
261 char expectedBuf
[201];
262 utextToPrintable(buf
, UPRV_LENGTHOF(buf
), actual
);
263 utextToPrintable(expectedBuf
, UPRV_LENGTHOF(expectedBuf
), &expectedText
);
264 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file
, line
, expectedBuf
, (int)utext_nativeLength(&expectedText
), buf
, (int)utext_nativeLength(actual
));
266 utext_close(&expectedText
);
270 * Assumes utf-8 input
272 #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
274 * Assumes Invariant input
276 #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
279 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
280 * passed into utext_openUTF8. An error will be given if
281 * INV_BUFSIZ is too small. It's only used on EBCDIC systems.
284 #define INV_BUFSIZ 2048 /* increase this if too small */
286 static int64_t inv_next
=0;
288 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
289 static char inv_buf
[INV_BUFSIZ
];
// Opens `ut` as a UTF-8 UText over the invariant-character string `inv`.
//   ut      UText to (re)open
//   inv     source text in the compilation codepage's invariant characters
//   length  byte length of `inv`, or -1 to use strlen(inv)
//   status  ICU error code, in/out
// On ASCII-family platforms `inv` is handed to utext_openUTF8 directly.
// Otherwise the text is converted with uprv_aestrncpy into the static
// inv_buf at offset inv_next; if it would not fit, a message is printed to
// stderr and *status is set to U_MEMORY_ALLOCATION_ERROR.
// Fix: both diagnostic fprintf calls passed int64_t values to "%d"
// conversions, which is undefined behaviour; the values are now cast to int
// (they are offsets into a buffer of INV_BUFSIZ bytes).
// NOTE(review): the #else/#endif lines, the bookkeeping that advances
// inv_next and the failure-path return are absent from this listing.
292 static UText
* regextst_openUTF8FromInvariant(UText
*ut
, const char *inv
, int64_t length
, UErrorCode
*status
) {
293 if(length
==-1) length
=strlen(inv
);
294 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
296 return utext_openUTF8(ut
, inv
, length
, status
);
// Non-ASCII platforms: make sure the converted copy (plus one spare byte)
// still fits in inv_buf.
298 if(inv_next
+length
+1>INV_BUFSIZ
) {
299 fprintf(stderr
, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
300 __FILE__
, __LINE__
, INV_BUFSIZ
, (int)(inv_next
+length
+1));
301 *status
= U_MEMORY_ALLOCATION_ERROR
;
305 unsigned char *buf
= (unsigned char*)inv_buf
+inv_next
;
306 uprv_aestrncpy(buf
, (const uint8_t*)inv
, length
);
310 fprintf(stderr
, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ
, (int)inv_next
);
313 return utext_openUTF8(ut
, (const char*)buf
, length
, status
);
318 //---------------------------------------------------------------------------
320 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests
321 // for the LookingAt() and Match() functions.
324 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
326 // The expected results are UBool - TRUE or FALSE.
327 // The input text is unescaped. The pattern is not.
330 //---------------------------------------------------------------------------
332 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
// Worker behind REGEX_TESTLM for the UnicodeString API.
//   pat      pattern, invariant characters; compiled as-is (not unescaped)
//   text     input, invariant characters; unescape() is applied before matching
//   looking  expected result of lookingAt()
//   match    expected result of matches()
//   line     caller's source line, used in every failure message
// Compiles the pattern with flags 0, creates a matcher over the unescaped
// input, then checks lookingAt() and matches() against the expectations.
// Failures are reported through errln/dataerrln; when `retVal` ends up FALSE
// the compiled pattern is dumped.
// NOTE(review): the declarations of `pe`, `retVal` and `actualmatch`, the
// early returns, the cleanup and the final return are on lines absent from
// this listing -- confirm against the full file.
334 UBool
RegexTest::doRegexLMTest(const char *pat
, const char *text
, UBool looking
, UBool match
, int32_t line
) {
335 const UnicodeString
pattern(pat
, -1, US_INV
);
336 const UnicodeString
inputText(text
, -1, US_INV
);
337 UErrorCode status
= U_ZERO_ERROR
;
339 RegexPattern
*REPattern
= NULL
;
340 RegexMatcher
*REMatcher
= NULL
;
343 UnicodeString
patString(pat
, -1, US_INV
);
344 REPattern
= RegexPattern::compile(patString
, 0, pe
, status
);
345 if (U_FAILURE(status
)) {
346 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
347 line
, u_errorName(status
));
// NOTE(review): debugging hook keyed to one hard-coded caller line number; it
// dumps the compiled pattern only for the call made from source line 376 and
// silently goes stale whenever the file is edited.
350 if (line
==376) { REPattern
->dumpPattern();}
352 UnicodeString
inputString(inputText
);
353 UnicodeString unEscapedInput
= inputString
.unescape();
354 REMatcher
= REPattern
->matcher(unEscapedInput
, status
);
355 if (U_FAILURE(status
)) {
356 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
357 line
, u_errorName(status
));
// Check 1: lookingAt().
362 actualmatch
= REMatcher
->lookingAt(status
);
363 if (U_FAILURE(status
)) {
364 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
365 line
, u_errorName(status
));
368 if (actualmatch
!= looking
) {
369 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line
);
// Check 2: matches(). The status is cleared first so a lookingAt() failure
// does not leak into this check.
373 status
= U_ZERO_ERROR
;
374 actualmatch
= REMatcher
->matches(status
);
375 if (U_FAILURE(status
)) {
376 errln("RegexTest failure in matches() at line %d. Status = %s\n",
377 line
, u_errorName(status
));
380 if (actualmatch
!= match
) {
381 errln("RegexTest: wrong return from matches() at line %d.\n", line
);
385 if (retVal
== FALSE
) {
386 REPattern
->dumpPattern();
// Worker behind REGEX_TESTLM for the UText (UTF-8) API.
//   pat      pattern, invariant characters; opened as a UTF-8 UText
//   text     input, invariant characters; unescaped, then converted to UTF-8
//   looking  expected result of lookingAt()
//   match    expected result of matches()
//   line     caller's source line, used in every failure message
// The input is converted with a "UTF8" converter whose from-Unicode callback
// is set to STOP, so input that cannot be represented (e.g. unpaired
// surrogates) makes the preflight fail; that case is logged and TRUE is
// returned because it is not a regex-engine failure.
// Fix: that early return used to leave without deleting the compiled pattern
// or closing the `pattern` UText; both are now released first. The converter
// is held by LocalUConverterPointer and releases itself.
// NOTE(review): the declarations of `pe`, `retVal` and `actualmatch`, the
// other early returns and the final cleanup/return are on lines absent from
// this listing -- confirm against the full file.
395 UBool
RegexTest::doRegexLMTestUTF8(const char *pat
, const char *text
, UBool looking
, UBool match
, int32_t line
) {
396 UText pattern
= UTEXT_INITIALIZER
;
397 int32_t inputUTF8Length
;
398 char *textChars
= NULL
;
399 UText inputText
= UTEXT_INITIALIZER
;
400 UErrorCode status
= U_ZERO_ERROR
;
402 RegexPattern
*REPattern
= NULL
;
403 RegexMatcher
*REMatcher
= NULL
;
406 regextst_openUTF8FromInvariant(&pattern
, pat
, -1, &status
);
407 REPattern
= RegexPattern::compile(&pattern
, 0, pe
, status
);
408 if (U_FAILURE(status
)) {
409 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
410 line
, u_errorName(status
));
414 UnicodeString
inputString(text
, -1, US_INV
);
415 UnicodeString unEscapedInput
= inputString
.unescape();
416 LocalUConverterPointer
UTF8Converter(ucnv_open("UTF8", &status
));
417 ucnv_setFromUCallBack(UTF8Converter
.getAlias(), UCNV_FROM_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &status
);
// Preflight: a NULL/0 destination yields the required byte count and is
// expected to report U_BUFFER_OVERFLOW_ERROR.
419 inputUTF8Length
= unEscapedInput
.extract(NULL
, 0, UTF8Converter
.getAlias(), status
);
420 if (U_FAILURE(status
) && status
!= U_BUFFER_OVERFLOW_ERROR
) {
421 // UTF-8 does not allow unpaired surrogates, so this could actually happen
422 logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line
, u_errorName(status
));
// Release what has been acquired so far before bailing out.
delete REPattern;
utext_close(&pattern);
423 return TRUE
; // not a failure of the Regex engine
425 status
= U_ZERO_ERROR
; // buffer overflow
426 textChars
= new char[inputUTF8Length
+1];
427 unEscapedInput
.extract(textChars
, inputUTF8Length
+1, UTF8Converter
.getAlias(), status
);
428 utext_openUTF8(&inputText
, textChars
, inputUTF8Length
, &status
);
// NOTE(review): matcher(status) is dereferenced before `status` is checked.
430 REMatcher
= &REPattern
->matcher(status
)->reset(&inputText
);
431 if (U_FAILURE(status
)) {
432 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
433 line
, u_errorName(status
));
// Check 1: lookingAt().
438 actualmatch
= REMatcher
->lookingAt(status
);
439 if (U_FAILURE(status
)) {
440 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
441 line
, u_errorName(status
));
444 if (actualmatch
!= looking
) {
445 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line
);
// Check 2: matches(), with the status cleared first.
449 status
= U_ZERO_ERROR
;
450 actualmatch
= REMatcher
->matches(status
);
451 if (U_FAILURE(status
)) {
452 errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
453 line
, u_errorName(status
));
456 if (actualmatch
!= match
) {
457 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line
);
461 if (retVal
== FALSE
) {
462 REPattern
->dumpPattern();
467 utext_close(&inputText
);
468 utext_close(&pattern
);
475 //---------------------------------------------------------------------------
477 // REGEX_ERR Macro + invocation function to simplify writing
478 // regex tests for incorrect patterns
481 // REGEX_ERR("pattern", expected error line, column, expected status);
483 //---------------------------------------------------------------------------
484 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
// Worker behind REGEX_ERR: checks that compiling `pat` produces the expected
// status and, when the status is not U_ZERO_ERROR, the expected UParseError
// position.
//   pat             pattern text
//   errLine/errCol  expected UParseError line / offset
//   expectedStatus  status the compile is expected to end with
//   line            caller's source line, used in failure messages
// The check is done twice: once through the UnicodeString compile API and
// once through the UText (UTF-8) compile API.
// NOTE(review): the declaration of `pe` and several other lines are absent
// from this listing; `pattern` is not referenced by any visible line.
486 void RegexTest::regex_err(const char *pat
, int32_t errLine
, int32_t errCol
,
487 UErrorCode expectedStatus
, int32_t line
) {
488 UnicodeString
pattern(pat
);
490 UErrorCode status
= U_ZERO_ERROR
;
492 RegexPattern
*callerPattern
= NULL
;
495 // Compile the caller's pattern
497 UnicodeString
patString(pat
);
498 callerPattern
= RegexPattern::compile(patString
, 0, pe
, status
);
499 if (status
!= expectedStatus
) {
500 dataerrln("Line %d: unexpected error %s compiling pattern.", line
, u_errorName(status
));
// The parse-error position is only compared when the status is not
// U_ZERO_ERROR.
502 if (status
!= U_ZERO_ERROR
) {
503 if (pe
.line
!= errLine
|| pe
.offset
!= errCol
) {
504 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
505 line
, errLine
, errCol
, pe
.line
, pe
.offset
);
510 delete callerPattern
;
513 // Compile again, using a UTF-8-based UText
// NOTE(review): no reset of `status` is visible between the first compile and
// this point. If it still holds the expected error, the open and compile below
// would see a failing status on entry and this second pass may not actually
// exercise the UText path -- confirm against the full file.
515 UText patternText
= UTEXT_INITIALIZER
;
516 regextst_openUTF8FromInvariant(&patternText
, pat
, -1, &status
);
517 callerPattern
= RegexPattern::compile(&patternText
, 0, pe
, status
);
518 if (status
!= expectedStatus
) {
519 dataerrln("Line %d: unexpected error %s compiling pattern.", line
, u_errorName(status
));
521 if (status
!= U_ZERO_ERROR
) {
522 if (pe
.line
!= errLine
|| pe
.offset
!= errCol
) {
523 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
524 line
, errLine
, errCol
, pe
.line
, pe
.offset
);
529 delete callerPattern
;
530 utext_close(&patternText
);
535 //---------------------------------------------------------------------------
537 // Basic Check for basic functionality of regex pattern matching.
538 // Avoid the use of REGEX_FIND test macro, which has
539 // substantial dependencies on basic Regex functionality.
541 //---------------------------------------------------------------------------
// Smoke test of basic pattern matching, written as a table of REGEX_TESTLM
// calls: REGEX_TESTLM(pattern, input, expected lookingAt(), expected matches()).
// Each call runs both the UnicodeString worker and the UTF-8 UText worker.
// The input argument is unescaped before matching; the pattern is not.
542 void RegexTest::Basic() {
546 // Debug - slide failing test cases early
550 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
// NOTE(review): scratch debugging snippet. Original lines 547-549, 551 and
// 561-566 are absent from this listing, so it may sit inside a disabled
// preprocessor region. If it is compiled in, `pattern` and `m` have no visible
// delete and the printf writes directly to stdout -- confirm.
552 UErrorCode status
= U_ZERO_ERROR
;
553 RegexPattern
*pattern
;
554 pattern
= RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE
, pe
, status
);
555 pattern
->dumpPattern();
556 RegexMatcher
*m
= pattern
->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status
);
557 UBool result
= m
->find();
558 printf("result = %d\n", result
);
559 // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
560 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
567 // Pattern with parentheses
569 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE
, FALSE
);
570 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE
, TRUE
);
571 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE
, FALSE
);
// '*' applied to a group and to a single character.
576 REGEX_TESTLM("st(abc)*ring", "string", TRUE
, TRUE
);
577 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE
, TRUE
);
578 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE
, TRUE
);
579 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE
, FALSE
);
580 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE
, FALSE
);
582 REGEX_TESTLM("a*", "", TRUE
, TRUE
);
583 REGEX_TESTLM("a*", "b", TRUE
, FALSE
);
// '.' and '.*'.
589 REGEX_TESTLM(".", "abc", TRUE
, FALSE
);
590 REGEX_TESTLM("...", "abc", TRUE
, TRUE
);
591 REGEX_TESTLM("....", "abc", FALSE
, FALSE
);
592 REGEX_TESTLM(".*", "abcxyz123", TRUE
, TRUE
);
593 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE
, FALSE
);
594 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE
, TRUE
);
595 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE
, TRUE
);
596 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE
, FALSE
);
599 // Patterns with * applied to chars at end of literal string
601 REGEX_TESTLM("abc*", "ab", TRUE
, TRUE
);
602 REGEX_TESTLM("abc*", "abccccc", TRUE
, TRUE
);
605 // Supplemental chars match as single chars, not a pair of surrogates.
607 REGEX_TESTLM(".", "\\U00011000", TRUE
, TRUE
);
608 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE
, TRUE
);
609 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE
, FALSE
);
613 // UnicodeSets in the pattern
615 REGEX_TESTLM("[1-6]", "1", TRUE
, TRUE
);
616 REGEX_TESTLM("[1-6]", "3", TRUE
, TRUE
);
617 REGEX_TESTLM("[1-6]", "7", FALSE
, FALSE
);
618 REGEX_TESTLM("a[1-6]", "a3", TRUE
, TRUE
);
619 REGEX_TESTLM("a[1-6]", "a3", TRUE
, TRUE
);
620 REGEX_TESTLM("a[1-6]b", "a3b", TRUE
, TRUE
);
622 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE
, TRUE
);
623 REGEX_TESTLM("a[0-9]*b", "abc", TRUE
, FALSE
);
624 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE
, TRUE
);
625 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE
, FALSE
); // note that * matches 0 occurrences.
626 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE
, TRUE
);
629 // OR operator in patterns
631 REGEX_TESTLM("(a|b)", "a", TRUE
, TRUE
);
632 REGEX_TESTLM("(a|b)", "b", TRUE
, TRUE
);
633 REGEX_TESTLM("(a|b)", "c", FALSE
, FALSE
);
634 REGEX_TESTLM("a|b", "b", TRUE
, TRUE
);
636 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE
, TRUE
);
637 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE
, FALSE
);
638 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE
, TRUE
);
639 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE
, TRUE
);
640 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE
, TRUE
);
641 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE
, FALSE
);
// '+' quantifier.
646 REGEX_TESTLM("ab+", "abbc", TRUE
, FALSE
);
647 REGEX_TESTLM("ab+c", "ac", FALSE
, FALSE
);
648 REGEX_TESTLM("b+", "", FALSE
, FALSE
);
649 REGEX_TESTLM("(abc|def)+", "defabc", TRUE
, TRUE
);
650 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE
, FALSE
);
651 REGEX_TESTLM(".+y", "zippity dooy", TRUE
, TRUE
);
// '?' quantifier.
656 REGEX_TESTLM("ab?", "ab", TRUE
, TRUE
);
657 REGEX_TESTLM("ab?", "a", TRUE
, TRUE
);
658 REGEX_TESTLM("ab?", "ac", TRUE
, FALSE
);
659 REGEX_TESTLM("ab?", "abb", TRUE
, FALSE
);
660 REGEX_TESTLM("a(b|c)?d", "abd", TRUE
, TRUE
);
661 REGEX_TESTLM("a(b|c)?d", "acd", TRUE
, TRUE
);
662 REGEX_TESTLM("a(b|c)?d", "ad", TRUE
, TRUE
);
663 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE
, FALSE
);
664 REGEX_TESTLM("a(b|c)?d", "ab", FALSE
, FALSE
);
667 // Escape sequences that become single literal chars, handled internally
668 // by ICU's Unescape.
671 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
672 REGEX_TESTLM("\\a", "\\u0007", TRUE
, TRUE
); // BEL
673 REGEX_TESTLM("\\cL", "\\u000c", TRUE
, TRUE
); // Control-L
674 REGEX_TESTLM("\\e", "\\u001b", TRUE
, TRUE
); // Escape
675 REGEX_TESTLM("\\f", "\\u000c", TRUE
, TRUE
); // Form Feed
676 REGEX_TESTLM("\\n", "\\u000a", TRUE
, TRUE
); // new line
677 REGEX_TESTLM("\\r", "\\u000d", TRUE
, TRUE
); // CR
678 REGEX_TESTLM("\\t", "\\u0009", TRUE
, TRUE
); // Tab
679 REGEX_TESTLM("\\u1234", "\\u1234", TRUE
, TRUE
);
680 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE
, TRUE
);
682 REGEX_TESTLM(".*\\Ax", "xyz", TRUE
, FALSE
); // \A matches only at the beginning of input
683 REGEX_TESTLM(".*\\Ax", " xyz", FALSE
, FALSE
); // \A matches only at the beginning of input
685 // Escape of special chars in patterns
686 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE
, TRUE
);
690 //---------------------------------------------------------------------------
692 // UTextBasic Check for quirks that are specific to the UText
695 //---------------------------------------------------------------------------
// Checks UText-specific behaviour of RegexMatcher: after reset(UText*) the
// matcher's inputText() must read back as "abc", and resetting the matcher
// with its own inputText() must leave that text unchanged.
// NOTE(review): status checks and possibly further cleanup are on lines
// absent from this listing (original 702-703, 706, 708, 710, 712, 714, 716+);
// only the close of `pattern` is visible here -- confirm `input` is closed too.
696 void RegexTest::UTextBasic() {
// Spelled as byte values so the text is ASCII/UTF-8 "abc" on any platform
// charset.
697 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
698 UErrorCode status
= U_ZERO_ERROR
;
699 UText pattern
= UTEXT_INITIALIZER
;
700 utext_openUTF8(&pattern
, str_abc
, -1, &status
);
701 RegexMatcher
matcher(&pattern
, 0, status
);
704 UText input
= UTEXT_INITIALIZER
;
705 utext_openUTF8(&input
, str_abc
, -1, &status
);
707 matcher
.reset(&input
);
709 REGEX_ASSERT_UTEXT_UTF8(str_abc
, matcher
.inputText());
// Self-reset: hand the matcher the UText it is already using.
711 matcher
.reset(matcher
.inputText());
713 REGEX_ASSERT_UTEXT_UTF8(str_abc
, matcher
.inputText());
715 utext_close(&pattern
);
720 //---------------------------------------------------------------------------
722 // API_Match Test that the API for class RegexMatcher
723 // is present and nominally working, but excluding functions
724 // implementing replace operations.
726 //---------------------------------------------------------------------------
727 void RegexTest::API_Match() {
729 UErrorCode status
=U_ZERO_ERROR
;
733 // Debug - slide failing test cases early
742 // Simple pattern compilation
745 UnicodeString
re("abc");
747 pat2
= RegexPattern::compile(re
, flags
, pe
, status
);
750 UnicodeString inStr1
= "abcdef this is a test";
751 UnicodeString instr2
= "not abc";
752 UnicodeString empty
= "";
756 // Matcher creation and reset.
758 RegexMatcher
*m1
= pat2
->matcher(inStr1
, status
);
760 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
761 REGEX_ASSERT(m1
->input() == inStr1
);
763 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
764 REGEX_ASSERT(m1
->input() == instr2
);
766 REGEX_ASSERT(m1
->input() == inStr1
);
767 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
769 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
770 REGEX_ASSERT(m1
->input() == empty
);
771 REGEX_ASSERT(&m1
->pattern() == pat2
);
774 // reset(pos, status)
777 m1
->reset(4, status
);
779 REGEX_ASSERT(m1
->input() == inStr1
);
780 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
782 m1
->reset(-1, status
);
783 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
784 status
= U_ZERO_ERROR
;
786 m1
->reset(0, status
);
788 status
= U_ZERO_ERROR
;
790 int32_t len
= m1
->input().length();
791 m1
->reset(len
-1, status
);
793 status
= U_ZERO_ERROR
;
795 m1
->reset(len
, status
);
797 status
= U_ZERO_ERROR
;
799 m1
->reset(len
+1, status
);
800 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
801 status
= U_ZERO_ERROR
;
804 // match(pos, status)
807 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
809 REGEX_ASSERT(m1
->matches(3, status
) == FALSE
);
811 REGEX_ASSERT(m1
->matches(5, status
) == FALSE
);
812 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
813 REGEX_ASSERT(m1
->matches(-1, status
) == FALSE
);
814 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
816 // Match() at end of string should fail, but should not
818 status
= U_ZERO_ERROR
;
819 len
= m1
->input().length();
820 REGEX_ASSERT(m1
->matches(len
, status
) == FALSE
);
823 // Match beyond end of string should fail with an error.
824 status
= U_ZERO_ERROR
;
825 REGEX_ASSERT(m1
->matches(len
+1, status
) == FALSE
);
826 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
828 // Successful match at end of string.
830 status
= U_ZERO_ERROR
;
831 RegexMatcher
m("A?", 0, status
); // will match zero length string.
834 len
= inStr1
.length();
835 REGEX_ASSERT(m
.matches(len
, status
) == TRUE
);
838 REGEX_ASSERT(m
.matches(0, status
) == TRUE
);
844 // lookingAt(pos, status)
846 status
= U_ZERO_ERROR
;
847 m1
->reset(instr2
); // "not abc"
848 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
849 REGEX_ASSERT(m1
->lookingAt(5, status
) == FALSE
);
850 REGEX_ASSERT(m1
->lookingAt(3, status
) == FALSE
);
851 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
852 REGEX_ASSERT(m1
->lookingAt(-1, status
) == FALSE
);
853 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
854 status
= U_ZERO_ERROR
;
855 len
= m1
->input().length();
856 REGEX_ASSERT(m1
->lookingAt(len
, status
) == FALSE
);
858 REGEX_ASSERT(m1
->lookingAt(len
+1, status
) == FALSE
);
859 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
868 // RegexMatcher::start();
869 // RegexMatcher::end();
870 // RegexMatcher::groupCount();
875 UErrorCode status
=U_ZERO_ERROR
;
877 UnicodeString
re("01(23(45)67)(.*)");
878 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
880 UnicodeString data
= "0123456789";
882 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
884 REGEX_ASSERT(matcher
->lookingAt(status
) == TRUE
);
885 static const int32_t matchStarts
[] = {0, 2, 4, 8};
886 static const int32_t matchEnds
[] = {10, 8, 6, 10};
888 for (i
=0; i
<4; i
++) {
889 int32_t actualStart
= matcher
->start(i
, status
);
891 if (actualStart
!= matchStarts
[i
]) {
892 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
893 __LINE__
, i
, matchStarts
[i
], actualStart
);
895 int32_t actualEnd
= matcher
->end(i
, status
);
897 if (actualEnd
!= matchEnds
[i
]) {
898 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
899 __LINE__
, i
, matchEnds
[i
], actualEnd
);
903 REGEX_ASSERT(matcher
->start(0, status
) == matcher
->start(status
));
904 REGEX_ASSERT(matcher
->end(0, status
) == matcher
->end(status
));
906 REGEX_ASSERT_FAIL(matcher
->start(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
907 REGEX_ASSERT_FAIL(matcher
->start( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
909 REGEX_ASSERT_FAIL(matcher
->start( 0, status
), U_REGEX_INVALID_STATE
);
911 matcher
->lookingAt(status
);
912 REGEX_ASSERT(matcher
->group(status
) == "0123456789");
913 REGEX_ASSERT(matcher
->group(0, status
) == "0123456789");
914 REGEX_ASSERT(matcher
->group(1, status
) == "234567" );
915 REGEX_ASSERT(matcher
->group(2, status
) == "45" );
916 REGEX_ASSERT(matcher
->group(3, status
) == "89" );
918 REGEX_ASSERT_FAIL(matcher
->group(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
919 REGEX_ASSERT_FAIL(matcher
->group( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
921 REGEX_ASSERT_FAIL(matcher
->group( 0, status
), U_REGEX_INVALID_STATE
);
934 UErrorCode status
=U_ZERO_ERROR
;
936 UnicodeString
re("abc");
937 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
939 UnicodeString data
= ".abc..abc...abc..";
940 // 012345678901234567
942 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
944 REGEX_ASSERT(matcher
->find());
945 REGEX_ASSERT(matcher
->start(status
) == 1);
946 REGEX_ASSERT(matcher
->find());
947 REGEX_ASSERT(matcher
->start(status
) == 6);
948 REGEX_ASSERT(matcher
->find());
949 REGEX_ASSERT(matcher
->start(status
) == 12);
950 REGEX_ASSERT(matcher
->find() == FALSE
);
951 REGEX_ASSERT(matcher
->find() == FALSE
);
954 REGEX_ASSERT(matcher
->find());
955 REGEX_ASSERT(matcher
->start(status
) == 1);
957 REGEX_ASSERT(matcher
->find(0, status
));
958 REGEX_ASSERT(matcher
->start(status
) == 1);
959 REGEX_ASSERT(matcher
->find(1, status
));
960 REGEX_ASSERT(matcher
->start(status
) == 1);
961 REGEX_ASSERT(matcher
->find(2, status
));
962 REGEX_ASSERT(matcher
->start(status
) == 6);
963 REGEX_ASSERT(matcher
->find(12, status
));
964 REGEX_ASSERT(matcher
->start(status
) == 12);
965 REGEX_ASSERT(matcher
->find(13, status
) == FALSE
);
966 REGEX_ASSERT(matcher
->find(16, status
) == FALSE
);
967 REGEX_ASSERT(matcher
->find(17, status
) == FALSE
);
968 REGEX_ASSERT_FAIL(matcher
->start(status
), U_REGEX_INVALID_STATE
);
970 status
= U_ZERO_ERROR
;
971 REGEX_ASSERT_FAIL(matcher
->find(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
972 status
= U_ZERO_ERROR
;
973 REGEX_ASSERT_FAIL(matcher
->find(18, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
975 REGEX_ASSERT(matcher
->groupCount() == 0);
983 // find, with \G in pattern (true if at the end of a previous match).
988 UErrorCode status
=U_ZERO_ERROR
;
990 UnicodeString
re(".*?(?:(\\Gabc)|(abc))", -1, US_INV
);
991 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
993 UnicodeString data
= ".abcabc.abc..";
994 // 012345678901234567
996 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
998 REGEX_ASSERT(matcher
->find());
999 REGEX_ASSERT(matcher
->start(status
) == 0);
1000 REGEX_ASSERT(matcher
->start(1, status
) == -1);
1001 REGEX_ASSERT(matcher
->start(2, status
) == 1);
1003 REGEX_ASSERT(matcher
->find());
1004 REGEX_ASSERT(matcher
->start(status
) == 4);
1005 REGEX_ASSERT(matcher
->start(1, status
) == 4);
1006 REGEX_ASSERT(matcher
->start(2, status
) == -1);
1014 // find with zero length matches, match position should bump ahead
1015 // to prevent loops.
1019 UErrorCode status
=U_ZERO_ERROR
;
1020 RegexMatcher
m("(?= ?)", 0, status
); // This pattern will produce zero-length matches anywhere,
1021 // using an always-true look-ahead.
1023 UnicodeString
s(" ");
1026 if (m
.find() == FALSE
) {
1029 REGEX_ASSERT(m
.start(status
) == i
);
1030 REGEX_ASSERT(m
.end(status
) == i
);
1034 // Check that the bump goes over surrogate pairs OK
1035 s
= UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1039 if (m
.find() == FALSE
) {
1042 REGEX_ASSERT(m
.start(status
) == i
);
1043 REGEX_ASSERT(m
.end(status
) == i
);
1045 REGEX_ASSERT(i
==10);
1048 // find() loop breaking test.
1049 // with pattern of /.?/, should see a series of one char matches, then a single
1050 // match of zero length at the end of the input string.
1052 UErrorCode status
=U_ZERO_ERROR
;
1053 RegexMatcher
m(".?", 0, status
);
1055 UnicodeString
s(" ");
1058 if (m
.find() == FALSE
) {
1061 REGEX_ASSERT(m
.start(status
) == i
);
1062 REGEX_ASSERT(m
.end(status
) == (i
<4 ? i
+1 : i
));
1069 // Matchers with no input string behave as if they had an empty input string.
1073 UErrorCode status
= U_ZERO_ERROR
;
1074 RegexMatcher
m(".?", 0, status
);
1076 REGEX_ASSERT(m
.find());
1077 REGEX_ASSERT(m
.start(status
) == 0);
1078 REGEX_ASSERT(m
.input() == "");
1081 UErrorCode status
= U_ZERO_ERROR
;
1082 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
1083 RegexMatcher
*m
= p
->matcher(status
);
1086 REGEX_ASSERT(m
->find() == FALSE
);
1087 REGEX_ASSERT(m
->input() == "");
1096 UErrorCode status
= U_ZERO_ERROR
;
1097 UnicodeString
testString("This is test data");
1098 RegexMatcher
m(".*", testString
, 0, status
);
1100 REGEX_ASSERT(m
.regionStart() == 0);
1101 REGEX_ASSERT(m
.regionEnd() == testString
.length());
1102 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1103 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1105 m
.region(2,4, status
);
1107 REGEX_ASSERT(m
.matches(status
));
1108 REGEX_ASSERT(m
.start(status
)==2);
1109 REGEX_ASSERT(m
.end(status
)==4);
1113 REGEX_ASSERT(m
.regionStart() == 0);
1114 REGEX_ASSERT(m
.regionEnd() == testString
.length());
1116 UnicodeString
shorterString("short");
1117 m
.reset(shorterString
);
1118 REGEX_ASSERT(m
.regionStart() == 0);
1119 REGEX_ASSERT(m
.regionEnd() == shorterString
.length());
1121 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1122 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(FALSE
));
1123 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
1124 REGEX_ASSERT(&m
== &m
.reset());
1125 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
1127 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(TRUE
));
1128 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1129 REGEX_ASSERT(&m
== &m
.reset());
1130 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1132 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1133 REGEX_ASSERT(&m
== &m
.useTransparentBounds(TRUE
));
1134 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
1135 REGEX_ASSERT(&m
== &m
.reset());
1136 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
1138 REGEX_ASSERT(&m
== &m
.useTransparentBounds(FALSE
));
1139 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1140 REGEX_ASSERT(&m
== &m
.reset());
1141 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1146 // hitEnd() and requireEnd()
1149 UErrorCode status
= U_ZERO_ERROR
;
1150 UnicodeString
testString("aabb");
1151 RegexMatcher
m1(".*", testString
, 0, status
);
1152 REGEX_ASSERT(m1
.lookingAt(status
) == TRUE
);
1153 REGEX_ASSERT(m1
.hitEnd() == TRUE
);
1154 REGEX_ASSERT(m1
.requireEnd() == FALSE
);
1157 status
= U_ZERO_ERROR
;
1158 RegexMatcher
m2("a*", testString
, 0, status
);
1159 REGEX_ASSERT(m2
.lookingAt(status
) == TRUE
);
1160 REGEX_ASSERT(m2
.hitEnd() == FALSE
);
1161 REGEX_ASSERT(m2
.requireEnd() == FALSE
);
1164 status
= U_ZERO_ERROR
;
1165 RegexMatcher
m3(".*$", testString
, 0, status
);
1166 REGEX_ASSERT(m3
.lookingAt(status
) == TRUE
);
1167 REGEX_ASSERT(m3
.hitEnd() == TRUE
);
1168 REGEX_ASSERT(m3
.requireEnd() == TRUE
);
1174 // Compilation error on reset with UChar *
1175 // These were a hazard that people were stumbling over with runtime errors.
1176 // Changed them to compiler errors by adding private methods that more closely
1177 // matched the incorrect use of the functions.
1181 UErrorCode status
= U_ZERO_ERROR
;
1182 UChar ucharString
[20];
1183 RegexMatcher
m(".", 0, status
);
1184 m
.reset(ucharString
); // should not compile.
1186 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
1187 RegexMatcher
*m2
= p
->matcher(ucharString
, status
); // should not compile.
1189 RegexMatcher
m3(".", ucharString
, 0, status
); // Should not compile
1195 // Note: These tests will need to be changed when the regexp engine is
1196 // able to detect and cut short the exponential time behavior on
1197 // this type of match.
1200 UErrorCode status
= U_ZERO_ERROR
;
1201 // Enough 'a's in the string to cause the match to time out.
1202 // (Each additional 'a' doubles the time)
1203 UnicodeString
testString("aaaaaaaaaaaaaaaaaaaaa");
1204 RegexMatcher
matcher("(a+)+b", testString
, 0, status
);
1206 REGEX_ASSERT(matcher
.getTimeLimit() == 0);
1207 matcher
.setTimeLimit(100, status
);
1208 REGEX_ASSERT(matcher
.getTimeLimit() == 100);
1209 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1210 REGEX_ASSERT(status
== U_REGEX_TIME_OUT
);
1213 UErrorCode status
= U_ZERO_ERROR
;
1214 // Few enough 'a's to slip in under the time limit.
1215 UnicodeString
testString("aaaaaaaaaaaaaaaaaa");
1216 RegexMatcher
matcher("(a+)+b", testString
, 0, status
);
1218 matcher
.setTimeLimit(100, status
);
1219 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1227 UErrorCode status
= U_ZERO_ERROR
;
1228 UnicodeString
testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
1230 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1231 // of the '+', and makes the stack frames larger.
1232 RegexMatcher
matcher("(A)+A$", testString
, 0, status
);
1234 // With the default stack, this match should fail to run
1235 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1236 REGEX_ASSERT(status
== U_REGEX_STACK_OVERFLOW
);
1238 // With unlimited stack, it should run
1239 status
= U_ZERO_ERROR
;
1240 matcher
.setStackLimit(0, status
);
1242 REGEX_ASSERT(matcher
.lookingAt(status
) == TRUE
);
1244 REGEX_ASSERT(matcher
.getStackLimit() == 0);
1246 // With a limited stack, the match should fail
1247 status
= U_ZERO_ERROR
;
1248 matcher
.setStackLimit(10000, status
);
1249 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1250 REGEX_ASSERT(status
== U_REGEX_STACK_OVERFLOW
);
1251 REGEX_ASSERT(matcher
.getStackLimit() == 10000);
1254 // A pattern that doesn't save state should work with
1255 // a minimal sized stack
1257 UErrorCode status
= U_ZERO_ERROR
;
1258 UnicodeString testString
= "abc";
1259 RegexMatcher
matcher("abc", testString
, 0, status
);
1261 matcher
.setStackLimit(30, status
);
1263 REGEX_ASSERT(matcher
.matches(status
) == TRUE
);
1265 REGEX_ASSERT(matcher
.getStackLimit() == 30);
1267 // Negative stack sizes should fail
1268 status
= U_ZERO_ERROR
;
1269 matcher
.setStackLimit(1000, status
);
1271 matcher
.setStackLimit(-1, status
);
1272 REGEX_ASSERT(status
== U_ILLEGAL_ARGUMENT_ERROR
);
1273 REGEX_ASSERT(matcher
.getStackLimit() == 1000);
1284 //---------------------------------------------------------------------------
1286 // API_Replace API test for class RegexMatcher, testing the
1287 // Replace family of functions.
1289 //---------------------------------------------------------------------------
// Exercises the replace family of RegexMatcher on UnicodeString input:
// replaceFirst(), replaceAll(), appendReplacement() and appendTail().
// Each result is compared against a literal expected string with REGEX_ASSERT,
// and the expected error codes are checked for malformed replacement text.
1290 void RegexTest::API_Replace() {
1296 UErrorCode status
=U_ZERO_ERROR
;
1298 UnicodeString
re("abc");
1299 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
1301 UnicodeString data
= ".abc..abc...abc..";
1302 // 012345678901234567
1303 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
1306 // Plain vanilla matches.
// replaceFirst() substitutes only the first "abc"; replaceAll() substitutes every one.
1309 dest
= matcher
->replaceFirst("yz", status
);
1311 REGEX_ASSERT(dest
== ".yz..abc...abc..");
1313 dest
= matcher
->replaceAll("yz", status
);
1315 REGEX_ASSERT(dest
== ".yz..yz...yz..");
1318 // Plain vanilla non-matches.
// With nothing to replace, the result is expected to equal the input text.
1320 UnicodeString d2
= ".abx..abx...abx..";
1322 dest
= matcher
->replaceFirst("yz", status
);
1324 REGEX_ASSERT(dest
== ".abx..abx...abx..");
1326 dest
= matcher
->replaceAll("yz", status
);
1328 REGEX_ASSERT(dest
== ".abx..abx...abx..");
1331 // Empty source string
1333 UnicodeString d3
= "";
1335 dest
= matcher
->replaceFirst("yz", status
);
1337 REGEX_ASSERT(dest
== "");
1339 dest
= matcher
->replaceAll("yz", status
);
1341 REGEX_ASSERT(dest
== "");
1344 // Empty substitution string
// Replacing with "" deletes the matched text.
1346 matcher
->reset(data
); // ".abc..abc...abc.."
1347 dest
= matcher
->replaceFirst("", status
);
1349 REGEX_ASSERT(dest
== "...abc...abc..");
1351 dest
= matcher
->replaceAll("", status
);
1353 REGEX_ASSERT(dest
== "........");
1356 // match whole string
1358 UnicodeString d4
= "abc";
1360 dest
= matcher
->replaceFirst("xyz", status
);
1362 REGEX_ASSERT(dest
== "xyz");
1364 dest
= matcher
->replaceAll("xyz", status
);
1366 REGEX_ASSERT(dest
== "xyz");
1369 // Capture Group, simple case
// Pattern "a(..)" has exactly one capture group; on "abcdefg" group 1 is "bc".
1371 UnicodeString
re2("a(..)");
1372 RegexPattern
*pat2
= RegexPattern::compile(re2
, flags
, pe
, status
);
1374 UnicodeString d5
= "abcdefg";
1375 RegexMatcher
*matcher2
= pat2
->matcher(d5
, status
);
1377 dest
= matcher2
->replaceFirst("$1$1", status
);
1379 REGEX_ASSERT(dest
== "bcbcdefg");
// A backslash-escaped "$1" is emitted literally; the unescaped "$1" expands to group 1.
1381 dest
= matcher2
->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status
);
1383 REGEX_ASSERT(dest
== "The value of $1 is bc.defg");
// A '$' that is not followed by a group number must produce a failure status,
// which is then cleared before the next case.
1385 dest
= matcher2
->replaceFirst("$ by itself, no group number $$$", status
);
1386 REGEX_ASSERT(U_FAILURE(status
));
1387 status
= U_ZERO_ERROR
;
// '$' followed by U+1D7CF (a supplementary-plane code point) is expected to be
// treated as a reference to group 1, judging by the asserted result below.
1389 UnicodeString replacement
= UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1390 replacement
= replacement
.unescape();
1391 dest
= matcher2
->replaceFirst(replacement
, status
);
1393 REGEX_ASSERT(dest
== "Supplemental Digit 1 bc.defg");
// The pattern has only one group, so a reference to group 5 must be rejected.
1395 REGEX_ASSERT_FAIL(matcher2
->replaceFirst("bad capture group number $5...",status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1399 // Replacement String with \u hex escapes
// The escape in the replacement text is expected to be resolved to the
// character it names (U+0043 is 'C').
1402 UnicodeString src
= "abc 1 abc 2 abc 3";
1403 UnicodeString substitute
= UNICODE_STRING_SIMPLE("--\\u0043--");
1404 matcher
->reset(src
);
1405 UnicodeString result
= matcher
->replaceAll(substitute
, status
);
1407 REGEX_ASSERT(result
== "--C-- 1 --C-- 2 --C-- 3");
// Same check with an eight-digit escape naming the supplementary code point U+10000.
1410 UnicodeString src
= "abc !";
1411 UnicodeString substitute
= UNICODE_STRING_SIMPLE("--\\U00010000--");
1412 matcher
->reset(src
);
1413 UnicodeString result
= matcher
->replaceAll(substitute
, status
);
1415 UnicodeString expected
= UnicodeString("--");
1416 expected
.append((UChar32
)0x10000);
1417 expected
.append("-- !");
1418 REGEX_ASSERT(result
== expected
);
1420 // TODO: need more thorough testing of capture substitutions.
1425 status
= U_ZERO_ERROR
;
1426 UnicodeString s
= "The matches start with ss and end with ee ss stuff ee fin";
1427 RegexMatcher
m("ss(.*?)ee", 0, status
);
1429 UnicodeString result
;
1431 // Multiple finds do NOT bump up the previous appendReplacement position.
1435 m
.appendReplacement(result
, "ooh", status
);
1437 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
1439 // After a reset into the interior of a string, appendReplacement still starts at beginning.
1440 status
= U_ZERO_ERROR
;
1442 m
.reset(10, status
);
1445 m
.appendReplacement(result
, "ooh", status
);
1447 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
1449 // find() at interior of string, appendReplacement still starts at beginning.
1450 status
= U_ZERO_ERROR
;
1455 m
.appendReplacement(result
, "ooh", status
);
1457 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
// appendTail() adds the input text that follows the last match (" fin").
1459 m
.appendTail(result
);
1460 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh fin");
1471 //---------------------------------------------------------------------------
1473 // API_Pattern Test that the API for class RegexPattern is
1474 // present and nominally working.
1476 //---------------------------------------------------------------------------
// Exercises the RegexPattern API: default construction, operator== / operator!=,
// copy construction, compile() with and without flags, flags(), clone(),
// the static matches() convenience function, split(), pattern(), and the
// class ID functions. All checks are made with REGEX_ASSERT.
1477 void RegexTest::API_Pattern() {
1478 RegexPattern pata
; // Test default constructor to not crash.
1481 REGEX_ASSERT(pata
== patb
);
1482 REGEX_ASSERT(pata
== pata
);
1484 UnicodeString
re1("abc[a-l][m-z]");
1485 UnicodeString
re2("def");
1486 UErrorCode status
= U_ZERO_ERROR
;
1489 RegexPattern
*pat1
= RegexPattern::compile(re1
, 0, pe
, status
);
1490 RegexPattern
*pat2
= RegexPattern::compile(re2
, 0, pe
, status
);
// A compiled pattern equals itself and differs from a default-constructed one.
1492 REGEX_ASSERT(*pat1
== *pat1
);
1493 REGEX_ASSERT(*pat1
!= pata
);
1497 REGEX_ASSERT(patb
== *pat1
);
// Copy construction must yield a pattern equal to its source.
1500 RegexPattern
patc(*pat1
);
1501 REGEX_ASSERT(patc
== *pat1
);
1502 REGEX_ASSERT(patb
== patc
);
// Note: this one compares the two RegexPattern pointers, not the pattern objects.
1503 REGEX_ASSERT(pat1
!= pat2
);
1505 REGEX_ASSERT(patb
!= patc
);
1506 REGEX_ASSERT(patb
== *pat2
);
1508 // Compile with no flags.
// The compile() overload without a flags argument must match compiling with flags == 0.
1509 RegexPattern
*pat1a
= RegexPattern::compile(re1
, pe
, status
);
1510 REGEX_ASSERT(*pat1a
== *pat1
);
1512 REGEX_ASSERT(pat1a
->flags() == 0);
1514 // Compile with different flags should be not equal
1515 RegexPattern
*pat1b
= RegexPattern::compile(re1
, UREGEX_CASE_INSENSITIVE
, pe
, status
);
1518 REGEX_ASSERT(*pat1b
!= *pat1a
);
1519 REGEX_ASSERT(pat1b
->flags() == UREGEX_CASE_INSENSITIVE
);
1520 REGEX_ASSERT(pat1a
->flags() == 0);
// clone() must produce a pattern equal to its source and different from an unrelated one.
1524 RegexPattern
*pat1c
= pat1
->clone();
1525 REGEX_ASSERT(*pat1c
== *pat1
);
1526 REGEX_ASSERT(*pat1c
!= *pat2
);
1535 // Verify that a matcher created from a cloned pattern works.
1539 UErrorCode status
= U_ZERO_ERROR
;
1540 RegexPattern
*pSource
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status
);
1541 RegexPattern
*pClone
= pSource
->clone();
1543 RegexMatcher
*mFromClone
= pClone
->matcher(status
);
1545 UnicodeString s
= "Hello World";
1546 mFromClone
->reset(s
);
1547 REGEX_ASSERT(mFromClone
->find() == TRUE
);
1548 REGEX_ASSERT(mFromClone
->group(status
) == "Hello");
1549 REGEX_ASSERT(mFromClone
->find() == TRUE
);
1550 REGEX_ASSERT(mFromClone
->group(status
) == "World");
1551 REGEX_ASSERT(mFromClone
->find() == FALSE
);
1557 // matches convenience API
// matches() must match the entire input: ".*u" is expected to fail on
// "random input" because the input does not end in 'u'.
1559 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe
, status
) == TRUE
);
1561 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe
, status
) == FALSE
);
1563 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe
, status
) == TRUE
);
1565 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe
, status
) == TRUE
);
1567 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe
, status
) == FALSE
);
// With status already set to a failure code on entry, matches() must return
// FALSE (even for a pattern that would match) and leave the status untouched.
1569 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1570 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe
, status
) == FALSE
);
1571 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
// split() on runs of spaces, writing into a 10-element UnicodeString array.
1577 status
= U_ZERO_ERROR
;
1578 pat1
= RegexPattern::compile(" +", pe
, status
);
1580 UnicodeString fields
[10];
1583 n
= pat1
->split("Now is the time", fields
, 10, status
);
1586 REGEX_ASSERT(fields
[0]=="Now");
1587 REGEX_ASSERT(fields
[1]=="is");
1588 REGEX_ASSERT(fields
[2]=="the");
1589 REGEX_ASSERT(fields
[3]=="time");
1590 REGEX_ASSERT(fields
[4]=="");
// With a destination capacity of 2, the last available field receives the
// whole remainder of the input and later array slots are not modified.
1592 n
= pat1
->split("Now is the time", fields
, 2, status
);
1595 REGEX_ASSERT(fields
[0]=="Now");
1596 REGEX_ASSERT(fields
[1]=="is the time");
1597 REGEX_ASSERT(fields
[2]=="the"); // left over from previous test
// With a capacity of 1 the single field is expected to hold the entire input.
1600 status
= U_ZERO_ERROR
;
1601 n
= pat1
->split("Now is the time", fields
, 1, status
);
1604 REGEX_ASSERT(fields
[0]=="Now is the time");
1605 REGEX_ASSERT(fields
[1]=="*");
1606 status
= U_ZERO_ERROR
;
// A leading delimiter yields an empty first field.
1608 n
= pat1
->split(" Now is the time ", fields
, 10, status
);
1611 REGEX_ASSERT(fields
[0]=="");
1612 REGEX_ASSERT(fields
[1]=="Now");
1613 REGEX_ASSERT(fields
[2]=="is");
1614 REGEX_ASSERT(fields
[3]=="the");
1615 REGEX_ASSERT(fields
[4]=="time");
1616 REGEX_ASSERT(fields
[5]=="");
1618 n
= pat1
->split(" ", fields
, 10, status
);
1621 REGEX_ASSERT(fields
[0]=="");
1622 REGEX_ASSERT(fields
[1]=="");
1625 n
= pat1
->split("", fields
, 10, status
);
1628 REGEX_ASSERT(fields
[0]=="foo");
1632 // split, with a pattern with (capture)
// The text captured by the delimiter's group is returned as a field of its own,
// interleaved with the text between delimiters (see the asserted values below).
1633 pat1
= RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe
, status
);
1636 status
= U_ZERO_ERROR
;
1637 n
= pat1
->split("<a>Now is <b>the time<c>", fields
, 10, status
);
1640 REGEX_ASSERT(fields
[0]=="");
1641 REGEX_ASSERT(fields
[1]=="a");
1642 REGEX_ASSERT(fields
[2]=="Now is ");
1643 REGEX_ASSERT(fields
[3]=="b");
1644 REGEX_ASSERT(fields
[4]=="the time");
1645 REGEX_ASSERT(fields
[5]=="c");
1646 REGEX_ASSERT(fields
[6]=="");
1647 REGEX_ASSERT(status
==U_ZERO_ERROR
);
1649 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 10, status
);
1652 REGEX_ASSERT(fields
[0]==" ");
1653 REGEX_ASSERT(fields
[1]=="a");
1654 REGEX_ASSERT(fields
[2]=="Now is ");
1655 REGEX_ASSERT(fields
[3]=="b");
1656 REGEX_ASSERT(fields
[4]=="the time");
1657 REGEX_ASSERT(fields
[5]=="c");
1658 REGEX_ASSERT(fields
[6]=="");
// The same input is now split with progressively smaller capacities (6, 5, 4)
// to check how the final field absorbs the unsplit remainder.
1660 status
= U_ZERO_ERROR
;
1662 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 6, status
);
1665 REGEX_ASSERT(fields
[0]==" ");
1666 REGEX_ASSERT(fields
[1]=="a");
1667 REGEX_ASSERT(fields
[2]=="Now is ");
1668 REGEX_ASSERT(fields
[3]=="b");
1669 REGEX_ASSERT(fields
[4]=="the time");
1670 REGEX_ASSERT(fields
[5]==""); // All text following "<c>" field delimiter.
1671 REGEX_ASSERT(fields
[6]=="foo");
1673 status
= U_ZERO_ERROR
;
1675 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 5, status
);
1678 REGEX_ASSERT(fields
[0]==" ");
1679 REGEX_ASSERT(fields
[1]=="a");
1680 REGEX_ASSERT(fields
[2]=="Now is ");
1681 REGEX_ASSERT(fields
[3]=="b");
1682 REGEX_ASSERT(fields
[4]=="the time<c>");
1683 REGEX_ASSERT(fields
[5]=="foo");
1685 status
= U_ZERO_ERROR
;
1687 n
= pat1
->split(" <a>Now is <b>the time", fields
, 5, status
);
1690 REGEX_ASSERT(fields
[0]==" ");
1691 REGEX_ASSERT(fields
[1]=="a");
1692 REGEX_ASSERT(fields
[2]=="Now is ");
1693 REGEX_ASSERT(fields
[3]=="b");
1694 REGEX_ASSERT(fields
[4]=="the time");
1695 REGEX_ASSERT(fields
[5]=="foo");
1697 status
= U_ZERO_ERROR
;
1698 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 4, status
);
1701 REGEX_ASSERT(fields
[0]==" ");
1702 REGEX_ASSERT(fields
[1]=="a");
1703 REGEX_ASSERT(fields
[2]=="Now is ");
1704 REGEX_ASSERT(fields
[3]=="the time<c>");
1705 status
= U_ZERO_ERROR
;
// Single-character delimiters inside a capture group: the delimiters themselves
// ("-" and ",") appear as fields between the numbers.
1708 pat1
= RegexPattern::compile("([-,])", pe
, status
);
1710 n
= pat1
->split("1-10,20", fields
, 10, status
);
1713 REGEX_ASSERT(fields
[0]=="1");
1714 REGEX_ASSERT(fields
[1]=="-");
1715 REGEX_ASSERT(fields
[2]=="10");
1716 REGEX_ASSERT(fields
[3]==",");
1717 REGEX_ASSERT(fields
[4]=="20");
1720 // Test split of string with empty trailing fields
1721 pat1
= RegexPattern::compile(",", pe
, status
);
1723 n
= pat1
->split("a,b,c,", fields
, 10, status
);
1726 REGEX_ASSERT(fields
[0]=="a");
1727 REGEX_ASSERT(fields
[1]=="b");
1728 REGEX_ASSERT(fields
[2]=="c");
1729 REGEX_ASSERT(fields
[3]=="");
1731 n
= pat1
->split("a,,,", fields
, 10, status
);
1734 REGEX_ASSERT(fields
[0]=="a");
1735 REGEX_ASSERT(fields
[1]=="");
1736 REGEX_ASSERT(fields
[2]=="");
1737 REGEX_ASSERT(fields
[3]=="");
1740 // Split Separator with zero length match.
// ":?" can match the empty string, so the input is expected to be split
// between every character, with empty fields at both ends.
1741 pat1
= RegexPattern::compile(":?", pe
, status
);
1743 n
= pat1
->split("abc", fields
, 10, status
);
1746 REGEX_ASSERT(fields
[0]=="");
1747 REGEX_ASSERT(fields
[1]=="a");
1748 REGEX_ASSERT(fields
[2]=="b");
1749 REGEX_ASSERT(fields
[3]=="c");
1750 REGEX_ASSERT(fields
[4]=="");
1755 // RegexPattern::pattern()
// A default-constructed pattern reports an empty pattern string; a compiled
// one reports the source text it was compiled from.
1757 pat1
= new RegexPattern();
1758 REGEX_ASSERT(pat1
->pattern() == "");
1761 pat1
= RegexPattern::compile("(Hello, world)*", pe
, status
);
1763 REGEX_ASSERT(pat1
->pattern() == "(Hello, world)*");
1768 // classID functions
// Dynamic and static class IDs must agree for each class, be non-NULL, and
// differ between RegexPattern and RegexMatcher.
1770 pat1
= RegexPattern::compile("(Hello, world)*", pe
, status
);
1772 REGEX_ASSERT(pat1
->getDynamicClassID() == RegexPattern::getStaticClassID());
1773 REGEX_ASSERT(pat1
->getDynamicClassID() != NULL
);
1774 UnicodeString
Hello("Hello, world.");
1775 RegexMatcher
*m
= pat1
->matcher(Hello
, status
);
1776 REGEX_ASSERT(pat1
->getDynamicClassID() != m
->getDynamicClassID());
1777 REGEX_ASSERT(m
->getDynamicClassID() == RegexMatcher::getStaticClassID());
1778 REGEX_ASSERT(m
->getDynamicClassID() != NULL
);
1784 //---------------------------------------------------------------------------
1786 // API_Match_UTF8 Test that the alternate engine for class RegexMatcher
1787 // is present and working, but excluding functions
1788 // implementing replace operations.
1790 //---------------------------------------------------------------------------
1791 void RegexTest::API_Match_UTF8() {
1793 UErrorCode status
=U_ZERO_ERROR
;
1797 // Debug - slide failing test cases early
1806 // Simple pattern compilation
1809 UText re
= UTEXT_INITIALIZER
;
1810 regextst_openUTF8FromInvariant(&re
, "abc", -1, &status
);
1811 REGEX_VERBOSE_TEXT(&re
);
1813 pat2
= RegexPattern::compile(&re
, flags
, pe
, status
);
1816 UText input1
= UTEXT_INITIALIZER
;
1817 UText input2
= UTEXT_INITIALIZER
;
1818 UText empty
= UTEXT_INITIALIZER
;
1819 regextst_openUTF8FromInvariant(&input1
, "abcdef this is a test", -1, &status
);
1820 REGEX_VERBOSE_TEXT(&input1
);
1821 regextst_openUTF8FromInvariant(&input2
, "not abc", -1, &status
);
1822 REGEX_VERBOSE_TEXT(&input2
);
1823 utext_openUChars(&empty
, NULL
, 0, &status
);
1825 int32_t input1Len
= static_cast<int32_t>(strlen("abcdef this is a test")); /* TODO: why not nativelen (input1) ? */
1826 int32_t input2Len
= static_cast<int32_t>(strlen("not abc"));
1830 // Matcher creation and reset.
1832 RegexMatcher
*m1
= &pat2
->matcher(status
)->reset(&input1
);
1834 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
1835 const char str_abcdefthisisatest
[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1836 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest
, m1
->inputText());
1838 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
1839 const char str_notabc
[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1840 REGEX_ASSERT_UTEXT_UTF8(str_notabc
, m1
->inputText());
1842 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest
, m1
->inputText());
1843 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
1845 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
1846 REGEX_ASSERT(utext_nativeLength(&empty
) == 0);
1849 // reset(pos, status)
1852 m1
->reset(4, status
);
1854 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest
, m1
->inputText());
1855 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
1857 m1
->reset(-1, status
);
1858 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1859 status
= U_ZERO_ERROR
;
1861 m1
->reset(0, status
);
1863 status
= U_ZERO_ERROR
;
1865 m1
->reset(input1Len
-1, status
);
1867 status
= U_ZERO_ERROR
;
1869 m1
->reset(input1Len
, status
);
1871 status
= U_ZERO_ERROR
;
1873 m1
->reset(input1Len
+1, status
);
1874 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1875 status
= U_ZERO_ERROR
;
1878 // match(pos, status)
1881 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
1883 REGEX_ASSERT(m1
->matches(3, status
) == FALSE
);
1885 REGEX_ASSERT(m1
->matches(5, status
) == FALSE
);
1886 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
1887 REGEX_ASSERT(m1
->matches(-1, status
) == FALSE
);
1888 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1890 // Match() at end of string should fail, but should not
1892 status
= U_ZERO_ERROR
;
1893 REGEX_ASSERT(m1
->matches(input2Len
, status
) == FALSE
);
1896 // Match beyond end of string should fail with an error.
1897 status
= U_ZERO_ERROR
;
1898 REGEX_ASSERT(m1
->matches(input2Len
+1, status
) == FALSE
);
1899 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1901 // Successful match at end of string.
1903 status
= U_ZERO_ERROR
;
1904 RegexMatcher
m("A?", 0, status
); // will match zero length string.
1907 REGEX_ASSERT(m
.matches(input1Len
, status
) == TRUE
);
1910 REGEX_ASSERT(m
.matches(0, status
) == TRUE
);
1916 // lookingAt(pos, status)
1918 status
= U_ZERO_ERROR
;
1919 m1
->reset(&input2
); // "not abc"
1920 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
1921 REGEX_ASSERT(m1
->lookingAt(5, status
) == FALSE
);
1922 REGEX_ASSERT(m1
->lookingAt(3, status
) == FALSE
);
1923 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
1924 REGEX_ASSERT(m1
->lookingAt(-1, status
) == FALSE
);
1925 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1926 status
= U_ZERO_ERROR
;
1927 REGEX_ASSERT(m1
->lookingAt(input2Len
, status
) == FALSE
);
1929 REGEX_ASSERT(m1
->lookingAt(input2Len
+1, status
) == FALSE
);
1930 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1936 utext_close(&input1
);
1937 utext_close(&input2
);
1938 utext_close(&empty
);
1944 // RegexMatcher::start();
1945 // RegexMatcher::end();
1946 // RegexMatcher::groupCount();
1951 UErrorCode status
=U_ZERO_ERROR
;
1952 UText re
=UTEXT_INITIALIZER
;
1953 const char str_01234567_pat
[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1954 utext_openUTF8(&re
, str_01234567_pat
, -1, &status
);
1956 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
1959 UText input
= UTEXT_INITIALIZER
;
1960 const char str_0123456789
[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1961 utext_openUTF8(&input
, str_0123456789
, -1, &status
);
1963 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&input
);
1965 REGEX_ASSERT(matcher
->lookingAt(status
) == TRUE
);
1966 static const int32_t matchStarts
[] = {0, 2, 4, 8};
1967 static const int32_t matchEnds
[] = {10, 8, 6, 10};
1969 for (i
=0; i
<4; i
++) {
1970 int32_t actualStart
= matcher
->start(i
, status
);
1972 if (actualStart
!= matchStarts
[i
]) {
1973 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n",
1974 __FILE__
, __LINE__
, i
, matchStarts
[i
], actualStart
);
1976 int32_t actualEnd
= matcher
->end(i
, status
);
1978 if (actualEnd
!= matchEnds
[i
]) {
1979 errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n",
1980 __FILE__
, __LINE__
, i
, matchEnds
[i
], actualEnd
);
1984 REGEX_ASSERT(matcher
->start(0, status
) == matcher
->start(status
));
1985 REGEX_ASSERT(matcher
->end(0, status
) == matcher
->end(status
));
1987 REGEX_ASSERT_FAIL(matcher
->start(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1988 REGEX_ASSERT_FAIL(matcher
->start( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1990 REGEX_ASSERT_FAIL(matcher
->start( 0, status
), U_REGEX_INVALID_STATE
);
1992 matcher
->lookingAt(status
);
1995 UText destText
= UTEXT_INITIALIZER
;
1996 utext_openUnicodeString(&destText
, &dest
, &status
);
1998 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1999 // Test shallow-clone API
2001 result
= matcher
->group((UText
*)NULL
, group_len
, status
);
2003 REGEX_ASSERT_UTEXT_UTF8(str_0123456789
, result
);
2004 utext_close(result
);
2005 result
= matcher
->group(0, &destText
, group_len
, status
);
2007 REGEX_ASSERT(result
== &destText
);
2008 REGEX_ASSERT_UTEXT_UTF8(str_0123456789
, result
);
2009 // destText is now immutable, reopen it
2010 utext_close(&destText
);
2011 utext_openUnicodeString(&destText
, &dest
, &status
);
2014 result
= matcher
->group(0, NULL
, length
, status
);
2016 REGEX_ASSERT_UTEXT_UTF8(str_0123456789
, result
);
2017 utext_close(result
);
2018 result
= matcher
->group(0, &destText
, length
, status
);
2020 REGEX_ASSERT(result
== &destText
);
2021 REGEX_ASSERT(utext_getNativeIndex(result
) == 0);
2022 REGEX_ASSERT(length
== 10);
2023 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2025 // Capture Group 1 == "234567"
2026 result
= matcher
->group(1, NULL
, length
, status
);
2028 REGEX_ASSERT(utext_getNativeIndex(result
) == 2);
2029 REGEX_ASSERT(length
== 6);
2030 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2031 utext_close(result
);
2033 result
= matcher
->group(1, &destText
, length
, status
);
2035 REGEX_ASSERT(result
== &destText
);
2036 REGEX_ASSERT(utext_getNativeIndex(result
) == 2);
2037 REGEX_ASSERT(length
== 6);
2038 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2039 utext_close(result
);
2041 // Capture Group 2 == "45"
2042 result
= matcher
->group(2, NULL
, length
, status
);
2044 REGEX_ASSERT(utext_getNativeIndex(result
) == 4);
2045 REGEX_ASSERT(length
== 2);
2046 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2047 utext_close(result
);
2049 result
= matcher
->group(2, &destText
, length
, status
);
2051 REGEX_ASSERT(result
== &destText
);
2052 REGEX_ASSERT(utext_getNativeIndex(result
) == 4);
2053 REGEX_ASSERT(length
== 2);
2054 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2055 utext_close(result
);
2057 // Capture Group 3 == "89"
2058 result
= matcher
->group(3, NULL
, length
, status
);
2060 REGEX_ASSERT(utext_getNativeIndex(result
) == 8);
2061 REGEX_ASSERT(length
== 2);
2062 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2063 utext_close(result
);
2065 result
= matcher
->group(3, &destText
, length
, status
);
2067 REGEX_ASSERT(result
== &destText
);
2068 REGEX_ASSERT(utext_getNativeIndex(result
) == 8);
2069 REGEX_ASSERT(length
== 2);
2070 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2071 utext_close(result
);
2073 // Capture Group number out of range.
2074 status
= U_ZERO_ERROR
;
2075 REGEX_ASSERT_FAIL(matcher
->group(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2076 status
= U_ZERO_ERROR
;
2077 REGEX_ASSERT_FAIL(matcher
->group( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2078 status
= U_ZERO_ERROR
;
2080 REGEX_ASSERT_FAIL(matcher
->group( 0, status
), U_REGEX_INVALID_STATE
);
2085 utext_close(&destText
);
2086 utext_close(&input
);
2096 UErrorCode status
=U_ZERO_ERROR
;
2097 UText re
=UTEXT_INITIALIZER
;
2098 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2099 utext_openUTF8(&re
, str_abc
, -1, &status
);
2101 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
2103 UText input
= UTEXT_INITIALIZER
;
2104 const char str_abcabcabc
[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2105 utext_openUTF8(&input
, str_abcabcabc
, -1, &status
);
2106 // 012345678901234567
2108 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&input
);
2110 REGEX_ASSERT(matcher
->find());
2111 REGEX_ASSERT(matcher
->start(status
) == 1);
2112 REGEX_ASSERT(matcher
->find());
2113 REGEX_ASSERT(matcher
->start(status
) == 6);
2114 REGEX_ASSERT(matcher
->find());
2115 REGEX_ASSERT(matcher
->start(status
) == 12);
2116 REGEX_ASSERT(matcher
->find() == FALSE
);
2117 REGEX_ASSERT(matcher
->find() == FALSE
);
2120 REGEX_ASSERT(matcher
->find());
2121 REGEX_ASSERT(matcher
->start(status
) == 1);
2123 REGEX_ASSERT(matcher
->find(0, status
));
2124 REGEX_ASSERT(matcher
->start(status
) == 1);
2125 REGEX_ASSERT(matcher
->find(1, status
));
2126 REGEX_ASSERT(matcher
->start(status
) == 1);
2127 REGEX_ASSERT(matcher
->find(2, status
));
2128 REGEX_ASSERT(matcher
->start(status
) == 6);
2129 REGEX_ASSERT(matcher
->find(12, status
));
2130 REGEX_ASSERT(matcher
->start(status
) == 12);
2131 REGEX_ASSERT(matcher
->find(13, status
) == FALSE
);
2132 REGEX_ASSERT(matcher
->find(16, status
) == FALSE
);
2133 REGEX_ASSERT(matcher
->find(17, status
) == FALSE
);
2134 REGEX_ASSERT_FAIL(matcher
->start(status
), U_REGEX_INVALID_STATE
);
2136 status
= U_ZERO_ERROR
;
2137 REGEX_ASSERT_FAIL(matcher
->find(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2138 status
= U_ZERO_ERROR
;
2139 REGEX_ASSERT_FAIL(matcher
->find(18, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2141 REGEX_ASSERT(matcher
->groupCount() == 0);
2146 utext_close(&input
);
2152 // find, with \G in pattern (true if at the end of a previous match).
2157 UErrorCode status
=U_ZERO_ERROR
;
2158 UText re
=UTEXT_INITIALIZER
;
2159 const char str_Gabcabc
[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2160 utext_openUTF8(&re
, str_Gabcabc
, -1, &status
);
2162 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
2165 UText input
= UTEXT_INITIALIZER
;
2166 const char str_abcabcabc
[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2167 utext_openUTF8(&input
, str_abcabcabc
, -1, &status
);
2168 // 012345678901234567
2170 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&input
);
2172 REGEX_ASSERT(matcher
->find());
2173 REGEX_ASSERT(matcher
->start(status
) == 0);
2174 REGEX_ASSERT(matcher
->start(1, status
) == -1);
2175 REGEX_ASSERT(matcher
->start(2, status
) == 1);
2177 REGEX_ASSERT(matcher
->find());
2178 REGEX_ASSERT(matcher
->start(status
) == 4);
2179 REGEX_ASSERT(matcher
->start(1, status
) == 4);
2180 REGEX_ASSERT(matcher
->start(2, status
) == -1);
2186 utext_close(&input
);
2191 // find with zero length matches, match position should bump ahead
2192 // to prevent loops.
2196 UErrorCode status
=U_ZERO_ERROR
;
2197 RegexMatcher
m("(?= ?)", 0, status
); // This pattern will match zero-length strings anywhere,
2198 // using an always-true look-ahead.
2200 UText s
= UTEXT_INITIALIZER
;
2201 utext_openUTF8(&s
, " ", -1, &status
);
2204 if (m
.find() == FALSE
) {
2207 REGEX_ASSERT(m
.start(status
) == i
);
2208 REGEX_ASSERT(m
.end(status
) == i
);
2212 // Check that the bump goes over characters outside the BMP OK
2213 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2214 unsigned char aboveBMP
[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2215 utext_openUTF8(&s
, (char *)aboveBMP
, -1, &status
);
2218 if (m
.find() == FALSE
) {
2221 REGEX_ASSERT(m
.start(status
) == i
);
2222 REGEX_ASSERT(m
.end(status
) == i
);
2224 REGEX_ASSERT(i
==20);
2229 // find() loop breaking test.
2230 // with pattern of /.?/, should see a series of one char matches, then a single
2231 // match of zero length at the end of the input string.
2233 UErrorCode status
=U_ZERO_ERROR
;
2234 RegexMatcher
m(".?", 0, status
);
2236 UText s
= UTEXT_INITIALIZER
;
2237 utext_openUTF8(&s
, " ", -1, &status
);
2240 if (m
.find() == FALSE
) {
2243 REGEX_ASSERT(m
.start(status
) == i
);
2244 REGEX_ASSERT(m
.end(status
) == (i
<4 ? i
+1 : i
));
2253 // Matchers with no input string behave as if they had an empty input string.
2257 UErrorCode status
= U_ZERO_ERROR
;
2258 RegexMatcher
m(".?", 0, status
);
2260 REGEX_ASSERT(m
.find());
2261 REGEX_ASSERT(m
.start(status
) == 0);
2262 REGEX_ASSERT(m
.input() == "");
2265 UErrorCode status
= U_ZERO_ERROR
;
2266 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
2267 RegexMatcher
*m
= p
->matcher(status
);
2270 REGEX_ASSERT(m
->find() == FALSE
);
2271 REGEX_ASSERT(utext_nativeLength(m
->inputText()) == 0);
2280 UErrorCode status
= U_ZERO_ERROR
;
2281 UText testPattern
= UTEXT_INITIALIZER
;
2282 UText testText
= UTEXT_INITIALIZER
;
2283 regextst_openUTF8FromInvariant(&testPattern
, ".*", -1, &status
);
2284 REGEX_VERBOSE_TEXT(&testPattern
);
2285 regextst_openUTF8FromInvariant(&testText
, "This is test data", -1, &status
);
2286 REGEX_VERBOSE_TEXT(&testText
);
2288 RegexMatcher
m(&testPattern
, &testText
, 0, status
);
2290 REGEX_ASSERT(m
.regionStart() == 0);
2291 REGEX_ASSERT(m
.regionEnd() == (int32_t)strlen("This is test data"));
2292 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2293 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2295 m
.region(2,4, status
);
2297 REGEX_ASSERT(m
.matches(status
));
2298 REGEX_ASSERT(m
.start(status
)==2);
2299 REGEX_ASSERT(m
.end(status
)==4);
2303 REGEX_ASSERT(m
.regionStart() == 0);
2304 REGEX_ASSERT(m
.regionEnd() == (int32_t)strlen("This is test data"));
2306 regextst_openUTF8FromInvariant(&testText
, "short", -1, &status
);
2307 REGEX_VERBOSE_TEXT(&testText
);
2309 REGEX_ASSERT(m
.regionStart() == 0);
2310 REGEX_ASSERT(m
.regionEnd() == (int32_t)strlen("short"));
2312 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2313 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(FALSE
));
2314 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
2315 REGEX_ASSERT(&m
== &m
.reset());
2316 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
2318 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(TRUE
));
2319 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2320 REGEX_ASSERT(&m
== &m
.reset());
2321 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2323 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2324 REGEX_ASSERT(&m
== &m
.useTransparentBounds(TRUE
));
2325 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
2326 REGEX_ASSERT(&m
== &m
.reset());
2327 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
2329 REGEX_ASSERT(&m
== &m
.useTransparentBounds(FALSE
));
2330 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2331 REGEX_ASSERT(&m
== &m
.reset());
2332 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2334 utext_close(&testText
);
2335 utext_close(&testPattern
);
2339 // hitEnd() and requireEnd()
2342 UErrorCode status
= U_ZERO_ERROR
;
2343 UText testPattern
= UTEXT_INITIALIZER
;
2344 UText testText
= UTEXT_INITIALIZER
;
2345 const char str_
[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2346 const char str_aabb
[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2347 utext_openUTF8(&testPattern
, str_
, -1, &status
);
2348 utext_openUTF8(&testText
, str_aabb
, -1, &status
);
2350 RegexMatcher
m1(&testPattern
, &testText
, 0, status
);
2351 REGEX_ASSERT(m1
.lookingAt(status
) == TRUE
);
2352 REGEX_ASSERT(m1
.hitEnd() == TRUE
);
2353 REGEX_ASSERT(m1
.requireEnd() == FALSE
);
2356 status
= U_ZERO_ERROR
;
2357 const char str_a
[] = { 0x61, 0x2a, 0x00 }; /* a* */
2358 utext_openUTF8(&testPattern
, str_a
, -1, &status
);
2359 RegexMatcher
m2(&testPattern
, &testText
, 0, status
);
2360 REGEX_ASSERT(m2
.lookingAt(status
) == TRUE
);
2361 REGEX_ASSERT(m2
.hitEnd() == FALSE
);
2362 REGEX_ASSERT(m2
.requireEnd() == FALSE
);
2365 status
= U_ZERO_ERROR
;
2366 const char str_dotstardollar
[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2367 utext_openUTF8(&testPattern
, str_dotstardollar
, -1, &status
);
2368 RegexMatcher
m3(&testPattern
, &testText
, 0, status
);
2369 REGEX_ASSERT(m3
.lookingAt(status
) == TRUE
);
2370 REGEX_ASSERT(m3
.hitEnd() == TRUE
);
2371 REGEX_ASSERT(m3
.requireEnd() == TRUE
);
2374 utext_close(&testText
);
2375 utext_close(&testPattern
);
2380 //---------------------------------------------------------------------------
2382 // API_Replace_UTF8 API test for class RegexMatcher, testing the
2383 // Replace family of functions.
2385 //---------------------------------------------------------------------------
// Exercises RegexMatcher::replaceFirst(), replaceAll(), appendReplacement() and appendTail()
// with UText (UTF-8) pattern, input and replacement text.
// Most replace calls are made twice: once with a NULL destination (the returned UText is
// closed with utext_close() afterwards) and once writing into destText, in which case the
// returned pointer is asserted to be &destText.
2386 void RegexTest::API_Replace_UTF8() {
2392 UErrorCode status
=U_ZERO_ERROR
;
2394 UText re
=UTEXT_INITIALIZER
;
2395 regextst_openUTF8FromInvariant(&re
, "abc", -1, &status
);
2396 REGEX_VERBOSE_TEXT(&re
);
2397 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
2400 char data
[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2401 // 012345678901234567
2402 UText dataText
= UTEXT_INITIALIZER
;
2403 utext_openUTF8(&dataText
, data
, -1, &status
);
2405 REGEX_VERBOSE_TEXT(&dataText
);
2406 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&dataText
);
2409 // Plain vanilla matches.
2412 UText destText
= UTEXT_INITIALIZER
;
2413 utext_openUnicodeString(&destText
, &dest
, &status
);
2416 UText replText
= UTEXT_INITIALIZER
;
2418 const char str_yz
[] = { 0x79, 0x7a, 0x00 }; /* yz */
2419 utext_openUTF8(&replText
, str_yz
, -1, &status
);
2420 REGEX_VERBOSE_TEXT(&replText
);
2421 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2423 const char str_yzabcabc
[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2424 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc
, result
);
2425 utext_close(result
);
2426 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2428 REGEX_ASSERT(result
== &destText
);
2429 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc
, result
);
2431 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2433 const char str_yzyzyz
[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2434 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz
, result
);
2435 utext_close(result
);
// Empty destText (replace its whole native range with zero-length text) before reusing it.
2437 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2438 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2440 REGEX_ASSERT(result
== &destText
);
2441 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz
, result
);
2444 // Plain vanilla non-matches.
// The input contains no "abc", so every result is expected to equal the input.
2446 const char str_abxabxabx
[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2447 utext_openUTF8(&dataText
, str_abxabxabx
, -1, &status
);
2448 matcher
->reset(&dataText
);
2450 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2452 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2453 utext_close(result
);
2454 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2456 REGEX_ASSERT(result
== &destText
);
2457 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2459 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2461 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2462 utext_close(result
);
2463 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2464 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2466 REGEX_ASSERT(result
== &destText
);
2467 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2470 // Empty source string
2472 utext_openUTF8(&dataText
, NULL
, 0, &status
);
2473 matcher
->reset(&dataText
);
2475 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2477 REGEX_ASSERT_UTEXT_UTF8("", result
);
2478 utext_close(result
);
2479 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2481 REGEX_ASSERT(result
== &destText
);
2482 REGEX_ASSERT_UTEXT_UTF8("", result
);
2484 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2486 REGEX_ASSERT_UTEXT_UTF8("", result
);
2487 utext_close(result
);
2488 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2490 REGEX_ASSERT(result
== &destText
);
2491 REGEX_ASSERT_UTEXT_UTF8("", result
);
2494 // Empty substitution string
// Each match of "abc" is expected to be removed from the output.
2496 utext_openUTF8(&dataText
, data
, -1, &status
); // ".abc..abc...abc.."
2497 matcher
->reset(&dataText
);
2499 utext_openUTF8(&replText
, NULL
, 0, &status
);
2500 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2502 const char str_abcabc
[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2503 REGEX_ASSERT_UTEXT_UTF8(str_abcabc
, result
);
2504 utext_close(result
);
2505 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2507 REGEX_ASSERT(result
== &destText
);
2508 REGEX_ASSERT_UTEXT_UTF8(str_abcabc
, result
);
2510 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2512 const char str_dots
[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2513 REGEX_ASSERT_UTEXT_UTF8(str_dots
, result
);
2514 utext_close(result
);
2515 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2516 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2518 REGEX_ASSERT(result
== &destText
);
2519 REGEX_ASSERT_UTEXT_UTF8(str_dots
, result
);
2522 // match whole string
2524 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2525 utext_openUTF8(&dataText
, str_abc
, -1, &status
);
2526 matcher
->reset(&dataText
);
2528 const char str_xyz
[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2529 utext_openUTF8(&replText
, str_xyz
, -1, &status
);
2530 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2532 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2533 utext_close(result
);
2534 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2535 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2537 REGEX_ASSERT(result
== &destText
);
2538 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2540 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2542 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2543 utext_close(result
);
2544 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2545 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2547 REGEX_ASSERT(result
== &destText
);
2548 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2551 // Capture Group, simple case
2553 const char str_add
[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2554 utext_openUTF8(&re
, str_add
, -1, &status
);
2555 RegexPattern
*pat2
= RegexPattern::compile(&re
, flags
, pe
, status
);
2558 const char str_abcdefg
[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2559 utext_openUTF8(&dataText
, str_abcdefg
, -1, &status
);
2560 RegexMatcher
*matcher2
= &pat2
->matcher(status
)->reset(&dataText
);
// "$1" in the replacement text refers to capture group 1; the expected output
// therefore starts with the captured "bc" twice.
2563 const char str_11
[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2564 utext_openUTF8(&replText
, str_11
, -1, &status
);
2565 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2567 const char str_bcbcdefg
[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2568 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg
, result
);
2569 utext_close(result
);
2570 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2571 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2573 REGEX_ASSERT(result
== &destText
);
2574 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg
, result
);
// A backslash-escaped "\$1" is expected to come through as a literal "$1";
// only the unescaped "$1" is replaced by the captured text.
2576 const char str_v
[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2577 utext_openUTF8(&replText
, str_v
, -1, &status
);
2578 REGEX_VERBOSE_TEXT(&replText
);
2579 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2581 const char str_Thevalueof1isbcdefg
[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2582 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg
, result
);
2583 utext_close(result
);
2584 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2585 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2587 REGEX_ASSERT(result
== &destText
);
2588 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg
, result
);
// Escaped dollar signs that are not followed by a group number: each "\$" is
// expected to appear as a plain '$' in the output.
2590 const char str_byitselfnogroupnumber
[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2591 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2592 0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2593 utext_openUTF8(&replText
, str_byitselfnogroupnumber
, -1, &status
);
2594 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2596 const char str_byitselfnogroupnumberdefg
[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2597 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg
, result
);
2598 utext_close(result
);
2599 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2600 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2602 REGEX_ASSERT(result
== &destText
);
2603 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg
, result
);
// The four "xxxx" placeholder bytes after the '$' (offsets 22..25) are overwritten below
// with F0 9D 9F 8F, the UTF-8 encoding of U+1D7CF. The expected output shows that
// '$' followed by this digit is substituted with capture group 1.
2605 unsigned char supplDigitChars
[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2606 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2607 // 012345678901234567890123456
2608 supplDigitChars
[22] = 0xF0;
2609 supplDigitChars
[23] = 0x9D;
2610 supplDigitChars
[24] = 0x9F;
2611 supplDigitChars
[25] = 0x8F;
2612 utext_openUTF8(&replText
, (char *)supplDigitChars
, -1, &status
);
2614 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2616 const char str_SupplementalDigit1bcdefg
[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2617 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg
, result
);
2618 utext_close(result
);
2619 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2620 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2622 REGEX_ASSERT(result
== &destText
);
2623 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg
, result
);
// "$5" names a capture group that pattern "a(..)" does not have; replaceFirst is
// expected to fail with U_INDEX_OUTOFBOUNDS_ERROR in both call forms.
2624 const char str_badcapturegroupnumber5
[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5... */
2625 utext_openUTF8(&replText
, str_badcapturegroupnumber5
, -1, &status
);
2626 REGEX_ASSERT_FAIL((result
= matcher2
->replaceFirst(&replText
, NULL
, status
)), U_INDEX_OUTOFBOUNDS_ERROR
);
2627 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2628 utext_close(result
);
2629 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2630 REGEX_ASSERT_FAIL((result
= matcher2
->replaceFirst(&replText
, &destText
, status
)), U_INDEX_OUTOFBOUNDS_ERROR
);
2631 REGEX_ASSERT(result
== &destText
);
2632 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2635 // Replacement String with \u hex escapes
// The expected output contains 'C' where the replacement text has the escape \u0043.
2638 const char str_abc1abc2abc3
[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2639 const char str_u0043
[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2640 utext_openUTF8(&dataText
, str_abc1abc2abc3
, -1, &status
);
2641 utext_openUTF8(&replText
, str_u0043
, -1, &status
);
2642 matcher
->reset(&dataText
);
2644 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2646 const char str_C1C2C3
[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2647 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3
, result
);
2648 utext_close(result
);
2649 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2650 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2652 REGEX_ASSERT(result
== &destText
);
2653 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3
, result
);
// Same idea with an eight-digit \U escape for a supplementary code point.
2656 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2657 utext_openUTF8(&dataText
, str_abc
, -1, &status
);
2658 const char str_U00010000
[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2659 utext_openUTF8(&replText
, str_U00010000
, -1, &status
);
2660 matcher
->reset(&dataText
);
// NOTE(review): the "xxxx" placeholder bytes are presumably overwritten with the UTF-8
// form of U+10000 by statements not visible here - confirm before relying on this.
2662 unsigned char expected
[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2669 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2671 REGEX_ASSERT_UTEXT_UTF8((char *)expected
, result
);
2672 utext_close(result
);
2673 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2674 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2676 REGEX_ASSERT(result
== &destText
);
2677 REGEX_ASSERT_UTEXT_UTF8((char *)expected
, result
);
2679 // TODO: need more thorough testing of capture substitutions.
// appendReplacement() / appendTail() checks using pattern "ss(.*?)ee" and replacement "ooh".
2684 status
= U_ZERO_ERROR
;
2685 const char str_ssee
[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2686 const char str_blah
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2687 const char str_ooh
[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2688 utext_openUTF8(&re
, str_ssee
, -1, &status
);
2689 utext_openUTF8(&dataText
, str_blah
, -1, &status
);
2690 utext_openUTF8(&replText
, str_ooh
, -1, &status
);
2692 RegexMatcher
m(&re
, 0, status
);
2695 UnicodeString result
;
2696 UText resultText
= UTEXT_INITIALIZER
;
2697 utext_openUnicodeString(&resultText
, &result
, &status
);
2699 // Multiple finds do NOT bump up the previous appendReplacement position.
2703 m
.appendReplacement(&resultText
, &replText
, status
);
2705 const char str_blah2
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2706 REGEX_ASSERT_UTEXT_UTF8(str_blah2
, &resultText
);
2708 // After a reset into the interior of a string, appendReplacement still starts at beginning.
2709 status
= U_ZERO_ERROR
;
2711 utext_openUnicodeString(&resultText
, &result
, &status
);
2712 m
.reset(10, status
);
2715 m
.appendReplacement(&resultText
, &replText
, status
);
2717 const char str_blah3
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2718 REGEX_ASSERT_UTEXT_UTF8(str_blah3
, &resultText
);
2720 // find() at interior of string, appendReplacement still starts at beginning.
2721 status
= U_ZERO_ERROR
;
2723 utext_openUnicodeString(&resultText
, &result
, &status
);
2727 m
.appendReplacement(&resultText
, &replText
, status
);
2729 const char str_blah8
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2730 REGEX_ASSERT_UTEXT_UTF8(str_blah8
, &resultText
);
// appendTail() is expected to add the input text that follows the last match (" fin").
2732 m
.appendTail(&resultText
, status
);
2733 const char str_blah9
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2734 REGEX_ASSERT_UTEXT_UTF8(str_blah9
, &resultText
);
2736 utext_close(&resultText
);
// Release the UTexts opened by this test.
2744 utext_close(&dataText
);
2745 utext_close(&replText
);
2746 utext_close(&destText
);
2751 //---------------------------------------------------------------------------
2753 // API_Pattern_UTF8 Test that the API for class RegexPattern is
2754 // present and nominally working.
2756 //---------------------------------------------------------------------------
// Exercises the RegexPattern API with UText (UTF-8) pattern sources: default construction,
// copy construction, operator== / operator!=, compile() with and without flags, clone(),
// flags(), the static matches() helper, split(), and pattern() / patternText().
2757 void RegexTest::API_Pattern_UTF8() {
2758 RegexPattern pata
; // Test default constructor to not crash.
2761 REGEX_ASSERT(pata
== patb
);
2762 REGEX_ASSERT(pata
== pata
);
2764 UText re1
= UTEXT_INITIALIZER
;
2765 UText re2
= UTEXT_INITIALIZER
;
2766 UErrorCode status
= U_ZERO_ERROR
;
2769 const char str_abcalmz
[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2770 const char str_def
[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2771 utext_openUTF8(&re1
, str_abcalmz
, -1, &status
);
2772 utext_openUTF8(&re2
, str_def
, -1, &status
);
2774 RegexPattern
*pat1
= RegexPattern::compile(&re1
, 0, pe
, status
);
2775 RegexPattern
*pat2
= RegexPattern::compile(&re2
, 0, pe
, status
);
2777 REGEX_ASSERT(*pat1
== *pat1
);
2778 REGEX_ASSERT(*pat1
!= pata
);
2782 REGEX_ASSERT(patb
== *pat1
);
// Copy-construct from *pat1; the copy is expected to compare equal to its source.
2785 RegexPattern
patc(*pat1
);
2786 REGEX_ASSERT(patc
== *pat1
);
2787 REGEX_ASSERT(patb
== patc
);
// Note: the next assertion compares the two pointers, not the pattern objects.
2788 REGEX_ASSERT(pat1
!= pat2
);
2790 REGEX_ASSERT(patb
!= patc
);
2791 REGEX_ASSERT(patb
== *pat2
);
2793 // Compile with no flags.
2794 RegexPattern
*pat1a
= RegexPattern::compile(&re1
, pe
, status
);
2795 REGEX_ASSERT(*pat1a
== *pat1
);
2797 REGEX_ASSERT(pat1a
->flags() == 0);
2799 // Compile with different flags should be not equal
2800 RegexPattern
*pat1b
= RegexPattern::compile(&re1
, UREGEX_CASE_INSENSITIVE
, pe
, status
);
2803 REGEX_ASSERT(*pat1b
!= *pat1a
);
2804 REGEX_ASSERT(pat1b
->flags() == UREGEX_CASE_INSENSITIVE
);
2805 REGEX_ASSERT(pat1a
->flags() == 0);
// clone() is expected to yield a pattern equal to its source and unequal to a different pattern.
2809 RegexPattern
*pat1c
= pat1
->clone();
2810 REGEX_ASSERT(*pat1c
== *pat1
);
2811 REGEX_ASSERT(*pat1c
!= *pat2
);
2823 // Verify that a matcher created from a cloned pattern works.
2827 UErrorCode status
= U_ZERO_ERROR
;
2828 UText pattern
= UTEXT_INITIALIZER
;
2829 const char str_pL
[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2830 utext_openUTF8(&pattern
, str_pL
, -1, &status
);
2832 RegexPattern
*pSource
= RegexPattern::compile(&pattern
, 0, status
);
2833 RegexPattern
*pClone
= pSource
->clone();
2835 RegexMatcher
*mFromClone
= pClone
->matcher(status
);
2838 UText input
= UTEXT_INITIALIZER
;
2839 const char str_HelloWorld
[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2840 utext_openUTF8(&input
, str_HelloWorld
, -1, &status
);
2841 mFromClone
->reset(&input
);
2842 REGEX_ASSERT(mFromClone
->find() == TRUE
);
2843 REGEX_ASSERT(mFromClone
->group(status
) == "Hello");
2844 REGEX_ASSERT(mFromClone
->find() == TRUE
);
2845 REGEX_ASSERT(mFromClone
->group(status
) == "World");
2846 REGEX_ASSERT(mFromClone
->find() == FALSE
);
2850 utext_close(&input
);
2851 utext_close(&pattern
);
2855 // matches convenience API
2858 UErrorCode status
= U_ZERO_ERROR
;
2859 UText pattern
= UTEXT_INITIALIZER
;
2860 UText input
= UTEXT_INITIALIZER
;
2862 const char str_randominput
[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2863 utext_openUTF8(&input
, str_randominput
, -1, &status
);
2865 const char str_dotstar
[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2866 utext_openUTF8(&pattern
, str_dotstar
, -1, &status
);
2867 REGEX_ASSERT(RegexPattern::matches(&pattern
, &input
, pe
, status
) == TRUE
);
// NOTE(review): from here on the `pattern` UText is re-opened before each assertion, but the
// matches() calls are given string literals rather than &pattern / &input - confirm that
// this is intended.
2870 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2871 utext_openUTF8(&pattern
, str_abc
, -1, &status
);
2872 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe
, status
) == FALSE
);
2875 const char str_nput
[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2876 utext_openUTF8(&pattern
, str_nput
, -1, &status
);
2877 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe
, status
) == TRUE
);
2880 utext_openUTF8(&pattern
, str_randominput
, -1, &status
);
2881 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe
, status
) == TRUE
);
2884 const char str_u
[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2885 utext_openUTF8(&pattern
, str_u
, -1, &status
);
2886 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe
, status
) == FALSE
);
// With a failing status passed in, matches() is expected to return FALSE and leave
// the status value unchanged.
2889 utext_openUTF8(&input
, str_abc
, -1, &status
);
2890 utext_openUTF8(&pattern
, str_abc
, -1, &status
);
2891 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
2892 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe
, status
) == FALSE
);
2893 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
2895 utext_close(&input
);
2896 utext_close(&pattern
);
// split() checks. The delimiter pattern here is one or more spaces.
2903 status
= U_ZERO_ERROR
;
2904 const char str_spaceplus
[] = { 0x20, 0x2b, 0x00 }; /* + */
2905 utext_openUTF8(&re1
, str_spaceplus
, -1, &status
);
2906 pat1
= RegexPattern::compile(&re1
, pe
, status
);
2908 UnicodeString fields
[10];
2911 n
= pat1
->split("Now is the time", fields
, 10, status
);
2914 REGEX_ASSERT(fields
[0]=="Now");
2915 REGEX_ASSERT(fields
[1]=="is");
2916 REGEX_ASSERT(fields
[2]=="the");
2917 REGEX_ASSERT(fields
[3]=="time");
2918 REGEX_ASSERT(fields
[4]=="");
// Output capacity of 2: the last slot is expected to hold the unsplit remainder.
2920 n
= pat1
->split("Now is the time", fields
, 2, status
);
2923 REGEX_ASSERT(fields
[0]=="Now");
2924 REGEX_ASSERT(fields
[1]=="is the time");
2925 REGEX_ASSERT(fields
[2]=="the"); // left over from previous test
2928 status
= U_ZERO_ERROR
;
2929 n
= pat1
->split("Now is the time", fields
, 1, status
);
2932 REGEX_ASSERT(fields
[0]=="Now is the time");
2933 REGEX_ASSERT(fields
[1]=="*");
2934 status
= U_ZERO_ERROR
;
// Leading and trailing delimiters are expected to produce empty first and last fields.
2936 n
= pat1
->split(" Now is the time ", fields
, 10, status
);
2939 REGEX_ASSERT(fields
[0]=="");
2940 REGEX_ASSERT(fields
[1]=="Now");
2941 REGEX_ASSERT(fields
[2]=="is");
2942 REGEX_ASSERT(fields
[3]=="the");
2943 REGEX_ASSERT(fields
[4]=="time");
2944 REGEX_ASSERT(fields
[5]=="");
2945 REGEX_ASSERT(fields
[6]=="");
2948 n
= pat1
->split(" ", fields
, 10, status
);
2951 REGEX_ASSERT(fields
[0]=="");
2952 REGEX_ASSERT(fields
[1]=="");
2953 REGEX_ASSERT(fields
[2]=="*");
2956 n
= pat1
->split("", fields
, 10, status
);
2959 REGEX_ASSERT(fields
[0]=="foo");
2963 // split, with a pattern with (capture)
// The expected fields below show the captured delimiter text appearing as its own
// field between the surrounding pieces.
2964 regextst_openUTF8FromInvariant(&re1
, "<(\\w*)>", -1, &status
);
2965 pat1
= RegexPattern::compile(&re1
, pe
, status
);
2968 status
= U_ZERO_ERROR
;
2969 fields
[6] = fields
[7] = "*";
2970 n
= pat1
->split("<a>Now is <b>the time<c>", fields
, 10, status
);
2973 REGEX_ASSERT(fields
[0]=="");
2974 REGEX_ASSERT(fields
[1]=="a");
2975 REGEX_ASSERT(fields
[2]=="Now is ");
2976 REGEX_ASSERT(fields
[3]=="b");
2977 REGEX_ASSERT(fields
[4]=="the time");
2978 REGEX_ASSERT(fields
[5]=="c");
2979 REGEX_ASSERT(fields
[6]=="");
2980 REGEX_ASSERT(fields
[7]=="*");
2981 REGEX_ASSERT(status
==U_ZERO_ERROR
);
2983 fields
[6] = fields
[7] = "*";
2984 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 10, status
);
2987 REGEX_ASSERT(fields
[0]==" ");
2988 REGEX_ASSERT(fields
[1]=="a");
2989 REGEX_ASSERT(fields
[2]=="Now is ");
2990 REGEX_ASSERT(fields
[3]=="b");
2991 REGEX_ASSERT(fields
[4]=="the time");
2992 REGEX_ASSERT(fields
[5]=="c");
2993 REGEX_ASSERT(fields
[6]=="");
2994 REGEX_ASSERT(fields
[7]=="*");
2996 status
= U_ZERO_ERROR
;
// The following cases limit the output capacity (6, 5, 5, 4) so that the field list is cut
// short at different points relative to the captured delimiters.
2998 n
= pat1
->split(" <a>Now is <b>the time<c> ", fields
, 6, status
);
3001 REGEX_ASSERT(fields
[0]==" ");
3002 REGEX_ASSERT(fields
[1]=="a");
3003 REGEX_ASSERT(fields
[2]=="Now is ");
3004 REGEX_ASSERT(fields
[3]=="b");
3005 REGEX_ASSERT(fields
[4]=="the time");
3006 REGEX_ASSERT(fields
[5]==" ");
3007 REGEX_ASSERT(fields
[6]=="foo");
3009 status
= U_ZERO_ERROR
;
3011 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 5, status
);
3014 REGEX_ASSERT(fields
[0]==" ");
3015 REGEX_ASSERT(fields
[1]=="a");
3016 REGEX_ASSERT(fields
[2]=="Now is ");
3017 REGEX_ASSERT(fields
[3]=="b");
3018 REGEX_ASSERT(fields
[4]=="the time<c>");
3019 REGEX_ASSERT(fields
[5]=="foo");
3021 status
= U_ZERO_ERROR
;
3023 n
= pat1
->split(" <a>Now is <b>the time", fields
, 5, status
);
3026 REGEX_ASSERT(fields
[0]==" ");
3027 REGEX_ASSERT(fields
[1]=="a");
3028 REGEX_ASSERT(fields
[2]=="Now is ");
3029 REGEX_ASSERT(fields
[3]=="b");
3030 REGEX_ASSERT(fields
[4]=="the time");
3031 REGEX_ASSERT(fields
[5]=="foo");
3033 status
= U_ZERO_ERROR
;
3034 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 4, status
);
3037 REGEX_ASSERT(fields
[0]==" ");
3038 REGEX_ASSERT(fields
[1]=="a");
3039 REGEX_ASSERT(fields
[2]=="Now is ");
3040 REGEX_ASSERT(fields
[3]=="the time<c>");
3041 status
= U_ZERO_ERROR
;
// The delimiter is a captured '-' or ','; the delimiters are expected in the output fields.
3044 regextst_openUTF8FromInvariant(&re1
, "([-,])", -1, &status
);
3045 pat1
= RegexPattern::compile(&re1
, pe
, status
);
3047 n
= pat1
->split("1-10,20", fields
, 10, status
);
3050 REGEX_ASSERT(fields
[0]=="1");
3051 REGEX_ASSERT(fields
[1]=="-");
3052 REGEX_ASSERT(fields
[2]=="10");
3053 REGEX_ASSERT(fields
[3]==",");
3054 REGEX_ASSERT(fields
[4]=="20");
3059 // split of a UText based string, with library allocating output UTexts.
3062 status
= U_ZERO_ERROR
;
3063 RegexMatcher
matcher(UnicodeString("(:)"), 0, status
);
3064 UnicodeString
stringToSplit("first:second:third");
3065 UText
*textToSplit
= utext_openUnicodeString(NULL
, &stringToSplit
, &status
);
// All ten output slots start out NULL; slot 5 is asserted to still be NULL after the split.
3068 UText
*splits
[10] = {NULL
};
3069 int32_t numFields
= matcher
.split(textToSplit
, splits
, UPRV_LENGTHOF(splits
), status
);
3071 REGEX_ASSERT(numFields
== 5);
3072 REGEX_ASSERT_UTEXT_INVARIANT("first", splits
[0]);
3073 REGEX_ASSERT_UTEXT_INVARIANT(":", splits
[1]);
3074 REGEX_ASSERT_UTEXT_INVARIANT("second", splits
[2]);
3075 REGEX_ASSERT_UTEXT_INVARIANT(":", splits
[3]);
3076 REGEX_ASSERT_UTEXT_INVARIANT("third", splits
[4]);
3077 REGEX_ASSERT(splits
[5] == NULL
);
// Close the output UTexts.
3079 for (int i
=0; i
<UPRV_LENGTHOF(splits
); i
++) {
3081 utext_close(splits
[i
]);
3085 utext_close(textToSplit
);
3090 // RegexPattern::pattern() and patternText()
// A default-constructed pattern is expected to report an empty pattern string.
3092 pat1
= new RegexPattern();
3093 REGEX_ASSERT(pat1
->pattern() == "");
3094 REGEX_ASSERT_UTEXT_UTF8("", pat1
->patternText(status
));
3096 const char *helloWorldInvariant
= "(Hello, world)*";
3097 regextst_openUTF8FromInvariant(&re1
, helloWorldInvariant
, -1, &status
);
3098 pat1
= RegexPattern::compile(&re1
, pe
, status
);
3100 REGEX_ASSERT_UNISTR("(Hello, world)*", pat1
->pattern());
3101 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1
->patternText(status
));
3108 //---------------------------------------------------------------------------
3110 // Extended A more thorough check for features of regex patterns
3111 // The test cases are in a separate data file,
3112 // source/test/testdata/regextst.txt
3113 // A description of the test data format is included in that file.
3115 //---------------------------------------------------------------------------
3118 RegexTest::getPath(char buffer
[2048], const char *filename
) {
3119 UErrorCode status
=U_ZERO_ERROR
;
3120 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
3121 if (U_FAILURE(status
)) {
3122 errln("ERROR: loadTestData() failed - %s", u_errorName(status
));
3126 strcpy(buffer
, testDataDirectory
);
3127 strcat(buffer
, filename
);
3131 void RegexTest::Extended() {
3133 const char *srcPath
;
3134 UErrorCode status
= U_ZERO_ERROR
;
3135 int32_t lineNum
= 0;
3138 // Open and read the test data file.
3140 srcPath
=getPath(tdd
, "regextst.txt");
3142 return; /* something went wrong, error already output */
3146 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, "utf-8", status
);
3147 if (U_FAILURE(status
)) {
3148 return; /* something went wrong, error already output */
3152 // Put the test data into a UnicodeString
3154 UnicodeString
testString(FALSE
, testData
, len
);
3156 RegexMatcher
quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status
);
3157 RegexMatcher
commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status
);
3158 RegexMatcher
flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status
);
3160 RegexMatcher
lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString
, 0, status
);
3161 UnicodeString testPattern
; // The pattern for test from the test file.
3162 UnicodeString testFlags
; // the flags for a test.
3163 UnicodeString matchString
; // The marked up string to be used as input
3165 if (U_FAILURE(status
)){
3166 dataerrln("Construct RegexMatcher() error - %s", u_errorName(status
));
3172 // Loop over the test data file, once per line.
3174 while (lineMat
.find()) {
3176 if (U_FAILURE(status
)) {
3177 errln("%s:%d: ICU Error \"%s\"", srcPath
, lineNum
, u_errorName(status
));
3180 status
= U_ZERO_ERROR
;
3181 UnicodeString testLine
= lineMat
.group(1, status
);
3182 if (testLine
.length() == 0) {
3187 // Parse the test line. Skip blank and comment only lines.
3188 // Separate out the three main fields - pattern, flags, target.
3191 commentMat
.reset(testLine
);
3192 if (commentMat
.lookingAt(status
)) {
3193 // This line is a comment, or blank.
3198 // Pull out the pattern field, remove it from the test file line.
3200 quotedStuffMat
.reset(testLine
);
3201 if (quotedStuffMat
.lookingAt(status
)) {
3202 testPattern
= quotedStuffMat
.group(2, status
);
3203 testLine
.remove(0, quotedStuffMat
.end(0, status
));
3205 errln("Bad pattern (missing quotes?) at %s:%d", srcPath
, lineNum
);
3211 // Pull out the flags from the test file line.
3213 flagsMat
.reset(testLine
);
3214 flagsMat
.lookingAt(status
); // Will always match, possibly an empty string.
3215 testFlags
= flagsMat
.group(1, status
);
3216 if (flagsMat
.group(2, status
).length() > 0) {
3217 errln("Bad Match flag at line %d. Scanning %c\n",
3218 lineNum
, flagsMat
.group(2, status
).charAt(0));
3221 testLine
.remove(0, flagsMat
.end(0, status
));
3224 // Pull out the match string, as a whole.
3225 // We'll process the <tags> later.
3227 quotedStuffMat
.reset(testLine
);
3228 if (quotedStuffMat
.lookingAt(status
)) {
3229 matchString
= quotedStuffMat
.group(2, status
);
3230 testLine
.remove(0, quotedStuffMat
.end(0, status
));
3232 errln("Bad match string at test file line %d", lineNum
);
3237 // The only thing left from the input line should be an optional trailing comment.
3239 commentMat
.reset(testLine
);
3240 if (commentMat
.lookingAt(status
) == FALSE
) {
3241 errln("Line %d: unexpected characters at end of test line.", lineNum
);
3248 regex_find(testPattern
, testFlags
, matchString
, srcPath
, lineNum
);
3257 //---------------------------------------------------------------------------
3259 // regex_find(pattern, flags, inputString, lineNumber)
3261 // Function to run a single test from the Extended (data driven) tests.
3262 // See file test/testdata/regextst.txt for a description of the
3263 // pattern and inputString fields, and the allowed flags.
3264 // lineNumber is the source line in regextst.txt of the test.
3266 //---------------------------------------------------------------------------
3269 // Set a value into a UVector at position specified by a decimal number in
3270 // a UnicodeString. This is a utility function needed by the actual test function,
3272 static void set(UVector
&vec
, int32_t val
, UnicodeString index
) {
3273 UErrorCode status
=U_ZERO_ERROR
;
3275 for (int32_t i
=0; i
<index
.length(); i
++) {
3276 int32_t d
=u_charDigitValue(index
.charAt(i
));
3280 while (vec
.size()<idx
+1) {vec
.addElement(-1, status
);}
3281 vec
.setElementAt(val
, idx
);
3284 static void setInt(UVector
&vec
, int32_t val
, int32_t idx
) {
3285 UErrorCode status
=U_ZERO_ERROR
;
3286 while (vec
.size()<idx
+1) {vec
.addElement(-1, status
);}
3287 vec
.setElementAt(val
, idx
);
3290 static UBool
utextOffsetToNative(UText
*utext
, int32_t unistrOffset
, int32_t& nativeIndex
)
3292 UBool couldFind
= TRUE
;
3293 UTEXT_SETNATIVEINDEX(utext
, 0);
3295 while (i
< unistrOffset
) {
3296 UChar32 c
= UTEXT_NEXT32(utext
);
3297 if (c
!= U_SENTINEL
) {
3304 nativeIndex
= (int32_t)UTEXT_GETNATIVEINDEX(utext
);
3309 void RegexTest::regex_find(const UnicodeString
&pattern
,
3310 const UnicodeString
&flags
,
3311 const UnicodeString
&inputString
,
3312 const char *srcPath
,
3314 UnicodeString unEscapedInput
;
3315 UnicodeString deTaggedInput
;
3317 int32_t patternUTF8Length
, inputUTF8Length
;
3318 char *patternChars
= NULL
, *inputChars
= NULL
;
3319 UText patternText
= UTEXT_INITIALIZER
;
3320 UText inputText
= UTEXT_INITIALIZER
;
3321 UConverter
*UTF8Converter
= NULL
;
3323 UErrorCode status
= U_ZERO_ERROR
;
3325 RegexPattern
*parsePat
= NULL
;
3326 RegexMatcher
*parseMatcher
= NULL
;
3327 RegexPattern
*callerPattern
= NULL
, *UTF8Pattern
= NULL
;
3328 RegexMatcher
*matcher
= NULL
, *UTF8Matcher
= NULL
;
3329 UVector
groupStarts(status
);
3330 UVector
groupEnds(status
);
3331 UVector
groupStartsUTF8(status
);
3332 UVector
groupEndsUTF8(status
);
3333 UBool isMatch
= FALSE
, isUTF8Match
= FALSE
;
3334 UBool failed
= FALSE
;
3337 UBool useMatchesFunc
= FALSE
;
3338 UBool useLookingAtFunc
= FALSE
;
3339 int32_t regionStart
= -1;
3340 int32_t regionEnd
= -1;
3341 int32_t regionStartUTF8
= -1;
3342 int32_t regionEndUTF8
= -1;
3346 // Compile the caller's pattern
3348 uint32_t bflags
= 0;
3349 if (flags
.indexOf((UChar
)0x69) >= 0) { // 'i' flag
3350 bflags
|= UREGEX_CASE_INSENSITIVE
;
3352 if (flags
.indexOf((UChar
)0x78) >= 0) { // 'x' flag
3353 bflags
|= UREGEX_COMMENTS
;
3355 if (flags
.indexOf((UChar
)0x73) >= 0) { // 's' flag
3356 bflags
|= UREGEX_DOTALL
;
3358 if (flags
.indexOf((UChar
)0x6d) >= 0) { // 'm' flag
3359 bflags
|= UREGEX_MULTILINE
;
3362 if (flags
.indexOf((UChar
)0x65) >= 0) { // 'e' flag
3363 bflags
|= UREGEX_ERROR_ON_UNKNOWN_ESCAPES
;
3365 if (flags
.indexOf((UChar
)0x44) >= 0) { // 'D' flag
3366 bflags
|= UREGEX_UNIX_LINES
;
3368 if (flags
.indexOf((UChar
)0x51) >= 0) { // 'Q' flag
3369 bflags
|= UREGEX_LITERAL
;
3373 callerPattern
= RegexPattern::compile(pattern
, bflags
, pe
, status
);
3374 if (status
!= U_ZERO_ERROR
) {
3375 #if UCONFIG_NO_BREAK_ITERATION==1
3376 // 'v' test flag means that the test pattern should not compile if ICU was configured
3377 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3378 if (flags
.indexOf((UChar
)0x76) >= 0 /*'v'*/ && status
== U_UNSUPPORTED_ERROR
) {
3379 goto cleanupAndReturn
;
3382 if (flags
.indexOf((UChar
)0x45) >= 0) { // flags contain 'E'
3383 // Expected pattern compilation error.
3384 if (flags
.indexOf((UChar
)0x64) >= 0) { // flags contain 'd'
3385 logln("Pattern Compile returns \"%s\"", u_errorName(status
));
3387 goto cleanupAndReturn
;
3389 // Unexpected pattern compilation error.
3390 dataerrln("Line %d: error %s compiling pattern.", line
, u_errorName(status
));
3391 goto cleanupAndReturn
;
3395 UTF8Converter
= ucnv_open("UTF8", &status
);
3396 ucnv_setFromUCallBack(UTF8Converter
, UCNV_FROM_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &status
);
3398 patternUTF8Length
= pattern
.extract(NULL
, 0, UTF8Converter
, status
);
3399 status
= U_ZERO_ERROR
; // buffer overflow
3400 patternChars
= new char[patternUTF8Length
+1];
3401 pattern
.extract(patternChars
, patternUTF8Length
+1, UTF8Converter
, status
);
3402 utext_openUTF8(&patternText
, patternChars
, patternUTF8Length
, &status
);
3404 if (status
== U_ZERO_ERROR
) {
3405 UTF8Pattern
= RegexPattern::compile(&patternText
, bflags
, pe
, status
);
3407 if (status
!= U_ZERO_ERROR
) {
3408 #if UCONFIG_NO_BREAK_ITERATION==1
3409 // 'v' test flag means that the test pattern should not compile if ICU was configured
3410 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3411 if (flags
.indexOf((UChar
)0x76) >= 0 /*'v'*/ && status
== U_UNSUPPORTED_ERROR
) {
3412 goto cleanupAndReturn
;
3415 if (flags
.indexOf((UChar
)0x45) >= 0) { // flags contain 'E'
3416 // Expected pattern compilation error.
3417 if (flags
.indexOf((UChar
)0x64) >= 0) { // flags contain 'd'
3418 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status
));
3420 goto cleanupAndReturn
;
3422 // Unexpected pattern compilation error.
3423 errln("Line %d: error %s compiling pattern. (UTF8)", line
, u_errorName(status
));
3424 goto cleanupAndReturn
;
3429 if (UTF8Pattern
== NULL
) {
3430 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3431 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath
, line
);
3432 status
= U_ZERO_ERROR
;
3435 if (flags
.indexOf((UChar
)0x64) >= 0) { // 'd' flag
3436 callerPattern
->dumpPattern();
3439 if (flags
.indexOf((UChar
)0x45) >= 0) { // 'E' flag
3440 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath
, line
);
3441 goto cleanupAndReturn
;
3446 // Number of times find() should be called on the test string, default to 1
3449 for (i
=2; i
<=9; i
++) {
3450 if (flags
.indexOf((UChar
)(0x30 + i
)) >= 0) { // digit flag
3451 if (numFinds
!= 1) {
3452 errln("Line %d: more than one digit flag. Scanning %d.", line
, i
);
3453 goto cleanupAndReturn
;
3459 // 'M' flag. Use matches() instead of find()
3460 if (flags
.indexOf((UChar
)0x4d) >= 0) {
3461 useMatchesFunc
= TRUE
;
3463 if (flags
.indexOf((UChar
)0x4c) >= 0) {
3464 useLookingAtFunc
= TRUE
;
3468 // Find the tags in the input data, remove them, and record the group boundary
3471 parsePat
= RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe
, status
);
3472 REGEX_CHECK_STATUS_L(line
);
3474 unEscapedInput
= inputString
.unescape();
3475 parseMatcher
= parsePat
->matcher(unEscapedInput
, status
);
3476 REGEX_CHECK_STATUS_L(line
);
3477 while(parseMatcher
->find()) {
3478 parseMatcher
->appendReplacement(deTaggedInput
, "", status
);
3480 UnicodeString groupNum
= parseMatcher
->group(2, status
);
3481 if (groupNum
== "r") {
3482 // <r> or </r>, a region specification within the string
3483 if (parseMatcher
->group(1, status
) == "/") {
3484 regionEnd
= deTaggedInput
.length();
3486 regionStart
= deTaggedInput
.length();
3489 // <digits> or </digits>, a group match boundary tag.
3490 if (parseMatcher
->group(1, status
) == "/") {
3491 set(groupEnds
, deTaggedInput
.length(), groupNum
);
3493 set(groupStarts
, deTaggedInput
.length(), groupNum
);
3497 parseMatcher
->appendTail(deTaggedInput
);
3498 REGEX_ASSERT_L(groupStarts
.size() == groupEnds
.size(), line
);
3499 if ((regionStart
>=0 || regionEnd
>=0) && (regionStart
<0 || regionStart
>regionEnd
)) {
3500 errln("mismatched <r> tags");
3502 goto cleanupAndReturn
;
3506 // Configure the matcher according to the flags specified with this test.
3508 matcher
= callerPattern
->matcher(deTaggedInput
, status
);
3509 REGEX_CHECK_STATUS_L(line
);
3510 if (flags
.indexOf((UChar
)0x74) >= 0) { // 't' trace flag
3511 matcher
->setTrace(TRUE
);
3514 if (UTF8Pattern
!= NULL
) {
3515 inputUTF8Length
= deTaggedInput
.extract(NULL
, 0, UTF8Converter
, status
);
3516 status
= U_ZERO_ERROR
; // buffer overflow
3517 inputChars
= new char[inputUTF8Length
+1];
3518 deTaggedInput
.extract(inputChars
, inputUTF8Length
+1, UTF8Converter
, status
);
3519 utext_openUTF8(&inputText
, inputChars
, inputUTF8Length
, &status
);
3521 if (status
== U_ZERO_ERROR
) {
3522 UTF8Matcher
= &UTF8Pattern
->matcher(status
)->reset(&inputText
);
3523 REGEX_CHECK_STATUS_L(line
);
3526 if (UTF8Matcher
== NULL
) {
3527 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3528 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath
, line
);
3529 status
= U_ZERO_ERROR
;
3534 // Generate native indices for UTF8 versions of region and capture group info
3536 if (UTF8Matcher
!= NULL
) {
3537 if (flags
.indexOf((UChar
)0x74) >= 0) { // 't' trace flag
3538 UTF8Matcher
->setTrace(TRUE
);
3540 if (regionStart
>=0) (void) utextOffsetToNative(&inputText
, regionStart
, regionStartUTF8
);
3541 if (regionEnd
>=0) (void) utextOffsetToNative(&inputText
, regionEnd
, regionEndUTF8
);
3543 // Fill out the native index UVector info.
3544 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3545 for (i
=0; i
<groupStarts
.size(); i
++) {
3546 int32_t start
= groupStarts
.elementAti(i
);
3547 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3550 if (!utextOffsetToNative(&inputText
, start
, startUTF8
)) {
3551 errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line
, i
, start
);
3553 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3555 setInt(groupStartsUTF8
, startUTF8
, i
);
3558 int32_t end
= groupEnds
.elementAti(i
);
3559 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3562 if (!utextOffsetToNative(&inputText
, end
, endUTF8
)) {
3563 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line
, i
, end
);
3565 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3567 setInt(groupEndsUTF8
, endUTF8
, i
);
3572 if (regionStart
>=0) {
3573 matcher
->region(regionStart
, regionEnd
, status
);
3574 REGEX_CHECK_STATUS_L(line
);
3575 if (UTF8Matcher
!= NULL
) {
3576 UTF8Matcher
->region(regionStartUTF8
, regionEndUTF8
, status
);
3577 REGEX_CHECK_STATUS_L(line
);
3580 if (flags
.indexOf((UChar
)0x61) >= 0) { // 'a' anchoring bounds flag
3581 matcher
->useAnchoringBounds(FALSE
);
3582 if (UTF8Matcher
!= NULL
) {
3583 UTF8Matcher
->useAnchoringBounds(FALSE
);
3586 if (flags
.indexOf((UChar
)0x62) >= 0) { // 'b' transparent bounds flag
3587 matcher
->useTransparentBounds(TRUE
);
3588 if (UTF8Matcher
!= NULL
) {
3589 UTF8Matcher
->useTransparentBounds(TRUE
);
3596 // Do a find on the de-tagged input using the caller's pattern
3597 // TODO: error on count>1 and not find().
3598 // error on both matches() and lookingAt().
3600 for (i
=0; i
<numFinds
; i
++) {
3601 if (useMatchesFunc
) {
3602 isMatch
= matcher
->matches(status
);
3603 if (UTF8Matcher
!= NULL
) {
3604 isUTF8Match
= UTF8Matcher
->matches(status
);
3606 } else if (useLookingAtFunc
) {
3607 isMatch
= matcher
->lookingAt(status
);
3608 if (UTF8Matcher
!= NULL
) {
3609 isUTF8Match
= UTF8Matcher
->lookingAt(status
);
3612 isMatch
= matcher
->find();
3613 if (UTF8Matcher
!= NULL
) {
3614 isUTF8Match
= UTF8Matcher
->find();
3618 matcher
->setTrace(FALSE
);
3620 UTF8Matcher
->setTrace(FALSE
);
3622 if (U_FAILURE(status
)) {
3623 errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status
));
3627 // Match up the groups from the find() with the groups from the tags
3630 // number of tags should match number of groups from find operation.
3631 // matcher->groupCount does not include group 0, the entire match, hence the +1.
3632 // G option in test means that capture group data is not available in the
3633 // expected results, so the check needs to be suppressed.
3634 if (isMatch
== FALSE
&& groupStarts
.size() != 0) {
3635 dataerrln("Error at line %d: Match expected, but none found.", line
);
3637 goto cleanupAndReturn
;
3638 } else if (UTF8Matcher
!= NULL
&& isUTF8Match
== FALSE
&& groupStarts
.size() != 0) {
3639 errln("Error at line %d: Match expected, but none found. (UTF8)", line
);
3641 goto cleanupAndReturn
;
3643 if (isMatch
&& groupStarts
.size() == 0) {
3644 errln("Error at line %d: No match expected, but one found at position %d.", line
, matcher
->start(status
));
3647 if (UTF8Matcher
&& isUTF8Match
&& groupStarts
.size() == 0) {
3648 errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line
, UTF8Matcher
->start(status
));
3652 if (flags
.indexOf((UChar
)0x47 /*G*/) >= 0) {
3653 // Only check for match / no match. Don't check capture groups.
3654 goto cleanupAndReturn
;
3657 REGEX_CHECK_STATUS_L(line
);
3658 for (i
=0; i
<=matcher
->groupCount(); i
++) {
3659 int32_t expectedStart
= (i
>= groupStarts
.size()? -1 : groupStarts
.elementAti(i
));
3660 int32_t expectedStartUTF8
= (i
>= groupStartsUTF8
.size()? -1 : groupStartsUTF8
.elementAti(i
));
3661 if (matcher
->start(i
, status
) != expectedStart
) {
3662 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
3663 line
, i
, expectedStart
, matcher
->start(i
, status
));
3665 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3666 } else if (UTF8Matcher
!= NULL
&& UTF8Matcher
->start(i
, status
) != expectedStartUTF8
) {
3667 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
3668 line
, i
, expectedStartUTF8
, UTF8Matcher
->start(i
, status
));
3670 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3673 int32_t expectedEnd
= (i
>= groupEnds
.size()? -1 : groupEnds
.elementAti(i
));
3674 int32_t expectedEndUTF8
= (i
>= groupEndsUTF8
.size()? -1 : groupEndsUTF8
.elementAti(i
));
3675 if (matcher
->end(i
, status
) != expectedEnd
) {
3676 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
3677 line
, i
, expectedEnd
, matcher
->end(i
, status
));
3679 // Error on end position; keep going; real error is probably yet to come as group
3680 // end positions work from end of the input data towards the front.
3681 } else if (UTF8Matcher
!= NULL
&& UTF8Matcher
->end(i
, status
) != expectedEndUTF8
) {
3682 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
3683 line
, i
, expectedEndUTF8
, UTF8Matcher
->end(i
, status
));
3685 // Error on end position; keep going; real error is probably yet to come as group
3686 // end positions work from end of the input data towards the front.
3689 if ( matcher
->groupCount()+1 < groupStarts
.size()) {
3690 errln("Error at line %d: Expected %d capture groups, found %d.",
3691 line
, groupStarts
.size()-1, matcher
->groupCount());
3694 else if (UTF8Matcher
!= NULL
&& UTF8Matcher
->groupCount()+1 < groupStarts
.size()) {
3695 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3696 line
, groupStarts
.size()-1, UTF8Matcher
->groupCount());
3700 if ((flags
.indexOf((UChar
)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3701 matcher
->requireEnd() == TRUE
) {
3702 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line
);
3704 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3705 UTF8Matcher
->requireEnd() == TRUE
) {
3706 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line
);
// 'y' test flag: requireEnd() is expected to return TRUE, for the plain matcher
// and, when one was created, for the UTF-8 matcher as well.
3710 if ((flags
.indexOf((UChar
)0x79) >= 0) && // 'y' flag: RequireEnd() == true
3711 matcher
->requireEnd() == FALSE
) {
3712 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line
);
3714 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x79) >= 0) && // 'y' flag: RequireEnd() == true
3715 UTF8Matcher
->requireEnd() == FALSE
) {
3716 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line
);
3720 if ((flags
.indexOf((UChar
)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3721 matcher
->hitEnd() == TRUE
) {
3722 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line
);
3724 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3725 UTF8Matcher
->hitEnd() == TRUE
) {
3726 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line
);
3730 if ((flags
.indexOf((UChar
)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3731 matcher
->hitEnd() == FALSE
) {
3732 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line
);
3734 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3735 UTF8Matcher
->hitEnd() == FALSE
) {
3736 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line
);
3743 infoln((UnicodeString
)"\""+pattern
+(UnicodeString
)"\" "
3744 +flags
+(UnicodeString
)" \""+inputString
+(UnicodeString
)"\"");
3745 // callerPattern->dump();
3747 delete parseMatcher
;
3752 delete callerPattern
;
3754 utext_close(&inputText
);
3755 delete[] inputChars
;
3756 utext_close(&patternText
);
3757 delete[] patternChars
;
3758 ucnv_close(UTF8Converter
);
3764 //---------------------------------------------------------------------------
3766 // Errors Check for error handling in patterns.
3768 //---------------------------------------------------------------------------
3769 void RegexTest::Errors() {
3770 // \escape sequences that aren't implemented yet.
3771 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3773 // Missing close parentheses
3774 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN
);
3775 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN
);
3776 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN
);
3778 // Extra close paren
3779 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN
);
3780 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN
);
3781 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN
);
3783 // Look-ahead, Look-behind
3784 // TODO: add tests for unbounded length look-behinds.
3785 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX
); // illegal construct
3787 // Attempt to use non-default flags
3790 UErrorCode status
= U_ZERO_ERROR
;
3791 int32_t flags
= UREGEX_CANON_EQ
|
3792 UREGEX_COMMENTS
| UREGEX_DOTALL
|
3794 RegexPattern
*pat1
= RegexPattern::compile(".*", flags
, pe
, status
);
3795 REGEX_ASSERT(status
== U_REGEX_UNIMPLEMENTED
);
3800 // Quantifiers are allowed only after something that can be quantified.
3801 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX
);
3802 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX
);
3803 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX
);
3805 // Mal-formed {min,max} quantifiers
3806 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL
);
3807 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN
);
3808 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL
);
3809 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL
);
3810 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL
);
3811 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG
);
3812 REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG
); // Overflows int during scan
3813 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG
); // Overflows regex binary format
3814 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG
);
3817 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX
);
3819 // Invalid Back Reference \0
3820 // For ICU 3.8 and earlier
3821 // For ICU versions newer than 3.8, \0 introduces an octal escape.
3823 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE
);
3828 //-------------------------------------------------------------------------------
3830 // Read a text data file, convert it to UChars, and return the data
3831 // in one big UChar * buffer, which the caller must delete.
3833 //--------------------------------------------------------------------------------
3834 UChar
*RegexTest::ReadAndConvertFile(const char *fileName
, int32_t &ulen
,
3835 const char *defEncoding
, UErrorCode
&status
) {
3836 UChar
*retPtr
= NULL
;
3837 char *fileBuf
= NULL
;
3838 UConverter
* conv
= NULL
;
3842 if (U_FAILURE(status
)) {
3849 f
= fopen(fileName
, "rb");
3851 dataerrln("Error opening test data file %s\n", fileName
);
3852 status
= U_FILE_ACCESS_ERROR
;
3861 fseek( f
, 0, SEEK_END
);
3862 fileSize
= ftell(f
);
3863 fileBuf
= new char[fileSize
];
3864 fseek(f
, 0, SEEK_SET
);
3865 amt_read
= static_cast<int32_t>(fread(fileBuf
, 1, fileSize
, f
));
3866 if (amt_read
!= fileSize
|| fileSize
<= 0) {
3867 errln("Error reading test data file.");
3868 goto cleanUpAndReturn
;
3872 // Look for a Unicode Signature (BOM) on the data just read
3874 int32_t signatureLength
;
3875 const char * fileBufC
;
3876 const char* encoding
;
3879 encoding
= ucnv_detectUnicodeSignature(
3880 fileBuf
, fileSize
, &signatureLength
, &status
);
3881 if(encoding
!=NULL
){
3882 fileBufC
+= signatureLength
;
3883 fileSize
-= signatureLength
;
3885 encoding
= defEncoding
;
3886 if (strcmp(encoding
, "utf-8") == 0) {
3887 errln("file %s is missing its BOM", fileName
);
3892 // Open a converter to take the rule file to UTF-16
3894 conv
= ucnv_open(encoding
, &status
);
3895 if (U_FAILURE(status
)) {
3896 goto cleanUpAndReturn
;
3900 // Convert the rules to UChar.
3901 // Preflight first to determine required buffer size.
3903 ulen
= ucnv_toUChars(conv
,
3909 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
3910 // Buffer Overflow is expected from the preflight operation.
3911 status
= U_ZERO_ERROR
;
3913 retPtr
= new UChar
[ulen
+1];
3926 if (U_FAILURE(status
)) {
3927 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
3936 //-------------------------------------------------------------------------------
3938 // PerlTests - Run Perl's regular expression tests
3939 // The input file for this test is re_tests, the standard regular
3940 // expression test data distributed with the Perl source code.
3942 // Here is Perl's description of the test data file:
3944 // # The tests are in a separate file 't/op/re_tests'.
3945 // # Each line in that file is a separate test.
3946 // # There are five columns, separated by tabs.
3948 // # Column 1 contains the pattern, optionally enclosed in C<''>.
3949 // # Modifiers can be put after the closing C<'>.
3951 // # Column 2 contains the string to be matched.
3953 // # Column 3 contains the expected result:
3954 // # y expect a match
3955 // # n expect no match
3956 // # c expect an error
3957 // # B test exposes a known bug in Perl, should be skipped
3958 // # b test exposes a known bug in Perl, should be skipped if noamp
3960 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3962 // # Column 4 contains a string, usually C<$&>.
3964 // # Column 5 contains the expected result of double-quote
3965 // # interpolating that string after the match, or start of error message.
3967 // # Column 6, if present, contains a reason why the test is skipped.
3968 // # This is printed with "skipped", for harness to pick up.
3970 // # \n in the tests are interpolated, as are variables of the form ${\w+}.
3972 // # If you want to add a regular expression test that can't be expressed
3973 // # in this format, don't add it here: put it in op/pat.t instead.
3975 // For ICU, if field 3 contains an 'i', the test will be skipped.
3976 // The test exposes some known incompatibility between ICU and Perl regexps.
3977 // (The i is in addition to whatever was there before.)
3979 //-------------------------------------------------------------------------------
3980 void RegexTest::PerlTests() {
3982 const char *srcPath
;
3983 UErrorCode status
= U_ZERO_ERROR
;
3987 // Open and read the test data file.
3989 srcPath
=getPath(tdd
, "re_tests.txt");
3991 return; /* something went wrong, error already output */
3995 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, "iso-8859-1", status
);
3996 if (U_FAILURE(status
)) {
3997 return; /* something went wrong, error already output */
4001 // Put the test data into a UnicodeString
4003 UnicodeString
testDataString(FALSE
, testData
, len
);
4006 // Regex to break the input file into lines, and strip the new lines.
4007 // One line per match, capture group one is the desired data.
4009 RegexPattern
* linePat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe
, status
);
4010 if (U_FAILURE(status
)) {
4011 dataerrln("RegexPattern::compile() error");
4014 RegexMatcher
* lineMat
= linePat
->matcher(testDataString
, status
);
4017 // Regex to split a test file line into fields.
4018 // There are six fields, separated by tabs.
4020 RegexPattern
* fieldPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe
, status
);
4023 // Regex to identify test patterns with flag settings, and to separate them.
4024 // Test patterns with flags look like 'pattern'i
4025 // Test patterns without flags are not quoted: pattern
4026 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4028 RegexPattern
*flagPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe
, status
);
4029 RegexMatcher
* flagMat
= flagPat
->matcher(status
);
4032 // The Perl tests reference several perl-isms, which are evaluated/substituted
4033 // in the test data. Not being perl, this must be done explicitly. Here
4034 // are string constants and REs for these constructs.
4036 UnicodeString
nulnulSrc("${nulnul}");
4037 UnicodeString
nulnul("\\u0000\\u0000", -1, US_INV
);
4038 nulnul
= nulnul
.unescape();
4040 UnicodeString
ffffSrc("${ffff}");
4041 UnicodeString
ffff("\\uffff", -1, US_INV
);
4042 ffff
= ffff
.unescape();
4044 // regexp for $-[0], $+[2], etc.
4045 RegexPattern
*groupsPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe
, status
);
4046 RegexMatcher
*groupsMat
= groupsPat
->matcher(status
);
4048 // regexp for $0, $1, $2, etc.
4049 RegexPattern
*cgPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe
, status
);
4050 RegexMatcher
*cgMat
= cgPat
->matcher(status
);
4054 // Main Loop for the Perl Tests, runs once per line from the
4057 int32_t lineNum
= 0;
4058 int32_t skippedUnimplementedCount
= 0;
4059 while (lineMat
->find()) {
4063 // Get a line, break it into its fields, do the Perl
4064 // variable substitutions.
4066 UnicodeString line
= lineMat
->group(1, status
);
4067 UnicodeString fields
[7];
4068 fieldPat
->split(line
, fields
, 7, status
);
4070 flagMat
->reset(fields
[0]);
4071 flagMat
->matches(status
);
4072 UnicodeString pattern
= flagMat
->group(2, status
);
4073 pattern
.findAndReplace("${bang}", "!");
4074 pattern
.findAndReplace(nulnulSrc
, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4075 pattern
.findAndReplace(ffffSrc
, ffff
);
4078 // Identify patterns that include match flag settings,
4079 // split off the flags, remove the extra quotes.
4081 UnicodeString flagStr
= flagMat
->group(3, status
);
4082 if (U_FAILURE(status
)) {
4083 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
4087 const UChar UChar_c
= 0x63; // Char constants for the flag letters.
4088 const UChar UChar_i
= 0x69; // (Damn the lack of Unicode support in C)
4089 const UChar UChar_m
= 0x6d;
4090 const UChar UChar_x
= 0x78;
4091 const UChar UChar_y
= 0x79;
4092 if (flagStr
.indexOf(UChar_i
) != -1) {
4093 flags
|= UREGEX_CASE_INSENSITIVE
;
4095 if (flagStr
.indexOf(UChar_m
) != -1) {
4096 flags
|= UREGEX_MULTILINE
;
4098 if (flagStr
.indexOf(UChar_x
) != -1) {
4099 flags
|= UREGEX_COMMENTS
;
4103 // Compile the test pattern.
4105 status
= U_ZERO_ERROR
;
4106 RegexPattern
*testPat
= RegexPattern::compile(pattern
, flags
, pe
, status
);
4107 if (status
== U_REGEX_UNIMPLEMENTED
) {
4109 // Test of a feature that is planned for ICU, but not yet implemented.
4111 skippedUnimplementedCount
++;
4113 status
= U_ZERO_ERROR
;
4117 if (U_FAILURE(status
)) {
4118 // Some tests are supposed to generate errors.
4119 // Only report an error for tests that are supposed to succeed.
4120 if (fields
[2].indexOf(UChar_c
) == -1 && // Compilation is not supposed to fail AND
4121 fields
[2].indexOf(UChar_i
) == -1) // it's not an accepted ICU incompatibility
4123 errln("line %d: ICU Error \"%s\"\n", lineNum
, u_errorName(status
));
4125 status
= U_ZERO_ERROR
;
4130 if (fields
[2].indexOf(UChar_i
) >= 0) {
4131 // ICU should skip this test.
4136 if (fields
[2].indexOf(UChar_c
) >= 0) {
4137 // This pattern should have caused a compilation error, but didn't/
4138 errln("line %d: Expected a pattern compile error, got success.", lineNum
);
4144 // replace the Perl variables that appear in some of the
4145 // match data strings.
4147 UnicodeString matchString
= fields
[1];
4148 matchString
.findAndReplace(nulnulSrc
, nulnul
);
4149 matchString
.findAndReplace(ffffSrc
, ffff
);
4151 // Replace any \n in the match string with an actual new-line char.
4152 // Don't do full unescape, as this unescapes more than Perl does, which
4153 // causes other spurious failures in the tests.
4154 matchString
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4159 // Run the test, check for expected match/don't match result.
4161 RegexMatcher
*testMat
= testPat
->matcher(matchString
, status
);
4162 UBool found
= testMat
->find();
4163 UBool expected
= FALSE
;
4164 if (fields
[2].indexOf(UChar_y
) >=0) {
4167 if (expected
!= found
) {
4168 errln("line %d: Expected %smatch, got %smatch",
4169 lineNum
, expected
?"":"no ", found
?"":"no " );
4173 // Don't try to check expected results if there is no match.
4174 // (Some have stuff in the expected fields)
4182 // Interpret the Perl expression from the fourth field of the data file,
4183 // building up an ICU string from the results of the ICU match.
4184 // The Perl expression will contain references to the results of
4185 // a regex match, including the matched string, capture group strings,
4186 // group starting and ending indices, etc.
4188 UnicodeString resultString
;
4189 UnicodeString perlExpr
= fields
[3];
4190 #if SUPPORT_MUTATING_INPUT_STRING
4191 groupsMat
->reset(perlExpr
);
4192 cgMat
->reset(perlExpr
);
4195 while (perlExpr
.length() > 0) {
4196 #if !SUPPORT_MUTATING_INPUT_STRING
4197 // Preferred usage. Reset after any modification to input string.
4198 groupsMat
->reset(perlExpr
);
4199 cgMat
->reset(perlExpr
);
4202 if (perlExpr
.startsWith("$&")) {
4203 resultString
.append(testMat
->group(status
));
4204 perlExpr
.remove(0, 2);
4207 else if (groupsMat
->lookingAt(status
)) {
4209 UnicodeString digitString
= groupsMat
->group(2, status
);
4211 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4212 UnicodeString plusOrMinus
= groupsMat
->group(1, status
);
4213 int32_t matchPosition
;
4214 if (plusOrMinus
.compare("+") == 0) {
4215 matchPosition
= testMat
->end(groupNum
, status
);
4217 matchPosition
= testMat
->start(groupNum
, status
);
4219 if (matchPosition
!= -1) {
4220 ICU_Utility::appendNumber(resultString
, matchPosition
);
4222 perlExpr
.remove(0, groupsMat
->end(status
));
4225 else if (cgMat
->lookingAt(status
)) {
4227 UnicodeString digitString
= cgMat
->group(1, status
);
4229 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4230 if (U_SUCCESS(status
)) {
4231 resultString
.append(testMat
->group(groupNum
, status
));
4232 status
= U_ZERO_ERROR
;
4234 perlExpr
.remove(0, cgMat
->end(status
));
4237 else if (perlExpr
.startsWith("@-")) {
4239 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4241 resultString
.append(" ");
4243 ICU_Utility::appendNumber(resultString
, testMat
->start(i
, status
));
4245 perlExpr
.remove(0, 2);
4248 else if (perlExpr
.startsWith("@+")) {
4250 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4252 resultString
.append(" ");
4254 ICU_Utility::appendNumber(resultString
, testMat
->end(i
, status
));
4256 perlExpr
.remove(0, 2);
4259 else if (perlExpr
.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4260 // or as an escaped sequence (e.g. \n)
4261 if (perlExpr
.length() > 1) {
4262 perlExpr
.remove(0, 1); // Remove the '\', but only if not last char.
4264 UChar c
= perlExpr
.charAt(0);
4266 case 'n': c
= '\n'; break;
4267 // add any other escape sequences that show up in the test expected results.
4269 resultString
.append(c
);
4270 perlExpr
.remove(0, 1);
4274 // Any characters from the perl expression that we don't explicitly
4275 // recognize before here are assumed to be literals and copied
4276 // as-is to the expected results.
4277 resultString
.append(perlExpr
.charAt(0));
4278 perlExpr
.remove(0, 1);
4281 if (U_FAILURE(status
)) {
4282 errln("Line %d: ICU Error \"%s\"", lineNum
, u_errorName(status
));
4288 // Expected Results Compare
4290 UnicodeString
expectedS(fields
[4]);
4291 expectedS
.findAndReplace(nulnulSrc
, nulnul
);
4292 expectedS
.findAndReplace(ffffSrc
, ffff
);
4293 expectedS
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4296 if (expectedS
.compare(resultString
) != 0) {
4297 err("Line %d: Incorrect perl expression results.", lineNum
);
4298 infoln((UnicodeString
)"Expected \""+expectedS
+(UnicodeString
)"\"; got \""+resultString
+(UnicodeString
)"\"");
4306 // All done. Clean up allocated stuff.
4324 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount
);
4329 //-------------------------------------------------------------------------------
4331 // PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts
4332 // (instead of using UnicodeStrings) to test the alternate engine.
4333 // The input file for this test is re_tests, the standard regular
4334 // expression test data distributed with the Perl source code.
4335 // See PerlTests() for more information.
4337 //-------------------------------------------------------------------------------
// Runs the Perl regular-expression test data (re_tests.txt) through the UText-based
// API: every test pattern and every match string is converted to UTF-8, wrapped in a
// UText, and then compiled / matched. The Perl "expected result" expression from each
// data line is interpreted here and compared with what the ICU matcher produced.
// NOTE(review): a number of statements of this function (closing braces, early exits,
// some declarations) are not visible in this listing; the comments below describe only
// the statements that are shown.
4338 void RegexTest::PerlTestsUTF8() {
4340 const char *srcPath
;
4341 UErrorCode status
= U_ZERO_ERROR
;
// Converter used by UnicodeString::extract() below to produce the UTF-8 bytes.
4343 LocalUConverterPointer
UTF8Converter(ucnv_open("UTF-8", &status
));
// Pattern and input each get a UText plus a char buffer. The buffers are declared
// outside the per-line loop and are only reallocated when a conversion overflows.
4344 UText patternText
= UTEXT_INITIALIZER
;
4345 char *patternChars
= NULL
;
4346 int32_t patternLength
;
4347 int32_t patternCapacity
= 0;
4348 UText inputText
= UTEXT_INITIALIZER
;
4349 char *inputChars
= NULL
;
4350 int32_t inputLength
;
4351 int32_t inputCapacity
= 0;
// Install the STOP from-Unicode callback on the converter.
4353 ucnv_setFromUCallBack(UTF8Converter
.getAlias(), UCNV_FROM_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &status
);
4356 // Open and read the test data file.
4358 srcPath
=getPath(tdd
, "re_tests.txt");
4360 return; /* something went wrong, error already output */
// The data file is read and converted from ISO-8859-1.
4364 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, "iso-8859-1", status
);
4365 if (U_FAILURE(status
)) {
4366 return; /* something went wrong, error already output */
4370 // Put the test data into a UnicodeString
4372 UnicodeString
testDataString(FALSE
, testData
, len
);
4375 // Regex to break the input file into lines, and strip the new lines.
4376 // One line per match, capture group one is the desired data.
4378 RegexPattern
* linePat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe
, status
);
4379 if (U_FAILURE(status
)) {
4380 dataerrln("RegexPattern::compile() error");
4383 RegexMatcher
* lineMat
= linePat
->matcher(testDataString
, status
);
4386 // Regex to split a test file line into fields.
4387 // There are six fields, separated by tabs.
// NOTE(review): the split below is given room for 7 fields, not 6 - confirm intent.
4389 RegexPattern
* fieldPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe
, status
);
4392 // Regex to identify test patterns with flag settings, and to separate them.
4393 // Test patterns with flags look like 'pattern'i
4394 // Test patterns without flags are not quoted: pattern
4395 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4397 RegexPattern
*flagPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe
, status
);
4398 RegexMatcher
* flagMat
= flagPat
->matcher(status
);
4401 // The Perl tests reference several perl-isms, which are evaluated/substituted
4402 // in the test data. Not being perl, this must be done explicitly. Here
4403 // are string constants and REs for these constructs.
// ${nulnul} stands for two U+0000 characters; ${ffff} stands for U+FFFF.
4405 UnicodeString
nulnulSrc("${nulnul}");
4406 UnicodeString
nulnul("\\u0000\\u0000", -1, US_INV
);
4407 nulnul
= nulnul
.unescape();
4409 UnicodeString
ffffSrc("${ffff}");
4410 UnicodeString
ffff("\\uffff", -1, US_INV
);
4411 ffff
= ffff
.unescape();
4413 // regexp for $-[0], $+[2], etc.
4414 RegexPattern
*groupsPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe
, status
);
4415 RegexMatcher
*groupsMat
= groupsPat
->matcher(status
);
4417 // regexp for $0, $1, $2, etc.
4418 RegexPattern
*cgPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe
, status
);
4419 RegexMatcher
*cgMat
= cgPat
->matcher(status
);
4423 // Main Loop for the Perl Tests, runs once per line from the test data.
4426 int32_t lineNum
= 0;
4427 int32_t skippedUnimplementedCount
= 0;
4428 while (lineMat
->find()) {
4432 // Get a line, break it into its fields, do the Perl
4433 // variable substitutions.
4435 UnicodeString line
= lineMat
->group(1, status
);
4436 UnicodeString fields
[7];
4437 fieldPat
->split(line
, fields
, 7, status
);
// fields[0] is the (possibly quoted and flagged) pattern; flagMat separates the
// pattern text (group 2) from the trailing flag letters (group 3).
4439 flagMat
->reset(fields
[0]);
4440 flagMat
->matches(status
);
4441 UnicodeString pattern
= flagMat
->group(2, status
);
4442 pattern
.findAndReplace("${bang}", "!");
// In the pattern, ${nulnul} becomes the escaped text \u0000\u0000 (left for the regex
// compiler to interpret), whereas ${ffff} is replaced by the actual U+FFFF character.
4443 pattern
.findAndReplace(nulnulSrc
, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4444 pattern
.findAndReplace(ffffSrc
, ffff
);
4447 // Identify patterns that include match flag settings,
4448 // split off the flags, remove the extra quotes.
4450 UnicodeString flagStr
= flagMat
->group(3, status
);
// NOTE(review): the message below names ucnv_toUChars, but no such call is visible
// before this check; the status comes from the regex calls above.
4451 if (U_FAILURE(status
)) {
4452 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
4456 const UChar UChar_c
= 0x63; // Char constants for the flag letters.
4457 const UChar UChar_i
= 0x69; // (Damn the lack of Unicode support in C)
4458 const UChar UChar_m
= 0x6d;
4459 const UChar UChar_x
= 0x78;
4460 const UChar UChar_y
= 0x79;
// Translate the Perl flag letters i / m / x into the corresponding ICU regex flags.
4461 if (flagStr
.indexOf(UChar_i
) != -1) {
4462 flags
|= UREGEX_CASE_INSENSITIVE
;
4464 if (flagStr
.indexOf(UChar_m
) != -1) {
4465 flags
|= UREGEX_MULTILINE
;
4467 if (flagStr
.indexOf(UChar_x
) != -1) {
4468 flags
|= UREGEX_COMMENTS
;
4472 // Put the pattern in a UTF-8 UText
// First extract() attempt uses the current buffer; on overflow the buffer is replaced
// by one of the reported length + 1 and the extraction is repeated.
4474 status
= U_ZERO_ERROR
;
4475 patternLength
= pattern
.extract(patternChars
, patternCapacity
, UTF8Converter
.getAlias(), status
);
4476 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
4477 status
= U_ZERO_ERROR
;
4478 delete[] patternChars
;
4479 patternCapacity
= patternLength
+ 1;
4480 patternChars
= new char[patternCapacity
];
4481 pattern
.extract(patternChars
, patternCapacity
, UTF8Converter
.getAlias(), status
);
4483 utext_openUTF8(&patternText
, patternChars
, patternLength
, &status
);
4486 // Compile the test pattern.
4488 RegexPattern
*testPat
= RegexPattern::compile(&patternText
, flags
, pe
, status
);
4489 if (status
== U_REGEX_UNIMPLEMENTED
) {
4491 // Test of a feature that is planned for ICU, but not yet implemented.
4493 skippedUnimplementedCount
++;
4495 status
= U_ZERO_ERROR
;
4499 if (U_FAILURE(status
)) {
4500 // Some tests are supposed to generate errors.
4501 // Only report an error for tests that are supposed to succeed.
4502 if (fields
[2].indexOf(UChar_c
) == -1 && // Compilation is not supposed to fail AND
4503 fields
[2].indexOf(UChar_i
) == -1) // it's not an accepted ICU incompatibility
4505 errln("line %d: ICU Error \"%s\"\n", lineNum
, u_errorName(status
));
4507 status
= U_ZERO_ERROR
;
4512 if (fields
[2].indexOf(UChar_i
) >= 0) {
4513 // ICU should skip this test.
4518 if (fields
[2].indexOf(UChar_c
) >= 0) {
4519 // This pattern should have caused a compilation error, but didn't.
4520 errln("line %d: Expected a pattern compile error, got success.", lineNum
);
4527 // replace the Perl variables that appear in some of the
4528 // match data strings.
// Unlike the pattern, the match string receives the real (unescaped) characters.
4530 UnicodeString matchString
= fields
[1];
4531 matchString
.findAndReplace(nulnulSrc
, nulnul
);
4532 matchString
.findAndReplace(ffffSrc
, ffff
);
4534 // Replace any \n in the match string with an actual new-line char.
4535 // Don't do full unescape, as this unescapes more than Perl does, which
4536 // causes other spurious failures in the tests.
4537 matchString
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4540 // Put the input in a UTF-8 UText
// Same grow-on-overflow scheme as used for the pattern buffer above.
4542 status
= U_ZERO_ERROR
;
4543 inputLength
= matchString
.extract(inputChars
, inputCapacity
, UTF8Converter
.getAlias(), status
);
4544 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
4545 status
= U_ZERO_ERROR
;
4546 delete[] inputChars
;
4547 inputCapacity
= inputLength
+ 1;
4548 inputChars
= new char[inputCapacity
];
4549 matchString
.extract(inputChars
, inputCapacity
, UTF8Converter
.getAlias(), status
);
4551 utext_openUTF8(&inputText
, inputChars
, inputLength
, &status
);
4554 // Run the test, check for expected match/don't match result.
// The matcher is created without input and then pointed at the UText via reset();
// reset() returns a reference, hence the address-of.
4556 RegexMatcher
*testMat
= &testPat
->matcher(status
)->reset(&inputText
);
4557 UBool found
= testMat
->find();
4558 UBool expected
= FALSE
;
// A 'y' in fields[2] marks a test that is expected to match
// (presumably `expected` is updated in this branch - the statement is not shown here).
4559 if (fields
[2].indexOf(UChar_y
) >=0) {
4562 if (expected
!= found
) {
4563 errln("line %d: Expected %smatch, got %smatch",
4564 lineNum
, expected
?"":"no ", found
?"":"no " );
4568 // Don't try to check expected results if there is no match.
4569 // (Some have stuff in the expected fields)
4577 // Interpret the Perl expression from the fourth field of the data file,
4578 // building up an ICU string from the results of the ICU match.
4579 // The Perl expression will contain references to the results of
4580 // a regex match, including the matched string, capture group strings,
4581 // group starting and ending indices, etc.
4583 UnicodeString resultString
;
4584 UnicodeString perlExpr
= fields
[3];
// perlExpr is consumed from the front: each branch below recognizes one construct at
// position 0, appends its value to resultString, and removes it from perlExpr.
4586 while (perlExpr
.length() > 0) {
// The helper matchers are re-pointed at perlExpr on every pass because perlExpr is
// modified at the end of each pass.
4587 groupsMat
->reset(perlExpr
);
4588 cgMat
->reset(perlExpr
);
// $&  - the whole match.
4590 if (perlExpr
.startsWith("$&")) {
4591 resultString
.append(testMat
->group(status
));
4592 perlExpr
.remove(0, 2);
// $-[n] / $+[n]  - start / end position of capture group n.
4595 else if (groupsMat
->lookingAt(status
)) {
4597 UnicodeString digitString
= groupsMat
->group(2, status
);
4599 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4600 UnicodeString plusOrMinus
= groupsMat
->group(1, status
);
4601 int32_t matchPosition
;
4602 if (plusOrMinus
.compare("+") == 0) {
4603 matchPosition
= testMat
->end(groupNum
, status
);
4605 matchPosition
= testMat
->start(groupNum
, status
);
// A position of -1 is not appended to the result.
4607 if (matchPosition
!= -1) {
4608 ICU_Utility::appendNumber(resultString
, matchPosition
);
4610 perlExpr
.remove(0, groupsMat
->end(status
));
// $n  - text of capture group n.
4613 else if (cgMat
->lookingAt(status
)) {
4615 UnicodeString digitString
= cgMat
->group(1, status
);
4617 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4618 if (U_SUCCESS(status
)) {
4619 resultString
.append(testMat
->group(groupNum
, status
));
// Any error from fetching the group text is discarded here.
4620 status
= U_ZERO_ERROR
;
4622 perlExpr
.remove(0, cgMat
->end(status
));
// @-  - start positions of group 0 through groupCount().
4625 else if (perlExpr
.startsWith("@-")) {
4627 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4629 resultString
.append(" ");
4631 ICU_Utility::appendNumber(resultString
, testMat
->start(i
, status
));
4633 perlExpr
.remove(0, 2);
// @+  - end positions of group 0 through groupCount().
4636 else if (perlExpr
.startsWith("@+")) {
4638 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4640 resultString
.append(" ");
4642 ICU_Utility::appendNumber(resultString
, testMat
->end(i
, status
));
4644 perlExpr
.remove(0, 2);
4647 else if (perlExpr
.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4648 // or as an escaped sequence (e.g. \n)
4649 if (perlExpr
.length() > 1) {
4650 perlExpr
.remove(0, 1); // Remove the '\', but only if not last char.
4652 UChar c
= perlExpr
.charAt(0);
4654 case 'n': c
= '\n'; break;
4655 // add any other escape sequences that show up in the test expected results.
4657 resultString
.append(c
);
4658 perlExpr
.remove(0, 1);
4662 // Any characters from the perl expression that we don't explicitly
4663 // recognize before here are assumed to be literals and copied
4664 // as-is to the expected results.
4665 resultString
.append(perlExpr
.charAt(0));
4666 perlExpr
.remove(0, 1);
4669 if (U_FAILURE(status
)) {
4670 errln("Line %d: ICU Error \"%s\"", lineNum
, u_errorName(status
));
4676 // Expected Results Compare
// The expected text (fields[4]) gets the same ${nulnul}, ${ffff} and \n substitutions
// as the match string before it is compared with the computed result.
4678 UnicodeString
expectedS(fields
[4]);
4679 expectedS
.findAndReplace(nulnulSrc
, nulnul
);
4680 expectedS
.findAndReplace(ffffSrc
, ffff
);
4681 expectedS
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4684 if (expectedS
.compare(resultString
) != 0) {
4685 err("Line %d: Incorrect perl expression results.", lineNum
);
4686 infoln((UnicodeString
)"Expected \""+expectedS
+(UnicodeString
)"\"; got \""+resultString
+(UnicodeString
)"\"");
4694 // All done. Clean up allocated stuff.
// Close the two UTexts before freeing the char buffers they were opened on.
4711 utext_close(&patternText
);
4712 utext_close(&inputText
);
4714 delete [] patternChars
;
4715 delete [] inputChars
;
4718 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount
);
4723 //--------------------------------------------------------------
4725 // Bug6149 Verify limits to heap expansion for backtrack stack.
4726 // Use this pattern,
4727 // "(a?){1,8000000}"
4728 // Note: this was originally an unbounded upper bound, but that case now has loop-breaking enabled.
4729 // This test is likely to be fragile, as further optimizations stop
4730 // more cases of pointless looping in the match engine.
4732 //---------------------------------------------------------------
// Matches the pattern "(a?){1,8000000}" against the short input "xyz" and checks that
// the attempt ends with U_REGEX_STACK_OVERFLOW and a FALSE result.
// NOTE(review): the declaration of `flags` is not visible in this listing.
4733 void RegexTest::Bug6149() {
4734 UnicodeString
pattern("(a?){1,8000000}");
4735 UnicodeString
s("xyz");
4737 UErrorCode status
= U_ZERO_ERROR
;
4739 RegexMatcher
matcher(pattern
, s
, flags
, status
);
4740 UBool result
= false;
// The match is expected to fail with a stack-overflow status; `result` is assigned
// inside the macro argument so that it can be checked separately afterwards.
4741 REGEX_ASSERT_FAIL(result
=matcher
.matches(status
), U_REGEX_STACK_OVERFLOW
);
4742 REGEX_ASSERT(result
== FALSE
);
4747 // Callbacks() Test the callback function.
4748 // When set, callbacks occur periodically during matching operations,
4749 // giving the application code the ability to abort the operation
4750 // before its normal completion.
// Context object handed to testCallBackFn() through the match-callback `context` pointer.
// NOTE(review): the member declarations are not visible in this listing; the code that
// is shown uses the members test, maxCalls, numCalls and lastSteps.
4753 struct callBackContext
{
// Start a new measurement: set the call limit and clear the call counter and the
// last-seen step value.
4758 void reset(int32_t max
) {maxCalls
=max
; numCalls
=0; lastSteps
=0;};
// Match callback installed by RegexTest::Callbacks().
//   context - pointer to the callBackContext supplied when the callback was set.
//   steps   - step value passed in by the matcher; expected to be exactly one more
//             than the value seen on the previous call.
// Returns whether numCalls is still below maxCalls. The Callbacks() test expects
// U_REGEX_STOPPED_BY_CALLER once the limit has been reached.
// NOTE(review): the statement that updates numCalls is not visible in this listing.
4762 static UBool U_CALLCONV
4763 testCallBackFn(const void *context
, int32_t steps
) {
4764 callBackContext
*info
= (callBackContext
*)context
;
// Report a test failure when the step value does not advance by exactly one per call.
4765 if (info
->lastSteps
+1 != steps
) {
4766 info
->test
->errln("incorrect steps in callback. Expected %d, got %d\n", info
->lastSteps
+1, steps
);
4768 info
->lastSteps
= steps
;
4770 return (info
->numCalls
< info
->maxCalls
);
// Exercises RegexMatcher::setMatchCallback() / getMatchCallback():
//   1. the getter reports NULL function and context when nothing has been set;
//   2. the getter returns what was set;
//   3. short, medium and long-running matches invoke the callback zero times,
//      some times, and up to the abort limit respectively.
// NOTE(review): the statements that reset the matcher input and the callback limits,
// the status checks, and the scope braces are not visible in this listing.
4774 void RegexTest::Callbacks() {
4776 // Getter returns NULLs if no callback has been set
4778 // The variables that the getter will fill in.
4779 // Init to non-null values so that the action of the getter can be seen.
4780 const void *returnedContext
= &returnedContext
;
4781 URegexMatchCallback
*returnedFn
= &testCallBackFn
;
4783 UErrorCode status
= U_ZERO_ERROR
;
4784 RegexMatcher
matcher("x", 0, status
);
4786 matcher
.getMatchCallback(returnedFn
, returnedContext
, status
);
4788 REGEX_ASSERT(returnedFn
== NULL
);
4789 REGEX_ASSERT(returnedContext
== NULL
);
// Second part: install testCallBackFn with cbInfo as its context and read it back.
4794 callBackContext cbInfo
= {this, 0, 0, 0};
4795 const void *returnedContext
;
4796 URegexMatchCallback
*returnedFn
;
4797 UErrorCode status
= U_ZERO_ERROR
;
4798 RegexMatcher
matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status
); // A pattern that can run long.
4800 matcher
.setMatchCallback(testCallBackFn
, &cbInfo
, status
);
4802 matcher
.getMatchCallback(returnedFn
, returnedContext
, status
);
4804 REGEX_ASSERT(returnedFn
== testCallBackFn
);
4805 REGEX_ASSERT(returnedContext
== &cbInfo
);
4807 // A short-running match shouldn't invoke the callback
4808 status
= U_ZERO_ERROR
;
4810 UnicodeString s
= "xxx";
4812 REGEX_ASSERT(matcher
.matches(status
));
4814 REGEX_ASSERT(cbInfo
.numCalls
== 0);
4816 // A medium-length match that runs long enough to invoke the
4817 // callback, but not so long that the callback aborts it.
4818 status
= U_ZERO_ERROR
;
4820 s
= "aaaaaaaaaaaaaaaaaaab";
4822 REGEX_ASSERT(matcher
.matches(status
)==FALSE
);
4824 REGEX_ASSERT(cbInfo
.numCalls
> 0);
4826 // A longer running match that the callback function will abort.
4827 status
= U_ZERO_ERROR
;
4829 s
= "aaaaaaaaaaaaaaaaaaaaaaab";
4831 REGEX_ASSERT(matcher
.matches(status
)==FALSE
);
4832 REGEX_ASSERT(status
== U_REGEX_STOPPED_BY_CALLER
);
4833 REGEX_ASSERT(cbInfo
.numCalls
== 4);
4835 // A longer running find that the callback function will abort.
4836 status
= U_ZERO_ERROR
;
4838 s
= "aaaaaaaaaaaaaaaaaaaaaaab";
4840 REGEX_ASSERT(matcher
.find(status
)==FALSE
);
4841 REGEX_ASSERT(status
== U_REGEX_STOPPED_BY_CALLER
);
4842 REGEX_ASSERT(cbInfo
.numCalls
== 4);
4850 // FindProgressCallbacks() Test the find "progress" callback function.
4851 // When set, the find progress callback will be invoked during find operations
4852 // after each return from a match attempt, giving the application the opportunity
4853 // to terminate a long-running find operation before its normal completion.
// Context object handed to testProgressCallBackFn() through the find-progress
// callback `context` pointer.
// NOTE(review): the member declarations are not visible in this listing; the code that
// is shown uses the members maxCalls, numCalls and lastIndex.
4856 struct progressCallBackContext
{
// Start a new measurement: set the call limit and clear the call counter and the
// last reported match index.
4861 void reset(int32_t max
) {maxCalls
=max
; numCalls
=0;lastIndex
=0;};
4864 // call-back function for find().
4865 // Return TRUE to continue the find().
4866 // Return FALSE to stop the find().
// Find-progress callback installed by RegexTest::FindProgressCallbacks().
//   context    - pointer to the progressCallBackContext supplied when the callback was set.
//   matchIndex - index reported by the matcher; remembered in lastIndex so that the
//                test can resume an interrupted find() from that position.
// Returns whether numCalls is still below maxCalls.
// NOTE(review): the statement that updates numCalls is not visible in this listing.
4868 static UBool U_CALLCONV
4869 testProgressCallBackFn(const void *context
, int64_t matchIndex
) {
4870 progressCallBackContext
*info
= (progressCallBackContext
*)context
;
4872 info
->lastIndex
= matchIndex
;
4873 // info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4874 return (info
->numCalls
< info
->maxCalls
);
// Exercises RegexMatcher::setFindProgressCallback() / getFindProgressCallback():
//   1. the getter reports NULL function and context when nothing has been set;
//   2. the getter returns what was set;
//   3. find() invokes the callback not at all, a bounded number of times, or until the
//      callback stops it, depending on the input;
//   4. an interrupted find() can be resumed from the last reported index.
// NOTE(review): the statements that reset the matcher input, the status checks and the
// scope braces are not visible in this listing.
4878 void RegexTest::FindProgressCallbacks() {
4880 // Getter returns NULLs if no callback has been set
4882 // The variables that the getter will fill in.
4883 // Init to non-null values so that the action of the getter can be seen.
4884 const void *returnedContext
= &returnedContext
;
4885 URegexFindProgressCallback
*returnedFn
= &testProgressCallBackFn
;
4887 UErrorCode status
= U_ZERO_ERROR
;
4888 RegexMatcher
matcher("x", 0, status
);
4890 matcher
.getFindProgressCallback(returnedFn
, returnedContext
, status
);
4892 REGEX_ASSERT(returnedFn
== NULL
);
4893 REGEX_ASSERT(returnedContext
== NULL
);
// Second part: install testProgressCallBackFn with cbInfo as its context and read it back.
4898 progressCallBackContext cbInfo
= {this, 0, 0, 0};
4899 const void *returnedContext
;
4900 URegexFindProgressCallback
*returnedFn
;
4901 UErrorCode status
= U_ZERO_ERROR
;
4902 RegexMatcher
matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status
);
4904 matcher
.setFindProgressCallback(testProgressCallBackFn
, &cbInfo
, status
);
4906 matcher
.getFindProgressCallback(returnedFn
, returnedContext
, status
);
4908 REGEX_ASSERT(returnedFn
== testProgressCallBackFn
);
4909 REGEX_ASSERT(returnedContext
== &cbInfo
);
4911 // A find that matches on the initial position does NOT invoke the callback.
4912 status
= U_ZERO_ERROR
;
4914 UnicodeString s
= "aaxxx";
// NOTE(review): tracing is switched on here - confirm whether this call is guarded by
// conditional compilation that is not visible in this listing.
4917 matcher
.setTrace(TRUE
);
4919 REGEX_ASSERT(matcher
.find(0, status
));
4921 REGEX_ASSERT(cbInfo
.numCalls
== 0);
4923 // A medium running find() that causes matcher.find() to invoke our callback for each index,
4924 // but not so many times that we interrupt the operation.
4925 status
= U_ZERO_ERROR
;
4926 s
= "aaaaaaaaaaaaaaaaaaab";
4927 cbInfo
.reset(s
.length()); // Some upper limit for number of calls that is greater than size of our input string
4929 REGEX_ASSERT(matcher
.find(0, status
)==FALSE
);
4931 REGEX_ASSERT(cbInfo
.numCalls
> 0 && cbInfo
.numCalls
< 25);
4933 // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4934 status
= U_ZERO_ERROR
;
4935 UnicodeString s1
= "aaaaaaaaaaaaaaaaaaaaaaab";
4936 cbInfo
.reset(s1
.length() - 5); // Bail early somewhere near the end of input string
4938 REGEX_ASSERT(matcher
.find(0, status
)==FALSE
);
4939 REGEX_ASSERT(status
== U_REGEX_STOPPED_BY_CALLER
);
4940 REGEX_ASSERT(cbInfo
.numCalls
== s1
.length() - 5);
4942 // Now a match that will succeed, but after an interruption
4943 status
= U_ZERO_ERROR
;
4944 UnicodeString s2
= "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4945 cbInfo
.reset(s2
.length() - 10); // Bail early somewhere near the end of input string
4947 REGEX_ASSERT(matcher
.find(0, status
)==FALSE
);
4948 REGEX_ASSERT(status
== U_REGEX_STOPPED_BY_CALLER
);
4949 // Now retry the match from where left off
// Raise the limit so the callback no longer stops the search, then resume at the index
// recorded by the callback.
4950 cbInfo
.maxCalls
= 100; // No callback limit
4951 status
= U_ZERO_ERROR
;
4952 REGEX_ASSERT(matcher
.find(cbInfo
.lastIndex
, status
));
4960 //---------------------------------------------------------------------------
4962 // PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
4963 // UTexts. The pure-C implementation of UText
4964 // has no mutable backing stores, but we can
4965 // use UnicodeString here to test the functionality.
4967 //---------------------------------------------------------------------------
// Exercises the C regex API entry points that write their result into a caller-supplied
// ("pre-allocated") mutable UText. The destination used throughout is bufferText, a
// UText opened on the UnicodeString `buffer`. Sections, in order:
//   - uregex_getUText
//   - uregex_groupUText
//   - uregex_replaceFirstUText
//   - uregex_replaceAllUText
// NOTE(review): several statements (scope braces, some declarations, status checks,
// uregex_close calls) are not visible in this listing; comments describe only what is shown.
4968 void RegexTest::PreAllocatedUTextCAPI () {
4969 UErrorCode status
= U_ZERO_ERROR
;
4970 URegularExpression
*re
;
4971 UText patternText
= UTEXT_INITIALIZER
;
4972 UnicodeString buffer
;
4973 UText bufferText
= UTEXT_INITIALIZER
;
4975 utext_openUnicodeString(&bufferText
, &buffer
, &status
);
4978 * getText() and getUText()
4981 UText text1
= UTEXT_INITIALIZER
;
4982 UText text2
= UTEXT_INITIALIZER
;
4983 UChar text2Chars
[20];
4986 status
= U_ZERO_ERROR
;
4987 regextst_openUTF8FromInvariant(&text1
, "abcccd", -1, &status
);
4988 regextst_openUTF8FromInvariant(&text2
, "abcccxd", -1, &status
);
// The copy limit is the capacity of the destination array text2Chars
// (it was previously derived from the size of the unrelated UText struct text2).
4989 u_uastrncpy(text2Chars
, "abcccxd", UPRV_LENGTHOF(text2Chars
));
// text2 is re-opened on the UChar buffer so that it can be compared with the text
// that uregex_setText() is given further down.
4990 utext_openUChars(&text2
, text2Chars
, -1, &status
);
4992 regextst_openUTF8FromInvariant(&patternText
, "abc*d", -1, &status
);
4993 re
= uregex_openUText(&patternText
, 0, NULL
, &status
);
4995 /* First set a UText */
4996 uregex_setUText(re
, &text1
, &status
);
4997 resultText
= uregex_getUText(re
, &bufferText
, &status
);
// The supplied destination must be returned, and its content must equal the input text.
4999 REGEX_ASSERT(resultText
== &bufferText
);
5000 utext_setNativeIndex(resultText
, 0);
5001 utext_setNativeIndex(&text1
, 0);
5002 REGEX_ASSERT(testUTextEqual(resultText
, &text1
));
// Repeat the call to check that getting the text a second time gives the same result.
5004 resultText
= uregex_getUText(re
, &bufferText
, &status
);
5006 REGEX_ASSERT(resultText
== &bufferText
);
5007 utext_setNativeIndex(resultText
, 0);
5008 utext_setNativeIndex(&text1
, 0);
5009 REGEX_ASSERT(testUTextEqual(resultText
, &text1
));
5011 /* Then set a UChar * */
5012 uregex_setText(re
, text2Chars
, 7, &status
);
5013 resultText
= uregex_getUText(re
, &bufferText
, &status
);
5015 REGEX_ASSERT(resultText
== &bufferText
);
5016 utext_setNativeIndex(resultText
, 0);
5017 utext_setNativeIndex(&text2
, 0);
5018 REGEX_ASSERT(testUTextEqual(resultText
, &text2
));
5021 utext_close(&text1
);
5022 utext_close(&text2
);
// --- uregex_groupUText ---
5034 u_uastrncpy(text1
, "noise abc interior def, and this is off the end", UPRV_LENGTHOF(text1
));
5035 // 012345678901234567890123456789012345678901234567
5038 status
= U_ZERO_ERROR
;
5039 re
= uregex_openC("abc(.*?)def", 0, NULL
, &status
);
5042 uregex_setText(re
, text1
, -1, &status
);
5043 result
= uregex_find(re
, 0, &status
);
5044 REGEX_ASSERT(result
==TRUE
);
5046 /* Capture Group 0, the full match. Should succeed. "abc interior def" */
5047 status
= U_ZERO_ERROR
;
5048 actual
= uregex_groupUText(re
, 0, &bufferText
, &length
, &status
);
// The returned UText covers the whole input (native length 47); the group is
// identified by the UText's native index (start) and by `length`.
5050 REGEX_ASSERT(actual
== &bufferText
);
5051 REGEX_ASSERT(utext_getNativeIndex(actual
) == 6);
5052 REGEX_ASSERT(length
== 16);
5053 REGEX_ASSERT(utext_nativeLength(actual
) == 47);
5055 /* Capture group #1. Should succeed, matching " interior ". */
5056 status
= U_ZERO_ERROR
;
5057 actual
= uregex_groupUText(re
, 1, &bufferText
, &length
, &status
);
5059 REGEX_ASSERT(actual
== &bufferText
);
5060 REGEX_ASSERT(utext_getNativeIndex(actual
) == 9); // position of " interior "
5061 REGEX_ASSERT(length
== 10);
5062 REGEX_ASSERT(utext_nativeLength(actual
) == 47);
5064 /* Capture group out of range. Error. */
5065 status
= U_ZERO_ERROR
;
5066 actual
= uregex_groupUText(re
, 2, &bufferText
, &length
, &status
);
5067 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
5068 REGEX_ASSERT(actual
== &bufferText
);
// --- uregex_replaceFirstUText ---
5079 UText replText
= UTEXT_INITIALIZER
;
5081 status
= U_ZERO_ERROR
;
5082 utext_openUnicodeString(&bufferText
, &buffer
, &status
);
5084 status
= U_ZERO_ERROR
;
5085 u_uastrncpy(text1
, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1
));
// Full capacity of text2 is used as the copy limit (a stray "/2" was removed).
5086 u_uastrncpy(text2
, "No match here.", UPRV_LENGTHOF(text2
));
5087 regextst_openUTF8FromInvariant(&replText
, "<$1>", -1, &status
);
5089 re
= uregex_openC("x(.*?)x", 0, NULL
, &status
);
5092 /* Normal case, with match */
5093 uregex_setText(re
, text1
, -1, &status
);
// Empty the destination before each replace call so that results do not accumulate.
5095 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5097 result
= uregex_replaceFirstUText(re
, &replText
, &bufferText
, &status
);
5099 REGEX_ASSERT(result
== &bufferText
);
5100 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result
);
5102 /* No match. Text should copy to output with no changes. */
5103 uregex_setText(re
, text2
, -1, &status
);
5104 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5105 result
= uregex_replaceFirstUText(re
, &replText
, &bufferText
, &status
);
5107 REGEX_ASSERT(result
== &bufferText
);
5108 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result
);
5110 /* Unicode escapes */
5111 uregex_setText(re
, text1
, -1, &status
);
5112 regextst_openUTF8FromInvariant(&replText
, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status
);
5113 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5114 result
= uregex_replaceFirstUText(re
, &replText
, &bufferText
, &status
);
5116 REGEX_ASSERT(result
== &bufferText
);
5117 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result
);
5120 utext_close(&replText
);
// --- uregex_replaceAllUText ---
5130 UText replText
= UTEXT_INITIALIZER
;
5133 status
= U_ZERO_ERROR
;
// Copy limits are expressed as element counts, consistent with the sections above.
5134 u_uastrncpy(text1
, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1
));
5135 u_uastrncpy(text2
, "No match here.", UPRV_LENGTHOF(text2
));
5136 regextst_openUTF8FromInvariant(&replText
, "<$1>", -1, &status
);
5138 re
= uregex_openC("x(.*?)x", 0, NULL
, &status
);
5141 /* Normal case, with match */
5142 uregex_setText(re
, text1
, -1, &status
);
5143 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5144 result
= uregex_replaceAllUText(re
, &replText
, &bufferText
, &status
);
5146 REGEX_ASSERT(result
== &bufferText
);
5147 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result
);
5149 /* No match. Text should copy to output with no changes. */
5150 uregex_setText(re
, text2
, -1, &status
);
5151 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5152 result
= uregex_replaceAllUText(re
, &replText
, &bufferText
, &status
);
5154 REGEX_ASSERT(result
== &bufferText
);
5155 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result
);
5158 utext_close(&replText
);
5163 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5164 * so we don't need to test it here.
// Release the function-level UTexts.
5167 utext_close(&bufferText
);
5168 utext_close(&patternText
);
5172 //--------------------------------------------------------------
5174 // NamedCapture Check basic named capture group functionality
5176 //--------------------------------------------------------------
// NamedCapture
//   Exercises named capture groups in three stages:
//     1. name -> group-number lookup on a compiled RegexPattern, and again on a
//        copy of that pattern after the original has been deleted;
//     2. RegexMatcher::replaceAll() with $n and ${name} references in the
//        replacement text, including out-of-range numbers and malformed names;
//     3. the same replacement cases through the plain C uregex_replaceAll() API.
//   NOTE(review): this excerpt does not show every statement of the function
//   (e.g. the declaration of `repl` and the closing brace are not visible here).
5177 void RegexTest::NamedCapture() {
5178 UErrorCode status
= U_ZERO_ERROR
;
5179 RegexPattern
*pat
= RegexPattern::compile(UnicodeString(
5180 "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status
);
// Lookup by (const char *, length) name; -1 means NUL-terminated.
5182 int32_t group
= pat
->groupNumberFromName("five", -1, status
);
5184 REGEX_ASSERT(5 == group
);
5185 group
= pat
->groupNumberFromName("three", -1, status
);
5187 REGEX_ASSERT(3 == group
);
// Lookup by UnicodeString name.
5189 status
= U_ZERO_ERROR
;
5190 group
= pat
->groupNumberFromName(UnicodeString("six"), status
);
5192 REGEX_ASSERT(6 == group
);
// A name that does not occur in the pattern must be reported through status.
// This is checked with REGEX_ASSERT, like every other check in this function,
// rather than U_ASSERT: U_ASSERT is ICU's internal debug assertion and is
// compiled out of non-debug builds, which would silently skip the check.
5194 status
= U_ZERO_ERROR
;
5195 group
= pat
->groupNumberFromName(UnicodeString("nosuch"), status
);
5196 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5198 status
= U_ZERO_ERROR
;
5200 // After copying a pattern, named capture should still work in the copy.
5201 RegexPattern
*copiedPat
= new RegexPattern(*pat
);
5202 REGEX_ASSERT(*copiedPat
== *pat
);
5203 delete pat
; pat
= NULL
; // Delete original, copy should have no references back to it.
5205 group
= copiedPat
->groupNumberFromName("five", -1, status
);
5207 REGEX_ASSERT(5 == group
);
5208 group
= copiedPat
->groupNumberFromName("three", -1, status
);
5210 REGEX_ASSERT(3 == group
);
5213 // ReplaceAll with named capture group.
5214 status
= U_ZERO_ERROR
;
5215 UnicodeString
text("Substitution of <<quotes>> for <<double brackets>>");
5216 RegexMatcher
*m
= new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text
, 0, status
);
5218 // m.pattern().dumpPattern();
5219 UnicodeString replacedText
= m
->replaceAll("'${mid}'", status
);
5221 REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText
);
5224 // ReplaceAll, allowed capture group numbers.
// The pattern has three groups: group 1 is named "one", groups 2 and 3 are unnamed.
5225 text
= UnicodeString("abcmxyz");
5226 m
= new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text
, 0, status
);
5229 status
= U_ZERO_ERROR
;
5230 replacedText
= m
->replaceAll(UnicodeString("<$0>"), status
); // group 0, full match, is allowed.
5232 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText
);
5234 status
= U_ZERO_ERROR
;
5235 replacedText
= m
->replaceAll(UnicodeString("<$1>"), status
); // group 1 by number.
5237 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText
);
5239 status
= U_ZERO_ERROR
;
5240 replacedText
= m
->replaceAll(UnicodeString("<${one}>"), status
); // group 1 by name.
5242 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText
);
5244 status
= U_ZERO_ERROR
;
5245 replacedText
= m
->replaceAll(UnicodeString("<$2>"), status
); // group 2.
5247 REGEX_ASSERT(UnicodeString("a<x>z") == replacedText
);
5249 status
= U_ZERO_ERROR
;
5250 replacedText
= m
->replaceAll(UnicodeString("<$3>"), status
);
5252 REGEX_ASSERT(UnicodeString("a<y>z") == replacedText
);
// Group 4 does not exist in the pattern: expect an index error.
5254 status
= U_ZERO_ERROR
;
5255 replacedText
= m
->replaceAll(UnicodeString("<$4>"), status
);
5256 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
5258 status
= U_ZERO_ERROR
;
5259 replacedText
= m
->replaceAll(UnicodeString("<$04>"), status
); // group 0, leading 0,
5260 REGEX_CHECK_STATUS
; // trailing out-of-range 4 passes through.
5261 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText
);
5263 status
= U_ZERO_ERROR
;
5264 replacedText
= m
->replaceAll(UnicodeString("<$000016>"), status
); // Consume leading zeroes. Don't consume digits
5265 REGEX_CHECK_STATUS
; // that push group num out of range.
5266 REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText
); // This is group 1.
// Several references, numbered and named, in one replacement string.
5268 status
= U_ZERO_ERROR
;
5269 replacedText
= m
->replaceAll(UnicodeString("<$3$2$1${one}>"), status
);
5271 REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText
);
5273 status
= U_ZERO_ERROR
;
5274 replacedText
= m
->replaceAll(UnicodeString("$3$2$1${one}"), status
);
5276 REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText
);
// Unknown, malformed or unterminated group names, and a '$' that is not
// followed by a group reference, must all fail with
// U_REGEX_INVALID_CAPTURE_GROUP_NAME.
5278 status
= U_ZERO_ERROR
;
5279 replacedText
= m
->replaceAll(UnicodeString("<${noSuchName}>"), status
);
5280 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5282 status
= U_ZERO_ERROR
;
5283 replacedText
= m
->replaceAll(UnicodeString("<${invalid-name}>"), status
);
5284 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5286 status
= U_ZERO_ERROR
;
5287 replacedText
= m
->replaceAll(UnicodeString("<${one"), status
);
5288 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5290 status
= U_ZERO_ERROR
;
5291 replacedText
= m
->replaceAll(UnicodeString("$not a capture group"), status
);
5292 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5296 // Repeat the above replaceAll() tests using the plain C API, which
5297 // has a separate implementation internally.
5298 // TODO: factor out the test data.
5300 status
= U_ZERO_ERROR
;
5301 URegularExpression
*re
= uregex_openC("..(?<one>m)(.)(.)", 0, NULL
, &status
);
5303 text
= UnicodeString("abcmxyz");
5304 uregex_setText(re
, text
.getBuffer(), text
.length(), &status
);
// Output buffer for uregex_replaceAll(); resultLength receives its return value.
5307 UChar resultBuf
[100];
5308 int32_t resultLength
;
5311 status
= U_ZERO_ERROR
;
5312 repl
= UnicodeString("<$0>");
5313 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5315 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf
, resultLength
));
5317 status
= U_ZERO_ERROR
;
5318 repl
= UnicodeString("<$1>");
5319 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5321 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf
, resultLength
));
5323 status
= U_ZERO_ERROR
;
5324 repl
= UnicodeString("<${one}>");
5325 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5327 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf
, resultLength
));
5329 status
= U_ZERO_ERROR
;
5330 repl
= UnicodeString("<$2>");
5331 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5333 REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf
, resultLength
));
5335 status
= U_ZERO_ERROR
;
5336 repl
= UnicodeString("<$3>");
5337 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5339 REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf
, resultLength
));
5341 status
= U_ZERO_ERROR
;
5342 repl
= UnicodeString("<$4>");
5343 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5344 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
5346 status
= U_ZERO_ERROR
;
5347 repl
= UnicodeString("<$04>");
5348 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5350 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf
, resultLength
));
5352 status
= U_ZERO_ERROR
;
5353 repl
= UnicodeString("<$000016>");
5354 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5356 REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf
, resultLength
));
5358 status
= U_ZERO_ERROR
;
5359 repl
= UnicodeString("<$3$2$1${one}>");
5360 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5362 REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf
, resultLength
));
5364 status
= U_ZERO_ERROR
;
5365 repl
= UnicodeString("$3$2$1${one}");
5366 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5368 REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf
, resultLength
));
5370 status
= U_ZERO_ERROR
;
5371 repl
= UnicodeString("<${noSuchName}>");
5372 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5373 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5375 status
= U_ZERO_ERROR
;
5376 repl
= UnicodeString("<${invalid-name}>");
5377 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5378 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5380 status
= U_ZERO_ERROR
;
5381 repl
= UnicodeString("<${one");
5382 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5383 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5385 status
= U_ZERO_ERROR
;
5386 repl
= UnicodeString("$not a capture group");
5387 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5388 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5393 //--------------------------------------------------------------
5395 // NamedCaptureLimits Patterns with huge numbers of named capture groups.
5396 // The point is not so much what the exact limit is,
5397 // but that a largish number doesn't hit bad non-linear performance,
5398 // and that exceeding the limit fails cleanly.
5400 //--------------------------------------------------------------
// Builds patterns made of many empty named groups "(?<nnN>)" and checks that
//   - a pattern with group numbers 1 .. goodLimit-1 compiles and every name
//     "nnN" maps back to group number N, and
//   - a pattern built with the failLimit bound is rejected with
//     U_REGEX_PATTERN_TOO_BIG.
// NOTE(review): the condition guarding the "Skipping test" message and the
// declarations of `nn` and `nnbuf` are not visible in this excerpt.
5401 void RegexTest::NamedCaptureLimits() {
5403 logln("Skipping test. Runs in exhuastive mode only.");
5406 const int32_t goodLimit
= 1000000; // Pattern w this many groups builds successfully.
5407 const int32_t failLimit
= 10000000; // Pattern exceeds internal limits, fails to compile.
5409 UnicodeString pattern
;
// Append one empty named group per iteration: (?<nn1>)(?<nn2>)...
5412 for (nn
=1; nn
<goodLimit
; nn
++) {
5413 sprintf(nnbuf
, "(?<nn%d>)", nn
);
5414 pattern
.append(UnicodeString(nnbuf
, -1, US_INV
));
5416 UErrorCode status
= U_ZERO_ERROR
;
5417 RegexPattern
*pat
= RegexPattern::compile(pattern
, 0, status
);
// Every generated name must resolve to the group number it was created with.
5419 for (nn
=1; nn
<goodLimit
; nn
++) {
5420 sprintf(nnbuf
, "nn%d", nn
);
5421 int32_t groupNum
= pat
->groupNumberFromName(nnbuf
, -1, status
);
5422 REGEX_ASSERT(nn
== groupNum
);
5423 if (nn
!= groupNum
) {
// Same construction with the larger bound; compilation is expected to fail.
5430 for (nn
=1; nn
<failLimit
; nn
++) {
5431 sprintf(nnbuf
, "(?<nn%d>)", nn
);
5432 pattern
.append(UnicodeString(nnbuf
, -1, US_INV
));
5434 status
= U_ZERO_ERROR
;
5435 pat
= RegexPattern::compile(pattern
, 0, status
);
5436 REGEX_ASSERT(status
== U_REGEX_PATTERN_TOO_BIG
);
5441 //--------------------------------------------------------------
5443 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
5445 //---------------------------------------------------------------
// Compiles two large alternation patterns (pattern1, then pattern2), creates
// a matcher over the same input string `s` for each, and checks that find()
// succeeds with the match starting at offset 0.
// NOTE(review): the declaration of the parse-error argument `pe` and any
// cleanup of REPattern / REMatcher are not visible in this excerpt.
5446 void RegexTest::Bug7651() {
5447 UnicodeString
pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5448 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5449 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5450 UnicodeString
pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5451 UnicodeString
s("#ff @abcd This is test");
5452 RegexPattern
*REPattern
= NULL
;
5453 RegexMatcher
*REMatcher
= NULL
;
5454 UErrorCode status
= U_ZERO_ERROR
;
// First pattern.
5457 REPattern
= RegexPattern::compile(pattern1
, 0, pe
, status
);
5459 REMatcher
= REPattern
->matcher(s
, status
);
5461 REGEX_ASSERT(REMatcher
->find());
5462 REGEX_ASSERT(REMatcher
->start(status
) == 0);
5465 status
= U_ZERO_ERROR
;
// Second pattern: the case described by the comment above pattern2.
5467 REPattern
= RegexPattern::compile(pattern2
, 0, pe
, status
);
5469 REMatcher
= REPattern
->matcher(s
, status
);
5471 REGEX_ASSERT(REMatcher
->find());
5472 REGEX_ASSERT(REMatcher
->start(status
) == 0);
5475 status
= U_ZERO_ERROR
;
// Calls RegexMatcher::group() with `status` already holding a failure code
// and checks that the call leaves that code in place and returns an empty
// string (per the inline comment, this call used to segfault).
5478 void RegexTest::Bug7740() {
5479 UErrorCode status
= U_ZERO_ERROR
;
5480 UnicodeString pattern
= "(a)";
5481 UnicodeString text
= "abcdef";
5482 RegexMatcher
*m
= new RegexMatcher(pattern
, text
, 0, status
);
5484 REGEX_ASSERT(m
->lookingAt(status
));
// Deliberately enter group() with a pre-existing error in status.
5486 status
= U_ILLEGAL_ARGUMENT_ERROR
;
5487 UnicodeString s
= m
->group(1, status
); // Bug 7740: segfault here.
5488 REGEX_ASSERT(status
== U_ILLEGAL_ARGUMENT_ERROR
);
5489 REGEX_ASSERT(s
== "");
5493 // Bug 8479: was crashing with a bogus UnicodeString as input.
// Resets a matcher to the input `str` and checks that matches() reports
// U_ILLEGAL_ARGUMENT_ERROR through status.
// NOTE(review): the declaration and setup of `str` are not visible in this
// excerpt; presumably it is the bogus UnicodeString named in the bug
// description - confirm against the full file.
5495 void RegexTest::Bug8479() {
5496 UErrorCode status
= U_ZERO_ERROR
;
5498 RegexMatcher
* const pMatcher
= new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL
|UREGEX_CASE_INSENSITIVE
, status
);
// Only exercise the matcher if it was constructed successfully.
5500 if (U_SUCCESS(status
))
5504 pMatcher
->reset(str
);
5505 status
= U_ZERO_ERROR
;
5506 pMatcher
->matches(status
);
5507 REGEX_ASSERT(status
== U_ILLEGAL_ARGUMENT_ERROR
);
// Splits "abc.def" with the pattern "." into an array of 10 UnicodeStrings
// and expects split() to report 8 fields. The '.' in the pattern is an
// unescaped regex metacharacter, not a literal dot.
5514 void RegexTest::Bug7029() {
5515 UErrorCode status
= U_ZERO_ERROR
;
5517 RegexMatcher
* const pMatcher
= new RegexMatcher(".", 0, status
);
5518 UnicodeString text
= "abc.def";
5519 UnicodeString splits
[10];
5521 int32_t numFields
= pMatcher
->split(text
, splits
, 10, status
);
5523 REGEX_ASSERT(numFields
== 8);
5528 // This test is checking for the existence of any supplemental characters that case-fold
5529 // to a bmp character.
5531 // At the time of this writing there are none. If any should appear in a subsequent release
5532 // of Unicode, the code in regular expressions compilation that determines the longest
5533 // possible match for a literal string will need to be enhanced.
5535 // See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5536 // for details on what to do in case of a failure of this test.
// Builds the set of supplementary code points (U+10000..U+10FFFF) that have
// the CWCF property, then walks the set by index and asserts that the
// case-folded form of each member is at least 2 UTF-16 code units long.
// The body is compiled only when normalization support is enabled.
// NOTE(review): the declarations of `index` and `c` and the loop's exit
// condition are not visible in this excerpt.
5538 void RegexTest::Bug9283() {
5539 #if !UCONFIG_NO_NORMALIZATION
5540 UErrorCode status
= U_ZERO_ERROR
;
5541 UnicodeSet
supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status
);
// The for statement itself has no termination condition.
5545 for (index
=0; ; index
++) {
5546 c
= supplementalsWithCaseFolding
.charAt(index
);
5550 UnicodeString cf
= UnicodeString(c
).foldCase();
5551 REGEX_ASSERT(cf
.length() >= 2);
5553 #endif /* #if !UCONFIG_NO_NORMALIZATION */
// Compares inv_next against INV_BUFSIZ. If inv_next has reached or exceeded
// INV_BUFSIZ, an error is reported asking for the #define to be increased;
// the logln() call reports the current size and usage.
// NOTE(review): inv_next and INV_BUFSIZ are defined outside this excerpt, and
// the branch structure around the logln() call is not visible here.
5557 void RegexTest::CheckInvBufSize() {
5558 if(inv_next
>=INV_BUFSIZ
) {
5559 errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5560 __FILE__
, INV_BUFSIZ
, inv_next
);
5562 logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__
, INV_BUFSIZ
, inv_next
);
// Opens a URegularExpression from a UText pattern, sets UText input, and
// calls uregex_group() before any match has been attempted. The call must
// fail with U_REGEX_INVALID_STATE and return a length of 0.
// NOTE(review): the declaration of the output buffer `buf` is not visible in
// this excerpt.
5567 void RegexTest::Bug10459() {
5568 UErrorCode status
= U_ZERO_ERROR
;
5569 UnicodeString
patternString("(txt)");
5570 UnicodeString
txtString("txt");
// Wrap both the pattern and the input text in UText objects.
5572 UText
*utext_pat
= utext_openUnicodeString(NULL
, &patternString
, &status
);
5574 UText
*utext_txt
= utext_openUnicodeString(NULL
, &txtString
, &status
);
5577 URegularExpression
*icu_re
= uregex_openUText(utext_pat
, 0, NULL
, &status
);
5580 uregex_setUText(icu_re
, utext_txt
, &status
);
5583 // The bug was that calling uregex_group() before doing a matching operation
5584 // was causing a segfault. Only for Regular Expressions created from UText.
5585 // It should set an U_REGEX_INVALID_STATE.
5588 int32_t len
= uregex_group(icu_re
, 0, buf
, UPRV_LENGTHOF(buf
), &status
);
5589 REGEX_ASSERT(status
== U_REGEX_INVALID_STATE
);
5590 REGEX_ASSERT(len
== 0);
// Release the regular expression and both UText wrappers.
5592 uregex_close(icu_re
);
5593 utext_close(utext_pat
);
5594 utext_close(utext_txt
);
// For every code point cp in 0..0x10FFFF, forms the case-insensitive closure
// of {cp}. For each closure element examined past the isString() test, takes
// the element's first code point, asks
// RegexCompile::findCaseInsensitiveStarters() for that code point's starter
// set, and reports an error if the set does not contain cp.
// NOTE(review): the bodies of the two `if` filters (on UCHAR_CASE_SENSITIVE
// and on isString()) are not visible in this excerpt; presumably they skip to
// the next iteration - confirm against the full file.
5597 void RegexTest::TestCaseInsensitiveStarters() {
5598 // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5599 // become stale because of new Unicode characters.
5600 // If it is stale, rerun the generation tool
5601 // svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
5602 // and replace the embedded data in i18n/regexcmp.cpp
5604 for (UChar32 cp
=0; cp
<=0x10ffff; cp
++) {
5605 if (!u_hasBinaryProperty(cp
, UCHAR_CASE_SENSITIVE
)) {
// Case-insensitive closure of the single code point.
5608 UnicodeSet
s(cp
, cp
);
5609 s
.closeOver(USET_CASE_INSENSITIVE
);
5610 UnicodeSetIterator
setIter(s
);
5611 while (setIter
.next()) {
5612 if (!setIter
.isString()) {
5615 const UnicodeString
&str
= setIter
.getString();
5616 UChar32 firstChar
= str
.char32At(0);
5617 UnicodeSet starters
;
5618 RegexCompile::findCaseInsensitiveStarters(firstChar
, &starters
);
5619 if (!starters
.contains(cp
)) {
5620 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp
, firstChar
);
// Runs TestCase11049() on four pattern/text combinations: an alternation
// pattern and a single-character pattern, each against text that ends in a
// supplementary character (no match expected) and against text that ends in
// a matching 'C' (match expected). __LINE__ is passed so a failure message
// can identify the calling line.
5628 void RegexTest::TestBug11049() {
5629 // Original bug report: pattern with match start consisting of one of several individual characters,
5630 // and the text being matched ending with a supplementary character. find() would read past the
5631 // end of the input text when searching for potential match starting points.
5633 // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5634 // detect the bad read.
5636 TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE
, __LINE__
);
5637 TestCase11049("A|B|C", "string matches at end C", TRUE
, __LINE__
);
5639 // Test again with a pattern starting with a single character,
5640 // which takes a different code path than starting with an OR expression,
5641 // but with similar logic.
5642 TestCase11049("C", "a string \\ud800\\udc00", FALSE
, __LINE__
);
5643 TestCase11049("C", "string matches at end C", TRUE
, __LINE__
);
5646 // Run a single test case from TestBug11049(). Internal function.
// Runs one find() check twice: first over UTF-16 text held in a heap buffer
// allocated to exactly the text's length, then over a UTF-8 conversion of
// the same text.
//   pattern      regex source; \u escapes are expanded with unescape()
//   data         input text; \u escapes are expanded with unescape()
//   expectMatch  the result find() is expected to return
//   lineNumber   caller's line number, included in failure messages
// NOTE(review): the statements that attach `ut` to the matcher before each
// find(), and any utext_close(), are not visible in this excerpt.
5647 void RegexTest::TestCase11049(const char *pattern
, const char *data
, UBool expectMatch
, int32_t lineNumber
) {
5648 UErrorCode status
= U_ZERO_ERROR
;
5649 UnicodeString patternString
= UnicodeString(pattern
).unescape();
5650 LocalPointer
<RegexPattern
> compiledPat(RegexPattern::compile(patternString
, 0, status
));
// Copy the text into a buffer with no spare capacity and no terminator.
5652 UnicodeString dataString
= UnicodeString(data
).unescape();
5653 UChar
*exactBuffer
= new UChar
[dataString
.length()];
5654 dataString
.extract(exactBuffer
, dataString
.length(), status
);
5655 UText
*ut
= utext_openUChars(NULL
, exactBuffer
, dataString
.length(), &status
);
5657 LocalPointer
<RegexMatcher
> matcher(compiledPat
->matcher(status
));
5660 UBool result
= matcher
->find();
5661 if (result
!= expectMatch
) {
5662 errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5663 __FILE__
, lineNumber
, expectMatch
, result
, pattern
, data
);
5666 // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5667 // off-by-one on find() with match at the last code point.
5668 // Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
5669 // because string.unescape() will only shrink it.
5670 char * utf8Buffer
= new char[uprv_strlen(data
)+1];
5671 u_strToUTF8(utf8Buffer
, static_cast<int32_t>(uprv_strlen(data
)+1), NULL
, dataString
.getBuffer(), dataString
.length(), &status
);
// Re-open the existing UText over the UTF-8 buffer (length -1: NUL-terminated).
5673 ut
= utext_openUTF8(ut
, utf8Buffer
, -1, &status
);
5676 result
= matcher
->find();
5677 if (result
!= expectMatch
) {
5678 errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5679 __FILE__
, lineNumber
, expectMatch
, result
, pattern
, data
);
5681 delete [] utf8Buffer
;
5684 delete [] exactBuffer
;
// Builds three oversized patterns and checks that compiling each one fails
// with U_REGEX_PATTERN_TOO_BIG:
//   1. 8,000,000 repetitions of "()";
//   2. "(" + 20,000,000 repetitions of "A++" + "){0}B++";
//   3. literal text appended until the pattern reaches at least 0x00ffffff
//      code units, followed by "X? trailing string".
// NOTE(review): the condition guarding the "Skipping test" message, and how
// patternString is reset before the third case, are not visible in this
// excerpt.
5688 void RegexTest::TestBug11371() {
5690 logln("Skipping test. Runs in exhuastive mode only.");
5693 UErrorCode status
= U_ZERO_ERROR
;
5694 UnicodeString patternString
;
// Case 1: a very large number of empty capture groups.
5696 for (int i
=0; i
<8000000; i
++) {
5697 patternString
.append(UnicodeString("()"));
5699 LocalPointer
<RegexPattern
> compiledPat(RegexPattern::compile(patternString
, 0, status
));
5700 if (status
!= U_REGEX_PATTERN_TOO_BIG
) {
5701 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5702 __FILE__
, __LINE__
, u_errorName(status
));
// Case 2: one group containing a very long run of possessive quantifiers.
5705 status
= U_ZERO_ERROR
;
5706 patternString
= "(";
5707 for (int i
=0; i
<20000000; i
++) {
5708 patternString
.append(UnicodeString("A++"));
5710 patternString
.append(UnicodeString("){0}B++"));
5711 LocalPointer
<RegexPattern
> compiledPat2(RegexPattern::compile(patternString
, 0, status
));
5712 if (status
!= U_REGEX_PATTERN_TOO_BIG
) {
5713 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5714 __FILE__
, __LINE__
, u_errorName(status
));
5717 // Pattern with too much string data, such that string indexes overflow operand data field size
5718 // in compiled instruction.
5719 status
= U_ZERO_ERROR
;
5721 while (patternString
.length() < 0x00ffffff) {
5722 patternString
.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5724 patternString
.append(UnicodeString("X? trailing string"));
5725 LocalPointer
<RegexPattern
> compiledPat3(RegexPattern::compile(patternString
, 0, status
));
5726 if (status
!= U_REGEX_PATTERN_TOO_BIG
) {
5727 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5728 __FILE__
, __LINE__
, u_errorName(status
));
// Checks what is reported for a capture group that did not participate in a
// successful match, using pattern "(A)|(B)" against the text "A":
//   - C API: uregex_group() for group 2 returns length 0 and writes only a
//     NUL terminator into the destination;
//   - C++ UText API: RegexMatcher::group() reports length 1 for group 1 and
//     length 0 for group 2, and start(2) returns -1.
// NOTE(review): the statement that attaches `ut` to `matcher`, and any
// cleanup of `re`, `ut` and `group`, are not visible in this excerpt.
5732 void RegexTest::TestBug11480() {
5733 // C API, get capture group of a group that does not participate in the match.
5734 // (Returns a zero length string, with nul termination,
5735 // indistinguishable from a group with a zero length match.)
5737 UErrorCode status
= U_ZERO_ERROR
;
5738 URegularExpression
*re
= uregex_openC("(A)|(B)", 0, NULL
, &status
);
5740 UnicodeString text
= UNICODE_STRING_SIMPLE("A");
5741 uregex_setText(re
, text
.getBuffer(), text
.length(), &status
);
5743 REGEX_ASSERT(uregex_lookingAt(re
, 0, &status
));
// buf[0..3] are pre-filled with the sentinel value 13. The group is written
// starting at buf+1, so buf[0] and buf[2] show whether anything other than
// the terminator at buf[1] was touched.
5744 UChar buf
[10] = {(UChar
)13, (UChar
)13, (UChar
)13, (UChar
)13};
5745 int32_t length
= uregex_group(re
, 2, buf
+1, UPRV_LENGTHOF(buf
)-1, &status
);
5746 REGEX_ASSERT(length
== 0);
5747 REGEX_ASSERT(buf
[0] == 13);
5748 REGEX_ASSERT(buf
[1] == 0);
5749 REGEX_ASSERT(buf
[2] == 13);
5752 // UText C++ API, length of match is 0 for non-participating matches.
5753 UText ut
= UTEXT_INITIALIZER
;
5754 utext_openUnicodeString(&ut
, &text
, &status
);
5755 RegexMatcher
matcher(UnicodeString("(A)|(B)"), 0, status
);
5758 REGEX_ASSERT(matcher
.lookingAt(0, status
));
5760 // UText C++ API, Capture group 1 matches "A", position 0, length 1.
// groupLen starts at a value no valid result would produce, so a call that
// fails to set it is detectable.
5761 int64_t groupLen
= -666;
5762 UText group
= UTEXT_INITIALIZER
;
5763 matcher
.group(1, &group
, groupLen
, status
);
5765 REGEX_ASSERT(groupLen
== 1);
5766 REGEX_ASSERT(utext_getNativeIndex(&group
) == 0);
5768 // Capture group 2, the (B), does not participate in the match.
5769 matcher
.group(2, &group
, groupLen
, status
);
5771 REGEX_ASSERT(groupLen
== 0);
5772 REGEX_ASSERT(matcher
.start(2, status
) == -1);
// Checks that a time limit set with setTimeLimit() leads to U_REGEX_TIME_OUT
// for patterns built from deeply nested empty groups with large repeat
// counts. Four combinations are covered: greedy and non-greedy loops, each
// over UnicodeString input and over UTF-8 input wrapped in a UText.
// NOTE(review): the match calls made between setTimeLimit()/reset() and each
// assertion are not visible in this excerpt.
5776 void RegexTest::TestBug12884() {
5777 // setTimeLimit() was not effective for empty sub-patterns with large {minimum counts}
5778 UnicodeString
pattern(u
"(((((((){120}){11}){11}){11}){80}){11}){4}");
5779 UnicodeString
text(u
"hello");
5780 UErrorCode status
= U_ZERO_ERROR
;
5781 RegexMatcher
m(pattern
, text
, 0, status
);
5783 m
.setTimeLimit(5, status
);
5785 REGEX_ASSERT(status
== U_REGEX_TIME_OUT
);
5787 // Non-greedy loops. They take a different code path during matching.
5788 UnicodeString
ngPattern(u
"(((((((){120}?){11}?){11}?){11}?){80}?){11}?){4}?");
5789 status
= U_ZERO_ERROR
;
5790 RegexMatcher
ngM(ngPattern
, text
, 0, status
);
5792 ngM
.setTimeLimit(5, status
);
5794 REGEX_ASSERT(status
== U_REGEX_TIME_OUT
);
5796 // UText, wrapping non-UTF-16 text, also takes a different execution path.
5797 const char *text8
= u8
"¿Qué es Unicode? Unicode proporciona un número único para cada"
5798 "carácter, sin importar la plataforma, sin importar el programa,"
5799 "sin importar el idioma.";
5800 status
= U_ZERO_ERROR
;
5801 LocalUTextPointer
ut(utext_openUTF8(NULL
, text8
, -1, &status
));
// Re-run the greedy matcher over the UTF-8 UText.
5803 m
.reset(ut
.getAlias());
5805 REGEX_ASSERT(status
== U_REGEX_TIME_OUT
);
// Re-run the non-greedy matcher over the same UText.
5807 status
= U_ZERO_ERROR
;
5808 ngM
.reset(ut
.getAlias());
5810 REGEX_ASSERT(status
== U_REGEX_TIME_OUT
);
5813 // Bug 13631. A find() of a pattern with a zero length look-behind assertion
5814 // can cause a read past the end of the input text.
5815 // The failure is seen when running this test with Clang's Address Sanitizer.
// Iterates over the array `pats`, stopping at its first null entry. For each
// pattern it constructs a matcher, opens a UText over a single UChar 'a'
// with an explicit length of 1, and calls find() repeatedly until it
// returns false.
// NOTE(review): only the first entry of `pats` is visible in this excerpt,
// as are neither the statement attaching `ut` to the matcher nor the loop
// body and cleanup.
5817 void RegexTest::TestBug13631() {
5818 const UChar
*pats
[] = { u
"(?<!^)",
5822 for (const UChar
**pat
=pats
; *pat
; ++pat
) {
5823 UErrorCode status
= U_ZERO_ERROR
;
5824 UnicodeString
upat(*pat
);
5825 RegexMatcher
matcher(upat
, 0, status
);
// One code unit of input, passed with an explicit length of 1.
5826 const UChar s
=u
'a';
5827 UText
*ut
= utext_openUChars(nullptr, &s
, 1, &status
);
5830 while (matcher
.find()) {
5836 // Bug 13632 Out of bounds memory reference if a replacement string ends with a '$',
5837 // where a following group specification would be expected.
5838 // Failure shows when running the test under Clang's Address Sanitizer.
// Calls uregex_replaceAll() with a two-unit replacement string "x$" that is
// passed with an explicit length and has no NUL terminator, so the trailing
// '$' is the last readable code unit. The call is expected to fail with
// U_REGEX_INVALID_CAPTURE_GROUP_NAME.
// NOTE(review): any uregex_close(re) is not visible in this excerpt.
5840 void RegexTest::TestBug13632() {
5841 UErrorCode status
= U_ZERO_ERROR
;
5842 URegularExpression
*re
= uregex_openC(" ", 0, nullptr, &status
);
5843 const char16_t *sourceString
= u
"Hello, world.";
5844 uregex_setText(re
, sourceString
, u_strlen(sourceString
), &status
);
5846 const int32_t destCap
= 20;
5847 char16_t dest
[destCap
] = {};
5848 const char16_t replacement
[] = {u
'x', u
'$'}; // Not nul terminated string.
5849 uregex_replaceAll(re
, replacement
, 2, dest
, destCap
, &status
);
5851 assertEquals("", U_REGEX_INVALID_CAPTURE_GROUP_NAME
, status
);
// Builds a pattern that begins with 50,000 adjacent "\Q\E" pairs, opens it
// with uregex_open(), and then runs a small sanity check: over the text
// "abcxyz", uregex_find() must succeed with the match starting at index 3.
// NOTE(review): whatever is appended to the pattern after the loop (the part
// that matches at index 3) is not visible in this excerpt.
5855 void RegexTest::TestBug20359() {
5856 // The bug was stack overflow while parsing a pattern with a huge number of adjacent \Q\E
5857 // pairs. (Enter and exit pattern literal quote mode). Logic was correct.
5858 // Changed implementation to loop instead of recursing.
5860 UnicodeString pattern
;
5861 for (int i
=0; i
<50000; ++i
) {
5862 pattern
+= u
"\\Q\\E";
5866 UErrorCode status
= U_ZERO_ERROR
;
5867 LocalURegularExpressionPointer
re(uregex_open(pattern
.getBuffer(), pattern
.length(),
5868 0, nullptr, &status
));
5869 assertSuccess(WHERE
, status
);
5871 // We have passed the point where the bug crashed. The following is a small sanity
5872 // check that the pattern works, that all the \Q\E\Q\E... didn't cause other problems.
5874 uregex_setText(re
.getAlias(), u
"abcxyz", -1, &status
);
5875 assertSuccess(WHERE
, status
);
5876 assertTrue(WHERE
, uregex_find(re
.getAlias(), 0, &status
));
5877 assertEquals(WHERE
, 3, uregex_start(re
.getAlias(), 0, &status
));
5878 assertSuccess(WHERE
, status
);
5881 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */