1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
5 * Copyright (c) 2002-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
12 // ICU Regular Expressions test, part of intltest.
18 PLEASE be careful about ASCII assumptions in this test.
19 This test is one of the worst repeat offenders.
20 If you have questions, contact someone on the ICU PMC
21 who has access to an EBCDIC system.
26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
32 #include "unicode/localpointer.h"
33 #include "unicode/regex.h"
34 #include "unicode/uchar.h"
35 #include "unicode/ucnv.h"
36 #include "unicode/uniset.h"
37 #include "unicode/uregex.h"
38 #include "unicode/usetiter.h"
39 #include "unicode/ustring.h"
40 #include "unicode/utext.h"
50 #define SUPPORT_MUTATING_INPUT_STRING 0
52 //---------------------------------------------------------------------------
54 // Test class boilerplate
56 //---------------------------------------------------------------------------
57 RegexTest::RegexTest()
62 RegexTest::~RegexTest()
68 void RegexTest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* /*par*/ )
70 if (exec
) logln("TestSuite RegexTest: ");
73 TESTCASE_AUTO(API_Match
);
74 TESTCASE_AUTO(API_Replace
);
75 TESTCASE_AUTO(API_Pattern
);
76 #if !UCONFIG_NO_FILE_IO
77 TESTCASE_AUTO(Extended
);
79 TESTCASE_AUTO(Errors
);
80 TESTCASE_AUTO(PerlTests
);
81 TESTCASE_AUTO(Callbacks
);
82 TESTCASE_AUTO(FindProgressCallbacks
);
83 TESTCASE_AUTO(Bug6149
);
84 TESTCASE_AUTO(UTextBasic
);
85 TESTCASE_AUTO(API_Match_UTF8
);
86 TESTCASE_AUTO(API_Replace_UTF8
);
87 TESTCASE_AUTO(API_Pattern_UTF8
);
88 TESTCASE_AUTO(PerlTestsUTF8
);
89 TESTCASE_AUTO(PreAllocatedUTextCAPI
);
90 TESTCASE_AUTO(Bug7651
);
91 TESTCASE_AUTO(Bug7740
);
92 TESTCASE_AUTO(Bug8479
);
93 TESTCASE_AUTO(Bug7029
);
94 TESTCASE_AUTO(CheckInvBufSize
);
95 TESTCASE_AUTO(Bug9283
);
96 TESTCASE_AUTO(Bug10459
);
97 TESTCASE_AUTO(TestCaseInsensitiveStarters
);
98 TESTCASE_AUTO(TestBug11049
);
99 TESTCASE_AUTO(TestBug11371
);
100 TESTCASE_AUTO(TestBug11480
);
101 TESTCASE_AUTO(NamedCapture
);
102 TESTCASE_AUTO(NamedCaptureLimits
);
103 TESTCASE_AUTO(TestBug12884
);
109 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
111 * @see utext_openUTF8
113 static UText
* regextst_openUTF8FromInvariant(UText
* ut
, const char *inv
, int64_t length
, UErrorCode
*status
);
115 //---------------------------------------------------------------------------
117 // Error Checking / Reporting macros used in all of the tests.
119 //---------------------------------------------------------------------------
121 static void utextToPrintable(char *buf
, int32_t bufLen
, UText
*text
) {
122 int64_t oldIndex
= utext_getNativeIndex(text
);
123 utext_setNativeIndex(text
, 0);
125 UChar32 c
= utext_next32From(text
, 0);
126 while ((c
!= U_SENTINEL
) && (bufPtr
< buf
+bufLen
)) {
127 if (0x000020<=c
&& c
<0x00007e) {
131 sprintf(bufPtr
,"U+%04X", c
);
132 bufPtr
+= strlen(bufPtr
)-1;
138 c
= UTEXT_NEXT32(text
);
141 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
142 char *ebuf
= (char*)malloc(bufLen
);
143 uprv_eastrncpy((unsigned char*)ebuf
, (const unsigned char*)buf
, bufLen
);
144 uprv_strncpy(buf
, ebuf
, bufLen
);
147 utext_setNativeIndex(text
, oldIndex
);
151 static char ASSERT_BUF
[1024];
153 const char* RegexTest::extractToAssertBuf(const UnicodeString
& message
) {
154 if(message
.length()==0) {
155 strcpy(ASSERT_BUF
, "[[empty UnicodeString]]");
158 IntlTest::prettify(message
,buf
);
159 if(buf
.length()==0) {
160 strcpy(ASSERT_BUF
, "[[escape() returned 0 chars]]");
162 buf
.extract(0, 0x7FFFFFFF, ASSERT_BUF
, sizeof(ASSERT_BUF
)-1);
163 if(ASSERT_BUF
[0]==0) {
165 for(int32_t i
=0;i
<buf
.length();i
++) {
167 sprintf(ASSERT_BUF
+strlen(ASSERT_BUF
),"\\u%02x",ch
);
172 ASSERT_BUF
[sizeof(ASSERT_BUF
)-1] = 0;
176 #define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,UPRV_LENGTHOF(buf),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}
178 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure. status=%s", \
179 __FILE__, __LINE__, u_errorName(status)); return;}}
181 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}
183 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
184 if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \
185 __LINE__, u_errorName(errcode), u_errorName(status));};}
187 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
188 "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }}
190 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
191 errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
193 // expected: const char * , restricted to invariant characters.
194 // actual: const UnicodeString &
195 #define REGEX_ASSERT_UNISTR(expected, actual) { \
196 if (UnicodeString(expected, -1, US_INV) != (actual)) { \
197 errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \
198 __FILE__, __LINE__, expected, extractToAssertBuf(actual));};}
201 static UBool
testUTextEqual(UText
*uta
, UText
*utb
) {
204 utext_setNativeIndex(uta
, 0);
205 utext_setNativeIndex(utb
, 0);
207 ca
= utext_next32(uta
);
208 cb
= utext_next32(utb
);
212 } while (ca
!= U_SENTINEL
);
218 * @param expected expected text in UTF-8 (not platform) codepage
220 void RegexTest::assertUText(const char *expected
, UText
*actual
, const char *file
, int line
) {
221 UErrorCode status
= U_ZERO_ERROR
;
222 UText expectedText
= UTEXT_INITIALIZER
;
223 utext_openUTF8(&expectedText
, expected
, -1, &status
);
224 if(U_FAILURE(status
)) {
225 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file
, line
, u_errorName(status
), strlen(expected
));
228 if(utext_nativeLength(&expectedText
)==0 && (strlen(expected
)!=0)) {
229 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file
, line
, strlen(expected
));
232 utext_setNativeIndex(actual
, 0);
233 if (!testUTextEqual(&expectedText
, actual
)) {
234 char buf
[201 /*21*/];
235 char expectedBuf
[201];
236 utextToPrintable(buf
, UPRV_LENGTHOF(buf
), actual
);
237 utextToPrintable(expectedBuf
, UPRV_LENGTHOF(expectedBuf
), &expectedText
);
238 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file
, line
, expectedBuf
, (int)utext_nativeLength(&expectedText
), buf
, (int)utext_nativeLength(actual
));
240 utext_close(&expectedText
);
243 * @param expected invariant (platform local text) input
246 void RegexTest::assertUTextInvariant(const char *expected
, UText
*actual
, const char *file
, int line
) {
247 UErrorCode status
= U_ZERO_ERROR
;
248 UText expectedText
= UTEXT_INITIALIZER
;
249 regextst_openUTF8FromInvariant(&expectedText
, expected
, -1, &status
);
250 if(U_FAILURE(status
)) {
251 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file
, line
, u_errorName(status
), strlen(expected
));
254 utext_setNativeIndex(actual
, 0);
255 if (!testUTextEqual(&expectedText
, actual
)) {
256 char buf
[201 /*21*/];
257 char expectedBuf
[201];
258 utextToPrintable(buf
, UPRV_LENGTHOF(buf
), actual
);
259 utextToPrintable(expectedBuf
, UPRV_LENGTHOF(expectedBuf
), &expectedText
);
260 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file
, line
, expectedBuf
, (int)utext_nativeLength(&expectedText
), buf
, (int)utext_nativeLength(actual
));
262 utext_close(&expectedText
);
266 * Assumes utf-8 input
268 #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
270 * Assumes Invariant input
272 #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
275 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
276 * passed into utext_openUTF8. An error will be given if
277 * INV_BUFSIZ is too small. It's only used on EBCDIC systems.
280 #define INV_BUFSIZ 2048 /* increase this if too small */
282 static int64_t inv_next
=0;
284 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
285 static char inv_buf
[INV_BUFSIZ
];
288 static UText
* regextst_openUTF8FromInvariant(UText
*ut
, const char *inv
, int64_t length
, UErrorCode
*status
) {
289 if(length
==-1) length
=strlen(inv
);
290 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
292 return utext_openUTF8(ut
, inv
, length
, status
);
294 if(inv_next
+length
+1>INV_BUFSIZ
) {
295 fprintf(stderr
, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
296 __FILE__
, __LINE__
, INV_BUFSIZ
, (inv_next
+length
+1));
297 *status
= U_MEMORY_ALLOCATION_ERROR
;
301 unsigned char *buf
= (unsigned char*)inv_buf
+inv_next
;
302 uprv_aestrncpy(buf
, (const uint8_t*)inv
, length
);
306 fprintf(stderr
, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ
, inv_next
);
309 return utext_openUTF8(ut
, (const char*)buf
, length
, status
);
314 //---------------------------------------------------------------------------
316 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests
317 // for the LookingAt() and Match() functions.
320 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
322 // The expected results are UBool - TRUE or FALSE.
323 // The input text is unescaped. The pattern is not.
326 //---------------------------------------------------------------------------
328 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
330 UBool
RegexTest::doRegexLMTest(const char *pat
, const char *text
, UBool looking
, UBool match
, int32_t line
) {
331 const UnicodeString
pattern(pat
, -1, US_INV
);
332 const UnicodeString
inputText(text
, -1, US_INV
);
333 UErrorCode status
= U_ZERO_ERROR
;
335 RegexPattern
*REPattern
= NULL
;
336 RegexMatcher
*REMatcher
= NULL
;
339 UnicodeString
patString(pat
, -1, US_INV
);
340 REPattern
= RegexPattern::compile(patString
, 0, pe
, status
);
341 if (U_FAILURE(status
)) {
342 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
343 line
, u_errorName(status
));
346 if (line
==376) { REPattern
->dumpPattern();}
348 UnicodeString
inputString(inputText
);
349 UnicodeString unEscapedInput
= inputString
.unescape();
350 REMatcher
= REPattern
->matcher(unEscapedInput
, status
);
351 if (U_FAILURE(status
)) {
352 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
353 line
, u_errorName(status
));
358 actualmatch
= REMatcher
->lookingAt(status
);
359 if (U_FAILURE(status
)) {
360 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
361 line
, u_errorName(status
));
364 if (actualmatch
!= looking
) {
365 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line
);
369 status
= U_ZERO_ERROR
;
370 actualmatch
= REMatcher
->matches(status
);
371 if (U_FAILURE(status
)) {
372 errln("RegexTest failure in matches() at line %d. Status = %s\n",
373 line
, u_errorName(status
));
376 if (actualmatch
!= match
) {
377 errln("RegexTest: wrong return from matches() at line %d.\n", line
);
381 if (retVal
== FALSE
) {
382 REPattern
->dumpPattern();
391 UBool
RegexTest::doRegexLMTestUTF8(const char *pat
, const char *text
, UBool looking
, UBool match
, int32_t line
) {
392 UText pattern
= UTEXT_INITIALIZER
;
393 int32_t inputUTF8Length
;
394 char *textChars
= NULL
;
395 UText inputText
= UTEXT_INITIALIZER
;
396 UErrorCode status
= U_ZERO_ERROR
;
398 RegexPattern
*REPattern
= NULL
;
399 RegexMatcher
*REMatcher
= NULL
;
402 regextst_openUTF8FromInvariant(&pattern
, pat
, -1, &status
);
403 REPattern
= RegexPattern::compile(&pattern
, 0, pe
, status
);
404 if (U_FAILURE(status
)) {
405 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
406 line
, u_errorName(status
));
410 UnicodeString
inputString(text
, -1, US_INV
);
411 UnicodeString unEscapedInput
= inputString
.unescape();
412 LocalUConverterPointer
UTF8Converter(ucnv_open("UTF8", &status
));
413 ucnv_setFromUCallBack(UTF8Converter
.getAlias(), UCNV_FROM_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &status
);
415 inputUTF8Length
= unEscapedInput
.extract(NULL
, 0, UTF8Converter
.getAlias(), status
);
416 if (U_FAILURE(status
) && status
!= U_BUFFER_OVERFLOW_ERROR
) {
417 // UTF-8 does not allow unpaired surrogates, so this could actually happen
418 logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line
, u_errorName(status
));
419 return TRUE
; // not a failure of the Regex engine
421 status
= U_ZERO_ERROR
; // buffer overflow
422 textChars
= new char[inputUTF8Length
+1];
423 unEscapedInput
.extract(textChars
, inputUTF8Length
+1, UTF8Converter
.getAlias(), status
);
424 utext_openUTF8(&inputText
, textChars
, inputUTF8Length
, &status
);
426 REMatcher
= &REPattern
->matcher(status
)->reset(&inputText
);
427 if (U_FAILURE(status
)) {
428 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
429 line
, u_errorName(status
));
434 actualmatch
= REMatcher
->lookingAt(status
);
435 if (U_FAILURE(status
)) {
436 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
437 line
, u_errorName(status
));
440 if (actualmatch
!= looking
) {
441 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line
);
445 status
= U_ZERO_ERROR
;
446 actualmatch
= REMatcher
->matches(status
);
447 if (U_FAILURE(status
)) {
448 errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
449 line
, u_errorName(status
));
452 if (actualmatch
!= match
) {
453 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line
);
457 if (retVal
== FALSE
) {
458 REPattern
->dumpPattern();
463 utext_close(&inputText
);
464 utext_close(&pattern
);
471 //---------------------------------------------------------------------------
473 // REGEX_ERR Macro + invocation function to simplify writing tests
474 // regex tests for incorrect patterns
477 // REGEX_ERR("pattern", expected error line, column, expected status);
479 //---------------------------------------------------------------------------
480 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
482 void RegexTest::regex_err(const char *pat
, int32_t errLine
, int32_t errCol
,
483 UErrorCode expectedStatus
, int32_t line
) {
484 UnicodeString
pattern(pat
);
486 UErrorCode status
= U_ZERO_ERROR
;
488 RegexPattern
*callerPattern
= NULL
;
491 // Compile the caller's pattern
493 UnicodeString
patString(pat
);
494 callerPattern
= RegexPattern::compile(patString
, 0, pe
, status
);
495 if (status
!= expectedStatus
) {
496 dataerrln("Line %d: unexpected error %s compiling pattern.", line
, u_errorName(status
));
498 if (status
!= U_ZERO_ERROR
) {
499 if (pe
.line
!= errLine
|| pe
.offset
!= errCol
) {
500 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
501 line
, errLine
, errCol
, pe
.line
, pe
.offset
);
506 delete callerPattern
;
509 // Compile again, using a UTF-8-based UText
511 UText patternText
= UTEXT_INITIALIZER
;
512 regextst_openUTF8FromInvariant(&patternText
, pat
, -1, &status
);
513 callerPattern
= RegexPattern::compile(&patternText
, 0, pe
, status
);
514 if (status
!= expectedStatus
) {
515 dataerrln("Line %d: unexpected error %s compiling pattern.", line
, u_errorName(status
));
517 if (status
!= U_ZERO_ERROR
) {
518 if (pe
.line
!= errLine
|| pe
.offset
!= errCol
) {
519 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
520 line
, errLine
, errCol
, pe
.line
, pe
.offset
);
525 delete callerPattern
;
526 utext_close(&patternText
);
531 //---------------------------------------------------------------------------
533 // Basic Check for basic functionality of regex pattern matching.
534 // Avoid the use of REGEX_FIND test macro, which has
535 // substantial dependencies on basic Regex functionality.
537 //---------------------------------------------------------------------------
538 void RegexTest::Basic() {
542 // Debug - slide failing test cases early
546 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
548 UErrorCode status
= U_ZERO_ERROR
;
549 RegexPattern
*pattern
;
550 pattern
= RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE
, pe
, status
);
551 pattern
->dumpPattern();
552 RegexMatcher
*m
= pattern
->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status
);
553 UBool result
= m
->find();
554 printf("result = %d\n", result
);
555 // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
556 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
563 // Pattern with parentheses
565 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE
, FALSE
);
566 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE
, TRUE
);
567 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE
, FALSE
);
572 REGEX_TESTLM("st(abc)*ring", "string", TRUE
, TRUE
);
573 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE
, TRUE
);
574 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE
, TRUE
);
575 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE
, FALSE
);
576 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE
, FALSE
);
578 REGEX_TESTLM("a*", "", TRUE
, TRUE
);
579 REGEX_TESTLM("a*", "b", TRUE
, FALSE
);
585 REGEX_TESTLM(".", "abc", TRUE
, FALSE
);
586 REGEX_TESTLM("...", "abc", TRUE
, TRUE
);
587 REGEX_TESTLM("....", "abc", FALSE
, FALSE
);
588 REGEX_TESTLM(".*", "abcxyz123", TRUE
, TRUE
);
589 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE
, FALSE
);
590 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE
, TRUE
);
591 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE
, TRUE
);
592 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE
, FALSE
);
595 // Patterns with * applied to chars at end of literal string
597 REGEX_TESTLM("abc*", "ab", TRUE
, TRUE
);
598 REGEX_TESTLM("abc*", "abccccc", TRUE
, TRUE
);
601 // Supplemental chars match as single chars, not a pair of surrogates.
603 REGEX_TESTLM(".", "\\U00011000", TRUE
, TRUE
);
604 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE
, TRUE
);
605 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE
, FALSE
);
609 // UnicodeSets in the pattern
611 REGEX_TESTLM("[1-6]", "1", TRUE
, TRUE
);
612 REGEX_TESTLM("[1-6]", "3", TRUE
, TRUE
);
613 REGEX_TESTLM("[1-6]", "7", FALSE
, FALSE
);
614 REGEX_TESTLM("a[1-6]", "a3", TRUE
, TRUE
);
615 REGEX_TESTLM("a[1-6]", "a3", TRUE
, TRUE
);
616 REGEX_TESTLM("a[1-6]b", "a3b", TRUE
, TRUE
);
618 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE
, TRUE
);
619 REGEX_TESTLM("a[0-9]*b", "abc", TRUE
, FALSE
);
620 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE
, TRUE
);
621 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE
, FALSE
); // note that * matches 0 occurrences.
622 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE
, TRUE
);
625 // OR operator in patterns
627 REGEX_TESTLM("(a|b)", "a", TRUE
, TRUE
);
628 REGEX_TESTLM("(a|b)", "b", TRUE
, TRUE
);
629 REGEX_TESTLM("(a|b)", "c", FALSE
, FALSE
);
630 REGEX_TESTLM("a|b", "b", TRUE
, TRUE
);
632 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE
, TRUE
);
633 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE
, FALSE
);
634 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE
, TRUE
);
635 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE
, TRUE
);
636 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE
, TRUE
);
637 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE
, FALSE
);
642 REGEX_TESTLM("ab+", "abbc", TRUE
, FALSE
);
643 REGEX_TESTLM("ab+c", "ac", FALSE
, FALSE
);
644 REGEX_TESTLM("b+", "", FALSE
, FALSE
);
645 REGEX_TESTLM("(abc|def)+", "defabc", TRUE
, TRUE
);
646 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE
, FALSE
);
647 REGEX_TESTLM(".+y", "zippity dooy", TRUE
, TRUE
);
652 REGEX_TESTLM("ab?", "ab", TRUE
, TRUE
);
653 REGEX_TESTLM("ab?", "a", TRUE
, TRUE
);
654 REGEX_TESTLM("ab?", "ac", TRUE
, FALSE
);
655 REGEX_TESTLM("ab?", "abb", TRUE
, FALSE
);
656 REGEX_TESTLM("a(b|c)?d", "abd", TRUE
, TRUE
);
657 REGEX_TESTLM("a(b|c)?d", "acd", TRUE
, TRUE
);
658 REGEX_TESTLM("a(b|c)?d", "ad", TRUE
, TRUE
);
659 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE
, FALSE
);
660 REGEX_TESTLM("a(b|c)?d", "ab", FALSE
, FALSE
);
663 // Escape sequences that become single literal chars, handled internally
664 // by ICU's Unescape.
667 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
668 REGEX_TESTLM("\\a", "\\u0007", TRUE
, TRUE
); // BEL
669 REGEX_TESTLM("\\cL", "\\u000c", TRUE
, TRUE
); // Control-L
670 REGEX_TESTLM("\\e", "\\u001b", TRUE
, TRUE
); // Escape
671 REGEX_TESTLM("\\f", "\\u000c", TRUE
, TRUE
); // Form Feed
672 REGEX_TESTLM("\\n", "\\u000a", TRUE
, TRUE
); // new line
673 REGEX_TESTLM("\\r", "\\u000d", TRUE
, TRUE
); // CR
674 REGEX_TESTLM("\\t", "\\u0009", TRUE
, TRUE
); // Tab
675 REGEX_TESTLM("\\u1234", "\\u1234", TRUE
, TRUE
);
676 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE
, TRUE
);
678 REGEX_TESTLM(".*\\Ax", "xyz", TRUE
, FALSE
); // \A matches only at the beginning of input
679 REGEX_TESTLM(".*\\Ax", " xyz", FALSE
, FALSE
); // \A matches only at the beginning of input
681 // Escape of special chars in patterns
682 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE
, TRUE
);
686 //---------------------------------------------------------------------------
688 // UTextBasic Check for quirks that are specific to the UText
691 //---------------------------------------------------------------------------
692 void RegexTest::UTextBasic() {
693 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
694 UErrorCode status
= U_ZERO_ERROR
;
695 UText pattern
= UTEXT_INITIALIZER
;
696 utext_openUTF8(&pattern
, str_abc
, -1, &status
);
697 RegexMatcher
matcher(&pattern
, 0, status
);
700 UText input
= UTEXT_INITIALIZER
;
701 utext_openUTF8(&input
, str_abc
, -1, &status
);
703 matcher
.reset(&input
);
705 REGEX_ASSERT_UTEXT_UTF8(str_abc
, matcher
.inputText());
707 matcher
.reset(matcher
.inputText());
709 REGEX_ASSERT_UTEXT_UTF8(str_abc
, matcher
.inputText());
711 utext_close(&pattern
);
716 //---------------------------------------------------------------------------
718 // API_Match Test that the API for class RegexMatcher
719 // is present and nominally working, but excluding functions
720 // implementing replace operations.
722 //---------------------------------------------------------------------------
723 void RegexTest::API_Match() {
725 UErrorCode status
=U_ZERO_ERROR
;
729 // Debug - slide failing test cases early
738 // Simple pattern compilation
741 UnicodeString
re("abc");
743 pat2
= RegexPattern::compile(re
, flags
, pe
, status
);
746 UnicodeString inStr1
= "abcdef this is a test";
747 UnicodeString instr2
= "not abc";
748 UnicodeString empty
= "";
752 // Matcher creation and reset.
754 RegexMatcher
*m1
= pat2
->matcher(inStr1
, status
);
756 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
757 REGEX_ASSERT(m1
->input() == inStr1
);
759 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
760 REGEX_ASSERT(m1
->input() == instr2
);
762 REGEX_ASSERT(m1
->input() == inStr1
);
763 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
765 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
766 REGEX_ASSERT(m1
->input() == empty
);
767 REGEX_ASSERT(&m1
->pattern() == pat2
);
770 // reset(pos, status)
773 m1
->reset(4, status
);
775 REGEX_ASSERT(m1
->input() == inStr1
);
776 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
778 m1
->reset(-1, status
);
779 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
780 status
= U_ZERO_ERROR
;
782 m1
->reset(0, status
);
784 status
= U_ZERO_ERROR
;
786 int32_t len
= m1
->input().length();
787 m1
->reset(len
-1, status
);
789 status
= U_ZERO_ERROR
;
791 m1
->reset(len
, status
);
793 status
= U_ZERO_ERROR
;
795 m1
->reset(len
+1, status
);
796 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
797 status
= U_ZERO_ERROR
;
800 // match(pos, status)
803 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
805 REGEX_ASSERT(m1
->matches(3, status
) == FALSE
);
807 REGEX_ASSERT(m1
->matches(5, status
) == FALSE
);
808 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
809 REGEX_ASSERT(m1
->matches(-1, status
) == FALSE
);
810 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
812 // Match() at end of string should fail, but should not
814 status
= U_ZERO_ERROR
;
815 len
= m1
->input().length();
816 REGEX_ASSERT(m1
->matches(len
, status
) == FALSE
);
819 // Match beyond end of string should fail with an error.
820 status
= U_ZERO_ERROR
;
821 REGEX_ASSERT(m1
->matches(len
+1, status
) == FALSE
);
822 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
824 // Successful match at end of string.
826 status
= U_ZERO_ERROR
;
827 RegexMatcher
m("A?", 0, status
); // will match zero length string.
830 len
= inStr1
.length();
831 REGEX_ASSERT(m
.matches(len
, status
) == TRUE
);
834 REGEX_ASSERT(m
.matches(0, status
) == TRUE
);
840 // lookingAt(pos, status)
842 status
= U_ZERO_ERROR
;
843 m1
->reset(instr2
); // "not abc"
844 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
845 REGEX_ASSERT(m1
->lookingAt(5, status
) == FALSE
);
846 REGEX_ASSERT(m1
->lookingAt(3, status
) == FALSE
);
847 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
848 REGEX_ASSERT(m1
->lookingAt(-1, status
) == FALSE
);
849 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
850 status
= U_ZERO_ERROR
;
851 len
= m1
->input().length();
852 REGEX_ASSERT(m1
->lookingAt(len
, status
) == FALSE
);
854 REGEX_ASSERT(m1
->lookingAt(len
+1, status
) == FALSE
);
855 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
864 // RegexMatcher::start();
865 // RegexMatcher::end();
866 // RegexMatcher::groupCount();
871 UErrorCode status
=U_ZERO_ERROR
;
873 UnicodeString
re("01(23(45)67)(.*)");
874 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
876 UnicodeString data
= "0123456789";
878 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
880 REGEX_ASSERT(matcher
->lookingAt(status
) == TRUE
);
881 static const int32_t matchStarts
[] = {0, 2, 4, 8};
882 static const int32_t matchEnds
[] = {10, 8, 6, 10};
884 for (i
=0; i
<4; i
++) {
885 int32_t actualStart
= matcher
->start(i
, status
);
887 if (actualStart
!= matchStarts
[i
]) {
888 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
889 __LINE__
, i
, matchStarts
[i
], actualStart
);
891 int32_t actualEnd
= matcher
->end(i
, status
);
893 if (actualEnd
!= matchEnds
[i
]) {
894 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
895 __LINE__
, i
, matchEnds
[i
], actualEnd
);
899 REGEX_ASSERT(matcher
->start(0, status
) == matcher
->start(status
));
900 REGEX_ASSERT(matcher
->end(0, status
) == matcher
->end(status
));
902 REGEX_ASSERT_FAIL(matcher
->start(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
903 REGEX_ASSERT_FAIL(matcher
->start( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
905 REGEX_ASSERT_FAIL(matcher
->start( 0, status
), U_REGEX_INVALID_STATE
);
907 matcher
->lookingAt(status
);
908 REGEX_ASSERT(matcher
->group(status
) == "0123456789");
909 REGEX_ASSERT(matcher
->group(0, status
) == "0123456789");
910 REGEX_ASSERT(matcher
->group(1, status
) == "234567" );
911 REGEX_ASSERT(matcher
->group(2, status
) == "45" );
912 REGEX_ASSERT(matcher
->group(3, status
) == "89" );
914 REGEX_ASSERT_FAIL(matcher
->group(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
915 REGEX_ASSERT_FAIL(matcher
->group( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
917 REGEX_ASSERT_FAIL(matcher
->group( 0, status
), U_REGEX_INVALID_STATE
);
930 UErrorCode status
=U_ZERO_ERROR
;
932 UnicodeString
re("abc");
933 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
935 UnicodeString data
= ".abc..abc...abc..";
936 // 012345678901234567
938 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
940 REGEX_ASSERT(matcher
->find());
941 REGEX_ASSERT(matcher
->start(status
) == 1);
942 REGEX_ASSERT(matcher
->find());
943 REGEX_ASSERT(matcher
->start(status
) == 6);
944 REGEX_ASSERT(matcher
->find());
945 REGEX_ASSERT(matcher
->start(status
) == 12);
946 REGEX_ASSERT(matcher
->find() == FALSE
);
947 REGEX_ASSERT(matcher
->find() == FALSE
);
950 REGEX_ASSERT(matcher
->find());
951 REGEX_ASSERT(matcher
->start(status
) == 1);
953 REGEX_ASSERT(matcher
->find(0, status
));
954 REGEX_ASSERT(matcher
->start(status
) == 1);
955 REGEX_ASSERT(matcher
->find(1, status
));
956 REGEX_ASSERT(matcher
->start(status
) == 1);
957 REGEX_ASSERT(matcher
->find(2, status
));
958 REGEX_ASSERT(matcher
->start(status
) == 6);
959 REGEX_ASSERT(matcher
->find(12, status
));
960 REGEX_ASSERT(matcher
->start(status
) == 12);
961 REGEX_ASSERT(matcher
->find(13, status
) == FALSE
);
962 REGEX_ASSERT(matcher
->find(16, status
) == FALSE
);
963 REGEX_ASSERT(matcher
->find(17, status
) == FALSE
);
964 REGEX_ASSERT_FAIL(matcher
->start(status
), U_REGEX_INVALID_STATE
);
966 status
= U_ZERO_ERROR
;
967 REGEX_ASSERT_FAIL(matcher
->find(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
968 status
= U_ZERO_ERROR
;
969 REGEX_ASSERT_FAIL(matcher
->find(18, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
971 REGEX_ASSERT(matcher
->groupCount() == 0);
979 // find, with \G in pattern (true if at the end of a previous match).
984 UErrorCode status
=U_ZERO_ERROR
;
986 UnicodeString
re(".*?(?:(\\Gabc)|(abc))", -1, US_INV
);
987 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
989 UnicodeString data
= ".abcabc.abc..";
990 // 012345678901234567
992 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
994 REGEX_ASSERT(matcher
->find());
995 REGEX_ASSERT(matcher
->start(status
) == 0);
996 REGEX_ASSERT(matcher
->start(1, status
) == -1);
997 REGEX_ASSERT(matcher
->start(2, status
) == 1);
999 REGEX_ASSERT(matcher
->find());
1000 REGEX_ASSERT(matcher
->start(status
) == 4);
1001 REGEX_ASSERT(matcher
->start(1, status
) == 4);
1002 REGEX_ASSERT(matcher
->start(2, status
) == -1);
1010 // find with zero length matches, match position should bump ahead
1011 // to prevent loops.
1015 UErrorCode status
=U_ZERO_ERROR
;
1016 RegexMatcher
m("(?= ?)", 0, status
); // This pattern will produce zero-length matches anywhere,
1017 // using an always-true look-ahead.
1019 UnicodeString
s(" ");
1022 if (m
.find() == FALSE
) {
1025 REGEX_ASSERT(m
.start(status
) == i
);
1026 REGEX_ASSERT(m
.end(status
) == i
);
1030 // Check that the bump goes over surrogate pairs OK
1031 s
= UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1035 if (m
.find() == FALSE
) {
1038 REGEX_ASSERT(m
.start(status
) == i
);
1039 REGEX_ASSERT(m
.end(status
) == i
);
1041 REGEX_ASSERT(i
==10);
1044 // find() loop breaking test.
1045 // with pattern of /.?/, should see a series of one char matches, then a single
1046 // match of zero length at the end of the input string.
1048 UErrorCode status
=U_ZERO_ERROR
;
1049 RegexMatcher
m(".?", 0, status
);
1051 UnicodeString
s(" ");
1054 if (m
.find() == FALSE
) {
1057 REGEX_ASSERT(m
.start(status
) == i
);
1058 REGEX_ASSERT(m
.end(status
) == (i
<4 ? i
+1 : i
));
1065 // Matchers with no input string behave as if they had an empty input string.
1069 UErrorCode status
= U_ZERO_ERROR
;
1070 RegexMatcher
m(".?", 0, status
);
1072 REGEX_ASSERT(m
.find());
1073 REGEX_ASSERT(m
.start(status
) == 0);
1074 REGEX_ASSERT(m
.input() == "");
1077 UErrorCode status
= U_ZERO_ERROR
;
1078 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
1079 RegexMatcher
*m
= p
->matcher(status
);
1082 REGEX_ASSERT(m
->find() == FALSE
);
1083 REGEX_ASSERT(m
->input() == "");
1092 UErrorCode status
= U_ZERO_ERROR
;
1093 UnicodeString
testString("This is test data");
1094 RegexMatcher
m(".*", testString
, 0, status
);
1096 REGEX_ASSERT(m
.regionStart() == 0);
1097 REGEX_ASSERT(m
.regionEnd() == testString
.length());
1098 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1099 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1101 m
.region(2,4, status
);
1103 REGEX_ASSERT(m
.matches(status
));
1104 REGEX_ASSERT(m
.start(status
)==2);
1105 REGEX_ASSERT(m
.end(status
)==4);
1109 REGEX_ASSERT(m
.regionStart() == 0);
1110 REGEX_ASSERT(m
.regionEnd() == testString
.length());
1112 UnicodeString
shorterString("short");
1113 m
.reset(shorterString
);
1114 REGEX_ASSERT(m
.regionStart() == 0);
1115 REGEX_ASSERT(m
.regionEnd() == shorterString
.length());
1117 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1118 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(FALSE
));
1119 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
1120 REGEX_ASSERT(&m
== &m
.reset());
1121 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
1123 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(TRUE
));
1124 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1125 REGEX_ASSERT(&m
== &m
.reset());
1126 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1128 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1129 REGEX_ASSERT(&m
== &m
.useTransparentBounds(TRUE
));
1130 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
1131 REGEX_ASSERT(&m
== &m
.reset());
1132 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
1134 REGEX_ASSERT(&m
== &m
.useTransparentBounds(FALSE
));
1135 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1136 REGEX_ASSERT(&m
== &m
.reset());
1137 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1142 // hitEnd() and requireEnd()
1145 UErrorCode status
= U_ZERO_ERROR
;
1146 UnicodeString
testString("aabb");
1147 RegexMatcher
m1(".*", testString
, 0, status
);
1148 REGEX_ASSERT(m1
.lookingAt(status
) == TRUE
);
1149 REGEX_ASSERT(m1
.hitEnd() == TRUE
);
1150 REGEX_ASSERT(m1
.requireEnd() == FALSE
);
1153 status
= U_ZERO_ERROR
;
1154 RegexMatcher
m2("a*", testString
, 0, status
);
1155 REGEX_ASSERT(m2
.lookingAt(status
) == TRUE
);
1156 REGEX_ASSERT(m2
.hitEnd() == FALSE
);
1157 REGEX_ASSERT(m2
.requireEnd() == FALSE
);
1160 status
= U_ZERO_ERROR
;
1161 RegexMatcher
m3(".*$", testString
, 0, status
);
1162 REGEX_ASSERT(m3
.lookingAt(status
) == TRUE
);
1163 REGEX_ASSERT(m3
.hitEnd() == TRUE
);
1164 REGEX_ASSERT(m3
.requireEnd() == TRUE
);
1170 // Compilation error on reset with UChar *
1171 // These were a hazard that people were stumbling over with runtime errors.
1172 // Changed them to compiler errors by adding private methods that more closely
1173 // matched the incorrect use of the functions.
1177 UErrorCode status
= U_ZERO_ERROR
;
1178 UChar ucharString
[20];
1179 RegexMatcher
m(".", 0, status
);
1180 m
.reset(ucharString
); // should not compile.
1182 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
1183 RegexMatcher
*m2
= p
->matcher(ucharString
, status
); // should not compile.
1185 RegexMatcher
m3(".", ucharString
, 0, status
); // Should not compile
1191 // Note: These tests will need to be changed when the regexp engine is
1192 // able to detect and cut short the exponential time behavior on
1193 // this type of match.
1196 UErrorCode status
= U_ZERO_ERROR
;
1197 // Enough 'a's in the string to cause the match to time out.
1198 // (Each additional 'a' doubles the time)
1199 UnicodeString
testString("aaaaaaaaaaaaaaaaaaaaa");
1200 RegexMatcher
matcher("(a+)+b", testString
, 0, status
);
1202 REGEX_ASSERT(matcher
.getTimeLimit() == 0);
1203 matcher
.setTimeLimit(100, status
);
1204 REGEX_ASSERT(matcher
.getTimeLimit() == 100);
1205 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1206 REGEX_ASSERT(status
== U_REGEX_TIME_OUT
);
1209 UErrorCode status
= U_ZERO_ERROR
;
1210 // Few enough 'a's to slip in under the time limit.
1211 UnicodeString
testString("aaaaaaaaaaaaaaaaaa");
1212 RegexMatcher
matcher("(a+)+b", testString
, 0, status
);
1214 matcher
.setTimeLimit(100, status
);
1215 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1223 UErrorCode status
= U_ZERO_ERROR
;
1224 UnicodeString
testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
1226 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1227 // of the '+', and makes the stack frames larger.
1228 RegexMatcher
matcher("(A)+A$", testString
, 0, status
);
1230 // With the default stack, this match should fail to run
1231 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1232 REGEX_ASSERT(status
== U_REGEX_STACK_OVERFLOW
);
1234 // With unlimited stack, it should run
1235 status
= U_ZERO_ERROR
;
1236 matcher
.setStackLimit(0, status
);
1238 REGEX_ASSERT(matcher
.lookingAt(status
) == TRUE
);
1240 REGEX_ASSERT(matcher
.getStackLimit() == 0);
1242 // With a limited stack, the match should fail
1243 status
= U_ZERO_ERROR
;
1244 matcher
.setStackLimit(10000, status
);
1245 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1246 REGEX_ASSERT(status
== U_REGEX_STACK_OVERFLOW
);
1247 REGEX_ASSERT(matcher
.getStackLimit() == 10000);
1250 // A pattern that doesn't save state should work with
1251 // a minimal sized stack
1253 UErrorCode status
= U_ZERO_ERROR
;
1254 UnicodeString testString
= "abc";
1255 RegexMatcher
matcher("abc", testString
, 0, status
);
1257 matcher
.setStackLimit(30, status
);
1259 REGEX_ASSERT(matcher
.matches(status
) == TRUE
);
1261 REGEX_ASSERT(matcher
.getStackLimit() == 30);
1263 // Negative stack sizes should fail
1264 status
= U_ZERO_ERROR
;
1265 matcher
.setStackLimit(1000, status
);
1267 matcher
.setStackLimit(-1, status
);
1268 REGEX_ASSERT(status
== U_ILLEGAL_ARGUMENT_ERROR
);
1269 REGEX_ASSERT(matcher
.getStackLimit() == 1000);
1280 //---------------------------------------------------------------------------
1282 // API_Replace API test for class RegexMatcher, testing the
1283 // Replace family of functions.
1285 //---------------------------------------------------------------------------
1286 void RegexTest::API_Replace() {
// Exercises the RegexMatcher replace family: replaceFirst() and replaceAll()
// on a literal pattern, $n capture-group substitution, escaped replacement
// text, and finally appendReplacement()/appendTail().
// NOTE(review): `flags`, `pe` and `dest` are used below but their declarations
// are not visible in this listing.
1292 UErrorCode status
=U_ZERO_ERROR
;
1294 UnicodeString
re("abc");
1295 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
1297 UnicodeString data
= ".abc..abc...abc..";
1298 // 012345678901234567
1299 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
1302 // Plain vanilla matches.
// replaceFirst() rewrites only the first "abc"; replaceAll() rewrites all three.
1305 dest
= matcher
->replaceFirst("yz", status
);
1307 REGEX_ASSERT(dest
== ".yz..abc...abc..");
1309 dest
= matcher
->replaceAll("yz", status
);
1311 REGEX_ASSERT(dest
== ".yz..yz...yz..");
1314 // Plain vanilla non-matches.
// The expected results below equal d2 itself, i.e. no replacement takes place.
1316 UnicodeString d2
= ".abx..abx...abx..";
1318 dest
= matcher
->replaceFirst("yz", status
);
1320 REGEX_ASSERT(dest
== ".abx..abx...abx..");
1322 dest
= matcher
->replaceAll("yz", status
);
1324 REGEX_ASSERT(dest
== ".abx..abx...abx..");
1327 // Empty source string
// Both replace calls are expected to return an empty string.
1329 UnicodeString d3
= "";
1331 dest
= matcher
->replaceFirst("yz", status
);
1333 REGEX_ASSERT(dest
== "");
1335 dest
= matcher
->replaceAll("yz", status
);
1337 REGEX_ASSERT(dest
== "");
1340 // Empty substitution string
// An empty replacement simply deletes the matched text.
1342 matcher
->reset(data
); // ".abc..abc...abc.."
1343 dest
= matcher
->replaceFirst("", status
);
1345 REGEX_ASSERT(dest
== "...abc...abc..");
1347 dest
= matcher
->replaceAll("", status
);
1349 REGEX_ASSERT(dest
== "........");
1352 // match whole string
1354 UnicodeString d4
= "abc";
1356 dest
= matcher
->replaceFirst("xyz", status
);
1358 REGEX_ASSERT(dest
== "xyz");
1360 dest
= matcher
->replaceAll("xyz", status
);
1362 REGEX_ASSERT(dest
== "xyz");
1365 // Capture Group, simple case
// Pattern "a(..)" against "abcdefg": group 1 is "bc", so "$1$1" yields "bcbc".
1367 UnicodeString
re2("a(..)");
1368 RegexPattern
*pat2
= RegexPattern::compile(re2
, flags
, pe
, status
);
1370 UnicodeString d5
= "abcdefg";
1371 RegexMatcher
*matcher2
= pat2
->matcher(d5
, status
);
1373 dest
= matcher2
->replaceFirst("$1$1", status
);
1375 REGEX_ASSERT(dest
== "bcbcdefg");
// A backslash-escaped $1 in the replacement comes out as the literal text "$1";
// the unescaped $1 is replaced by group 1 ("bc").
1377 dest
= matcher2
->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status
);
1379 REGEX_ASSERT(dest
== "The value of $1 is bc.defg");
// A '$' that is not followed by a group number must put status into a failure
// state; status is cleared again afterwards so later checks start clean.
1381 dest
= matcher2
->replaceFirst("$ by itself, no group number $$$", status
);
1382 REGEX_ASSERT(U_FAILURE(status
));
1383 status
= U_ZERO_ERROR
;
// The replacement has U+1D7CF directly after '$'; the expected output shows it
// being treated as a reference to capture group 1.
1385 UnicodeString replacement
= UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1386 replacement
= replacement
.unescape();
1387 dest
= matcher2
->replaceFirst(replacement
, status
);
1389 REGEX_ASSERT(dest
== "Supplemental Digit 1 bc.defg");
// Pattern "a(..)" has a single capture group, so $5 must be rejected.
1391 REGEX_ASSERT_FAIL(matcher2
->replaceFirst("bad capture group number $5...",status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1395 // Replacement String with \u hex escapes
// The 4-digit hex escape for U+0043 in the substitute must come out as 'C'.
1398 UnicodeString src
= "abc 1 abc 2 abc 3";
1399 UnicodeString substitute
= UNICODE_STRING_SIMPLE("--\\u0043--");
1400 matcher
->reset(src
);
1401 UnicodeString result
= matcher
->replaceAll(substitute
, status
);
1403 REGEX_ASSERT(result
== "--C-- 1 --C-- 2 --C-- 3");
// Same check with an 8-digit escape for supplementary code point U+10000; the
// expected string is built by appending that code point as a UChar32.
1406 UnicodeString src
= "abc !";
1407 UnicodeString substitute
= UNICODE_STRING_SIMPLE("--\\U00010000--");
1408 matcher
->reset(src
);
1409 UnicodeString result
= matcher
->replaceAll(substitute
, status
);
1411 UnicodeString expected
= UnicodeString("--");
1412 expected
.append((UChar32
)0x10000);
1413 expected
.append("-- !");
1414 REGEX_ASSERT(result
== expected
);
1416 // TODO: need more thorough testing of capture substitutions.
// appendReplacement()/appendTail() checks. Each scenario below expects the same
// output: everything from the start of s up to the current match, then "ooh".
1421 status
= U_ZERO_ERROR
;
1422 UnicodeString s
= "The matches start with ss and end with ee ss stuff ee fin";
1423 RegexMatcher
m("ss(.*?)ee", 0, status
);
1425 UnicodeString result
;
1427 // Multiple finds do NOT bump up the previous appendReplacement position.
1431 m
.appendReplacement(result
, "ooh", status
);
1433 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
1435 // After a reset into the interior of a string, appendReplacement still starts at beginning.
1436 status
= U_ZERO_ERROR
;
1438 m
.reset(10, status
);
1441 m
.appendReplacement(result
, "ooh", status
);
1443 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
1445 // find() at interior of string, appendReplacement still starts at beginning.
1446 status
= U_ZERO_ERROR
;
1451 m
.appendReplacement(result
, "ooh", status
);
1453 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
// appendTail() adds the input text that follows the last match (" fin").
1455 m
.appendTail(result
);
1456 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh fin");
1467 //---------------------------------------------------------------------------
1469 // API_Pattern Test that the API for class RegexPattern is
1470 // present and nominally working.
1472 //---------------------------------------------------------------------------
1473 void RegexTest::API_Pattern() {
// Checks the RegexPattern API: default construction, copy construction,
// operator== / operator!=, clone(), flags(), the static matches() helper,
// split() (plain, with capture groups, and with small output limits),
// pattern(), and the class-ID functions.
// NOTE(review): `patb`, `pe` and `n` are used below but their declarations are
// not visible in this listing.
1474 RegexPattern pata
; // Test default constructor to not crash.
1477 REGEX_ASSERT(pata
== patb
);
1478 REGEX_ASSERT(pata
== pata
);
1480 UnicodeString
re1("abc[a-l][m-z]");
1481 UnicodeString
re2("def");
1482 UErrorCode status
= U_ZERO_ERROR
;
1485 RegexPattern
*pat1
= RegexPattern::compile(re1
, 0, pe
, status
);
1486 RegexPattern
*pat2
= RegexPattern::compile(re2
, 0, pe
, status
);
// A compiled pattern equals itself and differs from a default-constructed one.
1488 REGEX_ASSERT(*pat1
== *pat1
);
1489 REGEX_ASSERT(*pat1
!= pata
);
1493 REGEX_ASSERT(patb
== *pat1
);
// Copy construction must yield a pattern equal to its source.
1496 RegexPattern
patc(*pat1
);
1497 REGEX_ASSERT(patc
== *pat1
);
1498 REGEX_ASSERT(patb
== patc
);
// NOTE(review): the next assertion compares the pointers pat1 and pat2, not the
// RegexPattern objects they point to (there is no dereference).
1499 REGEX_ASSERT(pat1
!= pat2
);
1501 REGEX_ASSERT(patb
!= patc
);
1502 REGEX_ASSERT(patb
== *pat2
);
1504 // Compile with no flags.
// The compile() overload without a flags argument must equal compiling with 0.
1505 RegexPattern
*pat1a
= RegexPattern::compile(re1
, pe
, status
);
1506 REGEX_ASSERT(*pat1a
== *pat1
);
1508 REGEX_ASSERT(pat1a
->flags() == 0);
1510 // Compile with different flags should be not equal
1511 RegexPattern
*pat1b
= RegexPattern::compile(re1
, UREGEX_CASE_INSENSITIVE
, pe
, status
);
1514 REGEX_ASSERT(*pat1b
!= *pat1a
);
1515 REGEX_ASSERT(pat1b
->flags() == UREGEX_CASE_INSENSITIVE
);
1516 REGEX_ASSERT(pat1a
->flags() == 0);
// clone() must produce a pattern equal to its source and unequal to pat2.
1520 RegexPattern
*pat1c
= pat1
->clone();
1521 REGEX_ASSERT(*pat1c
== *pat1
);
1522 REGEX_ASSERT(*pat1c
!= *pat2
);
1531 // Verify that a matcher created from a cloned pattern works.
// The letter-run pattern is expected to find "Hello", then "World", then nothing.
1535 UErrorCode status
= U_ZERO_ERROR
;
1536 RegexPattern
*pSource
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status
);
1537 RegexPattern
*pClone
= pSource
->clone();
1539 RegexMatcher
*mFromClone
= pClone
->matcher(status
);
1541 UnicodeString s
= "Hello World";
1542 mFromClone
->reset(s
);
1543 REGEX_ASSERT(mFromClone
->find() == TRUE
);
1544 REGEX_ASSERT(mFromClone
->group(status
) == "Hello");
1545 REGEX_ASSERT(mFromClone
->find() == TRUE
);
1546 REGEX_ASSERT(mFromClone
->group(status
) == "World");
1547 REGEX_ASSERT(mFromClone
->find() == FALSE
);
1553 // matches convenience API
// The expectations show that the pattern has to cover the whole input: ".*u"
// and "abc" are both expected to fail against "random input".
1555 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe
, status
) == TRUE
);
1557 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe
, status
) == FALSE
);
1559 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe
, status
) == TRUE
);
1561 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe
, status
) == TRUE
);
1563 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe
, status
) == FALSE
);
// With status already holding an error on entry, matches() must return FALSE
// and leave that error code unchanged.
1565 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1566 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe
, status
) == FALSE
);
1567 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
// split() tests. The separator is one or more spaces; results land in fields[].
1573 status
= U_ZERO_ERROR
;
1574 pat1
= RegexPattern::compile(" +", pe
, status
);
1576 UnicodeString fields
[10];
1579 n
= pat1
->split("Now is the time", fields
, 10, status
);
1582 REGEX_ASSERT(fields
[0]=="Now");
1583 REGEX_ASSERT(fields
[1]=="is");
1584 REGEX_ASSERT(fields
[2]=="the");
1585 REGEX_ASSERT(fields
[3]=="time");
1586 REGEX_ASSERT(fields
[4]=="");
// Output limit of 2: the last permitted field receives the unsplit remainder.
1588 n
= pat1
->split("Now is the time", fields
, 2, status
);
1591 REGEX_ASSERT(fields
[0]=="Now");
1592 REGEX_ASSERT(fields
[1]=="is the time");
1593 REGEX_ASSERT(fields
[2]=="the"); // left over from previous test
// Output limit of 1: the whole input is returned as the single field.
// NOTE(review): fields[1] is expected to hold a "*" sentinel; the statement
// that stores it is not visible in this listing.
1596 status
= U_ZERO_ERROR
;
1597 n
= pat1
->split("Now is the time", fields
, 1, status
);
1600 REGEX_ASSERT(fields
[0]=="Now is the time");
1601 REGEX_ASSERT(fields
[1]=="*");
1602 status
= U_ZERO_ERROR
;
// Input that starts with a separator is expected to give an empty first field.
1604 n
= pat1
->split(" Now is the time ", fields
, 10, status
);
1607 REGEX_ASSERT(fields
[0]=="");
1608 REGEX_ASSERT(fields
[1]=="Now");
1609 REGEX_ASSERT(fields
[2]=="is");
1610 REGEX_ASSERT(fields
[3]=="the");
1611 REGEX_ASSERT(fields
[4]=="time");
1612 REGEX_ASSERT(fields
[5]=="");
1614 n
= pat1
->split(" ", fields
, 10, status
);
1617 REGEX_ASSERT(fields
[0]=="");
1618 REGEX_ASSERT(fields
[1]=="");
// Empty input: fields[0] is expected to keep a "foo" value, i.e. split() leaves
// it untouched. NOTE(review): the statement storing "foo" is not visible here.
1621 n
= pat1
->split("", fields
, 10, status
);
1624 REGEX_ASSERT(fields
[0]=="foo");
1628 // split, with a pattern with (capture)
// Text captured by the group in the separator pattern is expected as its own
// field, placed between the pieces of surrounding text.
1629 pat1
= RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe
, status
);
1632 status
= U_ZERO_ERROR
;
1633 n
= pat1
->split("<a>Now is <b>the time<c>", fields
, 10, status
);
1636 REGEX_ASSERT(fields
[0]=="");
1637 REGEX_ASSERT(fields
[1]=="a");
1638 REGEX_ASSERT(fields
[2]=="Now is ");
1639 REGEX_ASSERT(fields
[3]=="b");
1640 REGEX_ASSERT(fields
[4]=="the time");
1641 REGEX_ASSERT(fields
[5]=="c");
1642 REGEX_ASSERT(fields
[6]=="");
1643 REGEX_ASSERT(status
==U_ZERO_ERROR
);
1645 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 10, status
);
1648 REGEX_ASSERT(fields
[0]==" ");
1649 REGEX_ASSERT(fields
[1]=="a");
1650 REGEX_ASSERT(fields
[2]=="Now is ");
1651 REGEX_ASSERT(fields
[3]=="b");
1652 REGEX_ASSERT(fields
[4]=="the time");
1653 REGEX_ASSERT(fields
[5]=="c");
1654 REGEX_ASSERT(fields
[6]=="");
// The next cases shrink the output limit (6, 5, 4) to check what ends up in
// the last permitted field and that fields beyond the limit are not written.
1656 status
= U_ZERO_ERROR
;
1658 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 6, status
);
1661 REGEX_ASSERT(fields
[0]==" ");
1662 REGEX_ASSERT(fields
[1]=="a");
1663 REGEX_ASSERT(fields
[2]=="Now is ");
1664 REGEX_ASSERT(fields
[3]=="b");
1665 REGEX_ASSERT(fields
[4]=="the time");
1666 REGEX_ASSERT(fields
[5]==""); // All text following "<c>" field delimiter.
1667 REGEX_ASSERT(fields
[6]=="foo");
1669 status
= U_ZERO_ERROR
;
1671 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 5, status
);
1674 REGEX_ASSERT(fields
[0]==" ");
1675 REGEX_ASSERT(fields
[1]=="a");
1676 REGEX_ASSERT(fields
[2]=="Now is ");
1677 REGEX_ASSERT(fields
[3]=="b");
1678 REGEX_ASSERT(fields
[4]=="the time<c>");
1679 REGEX_ASSERT(fields
[5]=="foo");
1681 status
= U_ZERO_ERROR
;
1683 n
= pat1
->split(" <a>Now is <b>the time", fields
, 5, status
);
1686 REGEX_ASSERT(fields
[0]==" ");
1687 REGEX_ASSERT(fields
[1]=="a");
1688 REGEX_ASSERT(fields
[2]=="Now is ");
1689 REGEX_ASSERT(fields
[3]=="b");
1690 REGEX_ASSERT(fields
[4]=="the time");
1691 REGEX_ASSERT(fields
[5]=="foo");
1693 status
= U_ZERO_ERROR
;
1694 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 4, status
);
1697 REGEX_ASSERT(fields
[0]==" ");
1698 REGEX_ASSERT(fields
[1]=="a");
1699 REGEX_ASSERT(fields
[2]=="Now is ");
1700 REGEX_ASSERT(fields
[3]=="the time<c>");
1701 status
= U_ZERO_ERROR
;
// The separator itself is captured, so '-' and ',' are expected as fields.
1704 pat1
= RegexPattern::compile("([-,])", pe
, status
);
1706 n
= pat1
->split("1-10,20", fields
, 10, status
);
1709 REGEX_ASSERT(fields
[0]=="1");
1710 REGEX_ASSERT(fields
[1]=="-");
1711 REGEX_ASSERT(fields
[2]=="10");
1712 REGEX_ASSERT(fields
[3]==",");
1713 REGEX_ASSERT(fields
[4]=="20");
1716 // Test split of string with empty trailing fields
1717 pat1
= RegexPattern::compile(",", pe
, status
);
1719 n
= pat1
->split("a,b,c,", fields
, 10, status
);
1722 REGEX_ASSERT(fields
[0]=="a");
1723 REGEX_ASSERT(fields
[1]=="b");
1724 REGEX_ASSERT(fields
[2]=="c");
1725 REGEX_ASSERT(fields
[3]=="");
1727 n
= pat1
->split("a,,,", fields
, 10, status
);
1730 REGEX_ASSERT(fields
[0]=="a");
1731 REGEX_ASSERT(fields
[1]=="");
1732 REGEX_ASSERT(fields
[2]=="");
1733 REGEX_ASSERT(fields
[3]=="");
1736 // Split Separator with zero length match.
// ":?" can match the empty string; the expected fields show "abc" being cut
// between every character, with empty first and last fields.
1737 pat1
= RegexPattern::compile(":?", pe
, status
);
1739 n
= pat1
->split("abc", fields
, 10, status
);
1742 REGEX_ASSERT(fields
[0]=="");
1743 REGEX_ASSERT(fields
[1]=="a");
1744 REGEX_ASSERT(fields
[2]=="b");
1745 REGEX_ASSERT(fields
[3]=="c");
1746 REGEX_ASSERT(fields
[4]=="");
1751 // RegexPattern::pattern()
// A default-constructed pattern reports an empty source string; a compiled one
// reports the exact text it was compiled from.
1753 pat1
= new RegexPattern();
1754 REGEX_ASSERT(pat1
->pattern() == "");
1757 pat1
= RegexPattern::compile("(Hello, world)*", pe
, status
);
1759 REGEX_ASSERT(pat1
->pattern() == "(Hello, world)*");
1764 // classID functions
// Pattern and matcher must each report their own class's static ID as their
// dynamic ID; the two IDs must differ from each other and be non-NULL.
1766 pat1
= RegexPattern::compile("(Hello, world)*", pe
, status
);
1768 REGEX_ASSERT(pat1
->getDynamicClassID() == RegexPattern::getStaticClassID());
1769 REGEX_ASSERT(pat1
->getDynamicClassID() != NULL
);
1770 UnicodeString
Hello("Hello, world.");
1771 RegexMatcher
*m
= pat1
->matcher(Hello
, status
);
1772 REGEX_ASSERT(pat1
->getDynamicClassID() != m
->getDynamicClassID());
1773 REGEX_ASSERT(m
->getDynamicClassID() == RegexMatcher::getStaticClassID());
1774 REGEX_ASSERT(m
->getDynamicClassID() != NULL
);
1780 //---------------------------------------------------------------------------
1782 // API_Match_UTF8 Test that the alternate engine for class RegexMatcher
1783 // is present and working, but excluding functions
1784 // implementing replace operations.
1786 //---------------------------------------------------------------------------
1787 void RegexTest::API_Match_UTF8() {
1789 UErrorCode status
=U_ZERO_ERROR
;
1793 // Debug - slide failing test cases early
1802 // Simple pattern compilation
1805 UText re
= UTEXT_INITIALIZER
;
1806 regextst_openUTF8FromInvariant(&re
, "abc", -1, &status
);
1807 REGEX_VERBOSE_TEXT(&re
);
1809 pat2
= RegexPattern::compile(&re
, flags
, pe
, status
);
1812 UText input1
= UTEXT_INITIALIZER
;
1813 UText input2
= UTEXT_INITIALIZER
;
1814 UText empty
= UTEXT_INITIALIZER
;
1815 regextst_openUTF8FromInvariant(&input1
, "abcdef this is a test", -1, &status
);
1816 REGEX_VERBOSE_TEXT(&input1
);
1817 regextst_openUTF8FromInvariant(&input2
, "not abc", -1, &status
);
1818 REGEX_VERBOSE_TEXT(&input2
);
1819 utext_openUChars(&empty
, NULL
, 0, &status
);
1821 int32_t input1Len
= strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1822 int32_t input2Len
= strlen("not abc");
1826 // Matcher creation and reset.
1828 RegexMatcher
*m1
= &pat2
->matcher(status
)->reset(&input1
);
1830 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
1831 const char str_abcdefthisisatest
[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1832 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest
, m1
->inputText());
1834 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
1835 const char str_notabc
[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1836 REGEX_ASSERT_UTEXT_UTF8(str_notabc
, m1
->inputText());
1838 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest
, m1
->inputText());
1839 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
1841 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
1842 REGEX_ASSERT(utext_nativeLength(&empty
) == 0);
1845 // reset(pos, status)
1848 m1
->reset(4, status
);
1850 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest
, m1
->inputText());
1851 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
1853 m1
->reset(-1, status
);
1854 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1855 status
= U_ZERO_ERROR
;
1857 m1
->reset(0, status
);
1859 status
= U_ZERO_ERROR
;
1861 m1
->reset(input1Len
-1, status
);
1863 status
= U_ZERO_ERROR
;
1865 m1
->reset(input1Len
, status
);
1867 status
= U_ZERO_ERROR
;
1869 m1
->reset(input1Len
+1, status
);
1870 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1871 status
= U_ZERO_ERROR
;
1874 // match(pos, status)
1877 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
1879 REGEX_ASSERT(m1
->matches(3, status
) == FALSE
);
1881 REGEX_ASSERT(m1
->matches(5, status
) == FALSE
);
1882 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
1883 REGEX_ASSERT(m1
->matches(-1, status
) == FALSE
);
1884 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1886 // Match() at end of string should fail, but should not
1888 status
= U_ZERO_ERROR
;
1889 REGEX_ASSERT(m1
->matches(input2Len
, status
) == FALSE
);
1892 // Match beyond end of string should fail with an error.
1893 status
= U_ZERO_ERROR
;
1894 REGEX_ASSERT(m1
->matches(input2Len
+1, status
) == FALSE
);
1895 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1897 // Successful match at end of string.
1899 status
= U_ZERO_ERROR
;
1900 RegexMatcher
m("A?", 0, status
); // will match zero length string.
1903 REGEX_ASSERT(m
.matches(input1Len
, status
) == TRUE
);
1906 REGEX_ASSERT(m
.matches(0, status
) == TRUE
);
1912 // lookingAt(pos, status)
1914 status
= U_ZERO_ERROR
;
1915 m1
->reset(&input2
); // "not abc"
1916 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
1917 REGEX_ASSERT(m1
->lookingAt(5, status
) == FALSE
);
1918 REGEX_ASSERT(m1
->lookingAt(3, status
) == FALSE
);
1919 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
1920 REGEX_ASSERT(m1
->lookingAt(-1, status
) == FALSE
);
1921 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1922 status
= U_ZERO_ERROR
;
1923 REGEX_ASSERT(m1
->lookingAt(input2Len
, status
) == FALSE
);
1925 REGEX_ASSERT(m1
->lookingAt(input2Len
+1, status
) == FALSE
);
1926 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1932 utext_close(&input1
);
1933 utext_close(&input2
);
1934 utext_close(&empty
);
1940 // RegexMatcher::start();
1941 // RegexMatcher::end();
1942 // RegexMatcher::groupCount();
1947 UErrorCode status
=U_ZERO_ERROR
;
1948 UText re
=UTEXT_INITIALIZER
;
1949 const char str_01234567_pat
[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1950 utext_openUTF8(&re
, str_01234567_pat
, -1, &status
);
1952 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
1955 UText input
= UTEXT_INITIALIZER
;
1956 const char str_0123456789
[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1957 utext_openUTF8(&input
, str_0123456789
, -1, &status
);
1959 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&input
);
1961 REGEX_ASSERT(matcher
->lookingAt(status
) == TRUE
);
1962 static const int32_t matchStarts
[] = {0, 2, 4, 8};
1963 static const int32_t matchEnds
[] = {10, 8, 6, 10};
1965 for (i
=0; i
<4; i
++) {
1966 int32_t actualStart
= matcher
->start(i
, status
);
1968 if (actualStart
!= matchStarts
[i
]) {
1969 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n",
1970 __FILE__
, __LINE__
, i
, matchStarts
[i
], actualStart
);
1972 int32_t actualEnd
= matcher
->end(i
, status
);
1974 if (actualEnd
!= matchEnds
[i
]) {
1975 errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n",
1976 __FILE__
, __LINE__
, i
, matchEnds
[i
], actualEnd
);
1980 REGEX_ASSERT(matcher
->start(0, status
) == matcher
->start(status
));
1981 REGEX_ASSERT(matcher
->end(0, status
) == matcher
->end(status
));
1983 REGEX_ASSERT_FAIL(matcher
->start(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1984 REGEX_ASSERT_FAIL(matcher
->start( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1986 REGEX_ASSERT_FAIL(matcher
->start( 0, status
), U_REGEX_INVALID_STATE
);
1988 matcher
->lookingAt(status
);
1991 UText destText
= UTEXT_INITIALIZER
;
1992 utext_openUnicodeString(&destText
, &dest
, &status
);
1994 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1995 // Test shallow-clone API
1997 result
= matcher
->group((UText
*)NULL
, group_len
, status
);
1999 REGEX_ASSERT_UTEXT_UTF8(str_0123456789
, result
);
2000 utext_close(result
);
2001 result
= matcher
->group(0, &destText
, group_len
, status
);
2003 REGEX_ASSERT(result
== &destText
);
2004 REGEX_ASSERT_UTEXT_UTF8(str_0123456789
, result
);
2005 // destText is now immutable, reopen it
2006 utext_close(&destText
);
2007 utext_openUnicodeString(&destText
, &dest
, &status
);
2010 result
= matcher
->group(0, NULL
, length
, status
);
2012 REGEX_ASSERT_UTEXT_UTF8(str_0123456789
, result
);
2013 utext_close(result
);
2014 result
= matcher
->group(0, &destText
, length
, status
);
2016 REGEX_ASSERT(result
== &destText
);
2017 REGEX_ASSERT(utext_getNativeIndex(result
) == 0);
2018 REGEX_ASSERT(length
== 10);
2019 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2021 // Capture Group 1 == "234567"
2022 result
= matcher
->group(1, NULL
, length
, status
);
2024 REGEX_ASSERT(utext_getNativeIndex(result
) == 2);
2025 REGEX_ASSERT(length
== 6);
2026 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2027 utext_close(result
);
2029 result
= matcher
->group(1, &destText
, length
, status
);
2031 REGEX_ASSERT(result
== &destText
);
2032 REGEX_ASSERT(utext_getNativeIndex(result
) == 2);
2033 REGEX_ASSERT(length
== 6);
2034 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2035 utext_close(result
);
2037 // Capture Group 2 == "45"
2038 result
= matcher
->group(2, NULL
, length
, status
);
2040 REGEX_ASSERT(utext_getNativeIndex(result
) == 4);
2041 REGEX_ASSERT(length
== 2);
2042 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2043 utext_close(result
);
2045 result
= matcher
->group(2, &destText
, length
, status
);
2047 REGEX_ASSERT(result
== &destText
);
2048 REGEX_ASSERT(utext_getNativeIndex(result
) == 4);
2049 REGEX_ASSERT(length
== 2);
2050 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2051 utext_close(result
);
2053 // Capture Group 3 == "89"
2054 result
= matcher
->group(3, NULL
, length
, status
);
2056 REGEX_ASSERT(utext_getNativeIndex(result
) == 8);
2057 REGEX_ASSERT(length
== 2);
2058 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2059 utext_close(result
);
2061 result
= matcher
->group(3, &destText
, length
, status
);
2063 REGEX_ASSERT(result
== &destText
);
2064 REGEX_ASSERT(utext_getNativeIndex(result
) == 8);
2065 REGEX_ASSERT(length
== 2);
2066 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result
);
2067 utext_close(result
);
2069 // Capture Group number out of range.
2070 status
= U_ZERO_ERROR
;
2071 REGEX_ASSERT_FAIL(matcher
->group(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2072 status
= U_ZERO_ERROR
;
2073 REGEX_ASSERT_FAIL(matcher
->group( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2074 status
= U_ZERO_ERROR
;
2076 REGEX_ASSERT_FAIL(matcher
->group( 0, status
), U_REGEX_INVALID_STATE
);
2081 utext_close(&destText
);
2082 utext_close(&input
);
2092 UErrorCode status
=U_ZERO_ERROR
;
2093 UText re
=UTEXT_INITIALIZER
;
2094 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2095 utext_openUTF8(&re
, str_abc
, -1, &status
);
2097 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
2099 UText input
= UTEXT_INITIALIZER
;
2100 const char str_abcabcabc
[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2101 utext_openUTF8(&input
, str_abcabcabc
, -1, &status
);
2102 // 012345678901234567
2104 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&input
);
2106 REGEX_ASSERT(matcher
->find());
2107 REGEX_ASSERT(matcher
->start(status
) == 1);
2108 REGEX_ASSERT(matcher
->find());
2109 REGEX_ASSERT(matcher
->start(status
) == 6);
2110 REGEX_ASSERT(matcher
->find());
2111 REGEX_ASSERT(matcher
->start(status
) == 12);
2112 REGEX_ASSERT(matcher
->find() == FALSE
);
2113 REGEX_ASSERT(matcher
->find() == FALSE
);
2116 REGEX_ASSERT(matcher
->find());
2117 REGEX_ASSERT(matcher
->start(status
) == 1);
2119 REGEX_ASSERT(matcher
->find(0, status
));
2120 REGEX_ASSERT(matcher
->start(status
) == 1);
2121 REGEX_ASSERT(matcher
->find(1, status
));
2122 REGEX_ASSERT(matcher
->start(status
) == 1);
2123 REGEX_ASSERT(matcher
->find(2, status
));
2124 REGEX_ASSERT(matcher
->start(status
) == 6);
2125 REGEX_ASSERT(matcher
->find(12, status
));
2126 REGEX_ASSERT(matcher
->start(status
) == 12);
2127 REGEX_ASSERT(matcher
->find(13, status
) == FALSE
);
2128 REGEX_ASSERT(matcher
->find(16, status
) == FALSE
);
2129 REGEX_ASSERT(matcher
->find(17, status
) == FALSE
);
2130 REGEX_ASSERT_FAIL(matcher
->start(status
), U_REGEX_INVALID_STATE
);
2132 status
= U_ZERO_ERROR
;
2133 REGEX_ASSERT_FAIL(matcher
->find(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2134 status
= U_ZERO_ERROR
;
2135 REGEX_ASSERT_FAIL(matcher
->find(18, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2137 REGEX_ASSERT(matcher
->groupCount() == 0);
2142 utext_close(&input
);
2148 // find, with \G in pattern (true if at the end of a previous match).
2153 UErrorCode status
=U_ZERO_ERROR
;
2154 UText re
=UTEXT_INITIALIZER
;
2155 const char str_Gabcabc
[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2156 utext_openUTF8(&re
, str_Gabcabc
, -1, &status
);
2158 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
2161 UText input
= UTEXT_INITIALIZER
;
2162 const char str_abcabcabc
[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2163 utext_openUTF8(&input
, str_abcabcabc
, -1, &status
);
2164 // 012345678901234567
2166 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&input
);
2168 REGEX_ASSERT(matcher
->find());
2169 REGEX_ASSERT(matcher
->start(status
) == 0);
2170 REGEX_ASSERT(matcher
->start(1, status
) == -1);
2171 REGEX_ASSERT(matcher
->start(2, status
) == 1);
2173 REGEX_ASSERT(matcher
->find());
2174 REGEX_ASSERT(matcher
->start(status
) == 4);
2175 REGEX_ASSERT(matcher
->start(1, status
) == 4);
2176 REGEX_ASSERT(matcher
->start(2, status
) == -1);
2182 utext_close(&input
);
2187 // find with zero length matches, match position should bump ahead
2188 // to prevent loops.
2192 UErrorCode status
=U_ZERO_ERROR
;
2193 RegexMatcher
m("(?= ?)", 0, status
); // This pattern will produce zero-length matches anywhere,
2194 // using an always-true look-ahead.
2196 UText s
= UTEXT_INITIALIZER
;
2197 utext_openUTF8(&s
, " ", -1, &status
);
2200 if (m
.find() == FALSE
) {
2203 REGEX_ASSERT(m
.start(status
) == i
);
2204 REGEX_ASSERT(m
.end(status
) == i
);
2208 // Check that the bump goes over characters outside the BMP OK
2209 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2210 unsigned char aboveBMP
[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2211 utext_openUTF8(&s
, (char *)aboveBMP
, -1, &status
);
2214 if (m
.find() == FALSE
) {
2217 REGEX_ASSERT(m
.start(status
) == i
);
2218 REGEX_ASSERT(m
.end(status
) == i
);
2220 REGEX_ASSERT(i
==20);
2225 // find() loop breaking test.
2226 // with pattern of /.?/, should see a series of one char matches, then a single
2227 // match of zero length at the end of the input string.
2229 UErrorCode status
=U_ZERO_ERROR
;
2230 RegexMatcher
m(".?", 0, status
);
2232 UText s
= UTEXT_INITIALIZER
;
2233 utext_openUTF8(&s
, " ", -1, &status
);
2236 if (m
.find() == FALSE
) {
2239 REGEX_ASSERT(m
.start(status
) == i
);
2240 REGEX_ASSERT(m
.end(status
) == (i
<4 ? i
+1 : i
));
2249 // Matchers with no input string behave as if they had an empty input string.
2253 UErrorCode status
= U_ZERO_ERROR
;
2254 RegexMatcher
m(".?", 0, status
);
2256 REGEX_ASSERT(m
.find());
2257 REGEX_ASSERT(m
.start(status
) == 0);
2258 REGEX_ASSERT(m
.input() == "");
2261 UErrorCode status
= U_ZERO_ERROR
;
2262 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
2263 RegexMatcher
*m
= p
->matcher(status
);
2266 REGEX_ASSERT(m
->find() == FALSE
);
2267 REGEX_ASSERT(utext_nativeLength(m
->inputText()) == 0);
2276 UErrorCode status
= U_ZERO_ERROR
;
2277 UText testPattern
= UTEXT_INITIALIZER
;
2278 UText testText
= UTEXT_INITIALIZER
;
2279 regextst_openUTF8FromInvariant(&testPattern
, ".*", -1, &status
);
2280 REGEX_VERBOSE_TEXT(&testPattern
);
2281 regextst_openUTF8FromInvariant(&testText
, "This is test data", -1, &status
);
2282 REGEX_VERBOSE_TEXT(&testText
);
2284 RegexMatcher
m(&testPattern
, &testText
, 0, status
);
2286 REGEX_ASSERT(m
.regionStart() == 0);
2287 REGEX_ASSERT(m
.regionEnd() == (int32_t)strlen("This is test data"));
2288 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2289 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2291 m
.region(2,4, status
);
2293 REGEX_ASSERT(m
.matches(status
));
2294 REGEX_ASSERT(m
.start(status
)==2);
2295 REGEX_ASSERT(m
.end(status
)==4);
2299 REGEX_ASSERT(m
.regionStart() == 0);
2300 REGEX_ASSERT(m
.regionEnd() == (int32_t)strlen("This is test data"));
2302 regextst_openUTF8FromInvariant(&testText
, "short", -1, &status
);
2303 REGEX_VERBOSE_TEXT(&testText
);
2305 REGEX_ASSERT(m
.regionStart() == 0);
2306 REGEX_ASSERT(m
.regionEnd() == (int32_t)strlen("short"));
2308 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2309 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(FALSE
));
2310 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
2311 REGEX_ASSERT(&m
== &m
.reset());
2312 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
2314 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(TRUE
));
2315 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2316 REGEX_ASSERT(&m
== &m
.reset());
2317 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2319 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2320 REGEX_ASSERT(&m
== &m
.useTransparentBounds(TRUE
));
2321 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
2322 REGEX_ASSERT(&m
== &m
.reset());
2323 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
2325 REGEX_ASSERT(&m
== &m
.useTransparentBounds(FALSE
));
2326 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2327 REGEX_ASSERT(&m
== &m
.reset());
2328 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2330 utext_close(&testText
);
2331 utext_close(&testPattern
);
2335 // hitEnd() and requireEnd()
2338 UErrorCode status
= U_ZERO_ERROR
;
2339 UText testPattern
= UTEXT_INITIALIZER
;
2340 UText testText
= UTEXT_INITIALIZER
;
2341 const char str_
[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2342 const char str_aabb
[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2343 utext_openUTF8(&testPattern
, str_
, -1, &status
);
2344 utext_openUTF8(&testText
, str_aabb
, -1, &status
);
2346 RegexMatcher
m1(&testPattern
, &testText
, 0, status
);
2347 REGEX_ASSERT(m1
.lookingAt(status
) == TRUE
);
2348 REGEX_ASSERT(m1
.hitEnd() == TRUE
);
2349 REGEX_ASSERT(m1
.requireEnd() == FALSE
);
2352 status
= U_ZERO_ERROR
;
2353 const char str_a
[] = { 0x61, 0x2a, 0x00 }; /* a* */
2354 utext_openUTF8(&testPattern
, str_a
, -1, &status
);
2355 RegexMatcher
m2(&testPattern
, &testText
, 0, status
);
2356 REGEX_ASSERT(m2
.lookingAt(status
) == TRUE
);
2357 REGEX_ASSERT(m2
.hitEnd() == FALSE
);
2358 REGEX_ASSERT(m2
.requireEnd() == FALSE
);
2361 status
= U_ZERO_ERROR
;
2362 const char str_dotstardollar
[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2363 utext_openUTF8(&testPattern
, str_dotstardollar
, -1, &status
);
2364 RegexMatcher
m3(&testPattern
, &testText
, 0, status
);
2365 REGEX_ASSERT(m3
.lookingAt(status
) == TRUE
);
2366 REGEX_ASSERT(m3
.hitEnd() == TRUE
);
2367 REGEX_ASSERT(m3
.requireEnd() == TRUE
);
2370 utext_close(&testText
);
2371 utext_close(&testPattern
);
2376 //---------------------------------------------------------------------------
2378 // API_Replace_UTF8 API test for class RegexMatcher, testing the
2379 // Replace family of functions.
2381 //---------------------------------------------------------------------------
2382 void RegexTest::API_Replace_UTF8() {
2388 UErrorCode status
=U_ZERO_ERROR
;
2390 UText re
=UTEXT_INITIALIZER
;
2391 regextst_openUTF8FromInvariant(&re
, "abc", -1, &status
);
2392 REGEX_VERBOSE_TEXT(&re
);
2393 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
2396 char data
[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2397 // 012345678901234567
2398 UText dataText
= UTEXT_INITIALIZER
;
2399 utext_openUTF8(&dataText
, data
, -1, &status
);
2401 REGEX_VERBOSE_TEXT(&dataText
);
2402 RegexMatcher
*matcher
= &pat
->matcher(status
)->reset(&dataText
);
2405 // Plain vanilla matches.
2408 UText destText
= UTEXT_INITIALIZER
;
2409 utext_openUnicodeString(&destText
, &dest
, &status
);
2412 UText replText
= UTEXT_INITIALIZER
;
2414 const char str_yz
[] = { 0x79, 0x7a, 0x00 }; /* yz */
2415 utext_openUTF8(&replText
, str_yz
, -1, &status
);
2416 REGEX_VERBOSE_TEXT(&replText
);
2417 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2419 const char str_yzabcabc
[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2420 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc
, result
);
2421 utext_close(result
);
2422 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2424 REGEX_ASSERT(result
== &destText
);
2425 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc
, result
);
2427 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2429 const char str_yzyzyz
[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2430 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz
, result
);
2431 utext_close(result
);
2433 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2434 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2436 REGEX_ASSERT(result
== &destText
);
2437 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz
, result
);
2440 // Plain vanilla non-matches.
2442 const char str_abxabxabx
[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2443 utext_openUTF8(&dataText
, str_abxabxabx
, -1, &status
);
2444 matcher
->reset(&dataText
);
2446 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2448 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2449 utext_close(result
);
2450 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2452 REGEX_ASSERT(result
== &destText
);
2453 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2455 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2457 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2458 utext_close(result
);
2459 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2460 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2462 REGEX_ASSERT(result
== &destText
);
2463 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2466 // Empty source string
2468 utext_openUTF8(&dataText
, NULL
, 0, &status
);
2469 matcher
->reset(&dataText
);
2471 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2473 REGEX_ASSERT_UTEXT_UTF8("", result
);
2474 utext_close(result
);
2475 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2477 REGEX_ASSERT(result
== &destText
);
2478 REGEX_ASSERT_UTEXT_UTF8("", result
);
2480 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2482 REGEX_ASSERT_UTEXT_UTF8("", result
);
2483 utext_close(result
);
2484 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2486 REGEX_ASSERT(result
== &destText
);
2487 REGEX_ASSERT_UTEXT_UTF8("", result
);
2490 // Empty substitution string
2492 utext_openUTF8(&dataText
, data
, -1, &status
); // ".abc..abc...abc.."
2493 matcher
->reset(&dataText
);
2495 utext_openUTF8(&replText
, NULL
, 0, &status
);
2496 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2498 const char str_abcabc
[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2499 REGEX_ASSERT_UTEXT_UTF8(str_abcabc
, result
);
2500 utext_close(result
);
2501 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2503 REGEX_ASSERT(result
== &destText
);
2504 REGEX_ASSERT_UTEXT_UTF8(str_abcabc
, result
);
2506 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2508 const char str_dots
[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2509 REGEX_ASSERT_UTEXT_UTF8(str_dots
, result
);
2510 utext_close(result
);
2511 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2512 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2514 REGEX_ASSERT(result
== &destText
);
2515 REGEX_ASSERT_UTEXT_UTF8(str_dots
, result
);
2518 // match whole string
2520 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2521 utext_openUTF8(&dataText
, str_abc
, -1, &status
);
2522 matcher
->reset(&dataText
);
2524 const char str_xyz
[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2525 utext_openUTF8(&replText
, str_xyz
, -1, &status
);
2526 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2528 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2529 utext_close(result
);
2530 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2531 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2533 REGEX_ASSERT(result
== &destText
);
2534 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2536 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2538 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2539 utext_close(result
);
2540 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2541 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2543 REGEX_ASSERT(result
== &destText
);
2544 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2547 // Capture Group, simple case
2549 const char str_add
[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2550 utext_openUTF8(&re
, str_add
, -1, &status
);
2551 RegexPattern
*pat2
= RegexPattern::compile(&re
, flags
, pe
, status
);
2554 const char str_abcdefg
[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2555 utext_openUTF8(&dataText
, str_abcdefg
, -1, &status
);
2556 RegexMatcher
*matcher2
= &pat2
->matcher(status
)->reset(&dataText
);
2559 const char str_11
[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2560 utext_openUTF8(&replText
, str_11
, -1, &status
);
2561 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2563 const char str_bcbcdefg
[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2564 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg
, result
);
2565 utext_close(result
);
2566 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2567 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2569 REGEX_ASSERT(result
== &destText
);
2570 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg
, result
);
2572 const char str_v
[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2573 utext_openUTF8(&replText
, str_v
, -1, &status
);
2574 REGEX_VERBOSE_TEXT(&replText
);
2575 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2577 const char str_Thevalueof1isbcdefg
[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2578 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg
, result
);
2579 utext_close(result
);
2580 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2581 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2583 REGEX_ASSERT(result
== &destText
);
2584 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg
, result
);
2586 const char str_byitselfnogroupnumber
[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2587 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2588 0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2589 utext_openUTF8(&replText
, str_byitselfnogroupnumber
, -1, &status
);
2590 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2592 const char str_byitselfnogroupnumberdefg
[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2593 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg
, result
);
2594 utext_close(result
);
2595 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2596 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2598 REGEX_ASSERT(result
== &destText
);
2599 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg
, result
);
2601 unsigned char supplDigitChars
[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2602 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2603 // 012345678901234567890123456
2604 supplDigitChars
[22] = 0xF0;
2605 supplDigitChars
[23] = 0x9D;
2606 supplDigitChars
[24] = 0x9F;
2607 supplDigitChars
[25] = 0x8F;
2608 utext_openUTF8(&replText
, (char *)supplDigitChars
, -1, &status
);
2610 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2612 const char str_SupplementalDigit1bcdefg
[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2613 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg
, result
);
2614 utext_close(result
);
2615 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2616 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2618 REGEX_ASSERT(result
== &destText
);
2619 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg
, result
);
// Replacement text that refers to capture group $5. matcher2 is built from
// the pattern a(..), which has a single capture group, so both replaceFirst()
// calls below are expected to fail with U_INDEX_OUTOFBOUNDS_ERROR.
2620 const char str_badcapturegroupnumber5
[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5... */
2621 utext_openUTF8(&replText
, str_badcapturegroupnumber5
, -1, &status
);
// Case 1: NULL destination (the same call form whose returned UText the
// earlier tests close with utext_close()).
2622 REGEX_ASSERT_FAIL((result
= matcher2
->replaceFirst(&replText
, NULL
, status
)), U_INDEX_OUTOFBOUNDS_ERROR
);
2623 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2624 utext_close(result
);
// Empty destText (replace its whole native range with nothing) before
// reusing it as the caller-supplied destination.
2625 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
// Case 2: caller-supplied destination; even on failure the returned pointer
// is asserted to be &destText.
2626 REGEX_ASSERT_FAIL((result
= matcher2
->replaceFirst(&replText
, &destText
, status
)), U_INDEX_OUTOFBOUNDS_ERROR
);
2627 REGEX_ASSERT(result
== &destText
);
2628 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2631 // Replacement String with \u hex escapes
2634 const char str_abc1abc2abc3
[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2635 const char str_u0043
[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2636 utext_openUTF8(&dataText
, str_abc1abc2abc3
, -1, &status
);
2637 utext_openUTF8(&replText
, str_u0043
, -1, &status
);
2638 matcher
->reset(&dataText
);
2640 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2642 const char str_C1C2C3
[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2643 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3
, result
);
2644 utext_close(result
);
2645 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2646 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2648 REGEX_ASSERT(result
== &destText
);
2649 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3
, result
);
2652 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2653 utext_openUTF8(&dataText
, str_abc
, -1, &status
);
2654 const char str_U00010000
[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2655 utext_openUTF8(&replText
, str_U00010000
, -1, &status
);
2656 matcher
->reset(&dataText
);
2658 unsigned char expected
[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2665 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2667 REGEX_ASSERT_UTEXT_UTF8((char *)expected
, result
);
2668 utext_close(result
);
2669 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2670 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2672 REGEX_ASSERT(result
== &destText
);
2673 REGEX_ASSERT_UTEXT_UTF8((char *)expected
, result
);
2675 // TODO: need more thorough testing of capture substitutions.
2680 status
= U_ZERO_ERROR
;
2681 const char str_ssee
[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2682 const char str_blah
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2683 const char str_ooh
[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2684 utext_openUTF8(&re
, str_ssee
, -1, &status
);
2685 utext_openUTF8(&dataText
, str_blah
, -1, &status
);
2686 utext_openUTF8(&replText
, str_ooh
, -1, &status
);
2688 RegexMatcher
m(&re
, 0, status
);
2691 UnicodeString result
;
2692 UText resultText
= UTEXT_INITIALIZER
;
2693 utext_openUnicodeString(&resultText
, &result
, &status
);
2695 // Multiple finds do NOT bump up the previous appendReplacement position.
2699 m
.appendReplacement(&resultText
, &replText
, status
);
2701 const char str_blah2
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2702 REGEX_ASSERT_UTEXT_UTF8(str_blah2
, &resultText
);
2704 // After a reset into the interior of a string, appendReplacement still starts at beginning.
2705 status
= U_ZERO_ERROR
;
2707 utext_openUnicodeString(&resultText
, &result
, &status
);
2708 m
.reset(10, status
);
2711 m
.appendReplacement(&resultText
, &replText
, status
);
2713 const char str_blah3
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2714 REGEX_ASSERT_UTEXT_UTF8(str_blah3
, &resultText
);
2716 // find() at interior of string, appendReplacement still starts at beginning.
2717 status
= U_ZERO_ERROR
;
2719 utext_openUnicodeString(&resultText
, &result
, &status
);
2723 m
.appendReplacement(&resultText
, &replText
, status
);
2725 const char str_blah8
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2726 REGEX_ASSERT_UTEXT_UTF8(str_blah8
, &resultText
);
2728 m
.appendTail(&resultText
, status
);
2729 const char str_blah9
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2730 REGEX_ASSERT_UTEXT_UTF8(str_blah9
, &resultText
);
2732 utext_close(&resultText
);
2740 utext_close(&dataText
);
2741 utext_close(&replText
);
2742 utext_close(&destText
);
2747 //---------------------------------------------------------------------------
2749 // API_Pattern_UTF8 Test that the API for class RegexPattern is
2750 // present and nominally working.
2752 //---------------------------------------------------------------------------
2753 void RegexTest::API_Pattern_UTF8() {
2754 RegexPattern pata
; // Test default constructor to not crash.
2757 REGEX_ASSERT(pata
== patb
);
2758 REGEX_ASSERT(pata
== pata
);
2760 UText re1
= UTEXT_INITIALIZER
;
2761 UText re2
= UTEXT_INITIALIZER
;
2762 UErrorCode status
= U_ZERO_ERROR
;
2765 const char str_abcalmz
[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2766 const char str_def
[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2767 utext_openUTF8(&re1
, str_abcalmz
, -1, &status
);
2768 utext_openUTF8(&re2
, str_def
, -1, &status
);
2770 RegexPattern
*pat1
= RegexPattern::compile(&re1
, 0, pe
, status
);
2771 RegexPattern
*pat2
= RegexPattern::compile(&re2
, 0, pe
, status
);
2773 REGEX_ASSERT(*pat1
== *pat1
);
2774 REGEX_ASSERT(*pat1
!= pata
);
2778 REGEX_ASSERT(patb
== *pat1
);
// Copy-construct patc from *pat1 and check that the copy compares equal to
// its source and to patb.
2781 RegexPattern
patc(*pat1
);
2782 REGEX_ASSERT(patc
== *pat1
);
2783 REGEX_ASSERT(patb
== patc
);
// NOTE(review): pat1 and pat2 are RegexPattern pointers, so this assertion
// compares addresses rather than pattern contents - confirm that
// *pat1 != *pat2 was not the intended check.
2784 REGEX_ASSERT(pat1
!= pat2
);
// NOTE(review): original line 2785 is not present in this listing;
// presumably patb is reassigned there, since patb == patc is asserted above
// and patb != patc / patb == *pat2 are asserted below.
2786 REGEX_ASSERT(patb
!= patc
);
2787 REGEX_ASSERT(patb
== *pat2
);
2789 // Compile with no flags.
2790 RegexPattern
*pat1a
= RegexPattern::compile(&re1
, pe
, status
);
2791 REGEX_ASSERT(*pat1a
== *pat1
);
2793 REGEX_ASSERT(pat1a
->flags() == 0);
2795 // Compile with different flags should be not equal
2796 RegexPattern
*pat1b
= RegexPattern::compile(&re1
, UREGEX_CASE_INSENSITIVE
, pe
, status
);
2799 REGEX_ASSERT(*pat1b
!= *pat1a
);
2800 REGEX_ASSERT(pat1b
->flags() == UREGEX_CASE_INSENSITIVE
);
2801 REGEX_ASSERT(pat1a
->flags() == 0);
2805 RegexPattern
*pat1c
= pat1
->clone();
2806 REGEX_ASSERT(*pat1c
== *pat1
);
2807 REGEX_ASSERT(*pat1c
!= *pat2
);
2819 // Verify that a matcher created from a cloned pattern works.
2823 UErrorCode status
= U_ZERO_ERROR
;
2824 UText pattern
= UTEXT_INITIALIZER
;
2825 const char str_pL
[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2826 utext_openUTF8(&pattern
, str_pL
, -1, &status
);
2828 RegexPattern
*pSource
= RegexPattern::compile(&pattern
, 0, status
);
2829 RegexPattern
*pClone
= pSource
->clone();
2831 RegexMatcher
*mFromClone
= pClone
->matcher(status
);
2834 UText input
= UTEXT_INITIALIZER
;
2835 const char str_HelloWorld
[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2836 utext_openUTF8(&input
, str_HelloWorld
, -1, &status
);
2837 mFromClone
->reset(&input
);
2838 REGEX_ASSERT(mFromClone
->find() == TRUE
);
2839 REGEX_ASSERT(mFromClone
->group(status
) == "Hello");
2840 REGEX_ASSERT(mFromClone
->find() == TRUE
);
2841 REGEX_ASSERT(mFromClone
->group(status
) == "World");
2842 REGEX_ASSERT(mFromClone
->find() == FALSE
);
2846 utext_close(&input
);
2847 utext_close(&pattern
);
2851 // matches convenience API
// Exercises the static RegexPattern::matches() helpers. The pattern and
// input UTexts are opened over ASCII-valued byte arrays (see the file-header
// warning about ASCII/EBCDIC assumptions).
2854 UErrorCode status
= U_ZERO_ERROR
;
2855 UText pattern
= UTEXT_INITIALIZER
;
2856 UText input
= UTEXT_INITIALIZER
;
2858 const char str_randominput
[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2859 utext_openUTF8(&input
, str_randominput
, -1, &status
);
// UText overload: ".*" against "random input" is expected to match.
2861 const char str_dotstar
[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2862 utext_openUTF8(&pattern
, str_dotstar
, -1, &status
);
2863 REGEX_ASSERT(RegexPattern::matches(&pattern
, &input
, pe
, status
) == TRUE
);
// NOTE(review): from here on `pattern` is re-opened before each assertion,
// but the assertions pass string literals instead of &pattern/&input, so the
// UText that was just opened is not what gets matched - confirm which
// overload these cases are meant to cover.
2866 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2867 utext_openUTF8(&pattern
, str_abc
, -1, &status
);
2868 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe
, status
) == FALSE
);
2871 const char str_nput
[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2872 utext_openUTF8(&pattern
, str_nput
, -1, &status
);
2873 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe
, status
) == TRUE
);
2876 utext_openUTF8(&pattern
, str_randominput
, -1, &status
);
2877 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe
, status
) == TRUE
);
2880 const char str_u
[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2881 utext_openUTF8(&pattern
, str_u
, -1, &status
);
2882 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe
, status
) == FALSE
);
// Incoming-failure case: status is pre-set to an error code; the assertions
// check that matches() returns FALSE and that status still holds the same
// code afterwards.
2885 utext_openUTF8(&input
, str_abc
, -1, &status
);
2886 utext_openUTF8(&pattern
, str_abc
, -1, &status
);
2887 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
2888 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe
, status
) == FALSE
);
2889 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
// Release the two stack UTexts opened in this section.
2891 utext_close(&input
);
2892 utext_close(&pattern
);
2899 status
= U_ZERO_ERROR
;
2900 const char str_spaceplus
[] = { 0x20, 0x2b, 0x00 }; /* + */
2901 utext_openUTF8(&re1
, str_spaceplus
, -1, &status
);
2902 pat1
= RegexPattern::compile(&re1
, pe
, status
);
2904 UnicodeString fields
[10];
2907 n
= pat1
->split("Now is the time", fields
, 10, status
);
2910 REGEX_ASSERT(fields
[0]=="Now");
2911 REGEX_ASSERT(fields
[1]=="is");
2912 REGEX_ASSERT(fields
[2]=="the");
2913 REGEX_ASSERT(fields
[3]=="time");
2914 REGEX_ASSERT(fields
[4]=="");
2916 n
= pat1
->split("Now is the time", fields
, 2, status
);
2919 REGEX_ASSERT(fields
[0]=="Now");
2920 REGEX_ASSERT(fields
[1]=="is the time");
2921 REGEX_ASSERT(fields
[2]=="the"); // left over from previous test
2924 status
= U_ZERO_ERROR
;
2925 n
= pat1
->split("Now is the time", fields
, 1, status
);
2928 REGEX_ASSERT(fields
[0]=="Now is the time");
2929 REGEX_ASSERT(fields
[1]=="*");
2930 status
= U_ZERO_ERROR
;
2932 n
= pat1
->split(" Now is the time ", fields
, 10, status
);
2935 REGEX_ASSERT(fields
[0]=="");
2936 REGEX_ASSERT(fields
[1]=="Now");
2937 REGEX_ASSERT(fields
[2]=="is");
2938 REGEX_ASSERT(fields
[3]=="the");
2939 REGEX_ASSERT(fields
[4]=="time");
2940 REGEX_ASSERT(fields
[5]=="");
2941 REGEX_ASSERT(fields
[6]=="");
2944 n
= pat1
->split(" ", fields
, 10, status
);
2947 REGEX_ASSERT(fields
[0]=="");
2948 REGEX_ASSERT(fields
[1]=="");
2949 REGEX_ASSERT(fields
[2]=="*");
2952 n
= pat1
->split("", fields
, 10, status
);
2955 REGEX_ASSERT(fields
[0]=="foo");
2959 // split, with a pattern with (capture)
2960 regextst_openUTF8FromInvariant(&re1
, "<(\\w*)>", -1, &status
);
2961 pat1
= RegexPattern::compile(&re1
, pe
, status
);
2964 status
= U_ZERO_ERROR
;
2965 fields
[6] = fields
[7] = "*";
2966 n
= pat1
->split("<a>Now is <b>the time<c>", fields
, 10, status
);
2969 REGEX_ASSERT(fields
[0]=="");
2970 REGEX_ASSERT(fields
[1]=="a");
2971 REGEX_ASSERT(fields
[2]=="Now is ");
2972 REGEX_ASSERT(fields
[3]=="b");
2973 REGEX_ASSERT(fields
[4]=="the time");
2974 REGEX_ASSERT(fields
[5]=="c");
2975 REGEX_ASSERT(fields
[6]=="");
2976 REGEX_ASSERT(fields
[7]=="*");
2977 REGEX_ASSERT(status
==U_ZERO_ERROR
);
2979 fields
[6] = fields
[7] = "*";
2980 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 10, status
);
2983 REGEX_ASSERT(fields
[0]==" ");
2984 REGEX_ASSERT(fields
[1]=="a");
2985 REGEX_ASSERT(fields
[2]=="Now is ");
2986 REGEX_ASSERT(fields
[3]=="b");
2987 REGEX_ASSERT(fields
[4]=="the time");
2988 REGEX_ASSERT(fields
[5]=="c");
2989 REGEX_ASSERT(fields
[6]=="");
2990 REGEX_ASSERT(fields
[7]=="*");
2992 status
= U_ZERO_ERROR
;
2994 n
= pat1
->split(" <a>Now is <b>the time<c> ", fields
, 6, status
);
2997 REGEX_ASSERT(fields
[0]==" ");
2998 REGEX_ASSERT(fields
[1]=="a");
2999 REGEX_ASSERT(fields
[2]=="Now is ");
3000 REGEX_ASSERT(fields
[3]=="b");
3001 REGEX_ASSERT(fields
[4]=="the time");
3002 REGEX_ASSERT(fields
[5]==" ");
3003 REGEX_ASSERT(fields
[6]=="foo");
3005 status
= U_ZERO_ERROR
;
3007 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 5, status
);
3010 REGEX_ASSERT(fields
[0]==" ");
3011 REGEX_ASSERT(fields
[1]=="a");
3012 REGEX_ASSERT(fields
[2]=="Now is ");
3013 REGEX_ASSERT(fields
[3]=="b");
3014 REGEX_ASSERT(fields
[4]=="the time<c>");
3015 REGEX_ASSERT(fields
[5]=="foo");
3017 status
= U_ZERO_ERROR
;
3019 n
= pat1
->split(" <a>Now is <b>the time", fields
, 5, status
);
3022 REGEX_ASSERT(fields
[0]==" ");
3023 REGEX_ASSERT(fields
[1]=="a");
3024 REGEX_ASSERT(fields
[2]=="Now is ");
3025 REGEX_ASSERT(fields
[3]=="b");
3026 REGEX_ASSERT(fields
[4]=="the time");
3027 REGEX_ASSERT(fields
[5]=="foo");
3029 status
= U_ZERO_ERROR
;
3030 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 4, status
);
3033 REGEX_ASSERT(fields
[0]==" ");
3034 REGEX_ASSERT(fields
[1]=="a");
3035 REGEX_ASSERT(fields
[2]=="Now is ");
3036 REGEX_ASSERT(fields
[3]=="the time<c>");
3037 status
= U_ZERO_ERROR
;
3040 regextst_openUTF8FromInvariant(&re1
, "([-,])", -1, &status
);
3041 pat1
= RegexPattern::compile(&re1
, pe
, status
);
3043 n
= pat1
->split("1-10,20", fields
, 10, status
);
3046 REGEX_ASSERT(fields
[0]=="1");
3047 REGEX_ASSERT(fields
[1]=="-");
3048 REGEX_ASSERT(fields
[2]=="10");
3049 REGEX_ASSERT(fields
[3]==",");
3050 REGEX_ASSERT(fields
[4]=="20");
3055 // split of a UText based string, with library allocating output UTexts.
3058 status
= U_ZERO_ERROR
;
// The delimiter pattern is wrapped in a capture group; the assertions below
// expect each ':' delimiter to come back as its own field.
3059 RegexMatcher
matcher(UnicodeString("(:)"), 0, status
);
3060 UnicodeString
stringToSplit("first:second:third");
3061 UText
*textToSplit
= utext_openUnicodeString(NULL
, &stringToSplit
, &status
);
// Every slot starts out NULL, so split() is handed no pre-existing UTexts.
3064 UText
*splits
[10] = {NULL
};
3065 int32_t numFields
= matcher
.split(textToSplit
, splits
, UPRV_LENGTHOF(splits
), status
);
// Three words plus the two captured ':' delimiters = 5 fields; the slot
// after the last field is expected to remain NULL.
3067 REGEX_ASSERT(numFields
== 5);
3068 REGEX_ASSERT_UTEXT_INVARIANT("first", splits
[0]);
3069 REGEX_ASSERT_UTEXT_INVARIANT(":", splits
[1]);
3070 REGEX_ASSERT_UTEXT_INVARIANT("second", splits
[2]);
3071 REGEX_ASSERT_UTEXT_INVARIANT(":", splits
[3]);
3072 REGEX_ASSERT_UTEXT_INVARIANT("third", splits
[4]);
3073 REGEX_ASSERT(splits
[5] == NULL
);
// Walk all slots of splits and close the output UTexts (original line 3076
// is not present in this listing).
3075 for (int i
=0; i
<UPRV_LENGTHOF(splits
); i
++) {
3077 utext_close(splits
[i
]);
3081 utext_close(textToSplit
);
3086 // RegexPattern::pattern() and patternText()
3088 pat1
= new RegexPattern();
3089 REGEX_ASSERT(pat1
->pattern() == "");
3090 REGEX_ASSERT_UTEXT_UTF8("", pat1
->patternText(status
));
3092 const char *helloWorldInvariant
= "(Hello, world)*";
3093 regextst_openUTF8FromInvariant(&re1
, helloWorldInvariant
, -1, &status
);
3094 pat1
= RegexPattern::compile(&re1
, pe
, status
);
3096 REGEX_ASSERT_UNISTR("(Hello, world)*", pat1
->pattern());
3097 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1
->patternText(status
));
3104 //---------------------------------------------------------------------------
3106 // Extended A more thorough check for features of regex patterns
3107 // The test cases are in a separate data file,
3108 // source/tests/testdata/regextst.txt
3109 // A description of the test data format is included in that file.
3111 //---------------------------------------------------------------------------
3114 RegexTest::getPath(char buffer
[2048], const char *filename
) {
3115 UErrorCode status
=U_ZERO_ERROR
;
3116 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
3117 if (U_FAILURE(status
)) {
3118 errln("ERROR: loadTestData() failed - %s", u_errorName(status
));
3122 strcpy(buffer
, testDataDirectory
);
3123 strcat(buffer
, filename
);
3127 void RegexTest::Extended() {
3129 const char *srcPath
;
3130 UErrorCode status
= U_ZERO_ERROR
;
3131 int32_t lineNum
= 0;
3134 // Open and read the test data file.
3136 srcPath
=getPath(tdd
, "regextst.txt");
3138 return; /* something went wrong, error already output */
3142 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, "utf-8", status
);
3143 if (U_FAILURE(status
)) {
3144 return; /* something went wrong, error already output */
3148 // Put the test data into a UnicodeString
3150 UnicodeString
testString(FALSE
, testData
, len
);
3152 RegexMatcher
quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status
);
3153 RegexMatcher
commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status
);
3154 RegexMatcher
flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status
);
3156 RegexMatcher
lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString
, 0, status
);
3157 UnicodeString testPattern
; // The pattern for test from the test file.
3158 UnicodeString testFlags
; // the flags for a test.
3159 UnicodeString matchString
; // The marked up string to be used as input
3161 if (U_FAILURE(status
)){
3162 dataerrln("Construct RegexMatcher() error - %s", u_errorName(status
));
3168 // Loop over the test data file, once per line.
3170 while (lineMat
.find()) {
3172 if (U_FAILURE(status
)) {
3173 errln("%s:%d: ICU Error \"%s\"", srcPath
, lineNum
, u_errorName(status
));
3176 status
= U_ZERO_ERROR
;
3177 UnicodeString testLine
= lineMat
.group(1, status
);
3178 if (testLine
.length() == 0) {
3183 // Parse the test line. Skip blank and comment only lines.
3184 // Separate out the three main fields - pattern, flags, target.
3187 commentMat
.reset(testLine
);
3188 if (commentMat
.lookingAt(status
)) {
3189 // This line is a comment, or blank.
3194 // Pull out the pattern field, remove it from the test file line.
3196 quotedStuffMat
.reset(testLine
);
3197 if (quotedStuffMat
.lookingAt(status
)) {
3198 testPattern
= quotedStuffMat
.group(2, status
);
3199 testLine
.remove(0, quotedStuffMat
.end(0, status
));
3201 errln("Bad pattern (missing quotes?) at %s:%d", srcPath
, lineNum
);
3207 // Pull out the flags from the test file line.
3209 flagsMat
.reset(testLine
);
3210 flagsMat
.lookingAt(status
); // Will always match, possibly an empty string.
3211 testFlags
= flagsMat
.group(1, status
);
3212 if (flagsMat
.group(2, status
).length() > 0) {
3213 errln("Bad Match flag at line %d. Scanning %c\n",
3214 lineNum
, flagsMat
.group(2, status
).charAt(0));
3217 testLine
.remove(0, flagsMat
.end(0, status
));
3220 // Pull out the match string, as a whole.
3221 // We'll process the <tags> later.
3223 quotedStuffMat
.reset(testLine
);
3224 if (quotedStuffMat
.lookingAt(status
)) {
3225 matchString
= quotedStuffMat
.group(2, status
);
3226 testLine
.remove(0, quotedStuffMat
.end(0, status
));
3228 errln("Bad match string at test file line %d", lineNum
);
3233 // The only thing left from the input line should be an optional trailing comment.
3235 commentMat
.reset(testLine
);
3236 if (commentMat
.lookingAt(status
) == FALSE
) {
3237 errln("Line %d: unexpected characters at end of test line.", lineNum
);
3244 regex_find(testPattern
, testFlags
, matchString
, srcPath
, lineNum
);
3253 //---------------------------------------------------------------------------
3255 // regex_find(pattern, flags, inputString, lineNumber)
3257 // Function to run a single test from the Extended (data driven) tests.
3258 // See file test/testdata/regextst.txt for a description of the
3259 // pattern and inputString fields, and the allowed flags.
3260 // lineNumber is the source line in regextst.txt of the test.
3262 //---------------------------------------------------------------------------
3265 // Set a value into a UVector at the position specified by a decimal number in
3266 // a UnicodeString. This is a utility function needed by the actual test function, regex_find().
3268 static void set(UVector
&vec
, int32_t val
, UnicodeString index
) {
3269 UErrorCode status
=U_ZERO_ERROR
;
3271 for (int32_t i
=0; i
<index
.length(); i
++) {
3272 int32_t d
=u_charDigitValue(index
.charAt(i
));
3276 while (vec
.size()<idx
+1) {vec
.addElement(-1, status
);}
3277 vec
.setElementAt(val
, idx
);
3280 static void setInt(UVector
&vec
, int32_t val
, int32_t idx
) {
3281 UErrorCode status
=U_ZERO_ERROR
;
3282 while (vec
.size()<idx
+1) {vec
.addElement(-1, status
);}
3283 vec
.setElementAt(val
, idx
);
3286 static UBool
utextOffsetToNative(UText
*utext
, int32_t unistrOffset
, int32_t& nativeIndex
)
3288 UBool couldFind
= TRUE
;
3289 UTEXT_SETNATIVEINDEX(utext
, 0);
3291 while (i
< unistrOffset
) {
3292 UChar32 c
= UTEXT_NEXT32(utext
);
3293 if (c
!= U_SENTINEL
) {
3300 nativeIndex
= (int32_t)UTEXT_GETNATIVEINDEX(utext
);
3305 void RegexTest::regex_find(const UnicodeString
&pattern
,
3306 const UnicodeString
&flags
,
3307 const UnicodeString
&inputString
,
3308 const char *srcPath
,
3310 UnicodeString unEscapedInput
;
3311 UnicodeString deTaggedInput
;
3313 int32_t patternUTF8Length
, inputUTF8Length
;
3314 char *patternChars
= NULL
, *inputChars
= NULL
;
3315 UText patternText
= UTEXT_INITIALIZER
;
3316 UText inputText
= UTEXT_INITIALIZER
;
3317 UConverter
*UTF8Converter
= NULL
;
3319 UErrorCode status
= U_ZERO_ERROR
;
3321 RegexPattern
*parsePat
= NULL
;
3322 RegexMatcher
*parseMatcher
= NULL
;
3323 RegexPattern
*callerPattern
= NULL
, *UTF8Pattern
= NULL
;
3324 RegexMatcher
*matcher
= NULL
, *UTF8Matcher
= NULL
;
3325 UVector
groupStarts(status
);
3326 UVector
groupEnds(status
);
3327 UVector
groupStartsUTF8(status
);
3328 UVector
groupEndsUTF8(status
);
3329 UBool isMatch
= FALSE
, isUTF8Match
= FALSE
;
3330 UBool failed
= FALSE
;
3333 UBool useMatchesFunc
= FALSE
;
3334 UBool useLookingAtFunc
= FALSE
;
3335 int32_t regionStart
= -1;
3336 int32_t regionEnd
= -1;
3337 int32_t regionStartUTF8
= -1;
3338 int32_t regionEndUTF8
= -1;
3342 // Compile the caller's pattern
3344 uint32_t bflags
= 0;
3345 if (flags
.indexOf((UChar
)0x69) >= 0) { // 'i' flag
3346 bflags
|= UREGEX_CASE_INSENSITIVE
;
3348 if (flags
.indexOf((UChar
)0x78) >= 0) { // 'x' flag
3349 bflags
|= UREGEX_COMMENTS
;
3351 if (flags
.indexOf((UChar
)0x73) >= 0) { // 's' flag
3352 bflags
|= UREGEX_DOTALL
;
3354 if (flags
.indexOf((UChar
)0x6d) >= 0) { // 'm' flag
3355 bflags
|= UREGEX_MULTILINE
;
3358 if (flags
.indexOf((UChar
)0x65) >= 0) { // 'e' flag
3359 bflags
|= UREGEX_ERROR_ON_UNKNOWN_ESCAPES
;
3361 if (flags
.indexOf((UChar
)0x44) >= 0) { // 'D' flag
3362 bflags
|= UREGEX_UNIX_LINES
;
3364 if (flags
.indexOf((UChar
)0x51) >= 0) { // 'Q' flag
3365 bflags
|= UREGEX_LITERAL
;
3369 callerPattern
= RegexPattern::compile(pattern
, bflags
, pe
, status
);
3370 if (status
!= U_ZERO_ERROR
) {
3371 #if UCONFIG_NO_BREAK_ITERATION==1
3372 // 'v' test flag means that the test pattern should not compile if ICU was configured
3373 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3374 if (flags
.indexOf((UChar
)0x76) >= 0 /*'v'*/ && status
== U_UNSUPPORTED_ERROR
) {
3375 goto cleanupAndReturn
;
3378 if (flags
.indexOf((UChar
)0x45) >= 0) { // flags contain 'E'
3379 // Expected pattern compilation error.
3380 if (flags
.indexOf((UChar
)0x64) >= 0) { // flags contain 'd'
3381 logln("Pattern Compile returns \"%s\"", u_errorName(status
));
3383 goto cleanupAndReturn
;
3385 // Unexpected pattern compilation error.
3386 dataerrln("Line %d: error %s compiling pattern.", line
, u_errorName(status
));
3387 goto cleanupAndReturn
;
3391 UTF8Converter
= ucnv_open("UTF8", &status
);
3392 ucnv_setFromUCallBack(UTF8Converter
, UCNV_FROM_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &status
);
3394 patternUTF8Length
= pattern
.extract(NULL
, 0, UTF8Converter
, status
);
3395 status
= U_ZERO_ERROR
; // buffer overflow
3396 patternChars
= new char[patternUTF8Length
+1];
3397 pattern
.extract(patternChars
, patternUTF8Length
+1, UTF8Converter
, status
);
3398 utext_openUTF8(&patternText
, patternChars
, patternUTF8Length
, &status
);
3400 if (status
== U_ZERO_ERROR
) {
3401 UTF8Pattern
= RegexPattern::compile(&patternText
, bflags
, pe
, status
);
3403 if (status
!= U_ZERO_ERROR
) {
3404 #if UCONFIG_NO_BREAK_ITERATION==1
3405 // 'v' test flag means that the test pattern should not compile if ICU was configured
3406 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3407 if (flags
.indexOf((UChar
)0x76) >= 0 /*'v'*/ && status
== U_UNSUPPORTED_ERROR
) {
3408 goto cleanupAndReturn
;
3411 if (flags
.indexOf((UChar
)0x45) >= 0) { // flags contain 'E'
3412 // Expected pattern compilation error.
3413 if (flags
.indexOf((UChar
)0x64) >= 0) { // flags contain 'd'
3414 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status
));
3416 goto cleanupAndReturn
;
3418 // Unexpected pattern compilation error.
3419 errln("Line %d: error %s compiling pattern. (UTF8)", line
, u_errorName(status
));
3420 goto cleanupAndReturn
;
3425 if (UTF8Pattern
== NULL
) {
3426 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3427 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath
, line
);
3428 status
= U_ZERO_ERROR
;
3431 if (flags
.indexOf((UChar
)0x64) >= 0) { // 'd' flag
3432 callerPattern
->dumpPattern();
3435 if (flags
.indexOf((UChar
)0x45) >= 0) { // 'E' flag
3436 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath
, line
);
3437 goto cleanupAndReturn
;
3442 // Number of times find() should be called on the test string, default to 1
3445 for (i
=2; i
<=9; i
++) {
3446 if (flags
.indexOf((UChar
)(0x30 + i
)) >= 0) { // digit flag
3447 if (numFinds
!= 1) {
3448 errln("Line %d: more than one digit flag. Scanning %d.", line
, i
);
3449 goto cleanupAndReturn
;
3455 // 'M' flag. Use matches() instead of find()
3456 if (flags
.indexOf((UChar
)0x4d) >= 0) {
3457 useMatchesFunc
= TRUE
;
3459 if (flags
.indexOf((UChar
)0x4c) >= 0) {
3460 useLookingAtFunc
= TRUE
;
3464 // Find the tags in the input data, remove them, and record the group boundary
3467 parsePat
= RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe
, status
);
3468 REGEX_CHECK_STATUS_L(line
);
3470 unEscapedInput
= inputString
.unescape();
3471 parseMatcher
= parsePat
->matcher(unEscapedInput
, status
);
3472 REGEX_CHECK_STATUS_L(line
);
3473 while(parseMatcher
->find()) {
3474 parseMatcher
->appendReplacement(deTaggedInput
, "", status
);
3476 UnicodeString groupNum
= parseMatcher
->group(2, status
);
3477 if (groupNum
== "r") {
3478 // <r> or </r>, a region specification within the string
3479 if (parseMatcher
->group(1, status
) == "/") {
3480 regionEnd
= deTaggedInput
.length();
3482 regionStart
= deTaggedInput
.length();
3485 // <digits> or </digits>, a group match boundary tag.
3486 if (parseMatcher
->group(1, status
) == "/") {
3487 set(groupEnds
, deTaggedInput
.length(), groupNum
);
3489 set(groupStarts
, deTaggedInput
.length(), groupNum
);
3493 parseMatcher
->appendTail(deTaggedInput
);
3494 REGEX_ASSERT_L(groupStarts
.size() == groupEnds
.size(), line
);
3495 if ((regionStart
>=0 || regionEnd
>=0) && (regionStart
<0 || regionStart
>regionEnd
)) {
3496 errln("mismatched <r> tags");
3498 goto cleanupAndReturn
;
3502 // Configure the matcher according to the flags specified with this test.
3504 matcher
= callerPattern
->matcher(deTaggedInput
, status
);
3505 REGEX_CHECK_STATUS_L(line
);
3506 if (flags
.indexOf((UChar
)0x74) >= 0) { // 't' trace flag
3507 matcher
->setTrace(TRUE
);
3510 if (UTF8Pattern
!= NULL
) {
3511 inputUTF8Length
= deTaggedInput
.extract(NULL
, 0, UTF8Converter
, status
);
3512 status
= U_ZERO_ERROR
; // buffer overflow
3513 inputChars
= new char[inputUTF8Length
+1];
3514 deTaggedInput
.extract(inputChars
, inputUTF8Length
+1, UTF8Converter
, status
);
3515 utext_openUTF8(&inputText
, inputChars
, inputUTF8Length
, &status
);
3517 if (status
== U_ZERO_ERROR
) {
3518 UTF8Matcher
= &UTF8Pattern
->matcher(status
)->reset(&inputText
);
3519 REGEX_CHECK_STATUS_L(line
);
3522 if (UTF8Matcher
== NULL
) {
3523 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3524 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath
, line
);
3525 status
= U_ZERO_ERROR
;
3530 // Generate native indices for UTF8 versions of region and capture group info
3532 if (UTF8Matcher
!= NULL
) {
3533 if (flags
.indexOf((UChar
)0x74) >= 0) { // 't' trace flag
3534 UTF8Matcher
->setTrace(TRUE
);
3536 if (regionStart
>=0) (void) utextOffsetToNative(&inputText
, regionStart
, regionStartUTF8
);
3537 if (regionEnd
>=0) (void) utextOffsetToNative(&inputText
, regionEnd
, regionEndUTF8
);
3539 // Fill out the native index UVector info.
3540 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3541 for (i
=0; i
<groupStarts
.size(); i
++) {
3542 int32_t start
= groupStarts
.elementAti(i
);
3543 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3546 if (!utextOffsetToNative(&inputText
, start
, startUTF8
)) {
3547 errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line
, i
, start
);
3549 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3551 setInt(groupStartsUTF8
, startUTF8
, i
);
3554 int32_t end
= groupEnds
.elementAti(i
);
3555 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3558 if (!utextOffsetToNative(&inputText
, end
, endUTF8
)) {
3559 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line
, i
, end
);
3561 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3563 setInt(groupEndsUTF8
, endUTF8
, i
);
3568 if (regionStart
>=0) {
3569 matcher
->region(regionStart
, regionEnd
, status
);
3570 REGEX_CHECK_STATUS_L(line
);
3571 if (UTF8Matcher
!= NULL
) {
3572 UTF8Matcher
->region(regionStartUTF8
, regionEndUTF8
, status
);
3573 REGEX_CHECK_STATUS_L(line
);
3576 if (flags
.indexOf((UChar
)0x61) >= 0) { // 'a' anchoring bounds flag
3577 matcher
->useAnchoringBounds(FALSE
);
3578 if (UTF8Matcher
!= NULL
) {
3579 UTF8Matcher
->useAnchoringBounds(FALSE
);
3582 if (flags
.indexOf((UChar
)0x62) >= 0) { // 'b' transparent bounds flag
3583 matcher
->useTransparentBounds(TRUE
);
3584 if (UTF8Matcher
!= NULL
) {
3585 UTF8Matcher
->useTransparentBounds(TRUE
);
3592 // Do a find on the de-tagged input using the caller's pattern
3593 // TODO: error on count>1 and not find().
3594 // error on both matches() and lookingAt().
3596 for (i
=0; i
<numFinds
; i
++) {
3597 if (useMatchesFunc
) {
3598 isMatch
= matcher
->matches(status
);
3599 if (UTF8Matcher
!= NULL
) {
3600 isUTF8Match
= UTF8Matcher
->matches(status
);
3602 } else if (useLookingAtFunc
) {
3603 isMatch
= matcher
->lookingAt(status
);
3604 if (UTF8Matcher
!= NULL
) {
3605 isUTF8Match
= UTF8Matcher
->lookingAt(status
);
3608 isMatch
= matcher
->find();
3609 if (UTF8Matcher
!= NULL
) {
3610 isUTF8Match
= UTF8Matcher
->find();
3614 matcher
->setTrace(FALSE
);
3616 UTF8Matcher
->setTrace(FALSE
);
3618 if (U_FAILURE(status
)) {
3619 errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status
));
3623 // Match up the groups from the find() with the groups from the tags
3626 // number of tags should match number of groups from find operation.
3627 // matcher->groupCount does not include group 0, the entire match, hence the +1.
3628 // G option in test means that capture group data is not available in the
3629 // expected results, so the check needs to be suppressed.
3630 if (isMatch
== FALSE
&& groupStarts
.size() != 0) {
3631 dataerrln("Error at line %d: Match expected, but none found.", line
);
3633 goto cleanupAndReturn
;
3634 } else if (UTF8Matcher
!= NULL
&& isUTF8Match
== FALSE
&& groupStarts
.size() != 0) {
3635 errln("Error at line %d: Match expected, but none found. (UTF8)", line
);
3637 goto cleanupAndReturn
;
3639 if (isMatch
&& groupStarts
.size() == 0) {
3640 errln("Error at line %d: No match expected, but one found at position %d.", line
, matcher
->start(status
));
3643 if (UTF8Matcher
&& isUTF8Match
&& groupStarts
.size() == 0) {
3644 errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line
, UTF8Matcher
->start(status
));
3648 if (flags
.indexOf((UChar
)0x47 /*G*/) >= 0) {
3649 // Only check for match / no match. Don't check capture groups.
3650 goto cleanupAndReturn
;
3653 REGEX_CHECK_STATUS_L(line
);
3654 for (i
=0; i
<=matcher
->groupCount(); i
++) {
3655 int32_t expectedStart
= (i
>= groupStarts
.size()? -1 : groupStarts
.elementAti(i
));
3656 int32_t expectedStartUTF8
= (i
>= groupStartsUTF8
.size()? -1 : groupStartsUTF8
.elementAti(i
));
3657 if (matcher
->start(i
, status
) != expectedStart
) {
3658 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
3659 line
, i
, expectedStart
, matcher
->start(i
, status
));
3661 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3662 } else if (UTF8Matcher
!= NULL
&& UTF8Matcher
->start(i
, status
) != expectedStartUTF8
) {
3663 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
3664 line
, i
, expectedStartUTF8
, UTF8Matcher
->start(i
, status
));
3666 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3669 int32_t expectedEnd
= (i
>= groupEnds
.size()? -1 : groupEnds
.elementAti(i
));
3670 int32_t expectedEndUTF8
= (i
>= groupEndsUTF8
.size()? -1 : groupEndsUTF8
.elementAti(i
));
3671 if (matcher
->end(i
, status
) != expectedEnd
) {
3672 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
3673 line
, i
, expectedEnd
, matcher
->end(i
, status
));
3675 // Error on end position; keep going; real error is probably yet to come as group
3676 // end positions work from end of the input data towards the front.
3677 } else if (UTF8Matcher
!= NULL
&& UTF8Matcher
->end(i
, status
) != expectedEndUTF8
) {
3678 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
3679 line
, i
, expectedEndUTF8
, UTF8Matcher
->end(i
, status
));
3681 // Error on end position; keep going; real error is probably yet to come as group
3682 // end positions work from end of the input data towards the front.
3685 if ( matcher
->groupCount()+1 < groupStarts
.size()) {
3686 errln("Error at line %d: Expected %d capture groups, found %d.",
3687 line
, groupStarts
.size()-1, matcher
->groupCount());
3690 else if (UTF8Matcher
!= NULL
&& UTF8Matcher
->groupCount()+1 < groupStarts
.size()) {
3691 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3692 line
, groupStarts
.size()-1, UTF8Matcher
->groupCount());
3696 if ((flags
.indexOf((UChar
)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3697 matcher
->requireEnd() == TRUE
) {
3698 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line
);
3700 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3701 UTF8Matcher
->requireEnd() == TRUE
) {
3702 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line
);
3706 if ((flags
.indexOf((UChar
)0x79) >= 0) && // 'y' flag: RequireEnd() == true
3707 matcher
->requireEnd() == FALSE
) {
3708 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line
);
3710 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x79) >= 0) && // 'Y' flag: RequireEnd() == false
3711 UTF8Matcher
->requireEnd() == FALSE
) {
3712 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line
);
3716 if ((flags
.indexOf((UChar
)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3717 matcher
->hitEnd() == TRUE
) {
3718 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line
);
3720 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3721 UTF8Matcher
->hitEnd() == TRUE
) {
3722 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line
);
3726 if ((flags
.indexOf((UChar
)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3727 matcher
->hitEnd() == FALSE
) {
3728 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line
);
3730 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3731 UTF8Matcher
->hitEnd() == FALSE
) {
3732 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line
);
3739 infoln((UnicodeString
)"\""+pattern
+(UnicodeString
)"\" "
3740 +flags
+(UnicodeString
)" \""+inputString
+(UnicodeString
)"\"");
3741 // callerPattern->dump();
3743 delete parseMatcher
;
3748 delete callerPattern
;
3750 utext_close(&inputText
);
3751 delete[] inputChars
;
3752 utext_close(&patternText
);
3753 delete[] patternChars
;
3754 ucnv_close(UTF8Converter
);
3760 //---------------------------------------------------------------------------
3762 // Errors Check for error handling in patterns.
3764 //---------------------------------------------------------------------------
3765 void RegexTest::Errors() {
3766 // \escape sequences that aren't implemented yet.
3767 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3769 // Missing close parentheses
3770 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN
);
3771 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN
);
3772 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN
);
3774 // Extra close paren
3775 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN
);
3776 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN
);
3777 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN
);
3779 // Look-ahead, Look-behind
3780 // TODO: add tests for unbounded length look-behinds.
3781 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX
); // illegal construct
3783 // Attempt to use non-default flags
3786 UErrorCode status
= U_ZERO_ERROR
;
3787 int32_t flags
= UREGEX_CANON_EQ
|
3788 UREGEX_COMMENTS
| UREGEX_DOTALL
|
3790 RegexPattern
*pat1
= RegexPattern::compile(".*", flags
, pe
, status
);
3791 REGEX_ASSERT(status
== U_REGEX_UNIMPLEMENTED
);
3796 // Quantifiers are allowed only after something that can be quantified.
3797 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX
);
3798 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX
);
3799 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX
);
3801 // Mal-formed {min,max} quantifiers
3802 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL
);
3803 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN
);
3804 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL
);
3805 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL
);
3806 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL
);
3807 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG
);
3808 REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG
); // Overflows int during scan
3809 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG
); // Overflows regex binary format
3810 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG
);
3813 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX
);
3815 // Invalid Back Reference \0
3816 // For ICU 3.8 and earlier
3817 // For ICU versions newer than 3.8, \0 introduces an octal escape.
3819 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE
);
3824 //-------------------------------------------------------------------------------
3826 // Read a text data file, convert it to UChars, and return the data
3827 // in one big UChar * buffer, which the caller must delete.
3829 //--------------------------------------------------------------------------------
3830 UChar
*RegexTest::ReadAndConvertFile(const char *fileName
, int32_t &ulen
,
3831 const char *defEncoding
, UErrorCode
&status
) {
3832 UChar
*retPtr
= NULL
;
3833 char *fileBuf
= NULL
;
3834 UConverter
* conv
= NULL
;
3838 if (U_FAILURE(status
)) {
3845 f
= fopen(fileName
, "rb");
3847 dataerrln("Error opening test data file %s\n", fileName
);
3848 status
= U_FILE_ACCESS_ERROR
;
3857 fseek( f
, 0, SEEK_END
);
3858 fileSize
= ftell(f
);
3859 fileBuf
= new char[fileSize
];
3860 fseek(f
, 0, SEEK_SET
);
3861 amt_read
= fread(fileBuf
, 1, fileSize
, f
);
3862 if (amt_read
!= fileSize
|| fileSize
<= 0) {
3863 errln("Error reading test data file.");
3864 goto cleanUpAndReturn
;
3868 // Look for a Unicode Signature (BOM) on the data just read
3870 int32_t signatureLength
;
3871 const char * fileBufC
;
3872 const char* encoding
;
3875 encoding
= ucnv_detectUnicodeSignature(
3876 fileBuf
, fileSize
, &signatureLength
, &status
);
3877 if(encoding
!=NULL
){
3878 fileBufC
+= signatureLength
;
3879 fileSize
-= signatureLength
;
3881 encoding
= defEncoding
;
3882 if (strcmp(encoding
, "utf-8") == 0) {
3883 errln("file %s is missing its BOM", fileName
);
3888 // Open a converter to take the rule file to UTF-16
3890 conv
= ucnv_open(encoding
, &status
);
3891 if (U_FAILURE(status
)) {
3892 goto cleanUpAndReturn
;
3896 // Convert the rules to UChar.
3897 // Preflight first to determine required buffer size.
3899 ulen
= ucnv_toUChars(conv
,
3905 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
3906 // Buffer Overflow is expected from the preflight operation.
3907 status
= U_ZERO_ERROR
;
3909 retPtr
= new UChar
[ulen
+1];
3922 if (U_FAILURE(status
)) {
3923 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
3932 //-------------------------------------------------------------------------------
3934 // PerlTests - Run Perl's regular expression tests
3935 // The input file for this test is re_tests, the standard regular
3936 // expression test data distributed with the Perl source code.
3938 // Here is Perl's description of the test data file:
3940 // # The tests are in a separate file 't/op/re_tests'.
3941 // # Each line in that file is a separate test.
3942 // # There are five columns, separated by tabs.
3944 // # Column 1 contains the pattern, optionally enclosed in C<''>.
3945 // # Modifiers can be put after the closing C<'>.
3947 // # Column 2 contains the string to be matched.
3949 // # Column 3 contains the expected result:
3950 // # y expect a match
3951 // # n expect no match
3952 // # c expect an error
3953 // # B test exposes a known bug in Perl, should be skipped
3954 // # b test exposes a known bug in Perl, should be skipped if noamp
3956 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3958 // # Column 4 contains a string, usually C<$&>.
3960 // # Column 5 contains the expected result of double-quote
3961 // # interpolating that string after the match, or start of error message.
3963 // # Column 6, if present, contains a reason why the test is skipped.
3964 // # This is printed with "skipped", for harness to pick up.
3966 // # \n in the tests are interpolated, as are variables of the form ${\w+}.
3968 // # If you want to add a regular expression test that can't be expressed
3969 // # in this format, don't add it here: put it in op/pat.t instead.
3971 // For ICU, if field 3 contains an 'i', the test will be skipped.
3972 // The test exposes some known incompatibility between ICU and Perl regexps.
3973 // (The i is in addition to whatever was there before.)
3975 //-------------------------------------------------------------------------------
3976 void RegexTest::PerlTests() {
3978 const char *srcPath
;
3979 UErrorCode status
= U_ZERO_ERROR
;
3983 // Open and read the test data file.
3985 srcPath
=getPath(tdd
, "re_tests.txt");
3987 return; /* something went wrong, error already output */
3991 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, "iso-8859-1", status
);
3992 if (U_FAILURE(status
)) {
3993 return; /* something went wrong, error already output */
3997 // Put the test data into a UnicodeString
3999 UnicodeString
testDataString(FALSE
, testData
, len
);
4002 // Regex to break the input file into lines, and strip the new lines.
4003 // One line per match, capture group one is the desired data.
4005 RegexPattern
* linePat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe
, status
);
4006 if (U_FAILURE(status
)) {
4007 dataerrln("RegexPattern::compile() error");
4010 RegexMatcher
* lineMat
= linePat
->matcher(testDataString
, status
);
4013 // Regex to split a test file line into fields.
4014 // There are six fields, separated by tabs.
4016 RegexPattern
* fieldPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe
, status
);
4019 // Regex to identify test patterns with flag settings, and to separate them.
4020 // Test patterns with flags look like 'pattern'i
4021 // Test patterns without flags are not quoted: pattern
4022 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4024 RegexPattern
*flagPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe
, status
);
4025 RegexMatcher
* flagMat
= flagPat
->matcher(status
);
4028 // The Perl tests reference several perl-isms, which are evaluated/substituted
4029 // in the test data. Not being perl, this must be done explicitly. Here
4030 // are string constants and REs for these constructs.
4032 UnicodeString
nulnulSrc("${nulnul}");
4033 UnicodeString
nulnul("\\u0000\\u0000", -1, US_INV
);
4034 nulnul
= nulnul
.unescape();
4036 UnicodeString
ffffSrc("${ffff}");
4037 UnicodeString
ffff("\\uffff", -1, US_INV
);
4038 ffff
= ffff
.unescape();
4040 // regexp for $-[0], $+[2], etc.
4041 RegexPattern
*groupsPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe
, status
);
4042 RegexMatcher
*groupsMat
= groupsPat
->matcher(status
);
4044 // regexp for $0, $1, $2, etc.
4045 RegexPattern
*cgPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe
, status
);
4046 RegexMatcher
*cgMat
= cgPat
->matcher(status
);
4050 // Main Loop for the Perl Tests, runs once per line from the
4053 int32_t lineNum
= 0;
4054 int32_t skippedUnimplementedCount
= 0;
4055 while (lineMat
->find()) {
4059 // Get a line, break it into its fields, do the Perl
4060 // variable substitutions.
4062 UnicodeString line
= lineMat
->group(1, status
);
4063 UnicodeString fields
[7];
4064 fieldPat
->split(line
, fields
, 7, status
);
4066 flagMat
->reset(fields
[0]);
4067 flagMat
->matches(status
);
4068 UnicodeString pattern
= flagMat
->group(2, status
);
4069 pattern
.findAndReplace("${bang}", "!");
4070 pattern
.findAndReplace(nulnulSrc
, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4071 pattern
.findAndReplace(ffffSrc
, ffff
);
4074 // Identify patterns that include match flag settings,
4075 // split off the flags, remove the extra quotes.
4077 UnicodeString flagStr
= flagMat
->group(3, status
);
4078 if (U_FAILURE(status
)) {
4079 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
4083 const UChar UChar_c
= 0x63; // Char constants for the flag letters.
4084 const UChar UChar_i
= 0x69; // (Damn the lack of Unicode support in C)
4085 const UChar UChar_m
= 0x6d;
4086 const UChar UChar_x
= 0x78;
4087 const UChar UChar_y
= 0x79;
4088 if (flagStr
.indexOf(UChar_i
) != -1) {
4089 flags
|= UREGEX_CASE_INSENSITIVE
;
4091 if (flagStr
.indexOf(UChar_m
) != -1) {
4092 flags
|= UREGEX_MULTILINE
;
4094 if (flagStr
.indexOf(UChar_x
) != -1) {
4095 flags
|= UREGEX_COMMENTS
;
4099 // Compile the test pattern.
4101 status
= U_ZERO_ERROR
;
4102 RegexPattern
*testPat
= RegexPattern::compile(pattern
, flags
, pe
, status
);
4103 if (status
== U_REGEX_UNIMPLEMENTED
) {
4105 // Test of a feature that is planned for ICU, but not yet implemented.
4107 skippedUnimplementedCount
++;
4109 status
= U_ZERO_ERROR
;
4113 if (U_FAILURE(status
)) {
4114 // Some tests are supposed to generate errors.
4115 // Only report an error for tests that are supposed to succeed.
4116 if (fields
[2].indexOf(UChar_c
) == -1 && // Compilation is not supposed to fail AND
4117 fields
[2].indexOf(UChar_i
) == -1) // it's not an accepted ICU incompatibility
4119 errln("line %d: ICU Error \"%s\"\n", lineNum
, u_errorName(status
));
4121 status
= U_ZERO_ERROR
;
4126 if (fields
[2].indexOf(UChar_i
) >= 0) {
4127 // ICU should skip this test.
4132 if (fields
[2].indexOf(UChar_c
) >= 0) {
4133 // This pattern should have caused a compilation error, but didn't.
4134 errln("line %d: Expected a pattern compile error, got success.", lineNum
);
4140 // replace the Perl variables that appear in some of the
4141 // match data strings.
4143 UnicodeString matchString
= fields
[1];
4144 matchString
.findAndReplace(nulnulSrc
, nulnul
);
4145 matchString
.findAndReplace(ffffSrc
, ffff
);
4147 // Replace any \n in the match string with an actual new-line char.
4148 // Don't do full unescape, as this unescapes more than Perl does, which
4149 // causes other spurious failures in the tests.
4150 matchString
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4155 // Run the test, check for expected match/don't match result.
4157 RegexMatcher
*testMat
= testPat
->matcher(matchString
, status
);
4158 UBool found
= testMat
->find();
4159 UBool expected
= FALSE
;
4160 if (fields
[2].indexOf(UChar_y
) >=0) {
4163 if (expected
!= found
) {
4164 errln("line %d: Expected %smatch, got %smatch",
4165 lineNum
, expected
?"":"no ", found
?"":"no " );
4169 // Don't try to check expected results if there is no match.
4170 // (Some have stuff in the expected fields)
4178 // Interpret the Perl expression from the fourth field of the data file,
4179 // building up an ICU string from the results of the ICU match.
4180 // The Perl expression will contain references to the results of
4181 // a regex match, including the matched string, capture group strings,
4182 // group starting and ending indices, etc.
4184 UnicodeString resultString
;
4185 UnicodeString perlExpr
= fields
[3];
4186 #if SUPPORT_MUTATING_INPUT_STRING
4187 groupsMat
->reset(perlExpr
);
4188 cgMat
->reset(perlExpr
);
4191 while (perlExpr
.length() > 0) {
4192 #if !SUPPORT_MUTATING_INPUT_STRING
4193 // Preferred usage. Reset after any modification to input string.
4194 groupsMat
->reset(perlExpr
);
4195 cgMat
->reset(perlExpr
);
4198 if (perlExpr
.startsWith("$&")) {
4199 resultString
.append(testMat
->group(status
));
4200 perlExpr
.remove(0, 2);
4203 else if (groupsMat
->lookingAt(status
)) {
4205 UnicodeString digitString
= groupsMat
->group(2, status
);
4207 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4208 UnicodeString plusOrMinus
= groupsMat
->group(1, status
);
4209 int32_t matchPosition
;
4210 if (plusOrMinus
.compare("+") == 0) {
4211 matchPosition
= testMat
->end(groupNum
, status
);
4213 matchPosition
= testMat
->start(groupNum
, status
);
4215 if (matchPosition
!= -1) {
4216 ICU_Utility::appendNumber(resultString
, matchPosition
);
4218 perlExpr
.remove(0, groupsMat
->end(status
));
4221 else if (cgMat
->lookingAt(status
)) {
4223 UnicodeString digitString
= cgMat
->group(1, status
);
4225 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4226 if (U_SUCCESS(status
)) {
4227 resultString
.append(testMat
->group(groupNum
, status
));
4228 status
= U_ZERO_ERROR
;
4230 perlExpr
.remove(0, cgMat
->end(status
));
4233 else if (perlExpr
.startsWith("@-")) {
4235 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4237 resultString
.append(" ");
4239 ICU_Utility::appendNumber(resultString
, testMat
->start(i
, status
));
4241 perlExpr
.remove(0, 2);
4244 else if (perlExpr
.startsWith("@+")) {
4246 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4248 resultString
.append(" ");
4250 ICU_Utility::appendNumber(resultString
, testMat
->end(i
, status
));
4252 perlExpr
.remove(0, 2);
4255 else if (perlExpr
.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4256 // or as an escaped sequence (e.g. \n)
4257 if (perlExpr
.length() > 1) {
4258 perlExpr
.remove(0, 1); // Remove the '\', but only if not last char.
4260 UChar c
= perlExpr
.charAt(0);
4262 case 'n': c
= '\n'; break;
4263 // add any other escape sequences that show up in the test expected results.
4265 resultString
.append(c
);
4266 perlExpr
.remove(0, 1);
4270 // Any characters from the perl expression that we don't explicitly
4271 // recognize before here are assumed to be literals and copied
4272 // as-is to the expected results.
4273 resultString
.append(perlExpr
.charAt(0));
4274 perlExpr
.remove(0, 1);
4277 if (U_FAILURE(status
)) {
4278 errln("Line %d: ICU Error \"%s\"", lineNum
, u_errorName(status
));
4284 // Expected Results Compare
4286 UnicodeString
expectedS(fields
[4]);
4287 expectedS
.findAndReplace(nulnulSrc
, nulnul
);
4288 expectedS
.findAndReplace(ffffSrc
, ffff
);
4289 expectedS
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4292 if (expectedS
.compare(resultString
) != 0) {
4293 err("Line %d: Incorrect perl expression results.", lineNum
);
4294 infoln((UnicodeString
)"Expected \""+expectedS
+(UnicodeString
)"\"; got \""+resultString
+(UnicodeString
)"\"");
4302 // All done. Clean up allocated stuff.
4320 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount
);
4325 //-------------------------------------------------------------------------------
4327 // PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts
4328 // (instead of using UnicodeStrings) to test the alternate engine.
4329 // The input file for this test is re_tests, the standard regular
4330 // expression test data distributed with the Perl source code.
4331 // See PerlTests() for more information.
4333 //-------------------------------------------------------------------------------
4334 void RegexTest::PerlTestsUTF8() {
4336 const char *srcPath
;
4337 UErrorCode status
= U_ZERO_ERROR
;
4339 LocalUConverterPointer
UTF8Converter(ucnv_open("UTF-8", &status
));
4340 UText patternText
= UTEXT_INITIALIZER
;
4341 char *patternChars
= NULL
;
4342 int32_t patternLength
;
4343 int32_t patternCapacity
= 0;
4344 UText inputText
= UTEXT_INITIALIZER
;
4345 char *inputChars
= NULL
;
4346 int32_t inputLength
;
4347 int32_t inputCapacity
= 0;
4349 ucnv_setFromUCallBack(UTF8Converter
.getAlias(), UCNV_FROM_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &status
);
4352 // Open and read the test data file.
4354 srcPath
=getPath(tdd
, "re_tests.txt");
4356 return; /* something went wrong, error already output */
4360 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, "iso-8859-1", status
);
4361 if (U_FAILURE(status
)) {
4362 return; /* something went wrong, error already output */
4366 // Put the test data into a UnicodeString
4368 UnicodeString
testDataString(FALSE
, testData
, len
);
4371 // Regex to break the input file into lines, and strip the new lines.
4372 // One line per match, capture group one is the desired data.
4374 RegexPattern
* linePat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe
, status
);
4375 if (U_FAILURE(status
)) {
4376 dataerrln("RegexPattern::compile() error");
4379 RegexMatcher
* lineMat
= linePat
->matcher(testDataString
, status
);
4382 // Regex to split a test file line into fields.
4383 // There are six fields, separated by tabs.
4385 RegexPattern
* fieldPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe
, status
);
4388 // Regex to identify test patterns with flag settings, and to separate them.
4389 // Test patterns with flags look like 'pattern'i
4390 // Test patterns without flags are not quoted: pattern
4391 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4393 RegexPattern
*flagPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe
, status
);
4394 RegexMatcher
* flagMat
= flagPat
->matcher(status
);
4397 // The Perl tests reference several perl-isms, which are evaluated/substituted
4398 // in the test data. Not being perl, this must be done explicitly. Here
4399 // are string constants and REs for these constructs.
4401 UnicodeString
nulnulSrc("${nulnul}");
4402 UnicodeString
nulnul("\\u0000\\u0000", -1, US_INV
);
4403 nulnul
= nulnul
.unescape();
4405 UnicodeString
ffffSrc("${ffff}");
4406 UnicodeString
ffff("\\uffff", -1, US_INV
);
4407 ffff
= ffff
.unescape();
4409 // regexp for $-[0], $+[2], etc.
4410 RegexPattern
*groupsPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe
, status
);
4411 RegexMatcher
*groupsMat
= groupsPat
->matcher(status
);
4413 // regexp for $0, $1, $2, etc.
4414 RegexPattern
*cgPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe
, status
);
4415 RegexMatcher
*cgMat
= cgPat
->matcher(status
);
4419 // Main Loop for the Perl Tests, runs once per line from the
4422 int32_t lineNum
= 0;
4423 int32_t skippedUnimplementedCount
= 0;
4424 while (lineMat
->find()) {
4428 // Get a line, break it into its fields, do the Perl
4429 // variable substitutions.
4431 UnicodeString line
= lineMat
->group(1, status
);
4432 UnicodeString fields
[7];
4433 fieldPat
->split(line
, fields
, 7, status
);
4435 flagMat
->reset(fields
[0]);
4436 flagMat
->matches(status
);
4437 UnicodeString pattern
= flagMat
->group(2, status
);
4438 pattern
.findAndReplace("${bang}", "!");
4439 pattern
.findAndReplace(nulnulSrc
, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4440 pattern
.findAndReplace(ffffSrc
, ffff
);
4443 // Identify patterns that include match flag settings,
4444 // split off the flags, remove the extra quotes.
4446 UnicodeString flagStr
= flagMat
->group(3, status
);
4447 if (U_FAILURE(status
)) {
4448 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
4452 const UChar UChar_c
= 0x63; // Char constants for the flag letters.
4453 const UChar UChar_i
= 0x69; // (Damn the lack of Unicode support in C)
4454 const UChar UChar_m
= 0x6d;
4455 const UChar UChar_x
= 0x78;
4456 const UChar UChar_y
= 0x79;
4457 if (flagStr
.indexOf(UChar_i
) != -1) {
4458 flags
|= UREGEX_CASE_INSENSITIVE
;
4460 if (flagStr
.indexOf(UChar_m
) != -1) {
4461 flags
|= UREGEX_MULTILINE
;
4463 if (flagStr
.indexOf(UChar_x
) != -1) {
4464 flags
|= UREGEX_COMMENTS
;
4468 // Put the pattern in a UTF-8 UText
4470 status
= U_ZERO_ERROR
;
4471 patternLength
= pattern
.extract(patternChars
, patternCapacity
, UTF8Converter
.getAlias(), status
);
4472 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
4473 status
= U_ZERO_ERROR
;
4474 delete[] patternChars
;
4475 patternCapacity
= patternLength
+ 1;
4476 patternChars
= new char[patternCapacity
];
4477 pattern
.extract(patternChars
, patternCapacity
, UTF8Converter
.getAlias(), status
);
4479 utext_openUTF8(&patternText
, patternChars
, patternLength
, &status
);
4482 // Compile the test pattern.
4484 RegexPattern
*testPat
= RegexPattern::compile(&patternText
, flags
, pe
, status
);
4485 if (status
== U_REGEX_UNIMPLEMENTED
) {
4487 // Test of a feature that is planned for ICU, but not yet implemented.
4489 skippedUnimplementedCount
++;
4491 status
= U_ZERO_ERROR
;
4495 if (U_FAILURE(status
)) {
4496 // Some tests are supposed to generate errors.
4497 // Only report an error for tests that are supposed to succeed.
4498 if (fields
[2].indexOf(UChar_c
) == -1 && // Compilation is not supposed to fail AND
4499 fields
[2].indexOf(UChar_i
) == -1) // it's not an accepted ICU incompatibility
4501 errln("line %d: ICU Error \"%s\"\n", lineNum
, u_errorName(status
));
4503 status
= U_ZERO_ERROR
;
4508 if (fields
[2].indexOf(UChar_i
) >= 0) {
4509 // ICU should skip this test.
4514 if (fields
[2].indexOf(UChar_c
) >= 0) {
4515 // This pattern should have caused a compilation error, but didn't.
4516 errln("line %d: Expected a pattern compile error, got success.", lineNum
);
4523 // replace the Perl variables that appear in some of the
4524 // match data strings.
4526 UnicodeString matchString
= fields
[1];
4527 matchString
.findAndReplace(nulnulSrc
, nulnul
);
4528 matchString
.findAndReplace(ffffSrc
, ffff
);
4530 // Replace any \n in the match string with an actual new-line char.
4531 // Don't do full unescape, as this unescapes more than Perl does, which
4532 // causes other spurious failures in the tests.
4533 matchString
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4536 // Put the input in a UTF-8 UText
4538 status
= U_ZERO_ERROR
;
4539 inputLength
= matchString
.extract(inputChars
, inputCapacity
, UTF8Converter
.getAlias(), status
);
4540 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
4541 status
= U_ZERO_ERROR
;
4542 delete[] inputChars
;
4543 inputCapacity
= inputLength
+ 1;
4544 inputChars
= new char[inputCapacity
];
4545 matchString
.extract(inputChars
, inputCapacity
, UTF8Converter
.getAlias(), status
);
4547 utext_openUTF8(&inputText
, inputChars
, inputLength
, &status
);
4550 // Run the test, check for expected match/don't match result.
4552 RegexMatcher
*testMat
= &testPat
->matcher(status
)->reset(&inputText
);
4553 UBool found
= testMat
->find();
4554 UBool expected
= FALSE
;
4555 if (fields
[2].indexOf(UChar_y
) >=0) {
4558 if (expected
!= found
) {
4559 errln("line %d: Expected %smatch, got %smatch",
4560 lineNum
, expected
?"":"no ", found
?"":"no " );
4564 // Don't try to check expected results if there is no match.
4565 // (Some have stuff in the expected fields)
4573 // Interpret the Perl expression from the fourth field of the data file,
4574 // building up an ICU string from the results of the ICU match.
4575 // The Perl expression will contain references to the results of
4576 // a regex match, including the matched string, capture group strings,
4577 // group starting and ending indices, etc.
4579 UnicodeString resultString
;
4580 UnicodeString perlExpr
= fields
[3];
4582 while (perlExpr
.length() > 0) {
4583 groupsMat
->reset(perlExpr
);
4584 cgMat
->reset(perlExpr
);
4586 if (perlExpr
.startsWith("$&")) {
4587 resultString
.append(testMat
->group(status
));
4588 perlExpr
.remove(0, 2);
4591 else if (groupsMat
->lookingAt(status
)) {
4593 UnicodeString digitString
= groupsMat
->group(2, status
);
4595 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4596 UnicodeString plusOrMinus
= groupsMat
->group(1, status
);
4597 int32_t matchPosition
;
4598 if (plusOrMinus
.compare("+") == 0) {
4599 matchPosition
= testMat
->end(groupNum
, status
);
4601 matchPosition
= testMat
->start(groupNum
, status
);
4603 if (matchPosition
!= -1) {
4604 ICU_Utility::appendNumber(resultString
, matchPosition
);
4606 perlExpr
.remove(0, groupsMat
->end(status
));
4609 else if (cgMat
->lookingAt(status
)) {
4611 UnicodeString digitString
= cgMat
->group(1, status
);
4613 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4614 if (U_SUCCESS(status
)) {
4615 resultString
.append(testMat
->group(groupNum
, status
));
4616 status
= U_ZERO_ERROR
;
4618 perlExpr
.remove(0, cgMat
->end(status
));
4621 else if (perlExpr
.startsWith("@-")) {
4623 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4625 resultString
.append(" ");
4627 ICU_Utility::appendNumber(resultString
, testMat
->start(i
, status
));
4629 perlExpr
.remove(0, 2);
4632 else if (perlExpr
.startsWith("@+")) {
4634 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4636 resultString
.append(" ");
4638 ICU_Utility::appendNumber(resultString
, testMat
->end(i
, status
));
4640 perlExpr
.remove(0, 2);
4643 else if (perlExpr
.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4644 // or as an escaped sequence (e.g. \n)
4645 if (perlExpr
.length() > 1) {
4646 perlExpr
.remove(0, 1); // Remove the '\', but only if not last char.
4648 UChar c
= perlExpr
.charAt(0);
4650 case 'n': c
= '\n'; break;
4651 // add any other escape sequences that show up in the test expected results.
4653 resultString
.append(c
);
4654 perlExpr
.remove(0, 1);
4658 // Any characters from the perl expression that we don't explicitly
4659 // recognize before here are assumed to be literals and copied
4660 // as-is to the expected results.
4661 resultString
.append(perlExpr
.charAt(0));
4662 perlExpr
.remove(0, 1);
4665 if (U_FAILURE(status
)) {
4666 errln("Line %d: ICU Error \"%s\"", lineNum
, u_errorName(status
));
4672 // Expected Results Compare
4674 UnicodeString
expectedS(fields
[4]);
4675 expectedS
.findAndReplace(nulnulSrc
, nulnul
);
4676 expectedS
.findAndReplace(ffffSrc
, ffff
);
4677 expectedS
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4680 if (expectedS
.compare(resultString
) != 0) {
4681 err("Line %d: Incorrect perl expression results.", lineNum
);
4682 infoln((UnicodeString
)"Expected \""+expectedS
+(UnicodeString
)"\"; got \""+resultString
+(UnicodeString
)"\"");
4690 // All done. Clean up allocated stuff.
4707 utext_close(&patternText
);
4708 utext_close(&inputText
);
4710 delete [] patternChars
;
4711 delete [] inputChars
;
4714 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount
);
4719 //--------------------------------------------------------------
4721 // Bug6149 Verify limits to heap expansion for backtrack stack.
4722 // Use this pattern,
4723 // "(a?){1,8000000}"
4724 // Note: was an unbounded upperbounds, but that now has loop-breaking enabled.
4725 // This test is likely to be fragile, as further optimizations stop
4726 // more cases of pointless looping in the match engine.
4728 //---------------------------------------------------------------
4729 void RegexTest::Bug6149() {
// A very large bounded repeat of an optional group, applied to input that
// contains no 'a' at all.
4730 UnicodeString
pattern("(a?){1,8000000}");
4731 UnicodeString
s("xyz");
4733 UErrorCode status
= U_ZERO_ERROR
;
// NOTE(review): `flags` is not declared on any line visible in this listing;
// presumably a local defined just above -- confirm against the full file.
4735 RegexMatcher
matcher(pattern
, s
, flags
, status
);
4736 UBool result
= false;
// The full-match attempt is checked against U_REGEX_STACK_OVERFLOW ...
4737 REGEX_ASSERT_FAIL(result
=matcher
.matches(status
), U_REGEX_STACK_OVERFLOW
);
// ... and it must not have reported a match.
4738 REGEX_ASSERT(result
== FALSE
);
4743 // Callbacks() Test the callback function.
4744 // When set, callbacks occur periodically during matching operations,
4745 // giving the application code the ability to abort the operation
4746 // before its normal completion.
// Context object passed (as const void *) to testCallBackFn by the Callbacks test.
// NOTE(review): the member declarations are on lines not shown in this listing.
4749 struct callBackContext
{
// Re-arm the context: set the call budget to `max` and zero both the call
// counter and the last-seen step count.
4754 void reset(int32_t max
) {maxCalls
=max
; numCalls
=0; lastSteps
=0;};
// Match callback installed by the Callbacks test.
//   context - treated as a callBackContext *.
//   steps   - step count supplied by the caller; checked to be exactly one
//             more than the value seen on the previous invocation.
// Returns whether numCalls is still below maxCalls.
4758 static UBool U_CALLCONV
4759 testCallBackFn(const void *context
, int32_t steps
) {
4760 callBackContext
*info
= (callBackContext
*)context
;
// Report (through the test object stored in the context) any gap or repeat
// in the sequence of step counts.
4761 if (info
->lastSteps
+1 != steps
) {
4762 info
->test
->errln("incorrect steps in callback. Expected %d, got %d\n", info
->lastSteps
+1, steps
);
4764 info
->lastSteps
= steps
;
// NOTE(review): no update of numCalls appears among the visible lines (the
// listing skips a line here) -- confirm where it is incremented.
4766 return (info
->numCalls
< info
->maxCalls
);
4770 void RegexTest::Callbacks() {
4772 // Getter returns NULLs if no callback has been set
4774 // The variables that the getter will fill in.
4775 // Init to non-null values so that the action of the getter can be seen.
4776 const void *returnedContext
= &returnedContext
;
4777 URegexMatchCallback
*returnedFn
= &testCallBackFn
;
4779 UErrorCode status
= U_ZERO_ERROR
;
4780 RegexMatcher
matcher("x", 0, status
);
4782 matcher
.getMatchCallback(returnedFn
, returnedContext
, status
);
4784 REGEX_ASSERT(returnedFn
== NULL
);
4785 REGEX_ASSERT(returnedContext
== NULL
);
// Second scenario: install testCallBackFn with cbInfo as its context, then
// confirm the getter hands back exactly that function/context pair.
// NOTE(review): the same local names are declared again below; the scope
// braces separating the two scenarios are on lines not shown in this listing.
4790 callBackContext cbInfo
= {this, 0, 0, 0};
4791 const void *returnedContext
;
4792 URegexMatchCallback
*returnedFn
;
4793 UErrorCode status
= U_ZERO_ERROR
;
4794 RegexMatcher
matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status
); // A pattern that can run long.
4796 matcher
.setMatchCallback(testCallBackFn
, &cbInfo
, status
);
4798 matcher
.getMatchCallback(returnedFn
, returnedContext
, status
);
4800 REGEX_ASSERT(returnedFn
== testCallBackFn
);
4801 REGEX_ASSERT(returnedContext
== &cbInfo
);
4803 // A short-running match shouldn't invoke the callback
4804 status
= U_ZERO_ERROR
;
4806 UnicodeString s
= "xxx";
// NOTE(review): the lines that hand `s` to the matcher and re-arm cbInfo are
// not visible in this listing (here and before each case below) -- confirm.
4808 REGEX_ASSERT(matcher
.matches(status
));
4810 REGEX_ASSERT(cbInfo
.numCalls
== 0);
4812 // A medium-length match that runs long enough to invoke the
4813 // callback, but not so long that the callback aborts it.
4814 status
= U_ZERO_ERROR
;
4816 s
= "aaaaaaaaaaaaaaaaaaab";
4818 REGEX_ASSERT(matcher
.matches(status
)==FALSE
);
4820 REGEX_ASSERT(cbInfo
.numCalls
> 0);
4822 // A longer running match that the callback function will abort.
4823 status
= U_ZERO_ERROR
;
4825 s
= "aaaaaaaaaaaaaaaaaaaaaaab";
4827 REGEX_ASSERT(matcher
.matches(status
)==FALSE
);
// An aborted operation is reported through status, not only through the
// FALSE return value.
4828 REGEX_ASSERT(status
== U_REGEX_STOPPED_BY_CALLER
);
// NOTE(review): the expected count of 4 depends on the call budget given to
// cbInfo on a line not shown in this listing -- confirm.
4829 REGEX_ASSERT(cbInfo
.numCalls
== 4);
4831 // A longer running find that the callback function will abort.
4832 status
= U_ZERO_ERROR
;
4834 s
= "aaaaaaaaaaaaaaaaaaaaaaab";
4836 REGEX_ASSERT(matcher
.find(status
)==FALSE
);
4837 REGEX_ASSERT(status
== U_REGEX_STOPPED_BY_CALLER
);
4838 REGEX_ASSERT(cbInfo
.numCalls
== 4);
4846 // FindProgressCallbacks() Test the find "progress" callback function.
4847 // When set, the find progress callback will be invoked during find operations
4848 // after each return from a match attempt, giving the application the opportunity
4849 // to terminate a long-running find operation before its normal completion.
// Context object passed (as const void *) to testProgressCallBackFn by the
// FindProgressCallbacks test.
// NOTE(review): the member declarations are on lines not shown in this listing.
4852 struct progressCallBackContext
{
// Re-arm the context: set the call budget to `max` and zero both the call
// counter and the last reported match index.
4857 void reset(int32_t max
) {maxCalls
=max
; numCalls
=0;lastIndex
=0;};
4860 // call-back function for find().
4861 // Return TRUE to continue the find().
4862 // Return FALSE to stop the find().
//   context    - treated as a progressCallBackContext *.
//   matchIndex - index reported by the caller; stored so the test can later
//                resume a find() from it.
4864 static UBool U_CALLCONV
4865 testProgressCallBackFn(const void *context
, int64_t matchIndex
) {
4866 progressCallBackContext
*info
= (progressCallBackContext
*)context
;
// NOTE(review): no update of numCalls appears among the visible lines (the
// listing skips a line here) -- confirm where it is incremented.
4868 info
->lastIndex
= matchIndex
;
4869 // info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
// Keep going only while the call budget has not been used up.
4870 return (info
->numCalls
< info
->maxCalls
);
4874 void RegexTest::FindProgressCallbacks() {
4876 // Getter returns NULLs if no callback has been set
4878 // The variables that the getter will fill in.
4879 // Init to non-null values so that the action of the getter can be seen.
4880 const void *returnedContext
= &returnedContext
;
4881 URegexFindProgressCallback
*returnedFn
= &testProgressCallBackFn
;
4883 UErrorCode status
= U_ZERO_ERROR
;
4884 RegexMatcher
matcher("x", 0, status
);
4886 matcher
.getFindProgressCallback(returnedFn
, returnedContext
, status
);
4888 REGEX_ASSERT(returnedFn
== NULL
);
4889 REGEX_ASSERT(returnedContext
== NULL
);
// Second scenario: install testProgressCallBackFn with cbInfo as its context,
// then confirm the getter hands back exactly that function/context pair.
// NOTE(review): the same local names are declared again below; the scope
// braces separating the two scenarios are on lines not shown in this listing.
4894 progressCallBackContext cbInfo
= {this, 0, 0, 0};
4895 const void *returnedContext
;
4896 URegexFindProgressCallback
*returnedFn
;
4897 UErrorCode status
= U_ZERO_ERROR
;
4898 RegexMatcher
matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status
);
4900 matcher
.setFindProgressCallback(testProgressCallBackFn
, &cbInfo
, status
);
4902 matcher
.getFindProgressCallback(returnedFn
, returnedContext
, status
);
4904 REGEX_ASSERT(returnedFn
== testProgressCallBackFn
);
4905 REGEX_ASSERT(returnedContext
== &cbInfo
);
4907 // A find that matches on the initial position does NOT invoke the callback.
4908 status
= U_ZERO_ERROR
;
4910 UnicodeString s
= "aaxxx";
// NOTE(review): as far as the visible lines show, tracing is switched on
// unconditionally here; it is probably guarded by a preprocessor conditional
// on neighbouring lines that this listing omits -- confirm.
4913 matcher
.setTrace(TRUE
);
4915 REGEX_ASSERT(matcher
.find(0, status
));
4917 REGEX_ASSERT(cbInfo
.numCalls
== 0);
4919 // A medium running find() that causes matcher.find() to invoke our callback for each index,
4920 // but not so many times that we interrupt the operation.
4921 status
= U_ZERO_ERROR
;
4922 s
= "aaaaaaaaaaaaaaaaaaab";
4923 cbInfo
.reset(s
.length()); // Some upper limit for number of calls that is greater than size of our input string
4925 REGEX_ASSERT(matcher
.find(0, status
)==FALSE
);
4927 REGEX_ASSERT(cbInfo
.numCalls
> 0 && cbInfo
.numCalls
< 25);
4929 // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4930 status
= U_ZERO_ERROR
;
4931 UnicodeString s1
= "aaaaaaaaaaaaaaaaaaaaaaab";
4932 cbInfo
.reset(s1
.length() - 5); // Bail early somewhere near the end of input string
4934 REGEX_ASSERT(matcher
.find(0, status
)==FALSE
);
4935 REGEX_ASSERT(status
== U_REGEX_STOPPED_BY_CALLER
);
// The find must have been stopped exactly when the call budget ran out.
4936 REGEX_ASSERT(cbInfo
.numCalls
== s1
.length() - 5);
4938 // Now a match that will succeed, but after an interruption
4939 status
= U_ZERO_ERROR
;
4940 UnicodeString s2
= "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4941 cbInfo
.reset(s2
.length() - 10); // Bail early somewhere near the end of input string
4943 REGEX_ASSERT(matcher
.find(0, status
)==FALSE
);
4944 REGEX_ASSERT(status
== U_REGEX_STOPPED_BY_CALLER
);
4945 // Now retry the match from where left off
4946 cbInfo
.maxCalls
= 100; // No callback limit
4947 status
= U_ZERO_ERROR
;
// Resume from the last index the progress callback recorded.
4948 REGEX_ASSERT(matcher
.find(cbInfo
.lastIndex
, status
));
4956 //---------------------------------------------------------------------------
4958 // PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
4959 // UTexts. The pure-C implementation of UText
4960 // has no mutable backing stores, but we can
4961 // use UnicodeString here to test the functionality.
4963 //---------------------------------------------------------------------------
4964 void RegexTest::PreAllocatedUTextCAPI () {
4965 UErrorCode status
= U_ZERO_ERROR
;
4966 URegularExpression
*re
;
4967 UText patternText
= UTEXT_INITIALIZER
;
4968 UnicodeString buffer
;
4969 UText bufferText
= UTEXT_INITIALIZER
;
// bufferText is a UText opened over the UnicodeString `buffer`; it is the
// caller-supplied destination handed to the uregex_*UText() calls below.
4971 utext_openUnicodeString(&bufferText
, &buffer
, &status
);
4974 * getText() and getUText()
4977 UText text1
= UTEXT_INITIALIZER
;
4978 UText text2
= UTEXT_INITIALIZER
;
4979 UChar text2Chars
[20];
4982 status
= U_ZERO_ERROR
;
4983 regextst_openUTF8FromInvariant(&text1
, "abcccd", -1, &status
);
4984 regextst_openUTF8FromInvariant(&text2
, "abcccxd", -1, &status
);
// The copy limit is the capacity of the destination array text2Chars.
// (text2 is a UText, so its sizeof says nothing about that capacity.)
4985 u_uastrncpy(text2Chars
, "abcccxd", UPRV_LENGTHOF(text2Chars
));
4986 utext_openUChars(&text2
, text2Chars
, -1, &status
);
4988 regextst_openUTF8FromInvariant(&patternText
, "abc*d", -1, &status
);
4989 re
= uregex_openUText(&patternText
, 0, NULL
, &status
);
4991 /* First set a UText */
4992 uregex_setUText(re
, &text1
, &status
);
4993 resultText
= uregex_getUText(re
, &bufferText
, &status
);
// The getter must return the destination it was given, holding the same
// text as the input that was set.
4995 REGEX_ASSERT(resultText
== &bufferText
);
4996 utext_setNativeIndex(resultText
, 0);
4997 utext_setNativeIndex(&text1
, 0);
4998 REGEX_ASSERT(testUTextEqual(resultText
, &text1
));
// Asking a second time must give the same answer.
5000 resultText
= uregex_getUText(re
, &bufferText
, &status
);
5002 REGEX_ASSERT(resultText
== &bufferText
);
5003 utext_setNativeIndex(resultText
, 0);
5004 utext_setNativeIndex(&text1
, 0);
5005 REGEX_ASSERT(testUTextEqual(resultText
, &text1
));
5007 /* Then set a UChar * */
5008 uregex_setText(re
, text2Chars
, 7, &status
);
5009 resultText
= uregex_getUText(re
, &bufferText
, &status
);
5011 REGEX_ASSERT(resultText
== &bufferText
);
5012 utext_setNativeIndex(resultText
, 0);
5013 utext_setNativeIndex(&text2
, 0);
5014 REGEX_ASSERT(testUTextEqual(resultText
, &text2
));
5017 utext_close(&text1
);
5018 utext_close(&text2
);
// --- uregex_groupUText() into the pre-allocated bufferText ---
// NOTE(review): from here on text1 is the destination of u_uastrncpy, i.e. a
// UChar buffer declared on a line not shown in this listing -- confirm.
5030 u_uastrncpy(text1
, "noise abc interior def, and this is off the end", UPRV_LENGTHOF(text1
));
5031 // 012345678901234567890123456789012345678901234567
5034 status
= U_ZERO_ERROR
;
5035 re
= uregex_openC("abc(.*?)def", 0, NULL
, &status
);
5038 uregex_setText(re
, text1
, -1, &status
);
5039 result
= uregex_find(re
, 0, &status
);
5040 REGEX_ASSERT(result
==TRUE
);
5042 /* Capture Group 0, the full match. Should succeed. "abc interior def" */
5043 status
= U_ZERO_ERROR
;
5044 actual
= uregex_groupUText(re
, 0, &bufferText
, &length
, &status
);
// The returned UText covers the whole 47-unit input; the group itself is
// identified by the native index (its start) and by `length`.
5046 REGEX_ASSERT(actual
== &bufferText
);
5047 REGEX_ASSERT(utext_getNativeIndex(actual
) == 6);
5048 REGEX_ASSERT(length
== 16);
5049 REGEX_ASSERT(utext_nativeLength(actual
) == 47);
5051 /* Capture group #1. Should succeed, matching " interior ". */
5052 status
= U_ZERO_ERROR
;
5053 actual
= uregex_groupUText(re
, 1, &bufferText
, &length
, &status
);
5055 REGEX_ASSERT(actual
== &bufferText
);
5056 REGEX_ASSERT(utext_getNativeIndex(actual
) == 9); // position of " interior "
5057 REGEX_ASSERT(length
== 10);
5058 REGEX_ASSERT(utext_nativeLength(actual
) == 47);
5060 /* Capture group out of range. Error. */
5061 status
= U_ZERO_ERROR
;
5062 actual
= uregex_groupUText(re
, 2, &bufferText
, &length
, &status
);
5063 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
// Even on error the supplied destination is what comes back.
5064 REGEX_ASSERT(actual
== &bufferText
);
// --- uregex_replaceFirstUText() into the pre-allocated bufferText ---
5075 UText replText
= UTEXT_INITIALIZER
;
5077 status
= U_ZERO_ERROR
;
5078 utext_openUnicodeString(&bufferText
, &buffer
, &status
);
5080 status
= U_ZERO_ERROR
;
5081 u_uastrncpy(text1
, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1
));
5082 u_uastrncpy(text2
, "No match here.", UPRV_LENGTHOF(text2
)/2);
5083 regextst_openUTF8FromInvariant(&replText
, "<$1>", -1, &status
);
5085 re
= uregex_openC("x(.*?)x", 0, NULL
, &status
);
5088 /* Normal case, with match */
5089 uregex_setText(re
, text1
, -1, &status
);
// Empty the destination (replace its whole native range with nothing)
// before each replace call writes into it.
5091 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5093 result
= uregex_replaceFirstUText(re
, &replText
, &bufferText
, &status
);
5095 REGEX_ASSERT(result
== &bufferText
);
5096 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result
);
5098 /* No match. Text should copy to output with no changes. */
5099 uregex_setText(re
, text2
, -1, &status
);
5100 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5101 result
= uregex_replaceFirstUText(re
, &replText
, &bufferText
, &status
);
5103 REGEX_ASSERT(result
== &bufferText
);
5104 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result
);
5106 /* Unicode escapes */
5107 uregex_setText(re
, text1
, -1, &status
);
5108 regextst_openUTF8FromInvariant(&replText
, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status
);
5109 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5110 result
= uregex_replaceFirstUText(re
, &replText
, &bufferText
, &status
);
5112 REGEX_ASSERT(result
== &bufferText
);
5113 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result
);
5116 utext_close(&replText
);
// --- uregex_replaceAllUText() into the pre-allocated bufferText ---
5126 UText replText
= UTEXT_INITIALIZER
;
5129 status
= U_ZERO_ERROR
;
5130 u_uastrncpy(text1
, "Replace xaax x1x x...x.", sizeof(text1
)/2);
5131 u_uastrncpy(text2
, "No match here.", sizeof(text2
)/2);
5132 regextst_openUTF8FromInvariant(&replText
, "<$1>", -1, &status
);
5134 re
= uregex_openC("x(.*?)x", 0, NULL
, &status
);
5137 /* Normal case, with match */
5138 uregex_setText(re
, text1
, -1, &status
);
5139 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5140 result
= uregex_replaceAllUText(re
, &replText
, &bufferText
, &status
);
5142 REGEX_ASSERT(result
== &bufferText
);
5143 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result
);
5145 /* No match. Text should copy to output with no changes. */
5146 uregex_setText(re
, text2
, -1, &status
);
5147 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
5148 result
= uregex_replaceAllUText(re
, &replText
, &bufferText
, &status
);
5150 REGEX_ASSERT(result
== &bufferText
);
5151 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result
);
5154 utext_close(&replText
);
5159 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5160 * so we don't need to test it here.
5163 utext_close(&bufferText
);
5164 utext_close(&patternText
);
5168 //--------------------------------------------------------------
5170 // NamedCapture Check basic named capture group functionality
5172 //--------------------------------------------------------------
5173 void RegexTest::NamedCapture() {
5174 UErrorCode status
= U_ZERO_ERROR
;
5175 RegexPattern
*pat
= RegexPattern::compile(UnicodeString(
5176 "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status
);
5178 int32_t group
= pat
->groupNumberFromName("five", -1, status
);
5180 REGEX_ASSERT(5 == group
);
5181 group
= pat
->groupNumberFromName("three", -1, status
);
5183 REGEX_ASSERT(3 == group
);
5185 status
= U_ZERO_ERROR
;
5186 group
= pat
->groupNumberFromName(UnicodeString("six"), status
);
5188 REGEX_ASSERT(6 == group
);
5190 status
= U_ZERO_ERROR
;
5191 group
= pat
->groupNumberFromName(UnicodeString("nosuch"), status
);
5192 U_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5194 status
= U_ZERO_ERROR
;
5196 // After copying a pattern, named capture should still work in the copy.
5197 RegexPattern
*copiedPat
= new RegexPattern(*pat
);
5198 REGEX_ASSERT(*copiedPat
== *pat
);
5199 delete pat
; pat
= NULL
; // Delete original, copy should have no references back to it.
5201 group
= copiedPat
->groupNumberFromName("five", -1, status
);
5203 REGEX_ASSERT(5 == group
);
5204 group
= copiedPat
->groupNumberFromName("three", -1, status
);
5206 REGEX_ASSERT(3 == group
);
5209 // ReplaceAll with named capture group.
5210 status
= U_ZERO_ERROR
;
5211 UnicodeString
text("Substitution of <<quotes>> for <<double brackets>>");
5212 RegexMatcher
*m
= new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text
, 0, status
);
5214 // m.pattern().dumpPattern();
5215 UnicodeString replacedText
= m
->replaceAll("'${mid}'", status
);
5217 REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText
);
5220 // ReplaceAll, allowed capture group numbers.
5221 text
= UnicodeString("abcmxyz");
5222 m
= new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text
, 0, status
);
5225 status
= U_ZERO_ERROR
;
5226 replacedText
= m
->replaceAll(UnicodeString("<$0>"), status
); // group 0, full match, is allowed.
5228 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText
);
5230 status
= U_ZERO_ERROR
;
5231 replacedText
= m
->replaceAll(UnicodeString("<$1>"), status
); // group 1 by number.
5233 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText
);
5235 status
= U_ZERO_ERROR
;
5236 replacedText
= m
->replaceAll(UnicodeString("<${one}>"), status
); // group 1 by name.
5238 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText
);
5240 status
= U_ZERO_ERROR
;
5241 replacedText
= m
->replaceAll(UnicodeString("<$2>"), status
); // group 2.
5243 REGEX_ASSERT(UnicodeString("a<x>z") == replacedText
);
5245 status
= U_ZERO_ERROR
;
5246 replacedText
= m
->replaceAll(UnicodeString("<$3>"), status
);
5248 REGEX_ASSERT(UnicodeString("a<y>z") == replacedText
);
5250 status
= U_ZERO_ERROR
;
5251 replacedText
= m
->replaceAll(UnicodeString("<$4>"), status
);
5252 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
5254 status
= U_ZERO_ERROR
;
5255 replacedText
= m
->replaceAll(UnicodeString("<$04>"), status
); // group 0, leading 0,
5256 REGEX_CHECK_STATUS
; // trailing out-of-range 4 passes through.
5257 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText
);
5259 status
= U_ZERO_ERROR
;
5260 replacedText
= m
->replaceAll(UnicodeString("<$000016>"), status
); // Consume leading zeroes. Don't consume digits
5261 REGEX_CHECK_STATUS
; // that push group num out of range.
5262 REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText
); // This is group 1.
5264 status
= U_ZERO_ERROR
;
5265 replacedText
= m
->replaceAll(UnicodeString("<$3$2$1${one}>"), status
);
5267 REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText
);
5269 status
= U_ZERO_ERROR
;
5270 replacedText
= m
->replaceAll(UnicodeString("$3$2$1${one}"), status
);
5272 REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText
);
5274 status
= U_ZERO_ERROR
;
5275 replacedText
= m
->replaceAll(UnicodeString("<${noSuchName}>"), status
);
5276 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5278 status
= U_ZERO_ERROR
;
5279 replacedText
= m
->replaceAll(UnicodeString("<${invalid-name}>"), status
);
5280 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5282 status
= U_ZERO_ERROR
;
5283 replacedText
= m
->replaceAll(UnicodeString("<${one"), status
);
5284 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5286 status
= U_ZERO_ERROR
;
5287 replacedText
= m
->replaceAll(UnicodeString("$not a capture group"), status
);
5288 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5292 // Repeat the above replaceAll() tests using the plain C API, which
5293 // has a separate implementation internally.
5294 // TODO: factor out the test data.
5296 status
= U_ZERO_ERROR
;
5297 URegularExpression
*re
= uregex_openC("..(?<one>m)(.)(.)", 0, NULL
, &status
);
5299 text
= UnicodeString("abcmxyz");
5300 uregex_setText(re
, text
.getBuffer(), text
.length(), &status
);
5303 UChar resultBuf
[100];
5304 int32_t resultLength
;
5307 status
= U_ZERO_ERROR
;
5308 repl
= UnicodeString("<$0>");
5309 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5311 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf
, resultLength
));
5313 status
= U_ZERO_ERROR
;
5314 repl
= UnicodeString("<$1>");
5315 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5317 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf
, resultLength
));
5319 status
= U_ZERO_ERROR
;
5320 repl
= UnicodeString("<${one}>");
5321 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5323 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf
, resultLength
));
5325 status
= U_ZERO_ERROR
;
5326 repl
= UnicodeString("<$2>");
5327 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5329 REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf
, resultLength
));
5331 status
= U_ZERO_ERROR
;
5332 repl
= UnicodeString("<$3>");
5333 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5335 REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf
, resultLength
));
5337 status
= U_ZERO_ERROR
;
5338 repl
= UnicodeString("<$4>");
5339 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5340 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
5342 status
= U_ZERO_ERROR
;
5343 repl
= UnicodeString("<$04>");
5344 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5346 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf
, resultLength
));
5348 status
= U_ZERO_ERROR
;
5349 repl
= UnicodeString("<$000016>");
5350 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5352 REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf
, resultLength
));
5354 status
= U_ZERO_ERROR
;
5355 repl
= UnicodeString("<$3$2$1${one}>");
5356 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5358 REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf
, resultLength
));
5360 status
= U_ZERO_ERROR
;
5361 repl
= UnicodeString("$3$2$1${one}");
5362 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5364 REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf
, resultLength
));
5366 status
= U_ZERO_ERROR
;
5367 repl
= UnicodeString("<${noSuchName}>");
5368 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5369 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5371 status
= U_ZERO_ERROR
;
5372 repl
= UnicodeString("<${invalid-name}>");
5373 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5374 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5376 status
= U_ZERO_ERROR
;
5377 repl
= UnicodeString("<${one");
5378 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5379 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5381 status
= U_ZERO_ERROR
;
5382 repl
= UnicodeString("$not a capture group");
5383 resultLength
= uregex_replaceAll(re
, repl
.getBuffer(), repl
.length(), resultBuf
, UPRV_LENGTHOF(resultBuf
), &status
);
5384 REGEX_ASSERT(status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
5389 //--------------------------------------------------------------
5391 // NamedCaptureLimits Patterns with huge numbers of named capture groups.
5392 // The point is not so much what the exact limit is,
5393 // but that a largish number doesn't hit bad non-linear performance,
5394 // and that exceeding the limit fails cleanly.
5396 //--------------------------------------------------------------
5397 void RegexTest::NamedCaptureLimits() {
5399 logln("Skipping test. Runs in exhuastive mode only.");
5402 const int32_t goodLimit
= 1000000; // Pattern w this many groups builds successfully.
5403 const int32_t failLimit
= 10000000; // Pattern exceeds internal limits, fails to compile.
5405 UnicodeString pattern
;
5408 for (nn
=1; nn
<goodLimit
; nn
++) {
5409 sprintf(nnbuf
, "(?<nn%d>)", nn
);
5410 pattern
.append(UnicodeString(nnbuf
, -1, US_INV
));
5412 UErrorCode status
= U_ZERO_ERROR
;
5413 RegexPattern
*pat
= RegexPattern::compile(pattern
, 0, status
);
5415 for (nn
=1; nn
<goodLimit
; nn
++) {
5416 sprintf(nnbuf
, "nn%d", nn
);
5417 int32_t groupNum
= pat
->groupNumberFromName(nnbuf
, -1, status
);
5418 REGEX_ASSERT(nn
== groupNum
);
5419 if (nn
!= groupNum
) {
5426 for (nn
=1; nn
<failLimit
; nn
++) {
5427 sprintf(nnbuf
, "(?<nn%d>)", nn
);
5428 pattern
.append(UnicodeString(nnbuf
, -1, US_INV
));
5430 status
= U_ZERO_ERROR
;
5431 pat
= RegexPattern::compile(pattern
, 0, status
);
5432 REGEX_ASSERT(status
== U_REGEX_PATTERN_TOO_BIG
);
5437 //--------------------------------------------------------------
5439 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
5441 //---------------------------------------------------------------
5442 void RegexTest::Bug7651() {
5443 UnicodeString
pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5444 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5445 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5446 UnicodeString
pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5447 UnicodeString
s("#ff @abcd This is test");
5448 RegexPattern
*REPattern
= NULL
;
5449 RegexMatcher
*REMatcher
= NULL
;
5450 UErrorCode status
= U_ZERO_ERROR
;
5453 REPattern
= RegexPattern::compile(pattern1
, 0, pe
, status
);
5455 REMatcher
= REPattern
->matcher(s
, status
);
5457 REGEX_ASSERT(REMatcher
->find());
5458 REGEX_ASSERT(REMatcher
->start(status
) == 0);
5461 status
= U_ZERO_ERROR
;
5463 REPattern
= RegexPattern::compile(pattern2
, 0, pe
, status
);
5465 REMatcher
= REPattern
->matcher(s
, status
);
5467 REGEX_ASSERT(REMatcher
->find());
5468 REGEX_ASSERT(REMatcher
->start(status
) == 0);
5471 status
= U_ZERO_ERROR
;
5474 void RegexTest::Bug7740() {
5475 UErrorCode status
= U_ZERO_ERROR
;
5476 UnicodeString pattern
= "(a)";
5477 UnicodeString text
= "abcdef";
5478 RegexMatcher
*m
= new RegexMatcher(pattern
, text
, 0, status
);
5480 REGEX_ASSERT(m
->lookingAt(status
));
5482 status
= U_ILLEGAL_ARGUMENT_ERROR
;
5483 UnicodeString s
= m
->group(1, status
); // Bug 7740: segfault here.
5484 REGEX_ASSERT(status
== U_ILLEGAL_ARGUMENT_ERROR
);
5485 REGEX_ASSERT(s
== "");
5489 // Bug 8479: was crashing with a bogus UnicodeString as input.
5491 void RegexTest::Bug8479() {
5492 UErrorCode status
= U_ZERO_ERROR
;
5494 RegexMatcher
* const pMatcher
= new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL
|UREGEX_CASE_INSENSITIVE
, status
);
5496 if (U_SUCCESS(status
))
5500 pMatcher
->reset(str
);
5501 status
= U_ZERO_ERROR
;
5502 pMatcher
->matches(status
);
5503 REGEX_ASSERT(status
== U_ILLEGAL_ARGUMENT_ERROR
);
5510 void RegexTest::Bug7029() {
5511 UErrorCode status
= U_ZERO_ERROR
;
5513 RegexMatcher
* const pMatcher
= new RegexMatcher(".", 0, status
);
5514 UnicodeString text
= "abc.def";
5515 UnicodeString splits
[10];
5517 int32_t numFields
= pMatcher
->split(text
, splits
, 10, status
);
5519 REGEX_ASSERT(numFields
== 8);
5524 // This test is checking for the existence of any supplemental characters that case-fold
5525 // to a bmp character.
5527 // At the time of this writing there are none. If any should appear in a subsequent release
5528 // of Unicode, the code in regular expressions compilation that determines the longest
5529 // possible match for a literal string will need to be enhanced.
5531 // See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5532 // for details on what to do in case of a failure of this test.
5534 void RegexTest::Bug9283() {
5535 #if !UCONFIG_NO_NORMALIZATION
5536 UErrorCode status
= U_ZERO_ERROR
;
5537 UnicodeSet
supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status
);
5541 for (index
=0; ; index
++) {
5542 c
= supplementalsWithCaseFolding
.charAt(index
);
5546 UnicodeString cf
= UnicodeString(c
).foldCase();
5547 REGEX_ASSERT(cf
.length() >= 2);
5549 #endif /* #if !UCONFIG_NO_NORMALIZATION */
5553 void RegexTest::CheckInvBufSize() {
5554 if(inv_next
>=INV_BUFSIZ
) {
5555 errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5556 __FILE__
, INV_BUFSIZ
, inv_next
);
5558 logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__
, INV_BUFSIZ
, inv_next
);
5563 void RegexTest::Bug10459() {
5564 UErrorCode status
= U_ZERO_ERROR
;
5565 UnicodeString
patternString("(txt)");
5566 UnicodeString
txtString("txt");
5568 UText
*utext_pat
= utext_openUnicodeString(NULL
, &patternString
, &status
);
5570 UText
*utext_txt
= utext_openUnicodeString(NULL
, &txtString
, &status
);
5573 URegularExpression
*icu_re
= uregex_openUText(utext_pat
, 0, NULL
, &status
);
5576 uregex_setUText(icu_re
, utext_txt
, &status
);
5579 // The bug was that calling uregex_group() before doing a matching operation
5580 // was causing a segfault. Only for Regular Expressions created from UText.
5581 // It should set an U_REGEX_INVALID_STATE.
5584 int32_t len
= uregex_group(icu_re
, 0, buf
, UPRV_LENGTHOF(buf
), &status
);
5585 REGEX_ASSERT(status
== U_REGEX_INVALID_STATE
);
5586 REGEX_ASSERT(len
== 0);
5588 uregex_close(icu_re
);
5589 utext_close(utext_pat
);
5590 utext_close(utext_txt
);
5593 void RegexTest::TestCaseInsensitiveStarters() {
5594 // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5595 // become stale because of new Unicode characters.
5596 // If it is stale, rerun the generation tool
5597 // svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
5598 // and replace the embedded data in i18n/regexcmp.cpp
5600 for (UChar32 cp
=0; cp
<=0x10ffff; cp
++) {
5601 if (!u_hasBinaryProperty(cp
, UCHAR_CASE_SENSITIVE
)) {
5604 UnicodeSet
s(cp
, cp
);
5605 s
.closeOver(USET_CASE_INSENSITIVE
);
5606 UnicodeSetIterator
setIter(s
);
5607 while (setIter
.next()) {
5608 if (!setIter
.isString()) {
5611 const UnicodeString
&str
= setIter
.getString();
5612 UChar32 firstChar
= str
.char32At(0);
5613 UnicodeSet starters
;
5614 RegexCompile::findCaseInsensitiveStarters(firstChar
, &starters
);
5615 if (!starters
.contains(cp
)) {
5616 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp
, firstChar
);
5624 void RegexTest::TestBug11049() {
5625 // Original bug report: pattern with match start consisting of one of several individual characters,
5626 // and the text being matched ending with a supplementary character. find() would read past the
5627 // end of the input text when searching for potential match starting points.
5629 // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5630 // detect the bad read.
5632 TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE
, __LINE__
);
5633 TestCase11049("A|B|C", "string matches at end C", TRUE
, __LINE__
);
5635 // Test again with a pattern starting with a single character,
5636 // which takes a different code path than starting with an OR expression,
5637 // but with similar logic.
5638 TestCase11049("C", "a string \\ud800\\udc00", FALSE
, __LINE__
);
5639 TestCase11049("C", "string matches at end C", TRUE
, __LINE__
);
5642 // Run a single test case from TestBug11049(). Internal function.
5643 void RegexTest::TestCase11049(const char *pattern
, const char *data
, UBool expectMatch
, int32_t lineNumber
) {
5644 UErrorCode status
= U_ZERO_ERROR
;
5645 UnicodeString patternString
= UnicodeString(pattern
).unescape();
5646 LocalPointer
<RegexPattern
> compiledPat(RegexPattern::compile(patternString
, 0, status
));
5648 UnicodeString dataString
= UnicodeString(data
).unescape();
5649 UChar
*exactBuffer
= new UChar
[dataString
.length()];
5650 dataString
.extract(exactBuffer
, dataString
.length(), status
);
5651 UText
*ut
= utext_openUChars(NULL
, exactBuffer
, dataString
.length(), &status
);
5653 LocalPointer
<RegexMatcher
> matcher(compiledPat
->matcher(status
));
5656 UBool result
= matcher
->find();
5657 if (result
!= expectMatch
) {
5658 errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5659 __FILE__
, lineNumber
, expectMatch
, result
, pattern
, data
);
5662 // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5663 // off-by-one on find() with match at the last code point.
5664 // Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
5665 // because string.unescape() will only shrink it.
5666 char * utf8Buffer
= new char[uprv_strlen(data
)+1];
5667 u_strToUTF8(utf8Buffer
, uprv_strlen(data
)+1, NULL
, dataString
.getBuffer(), dataString
.length(), &status
);
5669 ut
= utext_openUTF8(ut
, utf8Buffer
, -1, &status
);
5672 result
= matcher
->find();
5673 if (result
!= expectMatch
) {
5674 errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5675 __FILE__
, lineNumber
, expectMatch
, result
, pattern
, data
);
5677 delete [] utf8Buffer
;
5680 delete [] exactBuffer
;
5684 void RegexTest::TestBug11371() {
5686 logln("Skipping test. Runs in exhuastive mode only.");
5689 UErrorCode status
= U_ZERO_ERROR
;
5690 UnicodeString patternString
;
5692 for (int i
=0; i
<8000000; i
++) {
5693 patternString
.append(UnicodeString("()"));
5695 LocalPointer
<RegexPattern
> compiledPat(RegexPattern::compile(patternString
, 0, status
));
5696 if (status
!= U_REGEX_PATTERN_TOO_BIG
) {
5697 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5698 __FILE__
, __LINE__
, u_errorName(status
));
5701 status
= U_ZERO_ERROR
;
5702 patternString
= "(";
5703 for (int i
=0; i
<20000000; i
++) {
5704 patternString
.append(UnicodeString("A++"));
5706 patternString
.append(UnicodeString("){0}B++"));
5707 LocalPointer
<RegexPattern
> compiledPat2(RegexPattern::compile(patternString
, 0, status
));
5708 if (status
!= U_REGEX_PATTERN_TOO_BIG
) {
5709 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5710 __FILE__
, __LINE__
, u_errorName(status
));
5713 // Pattern with too much string data, such that string indexes overflow operand data field size
5714 // in compiled instruction.
5715 status
= U_ZERO_ERROR
;
5717 while (patternString
.length() < 0x00ffffff) {
5718 patternString
.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5720 patternString
.append(UnicodeString("X? trailing string"));
5721 LocalPointer
<RegexPattern
> compiledPat3(RegexPattern::compile(patternString
, 0, status
));
5722 if (status
!= U_REGEX_PATTERN_TOO_BIG
) {
5723 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5724 __FILE__
, __LINE__
, u_errorName(status
));
5728 void RegexTest::TestBug11480() {
5729 // C API, get capture group of a group that does not participate in the match.
5730 // (Returns a zero length string, with nul termination,
5731 // indistinguishable from a group with a zero length match.)
5733 UErrorCode status
= U_ZERO_ERROR
;
5734 URegularExpression
*re
= uregex_openC("(A)|(B)", 0, NULL
, &status
);
5736 UnicodeString text
= UNICODE_STRING_SIMPLE("A");
5737 uregex_setText(re
, text
.getBuffer(), text
.length(), &status
);
5739 REGEX_ASSERT(uregex_lookingAt(re
, 0, &status
));
5740 UChar buf
[10] = {(UChar
)13, (UChar
)13, (UChar
)13, (UChar
)13};
5741 int32_t length
= uregex_group(re
, 2, buf
+1, UPRV_LENGTHOF(buf
)-1, &status
);
5742 REGEX_ASSERT(length
== 0);
5743 REGEX_ASSERT(buf
[0] == 13);
5744 REGEX_ASSERT(buf
[1] == 0);
5745 REGEX_ASSERT(buf
[2] == 13);
5748 // UText C++ API, length of match is 0 for non-participating matches.
5749 UText ut
= UTEXT_INITIALIZER
;
5750 utext_openUnicodeString(&ut
, &text
, &status
);
5751 RegexMatcher
matcher(UnicodeString("(A)|(B)"), 0, status
);
5754 REGEX_ASSERT(matcher
.lookingAt(0, status
));
5756 // UText C++ API, Capture group 1 matches "A", position 0, length 1.
5757 int64_t groupLen
= -666;
5758 UText group
= UTEXT_INITIALIZER
;
5759 matcher
.group(1, &group
, groupLen
, status
);
5761 REGEX_ASSERT(groupLen
== 1);
5762 REGEX_ASSERT(utext_getNativeIndex(&group
) == 0);
5764 // Capture group 2, the (B), does not participate in the match.
5765 matcher
.group(2, &group
, groupLen
, status
);
5767 REGEX_ASSERT(groupLen
== 0);
5768 REGEX_ASSERT(matcher
.start(2, status
) == -1);
5772 void RegexTest::TestBug12884() {
5773 // setTimeLimit() was not effective for empty sub-patterns with large {minimum counts}
5774 UnicodeString
pattern(u
"(((((((){120}){11}){11}){11}){80}){11}){4}");
5775 UnicodeString
text(u
"hello");
5776 UErrorCode status
= U_ZERO_ERROR
;
5777 RegexMatcher
m(pattern
, text
, 0, status
);
5779 m
.setTimeLimit(5, status
);
5781 REGEX_ASSERT(status
== U_REGEX_TIME_OUT
);
5783 // Non-greedy loops. They take a different code path during matching.
5784 UnicodeString
ngPattern(u
"(((((((){120}?){11}?){11}?){11}?){80}?){11}?){4}?");
5785 status
= U_ZERO_ERROR
;
5786 RegexMatcher
ngM(ngPattern
, text
, 0, status
);
5788 ngM
.setTimeLimit(5, status
);
5790 REGEX_ASSERT(status
== U_REGEX_TIME_OUT
);
5792 // UText, wrapping non-UTF-16 text, also takes a different execution path.
5793 const char *text8
= u8
"¿Qué es Unicode? Unicode proporciona un número único para cada"
5794 "carácter, sin importar la plataforma, sin importar el programa,"
5795 "sin importar el idioma.";
5796 status
= U_ZERO_ERROR
;
5797 LocalUTextPointer
ut(utext_openUTF8(NULL
, text8
, -1, &status
));
5799 m
.reset(ut
.getAlias());
5801 REGEX_ASSERT(status
== U_REGEX_TIME_OUT
);
5803 status
= U_ZERO_ERROR
;
5804 ngM
.reset(ut
.getAlias());
5806 REGEX_ASSERT(status
== U_REGEX_TIME_OUT
);
5809 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */