1 /********************************************************************
3 * Copyright (c) 2002-2010, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
10 // ICU Regular Expressions test, part of intltest.
14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
16 #include "unicode/regex.h"
17 #include "unicode/uchar.h"
18 #include "unicode/ucnv.h"
19 #include "unicode/ustring.h"
29 #define SUPPORT_MUTATING_INPUT_STRING 0
31 //---------------------------------------------------------------------------
33 // Test class boilerplate
35 //---------------------------------------------------------------------------
// Constructor of the regular-expression test suite class.
// NOTE(review): the constructor body is not visible in this excerpt.
36 RegexTest::RegexTest()
// Destructor of the regular-expression test suite class.
// NOTE(review): the destructor body is not visible in this excerpt.
41 RegexTest::~RegexTest()
// Test dispatcher for this suite.
//
//   index  selects one test case
//   exec   when true, the selected test is actually run; the name is
//          assigned either way
//   name   receives the display name of the selected test
//   (the fourth parameter is unused)
//
// NOTE(review): several cases below (0, 5, 9, 16, 17) show only the name
// assignment in this excerpt; their "if (exec) ..." calls and the case under
// "#if !UCONFIG_NO_FILE_IO" are not visible here - confirm against the full file.
47 void RegexTest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* /*par*/ )
// Announce the suite once per dispatch when tests are really being executed.
49 if (exec
) logln("TestSuite RegexTest: ");
52 case 0: name
= "Basic";
55 case 1: name
= "API_Match";
56 if (exec
) API_Match();
58 case 2: name
= "API_Replace";
59 if (exec
) API_Replace();
61 case 3: name
= "API_Pattern";
62 if (exec
) API_Pattern();
// The case guarded here depends on file I/O being available in this build.
65 #if !UCONFIG_NO_FILE_IO
72 case 5: name
= "Errors";
75 case 6: name
= "PerlTests";
76 if (exec
) PerlTests();
78 case 7: name
= "Callbacks";
79 if (exec
) Callbacks();
81 case 8: name
= "FindProgressCallbacks";
82 if (exec
) FindProgressCallbacks();
84 case 9: name
= "Bug 6149";
// Cases 10-15 run the UText / UTF-8 variants of the API and Perl tests.
87 case 10: name
= "UTextBasic";
88 if (exec
) UTextBasic();
90 case 11: name
= "API_Match_UTF8";
91 if (exec
) API_Match_UTF8();
93 case 12: name
= "API_Replace_UTF8";
94 if (exec
) API_Replace_UTF8();
96 case 13: name
= "API_Pattern_UTF8";
97 if (exec
) API_Pattern_UTF8();
99 case 14: name
= "PerlTestsUTF8";
100 if (exec
) PerlTestsUTF8();
102 case 15: name
= "PreAllocatedUTextCAPI";
103 if (exec
) PreAllocatedUTextCAPI();
105 case 16: name
= "Bug 7651";
108 case 17: name
= "Bug 7740";
113 break; //needed to end loop
119 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
121 * @see utext_openUTF8
123 static UText
* regextst_openUTF8FromInvariant(UText
* ut
, const char *inv
, int64_t length
, UErrorCode
*status
);
// Definition of regextst_openUTF8FromInvariant().
//
// Opens a UTF-8 UText over invariant-character text: on ASCII-family builds
// the text is handed to utext_openUTF8() unchanged; otherwise it is first
// copied through uprv_aestrncpy() into 'buf' and the copy is opened instead.
125 static UText
* regextst_openUTF8FromInvariant(UText
*ut
, const char *inv
, int64_t length
, UErrorCode
*status
) {
// ASCII-family charset: the invariant text is passed straight through.
126 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
127 return utext_openUTF8(ut
, inv
, length
, status
);
// Other charset families: convert into 'buf' first.
// NOTE(review): the declaration of 'buf' is not visible in this excerpt. If it
// is a function-local array, the UText returned below would refer to storage
// whose lifetime ends with this call - confirm against the full file.
131 uprv_aestrncpy((uint8_t*)buf
, (const uint8_t*)inv
, length
);
133 return utext_openUTF8(ut
, buf
, length
, status
);
137 //---------------------------------------------------------------------------
139 // Error Checking / Reporting macros used in all of the tests.
141 //---------------------------------------------------------------------------
143 static void utextToPrintable(char *buf
, int32_t bufLen
, UText
*text
) {
144 int64_t oldIndex
= utext_getNativeIndex(text
);
145 utext_setNativeIndex(text
, 0);
147 UChar32 c
= utext_next32From(text
, 0);
148 while ((c
!= U_SENTINEL
) && (bufPtr
< buf
+bufLen
)) {
149 if (0x000020<=c
&& c
<0x00007e) {
153 sprintf(bufPtr
,"U+%04X", c
);
154 bufPtr
+= strlen(bufPtr
)-1;
160 c
= UTEXT_NEXT32(text
);
163 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
164 char *ebuf
= (char*)malloc(bufLen
);
165 uprv_eastrncpy((unsigned char*)ebuf
, (const unsigned char*)buf
, bufLen
);
166 uprv_strncpy(buf
, ebuf
, bufLen
);
169 utext_setNativeIndex(text
, oldIndex
);
// Log a printable rendering of a UText, tagged with the call site and with
// the source text of the macro argument.
#define REGEX_VERBOSE_TEXT(text) { \
    char buf[200]; \
    utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), text); \
    logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf); \
}

// Report a failure and return from the enclosing function if the local
// variable 'status' holds an ICU failure code.
#define REGEX_CHECK_STATUS { \
    if (U_FAILURE(status)) { \
        dataerrln("%s:%d: RegexTest failure. status=%s", \
                  __FILE__, __LINE__, u_errorName(status)); \
        return; \
    } \
}

// Report a failure if expr evaluates to FALSE.  Execution continues.
#define REGEX_ASSERT(expr) { \
    if ((expr)==FALSE) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr); \
    } \
}

// Evaluate expr against a fresh local 'status' (shadowing any outer one) and
// report a failure unless that status ends up equal to errcode.
#define REGEX_ASSERT_FAIL(expr, errcode) { \
    UErrorCode status=U_ZERO_ERROR; \
    (expr); \
    if (status!=errcode) { \
        dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \
                  __LINE__, u_errorName(errcode), u_errorName(status)); \
    } \
}

// Like REGEX_CHECK_STATUS, but also reports the caller-supplied line and
// does not return.
#define REGEX_CHECK_STATUS_L(line) { \
    if (U_FAILURE(status)) { \
        errln("RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); \
    } \
}

// Like REGEX_ASSERT, but also reports the caller-supplied line and returns
// from the enclosing function on failure.
#define REGEX_ASSERT_L(expr, line) { \
    if ((expr)==FALSE) { \
        errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); \
        return; \
    } \
}
190 * @param expected expected text in UTF-8 (not platform) codepage
192 void RegexTest::assertUText(const char *expected
, UText
*actual
, const char *file
, int line
) {
193 UErrorCode status
= U_ZERO_ERROR
;
194 UText expectedText
= UTEXT_INITIALIZER
;
195 utext_openUTF8(&expectedText
, expected
, -1, &status
);
196 if(U_FAILURE(status
)) {
197 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file
, line
, u_errorName(status
), strlen(expected
));
200 if(utext_nativeLength(&expectedText
)==0 && (strlen(expected
)!=0)) {
201 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file
, line
, strlen(expected
));
204 utext_setNativeIndex(actual
, 0);
205 if (utext_compare(&expectedText
, -1, actual
, -1) != 0) {
206 char buf
[201 /*21*/];
207 char expectedBuf
[201];
208 utextToPrintable(buf
, sizeof(buf
)/sizeof(buf
[0]), actual
);
209 utextToPrintable(expectedBuf
, sizeof(expectedBuf
)/sizeof(expectedBuf
[0]), &expectedText
);
210 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file
, line
, expectedBuf
, (int)utext_nativeLength(&expectedText
), buf
, (int)utext_nativeLength(actual
));
212 utext_close(&expectedText
);
215 * @param expected invariant (platform local text) input
218 void RegexTest::assertUTextInvariant(const char *expected
, UText
*actual
, const char *file
, int line
) {
219 UErrorCode status
= U_ZERO_ERROR
;
220 UText expectedText
= UTEXT_INITIALIZER
;
221 regextst_openUTF8FromInvariant(&expectedText
, expected
, -1, &status
);
222 if(U_FAILURE(status
)) {
223 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file
, line
, u_errorName(status
), strlen(expected
));
226 utext_setNativeIndex(actual
, 0);
227 if (utext_compare(&expectedText
, -1, actual
, -1) != 0) {
228 char buf
[201 /*21*/];
229 char expectedBuf
[201];
230 utextToPrintable(buf
, sizeof(buf
)/sizeof(buf
[0]), actual
);
231 utextToPrintable(expectedBuf
, sizeof(expectedBuf
)/sizeof(expectedBuf
[0]), &expectedText
);
232 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file
, line
, expectedBuf
, (int)utext_nativeLength(&expectedText
), buf
, (int)utext_nativeLength(actual
));
234 utext_close(&expectedText
);
/**
 * Assert that the UText 'actual' equals 'expected'.
 * Assumes utf-8 input; forwards the call site to assertUText().
 */
#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) \
    assertUText((expected), (actual), __FILE__, __LINE__)

/**
 * Assert that the UText 'actual' equals 'expected'.
 * Assumes Invariant input; forwards the call site to assertUTextInvariant().
 */
#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) \
    assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
//---------------------------------------------------------------------------
//
//    REGEX_TESTLM    Macro + invocation functions that make it quick to
//                    write tests for lookingAt() and matches().
//
//       usage:
//          REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
//
//          The expected results are UBool - TRUE or FALSE.
//          The input text is unescaped.  The pattern is not.
//
//          Every use runs the case twice: once through doRegexLMTest() and
//          once through doRegexLMTestUTF8(), both tagged with the call line.
//
//---------------------------------------------------------------------------
#define REGEX_TESTLM(pat, text, looking, match) { \
    doRegexLMTest(pat, text, looking, match, __LINE__); \
    doRegexLMTestUTF8(pat, text, looking, match, __LINE__); \
}
263 UBool
RegexTest::doRegexLMTest(const char *pat
, const char *text
, UBool looking
, UBool match
, int32_t line
) {
264 const UnicodeString
pattern(pat
, -1, US_INV
);
265 const UnicodeString
inputText(text
, -1, US_INV
);
266 UErrorCode status
= U_ZERO_ERROR
;
268 RegexPattern
*REPattern
= NULL
;
269 RegexMatcher
*REMatcher
= NULL
;
272 UnicodeString
patString(pat
, -1, US_INV
);
273 REPattern
= RegexPattern::compile(patString
, 0, pe
, status
);
274 if (U_FAILURE(status
)) {
275 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
276 line
, u_errorName(status
));
279 if (line
==376) { RegexPatternDump(REPattern
);}
281 UnicodeString
inputString(inputText
);
282 UnicodeString unEscapedInput
= inputString
.unescape();
283 REMatcher
= REPattern
->matcher(unEscapedInput
, status
);
284 if (U_FAILURE(status
)) {
285 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
286 line
, u_errorName(status
));
291 actualmatch
= REMatcher
->lookingAt(status
);
292 if (U_FAILURE(status
)) {
293 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
294 line
, u_errorName(status
));
297 if (actualmatch
!= looking
) {
298 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line
);
302 status
= U_ZERO_ERROR
;
303 actualmatch
= REMatcher
->matches(status
);
304 if (U_FAILURE(status
)) {
305 errln("RegexTest failure in matches() at line %d. Status = %s\n",
306 line
, u_errorName(status
));
309 if (actualmatch
!= match
) {
310 errln("RegexTest: wrong return from matches() at line %d.\n", line
);
314 if (retVal
== FALSE
) {
315 RegexPatternDump(REPattern
);
324 UBool
RegexTest::doRegexLMTestUTF8(const char *pat
, const char *text
, UBool looking
, UBool match
, int32_t line
) {
325 UText pattern
= UTEXT_INITIALIZER
;
326 int32_t inputUTF8Length
;
327 char *textChars
= NULL
;
328 UText inputText
= UTEXT_INITIALIZER
;
329 UErrorCode status
= U_ZERO_ERROR
;
331 RegexPattern
*REPattern
= NULL
;
332 RegexMatcher
*REMatcher
= NULL
;
335 regextst_openUTF8FromInvariant(&pattern
, pat
, -1, &status
);
336 REPattern
= RegexPattern::compile(&pattern
, 0, pe
, status
);
337 if (U_FAILURE(status
)) {
338 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
339 line
, u_errorName(status
));
343 UnicodeString
inputString(text
, -1, US_INV
);
344 UnicodeString unEscapedInput
= inputString
.unescape();
345 LocalUConverterPointer
UTF8Converter(ucnv_open("UTF8", &status
));
346 ucnv_setFromUCallBack(UTF8Converter
.getAlias(), UCNV_FROM_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &status
);
348 inputUTF8Length
= unEscapedInput
.extract(NULL
, 0, UTF8Converter
.getAlias(), status
);
349 if (U_FAILURE(status
) && status
!= U_BUFFER_OVERFLOW_ERROR
) {
350 // UTF-8 does not allow unpaired surrogates, so this could actually happen
351 logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line
, u_errorName(status
));
352 return TRUE
; // not a failure of the Regex engine
354 status
= U_ZERO_ERROR
; // buffer overflow
355 textChars
= new char[inputUTF8Length
+1];
356 unEscapedInput
.extract(textChars
, inputUTF8Length
+1, UTF8Converter
.getAlias(), status
);
357 utext_openUTF8(&inputText
, textChars
, inputUTF8Length
, &status
);
359 REMatcher
= REPattern
->matcher(&inputText
, RegexPattern::PATTERN_IS_UTEXT
, status
);
360 if (U_FAILURE(status
)) {
361 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
362 line
, u_errorName(status
));
367 actualmatch
= REMatcher
->lookingAt(status
);
368 if (U_FAILURE(status
)) {
369 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
370 line
, u_errorName(status
));
373 if (actualmatch
!= looking
) {
374 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line
);
378 status
= U_ZERO_ERROR
;
379 actualmatch
= REMatcher
->matches(status
);
380 if (U_FAILURE(status
)) {
381 errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
382 line
, u_errorName(status
));
385 if (actualmatch
!= match
) {
386 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line
);
390 if (retVal
== FALSE
) {
391 RegexPatternDump(REPattern
);
396 utext_close(&inputText
);
397 utext_close(&pattern
);
//---------------------------------------------------------------------------
//
//    REGEX_ERR    Macro + invocation function that make it quick to write
//                 tests for incorrect patterns.
//
//       usage:
//          REGEX_ERR("pattern", expected error line, column, expected status);
//
//    Note that the expansion already ends with a semicolon.
//
//---------------------------------------------------------------------------
#define REGEX_ERR(pat, line, col, status) \
    regex_err(pat, line, col, status, __LINE__);
415 void RegexTest::regex_err(const char *pat
, int32_t errLine
, int32_t errCol
,
416 UErrorCode expectedStatus
, int32_t line
) {
417 UnicodeString
pattern(pat
);
419 UErrorCode status
= U_ZERO_ERROR
;
421 RegexPattern
*callerPattern
= NULL
;
424 // Compile the caller's pattern
426 UnicodeString
patString(pat
);
427 callerPattern
= RegexPattern::compile(patString
, 0, pe
, status
);
428 if (status
!= expectedStatus
) {
429 dataerrln("Line %d: unexpected error %s compiling pattern.", line
, u_errorName(status
));
431 if (status
!= U_ZERO_ERROR
) {
432 if (pe
.line
!= errLine
|| pe
.offset
!= errCol
) {
433 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
434 line
, errLine
, errCol
, pe
.line
, pe
.offset
);
439 delete callerPattern
;
442 // Compile again, using a UTF-8-based UText
444 UText patternText
= UTEXT_INITIALIZER
;
445 regextst_openUTF8FromInvariant(&patternText
, pat
, -1, &status
);
446 callerPattern
= RegexPattern::compile(&patternText
, 0, pe
, status
);
447 if (status
!= expectedStatus
) {
448 dataerrln("Line %d: unexpected error %s compiling pattern.", line
, u_errorName(status
));
450 if (status
!= U_ZERO_ERROR
) {
451 if (pe
.line
!= errLine
|| pe
.offset
!= errCol
) {
452 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
453 line
, errLine
, errCol
, pe
.line
, pe
.offset
);
458 delete callerPattern
;
459 utext_close(&patternText
);
464 //---------------------------------------------------------------------------
466 // Basic Check for basic functionality of regex pattern matching.
467 // Avoid the use of REGEX_FIND test macro, which has
468 // substantial dependencies on basic Regex functionality.
470 //---------------------------------------------------------------------------
// Basic sanity checks of pattern matching.
//
// Every REGEX_TESTLM(pattern, input, lookingAt-expected, matches-expected)
// below compiles the pattern, applies it to the (unescaped) input and checks
// the results of lookingAt() and matches() against the two expected values.
471 void RegexTest::Basic() {
475 // Debug - slide failing test cases early
479 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
// NOTE(review): in this excerpt the result of the compile() call below is
// neither stored nor deleted, and 'pe' has no visible declaration - confirm
// against the full file whether this debug section is compiled at all.
481 UErrorCode status
= U_ZERO_ERROR
;
482 RegexPattern::compile("^(?:a?b?)*$", 0, pe
, status
);
483 // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
484 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
491 // Pattern with parentheses
493 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE
, FALSE
);
494 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE
, TRUE
);
495 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE
, FALSE
);
// '*' applied to a parenthesized group, and to a single char.
500 REGEX_TESTLM("st(abc)*ring", "string", TRUE
, TRUE
);
501 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE
, TRUE
);
502 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE
, TRUE
);
503 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE
, FALSE
);
504 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE
, FALSE
);
506 REGEX_TESTLM("a*", "", TRUE
, TRUE
);
507 REGEX_TESTLM("a*", "b", TRUE
, FALSE
);
// '.' in patterns.
513 REGEX_TESTLM(".", "abc", TRUE
, FALSE
);
514 REGEX_TESTLM("...", "abc", TRUE
, TRUE
);
515 REGEX_TESTLM("....", "abc", FALSE
, FALSE
);
516 REGEX_TESTLM(".*", "abcxyz123", TRUE
, TRUE
);
517 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE
, FALSE
);
518 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE
, TRUE
);
519 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE
, TRUE
);
520 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE
, FALSE
);
523 // Patterns with * applied to chars at end of literal string
525 REGEX_TESTLM("abc*", "ab", TRUE
, TRUE
);
526 REGEX_TESTLM("abc*", "abccccc", TRUE
, TRUE
);
529 // Supplemental chars match as single chars, not a pair of surrogates.
531 REGEX_TESTLM(".", "\\U00011000", TRUE
, TRUE
);
532 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE
, TRUE
);
533 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE
, FALSE
);
537 // UnicodeSets in the pattern
539 REGEX_TESTLM("[1-6]", "1", TRUE
, TRUE
);
540 REGEX_TESTLM("[1-6]", "3", TRUE
, TRUE
);
541 REGEX_TESTLM("[1-6]", "7", FALSE
, FALSE
);
// NOTE(review): the next two cases are identical.
542 REGEX_TESTLM("a[1-6]", "a3", TRUE
, TRUE
);
543 REGEX_TESTLM("a[1-6]", "a3", TRUE
, TRUE
);
544 REGEX_TESTLM("a[1-6]b", "a3b", TRUE
, TRUE
);
546 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE
, TRUE
);
547 REGEX_TESTLM("a[0-9]*b", "abc", TRUE
, FALSE
);
548 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE
, TRUE
);
549 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE
, FALSE
); // note that * matches 0 occurrences.
550 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE
, TRUE
);
553 // OR operator in patterns
555 REGEX_TESTLM("(a|b)", "a", TRUE
, TRUE
);
556 REGEX_TESTLM("(a|b)", "b", TRUE
, TRUE
);
557 REGEX_TESTLM("(a|b)", "c", FALSE
, FALSE
);
558 REGEX_TESTLM("a|b", "b", TRUE
, TRUE
);
560 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE
, TRUE
);
561 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE
, FALSE
);
562 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE
, TRUE
);
563 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE
, TRUE
);
564 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE
, TRUE
);
565 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE
, FALSE
);
// '+' quantifier.
570 REGEX_TESTLM("ab+", "abbc", TRUE
, FALSE
);
571 REGEX_TESTLM("ab+c", "ac", FALSE
, FALSE
);
572 REGEX_TESTLM("b+", "", FALSE
, FALSE
);
573 REGEX_TESTLM("(abc|def)+", "defabc", TRUE
, TRUE
);
574 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE
, FALSE
);
575 REGEX_TESTLM(".+y", "zippity dooy", TRUE
, TRUE
);
// '?' quantifier.
580 REGEX_TESTLM("ab?", "ab", TRUE
, TRUE
);
581 REGEX_TESTLM("ab?", "a", TRUE
, TRUE
);
582 REGEX_TESTLM("ab?", "ac", TRUE
, FALSE
);
583 REGEX_TESTLM("ab?", "abb", TRUE
, FALSE
);
584 REGEX_TESTLM("a(b|c)?d", "abd", TRUE
, TRUE
);
585 REGEX_TESTLM("a(b|c)?d", "acd", TRUE
, TRUE
);
586 REGEX_TESTLM("a(b|c)?d", "ad", TRUE
, TRUE
);
587 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE
, FALSE
);
588 REGEX_TESTLM("a(b|c)?d", "ab", FALSE
, FALSE
);
591 // Escape sequences that become single literal chars, handled internally
592 // by ICU's Unescape.
595 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
596 REGEX_TESTLM("\\a", "\\u0007", TRUE
, TRUE
); // BEL
597 REGEX_TESTLM("\\cL", "\\u000c", TRUE
, TRUE
); // Control-L
598 REGEX_TESTLM("\\e", "\\u001b", TRUE
, TRUE
); // Escape
599 REGEX_TESTLM("\\f", "\\u000c", TRUE
, TRUE
); // Form Feed
600 REGEX_TESTLM("\\n", "\\u000a", TRUE
, TRUE
); // new line
601 REGEX_TESTLM("\\r", "\\u000d", TRUE
, TRUE
); // CR
602 REGEX_TESTLM("\\t", "\\u0009", TRUE
, TRUE
); // Tab
603 REGEX_TESTLM("\\u1234", "\\u1234", TRUE
, TRUE
);
604 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE
, TRUE
);
606 REGEX_TESTLM(".*\\Ax", "xyz", TRUE
, FALSE
); // \A matches only at the beginning of input
607 REGEX_TESTLM(".*\\Ax", " xyz", FALSE
, FALSE
); // \A matches only at the beginning of input
609 // Escape of special chars in patterns
610 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE
, TRUE
);
614 //---------------------------------------------------------------------------
616 // UTextBasic Check for quirks that are specific to the UText
619 //---------------------------------------------------------------------------
620 void RegexTest::UTextBasic() {
621 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
622 UErrorCode status
= U_ZERO_ERROR
;
623 UText pattern
= UTEXT_INITIALIZER
;
624 utext_openUTF8(&pattern
, str_abc
, -1, &status
);
625 RegexMatcher
matcher(&pattern
, 0, status
);
628 UText input
= UTEXT_INITIALIZER
;
629 utext_openUTF8(&input
, str_abc
, -1, &status
);
631 matcher
.reset(&input
);
633 REGEX_ASSERT_UTEXT_UTF8(str_abc
, matcher
.inputText());
635 matcher
.reset(matcher
.inputText());
637 REGEX_ASSERT_UTEXT_UTF8(str_abc
, matcher
.inputText());
639 utext_close(&pattern
);
644 //---------------------------------------------------------------------------
646 // API_Match Test that the API for class RegexMatcher
647 // is present and nominally working, but excluding functions
648 // implementing replace operations.
650 //---------------------------------------------------------------------------
651 void RegexTest::API_Match() {
653 UErrorCode status
=U_ZERO_ERROR
;
657 // Debug - slide failing test cases early
666 // Simple pattern compilation
669 UnicodeString
re("abc");
671 pat2
= RegexPattern::compile(re
, flags
, pe
, status
);
674 UnicodeString inStr1
= "abcdef this is a test";
675 UnicodeString instr2
= "not abc";
676 UnicodeString empty
= "";
680 // Matcher creation and reset.
682 RegexMatcher
*m1
= pat2
->matcher(inStr1
, status
);
684 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
685 REGEX_ASSERT(m1
->input() == inStr1
);
687 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
688 REGEX_ASSERT(m1
->input() == instr2
);
690 REGEX_ASSERT(m1
->input() == inStr1
);
691 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
693 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
694 REGEX_ASSERT(m1
->input() == empty
);
695 REGEX_ASSERT(&m1
->pattern() == pat2
);
698 // reset(pos, status)
701 m1
->reset(4, status
);
703 REGEX_ASSERT(m1
->input() == inStr1
);
704 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
706 m1
->reset(-1, status
);
707 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
708 status
= U_ZERO_ERROR
;
710 m1
->reset(0, status
);
712 status
= U_ZERO_ERROR
;
714 int32_t len
= m1
->input().length();
715 m1
->reset(len
-1, status
);
717 status
= U_ZERO_ERROR
;
719 m1
->reset(len
, status
);
721 status
= U_ZERO_ERROR
;
723 m1
->reset(len
+1, status
);
724 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
725 status
= U_ZERO_ERROR
;
728 // match(pos, status)
731 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
733 REGEX_ASSERT(m1
->matches(3, status
) == FALSE
);
735 REGEX_ASSERT(m1
->matches(5, status
) == FALSE
);
736 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
737 REGEX_ASSERT(m1
->matches(-1, status
) == FALSE
);
738 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
740 // Match() at end of string should fail, but should not
742 status
= U_ZERO_ERROR
;
743 len
= m1
->input().length();
744 REGEX_ASSERT(m1
->matches(len
, status
) == FALSE
);
747 // Match beyond end of string should fail with an error.
748 status
= U_ZERO_ERROR
;
749 REGEX_ASSERT(m1
->matches(len
+1, status
) == FALSE
);
750 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
752 // Successful match at end of string.
754 status
= U_ZERO_ERROR
;
755 RegexMatcher
m("A?", 0, status
); // will match zero length string.
758 len
= inStr1
.length();
759 REGEX_ASSERT(m
.matches(len
, status
) == TRUE
);
762 REGEX_ASSERT(m
.matches(0, status
) == TRUE
);
768 // lookingAt(pos, status)
770 status
= U_ZERO_ERROR
;
771 m1
->reset(instr2
); // "not abc"
772 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
773 REGEX_ASSERT(m1
->lookingAt(5, status
) == FALSE
);
774 REGEX_ASSERT(m1
->lookingAt(3, status
) == FALSE
);
775 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
776 REGEX_ASSERT(m1
->lookingAt(-1, status
) == FALSE
);
777 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
778 status
= U_ZERO_ERROR
;
779 len
= m1
->input().length();
780 REGEX_ASSERT(m1
->lookingAt(len
, status
) == FALSE
);
782 REGEX_ASSERT(m1
->lookingAt(len
+1, status
) == FALSE
);
783 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
792 // RegexMatcher::start();
793 // RegexMatcher::end();
794 // RegexMatcher::groupCount();
799 UErrorCode status
=U_ZERO_ERROR
;
801 UnicodeString
re("01(23(45)67)(.*)");
802 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
804 UnicodeString data
= "0123456789";
806 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
808 REGEX_ASSERT(matcher
->lookingAt(status
) == TRUE
);
809 static const int32_t matchStarts
[] = {0, 2, 4, 8};
810 static const int32_t matchEnds
[] = {10, 8, 6, 10};
812 for (i
=0; i
<4; i
++) {
813 int32_t actualStart
= matcher
->start(i
, status
);
815 if (actualStart
!= matchStarts
[i
]) {
816 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
817 __LINE__
, i
, matchStarts
[i
], actualStart
);
819 int32_t actualEnd
= matcher
->end(i
, status
);
821 if (actualEnd
!= matchEnds
[i
]) {
822 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
823 __LINE__
, i
, matchEnds
[i
], actualEnd
);
827 REGEX_ASSERT(matcher
->start(0, status
) == matcher
->start(status
));
828 REGEX_ASSERT(matcher
->end(0, status
) == matcher
->end(status
));
830 REGEX_ASSERT_FAIL(matcher
->start(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
831 REGEX_ASSERT_FAIL(matcher
->start( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
833 REGEX_ASSERT_FAIL(matcher
->start( 0, status
), U_REGEX_INVALID_STATE
);
835 matcher
->lookingAt(status
);
836 REGEX_ASSERT(matcher
->group(status
) == "0123456789");
837 REGEX_ASSERT(matcher
->group(0, status
) == "0123456789");
838 REGEX_ASSERT(matcher
->group(1, status
) == "234567" );
839 REGEX_ASSERT(matcher
->group(2, status
) == "45" );
840 REGEX_ASSERT(matcher
->group(3, status
) == "89" );
842 REGEX_ASSERT_FAIL(matcher
->group(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
843 REGEX_ASSERT_FAIL(matcher
->group( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
845 REGEX_ASSERT_FAIL(matcher
->group( 0, status
), U_REGEX_INVALID_STATE
);
858 UErrorCode status
=U_ZERO_ERROR
;
860 UnicodeString
re("abc");
861 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
863 UnicodeString data
= ".abc..abc...abc..";
864 // 012345678901234567
866 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
868 REGEX_ASSERT(matcher
->find());
869 REGEX_ASSERT(matcher
->start(status
) == 1);
870 REGEX_ASSERT(matcher
->find());
871 REGEX_ASSERT(matcher
->start(status
) == 6);
872 REGEX_ASSERT(matcher
->find());
873 REGEX_ASSERT(matcher
->start(status
) == 12);
874 REGEX_ASSERT(matcher
->find() == FALSE
);
875 REGEX_ASSERT(matcher
->find() == FALSE
);
878 REGEX_ASSERT(matcher
->find());
879 REGEX_ASSERT(matcher
->start(status
) == 1);
881 REGEX_ASSERT(matcher
->find(0, status
));
882 REGEX_ASSERT(matcher
->start(status
) == 1);
883 REGEX_ASSERT(matcher
->find(1, status
));
884 REGEX_ASSERT(matcher
->start(status
) == 1);
885 REGEX_ASSERT(matcher
->find(2, status
));
886 REGEX_ASSERT(matcher
->start(status
) == 6);
887 REGEX_ASSERT(matcher
->find(12, status
));
888 REGEX_ASSERT(matcher
->start(status
) == 12);
889 REGEX_ASSERT(matcher
->find(13, status
) == FALSE
);
890 REGEX_ASSERT(matcher
->find(16, status
) == FALSE
);
891 REGEX_ASSERT(matcher
->find(17, status
) == FALSE
);
892 REGEX_ASSERT_FAIL(matcher
->start(status
), U_REGEX_INVALID_STATE
);
894 status
= U_ZERO_ERROR
;
895 REGEX_ASSERT_FAIL(matcher
->find(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
896 status
= U_ZERO_ERROR
;
897 REGEX_ASSERT_FAIL(matcher
->find(18, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
899 REGEX_ASSERT(matcher
->groupCount() == 0);
907 // find, with \G in pattern (true if at the end of a previous match).
912 UErrorCode status
=U_ZERO_ERROR
;
914 UnicodeString
re(".*?(?:(\\Gabc)|(abc))", -1, US_INV
);
915 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
917 UnicodeString data
= ".abcabc.abc..";
918 // 012345678901234567
920 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
922 REGEX_ASSERT(matcher
->find());
923 REGEX_ASSERT(matcher
->start(status
) == 0);
924 REGEX_ASSERT(matcher
->start(1, status
) == -1);
925 REGEX_ASSERT(matcher
->start(2, status
) == 1);
927 REGEX_ASSERT(matcher
->find());
928 REGEX_ASSERT(matcher
->start(status
) == 4);
929 REGEX_ASSERT(matcher
->start(1, status
) == 4);
930 REGEX_ASSERT(matcher
->start(2, status
) == -1);
938 // find with zero length matches, match position should bump ahead
943 UErrorCode status
=U_ZERO_ERROR
;
944 RegexMatcher
m("(?= ?)", 0, status
); // This pattern will zero-length matches anywhere,
945 // using an always-true look-ahead.
947 UnicodeString
s(" ");
950 if (m
.find() == FALSE
) {
953 REGEX_ASSERT(m
.start(status
) == i
);
954 REGEX_ASSERT(m
.end(status
) == i
);
958 // Check that the bump goes over surrogate pairs OK
959 s
= UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
963 if (m
.find() == FALSE
) {
966 REGEX_ASSERT(m
.start(status
) == i
);
967 REGEX_ASSERT(m
.end(status
) == i
);
972 // find() loop breaking test.
973 // with pattern of /.?/, should see a series of one char matches, then a single
974 // match of zero length at the end of the input string.
976 UErrorCode status
=U_ZERO_ERROR
;
977 RegexMatcher
m(".?", 0, status
);
979 UnicodeString
s(" ");
982 if (m
.find() == FALSE
) {
985 REGEX_ASSERT(m
.start(status
) == i
);
986 REGEX_ASSERT(m
.end(status
) == (i
<4 ? i
+1 : i
));
993 // Matchers with no input string behave as if they had an empty input string.
997 UErrorCode status
= U_ZERO_ERROR
;
998 RegexMatcher
m(".?", 0, status
);
1000 REGEX_ASSERT(m
.find());
1001 REGEX_ASSERT(m
.start(status
) == 0);
1002 REGEX_ASSERT(m
.input() == "");
1005 UErrorCode status
= U_ZERO_ERROR
;
1006 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
1007 RegexMatcher
*m
= p
->matcher(status
);
1010 REGEX_ASSERT(m
->find() == FALSE
);
1011 REGEX_ASSERT(m
->input() == "");
1020 UErrorCode status
= U_ZERO_ERROR
;
1021 UnicodeString
testString("This is test data");
1022 RegexMatcher
m(".*", testString
, 0, status
);
1024 REGEX_ASSERT(m
.regionStart() == 0);
1025 REGEX_ASSERT(m
.regionEnd() == testString
.length());
1026 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1027 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1029 m
.region(2,4, status
);
1031 REGEX_ASSERT(m
.matches(status
));
1032 REGEX_ASSERT(m
.start(status
)==2);
1033 REGEX_ASSERT(m
.end(status
)==4);
1037 REGEX_ASSERT(m
.regionStart() == 0);
1038 REGEX_ASSERT(m
.regionEnd() == testString
.length());
1040 UnicodeString
shorterString("short");
1041 m
.reset(shorterString
);
1042 REGEX_ASSERT(m
.regionStart() == 0);
1043 REGEX_ASSERT(m
.regionEnd() == shorterString
.length());
1045 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1046 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(FALSE
));
1047 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
1048 REGEX_ASSERT(&m
== &m
.reset());
1049 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
1051 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(TRUE
));
1052 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1053 REGEX_ASSERT(&m
== &m
.reset());
1054 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
1056 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1057 REGEX_ASSERT(&m
== &m
.useTransparentBounds(TRUE
));
1058 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
1059 REGEX_ASSERT(&m
== &m
.reset());
1060 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
1062 REGEX_ASSERT(&m
== &m
.useTransparentBounds(FALSE
));
1063 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1064 REGEX_ASSERT(&m
== &m
.reset());
1065 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
1070 // hitEnd() and requireEnd()
1073 UErrorCode status
= U_ZERO_ERROR
;
1074 UnicodeString
testString("aabb");
1075 RegexMatcher
m1(".*", testString
, 0, status
);
1076 REGEX_ASSERT(m1
.lookingAt(status
) == TRUE
);
1077 REGEX_ASSERT(m1
.hitEnd() == TRUE
);
1078 REGEX_ASSERT(m1
.requireEnd() == FALSE
);
1081 status
= U_ZERO_ERROR
;
1082 RegexMatcher
m2("a*", testString
, 0, status
);
1083 REGEX_ASSERT(m2
.lookingAt(status
) == TRUE
);
1084 REGEX_ASSERT(m2
.hitEnd() == FALSE
);
1085 REGEX_ASSERT(m2
.requireEnd() == FALSE
);
1088 status
= U_ZERO_ERROR
;
1089 RegexMatcher
m3(".*$", testString
, 0, status
);
1090 REGEX_ASSERT(m3
.lookingAt(status
) == TRUE
);
1091 REGEX_ASSERT(m3
.hitEnd() == TRUE
);
1092 REGEX_ASSERT(m3
.requireEnd() == TRUE
);
1098 // Compilation error on reset with UChar *
1099 // These were a hazard that people were stumbling over with runtime errors.
1100 // Changed them to compiler errors by adding private methods that more closely
1101 // matched the incorrect use of the functions.
1105 UErrorCode status
= U_ZERO_ERROR
;
1106 UChar ucharString
[20];
1107 RegexMatcher
m(".", 0, status
);
1108 m
.reset(ucharString
); // should not compile.
1110 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
1111 RegexMatcher
*m2
= p
->matcher(ucharString
, status
); // should not compile.
1113 RegexMatcher
m3(".", ucharString
, 0, status
); // Should not compile
1119 // Note: These tests will need to be changed when the regexp engine is
1120 // able to detect and cut short the exponential time behavior on
1121 // this type of match.
1124 UErrorCode status
= U_ZERO_ERROR
;
1125 // Enough 'a's in the string to cause the match to time out.
1126 // (Each additional 'a' doubles the time)
1127 UnicodeString
testString("aaaaaaaaaaaaaaaaaaaaa");
1128 RegexMatcher
matcher("(a+)+b", testString
, 0, status
);
1130 REGEX_ASSERT(matcher
.getTimeLimit() == 0);
1131 matcher
.setTimeLimit(100, status
);
1132 REGEX_ASSERT(matcher
.getTimeLimit() == 100);
1133 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1134 REGEX_ASSERT(status
== U_REGEX_TIME_OUT
);
1137 UErrorCode status
= U_ZERO_ERROR
;
1138 // Few enough 'a's to slip in under the time limit.
1139 UnicodeString
testString("aaaaaaaaaaaaaaaaaa");
1140 RegexMatcher
matcher("(a+)+b", testString
, 0, status
);
1142 matcher
.setTimeLimit(100, status
);
1143 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1151 UErrorCode status
= U_ZERO_ERROR
;
1152 UnicodeString
testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
1154 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1155 // of the '+', and makes the stack frames larger.
1156 RegexMatcher
matcher("(A)+A$", testString
, 0, status
);
1158 // With the default stack, this match should fail to run
1159 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1160 REGEX_ASSERT(status
== U_REGEX_STACK_OVERFLOW
);
1162 // With unlimited stack, it should run
1163 status
= U_ZERO_ERROR
;
1164 matcher
.setStackLimit(0, status
);
1166 REGEX_ASSERT(matcher
.lookingAt(status
) == TRUE
);
1168 REGEX_ASSERT(matcher
.getStackLimit() == 0);
1170 // With a limited stack, the match should fail
1171 status
= U_ZERO_ERROR
;
1172 matcher
.setStackLimit(10000, status
);
1173 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
1174 REGEX_ASSERT(status
== U_REGEX_STACK_OVERFLOW
);
1175 REGEX_ASSERT(matcher
.getStackLimit() == 10000);
1178 // A pattern that doesn't save state should work with
1179 // a minimal sized stack
1181 UErrorCode status
= U_ZERO_ERROR
;
1182 UnicodeString testString
= "abc";
1183 RegexMatcher
matcher("abc", testString
, 0, status
);
1185 matcher
.setStackLimit(30, status
);
1187 REGEX_ASSERT(matcher
.matches(status
) == TRUE
);
1189 REGEX_ASSERT(matcher
.getStackLimit() == 30);
1191 // Negative stack sizes should fail
1192 status
= U_ZERO_ERROR
;
1193 matcher
.setStackLimit(1000, status
);
1195 matcher
.setStackLimit(-1, status
);
1196 REGEX_ASSERT(status
== U_ILLEGAL_ARGUMENT_ERROR
);
1197 REGEX_ASSERT(matcher
.getStackLimit() == 1000);
1208 //---------------------------------------------------------------------------
1210 // API_Replace API test for class RegexMatcher, testing the
1211 // Replace family of functions.
1213 //---------------------------------------------------------------------------
1214 void RegexTest::API_Replace() {
1220 UErrorCode status
=U_ZERO_ERROR
;
1222 UnicodeString
re("abc");
1223 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
1225 UnicodeString data
= ".abc..abc...abc..";
1226 // 012345678901234567
1227 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
1230 // Plain vanilla matches.
1233 dest
= matcher
->replaceFirst("yz", status
);
1235 REGEX_ASSERT(dest
== ".yz..abc...abc..");
1237 dest
= matcher
->replaceAll("yz", status
);
1239 REGEX_ASSERT(dest
== ".yz..yz...yz..");
1242 // Plain vanilla non-matches.
1244 UnicodeString d2
= ".abx..abx...abx..";
1246 dest
= matcher
->replaceFirst("yz", status
);
1248 REGEX_ASSERT(dest
== ".abx..abx...abx..");
1250 dest
= matcher
->replaceAll("yz", status
);
1252 REGEX_ASSERT(dest
== ".abx..abx...abx..");
1255 // Empty source string
1257 UnicodeString d3
= "";
1259 dest
= matcher
->replaceFirst("yz", status
);
1261 REGEX_ASSERT(dest
== "");
1263 dest
= matcher
->replaceAll("yz", status
);
1265 REGEX_ASSERT(dest
== "");
1268 // Empty substitution string
1270 matcher
->reset(data
); // ".abc..abc...abc.."
1271 dest
= matcher
->replaceFirst("", status
);
1273 REGEX_ASSERT(dest
== "...abc...abc..");
1275 dest
= matcher
->replaceAll("", status
);
1277 REGEX_ASSERT(dest
== "........");
1280 // match whole string
1282 UnicodeString d4
= "abc";
1284 dest
= matcher
->replaceFirst("xyz", status
);
1286 REGEX_ASSERT(dest
== "xyz");
1288 dest
= matcher
->replaceAll("xyz", status
);
1290 REGEX_ASSERT(dest
== "xyz");
1293 // Capture Group, simple case
1295 UnicodeString
re2("a(..)");
1296 RegexPattern
*pat2
= RegexPattern::compile(re2
, flags
, pe
, status
);
1298 UnicodeString d5
= "abcdefg";
1299 RegexMatcher
*matcher2
= pat2
->matcher(d5
, status
);
1301 dest
= matcher2
->replaceFirst("$1$1", status
);
1303 REGEX_ASSERT(dest
== "bcbcdefg");
1305 dest
= matcher2
->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status
);
1307 REGEX_ASSERT(dest
== "The value of $1 is bc.defg");
1309 dest
= matcher2
->replaceFirst("$ by itself, no group number $$$", status
);
1311 REGEX_ASSERT(dest
== "$ by itself, no group number $$$defg");
1313 UnicodeString replacement
= UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1314 replacement
= replacement
.unescape();
1315 dest
= matcher2
->replaceFirst(replacement
, status
);
1317 REGEX_ASSERT(dest
== "Supplemental Digit 1 bc.defg");
1319 REGEX_ASSERT_FAIL(matcher2
->replaceFirst("bad capture group number $5...",status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1323 // Replacement String with \u hex escapes
1326 UnicodeString src
= "abc 1 abc 2 abc 3";
1327 UnicodeString substitute
= UNICODE_STRING_SIMPLE("--\\u0043--");
1328 matcher
->reset(src
);
1329 UnicodeString result
= matcher
->replaceAll(substitute
, status
);
1331 REGEX_ASSERT(result
== "--C-- 1 --C-- 2 --C-- 3");
1334 UnicodeString src
= "abc !";
1335 UnicodeString substitute
= UNICODE_STRING_SIMPLE("--\\U00010000--");
1336 matcher
->reset(src
);
1337 UnicodeString result
= matcher
->replaceAll(substitute
, status
);
1339 UnicodeString expected
= UnicodeString("--");
1340 expected
.append((UChar32
)0x10000);
1341 expected
.append("-- !");
1342 REGEX_ASSERT(result
== expected
);
1344 // TODO: need more thorough testing of capture substitutions.
1349 status
= U_ZERO_ERROR
;
1350 UnicodeString s
= "The matches start with ss and end with ee ss stuff ee fin";
1351 RegexMatcher
m("ss(.*?)ee", 0, status
);
1353 UnicodeString result
;
1355 // Multiple finds do NOT bump up the previous appendReplacement position.
1359 m
.appendReplacement(result
, "ooh", status
);
1361 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
1363 // After a reset into the interior of a string, appendReplacement still starts at beginning.
1364 status
= U_ZERO_ERROR
;
1366 m
.reset(10, status
);
1369 m
.appendReplacement(result
, "ooh", status
);
1371 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
1373 // find() at interior of string, appendReplacement still starts at beginning.
1374 status
= U_ZERO_ERROR
;
1379 m
.appendReplacement(result
, "ooh", status
);
1381 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
1383 m
.appendTail(result
);
1384 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh fin");
1395 //---------------------------------------------------------------------------
1397 // API_Pattern Test that the API for class RegexPattern is
1398 // present and nominally working.
1400 //---------------------------------------------------------------------------
1401 void RegexTest::API_Pattern() {
1402 RegexPattern pata
; // Test default constructor to not crash.
1405 REGEX_ASSERT(pata
== patb
);
1406 REGEX_ASSERT(pata
== pata
);
1408 UnicodeString
re1("abc[a-l][m-z]");
1409 UnicodeString
re2("def");
1410 UErrorCode status
= U_ZERO_ERROR
;
1413 RegexPattern
*pat1
= RegexPattern::compile(re1
, 0, pe
, status
);
1414 RegexPattern
*pat2
= RegexPattern::compile(re2
, 0, pe
, status
);
1416 REGEX_ASSERT(*pat1
== *pat1
);
1417 REGEX_ASSERT(*pat1
!= pata
);
1421 REGEX_ASSERT(patb
== *pat1
);
1424 RegexPattern
patc(*pat1
);
1425 REGEX_ASSERT(patc
== *pat1
);
1426 REGEX_ASSERT(patb
== patc
);
1427 REGEX_ASSERT(pat1
!= pat2
);
1429 REGEX_ASSERT(patb
!= patc
);
1430 REGEX_ASSERT(patb
== *pat2
);
1432 // Compile with no flags.
1433 RegexPattern
*pat1a
= RegexPattern::compile(re1
, pe
, status
);
1434 REGEX_ASSERT(*pat1a
== *pat1
);
1436 REGEX_ASSERT(pat1a
->flags() == 0);
1438 // Compile with different flags should be not equal
1439 RegexPattern
*pat1b
= RegexPattern::compile(re1
, UREGEX_CASE_INSENSITIVE
, pe
, status
);
1442 REGEX_ASSERT(*pat1b
!= *pat1a
);
1443 REGEX_ASSERT(pat1b
->flags() == UREGEX_CASE_INSENSITIVE
);
1444 REGEX_ASSERT(pat1a
->flags() == 0);
1448 RegexPattern
*pat1c
= pat1
->clone();
1449 REGEX_ASSERT(*pat1c
== *pat1
);
1450 REGEX_ASSERT(*pat1c
!= *pat2
);
1459 // Verify that a matcher created from a cloned pattern works.
1463 UErrorCode status
= U_ZERO_ERROR
;
1464 RegexPattern
*pSource
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status
);
1465 RegexPattern
*pClone
= pSource
->clone();
1467 RegexMatcher
*mFromClone
= pClone
->matcher(status
);
1469 UnicodeString s
= "Hello World";
1470 mFromClone
->reset(s
);
1471 REGEX_ASSERT(mFromClone
->find() == TRUE
);
1472 REGEX_ASSERT(mFromClone
->group(status
) == "Hello");
1473 REGEX_ASSERT(mFromClone
->find() == TRUE
);
1474 REGEX_ASSERT(mFromClone
->group(status
) == "World");
1475 REGEX_ASSERT(mFromClone
->find() == FALSE
);
1481 // matches convenience API
1483 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe
, status
) == TRUE
);
1485 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe
, status
) == FALSE
);
1487 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe
, status
) == TRUE
);
1489 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe
, status
) == TRUE
);
1491 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe
, status
) == FALSE
);
1493 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1494 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe
, status
) == FALSE
);
1495 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1501 status
= U_ZERO_ERROR
;
1502 pat1
= RegexPattern::compile(" +", pe
, status
);
1504 UnicodeString fields
[10];
1507 n
= pat1
->split("Now is the time", fields
, 10, status
);
1510 REGEX_ASSERT(fields
[0]=="Now");
1511 REGEX_ASSERT(fields
[1]=="is");
1512 REGEX_ASSERT(fields
[2]=="the");
1513 REGEX_ASSERT(fields
[3]=="time");
1514 REGEX_ASSERT(fields
[4]=="");
1516 n
= pat1
->split("Now is the time", fields
, 2, status
);
1519 REGEX_ASSERT(fields
[0]=="Now");
1520 REGEX_ASSERT(fields
[1]=="is the time");
1521 REGEX_ASSERT(fields
[2]=="the"); // left over from previous test
1524 status
= U_ZERO_ERROR
;
1525 n
= pat1
->split("Now is the time", fields
, 1, status
);
1528 REGEX_ASSERT(fields
[0]=="Now is the time");
1529 REGEX_ASSERT(fields
[1]=="*");
1530 status
= U_ZERO_ERROR
;
1532 n
= pat1
->split(" Now is the time ", fields
, 10, status
);
1535 REGEX_ASSERT(fields
[0]=="");
1536 REGEX_ASSERT(fields
[1]=="Now");
1537 REGEX_ASSERT(fields
[2]=="is");
1538 REGEX_ASSERT(fields
[3]=="the");
1539 REGEX_ASSERT(fields
[4]=="time");
1540 REGEX_ASSERT(fields
[5]=="");
1542 n
= pat1
->split(" ", fields
, 10, status
);
1545 REGEX_ASSERT(fields
[0]=="");
1548 n
= pat1
->split("", fields
, 10, status
);
1551 REGEX_ASSERT(fields
[0]=="foo");
1555 // split, with a pattern with (capture)
1556 pat1
= RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe
, status
);
1559 status
= U_ZERO_ERROR
;
1560 n
= pat1
->split("<a>Now is <b>the time<c>", fields
, 10, status
);
1563 REGEX_ASSERT(fields
[0]=="");
1564 REGEX_ASSERT(fields
[1]=="a");
1565 REGEX_ASSERT(fields
[2]=="Now is ");
1566 REGEX_ASSERT(fields
[3]=="b");
1567 REGEX_ASSERT(fields
[4]=="the time");
1568 REGEX_ASSERT(fields
[5]=="c");
1569 REGEX_ASSERT(fields
[6]=="");
1570 REGEX_ASSERT(status
==U_ZERO_ERROR
);
1572 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 10, status
);
1575 REGEX_ASSERT(fields
[0]==" ");
1576 REGEX_ASSERT(fields
[1]=="a");
1577 REGEX_ASSERT(fields
[2]=="Now is ");
1578 REGEX_ASSERT(fields
[3]=="b");
1579 REGEX_ASSERT(fields
[4]=="the time");
1580 REGEX_ASSERT(fields
[5]=="c");
1581 REGEX_ASSERT(fields
[6]=="");
1583 status
= U_ZERO_ERROR
;
1585 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 6, status
);
1588 REGEX_ASSERT(fields
[0]==" ");
1589 REGEX_ASSERT(fields
[1]=="a");
1590 REGEX_ASSERT(fields
[2]=="Now is ");
1591 REGEX_ASSERT(fields
[3]=="b");
1592 REGEX_ASSERT(fields
[4]=="the time");
1593 REGEX_ASSERT(fields
[5]=="c");
1594 REGEX_ASSERT(fields
[6]=="foo");
1596 status
= U_ZERO_ERROR
;
1598 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 5, status
);
1601 REGEX_ASSERT(fields
[0]==" ");
1602 REGEX_ASSERT(fields
[1]=="a");
1603 REGEX_ASSERT(fields
[2]=="Now is ");
1604 REGEX_ASSERT(fields
[3]=="b");
1605 REGEX_ASSERT(fields
[4]=="the time<c>");
1606 REGEX_ASSERT(fields
[5]=="foo");
1608 status
= U_ZERO_ERROR
;
1610 n
= pat1
->split(" <a>Now is <b>the time", fields
, 5, status
);
1613 REGEX_ASSERT(fields
[0]==" ");
1614 REGEX_ASSERT(fields
[1]=="a");
1615 REGEX_ASSERT(fields
[2]=="Now is ");
1616 REGEX_ASSERT(fields
[3]=="b");
1617 REGEX_ASSERT(fields
[4]=="the time");
1618 REGEX_ASSERT(fields
[5]=="foo");
1620 status
= U_ZERO_ERROR
;
1621 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 4, status
);
1624 REGEX_ASSERT(fields
[0]==" ");
1625 REGEX_ASSERT(fields
[1]=="a");
1626 REGEX_ASSERT(fields
[2]=="Now is ");
1627 REGEX_ASSERT(fields
[3]=="the time<c>");
1628 status
= U_ZERO_ERROR
;
1631 pat1
= RegexPattern::compile("([-,])", pe
, status
);
1633 n
= pat1
->split("1-10,20", fields
, 10, status
);
1636 REGEX_ASSERT(fields
[0]=="1");
1637 REGEX_ASSERT(fields
[1]=="-");
1638 REGEX_ASSERT(fields
[2]=="10");
1639 REGEX_ASSERT(fields
[3]==",");
1640 REGEX_ASSERT(fields
[4]=="20");
1645 // RegexPattern::pattern()
1647 pat1
= new RegexPattern();
1648 REGEX_ASSERT(pat1
->pattern() == "");
1651 pat1
= RegexPattern::compile("(Hello, world)*", pe
, status
);
1653 REGEX_ASSERT(pat1
->pattern() == "(Hello, world)*");
1658 // classID functions
1660 pat1
= RegexPattern::compile("(Hello, world)*", pe
, status
);
1662 REGEX_ASSERT(pat1
->getDynamicClassID() == RegexPattern::getStaticClassID());
1663 REGEX_ASSERT(pat1
->getDynamicClassID() != NULL
);
1664 UnicodeString
Hello("Hello, world.");
1665 RegexMatcher
*m
= pat1
->matcher(Hello
, status
);
1666 REGEX_ASSERT(pat1
->getDynamicClassID() != m
->getDynamicClassID());
1667 REGEX_ASSERT(m
->getDynamicClassID() == RegexMatcher::getStaticClassID());
1668 REGEX_ASSERT(m
->getDynamicClassID() != NULL
);
1674 //---------------------------------------------------------------------------
1676 // API_Match_UTF8 Test that the alternate engine for class RegexMatcher
1677 // is present and working, but excluding functions
1678 // implementing replace operations.
1680 //---------------------------------------------------------------------------
1681 void RegexTest::API_Match_UTF8() {
1683 UErrorCode status
=U_ZERO_ERROR
;
1687 // Debug - slide failing test cases early
1696 // Simple pattern compilation
1699 UText re
= UTEXT_INITIALIZER
;
1700 regextst_openUTF8FromInvariant(&re
, "abc", -1, &status
);
1702 pat2
= RegexPattern::compile(&re
, flags
, pe
, status
);
1705 UText input1
= UTEXT_INITIALIZER
;
1706 UText input2
= UTEXT_INITIALIZER
;
1707 UText empty
= UTEXT_INITIALIZER
;
1708 regextst_openUTF8FromInvariant(&input1
, "abcdef this is a test", -1, &status
);
1709 REGEX_VERBOSE_TEXT(&input1
);
1710 regextst_openUTF8FromInvariant(&input2
, "not abc", -1, &status
);
1711 REGEX_VERBOSE_TEXT(&input2
);
1712 utext_openUChars(&empty
, NULL
, 0, &status
);
1714 int32_t input1Len
= strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1715 int32_t input2Len
= strlen("not abc");
1719 // Matcher creation and reset.
1721 RegexMatcher
*m1
= pat2
->matcher(&input1
, RegexPattern::PATTERN_IS_UTEXT
, status
);
1723 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
1724 const char str_abcdefthisisatest
[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1725 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest
, m1
->inputText());
1727 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
1728 const char str_notabc
[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1729 REGEX_ASSERT_UTEXT_UTF8(str_notabc
, m1
->inputText());
1731 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest
, m1
->inputText());
1732 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
1734 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
1735 REGEX_ASSERT(utext_nativeLength(&empty
) == 0);
1738 // reset(pos, status)
1741 m1
->reset(4, status
);
1743 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest
, m1
->inputText());
1744 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
1746 m1
->reset(-1, status
);
1747 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1748 status
= U_ZERO_ERROR
;
1750 m1
->reset(0, status
);
1752 status
= U_ZERO_ERROR
;
1754 m1
->reset(input1Len
-1, status
);
1756 status
= U_ZERO_ERROR
;
1758 m1
->reset(input1Len
, status
);
1760 status
= U_ZERO_ERROR
;
1762 m1
->reset(input1Len
+1, status
);
1763 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1764 status
= U_ZERO_ERROR
;
1767 // match(pos, status)
1770 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
1772 REGEX_ASSERT(m1
->matches(3, status
) == FALSE
);
1774 REGEX_ASSERT(m1
->matches(5, status
) == FALSE
);
1775 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
1776 REGEX_ASSERT(m1
->matches(-1, status
) == FALSE
);
1777 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1779 // Match() at end of string should fail, but should not be an error.
1781 status
= U_ZERO_ERROR
;
1782 REGEX_ASSERT(m1
->matches(input2Len
, status
) == FALSE
);
1785 // Match beyond end of string should fail with an error.
1786 status
= U_ZERO_ERROR
;
1787 REGEX_ASSERT(m1
->matches(input2Len
+1, status
) == FALSE
);
1788 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1790 // Successful match at end of string.
1792 status
= U_ZERO_ERROR
;
1793 RegexMatcher
m("A?", 0, status
); // will match zero length string.
1796 REGEX_ASSERT(m
.matches(input1Len
, status
) == TRUE
);
1799 REGEX_ASSERT(m
.matches(0, status
) == TRUE
);
1805 // lookingAt(pos, status)
1807 status
= U_ZERO_ERROR
;
1808 m1
->reset(&input2
); // "not abc"
1809 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
1810 REGEX_ASSERT(m1
->lookingAt(5, status
) == FALSE
);
1811 REGEX_ASSERT(m1
->lookingAt(3, status
) == FALSE
);
1812 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
1813 REGEX_ASSERT(m1
->lookingAt(-1, status
) == FALSE
);
1814 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1815 status
= U_ZERO_ERROR
;
1816 REGEX_ASSERT(m1
->lookingAt(input2Len
, status
) == FALSE
);
1818 REGEX_ASSERT(m1
->lookingAt(input2Len
+1, status
) == FALSE
);
1819 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
1825 utext_close(&input1
);
1826 utext_close(&input2
);
1827 utext_close(&empty
);
1833 // RegexMatcher::start();
1834 // RegexMatcher::end();
1835 // RegexMatcher::groupCount();
1840 UErrorCode status
=U_ZERO_ERROR
;
1841 UText re
=UTEXT_INITIALIZER
;
1842 const char str_01234567_pat
[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1843 utext_openUTF8(&re
, str_01234567_pat
, -1, &status
);
1845 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
1848 UText input
= UTEXT_INITIALIZER
;
1849 const char str_0123456789
[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1850 utext_openUTF8(&input
, str_0123456789
, -1, &status
);
1852 RegexMatcher
*matcher
= pat
->matcher(&input
, RegexPattern::PATTERN_IS_UTEXT
, status
);
1854 REGEX_ASSERT(matcher
->lookingAt(status
) == TRUE
);
1855 static const int32_t matchStarts
[] = {0, 2, 4, 8};
1856 static const int32_t matchEnds
[] = {10, 8, 6, 10};
1858 for (i
=0; i
<4; i
++) {
1859 int32_t actualStart
= matcher
->start(i
, status
);
1861 if (actualStart
!= matchStarts
[i
]) {
1862 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n",
1863 __FILE__
, __LINE__
, i
, matchStarts
[i
], actualStart
);
1865 int32_t actualEnd
= matcher
->end(i
, status
);
1867 if (actualEnd
!= matchEnds
[i
]) {
1868 errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n",
1869 __FILE__
, __LINE__
, i
, matchEnds
[i
], actualEnd
);
1873 REGEX_ASSERT(matcher
->start(0, status
) == matcher
->start(status
));
1874 REGEX_ASSERT(matcher
->end(0, status
) == matcher
->end(status
));
1876 REGEX_ASSERT_FAIL(matcher
->start(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1877 REGEX_ASSERT_FAIL(matcher
->start( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1879 REGEX_ASSERT_FAIL(matcher
->start( 0, status
), U_REGEX_INVALID_STATE
);
1881 matcher
->lookingAt(status
);
1884 UText destText
= UTEXT_INITIALIZER
;
1885 utext_openUnicodeString(&destText
, &dest
, &status
);
1887 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1888 // Test shallow-clone API
1890 result
= matcher
->group((UText
*)NULL
, group_len
, status
);
1892 REGEX_ASSERT_UTEXT_UTF8(str_0123456789
, result
);
1893 utext_close(result
);
1894 result
= matcher
->group(0, &destText
, group_len
, status
);
1896 REGEX_ASSERT(result
== &destText
);
1897 REGEX_ASSERT_UTEXT_UTF8(str_0123456789
, result
);
1898 // destText is now immutable, reopen it
1899 utext_close(&destText
);
1900 utext_openUnicodeString(&destText
, &dest
, &status
);
1902 result
= matcher
->group(0, NULL
, status
);
1904 REGEX_ASSERT_UTEXT_UTF8(str_0123456789
, result
);
1905 utext_close(result
);
1906 result
= matcher
->group(0, &destText
, status
);
1908 REGEX_ASSERT(result
== &destText
);
1909 REGEX_ASSERT_UTEXT_UTF8(str_0123456789
, result
);
1911 result
= matcher
->group(1, NULL
, status
);
1913 const char str_234567
[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */
1914 REGEX_ASSERT_UTEXT_UTF8(str_234567
, result
);
1915 utext_close(result
);
1916 result
= matcher
->group(1, &destText
, status
);
1918 REGEX_ASSERT(result
== &destText
);
1919 REGEX_ASSERT_UTEXT_UTF8(str_234567
, result
);
1921 result
= matcher
->group(2, NULL
, status
);
1923 const char str_45
[] = { 0x34, 0x35, 0x00 }; /* 45 */
1924 REGEX_ASSERT_UTEXT_UTF8(str_45
, result
);
1925 utext_close(result
);
1926 result
= matcher
->group(2, &destText
, status
);
1928 REGEX_ASSERT(result
== &destText
);
1929 REGEX_ASSERT_UTEXT_UTF8(str_45
, result
);
1931 result
= matcher
->group(3, NULL
, status
);
1933 const char str_89
[] = { 0x38, 0x39, 0x00 }; /* 89 */
1934 REGEX_ASSERT_UTEXT_UTF8(str_89
, result
);
1935 utext_close(result
);
1936 result
= matcher
->group(3, &destText
, status
);
1938 REGEX_ASSERT(result
== &destText
);
1939 REGEX_ASSERT_UTEXT_UTF8(str_89
, result
);
1941 REGEX_ASSERT_FAIL(matcher
->group(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1942 REGEX_ASSERT_FAIL(matcher
->group( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1944 REGEX_ASSERT_FAIL(matcher
->group( 0, status
), U_REGEX_INVALID_STATE
);
1949 utext_close(&destText
);
1950 utext_close(&input
);
1960 UErrorCode status
=U_ZERO_ERROR
;
1961 UText re
=UTEXT_INITIALIZER
;
1962 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
1963 utext_openUTF8(&re
, str_abc
, -1, &status
);
1965 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
1967 UText input
= UTEXT_INITIALIZER
;
1968 const char str_abcabcabc
[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
1969 utext_openUTF8(&input
, str_abcabcabc
, -1, &status
);
1970 // 012345678901234567
1972 RegexMatcher
*matcher
= pat
->matcher(&input
, RegexPattern::PATTERN_IS_UTEXT
, status
);
1974 REGEX_ASSERT(matcher
->find());
1975 REGEX_ASSERT(matcher
->start(status
) == 1);
1976 REGEX_ASSERT(matcher
->find());
1977 REGEX_ASSERT(matcher
->start(status
) == 6);
1978 REGEX_ASSERT(matcher
->find());
1979 REGEX_ASSERT(matcher
->start(status
) == 12);
1980 REGEX_ASSERT(matcher
->find() == FALSE
);
1981 REGEX_ASSERT(matcher
->find() == FALSE
);
1984 REGEX_ASSERT(matcher
->find());
1985 REGEX_ASSERT(matcher
->start(status
) == 1);
1987 REGEX_ASSERT(matcher
->find(0, status
));
1988 REGEX_ASSERT(matcher
->start(status
) == 1);
1989 REGEX_ASSERT(matcher
->find(1, status
));
1990 REGEX_ASSERT(matcher
->start(status
) == 1);
1991 REGEX_ASSERT(matcher
->find(2, status
));
1992 REGEX_ASSERT(matcher
->start(status
) == 6);
1993 REGEX_ASSERT(matcher
->find(12, status
));
1994 REGEX_ASSERT(matcher
->start(status
) == 12);
1995 REGEX_ASSERT(matcher
->find(13, status
) == FALSE
);
1996 REGEX_ASSERT(matcher
->find(16, status
) == FALSE
);
1997 REGEX_ASSERT(matcher
->find(17, status
) == FALSE
);
1998 REGEX_ASSERT_FAIL(matcher
->start(status
), U_REGEX_INVALID_STATE
);
2000 status
= U_ZERO_ERROR
;
2001 REGEX_ASSERT_FAIL(matcher
->find(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2002 status
= U_ZERO_ERROR
;
2003 REGEX_ASSERT_FAIL(matcher
->find(18, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
2005 REGEX_ASSERT(matcher
->groupCount() == 0);
2010 utext_close(&input
);
2016 // find, with \G in pattern (true if at the end of a previous match).
2021 UErrorCode status
=U_ZERO_ERROR
;
2022 UText re
=UTEXT_INITIALIZER
;
2023 const char str_Gabcabc
[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2024 utext_openUTF8(&re
, str_Gabcabc
, -1, &status
);
2026 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
2029 UText input
= UTEXT_INITIALIZER
;
2030 const char str_abcabcabc
[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2031 utext_openUTF8(&input
, str_abcabcabc
, -1, &status
);
2032 // 012345678901234567
2034 RegexMatcher
*matcher
= pat
->matcher(&input
, RegexPattern::PATTERN_IS_UTEXT
, status
);
2036 REGEX_ASSERT(matcher
->find());
2037 REGEX_ASSERT(matcher
->start(status
) == 0);
2038 REGEX_ASSERT(matcher
->start(1, status
) == -1);
2039 REGEX_ASSERT(matcher
->start(2, status
) == 1);
2041 REGEX_ASSERT(matcher
->find());
2042 REGEX_ASSERT(matcher
->start(status
) == 4);
2043 REGEX_ASSERT(matcher
->start(1, status
) == 4);
2044 REGEX_ASSERT(matcher
->start(2, status
) == -1);
2050 utext_close(&input
);
2055 // find with zero length matches, match position should bump ahead
2056 // to prevent loops.
2060 UErrorCode status
=U_ZERO_ERROR
;
2061 RegexMatcher
m("(?= ?)", 0, status
); // This pattern will zero-length matches anywhere,
2062 // using an always-true look-ahead.
2064 UText s
= UTEXT_INITIALIZER
;
2065 utext_openUTF8(&s
, " ", -1, &status
);
2068 if (m
.find() == FALSE
) {
2071 REGEX_ASSERT(m
.start(status
) == i
);
2072 REGEX_ASSERT(m
.end(status
) == i
);
2076 // Check that the bump goes over characters outside the BMP OK
2077 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2078 unsigned char aboveBMP
[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2079 utext_openUTF8(&s
, (char *)aboveBMP
, -1, &status
);
2082 if (m
.find() == FALSE
) {
2085 REGEX_ASSERT(m
.start(status
) == i
);
2086 REGEX_ASSERT(m
.end(status
) == i
);
2088 REGEX_ASSERT(i
==20);
2093 // find() loop breaking test.
2094 // with pattern of /.?/, should see a series of one char matches, then a single
2095 // match of zero length at the end of the input string.
2097 UErrorCode status
=U_ZERO_ERROR
;
2098 RegexMatcher
m(".?", 0, status
);
2100 UText s
= UTEXT_INITIALIZER
;
2101 utext_openUTF8(&s
, " ", -1, &status
);
2104 if (m
.find() == FALSE
) {
2107 REGEX_ASSERT(m
.start(status
) == i
);
2108 REGEX_ASSERT(m
.end(status
) == (i
<4 ? i
+1 : i
));
2117 // Matchers with no input string behave as if they had an empty input string.
2121 UErrorCode status
= U_ZERO_ERROR
;
2122 RegexMatcher
m(".?", 0, status
);
2124 REGEX_ASSERT(m
.find());
2125 REGEX_ASSERT(m
.start(status
) == 0);
2126 REGEX_ASSERT(m
.input() == "");
2129 UErrorCode status
= U_ZERO_ERROR
;
2130 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
2131 RegexMatcher
*m
= p
->matcher(status
);
2134 REGEX_ASSERT(m
->find() == FALSE
);
2135 REGEX_ASSERT(utext_nativeLength(m
->inputText()) == 0);
2144 UErrorCode status
= U_ZERO_ERROR
;
2145 UText testPattern
= UTEXT_INITIALIZER
;
2146 UText testText
= UTEXT_INITIALIZER
;
2147 regextst_openUTF8FromInvariant(&testPattern
, ".*", -1, &status
);
2148 REGEX_VERBOSE_TEXT(&testPattern
);
2149 regextst_openUTF8FromInvariant(&testText
, "This is test data", -1, &status
);
2150 REGEX_VERBOSE_TEXT(&testText
);
2152 RegexMatcher
m(&testPattern
, &testText
, 0, status
);
2154 REGEX_ASSERT(m
.regionStart() == 0);
2155 REGEX_ASSERT(m
.regionEnd() == (int32_t)strlen("This is test data"));
2156 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2157 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2159 m
.region(2,4, status
);
2161 REGEX_ASSERT(m
.matches(status
));
2162 REGEX_ASSERT(m
.start(status
)==2);
2163 REGEX_ASSERT(m
.end(status
)==4);
2167 REGEX_ASSERT(m
.regionStart() == 0);
2168 REGEX_ASSERT(m
.regionEnd() == (int32_t)strlen("This is test data"));
2170 regextst_openUTF8FromInvariant(&testText
, "short", -1, &status
);
2171 REGEX_VERBOSE_TEXT(&testText
);
2173 REGEX_ASSERT(m
.regionStart() == 0);
2174 REGEX_ASSERT(m
.regionEnd() == (int32_t)strlen("short"));
2176 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2177 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(FALSE
));
2178 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
2179 REGEX_ASSERT(&m
== &m
.reset());
2180 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
2182 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(TRUE
));
2183 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2184 REGEX_ASSERT(&m
== &m
.reset());
2185 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
2187 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2188 REGEX_ASSERT(&m
== &m
.useTransparentBounds(TRUE
));
2189 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
2190 REGEX_ASSERT(&m
== &m
.reset());
2191 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
2193 REGEX_ASSERT(&m
== &m
.useTransparentBounds(FALSE
));
2194 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2195 REGEX_ASSERT(&m
== &m
.reset());
2196 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
2198 utext_close(&testText
);
2199 utext_close(&testPattern
);
2203 // hitEnd() and requireEnd()
2206 UErrorCode status
= U_ZERO_ERROR
;
2207 UText testPattern
= UTEXT_INITIALIZER
;
2208 UText testText
= UTEXT_INITIALIZER
;
2209 const char str_
[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2210 const char str_aabb
[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2211 utext_openUTF8(&testPattern
, str_
, -1, &status
);
2212 utext_openUTF8(&testText
, str_aabb
, -1, &status
);
2214 RegexMatcher
m1(&testPattern
, &testText
, 0, status
);
2215 REGEX_ASSERT(m1
.lookingAt(status
) == TRUE
);
2216 REGEX_ASSERT(m1
.hitEnd() == TRUE
);
2217 REGEX_ASSERT(m1
.requireEnd() == FALSE
);
2220 status
= U_ZERO_ERROR
;
2221 const char str_a
[] = { 0x61, 0x2a, 0x00 }; /* a* */
2222 utext_openUTF8(&testPattern
, str_a
, -1, &status
);
2223 RegexMatcher
m2(&testPattern
, &testText
, 0, status
);
2224 REGEX_ASSERT(m2
.lookingAt(status
) == TRUE
);
2225 REGEX_ASSERT(m2
.hitEnd() == FALSE
);
2226 REGEX_ASSERT(m2
.requireEnd() == FALSE
);
2229 status
= U_ZERO_ERROR
;
2230 const char str_dotstardollar
[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2231 utext_openUTF8(&testPattern
, str_dotstardollar
, -1, &status
);
2232 RegexMatcher
m3(&testPattern
, &testText
, 0, status
);
2233 REGEX_ASSERT(m3
.lookingAt(status
) == TRUE
);
2234 REGEX_ASSERT(m3
.hitEnd() == TRUE
);
2235 REGEX_ASSERT(m3
.requireEnd() == TRUE
);
2238 utext_close(&testText
);
2239 utext_close(&testPattern
);
2244 //---------------------------------------------------------------------------
2246 // API_Replace_UTF8 API test for class RegexMatcher, testing the
2247 // Replace family of functions.
2249 //---------------------------------------------------------------------------
// API_Replace_UTF8: drives RegexMatcher::replaceFirst() and replaceAll() through
// their UText overloads, and then appendReplacement()/appendTail().
// Most scenarios are checked twice: once with a NULL destination (the returned
// UText is closed by the test) and once with the caller-supplied destText (the
// returned pointer is asserted to be &destText).
2250 void RegexTest::API_Replace_UTF8() {
2256 UErrorCode status
=U_ZERO_ERROR
;
2258 UText re
=UTEXT_INITIALIZER
;
2259 regextst_openUTF8FromInvariant(&re
, "abc", -1, &status
);
2260 REGEX_VERBOSE_TEXT(&re
);
2261 RegexPattern
*pat
= RegexPattern::compile(&re
, flags
, pe
, status
);
2264 char data
[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2265 // 012345678901234567
2266 UText dataText
= UTEXT_INITIALIZER
;
2267 utext_openUTF8(&dataText
, data
, -1, &status
);
2269 REGEX_VERBOSE_TEXT(&dataText
);
2270 RegexMatcher
*matcher
= pat
->matcher(&dataText
, RegexPattern::PATTERN_IS_UTEXT
, status
);
2273 // Plain vanilla matches.
// destText wraps the UnicodeString dest and is the caller-supplied destination
// used by the "&destText" variant of every replace call below.
2276 UText destText
= UTEXT_INITIALIZER
;
2277 utext_openUnicodeString(&destText
, &dest
, &status
);
2280 UText replText
= UTEXT_INITIALIZER
;
2282 const char str_yz
[] = { 0x79, 0x7a, 0x00 }; /* yz */
2283 utext_openUTF8(&replText
, str_yz
, -1, &status
);
2284 REGEX_VERBOSE_TEXT(&replText
);
2285 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2287 const char str_yzabcabc
[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2288 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc
, result
);
2289 utext_close(result
);
2290 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2292 REGEX_ASSERT(result
== &destText
);
2293 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc
, result
);
2295 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2297 const char str_yzyzyz
[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2298 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz
, result
);
2299 utext_close(result
);
// Empty destText (replace its whole native range with a zero-length replacement)
// before it is reused as a destination.  The same idiom recurs below.
2301 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2302 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2304 REGEX_ASSERT(result
== &destText
);
2305 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz
, result
);
2308 // Plain vanilla non-matches.
// The input contains "abx" rather than "abc", so every replace call is expected
// to hand back the input text unchanged.
2310 const char str_abxabxabx
[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2311 utext_openUTF8(&dataText
, str_abxabxabx
, -1, &status
);
2312 matcher
->reset(&dataText
);
2314 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2316 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2317 utext_close(result
);
2318 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2320 REGEX_ASSERT(result
== &destText
);
2321 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2323 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2325 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2326 utext_close(result
);
2327 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2328 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2330 REGEX_ASSERT(result
== &destText
);
2331 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx
, result
);
2334 // Empty source string
// dataText is re-opened on a NULL buffer of length 0; all results must be "".
2336 utext_openUTF8(&dataText
, NULL
, 0, &status
);
2337 matcher
->reset(&dataText
);
2339 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2341 REGEX_ASSERT_UTEXT_UTF8("", result
);
2342 utext_close(result
);
2343 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2345 REGEX_ASSERT(result
== &destText
);
2346 REGEX_ASSERT_UTEXT_UTF8("", result
);
2348 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2350 REGEX_ASSERT_UTEXT_UTF8("", result
);
2351 utext_close(result
);
2352 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2354 REGEX_ASSERT(result
== &destText
);
2355 REGEX_ASSERT_UTEXT_UTF8("", result
);
2358 // Empty substitution string
// replText is re-opened on a NULL buffer of length 0, so each match of "abc"
// is simply removed from the output.
2360 utext_openUTF8(&dataText
, data
, -1, &status
); // ".abc..abc...abc.."
2361 matcher
->reset(&dataText
);
2363 utext_openUTF8(&replText
, NULL
, 0, &status
);
2364 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2366 const char str_abcabc
[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2367 REGEX_ASSERT_UTEXT_UTF8(str_abcabc
, result
);
2368 utext_close(result
);
2369 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2371 REGEX_ASSERT(result
== &destText
);
2372 REGEX_ASSERT_UTEXT_UTF8(str_abcabc
, result
);
2374 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2376 const char str_dots
[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2377 REGEX_ASSERT_UTEXT_UTF8(str_dots
, result
);
2378 utext_close(result
);
2379 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2380 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2382 REGEX_ASSERT(result
== &destText
);
2383 REGEX_ASSERT_UTEXT_UTF8(str_dots
, result
);
2386 // match whole string
// Input is exactly "abc", so the whole input is replaced by "xyz".
2388 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2389 utext_openUTF8(&dataText
, str_abc
, -1, &status
);
2390 matcher
->reset(&dataText
);
2392 const char str_xyz
[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2393 utext_openUTF8(&replText
, str_xyz
, -1, &status
);
2394 result
= matcher
->replaceFirst(&replText
, NULL
, status
);
2396 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2397 utext_close(result
);
2398 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2399 result
= matcher
->replaceFirst(&replText
, &destText
, status
);
2401 REGEX_ASSERT(result
== &destText
);
2402 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2404 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2406 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2407 utext_close(result
);
2408 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2409 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2411 REGEX_ASSERT(result
== &destText
);
2412 REGEX_ASSERT_UTEXT_UTF8(str_xyz
, result
);
2415 // Capture Group, simple case
// A second pattern/matcher pair (pat2/matcher2) is built for "a(..)" so that
// "$1" in the replacement text refers to the two characters following 'a'.
2417 const char str_add
[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2418 utext_openUTF8(&re
, str_add
, -1, &status
);
2419 RegexPattern
*pat2
= RegexPattern::compile(&re
, flags
, pe
, status
);
2422 const char str_abcdefg
[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2423 utext_openUTF8(&dataText
, str_abcdefg
, -1, &status
);
2424 RegexMatcher
*matcher2
= pat2
->matcher(&dataText
, RegexPattern::PATTERN_IS_UTEXT
, status
);
2427 const char str_11
[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2428 utext_openUTF8(&replText
, str_11
, -1, &status
);
2429 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2431 const char str_bcbcdefg
[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2432 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg
, result
);
2433 utext_close(result
);
2434 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2435 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2437 REGEX_ASSERT(result
== &destText
);
2438 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg
, result
);
// Escaped "\$1" must come through as a literal "$1"; the unescaped "$1" is
// substituted with the captured "bc".
2440 regextst_openUTF8FromInvariant(&replText
, "The value of \\$1 is $1.", -1, &status
);
2441 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2443 const char str_Thevalueof1isbcdefg
[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2444 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg
, result
);
2445 utext_close(result
);
2446 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2447 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2449 REGEX_ASSERT(result
== &destText
);
2450 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg
, result
);
// A '$' that is not followed by a group number is expected to be copied to the
// output verbatim.
2452 const char str_byitselfnogroupnumber
[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x00 }; /* $ by itself, no group number $$$ */
2453 utext_openUTF8(&replText
, str_byitselfnogroupnumber
, -1, &status
);
2454 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2456 const char str_byitselfnogroupnumberdefg
[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2457 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg
, result
);
2458 utext_close(result
);
2459 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2460 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2462 REGEX_ASSERT(result
== &destText
);
2463 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg
, result
);
// The four 'x' placeholder bytes at indices 22..25 are overwritten below with
// F0 9D 9F 8F, so the '$' is followed by a supplementary-plane digit rather
// than an ASCII one.  The expected output still substitutes capture group 1.
2465 unsigned char supplDigitChars
[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2466 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2467 // 012345678901234567890123456
2468 supplDigitChars
[22] = 0xF0;
2469 supplDigitChars
[23] = 0x9D;
2470 supplDigitChars
[24] = 0x9F;
2471 supplDigitChars
[25] = 0x8F;
2472 utext_openUTF8(&replText
, (char *)supplDigitChars
, -1, &status
);
2474 result
= matcher2
->replaceFirst(&replText
, NULL
, status
);
2476 const char str_SupplementalDigit1bcdefg
[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2477 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg
, result
);
2478 utext_close(result
);
2479 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2480 result
= matcher2
->replaceFirst(&replText
, &destText
, status
);
2482 REGEX_ASSERT(result
== &destText
);
2483 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg
, result
);
// "$5" refers to a group that the one-group pattern a(..) does not define; both
// replaceFirst() variants are expected to report U_INDEX_OUTOFBOUNDS_ERROR.
2484 const char str_badcapturegroupnumber5
[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5... */
2485 utext_openUTF8(&replText
, str_badcapturegroupnumber5
, -1, &status
);
2486 REGEX_ASSERT_FAIL((result
= matcher2
->replaceFirst(&replText
, NULL
, status
)), U_INDEX_OUTOFBOUNDS_ERROR
);
2487 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2488 utext_close(result
);
2489 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2490 REGEX_ASSERT_FAIL((result
= matcher2
->replaceFirst(&replText
, &destText
, status
)), U_INDEX_OUTOFBOUNDS_ERROR
);
2491 REGEX_ASSERT(result
== &destText
);
2492 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2495 // Replacement String with \u hex escapes
// The replacement text holds the six characters backslash-u-0-0-4-3; the
// expected output shows them turned into the single character 'C'.
2498 const char str_abc1abc2abc3
[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2499 const char str_u0043
[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2500 utext_openUTF8(&dataText
, str_abc1abc2abc3
, -1, &status
);
2501 utext_openUTF8(&replText
, str_u0043
, -1, &status
);
2502 matcher
->reset(&dataText
);
2504 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2506 const char str_C1C2C3
[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2507 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3
, result
);
2508 utext_close(result
);
2509 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2510 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2512 REGEX_ASSERT(result
== &destText
);
2513 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3
, result
);
// Same idea with an eight-digit \U escape for a supplementary code point.
2516 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2517 utext_openUTF8(&dataText
, str_abc
, -1, &status
);
2518 const char str_U00010000
[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2519 utext_openUTF8(&replText
, str_U00010000
, -1, &status
);
2520 matcher
->reset(&dataText
);
// NOTE(review): the "xxxx" placeholder bytes in `expected` presumably get
// patched to the UTF-8 encoding of U+10000 before the comparison, in the same
// way supplDigitChars is patched above - confirm.
2522 unsigned char expected
[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2529 result
= matcher
->replaceAll(&replText
, NULL
, status
);
2531 REGEX_ASSERT_UTEXT_UTF8((char *)expected
, result
);
2532 utext_close(result
);
2533 utext_replace(&destText
, 0, utext_nativeLength(&destText
), NULL
, 0, &status
);
2534 result
= matcher
->replaceAll(&replText
, &destText
, status
);
2536 REGEX_ASSERT(result
== &destText
);
2537 REGEX_ASSERT_UTEXT_UTF8((char *)expected
, result
);
2539 // TODO: need more thorough testing of capture substitutions.
// appendReplacement() / appendTail(): pattern "ss(.*?)ee", replacement "ooh",
// output accumulated in resultText (a UText over the UnicodeString result).
2544 status
= U_ZERO_ERROR
;
2545 const char str_ssee
[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2546 const char str_blah
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2547 const char str_ooh
[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2548 utext_openUTF8(&re
, str_ssee
, -1, &status
);
2549 utext_openUTF8(&dataText
, str_blah
, -1, &status
);
2550 utext_openUTF8(&replText
, str_ooh
, -1, &status
);
2552 RegexMatcher
m(&re
, 0, status
);
2555 UnicodeString result
;
2556 UText resultText
= UTEXT_INITIALIZER
;
2557 utext_openUnicodeString(&resultText
, &result
, &status
);
2559 // Multiple finds do NOT bump up the previous appendReplacement position.
2563 m
.appendReplacement(&resultText
, &replText
, status
);
2565 const char str_blah2
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2566 REGEX_ASSERT_UTEXT_UTF8(str_blah2
, &resultText
);
2568 // After a reset into the interior of a string, appendReplacement still starts at beginning.
2569 status
= U_ZERO_ERROR
;
2571 utext_openUnicodeString(&resultText
, &result
, &status
);
2572 m
.reset(10, status
);
2575 m
.appendReplacement(&resultText
, &replText
, status
);
2577 const char str_blah3
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2578 REGEX_ASSERT_UTEXT_UTF8(str_blah3
, &resultText
);
2580 // find() at interior of string, appendReplacement still starts at beginning.
2581 status
= U_ZERO_ERROR
;
2583 utext_openUnicodeString(&resultText
, &result
, &status
);
2587 m
.appendReplacement(&resultText
, &replText
, status
);
2589 const char str_blah8
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2590 REGEX_ASSERT_UTEXT_UTF8(str_blah8
, &resultText
);
// appendTail() adds the text after the last match (" fin") to the output.
2592 m
.appendTail(&resultText
, status
);
2593 const char str_blah9
[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2594 REGEX_ASSERT_UTEXT_UTF8(str_blah9
, &resultText
);
2596 utext_close(&resultText
);
// Release the stack UTexts that were opened (and re-opened) throughout the test.
2604 utext_close(&dataText
);
2605 utext_close(&replText
);
2606 utext_close(&destText
);
2611 //---------------------------------------------------------------------------
2613 // API_Pattern_UTF8 Test that the API for class RegexPattern is
2614 // present and nominally working.
2616 //---------------------------------------------------------------------------
// API_Pattern_UTF8: smoke test of the RegexPattern API with patterns supplied
// as UText.  Covers construction, equality/inequality, assignment and copy
// construction, compile() with and without flags, clone(), matcher creation
// from a clone, the static matches() helper, split(), and pattern() /
// patternText().
2617 void RegexTest::API_Pattern_UTF8() {
2618 RegexPattern pata
; // Test default constructor to not crash.
2621 REGEX_ASSERT(pata
== patb
);
2622 REGEX_ASSERT(pata
== pata
);
2624 UText re1
= UTEXT_INITIALIZER
;
2625 UText re2
= UTEXT_INITIALIZER
;
2626 UErrorCode status
= U_ZERO_ERROR
;
2629 const char str_abcalmz
[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2630 const char str_def
[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2631 utext_openUTF8(&re1
, str_abcalmz
, -1, &status
);
2632 utext_openUTF8(&re2
, str_def
, -1, &status
);
2634 RegexPattern
*pat1
= RegexPattern::compile(&re1
, 0, pe
, status
);
2635 RegexPattern
*pat2
= RegexPattern::compile(&re2
, 0, pe
, status
);
2637 REGEX_ASSERT(*pat1
== *pat1
);
2638 REGEX_ASSERT(*pat1
!= pata
);
2642 REGEX_ASSERT(patb
== *pat1
);
2645 RegexPattern
patc(*pat1
);
2646 REGEX_ASSERT(patc
== *pat1
);
2647 REGEX_ASSERT(patb
== patc
);
// NOTE(review): this compares the two pointers pat1 and pat2, not the pattern
// objects they point to; `*pat1 != *pat2` may have been intended - confirm.
2648 REGEX_ASSERT(pat1
!= pat2
);
2650 REGEX_ASSERT(patb
!= patc
);
2651 REGEX_ASSERT(patb
== *pat2
);
2653 // Compile with no flags.
2654 RegexPattern
*pat1a
= RegexPattern::compile(&re1
, pe
, status
);
2655 REGEX_ASSERT(*pat1a
== *pat1
);
2657 REGEX_ASSERT(pat1a
->flags() == 0);
2659 // Compile with different flags should be not equal
2660 RegexPattern
*pat1b
= RegexPattern::compile(&re1
, UREGEX_CASE_INSENSITIVE
, pe
, status
);
2663 REGEX_ASSERT(*pat1b
!= *pat1a
);
2664 REGEX_ASSERT(pat1b
->flags() == UREGEX_CASE_INSENSITIVE
);
2665 REGEX_ASSERT(pat1a
->flags() == 0);
// clone() must produce a pattern equal to its source and unequal to pat2.
2669 RegexPattern
*pat1c
= pat1
->clone();
2670 REGEX_ASSERT(*pat1c
== *pat1
);
2671 REGEX_ASSERT(*pat1c
!= *pat2
);
2683 // Verify that a matcher created from a cloned pattern works.
// \p{L}+ over "Hello World" must find "Hello", then "World", then nothing.
2687 UErrorCode status
= U_ZERO_ERROR
;
2688 UText pattern
= UTEXT_INITIALIZER
;
2689 const char str_pL
[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2690 utext_openUTF8(&pattern
, str_pL
, -1, &status
);
2692 RegexPattern
*pSource
= RegexPattern::compile(&pattern
, 0, status
);
2693 RegexPattern
*pClone
= pSource
->clone();
2695 RegexMatcher
*mFromClone
= pClone
->matcher(status
);
2698 UText input
= UTEXT_INITIALIZER
;
2699 const char str_HelloWorld
[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2700 utext_openUTF8(&input
, str_HelloWorld
, -1, &status
);
2701 mFromClone
->reset(&input
);
2702 REGEX_ASSERT(mFromClone
->find() == TRUE
);
2703 REGEX_ASSERT(mFromClone
->group(status
) == "Hello");
2704 REGEX_ASSERT(mFromClone
->find() == TRUE
);
2705 REGEX_ASSERT(mFromClone
->group(status
) == "World");
2706 REGEX_ASSERT(mFromClone
->find() == FALSE
);
2710 utext_close(&input
);
2711 utext_close(&pattern
);
2715 // matches convenience API
2718 UErrorCode status
= U_ZERO_ERROR
;
2719 UText pattern
= UTEXT_INITIALIZER
;
2720 UText input
= UTEXT_INITIALIZER
;
2722 const char str_randominput
[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2723 utext_openUTF8(&input
, str_randominput
, -1, &status
);
2725 const char str_dotstar
[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2726 utext_openUTF8(&pattern
, str_dotstar
, -1, &status
);
2727 REGEX_ASSERT(RegexPattern::matches(&pattern
, &input
, pe
, status
) == TRUE
);
// NOTE(review): from here on the UText `pattern` (and later `input`) is
// re-opened for each case, but matches() is called with string literals, so
// the prepared UTexts are not what is passed.  Confirm whether the
// (&pattern, &input, ...) form used in the first case was intended.
2730 const char str_abc
[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2731 utext_openUTF8(&pattern
, str_abc
, -1, &status
);
2732 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe
, status
) == FALSE
);
2735 const char str_nput
[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2736 utext_openUTF8(&pattern
, str_nput
, -1, &status
);
2737 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe
, status
) == TRUE
);
2740 utext_openUTF8(&pattern
, str_randominput
, -1, &status
);
2741 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe
, status
) == TRUE
);
2744 const char str_u
[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2745 utext_openUTF8(&pattern
, str_u
, -1, &status
);
2746 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe
, status
) == FALSE
);
// With a failure code already in status, matches() must return FALSE and
// leave status untouched.
2749 utext_openUTF8(&input
, str_abc
, -1, &status
);
2750 utext_openUTF8(&pattern
, str_abc
, -1, &status
);
2751 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
2752 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe
, status
) == FALSE
);
2753 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
2755 utext_close(&input
);
2756 utext_close(&pattern
);
// split(): pat1 is recompiled as " +" (one or more spaces) and the input is
// split into the fields[] array with various capacity limits.  Later cases
// also check that entries beyond the reported fields keep their old contents.
2763 status
= U_ZERO_ERROR
;
2764 const char str_spaceplus
[] = { 0x20, 0x2b, 0x00 }; /* + */
2765 utext_openUTF8(&re1
, str_spaceplus
, -1, &status
);
2766 pat1
= RegexPattern::compile(&re1
, pe
, status
);
2768 UnicodeString fields
[10];
2771 n
= pat1
->split("Now is the time", fields
, 10, status
);
2774 REGEX_ASSERT(fields
[0]=="Now");
2775 REGEX_ASSERT(fields
[1]=="is");
2776 REGEX_ASSERT(fields
[2]=="the");
2777 REGEX_ASSERT(fields
[3]=="time");
2778 REGEX_ASSERT(fields
[4]=="");
// Capacity 2: the second field receives the whole unsplit remainder.
2780 n
= pat1
->split("Now is the time", fields
, 2, status
);
2783 REGEX_ASSERT(fields
[0]=="Now");
2784 REGEX_ASSERT(fields
[1]=="is the time");
2785 REGEX_ASSERT(fields
[2]=="the"); // left over from previous test
// Capacity 1: the single field receives the entire input.
2788 status
= U_ZERO_ERROR
;
2789 n
= pat1
->split("Now is the time", fields
, 1, status
);
2792 REGEX_ASSERT(fields
[0]=="Now is the time");
2793 REGEX_ASSERT(fields
[1]=="*");
2794 status
= U_ZERO_ERROR
;
// Leading delimiter: the first field is expected to be empty.
2796 n
= pat1
->split(" Now is the time ", fields
, 10, status
);
2799 REGEX_ASSERT(fields
[0]=="");
2800 REGEX_ASSERT(fields
[1]=="Now");
2801 REGEX_ASSERT(fields
[2]=="is");
2802 REGEX_ASSERT(fields
[3]=="the");
2803 REGEX_ASSERT(fields
[4]=="time");
2804 REGEX_ASSERT(fields
[5]=="");
2806 n
= pat1
->split(" ", fields
, 10, status
);
2809 REGEX_ASSERT(fields
[0]=="");
// Empty input: fields[0] is expected to keep its previous value "foo".
2812 n
= pat1
->split("", fields
, 10, status
);
2815 REGEX_ASSERT(fields
[0]=="foo");
2819 // split, with a pattern with (capture)
// With a capture group in the delimiter pattern, the captured text ("a", "b",
// "c") appears as its own field between the surrounding pieces.
2820 regextst_openUTF8FromInvariant(&re1
, "<(\\w*)>", -1, &status
);
2821 pat1
= RegexPattern::compile(&re1
, pe
, status
);
2824 status
= U_ZERO_ERROR
;
2825 n
= pat1
->split("<a>Now is <b>the time<c>", fields
, 10, status
);
2828 REGEX_ASSERT(fields
[0]=="");
2829 REGEX_ASSERT(fields
[1]=="a");
2830 REGEX_ASSERT(fields
[2]=="Now is ");
2831 REGEX_ASSERT(fields
[3]=="b");
2832 REGEX_ASSERT(fields
[4]=="the time");
2833 REGEX_ASSERT(fields
[5]=="c");
2834 REGEX_ASSERT(fields
[6]=="");
2835 REGEX_ASSERT(status
==U_ZERO_ERROR
);
2837 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 10, status
);
2840 REGEX_ASSERT(fields
[0]==" ");
2841 REGEX_ASSERT(fields
[1]=="a");
2842 REGEX_ASSERT(fields
[2]=="Now is ");
2843 REGEX_ASSERT(fields
[3]=="b");
2844 REGEX_ASSERT(fields
[4]=="the time");
2845 REGEX_ASSERT(fields
[5]=="c");
2846 REGEX_ASSERT(fields
[6]=="");
// The next cases shrink the capacity (6, 5, 4).  fields[] entries past the
// capacity are expected to be left alone ("foo" sentinel).
2848 status
= U_ZERO_ERROR
;
2850 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 6, status
);
2853 REGEX_ASSERT(fields
[0]==" ");
2854 REGEX_ASSERT(fields
[1]=="a");
2855 REGEX_ASSERT(fields
[2]=="Now is ");
2856 REGEX_ASSERT(fields
[3]=="b");
2857 REGEX_ASSERT(fields
[4]=="the time");
2858 REGEX_ASSERT(fields
[5]=="c");
2859 REGEX_ASSERT(fields
[6]=="foo");
2861 status
= U_ZERO_ERROR
;
2863 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 5, status
);
2866 REGEX_ASSERT(fields
[0]==" ");
2867 REGEX_ASSERT(fields
[1]=="a");
2868 REGEX_ASSERT(fields
[2]=="Now is ");
2869 REGEX_ASSERT(fields
[3]=="b");
2870 REGEX_ASSERT(fields
[4]=="the time<c>");
2871 REGEX_ASSERT(fields
[5]=="foo");
2873 status
= U_ZERO_ERROR
;
2875 n
= pat1
->split(" <a>Now is <b>the time", fields
, 5, status
);
2878 REGEX_ASSERT(fields
[0]==" ");
2879 REGEX_ASSERT(fields
[1]=="a");
2880 REGEX_ASSERT(fields
[2]=="Now is ");
2881 REGEX_ASSERT(fields
[3]=="b");
2882 REGEX_ASSERT(fields
[4]=="the time");
2883 REGEX_ASSERT(fields
[5]=="foo");
2885 status
= U_ZERO_ERROR
;
2886 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 4, status
);
2889 REGEX_ASSERT(fields
[0]==" ");
2890 REGEX_ASSERT(fields
[1]=="a");
2891 REGEX_ASSERT(fields
[2]=="Now is ");
2892 REGEX_ASSERT(fields
[3]=="the time<c>");
2893 status
= U_ZERO_ERROR
;
// Delimiter is a captured '-' or ','; the delimiter itself shows up as a field.
2896 regextst_openUTF8FromInvariant(&re1
, "([-,])", -1, &status
);
2897 pat1
= RegexPattern::compile(&re1
, pe
, status
);
2899 n
= pat1
->split("1-10,20", fields
, 10, status
);
2902 REGEX_ASSERT(fields
[0]=="1");
2903 REGEX_ASSERT(fields
[1]=="-");
2904 REGEX_ASSERT(fields
[2]=="10");
2905 REGEX_ASSERT(fields
[3]==",");
2906 REGEX_ASSERT(fields
[4]=="20");
2911 // RegexPattern::pattern() and patternText()
// A default-constructed pattern must report an empty pattern string through
// both accessors; a compiled one must report the text it was compiled from.
2913 pat1
= new RegexPattern();
2914 REGEX_ASSERT(pat1
->pattern() == "");
2915 REGEX_ASSERT_UTEXT_UTF8("", pat1
->patternText(status
));
2918 regextst_openUTF8FromInvariant(&re1
, "(Hello, world)*", -1, &status
);
2919 pat1
= RegexPattern::compile(&re1
, pe
, status
);
2921 REGEX_ASSERT(pat1
->pattern() == "(Hello, world)*");
2922 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1
->patternText(status
));
2929 //---------------------------------------------------------------------------
2931 // Extended A more thorough check for features of regex patterns
2932 // The test cases are in a separate data file,
2933 // source/test/testdata/regextst.txt
2934 // A description of the test data format is included in that file.
2936 //---------------------------------------------------------------------------
// getPath: builds the path of a test data file by copying the source test data
// directory (from IntlTest::getSourceTestData) into `buffer` and appending
// `filename`.  If the directory lookup fails, an error is logged with errln().
//   buffer    caller-provided output buffer, declared as char[2048]
//   filename  file name to append to the test data directory
// NOTE(review): strcpy()/strcat() are not bounded by the declared 2048-byte
// size; a long directory plus file name would overrun the buffer - confirm
// whether a length check is wanted.
2939 RegexTest::getPath(char buffer
[2048], const char *filename
) {
2940 UErrorCode status
=U_ZERO_ERROR
;
2941 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
2942 if (U_FAILURE(status
)) {
2943 errln("ERROR: loadTestData() failed - %s", u_errorName(status
));
2947 strcpy(buffer
, testDataDirectory
);
2948 strcat(buffer
, filename
);
// Extended: data-driven regex test.  Reads regextst.txt as UTF-8 and walks it
// line by line.  Blank and comment-only lines are recognised and handled
// separately; every other line is parsed into a quoted pattern, a flags field
// and a quoted match string, which are handed to regex_find() together with
// the file path and line number.  Malformed lines are reported with errln().
2952 void RegexTest::Extended() {
2954 const char *srcPath
;
2955 UErrorCode status
= U_ZERO_ERROR
;
2956 int32_t lineNum
= 0;
2959 // Open and read the test data file.
2961 srcPath
=getPath(tdd
, "regextst.txt");
2963 return; /* something went wrong, error already output */
2967 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, "utf-8", status
);
2968 if (U_FAILURE(status
)) {
2969 return; /* something went wrong, error already output */
2973 // Put the test data into a UnicodeString
2975 UnicodeString
testString(FALSE
, testData
, len
);
// Helper matchers used to take a test line apart:
//   quotedStuffMat  optional white space, an opening ' " or /, the lazily
//                   captured body (group 2), and the same closing delimiter
//   commentMat      optional white space and an optional '#' comment up to
//                   the end of the line
//   flagsMat        group 1 = recognised flag characters, group 2 = any
//                   letters that follow (treated as an error below)
//   lineMat         one line of the file (group 1), ended by \n or \r\n
2977 RegexMatcher
quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status
);
2978 RegexMatcher
commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status
);
2979 RegexMatcher
flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMvabtyYzZ2-9]*)([:letter:]*)"), 0, status
);
2981 RegexMatcher
lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString
, 0, status
);
2982 UnicodeString testPattern
; // The pattern for test from the test file.
2983 UnicodeString testFlags
; // the flags for a test.
2984 UnicodeString matchString
; // The marked up string to be used as input
2986 if (U_FAILURE(status
)){
2987 dataerrln("Construct RegexMatcher() error.");
2993 // Loop over the test data file, once per line.
2995 while (lineMat
.find()) {
// An error status left over from the previous line is reported here, then
// cleared so that it does not affect the current line.
2997 if (U_FAILURE(status
)) {
2998 errln("%s:%d: ICU Error \"%s\"", srcPath
, lineNum
, u_errorName(status
));
3001 status
= U_ZERO_ERROR
;
3002 UnicodeString testLine
= lineMat
.group(1, status
);
3003 if (testLine
.length() == 0) {
3008 // Parse the test line. Skip blank and comment only lines.
3009 // Separate out the three main fields - pattern, flags, target.
3012 commentMat
.reset(testLine
);
3013 if (commentMat
.lookingAt(status
)) {
3014 // This line is a comment, or blank.
3019 // Pull out the pattern field, remove it from the test file line.
3021 quotedStuffMat
.reset(testLine
);
3022 if (quotedStuffMat
.lookingAt(status
)) {
3023 testPattern
= quotedStuffMat
.group(2, status
);
3024 testLine
.remove(0, quotedStuffMat
.end(0, status
));
3026 errln("Bad pattern (missing quotes?) at %s:%d", srcPath
, lineNum
);
3032 // Pull out the flags from the test file line.
3034 flagsMat
.reset(testLine
);
3035 flagsMat
.lookingAt(status
); // Will always match, possibly an empty string.
3036 testFlags
= flagsMat
.group(1, status
);
// Anything captured by group 2 is a letter that is not a recognised flag.
3037 if (flagsMat
.group(2, status
).length() > 0) {
3038 errln("Bad Match flag at line %d. Scanning %c\n",
3039 lineNum
, flagsMat
.group(2, status
).charAt(0));
3042 testLine
.remove(0, flagsMat
.end(0, status
));
3045 // Pull out the match string, as a whole.
3046 // We'll process the <tags> later.
3048 quotedStuffMat
.reset(testLine
);
3049 if (quotedStuffMat
.lookingAt(status
)) {
3050 matchString
= quotedStuffMat
.group(2, status
);
3051 testLine
.remove(0, quotedStuffMat
.end(0, status
));
3053 errln("Bad match string at test file line %d", lineNum
);
3058 // The only thing left from the input line should be an optional trailing comment.
3060 commentMat
.reset(testLine
);
3061 if (commentMat
.lookingAt(status
) == FALSE
) {
3062 errln("Line %d: unexpected characters at end of test line.", lineNum
);
// All three fields parsed cleanly: run the actual test for this line.
3069 regex_find(testPattern
, testFlags
, matchString
, srcPath
, lineNum
);
3078 //---------------------------------------------------------------------------
3080 // regex_find(pattern, flags, inputString, lineNumber)
3082 // Function to run a single test from the Extended (data driven) tests.
3083 // See file test/testdata/regextst.txt for a description of the
3084 // pattern and inputString fields, and the allowed flags.
3085 // lineNumber is the source line in regextst.txt of the test.
3087 //---------------------------------------------------------------------------
3090 // Set a value into a UVector at position specified by a decimal number in
3091 // a UnicodeString. This is a utility function needed by the actual test function, regex_find().
// set: stores `val` in `vec` at a position given as text.
//   vec    vector to update; it is padded with -1 entries until the target
//          position exists
//   val    value to store
//   index  decimal digits naming the position; each character is converted
//          with u_charDigitValue()
// NOTE(review): `index` is taken by value, which copies the UnicodeString on
// every call; a const reference would presumably do - confirm.
3093 static void set(UVector
&vec
, int32_t val
, UnicodeString index
) {
3094 UErrorCode status
=U_ZERO_ERROR
;
3096 for (int32_t i
=0; i
<index
.length(); i
++) {
3097 int32_t d
=u_charDigitValue(index
.charAt(i
));
// Grow the vector with -1 fillers so that position idx is valid, then store.
3101 while (vec
.size()<idx
+1) {vec
.addElement(-1, status
);}
3102 vec
.setElementAt(val
, idx
);
// setInt: stores `val` in `vec` at integer position `idx`.  The vector is first
// padded with -1 entries until it holds at least idx+1 elements.
// The status from addElement() is kept in a local and not examined here.
3105 static void setInt(UVector
&vec
, int32_t val
, int32_t idx
) {
3106 UErrorCode status
=U_ZERO_ERROR
;
3107 while (vec
.size()<idx
+1) {vec
.addElement(-1, status
);}
3108 vec
.setElementAt(val
, idx
);
// utextOffsetToNative: translates an offset expressed in UnicodeString terms
// into the matching native index of `utext`.
//   utext         text to walk; its iteration position is reset to native
//                 index 0 and left wherever the walk ends
//   unistrOffset  target offset to reach
//   nativeIndex   out: native index of utext once the walk stops
// The walk reads code points with UTEXT_NEXT32 and treats U_SENTINEL as
// running out of text.
// NOTE(review): presumably the returned UBool is couldFind, i.e. FALSE when the
// text ended before unistrOffset was reached - confirm.
3111 static UBool
utextOffsetToNative(UText
*utext
, int32_t unistrOffset
, int32_t& nativeIndex
)
3113 UBool couldFind
= TRUE
;
3114 UTEXT_SETNATIVEINDEX(utext
, 0);
3116 while (i
< unistrOffset
) {
3117 UChar32 c
= UTEXT_NEXT32(utext
);
3118 if (c
!= U_SENTINEL
) {
3125 nativeIndex
= UTEXT_GETNATIVEINDEX(utext
);
// Run one data-driven test.
// The pattern is compiled with the options selected by the letters in flags. The
// <n>, </n>, <r> and </r> tags are removed from inputString and their positions
// become the expected capture group and region boundaries. The match is then run
// twice: with a UnicodeString based matcher, and (when it can be created) with a
// matcher over a UTF-8 UText. Both sets of results are compared with the expectations.
// srcPath and line identify the test in log and error messages.
3130 void RegexTest::regex_find(const UnicodeString
&pattern
,
3131 const UnicodeString
&flags
,
3132 const UnicodeString
&inputString
,
3133 const char *srcPath
,
3135 UnicodeString unEscapedInput
;
3136 UnicodeString deTaggedInput
;
3138 int32_t patternUTF8Length
, inputUTF8Length
;
3139 char *patternChars
= NULL
, *inputChars
= NULL
;
3140 UText patternText
= UTEXT_INITIALIZER
;
3141 UText inputText
= UTEXT_INITIALIZER
;
3142 UConverter
*UTF8Converter
= NULL
;
3144 UErrorCode status
= U_ZERO_ERROR
;
3146 RegexPattern
*parsePat
= NULL
;
3147 RegexMatcher
*parseMatcher
= NULL
;
3148 RegexPattern
*callerPattern
= NULL
, *UTF8Pattern
= NULL
;
3149 RegexMatcher
*matcher
= NULL
, *UTF8Matcher
= NULL
;
// Expected group start/end offsets taken from the tags in the input string.
// The UTF8 vectors hold the same boundaries as native indexes into inputText.
3150 UVector
groupStarts(status
);
3151 UVector
groupEnds(status
);
3152 UVector
groupStartsUTF8(status
);
3153 UVector
groupEndsUTF8(status
);
3154 UBool isMatch
= FALSE
, isUTF8Match
= FALSE
;
3155 UBool failed
= FALSE
;
3158 UBool useMatchesFunc
= FALSE
;
3159 UBool useLookingAtFunc
= FALSE
;
// -1 means that no region tag was seen.
3160 int32_t regionStart
= -1;
3161 int32_t regionEnd
= -1;
3162 int32_t regionStartUTF8
= -1;
3163 int32_t regionEndUTF8
= -1;
3167 // Compile the caller's pattern
3169 uint32_t bflags
= 0;
3170 if (flags
.indexOf((UChar
)0x69) >= 0) { // 'i' flag
3171 bflags
|= UREGEX_CASE_INSENSITIVE
;
3173 if (flags
.indexOf((UChar
)0x78) >= 0) { // 'x' flag
3174 bflags
|= UREGEX_COMMENTS
;
3176 if (flags
.indexOf((UChar
)0x73) >= 0) { // 's' flag
3177 bflags
|= UREGEX_DOTALL
;
3179 if (flags
.indexOf((UChar
)0x6d) >= 0) { // 'm' flag
3180 bflags
|= UREGEX_MULTILINE
;
3183 if (flags
.indexOf((UChar
)0x65) >= 0) { // 'e' flag
3184 bflags
|= UREGEX_ERROR_ON_UNKNOWN_ESCAPES
;
3186 if (flags
.indexOf((UChar
)0x44) >= 0) { // 'D' flag
3187 bflags
|= UREGEX_UNIX_LINES
;
3191 callerPattern
= RegexPattern::compile(pattern
, bflags
, pe
, status
);
// Any status other than U_ZERO_ERROR is handled as a compile failure here.
3192 if (status
!= U_ZERO_ERROR
) {
3193 #if UCONFIG_NO_BREAK_ITERATION==1
3194 // 'v' test flag means that the test pattern should not compile if ICU was configured
3195 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3196 if (flags
.indexOf((UChar
)0x76) >= 0 /*'v'*/ && status
== U_UNSUPPORTED_ERROR
) {
3197 goto cleanupAndReturn
;
3200 if (flags
.indexOf((UChar
)0x45) >= 0) { // flags contain 'E'
3201 // Expected pattern compilation error.
3202 if (flags
.indexOf((UChar
)0x64) >= 0) { // flags contain 'd'
3203 logln("Pattern Compile returns \"%s\"", u_errorName(status
));
3205 goto cleanupAndReturn
;
3207 // Unexpected pattern compilation error.
3208 errln("Line %d: error %s compiling pattern.", line
, u_errorName(status
));
3209 goto cleanupAndReturn
;
// Second compile of the same pattern, this time from a UTF-8 UText.
// NOTE(review): the STOP callback is presumably installed so that text which cannot
// be converted to UTF-8 makes the conversion fail instead of being substituted - confirm.
3213 UTF8Converter
= ucnv_open("UTF8", &status
);
3214 ucnv_setFromUCallBack(UTF8Converter
, UCNV_FROM_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &status
);
// First extract() call has no destination buffer and is used only for the length;
// the second call fills the newly allocated buffer.
3216 patternUTF8Length
= pattern
.extract(NULL
, 0, UTF8Converter
, status
);
3217 status
= U_ZERO_ERROR
; // buffer overflow
3218 patternChars
= new char[patternUTF8Length
+1];
3219 pattern
.extract(patternChars
, patternUTF8Length
+1, UTF8Converter
, status
);
3220 utext_openUTF8(&patternText
, patternChars
, patternUTF8Length
, &status
);
// Only compile the UTF-8 pattern if the conversion and the UText open succeeded.
3222 if (status
== U_ZERO_ERROR
) {
3223 UTF8Pattern
= RegexPattern::compile(&patternText
, bflags
, pe
, status
);
3225 if (status
!= U_ZERO_ERROR
) {
3226 #if UCONFIG_NO_BREAK_ITERATION==1
3227 // 'v' test flag means that the test pattern should not compile if ICU was configured
3228 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3229 if (flags
.indexOf((UChar
)0x76) >= 0 /*'v'*/ && status
== U_UNSUPPORTED_ERROR
) {
3230 goto cleanupAndReturn
;
3233 if (flags
.indexOf((UChar
)0x45) >= 0) { // flags contain 'E'
3234 // Expected pattern compilation error.
3235 if (flags
.indexOf((UChar
)0x64) >= 0) { // flags contain 'd'
3236 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status
));
3238 goto cleanupAndReturn
;
3240 // Unexpected pattern compilation error.
3241 errln("Line %d: error %s compiling pattern. (UTF8)", line
, u_errorName(status
));
3242 goto cleanupAndReturn
;
3247 if (UTF8Pattern
== NULL
) {
3248 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3249 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath
, line
);
3250 status
= U_ZERO_ERROR
;
3253 if (flags
.indexOf((UChar
)0x64) >= 0) { // 'd' flag
3254 RegexPatternDump(callerPattern
);
// Reaching this point means the pattern compiled, so an 'E' flag is now a test failure.
3257 if (flags
.indexOf((UChar
)0x45) >= 0) { // 'E' flag
3258 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath
, line
);
3259 goto cleanupAndReturn
;
3264 // Number of times find() should be called on the test string, default to 1
// Digits '2' through '9' are looked for in flags; at most one of them is allowed.
3267 for (i
=2; i
<=9; i
++) {
3268 if (flags
.indexOf((UChar
)(0x30 + i
)) >= 0) { // digit flag
3269 if (numFinds
!= 1) {
3270 errln("Line %d: more than one digit flag. Scanning %d.", line
, i
);
3271 goto cleanupAndReturn
;
3277 // 'M' flag. Use matches() instead of find()
3278 if (flags
.indexOf((UChar
)0x4d) >= 0) {
3279 useMatchesFunc
= TRUE
;
// 'L' flag (0x4c). Use lookingAt() instead of find()
3281 if (flags
.indexOf((UChar
)0x4c) >= 0) {
3282 useLookingAtFunc
= TRUE
;
3286 // Find the tags in the input data, remove them, and record the group boundary
// Tags have the form <n>, </n>, <r> or </r>. Capture group 1 of the tag pattern is
// the optional '/', capture group 2 is either "r" or the digits.
3289 parsePat
= RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe
, status
);
3290 REGEX_CHECK_STATUS_L(line
);
3292 unEscapedInput
= inputString
.unescape();
3293 parseMatcher
= parsePat
->matcher(unEscapedInput
, status
);
3294 REGEX_CHECK_STATUS_L(line
);
3295 while(parseMatcher
->find()) {
// Replacing each tag with "" appends the text before it to deTaggedInput, so
// deTaggedInput.length() is the position of the tag in the tag-free input.
3296 parseMatcher
->appendReplacement(deTaggedInput
, "", status
);
3298 UnicodeString groupNum
= parseMatcher
->group(2, status
);
3299 if (groupNum
== "r") {
3300 // <r> or </r>, a region specification within the string
3301 if (parseMatcher
->group(1, status
) == "/") {
3302 regionEnd
= deTaggedInput
.length();
3304 regionStart
= deTaggedInput
.length();
3307 // <digits> or </digits>, a group match boundary tag.
3308 if (parseMatcher
->group(1, status
) == "/") {
3309 set(groupEnds
, deTaggedInput
.length(), groupNum
);
3311 set(groupStarts
, deTaggedInput
.length(), groupNum
);
3315 parseMatcher
->appendTail(deTaggedInput
);
3316 REGEX_ASSERT_L(groupStarts
.size() == groupEnds
.size(), line
);
// If either region tag was seen, both must be present and <r> must not follow </r>.
3317 if ((regionStart
>=0 || regionEnd
>=0) && (regionStart
<0 || regionStart
>regionEnd
)) {
3318 errln("mismatched <r> tags");
3320 goto cleanupAndReturn
;
3324 // Configure the matcher according to the flags specified with this test.
3326 matcher
= callerPattern
->matcher(deTaggedInput
, status
);
3327 REGEX_CHECK_STATUS_L(line
);
3328 if (flags
.indexOf((UChar
)0x74) >= 0) { // 't' trace flag
3329 matcher
->setTrace(TRUE
);
// Build the UTF-8 copy of the tag-free input and a matcher over it, using the same
// measure-then-convert sequence as for the pattern above.
3332 if (UTF8Pattern
!= NULL
) {
3333 inputUTF8Length
= deTaggedInput
.extract(NULL
, 0, UTF8Converter
, status
);
3334 status
= U_ZERO_ERROR
; // buffer overflow
3335 inputChars
= new char[inputUTF8Length
+1];
3336 deTaggedInput
.extract(inputChars
, inputUTF8Length
+1, UTF8Converter
, status
);
3337 utext_openUTF8(&inputText
, inputChars
, inputUTF8Length
, &status
);
3339 if (status
== U_ZERO_ERROR
) {
3340 UTF8Matcher
= UTF8Pattern
->matcher(&inputText
, RegexPattern::PATTERN_IS_UTEXT
, status
);
3341 REGEX_CHECK_STATUS_L(line
);
3344 if (UTF8Matcher
== NULL
) {
3345 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3346 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath
, line
);
3347 status
= U_ZERO_ERROR
;
3352 // Generate native indices for UTF8 versions of region and capture group info
3354 if (UTF8Matcher
!= NULL
) {
// The return value is deliberately ignored for the region boundaries.
3355 if (regionStart
>=0) (void) utextOffsetToNative(&inputText
, regionStart
, regionStartUTF8
);
3356 if (regionEnd
>=0) (void) utextOffsetToNative(&inputText
, regionEnd
, regionEndUTF8
);
3358 // Fill out the native index UVector info.
3359 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3360 for (i
=0; i
<groupStarts
.size(); i
++) {
3361 int32_t start
= groupStarts
.elementAti(i
);
3362 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3365 if (!utextOffsetToNative(&inputText
, start
, startUTF8
)) {
3366 errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line
, i
, start
);
3368 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3370 setInt(groupStartsUTF8
, startUTF8
, i
);
3373 int32_t end
= groupEnds
.elementAti(i
);
3374 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3377 if (!utextOffsetToNative(&inputText
, end
, endUTF8
)) {
3378 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line
, i
, end
);
3380 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3382 setInt(groupEndsUTF8
, endUTF8
, i
);
// Apply the region from the <r> tags to both matchers.
3387 if (regionStart
>=0) {
3388 matcher
->region(regionStart
, regionEnd
, status
);
3389 REGEX_CHECK_STATUS_L(line
);
3390 if (UTF8Matcher
!= NULL
) {
3391 UTF8Matcher
->region(regionStartUTF8
, regionEndUTF8
, status
);
3392 REGEX_CHECK_STATUS_L(line
);
3395 if (flags
.indexOf((UChar
)0x61) >= 0) { // 'a' anchoring bounds flag
3396 matcher
->useAnchoringBounds(FALSE
);
3397 if (UTF8Matcher
!= NULL
) {
3398 UTF8Matcher
->useAnchoringBounds(FALSE
);
3401 if (flags
.indexOf((UChar
)0x62) >= 0) { // 'b' transparent bounds flag
3402 matcher
->useTransparentBounds(TRUE
);
3403 if (UTF8Matcher
!= NULL
) {
3404 UTF8Matcher
->useTransparentBounds(TRUE
);
3411 // Do a find on the de-tagged input using the caller's pattern
3412 // TODO: error on count>1 and not find().
3413 // error on both matches() and lookingAt().
// Each pass overwrites isMatch / isUTF8Match, so only the last result is checked below.
3415 for (i
=0; i
<numFinds
; i
++) {
3416 if (useMatchesFunc
) {
3417 isMatch
= matcher
->matches(status
);
3418 if (UTF8Matcher
!= NULL
) {
3419 isUTF8Match
= UTF8Matcher
->matches(status
);
3421 } else if (useLookingAtFunc
) {
3422 isMatch
= matcher
->lookingAt(status
);
3423 if (UTF8Matcher
!= NULL
) {
3424 isUTF8Match
= UTF8Matcher
->lookingAt(status
);
3427 isMatch
= matcher
->find();
3428 if (UTF8Matcher
!= NULL
) {
3429 isUTF8Match
= UTF8Matcher
->find();
3433 matcher
->setTrace(FALSE
);
3436 // Match up the groups from the find() with the groups from the tags
3439 // number of tags should match number of groups from find operation.
3440 // matcher->groupCount does not include group 0, the entire match, hence the +1.
3441 // G option in test means that capture group data is not available in the
3442 // expected results, so the check needs to be suppressed.
// A non-empty groupStarts means the test data contained group tags, i.e. a match is expected.
3443 if (isMatch
== FALSE
&& groupStarts
.size() != 0) {
3444 errln("Error at line %d: Match expected, but none found.", line
);
3446 goto cleanupAndReturn
;
3447 } else if (UTF8Matcher
!= NULL
&& isUTF8Match
== FALSE
&& groupStarts
.size() != 0) {
3448 errln("Error at line %d: Match expected, but none found. (UTF8)", line
);
3450 goto cleanupAndReturn
;
3453 if (flags
.indexOf((UChar
)0x47 /*G*/) >= 0) {
3454 // Only check for match / no match. Don't check capture groups.
3455 if (isMatch
&& groupStarts
.size() == 0) {
3456 errln("Error at line %d: No match expected, but one found.", line
);
3458 } else if (UTF8Matcher
!= NULL
&& isUTF8Match
&& groupStarts
.size() == 0) {
3459 errln("Error at line %d: No match expected, but one found. (UTF8)", line
);
3462 goto cleanupAndReturn
;
3465 REGEX_CHECK_STATUS_L(line
);
// Compare every group, including group 0, with the expected boundaries.
// Groups without a tag in the test data are expected to report -1.
3466 for (i
=0; i
<=matcher
->groupCount(); i
++) {
3467 int32_t expectedStart
= (i
>= groupStarts
.size()? -1 : groupStarts
.elementAti(i
));
3468 int32_t expectedStartUTF8
= (i
>= groupStartsUTF8
.size()? -1 : groupStartsUTF8
.elementAti(i
));
3469 if (matcher
->start(i
, status
) != expectedStart
) {
3470 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
3471 line
, i
, expectedStart
, matcher
->start(i
, status
));
3473 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3474 } else if (UTF8Matcher
!= NULL
&& UTF8Matcher
->start(i
, status
) != expectedStartUTF8
) {
3475 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
3476 line
, i
, expectedStartUTF8
, UTF8Matcher
->start(i
, status
));
3478 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
3481 int32_t expectedEnd
= (i
>= groupEnds
.size()? -1 : groupEnds
.elementAti(i
));
3482 int32_t expectedEndUTF8
= (i
>= groupEndsUTF8
.size()? -1 : groupEndsUTF8
.elementAti(i
));
3483 if (matcher
->end(i
, status
) != expectedEnd
) {
3484 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
3485 line
, i
, expectedEnd
, matcher
->end(i
, status
));
3487 // Error on end position; keep going; real error is probably yet to come as group
3488 // end positions work from end of the input data towards the front.
3489 } else if (UTF8Matcher
!= NULL
&& UTF8Matcher
->end(i
, status
) != expectedEndUTF8
) {
3490 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
3491 line
, i
, expectedEndUTF8
, UTF8Matcher
->end(i
, status
));
3493 // Error on end position; keep going; real error is probably yet to come as group
3494 // end positions work from end of the input data towards the front.
// The test data must not reference more groups than the pattern defines.
3497 if ( matcher
->groupCount()+1 < groupStarts
.size()) {
3498 errln("Error at line %d: Expected %d capture groups, found %d.",
3499 line
, groupStarts
.size()-1, matcher
->groupCount());
3502 else if (UTF8Matcher
!= NULL
&& UTF8Matcher
->groupCount()+1 < groupStarts
.size()) {
3503 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3504 line
, groupStarts
.size()-1, UTF8Matcher
->groupCount());
// Flags 'Y', 'y', 'Z' and 'z' state the expected values of requireEnd() and hitEnd().
3508 if ((flags
.indexOf((UChar
)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3509 matcher
->requireEnd() == TRUE
) {
3510 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line
);
3512 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3513 UTF8Matcher
->requireEnd() == TRUE
) {
3514 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line
);
3518 if ((flags
.indexOf((UChar
)0x79) >= 0) && // 'y' flag: RequireEnd() == true
3519 matcher
->requireEnd() == FALSE
) {
3520 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line
);
3522 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x79) >= 0) && // 'y' flag: RequireEnd() == true
3523 UTF8Matcher
->requireEnd() == FALSE
) {
3524 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line
);
3528 if ((flags
.indexOf((UChar
)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3529 matcher
->hitEnd() == TRUE
) {
3530 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line
);
3532 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3533 UTF8Matcher
->hitEnd() == TRUE
) {
3534 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line
);
3538 if ((flags
.indexOf((UChar
)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3539 matcher
->hitEnd() == FALSE
) {
3540 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line
);
3542 } else if (UTF8Matcher
!= NULL
&& (flags
.indexOf((UChar
)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3543 UTF8Matcher
->hitEnd() == FALSE
) {
3544 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line
);
// Echo the pattern, flags and input of the test.
// NOTE(review): the cleanupAndReturn label and the condition guarding this output are
// not present in this listing; presumably it is printed only for failed tests - confirm.
3551 infoln((UnicodeString
)"\""+pattern
+(UnicodeString
)"\" "
3552 +flags
+(UnicodeString
)" \""+inputString
+(UnicodeString
)"\"");
3553 // callerPattern->dump();
// Release what was allocated above. The UText wrappers are closed before the
// character buffers they were opened on are freed.
3555 delete parseMatcher
;
3560 delete callerPattern
;
3562 utext_close(&inputText
);
3563 delete[] inputChars
;
3564 utext_close(&patternText
);
3565 delete[] patternChars
;
3566 ucnv_close(UTF8Converter
);
3572 //---------------------------------------------------------------------------
3574 // Errors Check for error handling in patterns.
3576 //---------------------------------------------------------------------------
// Checks that malformed patterns are rejected with the expected error code.
// Each REGEX_ERR line gives a pattern, two integers and the expected UErrorCode.
// NOTE(review): the two integers are presumably the expected line and column of the
// reported error position - confirm against the REGEX_ERR macro definition.
3577 void RegexTest::Errors() {
3578 // \escape sequences that aren't implemented yet.
3579 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3581 // Missing close parentheses
3582 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN
);
3583 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN
);
3584 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN
);
3586 // Extra close paren
3587 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN
);
3588 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN
);
3589 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN
);
3591 // Look-ahead, Look-behind
3592 // TODO: add tests for unbounded length look-behinds.
3593 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX
); // illegal construct
3595 // Attempt to use non-default flags
// Compiling with this flag combination is expected to report U_REGEX_UNIMPLEMENTED.
// NOTE(review): the last operand of the flags expression is not present in this listing.
3598 UErrorCode status
= U_ZERO_ERROR
;
3599 int32_t flags
= UREGEX_CANON_EQ
|
3600 UREGEX_COMMENTS
| UREGEX_DOTALL
|
3602 RegexPattern
*pat1
= RegexPattern::compile(".*", flags
, pe
, status
);
3603 REGEX_ASSERT(status
== U_REGEX_UNIMPLEMENTED
);
3608 // Quantifiers are allowed only after something that can be quantified.
3609 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX
);
3610 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX
);
3611 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX
);
3613 // Malformed {min,max} quantifiers
3614 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL
);
3615 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN
);
3616 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL
);
3617 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL
);
3618 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL
);
3619 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG
);
3620 REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG
); // Overflows int during scan
3621 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG
); // Overflows regex binary format
3622 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG
);
// A quantifier at the very start of the pattern has nothing to apply to.
3625 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX
);
3627 // Invalid Back Reference \0
3628 // For ICU 3.8 and earlier
3629 // For ICU versions newer than 3.8, \0 introduces an octal escape.
3631 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE
);
3636 //-------------------------------------------------------------------------------
3638 // Read a text data file, convert it to UChars, and return the data
3639 // in one big UChar * buffer, which the caller must delete[].
3641 //--------------------------------------------------------------------------------
// Parameters:
//   fileName     path of the file to read; it is opened in binary mode.
//   ulen         receives the length reported by ucnv_toUChars().
//   defEncoding  converter name to use when the data has no Unicode signature (BOM).
//   status       ICU error code; set to U_FILE_ACCESS_ERROR if the file cannot be opened.
// The result buffer is allocated with new UChar[ulen+1].
3642 UChar
*RegexTest::ReadAndConvertFile(const char *fileName
, int32_t &ulen
,
3643 const char *defEncoding
, UErrorCode
&status
) {
3644 UChar
*retPtr
= NULL
;
3645 char *fileBuf
= NULL
;
3646 UConverter
* conv
= NULL
;
3650 if (U_FAILURE(status
)) {
3657 f
= fopen(fileName
, "rb");
3659 dataerrln("Error opening test data file %s\n", fileName
);
3660 status
= U_FILE_ACCESS_ERROR
;
// Find the file size by seeking to the end, then read the whole file in one call.
// NOTE(review): the buffer is allocated from the ftell() result before that result
// is checked by the "fileSize <= 0" test below.
3669 fseek( f
, 0, SEEK_END
);
3670 fileSize
= ftell(f
);
3671 fileBuf
= new char[fileSize
];
3672 fseek(f
, 0, SEEK_SET
);
3673 amt_read
= fread(fileBuf
, 1, fileSize
, f
);
3674 if (amt_read
!= fileSize
|| fileSize
<= 0) {
3675 errln("Error reading test data file.");
3676 goto cleanUpAndReturn
;
3680 // Look for a Unicode Signature (BOM) on the data just read
3682 int32_t signatureLength
;
3683 const char * fileBufC
;
3684 const char* encoding
;
3687 encoding
= ucnv_detectUnicodeSignature(
3688 fileBuf
, fileSize
, &signatureLength
, &status
);
// A signature was found: skip over it so that it is not converted as data.
// NOTE(review): the initialisation of fileBufC is not present in this listing;
// presumably it starts out pointing at fileBuf - confirm.
3689 if(encoding
!=NULL
){
3690 fileBufC
+= signatureLength
;
3691 fileSize
-= signatureLength
;
// No signature: fall back to the caller's default encoding. A file that is
// declared as utf-8 is reported as an error when it has no BOM.
3693 encoding
= defEncoding
;
3694 if (strcmp(encoding
, "utf-8") == 0) {
3695 errln("file %s is missing its BOM", fileName
);
3700 // Open a converter to take the test data file to UTF-16
3702 conv
= ucnv_open(encoding
, &status
);
3703 if (U_FAILURE(status
)) {
3704 goto cleanUpAndReturn
;
3708 // Convert the file contents to UChar.
3709 // Preflight first to determine required buffer size.
3711 ulen
= ucnv_toUChars(conv
,
3717 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
3718 // Buffer Overflow is expected from the preflight operation.
3719 status
= U_ZERO_ERROR
;
// One element more than the preflighted length is allocated.
3721 retPtr
= new UChar
[ulen
+1];
3734 if (U_FAILURE(status
)) {
3735 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
3744 //-------------------------------------------------------------------------------
3746 // PerlTests - Run Perl's regular expression tests
3747 // The input file for this test is re_tests, the standard regular
3748 // expression test data distributed with the Perl source code.
3750 // Here is Perl's description of the test data file:
3752 // # The tests are in a separate file 't/op/re_tests'.
3753 // # Each line in that file is a separate test.
3754 // # There are five columns, separated by tabs.
3756 // # Column 1 contains the pattern, optionally enclosed in C<''>.
3757 // # Modifiers can be put after the closing C<'>.
3759 // # Column 2 contains the string to be matched.
3761 // # Column 3 contains the expected result:
3762 // # y expect a match
3763 // # n expect no match
3764 // # c expect an error
3765 // # B test exposes a known bug in Perl, should be skipped
3766 // # b test exposes a known bug in Perl, should be skipped if noamp
3768 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3770 // # Column 4 contains a string, usually C<$&>.
3772 // # Column 5 contains the expected result of double-quote
3773 // # interpolating that string after the match, or start of error message.
3775 // # Column 6, if present, contains a reason why the test is skipped.
3776 // # This is printed with "skipped", for harness to pick up.
3778 // # \n in the tests are interpolated, as are variables of the form ${\w+}.
3780 // # If you want to add a regular expression test that can't be expressed
3781 // # in this format, don't add it here: put it in op/pat.t instead.
3783 // For ICU, if field 3 contains an 'i', the test will be skipped.
3784 // The test exposes some known incompatibility between ICU and Perl regexps.
3785 // (The i is in addition to whatever was there before.)
3787 //-------------------------------------------------------------------------------
// Runs the tests from re_tests.txt through the UnicodeString based regex API.
// For every line of the file: compile the pattern from field 0, run find() on the
// match string from field 1, compare match / no match with field 2, and for matches
// evaluate the Perl expression in field 3 and compare it with field 4.
3788 void RegexTest::PerlTests() {
3790 const char *srcPath
;
3791 UErrorCode status
= U_ZERO_ERROR
;
3795 // Open and read the test data file.
3797 srcPath
=getPath(tdd
, "re_tests.txt");
3799 return; /* something went wrong, error already output */
// iso-8859-1 is the encoding assumed when the file has no Unicode signature.
3803 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, "iso-8859-1", status
);
3804 if (U_FAILURE(status
)) {
3805 return; /* something went wrong, error already output */
3809 // Put the test data into a UnicodeString
3811 UnicodeString
testDataString(FALSE
, testData
, len
);
3814 // Regex to break the input file into lines, and strip the new lines.
3815 // One line per match, capture group one is the desired data.
3817 RegexPattern
* linePat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe
, status
);
3818 if (U_FAILURE(status
)) {
3819 dataerrln("RegexPattern::compile() error");
3822 RegexMatcher
* lineMat
= linePat
->matcher(testDataString
, status
);
3825 // Regex to split a test file line into fields.
3826 // There are six fields, separated by tabs.
// NOTE(review): split() below is given a 7 element array, one more than the six
// fields mentioned here.
3828 RegexPattern
* fieldPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe
, status
);
3831 // Regex to identify test patterns with flag settings, and to separate them.
3832 // Test patterns with flags look like 'pattern'i
3833 // Test patterns without flags are not quoted: pattern
3834 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
3836 RegexPattern
*flagPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe
, status
);
3837 RegexMatcher
* flagMat
= flagPat
->matcher(status
);
3840 // The Perl tests reference several perl-isms, which are evaluated/substituted
3841 // in the test data. Not being perl, this must be done explicitly. Here
3842 // are string constants and REs for these constructs.
3844 UnicodeString
nulnulSrc("${nulnul}");
3845 UnicodeString
nulnul("\\u0000\\u0000", -1, US_INV
);
3846 nulnul
= nulnul
.unescape();
3848 UnicodeString
ffffSrc("${ffff}");
3849 UnicodeString
ffff("\\uffff", -1, US_INV
);
3850 ffff
= ffff
.unescape();
3852 // regexp for $-[0], $+[2], etc.
3853 RegexPattern
*groupsPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe
, status
);
3854 RegexMatcher
*groupsMat
= groupsPat
->matcher(status
);
3856 // regexp for $0, $1, $2, etc.
3857 RegexPattern
*cgPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe
, status
);
3858 RegexMatcher
*cgMat
= cgPat
->matcher(status
);
3862 // Main Loop for the Perl Tests, runs once per line from the
3865 int32_t lineNum
= 0;
3866 int32_t skippedUnimplementedCount
= 0;
3867 while (lineMat
->find()) {
3871 // Get a line, break it into its fields, do the Perl
3872 // variable substitutions.
3874 UnicodeString line
= lineMat
->group(1, status
);
3875 UnicodeString fields
[7];
3876 fieldPat
->split(line
, fields
, 7, status
);
3878 flagMat
->reset(fields
[0]);
3879 flagMat
->matches(status
);
3880 UnicodeString pattern
= flagMat
->group(2, status
);
3881 pattern
.findAndReplace("${bang}", "!");
// In the pattern, ${nulnul} is replaced by the escape text \u0000\u0000 and not by
// two NUL characters; the match string below gets the unescaped nulnul instead.
3882 pattern
.findAndReplace(nulnulSrc
, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
3883 pattern
.findAndReplace(ffffSrc
, ffff
);
3886 // Identify patterns that include match flag settings,
3887 // split off the flags, remove the extra quotes.
3889 UnicodeString flagStr
= flagMat
->group(3, status
);
// NOTE(review): the message below names ucnv_toUChars, but the calls that can have
// set status at this point are the regex calls above.
3890 if (U_FAILURE(status
)) {
3891 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
3895 const UChar UChar_c
= 0x63; // Char constants for the flag letters.
3896 const UChar UChar_i
= 0x69; // (Damn the lack of Unicode support in C)
3897 const UChar UChar_m
= 0x6d;
3898 const UChar UChar_x
= 0x78;
3899 const UChar UChar_y
= 0x79;
// Translate the Perl flag letters into the corresponding ICU compile options.
3900 if (flagStr
.indexOf(UChar_i
) != -1) {
3901 flags
|= UREGEX_CASE_INSENSITIVE
;
3903 if (flagStr
.indexOf(UChar_m
) != -1) {
3904 flags
|= UREGEX_MULTILINE
;
3906 if (flagStr
.indexOf(UChar_x
) != -1) {
3907 flags
|= UREGEX_COMMENTS
;
3911 // Compile the test pattern.
3913 status
= U_ZERO_ERROR
;
3914 RegexPattern
*testPat
= RegexPattern::compile(pattern
, flags
, pe
, status
);
3915 if (status
== U_REGEX_UNIMPLEMENTED
) {
3917 // Test of a feature that is planned for ICU, but not yet implemented.
3919 skippedUnimplementedCount
++;
3921 status
= U_ZERO_ERROR
;
3925 if (U_FAILURE(status
)) {
3926 // Some tests are supposed to generate errors.
3927 // Only report an error for tests that are supposed to succeed.
3928 if (fields
[2].indexOf(UChar_c
) == -1 && // Compilation is not supposed to fail AND
3929 fields
[2].indexOf(UChar_i
) == -1) // it's not an accepted ICU incompatibility
3931 errln("line %d: ICU Error \"%s\"\n", lineNum
, u_errorName(status
));
3933 status
= U_ZERO_ERROR
;
3938 if (fields
[2].indexOf(UChar_i
) >= 0) {
3939 // ICU should skip this test.
3944 if (fields
[2].indexOf(UChar_c
) >= 0) {
3945 // This pattern should have caused a compilation error, but didn't.
3946 errln("line %d: Expected a pattern compile error, got success.", lineNum
);
3952 // replace the Perl variables that appear in some of the
3953 // match data strings.
3955 UnicodeString matchString
= fields
[1];
3956 matchString
.findAndReplace(nulnulSrc
, nulnul
);
3957 matchString
.findAndReplace(ffffSrc
, ffff
);
3959 // Replace any \n in the match string with an actual new-line char.
3960 // Don't do full unescape, as this unescapes more than Perl does, which
3961 // causes other spurious failures in the tests.
3962 matchString
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
3967 // Run the test, check for expected match/don't match result.
3969 RegexMatcher
*testMat
= testPat
->matcher(matchString
, status
);
3970 UBool found
= testMat
->find();
3971 UBool expected
= FALSE
;
// A 'y' in field 2 marks a test that is expected to match.
3972 if (fields
[2].indexOf(UChar_y
) >=0) {
3975 if (expected
!= found
) {
3976 errln("line %d: Expected %smatch, got %smatch",
3977 lineNum
, expected
?"":"no ", found
?"":"no " );
3981 // Don't try to check expected results if there is no match.
3982 // (Some have stuff in the expected fields)
3990 // Interpret the Perl expression from the fourth field of the data file,
3991 // building up an ICU string from the results of the ICU match.
3992 // The Perl expression will contain references to the results of
3993 // a regex match, including the matched string, capture group strings,
3994 // group starting and ending indices, etc.
3996 UnicodeString resultString
;
3997 UnicodeString perlExpr
= fields
[3];
3998 #if SUPPORT_MUTATING_INPUT_STRING
3999 groupsMat
->reset(perlExpr
);
4000 cgMat
->reset(perlExpr
);
// Consume perlExpr from the front, one recognized construct at a time, until it is empty.
4003 while (perlExpr
.length() > 0) {
4004 #if !SUPPORT_MUTATING_INPUT_STRING
4005 // Preferred usage. Reset after any modification to input string.
4006 groupsMat
->reset(perlExpr
);
4007 cgMat
->reset(perlExpr
);
// $& is the text of the whole match.
4010 if (perlExpr
.startsWith("$&")) {
4011 resultString
.append(testMat
->group(status
));
4012 perlExpr
.remove(0, 2);
// $-[n] is the start position of group n, $+[n] is its end position.
4015 else if (groupsMat
->lookingAt(status
)) {
4017 UnicodeString digitString
= groupsMat
->group(2, status
);
4019 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4020 UnicodeString plusOrMinus
= groupsMat
->group(1, status
);
4021 int32_t matchPosition
;
4022 if (plusOrMinus
.compare("+") == 0) {
4023 matchPosition
= testMat
->end(groupNum
, status
);
4025 matchPosition
= testMat
->start(groupNum
, status
);
// Nothing is appended for a position of -1.
4027 if (matchPosition
!= -1) {
4028 ICU_Utility::appendNumber(resultString
, matchPosition
);
4030 perlExpr
.remove(0, groupsMat
->end(status
));
// $n is the text of capture group n.
4033 else if (cgMat
->lookingAt(status
)) {
4035 UnicodeString digitString
= cgMat
->group(1, status
);
4037 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4038 if (U_SUCCESS(status
)) {
4039 resultString
.append(testMat
->group(groupNum
, status
));
// Any error from the group() call above is discarded.
4040 status
= U_ZERO_ERROR
;
4042 perlExpr
.remove(0, cgMat
->end(status
));
// @- appends the start positions of all groups, beginning with group 0.
4045 else if (perlExpr
.startsWith("@-")) {
4047 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4049 resultString
.append(" ");
4051 ICU_Utility::appendNumber(resultString
, testMat
->start(i
, status
));
4053 perlExpr
.remove(0, 2);
// @+ appends the end positions of all groups, beginning with group 0.
4056 else if (perlExpr
.startsWith("@+")) {
4058 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4060 resultString
.append(" ");
4062 ICU_Utility::appendNumber(resultString
, testMat
->end(i
, status
));
4064 perlExpr
.remove(0, 2);
4067 else if (perlExpr
.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4068 // or as an escaped sequence (e.g. \n)
4069 if (perlExpr
.length() > 1) {
4070 perlExpr
.remove(0, 1); // Remove the '\', but only if not last char.
4072 UChar c
= perlExpr
.charAt(0);
4074 case 'n': c
= '\n'; break;
4075 // add any other escape sequences that show up in the test expected results.
4077 resultString
.append(c
);
4078 perlExpr
.remove(0, 1);
4082 // Any characters from the perl expression that we don't explicitly
4083 // recognize before here are assumed to be literals and copied
4084 // as-is to the expected results.
4085 resultString
.append(perlExpr
.charAt(0));
4086 perlExpr
.remove(0, 1);
4089 if (U_FAILURE(status
)) {
4090 errln("Line %d: ICU Error \"%s\"", lineNum
, u_errorName(status
));
4096 // Expected Results Compare
// Field 4 gets the same Perl variable and \n substitutions as the match string.
4098 UnicodeString
expectedS(fields
[4]);
4099 expectedS
.findAndReplace(nulnulSrc
, nulnul
);
4100 expectedS
.findAndReplace(ffffSrc
, ffff
);
4101 expectedS
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4104 if (expectedS
.compare(resultString
) != 0) {
4105 err("Line %d: Incorrect perl expression results.", lineNum
);
4106 infoln((UnicodeString
)"Expected \""+expectedS
+(UnicodeString
)"\"; got \""+resultString
+(UnicodeString
)"\"");
4114 // All done. Clean up allocated stuff.
4132 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount
);
4137 //-------------------------------------------------------------------------------
4139 // PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts
4140 // (instead of using UnicodeStrings) to test the alternate engine.
4141 // The input file for this test is re_tests, the standard regular
4142 // expression test data distributed with the Perl source code.
4143 // See PerlTests() for more information.
4145 //-------------------------------------------------------------------------------
// Test driver: runs the regular-expression cases from re_tests.txt with both
// the pattern and the input text supplied to the regex engine as UTF-8 UTexts.
// For each line found by linePat, the visible code
//   - splits the line on tabs into fields[],
//   - separates the pattern (group 2 of flagPat) from its flag letters (group 3),
//   - converts the pattern and the match string to UTF-8 and opens UTexts on them,
//   - compiles the pattern, runs find(), and checks the result against fields[2],
//   - interprets the Perl expression in fields[3] against the match and compares
//     the string it builds with fields[4].
// NOTE(review): several names used below (e.g. tdd, len, pe, flags, t, i) are
// declared on lines that are not visible in this view, as are a number of
// closing braces and early-exit statements; confirm against the full file.
4146 void RegexTest::PerlTestsUTF8() {
4148 const char *srcPath
;
4149 UErrorCode status
= U_ZERO_ERROR
;
4151 LocalUConverterPointer
UTF8Converter(ucnv_open("UTF-8", &status
));
4152 UText patternText
= UTEXT_INITIALIZER
;
4153 char *patternChars
= NULL
;
4154 int32_t patternLength
;
4155 int32_t patternCapacity
= 0;
4156 UText inputText
= UTEXT_INITIALIZER
;
4157 char *inputChars
= NULL
;
4158 int32_t inputLength
;
4159 int32_t inputCapacity
= 0;
// Install UCNV_FROM_U_CALLBACK_STOP as the converter's from-Unicode callback.
4161 ucnv_setFromUCallBack(UTF8Converter
.getAlias(), UCNV_FROM_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &status
);
4164 // Open and read the test data file.
4166 srcPath
=getPath(tdd
, "re_tests.txt");
4168 return; /* something went wrong, error already output */
4172 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, "iso-8859-1", status
);
4173 if (U_FAILURE(status
)) {
4174 return; /* something went wrong, error already output */
4178 // Put the test data into a UnicodeString
4180 UnicodeString
testDataString(FALSE
, testData
, len
);
4183 // Regex to break the input file into lines, and strip the new lines.
4184 // One line per match, capture group one is the desired data.
4186 RegexPattern
* linePat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe
, status
);
4187 if (U_FAILURE(status
)) {
4188 dataerrln("RegexPattern::compile() error");
4191 RegexMatcher
* lineMat
= linePat
->matcher(testDataString
, status
);
4194 // Regex to split a test file line into fields.
4195 // There are six fields, separated by tabs.
4197 RegexPattern
* fieldPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe
, status
);
4200 // Regex to identify test patterns with flag settings, and to separate them.
4201 // Test patterns with flags look like 'pattern'i
4202 // Test patterns without flags are not quoted: pattern
4203 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4205 RegexPattern
*flagPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe
, status
);
4206 RegexMatcher
* flagMat
= flagPat
->matcher(status
);
4209 // The Perl tests reference several perl-isms, which are evaluated/substituted
4210 // in the test data. Not being perl, this must be done explicitly. Here
4211 // are string constants and REs for these constructs.
4213 UnicodeString
nulnulSrc("${nulnul}");
4214 UnicodeString
nulnul("\\u0000\\u0000", -1, US_INV
);
4215 nulnul
= nulnul
.unescape();
4217 UnicodeString
ffffSrc("${ffff}");
4218 UnicodeString
ffff("\\uffff", -1, US_INV
);
4219 ffff
= ffff
.unescape();
4221 // regexp for $-[0], $+[2], etc.
4222 RegexPattern
*groupsPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe
, status
);
4223 RegexMatcher
*groupsMat
= groupsPat
->matcher(status
);
4225 // regexp for $0, $1, $2, etc.
4226 RegexPattern
*cgPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe
, status
);
4227 RegexMatcher
*cgMat
= cgPat
->matcher(status
);
4231 // Main Loop for the Perl Tests, runs once per line from the
4234 int32_t lineNum
= 0;
4235 int32_t skippedUnimplementedCount
= 0;
4236 while (lineMat
->find()) {
4240 // Get a line, break it into its fields, do the Perl
4241 // variable substitutions.
4243 UnicodeString line
= lineMat
->group(1, status
);
4244 UnicodeString fields
[7];
4245 fieldPat
->split(line
, fields
, 7, status
);
4247 flagMat
->reset(fields
[0]);
4248 flagMat
->matches(status
);
4249 UnicodeString pattern
= flagMat
->group(2, status
);
4250 pattern
.findAndReplace("${bang}", "!");
// In the pattern, ${nulnul} becomes the escaped text \u0000\u0000 (not the
// unescaped nulnul string that is substituted into the match string below).
4251 pattern
.findAndReplace(nulnulSrc
, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4252 pattern
.findAndReplace(ffffSrc
, ffff
);
4255 // Identify patterns that include match flag settings,
4256 // split off the flags, remove the extra quotes.
4258 UnicodeString flagStr
= flagMat
->group(3, status
);
// NOTE(review): the message below is labelled "ucnv_toUChars", but the status
// being tested here was last passed to the flagMat/fieldPat calls above.
4259 if (U_FAILURE(status
)) {
4260 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
4264 const UChar UChar_c
= 0x63; // Char constants for the flag letters.
4265 const UChar UChar_i
= 0x69; // (Damn the lack of Unicode support in C)
4266 const UChar UChar_m
= 0x6d;
4267 const UChar UChar_x
= 0x78;
4268 const UChar UChar_y
= 0x79;
// Translate the flag letters i / m / x into the corresponding UREGEX_ bits.
4269 if (flagStr
.indexOf(UChar_i
) != -1) {
4270 flags
|= UREGEX_CASE_INSENSITIVE
;
4272 if (flagStr
.indexOf(UChar_m
) != -1) {
4273 flags
|= UREGEX_MULTILINE
;
4275 if (flagStr
.indexOf(UChar_x
) != -1) {
4276 flags
|= UREGEX_COMMENTS
;
4280 // Put the pattern in a UTF-8 UText
// The first extract() targets the buffer kept from earlier iterations; on
// U_BUFFER_OVERFLOW_ERROR the buffer is replaced by one of patternLength + 1
// chars and the extraction is repeated.
4282 status
= U_ZERO_ERROR
;
4283 patternLength
= pattern
.extract(patternChars
, patternCapacity
, UTF8Converter
.getAlias(), status
);
4284 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
4285 status
= U_ZERO_ERROR
;
4286 delete[] patternChars
;
4287 patternCapacity
= patternLength
+ 1;
4288 patternChars
= new char[patternCapacity
];
4289 pattern
.extract(patternChars
, patternCapacity
, UTF8Converter
.getAlias(), status
);
4291 utext_openUTF8(&patternText
, patternChars
, patternLength
, &status
);
4294 // Compile the test pattern.
4296 RegexPattern
*testPat
= RegexPattern::compile(&patternText
, flags
, pe
, status
);
4297 if (status
== U_REGEX_UNIMPLEMENTED
) {
4299 // Test of a feature that is planned for ICU, but not yet implemented.
4301 skippedUnimplementedCount
++;
4303 status
= U_ZERO_ERROR
;
4307 if (U_FAILURE(status
)) {
4308 // Some tests are supposed to generate errors.
4309 // Only report an error for tests that are supposed to succeed.
4310 if (fields
[2].indexOf(UChar_c
) == -1 && // Compilation is not supposed to fail AND
4311 fields
[2].indexOf(UChar_i
) == -1) // it's not an accepted ICU incompatibility
4313 errln("line %d: ICU Error \"%s\"\n", lineNum
, u_errorName(status
));
4315 status
= U_ZERO_ERROR
;
4320 if (fields
[2].indexOf(UChar_i
) >= 0) {
4321 // ICU should skip this test.
4326 if (fields
[2].indexOf(UChar_c
) >= 0) {
4327 // This pattern should have caused a compilation error, but didn't.
4328 errln("line %d: Expected a pattern compile error, got success.", lineNum
);
4335 // replace the Perl variables that appear in some of the
4336 // match data strings.
4338 UnicodeString matchString
= fields
[1];
4339 matchString
.findAndReplace(nulnulSrc
, nulnul
);
4340 matchString
.findAndReplace(ffffSrc
, ffff
);
4342 // Replace any \n in the match string with an actual new-line char.
4343 // Don't do full unescape, as this unescapes more than Perl does, which
4344 // causes other spurious failures in the tests.
4345 matchString
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4348 // Put the input in a UTF-8 UText
// Same extract / grow / re-extract sequence as used for the pattern above.
4350 status
= U_ZERO_ERROR
;
4351 inputLength
= matchString
.extract(inputChars
, inputCapacity
, UTF8Converter
.getAlias(), status
);
4352 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
4353 status
= U_ZERO_ERROR
;
4354 delete[] inputChars
;
4355 inputCapacity
= inputLength
+ 1;
4356 inputChars
= new char[inputCapacity
];
4357 matchString
.extract(inputChars
, inputCapacity
, UTF8Converter
.getAlias(), status
);
4359 utext_openUTF8(&inputText
, inputChars
, inputLength
, &status
);
4362 // Run the test, check for expected match/don't match result.
4364 RegexMatcher
*testMat
= testPat
->matcher(&inputText
, RegexPattern::PATTERN_IS_UTEXT
, status
);
4365 UBool found
= testMat
->find();
4366 UBool expected
= FALSE
;
4367 if (fields
[2].indexOf(UChar_y
) >=0) {
4370 if (expected
!= found
) {
4371 errln("line %d: Expected %smatch, got %smatch",
4372 lineNum
, expected
?"":"no ", found
?"":"no " );
4376 // Don't try to check expected results if there is no match.
4377 // (Some have stuff in the expected fields)
4385 // Interpret the Perl expression from the fourth field of the data file,
4386 // building up an ICU string from the results of the ICU match.
4387 // The Perl expression will contain references to the results of
4388 // a regex match, including the matched string, capture group strings,
4389 // group starting and ending indices, etc.
4391 UnicodeString resultString
;
4392 UnicodeString perlExpr
= fields
[3];
// Each pass recognizes one construct at the front of perlExpr, appends its
// value to resultString, and removes the consumed text from perlExpr.
4394 while (perlExpr
.length() > 0) {
4395 groupsMat
->reset(perlExpr
);
4396 cgMat
->reset(perlExpr
);
// $& : the text of the whole match.
4398 if (perlExpr
.startsWith("$&")) {
4399 resultString
.append(testMat
->group(status
));
4400 perlExpr
.remove(0, 2);
// $-[n] / $+[n] : start / end index of group n; nothing is appended when the
// position is -1.
4403 else if (groupsMat
->lookingAt(status
)) {
4405 UnicodeString digitString
= groupsMat
->group(2, status
);
4407 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4408 UnicodeString plusOrMinus
= groupsMat
->group(1, status
);
4409 int32_t matchPosition
;
4410 if (plusOrMinus
.compare("+") == 0) {
4411 matchPosition
= testMat
->end(groupNum
, status
);
4413 matchPosition
= testMat
->start(groupNum
, status
);
4415 if (matchPosition
!= -1) {
4416 ICU_Utility::appendNumber(resultString
, matchPosition
);
4418 perlExpr
.remove(0, groupsMat
->end(status
));
// $n : the text of capture group n. status is cleared after the group() call,
// so a failure reported by group() here is not treated as a test error.
4421 else if (cgMat
->lookingAt(status
)) {
4423 UnicodeString digitString
= cgMat
->group(1, status
);
4425 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
4426 if (U_SUCCESS(status
)) {
4427 resultString
.append(testMat
->group(groupNum
, status
));
4428 status
= U_ZERO_ERROR
;
4430 perlExpr
.remove(0, cgMat
->end(status
));
// @- : the start index of every group, 0 through groupCount().
4433 else if (perlExpr
.startsWith("@-")) {
4435 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4437 resultString
.append(" ");
4439 ICU_Utility::appendNumber(resultString
, testMat
->start(i
, status
));
4441 perlExpr
.remove(0, 2);
// @+ : the end index of every group, 0 through groupCount().
4444 else if (perlExpr
.startsWith("@+")) {
4446 for (i
=0; i
<=testMat
->groupCount(); i
++) {
4448 resultString
.append(" ");
4450 ICU_Utility::appendNumber(resultString
, testMat
->end(i
, status
));
4452 perlExpr
.remove(0, 2);
4455 else if (perlExpr
.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4456 // or as an escaped sequence (e.g. \n)
4457 if (perlExpr
.length() > 1) {
4458 perlExpr
.remove(0, 1); // Remove the '\', but only if not last char.
4460 UChar c
= perlExpr
.charAt(0);
4462 case 'n': c
= '\n'; break;
4463 // add any other escape sequences that show up in the test expected results.
4465 resultString
.append(c
);
4466 perlExpr
.remove(0, 1);
4470 // Any characters from the perl expression that we don't explicitly
4471 // recognize before here are assumed to be literals and copied
4472 // as-is to the expected results.
4473 resultString
.append(perlExpr
.charAt(0));
4474 perlExpr
.remove(0, 1);
4477 if (U_FAILURE(status
)) {
4478 errln("Line %d: ICU Error \"%s\"", lineNum
, u_errorName(status
));
4484 // Expected Results Compare
// fields[4] gets the same ${nulnul}, ${ffff} and \n substitutions as the
// match string before it is compared with resultString.
4486 UnicodeString
expectedS(fields
[4]);
4487 expectedS
.findAndReplace(nulnulSrc
, nulnul
);
4488 expectedS
.findAndReplace(ffffSrc
, ffff
);
4489 expectedS
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4492 if (expectedS
.compare(resultString
) != 0) {
4493 err("Line %d: Incorrect perl expression results.", lineNum
);
4494 infoln((UnicodeString
)"Expected \""+expectedS
+(UnicodeString
)"\"; got \""+resultString
+(UnicodeString
)"\"");
4502 // All done. Clean up allocated stuff.
// NOTE(review): only the UText closes and the two char-buffer deletes are
// visible here; release of the patterns, matchers and testData is presumably
// on lines not shown in this view - confirm.
4519 utext_close(&patternText
);
4520 utext_close(&inputText
);
4522 delete [] patternChars
;
4523 delete [] inputChars
;
4526 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount
);
4531 //--------------------------------------------------------------
4533 // Bug6149 Verify limits to heap expansion for backtrack stack.
4534 // Use this pattern,
4536 // The zero-length match will repeat forever.
4537 // (That this goes into a loop is another bug)
4539 //---------------------------------------------------------------
// Regression test for bug 6149: builds a matcher for "(a?){1,}" over the input
// "xyz", calls matches() inside REGEX_ASSERT_FAIL with U_REGEX_STACK_OVERFLOW
// as the expected status, and then asserts that matches() returned FALSE.
4540 void RegexTest::Bug6149() {
4541 UnicodeString
pattern("(a?){1,}");
4542 UnicodeString
s("xyz");
// NOTE(review): `flags` is passed to the matcher below, but its declaration
// is not visible in this view of the file.
4544 UErrorCode status
= U_ZERO_ERROR
;
4546 RegexMatcher
matcher(pattern
, s
, flags
, status
);
4547 UBool result
= false;
// The assignment to result happens inside the macro argument.
4548 REGEX_ASSERT_FAIL(result
=matcher
.matches(status
), U_REGEX_STACK_OVERFLOW
);
4549 REGEX_ASSERT(result
== FALSE
);
4554 // Callbacks() Test the callback function.
4555 // When set, callbacks occur periodically during matching operations,
4556 // giving the application code the ability to abort the operation
4557 // before its normal completion.
// Context object handed to testCallBackFn through the callback's void* argument.
// reset(max) stores max in maxCalls and zeroes numCalls and lastSteps.
// NOTE(review): the member declarations (test, maxCalls, numCalls, lastSteps are
// the names used by this file) are not visible in this view.
4560 struct callBackContext
{
4565 void reset(int32_t max
) {maxCalls
=max
; numCalls
=0; lastSteps
=0;};
// Match callback installed by RegexTest::Callbacks().
//   context - cast to callBackContext*; carries the owning test and counters.
//   steps   - value supplied by the caller of the callback; checked to be
//             exactly one more than the value seen on the previous call.
// Returns whether info->numCalls is still below info->maxCalls.
// NOTE(review): no statement updating numCalls is visible in this view;
// Callbacks() asserts on numCalls, so an increment presumably sits on a line
// not shown here - confirm.
4569 static UBool U_CALLCONV
4570 testCallBackFn(const void *context
, int32_t steps
) {
4571 callBackContext
*info
= (callBackContext
*)context
;
// Report (through the owning test) any gap or repeat in the steps sequence.
4572 if (info
->lastSteps
+1 != steps
) {
4573 info
->test
->errln("incorrect steps in callback. Expected %d, got %d\n", info
->lastSteps
+1, steps
);
4575 info
->lastSteps
= steps
;
4577 return (info
->numCalls
< info
->maxCalls
);
// Exercises RegexMatcher::setMatchCallback() / getMatchCallback().
//   1. On a matcher with no callback set, the getter must overwrite both of its
//      out-parameters with NULL.
//   2. After setMatchCallback(testCallBackFn, &cbInfo), the getter must return
//      exactly that function and context; three matches() calls then check
//      cbInfo.numCalls and, for the last one, U_REGEX_STOPPED_BY_CALLER.
// NOTE(review): both parts declare status, matcher, returnedFn and
// returnedContext, so they are presumably separate brace scopes whose braces
// are not visible here. Likewise no matcher.reset(s) or cbInfo.reset(...)
// calls are visible between the matches() calls - confirm against the file.
4581 void RegexTest::Callbacks() {
4583 // Getter returns NULLs if no callback has been set
4585 // The variables that the getter will fill in.
4586 // Init to non-null values so that the action of the getter can be seen.
4587 const void *returnedContext
= &returnedContext
;
4588 URegexMatchCallback
*returnedFn
= &testCallBackFn
;
4590 UErrorCode status
= U_ZERO_ERROR
;
4591 RegexMatcher
matcher("x", 0, status
);
4593 matcher
.getMatchCallback(returnedFn
, returnedContext
, status
);
4595 REGEX_ASSERT(returnedFn
== NULL
);
4596 REGEX_ASSERT(returnedContext
== NULL
);
// Second part: set a callback, read it back, then run matches of increasing cost.
4601 callBackContext cbInfo
= {this, 0, 0, 0};
4602 const void *returnedContext
;
4603 URegexMatchCallback
*returnedFn
;
4604 UErrorCode status
= U_ZERO_ERROR
;
4605 RegexMatcher
matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status
); // A pattern that can run long.
4607 matcher
.setMatchCallback(testCallBackFn
, &cbInfo
, status
);
4609 matcher
.getMatchCallback(returnedFn
, returnedContext
, status
);
4611 REGEX_ASSERT(returnedFn
== testCallBackFn
);
4612 REGEX_ASSERT(returnedContext
== &cbInfo
);
4614 // A short-running match shouldn't invoke the callback
4615 status
= U_ZERO_ERROR
;
4617 UnicodeString s
= "xxx";
4619 REGEX_ASSERT(matcher
.matches(status
));
4621 REGEX_ASSERT(cbInfo
.numCalls
== 0);
4623 // A medium-length match that runs long enough to invoke the
4624 // callback, but not so long that the callback aborts it.
4625 status
= U_ZERO_ERROR
;
4627 s
= "aaaaaaaaaaaaaaaaaaab";
4629 REGEX_ASSERT(matcher
.matches(status
)==FALSE
);
4631 REGEX_ASSERT(cbInfo
.numCalls
> 0);
4633 // A longer running match that the callback function will abort.
4634 status
= U_ZERO_ERROR
;
4636 s
= "aaaaaaaaaaaaaaaaaaaaaaab";
4638 REGEX_ASSERT(matcher
.matches(status
)==FALSE
);
4639 REGEX_ASSERT(status
== U_REGEX_STOPPED_BY_CALLER
);
4640 REGEX_ASSERT(cbInfo
.numCalls
== 4);
4648 // FindProgressCallbacks() Test the find "progress" callback function.
4649 // When set, the find progress callback will be invoked during find operations
4650 // after each return from a match attempt, giving the application the opportunity
4651 // to terminate a long-running find operation before its normal completion.
// Context object handed to testProgressCallBackFn through the callback's void*
// argument. reset(max) stores max in maxCalls and zeroes numCalls and lastIndex.
// NOTE(review): the member declarations are not visible in this view.
4654 struct progressCallBackContext
{
4659 void reset(int32_t max
) {maxCalls
=max
; numCalls
=0;lastIndex
=0;};
// Find-progress callback installed by RegexTest::FindProgressCallbacks().
//   context    - cast to progressCallBackContext*.
//   matchIndex - value supplied by the caller of the callback; stored in
//                info->lastIndex so the test can later resume from it.
// Returns whether info->numCalls is still below info->maxCalls.
// NOTE(review): no statement updating numCalls is visible in this view;
// FindProgressCallbacks() asserts on numCalls, so an increment presumably sits
// on a line not shown here - confirm.
4663 static UBool U_CALLCONV
4664 testProgressCallBackFn(const void *context
, int64_t matchIndex
) {
4665 progressCallBackContext
*info
= (progressCallBackContext
*)context
;
4667 info
->lastIndex
= matchIndex
;
4668 // info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4669 return (info
->numCalls
< info
->maxCalls
);
// Exercises RegexMatcher::setFindProgressCallback() / getFindProgressCallback().
//   1. On a matcher with no callback set, the getter must overwrite both of its
//      out-parameters with NULL.
//   2. After setFindProgressCallback(testProgressCallBackFn, &cbInfo), the getter
//      must return that function and context; a series of find(0, status) calls
//      then checks cbInfo.numCalls, and a final find() resumes from
//      cbInfo.lastIndex after an interrupted attempt.
// NOTE(review): both parts declare status, matcher, returnedFn and
// returnedContext, so they are presumably separate brace scopes whose braces
// are not visible here; no matcher.reset(...) calls are visible either.
4673 void RegexTest::FindProgressCallbacks() {
4675 // Getter returns NULLs if no callback has been set
4677 // The variables that the getter will fill in.
4678 // Init to non-null values so that the action of the getter can be seen.
4679 const void *returnedContext
= &returnedContext
;
4680 URegexFindProgressCallback
*returnedFn
= &testProgressCallBackFn
;
4682 UErrorCode status
= U_ZERO_ERROR
;
4683 RegexMatcher
matcher("x", 0, status
);
4685 matcher
.getFindProgressCallback(returnedFn
, returnedContext
, status
);
4687 REGEX_ASSERT(returnedFn
== NULL
);
4688 REGEX_ASSERT(returnedContext
== NULL
);
// Second part: set a callback, read it back, then run finds of increasing cost.
4693 progressCallBackContext cbInfo
= {this, 0, 0, 0};
4694 const void *returnedContext
;
4695 URegexFindProgressCallback
*returnedFn
;
4696 UErrorCode status
= U_ZERO_ERROR
;
4697 RegexMatcher
matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status
); // A pattern that can run long.
4699 matcher
.setFindProgressCallback(testProgressCallBackFn
, &cbInfo
, status
);
4701 matcher
.getFindProgressCallback(returnedFn
, returnedContext
, status
);
4703 REGEX_ASSERT(returnedFn
== testProgressCallBackFn
);
4704 REGEX_ASSERT(returnedContext
== &cbInfo
);
4706 // A short-running match should NOT invoke the callback.
4707 status
= U_ZERO_ERROR
;
4709 UnicodeString s
= "abxxx";
// NOTE(review): setTrace(TRUE) appears unconditional in this view; confirm
// whether it is wrapped in a preprocessor conditional in the full file.
4712 matcher
.setTrace(TRUE
);
4714 REGEX_ASSERT(matcher
.find(0, status
));
4716 REGEX_ASSERT(cbInfo
.numCalls
== 0);
4718 // A medium running match that causes matcher.find() to invoke our callback for each index.
4719 status
= U_ZERO_ERROR
;
4720 s
= "aaaaaaaaaaaaaaaaaaab";
// NOTE(review): the trailing comment says "greater than", but the limit passed
// is exactly s.length().
4721 cbInfo
.reset(s
.length()); // Some upper limit for number of calls that is greater than size of our input string
4723 REGEX_ASSERT(matcher
.find(0, status
)==FALSE
);
4725 REGEX_ASSERT(cbInfo
.numCalls
> 0 && cbInfo
.numCalls
< 25);
4727 // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4728 status
= U_ZERO_ERROR
;
4729 UnicodeString s1
= "aaaaaaaaaaaaaaaaaaaaaaab";
4730 cbInfo
.reset(s1
.length() - 5); // Bail early somewhere near the end of input string
4732 REGEX_ASSERT(matcher
.find(0, status
)==FALSE
);
4734 REGEX_ASSERT(cbInfo
.numCalls
== s1
.length() - 5);
4737 // Now a match that will succeed, but after an interruption
4738 status
= U_ZERO_ERROR
;
4739 UnicodeString s2
= "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4740 cbInfo
.reset(s2
.length() - 10); // Bail early somewhere near the end of input string
4742 REGEX_ASSERT(matcher
.find(0, status
)==FALSE
);
4744 // Now retry the match from where left off
4745 cbInfo
.maxCalls
= 100; // No callback limit
4746 REGEX_ASSERT(matcher
.find(cbInfo
.lastIndex
, status
));
4755 //---------------------------------------------------------------------------
4757 // PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
4758 // UTexts. The pure-C implementation of UText
4759 // has no mutable backing stores, but we can
4760 // use UnicodeString here to test the functionality.
4762 //---------------------------------------------------------------------------
// Exercises the uregex_* C API when the caller supplies the destination UText.
// bufferText is opened over the UnicodeString `buffer` and is passed as the
// destination to uregex_getUText, uregex_groupUTextDeep,
// uregex_replaceFirstUText and uregex_replaceAllUText; each call is asserted
// to return &bufferText and the resulting text is compared with the expected
// value.
// NOTE(review): the sections below re-use the names text1, text2, replText,
// result and actual with differing types, so they are presumably separate
// brace scopes whose braces and some declarations (and the uregex_close calls)
// are not visible in this view - confirm against the full file.
4763 void RegexTest::PreAllocatedUTextCAPI () {
4764 UErrorCode status
= U_ZERO_ERROR
;
4765 URegularExpression
*re
;
4766 UText patternText
= UTEXT_INITIALIZER
;
4767 UnicodeString buffer
;
4768 UText bufferText
= UTEXT_INITIALIZER
;
4770 utext_openUnicodeString(&bufferText
, &buffer
, &status
);
4773 * getText() and getUText()
4776 UText text1
= UTEXT_INITIALIZER
;
4777 UText text2
= UTEXT_INITIALIZER
;
4778 UChar text2Chars
[20];
4781 status
= U_ZERO_ERROR
;
4782 regextst_openUTF8FromInvariant(&text1
, "abcccd", -1, &status
);
4783 regextst_openUTF8FromInvariant(&text2
, "abcccxd", -1, &status
);
// NOTE(review): sizeof(text2) is the size of the UText struct, not of
// text2Chars[20]; sizeof(text2Chars)/2 looks like what was intended - confirm.
4784 u_uastrncpy(text2Chars
, "abcccxd", sizeof(text2
)/2);
// text2 is re-opened here over the UChar copy of the same string.
4785 utext_openUChars(&text2
, text2Chars
, -1, &status
);
4787 regextst_openUTF8FromInvariant(&patternText
, "abc*d", -1, &status
);
4788 re
= uregex_openUText(&patternText
, 0, NULL
, &status
);
4790 /* First set a UText */
4791 uregex_setUText(re
, &text1
, &status
);
4792 resultText
= uregex_getUText(re
, &bufferText
, &status
);
4794 REGEX_ASSERT(resultText
== &bufferText
);
4795 utext_setNativeIndex(resultText
, 0);
4796 utext_setNativeIndex(&text1
, 0);
4797 REGEX_ASSERT(utext_compare(resultText
, -1, &text1
, -1) == 0);
// Same query a second time, with the same expectations.
4799 resultText
= uregex_getUText(re
, &bufferText
, &status
);
4801 REGEX_ASSERT(resultText
== &bufferText
);
4802 utext_setNativeIndex(resultText
, 0);
4803 utext_setNativeIndex(&text1
, 0);
4804 REGEX_ASSERT(utext_compare(resultText
, -1, &text1
, -1) == 0);
4806 /* Then set a UChar * */
4807 uregex_setText(re
, text2Chars
, 7, &status
);
4808 resultText
= uregex_getUText(re
, &bufferText
, &status
);
4810 REGEX_ASSERT(resultText
== &bufferText
);
4811 utext_setNativeIndex(resultText
, 0);
4812 utext_setNativeIndex(&text2
, 0);
4813 REGEX_ASSERT(utext_compare(resultText
, -1, &text2
, -1) == 0);
4816 utext_close(&text1
);
4817 utext_close(&text2
);
// Capture-group section: text1 is used as a UChar buffer from here on.
4827 u_uastrncpy(text1
, "noise abc interior def, and this is off the end", sizeof(text1
)/2);
4829 status
= U_ZERO_ERROR
;
4830 re
= uregex_openC("abc(.*?)def", 0, NULL
, &status
);
4833 uregex_setText(re
, text1
, -1, &status
);
4834 result
= uregex_find(re
, 0, &status
);
4835 REGEX_ASSERT(result
==TRUE
);
4837 /* Capture Group 0, the full match. Should succeed. */
4838 status
= U_ZERO_ERROR
;
4839 actual
= uregex_groupUTextDeep(re
, 0, &bufferText
, &status
);
4841 REGEX_ASSERT(actual
== &bufferText
);
4842 REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual
);
4844 /* Capture group #1. Should succeed. */
4845 status
= U_ZERO_ERROR
;
4846 actual
= uregex_groupUTextDeep(re
, 1, &bufferText
, &status
);
4848 REGEX_ASSERT(actual
== &bufferText
);
4849 REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual
);
4851 /* Capture group out of range. Error. */
4852 status
= U_ZERO_ERROR
;
4853 actual
= uregex_groupUTextDeep(re
, 2, &bufferText
, &status
);
4854 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
// The destination UText is still expected back even though status is an error.
4855 REGEX_ASSERT(actual
== &bufferText
);
// replaceFirst section.
4867 UText replText
= UTEXT_INITIALIZER
;
4870 status
= U_ZERO_ERROR
;
4871 u_uastrncpy(text1
, "Replace xaax x1x x...x.", sizeof(text1
)/2);
4872 u_uastrncpy(text2
, "No match here.", sizeof(text2
)/2);
4873 regextst_openUTF8FromInvariant(&replText
, "<$1>", -1, &status
);
4875 re
= uregex_openC("x(.*?)x", 0, NULL
, &status
);
4878 /* Normal case, with match */
4879 uregex_setText(re
, text1
, -1, &status
);
// Empty bufferText (replace its whole native range with nothing) before each
// replace call.
4880 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
4881 result
= uregex_replaceFirstUText(re
, &replText
, &bufferText
, &status
);
4883 REGEX_ASSERT(result
== &bufferText
);
4884 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result
);
4886 /* No match. Text should copy to output with no changes. */
4887 uregex_setText(re
, text2
, -1, &status
);
4888 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
4889 result
= uregex_replaceFirstUText(re
, &replText
, &bufferText
, &status
);
4891 REGEX_ASSERT(result
== &bufferText
);
4892 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result
);
4894 /* Unicode escapes */
4895 uregex_setText(re
, text1
, -1, &status
);
// replText is re-opened here with a new replacement string.
4896 regextst_openUTF8FromInvariant(&replText
, "\\\\\\u0041$1\\U00000042$\\a", -1, &status
);
4897 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
4898 result
= uregex_replaceFirstUText(re
, &replText
, &bufferText
, &status
);
4900 REGEX_ASSERT(result
== &bufferText
);
4901 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result
);
4904 utext_close(&replText
);
// replaceAll section: same inputs as the replaceFirst section.
4914 UText replText
= UTEXT_INITIALIZER
;
4917 status
= U_ZERO_ERROR
;
4918 u_uastrncpy(text1
, "Replace xaax x1x x...x.", sizeof(text1
)/2);
4919 u_uastrncpy(text2
, "No match here.", sizeof(text2
)/2);
4920 regextst_openUTF8FromInvariant(&replText
, "<$1>", -1, &status
);
4922 re
= uregex_openC("x(.*?)x", 0, NULL
, &status
);
4925 /* Normal case, with match */
4926 uregex_setText(re
, text1
, -1, &status
);
4927 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
4928 result
= uregex_replaceAllUText(re
, &replText
, &bufferText
, &status
);
4930 REGEX_ASSERT(result
== &bufferText
);
4931 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result
);
4933 /* No match. Text should copy to output with no changes. */
4934 uregex_setText(re
, text2
, -1, &status
);
4935 utext_replace(&bufferText
, 0, utext_nativeLength(&bufferText
), NULL
, 0, &status
);
4936 result
= uregex_replaceAllUText(re
, &replText
, &bufferText
, &status
);
4938 REGEX_ASSERT(result
== &bufferText
);
4939 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result
);
4942 utext_close(&replText
);
4947 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
4948 * so we don't need to test it here.
4951 utext_close(&bufferText
);
4952 utext_close(&patternText
);
4955 //--------------------------------------------------------------
4957 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
4959 //---------------------------------------------------------------
// Regression test for bug 7651: compiles pattern1 and then pattern2, builds a
// matcher for each over the same input s, and asserts that find() succeeds
// with the match starting at index 0.
// NOTE(review): `pe` is declared on a line not visible in this view, and no
// deletion of REPattern / REMatcher is visible either - confirm.
4960 void RegexTest::Bug7651() {
4961 UnicodeString
pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
4962 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
4963 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
4964 UnicodeString
pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
4965 UnicodeString
s("#ff @abcd This is test");
4966 RegexPattern
*REPattern
= NULL
;
4967 RegexMatcher
*REMatcher
= NULL
;
4968 UErrorCode status
= U_ZERO_ERROR
;
// First pass: pattern1.
4971 REPattern
= RegexPattern::compile(pattern1
, 0, pe
, status
);
4973 REMatcher
= REPattern
->matcher(s
, status
);
4975 REGEX_ASSERT(REMatcher
->find());
4976 REGEX_ASSERT(REMatcher
->start(status
) == 0);
4979 status
= U_ZERO_ERROR
;
// Second pass: pattern2, the case described in the comment above its definition.
4981 REPattern
= RegexPattern::compile(pattern2
, 0, pe
, status
);
4983 REMatcher
= REPattern
->matcher(s
, status
);
4985 REGEX_ASSERT(REMatcher
->find());
4986 REGEX_ASSERT(REMatcher
->start(status
) == 0);
4989 status
= U_ZERO_ERROR
;
// Regression test for bug 7740: after a successful lookingAt(), calls
// group(1, status) with status already set to U_ILLEGAL_ARGUMENT_ERROR and
// asserts that the status is left unchanged and an empty string is returned.
// NOTE(review): m is allocated with new; the matching delete is not visible
// in this view - confirm.
4992 void RegexTest::Bug7740() {
4993 UErrorCode status
= U_ZERO_ERROR
;
4994 UnicodeString pattern
= "(a)";
4995 UnicodeString text
= "abcdef";
4996 RegexMatcher
*m
= new RegexMatcher(pattern
, text
, 0, status
);
4998 REGEX_ASSERT(m
->lookingAt(status
));
// Deliberately enter group() with a failure status already set.
5000 status
= U_ILLEGAL_ARGUMENT_ERROR
;
5001 UnicodeString s
= m
->group(1, status
); // Bug 7740: segfault here.
5002 REGEX_ASSERT(status
== U_ILLEGAL_ARGUMENT_ERROR
);
5003 REGEX_ASSERT(s
== "");
5010 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */