1 /********************************************************************
3 * Copyright (c) 2002-2005, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
10 // ICU Regular Expressions test, part of intltest.
14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
16 #include "unicode/regex.h"
17 #include "unicode/uchar.h"
18 #include "unicode/ucnv.h"
27 //---------------------------------------------------------------------------
29 // Test class boilerplate
31 //---------------------------------------------------------------------------
32 RegexTest::RegexTest()
37 RegexTest::~RegexTest()
43 void RegexTest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* /*par*/ )
45 if (exec
) logln("TestSuite RegexTest: ");
48 case 0: name
= "Basic";
51 case 1: name
= "API_Match";
52 if (exec
) API_Match();
54 case 2: name
= "API_Replace";
55 if (exec
) API_Replace();
57 case 3: name
= "API_Pattern";
58 if (exec
) API_Pattern();
60 case 4: name
= "Extended";
63 case 5: name
= "Errors";
66 case 6: name
= "PerlTests";
67 if (exec
) PerlTests();
72 break; //needed to end loop
77 //---------------------------------------------------------------------------
79 // Error Checking / Reporting macros used in all of the tests.
81 //---------------------------------------------------------------------------
// Checks the in-scope `status` variable. On a failure code it logs the current
// source line and the error name through errln(), then returns from the
// enclosing function (so it is only usable where a bare `return;` is valid).
82 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {errln("RegexTest failure at line %d. status=%s\n", \
83 __LINE__, u_errorName(status)); return;}}
// Logs the current source line through errln() when `expr` compares equal to FALSE.
// It does not return, so the calling test continues after a failed assertion.
85 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("RegexTest failure at line %d.\n", __LINE__);};}
// Verifies that evaluating `expr` leaves a specific error code behind.
// The macro declares its own local `status` (initialized to U_ZERO_ERROR), which
// shadows any `status` in the caller, so a `status` named inside `expr` refers to
// this fresh variable. After evaluating `expr`, it logs the expected and actual
// error names through errln() if the local status differs from `errcode`.
87 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
88 if (status!=errcode) {errln("RegexTest failure at line %d. Expected status=%s, got %s\n", \
89 __LINE__, u_errorName(errcode), u_errorName(status));};}
// Variant of the status check for helper functions that receive the caller's line
// number: on a failure code in the in-scope `status`, logs both the current source
// line and the supplied `line` through errln().
// NOTE(review): unlike REGEX_CHECK_STATUS, this prints the numeric status (%d)
// rather than its name, and it does not return after reporting -- confirm intended.
91 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
92 "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }}
// Assertion for helper functions that receive the caller's line number: when `expr`
// compares equal to FALSE, logs the current source line and the supplied `line`
// through errln(), then returns from the enclosing function.
94 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
95 errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
99 //---------------------------------------------------------------------------
101 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests
102 // for the LookingAt() and Match() functions.
105 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
107 // The expected results are UBool - TRUE or FALSE.
108 // The input text is unescaped. The pattern is not.
111 //---------------------------------------------------------------------------
// Expands to a doRegexLMTest() call with __LINE__ appended as the final argument,
// so failures are reported against the line of the test case itself.
// The expansion already ends with ';'.
113 #define REGEX_TESTLM(pat, text, looking, match) doRegexLMTest(pat, text, looking, match, __LINE__);
115 UBool
RegexTest::doRegexLMTest(const char *pat
, const char *text
, UBool looking
, UBool match
, int line
) {
116 const UnicodeString
pattern(pat
);
117 const UnicodeString
inputText(text
);
118 UErrorCode status
= U_ZERO_ERROR
;
120 RegexPattern
*REPattern
= NULL
;
121 RegexMatcher
*REMatcher
= NULL
;
124 UnicodeString
patString(pat
);
125 REPattern
= RegexPattern::compile(patString
, 0, pe
, status
);
126 if (U_FAILURE(status
)) {
127 errln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s\n",
128 line
, u_errorName(status
));
131 if (line
==376) { RegexPatternDump(REPattern
);}
133 UnicodeString
inputString(inputText
);
134 UnicodeString unEscapedInput
= inputString
.unescape();
135 REMatcher
= REPattern
->matcher(unEscapedInput
, status
);
136 if (U_FAILURE(status
)) {
137 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
138 line
, u_errorName(status
));
143 actualmatch
= REMatcher
->lookingAt(status
);
144 if (U_FAILURE(status
)) {
145 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
146 line
, u_errorName(status
));
149 if (actualmatch
!= looking
) {
150 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line
);
154 status
= U_ZERO_ERROR
;
155 actualmatch
= REMatcher
->matches(status
);
156 if (U_FAILURE(status
)) {
157 errln("RegexTest failure in matches() at line %d. Status = %s\n",
158 line
, u_errorName(status
));
161 if (actualmatch
!= match
) {
162 errln("RegexTest: wrong return from matches() at line %d.\n", line
);
166 if (retVal
== FALSE
) {
167 RegexPatternDump(REPattern
);
178 //---------------------------------------------------------------------------
180 // regex_find(pattern, inputString, lineNumber)
182 // function to simplify writing regex tests.
184 // The input text is unescaped. The pattern is not.
185 // The input text is marked with the expected match positions
186 // <0>text <1> more text </1> </0>
187 // The <n> </n> tags are removed before trying the match.
188 // The tags mark the start and end of the match and of any capture groups.
191 //---------------------------------------------------------------------------
194 // Set a value into a UVector at position specified by a decimal number in
195 // a UnicodeString. This is a utility function needed by the actual test function.
197 static void set(UVector
&vec
, int val
, UnicodeString index
) {
198 UErrorCode status
=U_ZERO_ERROR
;
200 for (int i
=0; i
<index
.length(); i
++) {
201 int d
=u_charDigitValue(index
.charAt(i
));
205 while (vec
.size()<idx
+1) {vec
.addElement(-1, status
);}
206 vec
.setElementAt(val
, idx
);
209 void RegexTest::regex_find(const UnicodeString
&pattern
,
210 const UnicodeString
&flags
,
211 const UnicodeString
&inputString
,
213 UnicodeString unEscapedInput
;
214 UnicodeString deTaggedInput
;
216 UErrorCode status
= U_ZERO_ERROR
;
218 RegexPattern
*parsePat
= NULL
;
219 RegexMatcher
*parseMatcher
= NULL
;
220 RegexPattern
*callerPattern
= NULL
;
221 RegexMatcher
*matcher
= NULL
;
222 UVector
groupStarts(status
);
223 UVector
groupEnds(status
);
224 UBool isMatch
= FALSE
;
225 UBool failed
= FALSE
;
230 // Compile the caller's pattern
233 if (flags
.indexOf((UChar
)0x69) >= 0) { // 'i' flag
234 bflags
|= UREGEX_CASE_INSENSITIVE
;
236 if (flags
.indexOf((UChar
)0x78) >= 0) { // 'x' flag
237 bflags
|= UREGEX_COMMENTS
;
239 if (flags
.indexOf((UChar
)0x73) >= 0) { // 's' flag
240 bflags
|= UREGEX_DOTALL
;
242 if (flags
.indexOf((UChar
)0x6d) >= 0) { // 'm' flag
243 bflags
|= UREGEX_MULTILINE
;
247 callerPattern
= RegexPattern::compile(pattern
, bflags
, pe
, status
);
248 if (status
!= U_ZERO_ERROR
) {
249 #if UCONFIG_NO_BREAK_ITERATION==1
250 // 'v' test flag means that the test pattern should not compile if ICU was configured
251 // to not include break iteration. RBBI is needed for Unicode word boundaries.
252 if (flags
.indexOf((UChar
)0x76) >= 0 /*'v'*/ && status
== U_UNSUPPORTED_ERROR
) {
253 goto cleanupAndReturn
;
256 errln("Line %d: error %s compiling pattern.", line
, u_errorName(status
));
257 goto cleanupAndReturn
;
260 if (flags
.indexOf((UChar
)'d') >= 0) {
261 RegexPatternDump(callerPattern
);
265 // Number of times find() should be called on the test string, default to 1
268 for (i
=2; i
<=9; i
++) {
269 if (flags
.indexOf((UChar
)(0x30 + i
)) >= 0) { // digit flag
271 errln("Line %d: more than one digit flag. Scanning %d.", line
, i
);
272 goto cleanupAndReturn
;
279 // Find the tags in the input data, remove them, and record the group boundary
282 parsePat
= RegexPattern::compile("<(/?)([0-9]+)>", 0, pe
, status
);
283 REGEX_CHECK_STATUS_L(line
);
285 unEscapedInput
= inputString
.unescape();
286 parseMatcher
= parsePat
->matcher(unEscapedInput
, status
);
287 REGEX_CHECK_STATUS_L(line
);
288 while(parseMatcher
->find()) {
289 parseMatcher
->appendReplacement(deTaggedInput
, "", status
);
291 UnicodeString groupNum
= parseMatcher
->group(2, status
);
292 if (parseMatcher
->group(1, status
) == "/") {
294 set(groupEnds
, deTaggedInput
.length(), groupNum
);
296 set(groupStarts
, deTaggedInput
.length(), groupNum
);
299 parseMatcher
->appendTail(deTaggedInput
);
300 REGEX_ASSERT_L(groupStarts
.size() == groupEnds
.size(), line
);
304 // Do a find on the de-tagged input using the caller's pattern
306 matcher
= callerPattern
->matcher(deTaggedInput
, status
);
307 REGEX_CHECK_STATUS_L(line
);
308 if (flags
.indexOf((UChar
)'t') >= 0) {
309 matcher
->setTrace(TRUE
);
312 for (i
=0; i
<numFinds
; i
++) {
313 isMatch
= matcher
->find();
315 matcher
->setTrace(FALSE
);
318 // Match up the groups from the find() with the groups from the tags
321 // number of tags should match number of groups from find operation.
322 // matcher->groupCount does not include group 0, the entire match, hence the +1.
323 // G option in test means that capture group data is not available in the
324 // expected results, so the check needs to be suppressed.
325 if (isMatch
== FALSE
&& groupStarts
.size() != 0) {
326 errln("Error at line %d: Match expected, but none found.\n", line
);
328 goto cleanupAndReturn
;
331 if (flags
.indexOf((UChar
)0x47 /*G*/) >= 0) {
332 // Only check for match / no match. Don't check capture groups.
333 if (isMatch
&& groupStarts
.size() == 0) {
334 errln("Error at line %d: No match expected, but one found.\n", line
);
337 goto cleanupAndReturn
;
340 for (i
=0; i
<=matcher
->groupCount(); i
++) {
341 int32_t expectedStart
= (i
>= groupStarts
.size()? -1 : groupStarts
.elementAti(i
));
342 if (matcher
->start(i
, status
) != expectedStart
) {
343 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
344 line
, i
, expectedStart
, matcher
->start(i
, status
));
346 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
348 int32_t expectedEnd
= (i
>= groupEnds
.size()? -1 : groupEnds
.elementAti(i
));
349 if (matcher
->end(i
, status
) != expectedEnd
) {
350 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
351 line
, i
, expectedEnd
, matcher
->end(i
, status
));
353 // Error on end position; keep going; real error is probably yet to come as group
354 // end positions work from end of the input data towards the front.
357 if ( matcher
->groupCount()+1 < groupStarts
.size()) {
358 errln("Error at line %d: Expected %d capture groups, found %d.",
359 line
, groupStarts
.size()-1, matcher
->groupCount());
365 errln((UnicodeString
)"\""+pattern
+(UnicodeString
)"\" "
366 +flags
+(UnicodeString
)" \""+inputString
+(UnicodeString
)"\"");
367 // callerPattern->dump();
372 delete callerPattern
;
382 //---------------------------------------------------------------------------
384 // REGEX_ERR Macro + invocation function to simplify writing
385 // regex tests for incorrect patterns.
388 // REGEX_ERR("pattern", expected error line, column, expected status);
390 //---------------------------------------------------------------------------
// Expands to a regex_err() call with __LINE__ appended as the final argument,
// so failures are reported against the line of the test case itself.
// The expansion already ends with ';'.
391 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
393 void RegexTest::regex_err(const char *pat
, int32_t errLine
, int32_t errCol
,
394 UErrorCode expectedStatus
, int line
) {
395 UnicodeString
pattern(pat
);
397 UErrorCode status
= U_ZERO_ERROR
;
399 RegexPattern
*callerPattern
= NULL
;
402 // Compile the caller's pattern
404 UnicodeString
patString(pat
);
405 callerPattern
= RegexPattern::compile(patString
, 0, pe
, status
);
406 if (status
!= expectedStatus
) {
407 errln("Line %d: unexpected error %s compiling pattern.", line
, u_errorName(status
));
409 if (status
!= U_ZERO_ERROR
) {
410 if (pe
.line
!= errLine
|| pe
.offset
!= errCol
) {
411 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
412 line
, errLine
, errCol
, pe
.line
, pe
.offset
);
417 delete callerPattern
;
422 //---------------------------------------------------------------------------
424 // Basic Check for basic functionality of regex pattern matching.
425 // Avoid the use of REGEX_FIND test macro, which has
426 // substantial dependencies on basic Regex functionality.
428 //---------------------------------------------------------------------------
429 void RegexTest::Basic() {
433 // Debug - slide failing test cases early
437 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
439 UErrorCode status
= U_ZERO_ERROR
;
440 RegexPattern::compile("^(?:a?b?)*$", 0, pe
, status
);
441 // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
442 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
449 // Pattern with parentheses
451 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE
, FALSE
);
452 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE
, TRUE
);
453 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE
, FALSE
);
458 REGEX_TESTLM("st(abc)*ring", "string", TRUE
, TRUE
);
459 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE
, TRUE
);
460 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE
, TRUE
);
461 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE
, FALSE
);
462 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE
, FALSE
);
464 REGEX_TESTLM("a*", "", TRUE
, TRUE
);
465 REGEX_TESTLM("a*", "b", TRUE
, FALSE
);
471 REGEX_TESTLM(".", "abc", TRUE
, FALSE
);
472 REGEX_TESTLM("...", "abc", TRUE
, TRUE
);
473 REGEX_TESTLM("....", "abc", FALSE
, FALSE
);
474 REGEX_TESTLM(".*", "abcxyz123", TRUE
, TRUE
);
475 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE
, FALSE
);
476 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE
, TRUE
);
477 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE
, TRUE
);
478 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE
, FALSE
);
481 // Patterns with * applied to chars at end of literal string
483 REGEX_TESTLM("abc*", "ab", TRUE
, TRUE
);
484 REGEX_TESTLM("abc*", "abccccc", TRUE
, TRUE
);
487 // Supplemental chars match as single chars, not a pair of surrogates.
489 REGEX_TESTLM(".", "\\U00011000", TRUE
, TRUE
);
490 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE
, TRUE
);
491 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE
, FALSE
);
495 // UnicodeSets in the pattern
497 REGEX_TESTLM("[1-6]", "1", TRUE
, TRUE
);
498 REGEX_TESTLM("[1-6]", "3", TRUE
, TRUE
);
499 REGEX_TESTLM("[1-6]", "7", FALSE
, FALSE
);
500 REGEX_TESTLM("a[1-6]", "a3", TRUE
, TRUE
);
501 REGEX_TESTLM("a[1-6]", "a3", TRUE
, TRUE
);
502 REGEX_TESTLM("a[1-6]b", "a3b", TRUE
, TRUE
);
504 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE
, TRUE
);
505 REGEX_TESTLM("a[0-9]*b", "abc", TRUE
, FALSE
);
506 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE
, TRUE
);
507 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE
, FALSE
); // note that * matches 0 occurrences.
508 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE
, TRUE
);
511 // OR operator in patterns
513 REGEX_TESTLM("(a|b)", "a", TRUE
, TRUE
);
514 REGEX_TESTLM("(a|b)", "b", TRUE
, TRUE
);
515 REGEX_TESTLM("(a|b)", "c", FALSE
, FALSE
);
516 REGEX_TESTLM("a|b", "b", TRUE
, TRUE
);
518 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE
, TRUE
);
519 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE
, FALSE
);
520 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE
, TRUE
);
521 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE
, TRUE
);
522 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE
, TRUE
);
523 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE
, FALSE
);
528 REGEX_TESTLM("ab+", "abbc", TRUE
, FALSE
);
529 REGEX_TESTLM("ab+c", "ac", FALSE
, FALSE
);
530 REGEX_TESTLM("b+", "", FALSE
, FALSE
);
531 REGEX_TESTLM("(abc|def)+", "defabc", TRUE
, TRUE
);
532 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE
, FALSE
);
533 REGEX_TESTLM(".+y", "zippity dooy", TRUE
, TRUE
);
538 REGEX_TESTLM("ab?", "ab", TRUE
, TRUE
);
539 REGEX_TESTLM("ab?", "a", TRUE
, TRUE
);
540 REGEX_TESTLM("ab?", "ac", TRUE
, FALSE
);
541 REGEX_TESTLM("ab?", "abb", TRUE
, FALSE
);
542 REGEX_TESTLM("a(b|c)?d", "abd", TRUE
, TRUE
);
543 REGEX_TESTLM("a(b|c)?d", "acd", TRUE
, TRUE
);
544 REGEX_TESTLM("a(b|c)?d", "ad", TRUE
, TRUE
);
545 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE
, FALSE
);
546 REGEX_TESTLM("a(b|c)?d", "ab", FALSE
, FALSE
);
549 // Escape sequences that become single literal chars, handled internally
550 // by ICU's Unescape.
553 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
554 REGEX_TESTLM("\\a", "\\u0007", TRUE
, TRUE
); // BEL
555 REGEX_TESTLM("\\cL", "\\u000c", TRUE
, TRUE
); // Control-L
556 REGEX_TESTLM("\\e", "\\u001b", TRUE
, TRUE
); // Escape
557 REGEX_TESTLM("\\f", "\\u000c", TRUE
, TRUE
); // Form Feed
558 REGEX_TESTLM("\\n", "\\u000a", TRUE
, TRUE
); // new line
559 REGEX_TESTLM("\\r", "\\u000d", TRUE
, TRUE
); // CR
560 REGEX_TESTLM("\\t", "\\u0009", TRUE
, TRUE
); // Tab
561 REGEX_TESTLM("\\u1234", "\\u1234", TRUE
, TRUE
);
562 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE
, TRUE
);
564 REGEX_TESTLM(".*\\Ax", "xyz", TRUE
, FALSE
); // \A matches only at the beginning of input
565 REGEX_TESTLM(".*\\Ax", " xyz", FALSE
, FALSE
); // \A matches only at the beginning of input
567 // Escape of special chars in patterns
568 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE
, TRUE
);
574 //---------------------------------------------------------------------------
576 // API_Match Test that the API for class RegexMatcher
577 // is present and nominally working, but excluding functions
578 // implementing replace operations.
580 //---------------------------------------------------------------------------
581 void RegexTest::API_Match() {
583 UErrorCode status
=U_ZERO_ERROR
;
587 // Debug - slide failing test cases early
596 // Simple pattern compilation
599 UnicodeString
re("abc");
601 pat2
= RegexPattern::compile(re
, flags
, pe
, status
);
604 UnicodeString inStr1
= "abcdef this is a test";
605 UnicodeString instr2
= "not abc";
606 UnicodeString empty
= "";
610 // Matcher creation and reset.
612 RegexMatcher
*m1
= pat2
->matcher(inStr1
, status
);
614 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
615 REGEX_ASSERT(m1
->input() == inStr1
);
617 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
618 REGEX_ASSERT(m1
->input() == instr2
);
620 REGEX_ASSERT(m1
->input() == inStr1
);
621 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
623 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
624 REGEX_ASSERT(m1
->input() == empty
);
625 REGEX_ASSERT(&m1
->pattern() == pat2
);
628 // reset(pos, status)
631 m1
->reset(4, status
);
633 REGEX_ASSERT(m1
->input() == inStr1
);
634 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
636 m1
->reset(-1, status
);
637 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
638 status
= U_ZERO_ERROR
;
640 m1
->reset(0, status
);
642 status
= U_ZERO_ERROR
;
644 int32_t len
= m1
->input().length();
645 m1
->reset(len
-1, status
);
647 status
= U_ZERO_ERROR
;
649 m1
->reset(len
, status
);
650 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
651 status
= U_ZERO_ERROR
;
654 // match(pos, status)
657 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
659 REGEX_ASSERT(m1
->matches(3, status
) == FALSE
);
661 REGEX_ASSERT(m1
->matches(5, status
) == FALSE
);
662 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
663 REGEX_ASSERT(m1
->matches(-1, status
) == FALSE
);
664 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
666 // Match() at end of string should fail, but should not be an error.
668 status
= U_ZERO_ERROR
;
669 len
= m1
->input().length();
670 REGEX_ASSERT(m1
->matches(len
, status
) == FALSE
);
673 // Match beyond end of string should fail with an error.
674 status
= U_ZERO_ERROR
;
675 REGEX_ASSERT(m1
->matches(len
+1, status
) == FALSE
);
676 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
678 // Successful match at end of string.
680 status
= U_ZERO_ERROR
;
681 RegexMatcher
m("A?", 0, status
); // will match zero length string.
684 len
= inStr1
.length();
685 REGEX_ASSERT(m
.matches(len
, status
) == TRUE
);
688 REGEX_ASSERT(m
.matches(0, status
) == TRUE
);
694 // lookingAt(pos, status)
696 status
= U_ZERO_ERROR
;
697 m1
->reset(instr2
); // "not abc"
698 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
699 REGEX_ASSERT(m1
->lookingAt(5, status
) == FALSE
);
700 REGEX_ASSERT(m1
->lookingAt(3, status
) == FALSE
);
701 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
702 REGEX_ASSERT(m1
->lookingAt(-1, status
) == FALSE
);
703 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
704 status
= U_ZERO_ERROR
;
705 len
= m1
->input().length();
706 REGEX_ASSERT(m1
->lookingAt(len
, status
) == FALSE
);
708 REGEX_ASSERT(m1
->lookingAt(len
+1, status
) == FALSE
);
709 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
718 // RegexMatcher::start();
719 // RegexMatcher::end();
720 // RegexMatcher::groupCount();
725 UErrorCode status
=U_ZERO_ERROR
;
727 UnicodeString
re("01(23(45)67)(.*)");
728 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
730 UnicodeString data
= "0123456789";
732 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
734 REGEX_ASSERT(matcher
->lookingAt(status
) == TRUE
);
735 int matchStarts
[] = {0, 2, 4, 8};
736 int matchEnds
[] = {10, 8, 6, 10};
738 for (i
=0; i
<4; i
++) {
739 int32_t actualStart
= matcher
->start(i
, status
);
741 if (actualStart
!= matchStarts
[i
]) {
742 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
743 __LINE__
, i
, matchStarts
[i
], actualStart
);
745 int32_t actualEnd
= matcher
->end(i
, status
);
747 if (actualEnd
!= matchEnds
[i
]) {
748 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
749 __LINE__
, i
, matchEnds
[i
], actualEnd
);
753 REGEX_ASSERT(matcher
->start(0, status
) == matcher
->start(status
));
754 REGEX_ASSERT(matcher
->end(0, status
) == matcher
->end(status
));
756 REGEX_ASSERT_FAIL(matcher
->start(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
757 REGEX_ASSERT_FAIL(matcher
->start( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
759 REGEX_ASSERT_FAIL(matcher
->start( 0, status
), U_REGEX_INVALID_STATE
);
761 matcher
->lookingAt(status
);
762 REGEX_ASSERT(matcher
->group(status
) == "0123456789");
763 REGEX_ASSERT(matcher
->group(0, status
) == "0123456789");
764 REGEX_ASSERT(matcher
->group(1, status
) == "234567" );
765 REGEX_ASSERT(matcher
->group(2, status
) == "45" );
766 REGEX_ASSERT(matcher
->group(3, status
) == "89" );
768 REGEX_ASSERT_FAIL(matcher
->group(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
769 REGEX_ASSERT_FAIL(matcher
->group( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
771 REGEX_ASSERT_FAIL(matcher
->group( 0, status
), U_REGEX_INVALID_STATE
);
784 UErrorCode status
=U_ZERO_ERROR
;
786 UnicodeString
re("abc");
787 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
789 UnicodeString data
= ".abc..abc...abc..";
790 // 012345678901234567
792 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
794 REGEX_ASSERT(matcher
->find());
795 REGEX_ASSERT(matcher
->start(status
) == 1);
796 REGEX_ASSERT(matcher
->find());
797 REGEX_ASSERT(matcher
->start(status
) == 6);
798 REGEX_ASSERT(matcher
->find());
799 REGEX_ASSERT(matcher
->start(status
) == 12);
800 REGEX_ASSERT(matcher
->find() == FALSE
);
801 REGEX_ASSERT(matcher
->find() == FALSE
);
804 REGEX_ASSERT(matcher
->find());
805 REGEX_ASSERT(matcher
->start(status
) == 1);
807 REGEX_ASSERT(matcher
->find(0, status
));
808 REGEX_ASSERT(matcher
->start(status
) == 1);
809 REGEX_ASSERT(matcher
->find(1, status
));
810 REGEX_ASSERT(matcher
->start(status
) == 1);
811 REGEX_ASSERT(matcher
->find(2, status
));
812 REGEX_ASSERT(matcher
->start(status
) == 6);
813 REGEX_ASSERT(matcher
->find(12, status
));
814 REGEX_ASSERT(matcher
->start(status
) == 12);
815 REGEX_ASSERT(matcher
->find(13, status
) == FALSE
);
816 REGEX_ASSERT(matcher
->find(16, status
) == FALSE
);
817 REGEX_ASSERT(matcher
->find(17, status
) == FALSE
);
818 REGEX_ASSERT_FAIL(matcher
->start(status
), U_REGEX_INVALID_STATE
);
820 status
= U_ZERO_ERROR
;
821 REGEX_ASSERT_FAIL(matcher
->find(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
822 status
= U_ZERO_ERROR
;
823 REGEX_ASSERT_FAIL(matcher
->find(18, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
825 REGEX_ASSERT(matcher
->groupCount() == 0);
833 // find, with \G in pattern (true if at the end of a previous match).
838 UErrorCode status
=U_ZERO_ERROR
;
840 UnicodeString
re(".*?(?:(\\Gabc)|(abc))");
841 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
843 UnicodeString data
= ".abcabc.abc..";
844 // 012345678901234567
846 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
848 REGEX_ASSERT(matcher
->find());
849 REGEX_ASSERT(matcher
->start(status
) == 0);
850 REGEX_ASSERT(matcher
->start(1, status
) == -1);
851 REGEX_ASSERT(matcher
->start(2, status
) == 1);
853 REGEX_ASSERT(matcher
->find());
854 REGEX_ASSERT(matcher
->start(status
) == 4);
855 REGEX_ASSERT(matcher
->start(1, status
) == 4);
856 REGEX_ASSERT(matcher
->start(2, status
) == -1);
864 // find with zero length matches, match position should bump ahead
869 UErrorCode status
=U_ZERO_ERROR
;
870 RegexMatcher
m("(?= ?)", 0, status
); // This pattern will produce zero-length matches anywhere,
871 // using an always-true look-ahead.
873 UnicodeString
s(" ");
876 if (m
.find() == FALSE
) {
879 REGEX_ASSERT(m
.start(status
) == i
);
880 REGEX_ASSERT(m
.end(status
) == i
);
884 // Check that the bump goes over surrogate pairs OK
885 s
= "\\U00010001\\U00010002\\U00010003\\U00010004";
889 if (m
.find() == FALSE
) {
892 REGEX_ASSERT(m
.start(status
) == i
);
893 REGEX_ASSERT(m
.end(status
) == i
);
898 // find() loop breaking test.
899 // with pattern of /.?/, should see a series of one char matches, then a single
900 // match of zero length at the end of the input string.
902 UErrorCode status
=U_ZERO_ERROR
;
903 RegexMatcher
m(".?", 0, status
);
905 UnicodeString
s(" ");
908 if (m
.find() == FALSE
) {
911 REGEX_ASSERT(m
.start(status
) == i
);
912 REGEX_ASSERT(m
.end(status
) == (i
<4 ? i
+1 : i
));
919 // Matchers with no input string behave as if they had an empty input string.
923 UErrorCode status
= U_ZERO_ERROR
;
924 RegexMatcher
m(".?", 0, status
);
926 REGEX_ASSERT(m
.find());
927 REGEX_ASSERT(m
.start(status
) == 0);
928 REGEX_ASSERT(m
.input() == "");
931 UErrorCode status
= U_ZERO_ERROR
;
932 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
933 RegexMatcher
*m
= p
->matcher(status
);
936 REGEX_ASSERT(m
->find() == FALSE
);
937 REGEX_ASSERT(m
->input() == "");
943 // Compilation error on reset with UChar *
944 // These were a hazard that people were stumbling over with runtime errors.
945 // Changed them to compiler errors by adding private methods that more closely
946 // matched the incorrect use of the functions.
950 UErrorCode status
= U_ZERO_ERROR
;
951 UChar ucharString
[20];
952 RegexMatcher
m(".", 0, status
);
953 m
.reset(ucharString
); // should not compile.
955 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
956 RegexMatcher
*m2
= p
->matcher(ucharString
, status
); // should not compile.
958 RegexMatcher
m3(".", ucharString
, 0, status
); // Should not compile
969 //---------------------------------------------------------------------------
971 // API_Replace API test for class RegexMatcher, testing the
972 // Replace family of functions.
974 //---------------------------------------------------------------------------
975 void RegexTest::API_Replace() {
981 UErrorCode status
=U_ZERO_ERROR
;
983 UnicodeString
re("abc");
984 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
986 UnicodeString data
= ".abc..abc...abc..";
987 // 012345678901234567
988 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
991 // Plain vanilla matches.
994 dest
= matcher
->replaceFirst("yz", status
);
996 REGEX_ASSERT(dest
== ".yz..abc...abc..");
998 dest
= matcher
->replaceAll("yz", status
);
1000 REGEX_ASSERT(dest
== ".yz..yz...yz..");
1003 // Plain vanilla non-matches.
1005 UnicodeString d2
= ".abx..abx...abx..";
1007 dest
= matcher
->replaceFirst("yz", status
);
1009 REGEX_ASSERT(dest
== ".abx..abx...abx..");
1011 dest
= matcher
->replaceAll("yz", status
);
1013 REGEX_ASSERT(dest
== ".abx..abx...abx..");
1016 // Empty source string
1018 UnicodeString d3
= "";
1020 dest
= matcher
->replaceFirst("yz", status
);
1022 REGEX_ASSERT(dest
== "");
1024 dest
= matcher
->replaceAll("yz", status
);
1026 REGEX_ASSERT(dest
== "");
1029 // Empty substitution string
1031 matcher
->reset(data
); // ".abc..abc...abc.."
1032 dest
= matcher
->replaceFirst("", status
);
1034 REGEX_ASSERT(dest
== "...abc...abc..");
1036 dest
= matcher
->replaceAll("", status
);
1038 REGEX_ASSERT(dest
== "........");
1041 // match whole string
1043 UnicodeString d4
= "abc";
1045 dest
= matcher
->replaceFirst("xyz", status
);
1047 REGEX_ASSERT(dest
== "xyz");
1049 dest
= matcher
->replaceAll("xyz", status
);
1051 REGEX_ASSERT(dest
== "xyz");
1054 // Capture Group, simple case
1056 UnicodeString
re2("a(..)");
1057 RegexPattern
*pat2
= RegexPattern::compile(re2
, flags
, pe
, status
);
1059 UnicodeString d5
= "abcdefg";
1060 RegexMatcher
*matcher2
= pat2
->matcher(d5
, status
);
1062 dest
= matcher2
->replaceFirst("$1$1", status
);
1064 REGEX_ASSERT(dest
== "bcbcdefg");
1066 dest
= matcher2
->replaceFirst("The value of \\$1 is $1.", status
);
1068 REGEX_ASSERT(dest
== "The value of $1 is bc.defg");
1070 dest
= matcher2
->replaceFirst("$ by itself, no group number $$$", status
);
1072 REGEX_ASSERT(dest
== "$ by itself, no group number $$$defg");
1074 UnicodeString replacement
= "Supplemental Digit 1 $\\U0001D7CF.";
1075 replacement
= replacement
.unescape();
1076 dest
= matcher2
->replaceFirst(replacement
, status
);
1078 REGEX_ASSERT(dest
== "Supplemental Digit 1 bc.defg");
1080 REGEX_ASSERT_FAIL(matcher2
->replaceFirst("bad capture group number $5...",status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1084 // Replacement String with \u hex escapes
1087 UnicodeString src
= "abc 1 abc 2 abc 3";
1088 UnicodeString substitute
= "--\\u0043--";
1089 matcher
->reset(src
);
1090 UnicodeString result
= matcher
->replaceAll(substitute
, status
);
1092 REGEX_ASSERT(result
== "--C-- 1 --C-- 2 --C-- 3");
1095 UnicodeString src
= "abc !";
1096 UnicodeString substitute
= "--\\U00010000--";
1097 matcher
->reset(src
);
1098 UnicodeString result
= matcher
->replaceAll(substitute
, status
);
1100 UnicodeString expected
= UnicodeString("--");
1101 expected
.append((UChar32
)0x10000);
1102 expected
.append("-- !");
1103 REGEX_ASSERT(result
== expected
);
1105 // TODO: need more thorough testing of capture substitutions.
1110 status
= U_ZERO_ERROR
;
1111 UnicodeString s
= "The matches start with ss and end with ee ss stuff ee fin";
1112 RegexMatcher
m("ss(.*?)ee", 0, status
);
1114 UnicodeString result
;
1116 // Multiple finds do NOT bump up the previous appendReplacement position.
1120 m
.appendReplacement(result
, "ooh", status
);
1122 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
1124 // After a reset into the interior of a string, appendReplacement still starts at beginning.
1125 status
= U_ZERO_ERROR
;
1127 m
.reset(10, status
);
1130 m
.appendReplacement(result
, "ooh", status
);
1132 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
1134 // find() at interior of string, appendReplacement still starts at beginning.
1135 status
= U_ZERO_ERROR
;
1140 m
.appendReplacement(result
, "ooh", status
);
1142 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
1144 m
.appendTail(result
);
1145 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh fin");
1156 //---------------------------------------------------------------------------
1158 // API_Pattern Test that the API for class RegexPattern is
1159 // present and nominally working.
1161 //---------------------------------------------------------------------------
1162 void RegexTest::API_Pattern() {
// Smoke test for the public RegexPattern API: construction, comparison,
// copying, the compile() overloads, flags(), clone(), matcher(), the static
// matches() helper, split(), pattern() and the class-ID functions.
// Every check goes through REGEX_ASSERT, which reports a failure with errln
// and then carries on, so one failed assertion does not stop later checks.
// NOTE(review): the declarations of patb, pe and n are not visible in this
// excerpt; statements about them below are assumptions to be confirmed.
1163 RegexPattern pata
; // Test default constructor to not crash.
// Equality operator: against patb (presumably a second default-constructed
// pattern -- TODO confirm) and against the object itself.
1166 REGEX_ASSERT(pata
== patb
);
1167 REGEX_ASSERT(pata
== pata
);
// Two different pattern strings used for the compile/compare checks.
1169 UnicodeString
re1("abc[a-l][m-z]");
1170 UnicodeString
re2("def");
1171 UErrorCode status
= U_ZERO_ERROR
;
// Compile both patterns with an explicit flags value of 0.
1174 RegexPattern
*pat1
= RegexPattern::compile(re1
, 0, pe
, status
);
1175 RegexPattern
*pat2
= RegexPattern::compile(re2
, 0, pe
, status
);
// A compiled pattern equals itself and differs from the empty pattern.
1177 REGEX_ASSERT(*pat1
== *pat1
);
1178 REGEX_ASSERT(*pat1
!= pata
);
// presumably patb was assigned from *pat1 on a line not shown here -- TODO confirm
1182 REGEX_ASSERT(patb
== *pat1
);
// Copy constructor: the copy compares equal to its source.
1185 RegexPattern
patc(*pat1
);
1186 REGEX_ASSERT(patc
== *pat1
);
1187 REGEX_ASSERT(patb
== patc
);
// NOTE(review): pat1 and pat2 are pointers, so this compares the two
// addresses rather than the pattern objects -- confirm that is intended.
1188 REGEX_ASSERT(pat1
!= pat2
);
// presumably patb was reassigned from *pat2 on a line not shown here -- TODO confirm
1190 REGEX_ASSERT(patb
!= patc
);
1191 REGEX_ASSERT(patb
== *pat2
);
1193 // Compile with no flags.
1194 RegexPattern
*pat1a
= RegexPattern::compile(re1
, pe
, status
);
// The flag-less overload must give the same result as passing flags == 0.
1195 REGEX_ASSERT(*pat1a
== *pat1
);
1197 REGEX_ASSERT(pat1a
->flags() == 0);
1199 // Compile with different flags should be not equal
1200 RegexPattern
*pat1b
= RegexPattern::compile(re1
, UREGEX_CASE_INSENSITIVE
, pe
, status
);
// Same pattern text, different flags: the patterns differ, and flags()
// reports the value each one was compiled with.
1203 REGEX_ASSERT(*pat1b
!= *pat1a
);
1204 REGEX_ASSERT(pat1b
->flags() == UREGEX_CASE_INSENSITIVE
);
1205 REGEX_ASSERT(pat1a
->flags() == 0);
// clone(): the clone equals its source and differs from an unrelated pattern.
1209 RegexPattern
*pat1c
= pat1
->clone();
1210 REGEX_ASSERT(*pat1c
== *pat1
);
1211 REGEX_ASSERT(*pat1c
!= *pat2
);
1220 // Verify that a matcher created from a cloned pattern works.
// NOTE(review): status is declared again here, so this section presumably
// sits in its own nested scope whose braces are not visible in this excerpt.
1224 UErrorCode status
= U_ZERO_ERROR
;
1225 RegexPattern
*pSource
= RegexPattern::compile("\\p{L}+", 0, status
);
1226 RegexPattern
*pClone
= pSource
->clone();
1228 RegexMatcher
*mFromClone
= pClone
->matcher(status
);
// The matcher built from the clone must find both runs of letters in the
// input, in order, and then report that there are no further matches.
1230 UnicodeString s
= "Hello World";
1231 mFromClone
->reset(s
);
1232 REGEX_ASSERT(mFromClone
->find() == TRUE
);
1233 REGEX_ASSERT(mFromClone
->group(status
) == "Hello");
1234 REGEX_ASSERT(mFromClone
->find() == TRUE
);
1235 REGEX_ASSERT(mFromClone
->group(status
) == "World");
1236 REGEX_ASSERT(mFromClone
->find() == FALSE
);
1242 // matches convenience API
// The expected results show that matches() succeeds only when the pattern
// matches the whole input: ".*u" is expected to fail on "random input".
1244 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe
, status
) == TRUE
);
1246 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe
, status
) == FALSE
);
1248 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe
, status
) == TRUE
);
1250 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe
, status
) == TRUE
);
1252 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe
, status
) == FALSE
);
// With a failure code already in status, matches() must return FALSE and
// leave the incoming status value untouched.
1254 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1255 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe
, status
) == FALSE
);
1256 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
// split(): pat1 is re-pointed at a "one or more spaces" delimiter pattern.
1262 status
= U_ZERO_ERROR
;
1263 pat1
= RegexPattern::compile(" +", pe
, status
);
1265 UnicodeString fields
[10];
// Plenty of room: four words land in fields[0..3], and the slot after the
// last word is expected to be empty.
1268 n
= pat1
->split("Now is the time", fields
, 10, status
);
1271 REGEX_ASSERT(fields
[0]=="Now");
1272 REGEX_ASSERT(fields
[1]=="is");
1273 REGEX_ASSERT(fields
[2]=="the");
1274 REGEX_ASSERT(fields
[3]=="time");
1275 REGEX_ASSERT(fields
[4]=="");
// Capacity 2: the last available field receives the unsplit remainder.
1277 n
= pat1
->split("Now is the time", fields
, 2, status
);
1280 REGEX_ASSERT(fields
[0]=="Now");
1281 REGEX_ASSERT(fields
[1]=="is the time");
1282 REGEX_ASSERT(fields
[2]=="the"); // left over from previous test
// Capacity 1: the whole input goes into the single field.
// presumably fields[1] was set to "*" on a line not shown here, so the
// check below proves split() did not write past its capacity -- TODO confirm
1285 status
= U_ZERO_ERROR
;
1286 n
= pat1
->split("Now is the time", fields
, 1, status
);
1289 REGEX_ASSERT(fields
[0]=="Now is the time");
1290 REGEX_ASSERT(fields
[1]=="*");
1291 status
= U_ZERO_ERROR
;
// Leading and trailing delimiters: an empty first field is expected, and
// the slot after "time" is expected to be empty.
1293 n
= pat1
->split(" Now is the time ", fields
, 10, status
);
1296 REGEX_ASSERT(fields
[0]=="");
1297 REGEX_ASSERT(fields
[1]=="Now");
1298 REGEX_ASSERT(fields
[2]=="is");
1299 REGEX_ASSERT(fields
[3]=="the");
1300 REGEX_ASSERT(fields
[4]=="time");
1301 REGEX_ASSERT(fields
[5]=="");
// Input consisting only of delimiter text: the first field is empty.
1303 n
= pat1
->split(" ", fields
, 10, status
);
1306 REGEX_ASSERT(fields
[0]=="");
// Empty input.
// presumably fields[0] was preset to "foo" on a line not shown here, so the
// check below proves the output array was left untouched -- TODO confirm
1309 n
= pat1
->split("", fields
, 10, status
);
1312 REGEX_ASSERT(fields
[0]=="foo");
1316 // split, with a pattern with (capture)
1317 pat1
= RegexPattern::compile("<(\\w*)>", pe
, status
);
// With a capture group in the delimiter, the captured text ("a", "b", "c")
// is expected to appear as its own field between the surrounding pieces.
1320 status
= U_ZERO_ERROR
;
1321 n
= pat1
->split("<a>Now is <b>the time<c>", fields
, 10, status
);
1324 REGEX_ASSERT(fields
[0]=="");
1325 REGEX_ASSERT(fields
[1]=="a");
1326 REGEX_ASSERT(fields
[2]=="Now is ");
1327 REGEX_ASSERT(fields
[3]=="b");
1328 REGEX_ASSERT(fields
[4]=="the time");
1329 REGEX_ASSERT(fields
[5]=="c");
1330 REGEX_ASSERT(fields
[6]=="");
1331 REGEX_ASSERT(status
==U_ZERO_ERROR
);
// Same input with leading text before the first delimiter.
1333 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 10, status
);
1336 REGEX_ASSERT(fields
[0]==" ");
1337 REGEX_ASSERT(fields
[1]=="a");
1338 REGEX_ASSERT(fields
[2]=="Now is ");
1339 REGEX_ASSERT(fields
[3]=="b");
1340 REGEX_ASSERT(fields
[4]=="the time");
1341 REGEX_ASSERT(fields
[5]=="c");
1342 REGEX_ASSERT(fields
[6]=="");
// Capacity 6: exactly enough for the six pieces.
// presumably fields[6] was preset to "foo" on a line not shown here and
// must remain unchanged -- TODO confirm
1344 status
= U_ZERO_ERROR
;
1346 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 6, status
);
1349 REGEX_ASSERT(fields
[0]==" ");
1350 REGEX_ASSERT(fields
[1]=="a");
1351 REGEX_ASSERT(fields
[2]=="Now is ");
1352 REGEX_ASSERT(fields
[3]=="b");
1353 REGEX_ASSERT(fields
[4]=="the time");
1354 REGEX_ASSERT(fields
[5]=="c");
1355 REGEX_ASSERT(fields
[6]=="foo");
// Capacity 5: the final field keeps the unsplit remainder, including the
// trailing "<c>" delimiter text.
1357 status
= U_ZERO_ERROR
;
1359 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 5, status
);
1362 REGEX_ASSERT(fields
[0]==" ");
1363 REGEX_ASSERT(fields
[1]=="a");
1364 REGEX_ASSERT(fields
[2]=="Now is ");
1365 REGEX_ASSERT(fields
[3]=="b");
1366 REGEX_ASSERT(fields
[4]=="the time<c>");
1367 REGEX_ASSERT(fields
[5]=="foo");
// Capacity 5 with no trailing delimiter in the input.
1369 status
= U_ZERO_ERROR
;
1371 n
= pat1
->split(" <a>Now is <b>the time", fields
, 5, status
);
1374 REGEX_ASSERT(fields
[0]==" ");
1375 REGEX_ASSERT(fields
[1]=="a");
1376 REGEX_ASSERT(fields
[2]=="Now is ");
1377 REGEX_ASSERT(fields
[3]=="b");
1378 REGEX_ASSERT(fields
[4]=="the time");
1379 REGEX_ASSERT(fields
[5]=="foo");
// Capacity 4: the last field is expected to hold "the time<c>"; the "b"
// capture does not appear as a separate field in this case.
1381 status
= U_ZERO_ERROR
;
1382 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 4, status
);
1385 REGEX_ASSERT(fields
[0]==" ");
1386 REGEX_ASSERT(fields
[1]=="a");
1387 REGEX_ASSERT(fields
[2]=="Now is ");
1388 REGEX_ASSERT(fields
[3]=="the time<c>");
1389 status
= U_ZERO_ERROR
;
// A delimiter that is itself a capture group: each "-" or "," separator is
// returned as a field between the numbers.
1392 pat1
= RegexPattern::compile("([-,])", pe
, status
);
1394 n
= pat1
->split("1-10,20", fields
, 10, status
);
1397 REGEX_ASSERT(fields
[0]=="1");
1398 REGEX_ASSERT(fields
[1]=="-");
1399 REGEX_ASSERT(fields
[2]=="10");
1400 REGEX_ASSERT(fields
[3]==",");
1401 REGEX_ASSERT(fields
[4]=="20");
1406 // RegexPattern::pattern()
// A default-constructed pattern reports an empty pattern string.
1408 pat1
= new RegexPattern();
1409 REGEX_ASSERT(pat1
->pattern() == "");
// A compiled pattern reports the exact source text it was compiled from.
1412 pat1
= RegexPattern::compile("(Hello, world)*", pe
, status
);
1414 REGEX_ASSERT(pat1
->pattern() == "(Hello, world)*");
1419 // classID functions
1421 pat1
= RegexPattern::compile("(Hello, world)*", pe
, status
);
// The dynamic class ID of a pattern matches RegexPattern's static ID, is
// non-NULL, and differs from the class ID of a RegexMatcher.
1423 REGEX_ASSERT(pat1
->getDynamicClassID() == RegexPattern::getStaticClassID());
1424 REGEX_ASSERT(pat1
->getDynamicClassID() != NULL
);
1425 UnicodeString
Hello("Hello, world.");
1426 RegexMatcher
*m
= pat1
->matcher(Hello
, status
);
1427 REGEX_ASSERT(pat1
->getDynamicClassID() != m
->getDynamicClassID());
1428 REGEX_ASSERT(m
->getDynamicClassID() == RegexMatcher::getStaticClassID());
1429 REGEX_ASSERT(m
->getDynamicClassID() != NULL
);
1435 //---------------------------------------------------------------------------
1437 // Extended A more thorough check for features of regex patterns
1438 // The test cases are in a separate data file,
1439 // source/tests/testdata/regextst.txt
1440 // A description of the test data format is included in that file.
1442 //---------------------------------------------------------------------------
1445 RegexTest::getPath(char buffer
[2048], const char *filename
) {
1446 UErrorCode status
=U_ZERO_ERROR
;
1447 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
1448 if (U_FAILURE(status
)) {
1449 errln("ERROR: loadTestData() failed - %s", u_errorName(status
));
1453 strcpy(buffer
, testDataDirectory
);
1454 strcat(buffer
, filename
);
1458 void RegexTest::Extended() {
// Data-driven regex test. Reads regextst.txt, splits it into lines, parses
// each non-comment line into a quoted pattern, a set of flag letters and a
// quoted match string, and hands those to regex_find() together with the
// line number.
// NOTE(review): the declarations of tdd and len, and the loop-control lines
// (continue / else / closing braces), are not visible in this excerpt.
1460 const char *srcPath
;
1461 UErrorCode status
= U_ZERO_ERROR
;
1462 int32_t lineNum
= 0;
1465 // Open and read the test data file.
1467 srcPath
=getPath(tdd
, "regextst.txt");
// presumably guarded by a check that getPath() failed -- TODO confirm
1469 return; /* something went wrong, error already output */
1473 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, status
);
1474 if (U_FAILURE(status
)) {
1475 return; /* something went wrong, error already output */
1479 // Put the test data into a UnicodeString
1481 UnicodeString
testString(FALSE
, testData
, len
);
// Helper matchers used to pick a test line apart:
//   quotedStuffMat - a field wrapped in ', " or / delimiters; group 1 is the
//                    delimiter and group 2 is the text between the pair.
//   commentMat     - optional whitespace, an optional #comment, end of input.
//   flagsMat       - group 1 collects recognised flag characters; group 2
//                    collects whatever follows that the second class accepts.
1483 RegexMatcher
quotedStuffMat("\\s*([\\'\\\"/])(.*?)\\1", 0, status
);
1484 RegexMatcher
commentMat ("\\s*(#.*)?$", 0, status
);
1485 RegexMatcher
flagsMat ("\\s*([ixsmdtGv2-9]*)([:letter:]*)", 0, status
);
// lineMat walks the whole file; group 1 is one line without its terminator.
1487 RegexMatcher
lineMat("(.*?)\\r?\\n", testString
, 0, status
);
1488 UnicodeString testPattern
; // The pattern for test from the test file.
1489 UnicodeString testFlags
; // the flags for a test.
1490 UnicodeString matchString
; // The marked up string to be used as input
// Any failure while constructing the matchers above is reported here.
1492 if (U_FAILURE(status
)){
1493 dataerrln("Construct RegexMatcher() error.");
1499 // Loop over the test data file, once per line.
1501 while (lineMat
.find()) {
// Report any error left in status, then clear it for this line.
1503 if (U_FAILURE(status
)) {
1504 errln("line %d: ICU Error \"%s\"", lineNum
, u_errorName(status
));
1507 status
= U_ZERO_ERROR
;
1508 UnicodeString testLine
= lineMat
.group(1, status
);
1509 if (testLine
.length() == 0) {
1514 // Parse the test line. Skip blank and comment only lines.
1515 // Separate out the three main fields - pattern, flags, target.
1518 commentMat
.reset(testLine
);
1519 if (commentMat
.lookingAt(status
)) {
1520 // This line is a comment, or blank.
1525 // Pull out the pattern field, remove it from the test file line.
1527 quotedStuffMat
.reset(testLine
);
1528 if (quotedStuffMat
.lookingAt(status
)) {
1529 testPattern
= quotedStuffMat
.group(2, status
);
1530 testLine
.remove(0, quotedStuffMat
.end(0, status
));
// presumably the else branch of the lookingAt() check above -- TODO confirm
1532 errln("Bad pattern (missing quotes?) at test file line %d", lineNum
);
1538 // Pull out the flags from the test file line.
1540 flagsMat
.reset(testLine
);
1541 flagsMat
.lookingAt(status
); // Will always match, possibly an empty string.
1542 testFlags
= flagsMat
.group(1, status
);
// Anything captured by group 2 is not a recognised flag character.
1543 if (flagsMat
.group(2, status
).length() > 0) {
1544 errln("Bad Match flag at line %d. Scanning %c\n",
1545 lineNum
, flagsMat
.group(2, status
).charAt(0));
// Drop the flags text that was just consumed from the front of the line.
1548 testLine
.remove(0, flagsMat
.end(0, status
));
1551 // Pull out the match string, as a whole.
1552 // We'll process the <tags> later.
1554 quotedStuffMat
.reset(testLine
);
1555 if (quotedStuffMat
.lookingAt(status
)) {
1556 matchString
= quotedStuffMat
.group(2, status
);
1557 testLine
.remove(0, quotedStuffMat
.end(0, status
));
// presumably the else branch of the lookingAt() check above -- TODO confirm
1559 errln("Bad match string at test file line %d", lineNum
);
1564 // The only thing left from the input line should be an optional trailing comment.
1566 commentMat
.reset(testLine
);
1567 if (commentMat
.lookingAt(status
) == FALSE
) {
1568 errln("Line %d: unexpected characters at end of test line.", lineNum
);
// Run the actual test for this line.
1575 regex_find(testPattern
, testFlags
, matchString
, lineNum
);
1584 //---------------------------------------------------------------------------
1586 // Errors Check for error handling in patterns.
1588 //---------------------------------------------------------------------------
1589 void RegexTest::Errors() {
// Feeds deliberately malformed patterns to the regex compiler and checks
// the error that is reported for each one.
// NOTE(review): REGEX_ERR is defined outside this excerpt; its arguments
// look like (pattern, expected error line, expected error column, expected
// status code) -- TODO confirm against the macro definition.
1590 // \escape sequences that aren't implemented yet.
1591 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
1593 // Missing close parentheses
1594 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN
);
1595 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN
);
1596 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN
);
1598 // Extra close paren
1599 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN
);
1600 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN
);
1601 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN
);
1603 // Look-ahead, Look-behind
1604 // TODO: add tests for unbounded length look-behinds.
1605 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX
); // illegal construct
1607 // Attempt to use non-default flags
// Compiling with this combination of flags is expected to be rejected with
// U_REGEX_UNIMPLEMENTED.
// NOTE(review): the end of the flags expression and the declaration of pe
// are on lines not visible in this excerpt.
1610 UErrorCode status
= U_ZERO_ERROR
;
1611 int32_t flags
= UREGEX_CANON_EQ
|
1612 UREGEX_COMMENTS
| UREGEX_DOTALL
|
1614 RegexPattern
*pat1
= RegexPattern::compile(".*", flags
, pe
, status
);
1615 REGEX_ASSERT(status
== U_REGEX_UNIMPLEMENTED
);
1620 // Quantifiers are allowed only after something that can be quantified.
1621 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX
);
// This pattern contains a newline, so the error is expected on line 2.
1622 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX
);
1623 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX
);
1625 // Mal-formed {min,max} quantifiers
1626 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL
);
1627 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN
);
1628 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL
);
1629 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL
);
1630 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL
);
1631 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG
);
1632 REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG
); // Overflows int during scan
1633 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG
); // Overflows regex binary format
1634 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG
);
1637 // UnicodeSet containing a string
1638 REGEX_ERR("abc[{def}]xyz", 1, 10, U_REGEX_SET_CONTAINS_STRING
);
1643 //-------------------------------------------------------------------------------
1645 // Read a text data file, convert it to UChars, and return the data
1646 // in one big UChar * buffer, which the caller must delete.
1648 //--------------------------------------------------------------------------------
1649 UChar
*RegexTest::ReadAndConvertFile(const char *fileName
, int &ulen
, UErrorCode
&status
) {
// Reads the named file into memory, detects its encoding from a Unicode
// signature (BOM) and converts the contents to UChars.
//
//   fileName - path of the file to read; opened with mode "rb".
//   ulen     - receives the result of the ucnv_toUChars() preflight call.
//   status   - in/out ICU error code; set to U_FILE_ACCESS_ERROR when the
//              file cannot be opened.
//
// NOTE(review): the declarations of f, fileSize and amt_read, the arguments
// of the conversion calls, and the cleanUpAndReturn label with its cleanup
// code are on lines not visible in this excerpt.
1650 UChar
*retPtr
= NULL
;
1651 char *fileBuf
= NULL
;
1652 UConverter
* conv
= NULL
;
// Checks for a failure already present in the incoming status.
1656 if (U_FAILURE(status
)) {
1663 f
= fopen(fileName
, "rb");
// presumably inside a check that fopen() failed -- TODO confirm
1665 errln("Error opening test data file %s\n", fileName
);
1666 status
= U_FILE_ACCESS_ERROR
;
// Find the file size by seeking to the end, then read the whole file.
// NOTE(review): the buffer is allocated from the ftell() result before that
// result is validated; the fileSize <= 0 check only happens after the read.
1675 fseek( f
, 0, SEEK_END
);
1676 fileSize
= ftell(f
);
1677 fileBuf
= new char[fileSize
];
1678 fseek(f
, 0, SEEK_SET
);
1679 amt_read
= fread(fileBuf
, 1, fileSize
, f
);
// NOTE(review): this error path jumps to the cleanup code without setting
// status to a failure value, while the callers in this file test only
// U_FAILURE(status) after the call -- confirm this is intended.
1680 if (amt_read
!= fileSize
|| fileSize
<= 0) {
1681 errln("Error reading test data file.");
1682 goto cleanUpAndReturn
;
1686 // Look for a Unicode Signature (BOM) on the data just read
1688 int32_t signatureLength
;
1689 const char * fileBufC
;
1690 const char* encoding
;
1693 encoding
= ucnv_detectUnicodeSignature(
1694 fileBuf
, fileSize
, &signatureLength
, &status
);
// When a signature was recognised, step over it so that it is not converted
// as part of the text.
1695 if(encoding
!=NULL
){
1696 fileBufC
+= signatureLength
;
1697 fileSize
-= signatureLength
;
1701 // Open a converter to take the rule file to UTF-16
1703 conv
= ucnv_open(encoding
, &status
);
1704 if (U_FAILURE(status
)) {
1705 goto cleanUpAndReturn
;
1709 // Convert the rules to UChar.
1710 // Preflight first to determine required buffer size.
1712 ulen
= ucnv_toUChars(conv
,
1718 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
1719 // Buffer Overflow is expected from the preflight operation.
1720 status
= U_ZERO_ERROR
;
// Allocate the output buffer with one extra UChar beyond the preflighted
// length.
1722 retPtr
= new UChar
[ulen
+1];
// Report any error left in status.
1735 if (U_FAILURE(status
)) {
1736 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
1745 //-------------------------------------------------------------------------------
1747 // PerlTests - Run Perl's regular expression tests
1748 // The input file for this test is re_tests, the standard regular
1749 // expression test data distributed with the Perl source code.
1751 // Here is Perl's description of the test data file:
1753 // # The tests are in a separate file 't/op/re_tests'.
1754 // # Each line in that file is a separate test.
1755 // # There are five columns, separated by tabs.
1757 // # Column 1 contains the pattern, optionally enclosed in C<''>.
1758 // # Modifiers can be put after the closing C<'>.
1760 // # Column 2 contains the string to be matched.
1762 // # Column 3 contains the expected result:
1763 // # y expect a match
1764 // # n expect no match
1765 // # c expect an error
1766 // # B test exposes a known bug in Perl, should be skipped
1767 // # b test exposes a known bug in Perl, should be skipped if noamp
1769 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
1771 // # Column 4 contains a string, usually C<$&>.
1773 // # Column 5 contains the expected result of double-quote
1774 // # interpolating that string after the match, or start of error message.
1776 // # Column 6, if present, contains a reason why the test is skipped.
1777 // # This is printed with "skipped", for harness to pick up.
1779 // # \n in the tests are interpolated, as are variables of the form ${\w+}.
1781 // # If you want to add a regular expression test that can't be expressed
1782 // # in this format, don't add it here: put it in op/pat.t instead.
1784 // For ICU, if field 3 contains an 'i', the test will be skipped.
1785 // Such a test exposes some known incompatibility between ICU and Perl regexps.
1786 // (The i is in addition to whatever was there before.)
1788 //-------------------------------------------------------------------------------
1789 void RegexTest::PerlTests() {
// Runs the regular expression tests from re_tests.txt. Each line of the data
// file is split on tabs into fields; the pattern (field 0) is compiled, run
// against the match string (field 1), compared with the expected outcome
// (field 2), and the Perl expression in field 3 is evaluated against the ICU
// match and compared with the expected text in field 4.
// NOTE(review): the declarations of tdd, len, pe, flags, t and i, plus the
// loop-control lines and the final cleanup code, are not visible in this
// excerpt.
1791 const char *srcPath
;
1792 UErrorCode status
= U_ZERO_ERROR
;
1796 // Open and read the test data file.
1798 srcPath
=getPath(tdd
, "re_tests.txt");
// presumably guarded by a check that getPath() failed -- TODO confirm
1800 return; /* something went wrong, error already output */
1804 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, status
);
1805 if (U_FAILURE(status
)) {
1806 return; /* something went wrong, error already output */
1810 // Put the test data into a UnicodeString
1812 UnicodeString
testDataString(FALSE
, testData
, len
);
1815 // Regex to break the input file into lines, and strip the new lines.
1816 // One line per match, capture group one is the desired data.
1818 RegexPattern
* linePat
= RegexPattern::compile("(.+?)[\\r\\n]+", 0, pe
, status
);
1819 if (U_FAILURE(status
)) {
1820 dataerrln("RegexPattern::compile() error");
1823 RegexMatcher
* lineMat
= linePat
->matcher(testDataString
, status
);
1826 // Regex to split a test file line into fields.
1827 // There are six fields, separated by tabs.
1829 RegexPattern
* fieldPat
= RegexPattern::compile("\\t", 0, pe
, status
);
1832 // Regex to identify test patterns with flag settings, and to separate them.
1833 // Test patterns with flags look like 'pattern'i
1834 // Test patterns without flags are not quoted: pattern
1835 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
1837 RegexPattern
*flagPat
= RegexPattern::compile("('?)(.*)\\1(.*)", 0, pe
, status
);
1838 RegexMatcher
* flagMat
= flagPat
->matcher(status
);
1841 // The Perl tests reference several perl-isms, which are evaluated/substituted
1842 // in the test data. Not being perl, this must be done explicitly. Here
1843 // are string constants and REs for these constructs.
// ${nulnul} stands for two U+0000 characters and ${ffff} for U+FFFF; the
// replacement strings are built by unescaping their \u forms.
1845 UnicodeString
nulnulSrc("${nulnul}");
1846 UnicodeString
nulnul("\\u0000\\u0000");
1847 nulnul
= nulnul
.unescape();
1849 UnicodeString
ffffSrc("${ffff}");
1850 UnicodeString
ffff("\\uffff");
1851 ffff
= ffff
.unescape();
1853 // regexp for $-[0], $+[2], etc.
1854 RegexPattern
*groupsPat
= RegexPattern::compile("\\$([+\\-])\\[(\\d+)\\]", 0, pe
, status
);
1855 RegexMatcher
*groupsMat
= groupsPat
->matcher(status
);
1857 // regexp for $0, $1, $2, etc.
1858 RegexPattern
*cgPat
= RegexPattern::compile("\\$(\\d+)", 0, pe
, status
);
1859 RegexMatcher
*cgMat
= cgPat
->matcher(status
);
1863 // Main Loop for the Perl Tests, runs once per line from the
1866 int32_t lineNum
= 0;
1867 int32_t skippedUnimplementedCount
= 0;
1868 while (lineMat
->find()) {
1872 // Get a line, break it into its fields, do the Perl
1873 // variable substitutions.
1875 UnicodeString line
= lineMat
->group(1, status
);
1876 UnicodeString fields
[7];
1877 fieldPat
->split(line
, fields
, 7, status
);
// Separate the optional quotes and trailing flag letters from the pattern.
1879 flagMat
->reset(fields
[0]);
1880 flagMat
->matches(status
);
1881 UnicodeString pattern
= flagMat
->group(2, status
);
// NOTE(review): in the pattern, ${nulnul} is replaced by the literal escape
// text \u0000\u0000, whereas the match string and the expected result below
// receive the already-unescaped nulnul string. Presumably the regex compiler
// is meant to interpret the escapes itself -- TODO confirm.
1882 pattern
.findAndReplace("${bang}", "!");
1883 pattern
.findAndReplace(nulnulSrc
, "\\u0000\\u0000");
1884 pattern
.findAndReplace(ffffSrc
, ffff
);
1887 // Identify patterns that include match flag settings,
1888 // split off the flags, remove the extra quotes.
1890 UnicodeString flagStr
= flagMat
->group(3, status
);
// NOTE(review): this message names ucnv_toUChars although the calls checked
// here are regex operations; the text looks copied from elsewhere.
1891 if (U_FAILURE(status
)) {
1892 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
1896 const UChar UChar_c
= 0x63; // Char constants for the flag letters.
1897 const UChar UChar_i
= 0x69; // (Damn the lack of Unicode support in C)
1898 const UChar UChar_m
= 0x6d;
1899 const UChar UChar_x
= 0x78;
1900 const UChar UChar_y
= 0x79;
// Translate the flag letters that follow the pattern into ICU regex flags.
1901 if (flagStr
.indexOf(UChar_i
) != -1) {
1902 flags
|= UREGEX_CASE_INSENSITIVE
;
1904 if (flagStr
.indexOf(UChar_m
) != -1) {
1905 flags
|= UREGEX_MULTILINE
;
1907 if (flagStr
.indexOf(UChar_x
) != -1) {
1908 flags
|= UREGEX_COMMENTS
;
1912 // Compile the test pattern.
1914 status
= U_ZERO_ERROR
;
1915 RegexPattern
*testPat
= RegexPattern::compile(pattern
, flags
, pe
, status
);
1916 if (status
== U_REGEX_UNIMPLEMENTED
) {
1918 // Test of a feature that is planned for ICU, but not yet implemented.
// Counted so that the total can be logged when the run finishes.
1920 skippedUnimplementedCount
++;
1922 status
= U_ZERO_ERROR
;
1926 if (U_FAILURE(status
)) {
1927 // Some tests are supposed to generate errors.
1928 // Only report an error for tests that are supposed to succeed.
1929 if (fields
[2].indexOf(UChar_c
) == -1 && // Compilation is not supposed to fail AND
1930 fields
[2].indexOf(UChar_i
) == -1) // it's not an accepted ICU incompatibility
1932 errln("line %d: ICU Error \"%s\"\n", lineNum
, u_errorName(status
));
1934 status
= U_ZERO_ERROR
;
1939 if (fields
[2].indexOf(UChar_i
) >= 0) {
1940 // ICU should skip this test.
1945 if (fields
[2].indexOf(UChar_c
) >= 0) {
1946 // This pattern should have caused a compilation error, but didn't.
1947 errln("line %d: Expected a pattern compile error, got success.", lineNum
);
1953 // replace the Perl variables that appear in some of the
1954 // match data strings.
1956 UnicodeString matchString
= fields
[1];
1957 matchString
.findAndReplace(nulnulSrc
, nulnul
);
1958 matchString
.findAndReplace(ffffSrc
, ffff
);
1960 // Replace any \n in the match string with an actual new-line char.
1961 // Don't do full unescape, as this unescapes more than Perl does, which
1962 // causes other spurious failures in the tests.
1963 matchString
.findAndReplace("\\n", "\n");
1968 // Run the test, check for expected match/don't match result.
1970 RegexMatcher
*testMat
= testPat
->matcher(matchString
, status
);
1971 UBool found
= testMat
->find();
1972 UBool expected
= FALSE
;
// A 'y' in field 2 means that a match is expected.
1973 if (fields
[2].indexOf(UChar_y
) >=0) {
1976 if (expected
!= found
) {
1977 errln("line %d: Expected %smatch, got %smatch",
1978 lineNum
, expected
?"":"no ", found
?"":"no " );
1983 // Interpret the Perl expression from the fourth field of the data file,
1984 // building up an ICU string from the results of the ICU match.
1985 // The Perl expression will contain references to the results of
1986 // a regex match, including the matched string, capture group strings,
1987 // group starting and ending indices, etc.
1989 UnicodeString resultString
;
1990 UnicodeString perlExpr
= fields
[3];
1991 groupsMat
->reset(perlExpr
);
1992 cgMat
->reset(perlExpr
);
// Consume perlExpr from the front, one construct per iteration, appending
// the corresponding ICU result to resultString.
1994 while (perlExpr
.length() > 0) {
// $& : the text of the whole match.
1995 if (perlExpr
.startsWith("$&")) {
1996 resultString
.append(testMat
->group(status
));
1997 perlExpr
.remove(0, 2);
// $-[n] / $+[n] : start / end position of capture group n.
2000 else if (groupsMat
->lookingAt(status
)) {
2002 UnicodeString digitString
= groupsMat
->group(2, status
);
2004 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
2005 UnicodeString plusOrMinus
= groupsMat
->group(1, status
);
2006 int32_t matchPosition
;
2007 if (plusOrMinus
.compare("+") == 0) {
2008 matchPosition
= testMat
->end(groupNum
, status
);
2010 matchPosition
= testMat
->start(groupNum
, status
);
// A position of -1 contributes nothing to the result string.
2012 if (matchPosition
!= -1) {
2013 ICU_Utility::appendNumber(resultString
, matchPosition
);
2015 perlExpr
.remove(0, groupsMat
->end(status
));
// $n : the text of capture group n.
2018 else if (cgMat
->lookingAt(status
)) {
2020 UnicodeString digitString
= cgMat
->group(1, status
);
2022 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
2023 if (U_SUCCESS(status
)) {
2024 resultString
.append(testMat
->group(groupNum
, status
));
// Any error from fetching the group text is discarded.
2025 status
= U_ZERO_ERROR
;
2027 perlExpr
.remove(0, cgMat
->end(status
));
// @- : the start positions of group 0 through the last group.
2030 else if (perlExpr
.startsWith("@-")) {
2032 for (i
=0; i
<=testMat
->groupCount(); i
++) {
2034 resultString
.append(" ");
2036 ICU_Utility::appendNumber(resultString
, testMat
->start(i
, status
));
2038 perlExpr
.remove(0, 2);
// @+ : the end positions of group 0 through the last group.
2041 else if (perlExpr
.startsWith("@+")) {
2043 for (i
=0; i
<=testMat
->groupCount(); i
++) {
2045 resultString
.append(" ");
2047 ICU_Utility::appendNumber(resultString
, testMat
->end(i
, status
));
2049 perlExpr
.remove(0, 2);
2052 else if (perlExpr
.startsWith("\\")) { // \Escape. Take following char as a literal.
2053 // or as an escaped sequence (e.g. \n)
2054 if (perlExpr
.length() > 1) {
2055 perlExpr
.remove(0, 1); // Remove the '\', but only if not last char.
2057 UChar c
= perlExpr
.charAt(0);
2059 case 'n': c
= '\n'; break;
2060 // add any other escape sequences that show up in the test expected results.
2062 resultString
.append(c
);
2063 perlExpr
.remove(0, 1);
2067 // Any characters from the perl expression that we don't explicitly
2068 // recognize before here are assumed to be literals and copied
2069 // as-is to the expected results.
2070 resultString
.append(perlExpr
.charAt(0));
2071 perlExpr
.remove(0, 1);
2074 if (U_FAILURE(status
)) {
2075 errln("Line %d: ICU Error \"%s\"", lineNum
, u_errorName(status
));
2081 // Expected Results Compare
// The expected text receives the same substitutions as the match string.
2083 UnicodeString
expectedS(fields
[4]);
2084 expectedS
.findAndReplace(nulnulSrc
, nulnul
);
2085 expectedS
.findAndReplace(ffffSrc
, ffff
);
2086 expectedS
.findAndReplace("\\n", "\n");
2089 if (expectedS
.compare(resultString
) != 0) {
2090 err("Line %d: Incorrect perl expression results.", lineNum
);
2091 errln((UnicodeString
)"Expected \""+expectedS
+(UnicodeString
)"\"; got \""+resultString
+(UnicodeString
)"\"");
2099 // All done. Clean up allocated stuff.
2117 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount
);
2123 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */