1 /********************************************************************
3 * Copyright (c) 2002-2004, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
10 // ICU Regular Expressions test, part of intltest.
14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
16 #include "unicode/regex.h"
17 #include "unicode/uchar.h"
18 #include "unicode/ucnv.h"
27 //---------------------------------------------------------------------------
29 // Test class boilerplate
31 //---------------------------------------------------------------------------
32 RegexTest::RegexTest()
37 RegexTest::~RegexTest()
// Test dispatcher. For the requested `index` it reports the test's name
// through `name`, and for the cases whose call is shown it invokes the
// matching test method when `exec` is set. A suite banner is logged first
// when `exec` is set. The final parameter is unused (its name is commented out).
43 void RegexTest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* /*par*/ )
45 if (exec
) logln("TestSuite RegexTest: ");
48 case 0: name
= "Basic";
51 case 1: name
= "API_Match";
52 if (exec
) API_Match();
54 case 2: name
= "API_Replace";
55 if (exec
) API_Replace();
57 case 3: name
= "API_Pattern";
58 if (exec
) API_Pattern();
60 case 4: name
= "Extended";
63 case 5: name
= "Errors";
66 case 6: name
= "PerlTests";
67 if (exec
) PerlTests();
72 break; //needed to end loop
77 //---------------------------------------------------------------------------
79 // Error Checking / Reporting macros used in all of the tests.
81 //---------------------------------------------------------------------------
// REGEX_CHECK_STATUS: if the in-scope UErrorCode `status` holds a failure,
// report it (with its symbolic name) and return from the enclosing function.
82 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {errln("RegexTest failure at line %d. status=%s\n", \
83 __LINE__, u_errorName(status)); return;}}
// REGEX_ASSERT: report a failure when `expr` compares equal to FALSE.
// Does not return, so the test keeps running after a failed assertion.
85 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("RegexTest failure at line %d.\n", __LINE__);};}
// REGEX_ASSERT_FAIL: evaluate `expr` against a fresh local `status` (which
// shadows any outer `status`) and report a failure unless that local status
// ends up equal to `errcode`.
87 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
88 if (status!=errcode) {errln("RegexTest failure at line %d. Expected status=%s, got %s\n", \
89 __LINE__, u_errorName(errcode), u_errorName(status));};}
// REGEX_CHECK_STATUS_L: like REGEX_CHECK_STATUS but also reports the
// caller-supplied `line`. Unlike REGEX_CHECK_STATUS it prints the status as
// a number (%d) and does not return.
91 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
92 "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }}
// REGEX_ASSERT_L: report a failure (including the caller-supplied `line`)
// when `expr` compares equal to FALSE, then return from the enclosing function.
94 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
95 errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
99 //---------------------------------------------------------------------------
101 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests
102 // for the LookingAt() and Match() functions.
105 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
107 // The expected results are UBool - TRUE or FALSE.
108 // The input text is unescaped. The pattern is not.
111 //---------------------------------------------------------------------------
// REGEX_TESTLM forwards its arguments to doRegexLMTest() and appends the
// invoking source line so that failures can be traced back to the test case.
113 #define REGEX_TESTLM(pat, text, looking, match) doRegexLMTest(pat, text, looking, match, __LINE__);
// Compiles `pat`, builds a matcher over the unescaped `text`, and checks that
// lookingAt() returns `looking` and matches() returns `match`.
//   pat     - pattern source; used as-is (not unescaped).
//   text    - input text; unescape() is applied before matching.
//   looking - expected result of lookingAt().
//   match   - expected result of matches().
//   line    - source line of the invoking test, used in failure messages.
// Returns a UBool. NOTE(review): presumably retVal, i.e. overall pass/fail - confirm.
115 UBool
RegexTest::doRegexLMTest(const char *pat
, const char *text
, UBool looking
, UBool match
, int line
) {
116 const UnicodeString
pattern(pat
);
117 const UnicodeString
inputText(text
);
118 UErrorCode status
= U_ZERO_ERROR
;
120 RegexPattern
*REPattern
= NULL
;
121 RegexMatcher
*REMatcher
= NULL
;
// Compile with no flags; a compile failure is reported against the caller's line.
124 UnicodeString
patString(pat
);
125 REPattern
= RegexPattern::compile(patString
, 0, pe
, status
);
126 if (U_FAILURE(status
)) {
127 errln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s\n",
128 line
, u_errorName(status
));
// Debugging aid: dump the compiled pattern for the test case invoked from line 376.
131 if (line
==376) { RegexPatternDump(REPattern
);}
// Only the input text goes through unescape(); the matcher runs on the result.
133 UnicodeString
inputString(inputText
);
134 UnicodeString unEscapedInput
= inputString
.unescape();
135 REMatcher
= REPattern
->matcher(unEscapedInput
, status
);
136 if (U_FAILURE(status
)) {
137 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
138 line
, u_errorName(status
));
// Check 1: lookingAt() must agree with `looking`.
143 actualmatch
= REMatcher
->lookingAt(status
);
144 if (U_FAILURE(status
)) {
145 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
146 line
, u_errorName(status
));
149 if (actualmatch
!= looking
) {
150 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line
);
// Check 2: matches() must agree with `match`. The status is cleared first so
// that a failure left by lookingAt() does not carry over into this check.
154 status
= U_ZERO_ERROR
;
155 actualmatch
= REMatcher
->matches(status
);
156 if (U_FAILURE(status
)) {
157 errln("RegexTest failure in matches() at line %d. Status = %s\n",
158 line
, u_errorName(status
));
161 if (actualmatch
!= match
) {
162 errln("RegexTest: wrong return from matches() at line %d.\n", line
);
// When retVal is FALSE, dump the compiled pattern to help diagnose the failure.
166 if (retVal
== FALSE
) {
167 RegexPatternDump(REPattern
);
178 //---------------------------------------------------------------------------
180 // regex_find(pattern, inputString, lineNumber)
182 // Function to simplify writing regex tests.
184 // The input text is unescaped. The pattern is not.
185 // The input text is marked with the expected match positions
186 // <0>text <1> more text </1> </0>
187 // The <n> </n> tags are removed before trying the match.
188 // The tags mark the start and end of the match and of any capture groups.
191 //---------------------------------------------------------------------------
194 // Set a value into a UVector at position specified by a decimal number in
195 // a UnicodeString. This is a utility function needed by the actual test function,
// Stores `val` in `vec` at the slot `idx`, where `index` supplies the slot
// number as a string of decimal digits (each character is converted with
// u_charDigitValue). The vector is first padded with -1 entries until slot
// `idx` exists, then that slot is overwritten with `val`.
//   vec   - vector to update (grown as needed).
//   val   - value to store.
//   index - decimal digits naming the slot; taken by value.
197 static void set(UVector
&vec
, int val
, UnicodeString index
) {
198 UErrorCode status
=U_ZERO_ERROR
;
// Walk the digits of `index` one character at a time.
200 for (int i
=0; i
<index
.length(); i
++) {
201 int d
=u_charDigitValue(index
.charAt(i
));
// Pad with -1 so that position `idx` is valid before it is assigned.
205 while (vec
.size()<idx
+1) {vec
.addElement(-1, status
);}
206 vec
.setElementAt(val
, idx
);
// Data-driven find() test.
//   pattern     - pattern source, compiled as given.
//   flags       - test option letters; the ones handled below are
//                   i, x, s, m  -> UREGEX_CASE_INSENSITIVE / COMMENTS / DOTALL / MULTILINE
//                   v           -> an U_UNSUPPORTED_ERROR compile failure is accepted
//                                  (only when UCONFIG_NO_BREAK_ITERATION==1)
//                   d           -> dump the compiled pattern
//                   2..9        -> digit flag (number of find() calls)
//                   t           -> enable matcher tracing during the find() calls
//                   G           -> check match / no match only, not capture groups
//   inputString - input text containing <n> ... </n> tags that mark the expected
//                 start and end of group n; it is unescaped, the tags are removed
//                 and their positions recorded before matching.
// Failures are reported with errln() against the caller's `line`.
209 void RegexTest::regex_find(const UnicodeString
&pattern
,
210 const UnicodeString
&flags
,
211 const UnicodeString
&inputString
,
213 UnicodeString unEscapedInput
;
214 UnicodeString deTaggedInput
;
216 UErrorCode status
= U_ZERO_ERROR
;
218 RegexPattern
*parsePat
= NULL
;
219 RegexMatcher
*parseMatcher
= NULL
;
220 RegexPattern
*callerPattern
= NULL
;
221 RegexMatcher
*matcher
= NULL
;
// Expected group start / end offsets, indexed by group number (filled by set()).
222 UVector
groupStarts(status
);
223 UVector
groupEnds(status
);
224 UBool isMatch
= FALSE
;
225 UBool failed
= FALSE
;
230 // Compile the caller's pattern
233 if (flags
.indexOf((UChar
)0x69) >= 0) { // 'i' flag
234 bflags
|= UREGEX_CASE_INSENSITIVE
;
236 if (flags
.indexOf((UChar
)0x78) >= 0) { // 'x' flag
237 bflags
|= UREGEX_COMMENTS
;
239 if (flags
.indexOf((UChar
)0x73) >= 0) { // 's' flag
240 bflags
|= UREGEX_DOTALL
;
242 if (flags
.indexOf((UChar
)0x6d) >= 0) { // 'm' flag
243 bflags
|= UREGEX_MULTILINE
;
247 callerPattern
= RegexPattern::compile(pattern
, bflags
, pe
, status
);
248 if (status
!= U_ZERO_ERROR
) {
249 #if UCONFIG_NO_BREAK_ITERATION==1
250 // 'v' test flag means that the test pattern should not compile if ICU was configured
251 // to not include break iteration. RBBI is needed for Unicode word boundaries.
252 if (flags
.indexOf((UChar
)0x76) >= 0 /*'v'*/ && status
== U_UNSUPPORTED_ERROR
) {
253 goto cleanupAndReturn
;
256 errln("Line %d: error %s compiling pattern.", line
, u_errorName(status
));
257 goto cleanupAndReturn
;
// 'd' flag: dump the compiled pattern.
260 if (flags
.indexOf((UChar
)'d') >= 0) {
261 RegexPatternDump(callerPattern
);
265 // Number of times find() should be called on the test string, default to 1
// Scan for the digit flags '2'..'9' (0x30 + i); more than one is an error.
268 for (i
=2; i
<=9; i
++) {
269 if (flags
.indexOf((UChar
)(0x30 + i
)) >= 0) { // digit flag
271 errln("Line %d: more than one digit flag. Scanning %d.", line
, i
);
272 goto cleanupAndReturn
;
279 // Find the tags in the input data, remove them, and record the group boundary
// Tag syntax: "<n>" opens group n, "</n>" closes it. Group 1 of the tag
// pattern captures the optional '/', group 2 the group number.
282 parsePat
= RegexPattern::compile("<(/?)([0-9]+)>", 0, pe
, status
);
283 REGEX_CHECK_STATUS_L(line
);
285 unEscapedInput
= inputString
.unescape();
286 parseMatcher
= parsePat
->matcher(unEscapedInput
, status
);
287 REGEX_CHECK_STATUS_L(line
);
// Each tag is replaced by the empty string; the length of deTaggedInput
// right after the replacement is the tag's offset in the tag-free text.
288 while(parseMatcher
->find()) {
289 parseMatcher
->appendReplacement(deTaggedInput
, "", status
);
291 UnicodeString groupNum
= parseMatcher
->group(2, status
);
292 if (parseMatcher
->group(1, status
) == "/") {
294 set(groupEnds
, deTaggedInput
.length(), groupNum
);
296 set(groupStarts
, deTaggedInput
.length(), groupNum
);
299 parseMatcher
->appendTail(deTaggedInput
);
// NOTE(review): REGEX_ASSERT_L returns directly on failure, whereas the other
// error paths in this function leave via `goto cleanupAndReturn`; the deletes
// at the end of the function are presumably skipped on this path - confirm.
300 REGEX_ASSERT_L(groupStarts
.size() == groupEnds
.size(), line
);
304 // Do a find on the de-tagged input using the caller's pattern
306 matcher
= callerPattern
->matcher(deTaggedInput
, status
);
307 REGEX_CHECK_STATUS_L(line
);
// 't' flag: turn matcher tracing on for the find() calls, then off again.
308 if (flags
.indexOf((UChar
)'t') >= 0) {
309 matcher
->setTrace(TRUE
);
// Only the result of the last find() call is kept in isMatch.
312 for (i
=0; i
<numFinds
; i
++) {
313 isMatch
= matcher
->find();
315 matcher
->setTrace(FALSE
);
318 // Match up the groups from the find() with the groups from the tags
321 // number of tags should match number of groups from find operation.
322 // matcher->groupCount does not include group 0, the entire match, hence the +1.
323 // G option in test means that capture group data is not available in the
324 // expected results, so the check needs to be suppressed.
325 if (isMatch
== FALSE
&& groupStarts
.size() != 0) {
326 errln("Error at line %d: Match expected, but none found.\n", line
);
328 goto cleanupAndReturn
;
331 if (flags
.indexOf((UChar
)0x47 /*G*/) >= 0) {
332 // Only check for match / no match. Don't check capture groups.
333 if (isMatch
&& groupStarts
.size() == 0) {
334 errln("Error at line %d: No match expected, but one found.\n", line
);
337 goto cleanupAndReturn
;
// Compare every group reported by the matcher (0..groupCount()) with the
// tagged expectations; a group with no tag is expected at -1.
340 for (i
=0; i
<=matcher
->groupCount(); i
++) {
341 int32_t expectedStart
= (i
>= groupStarts
.size()? -1 : groupStarts
.elementAti(i
));
342 if (matcher
->start(i
, status
) != expectedStart
) {
343 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
344 line
, i
, expectedStart
, matcher
->start(i
, status
));
346 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
348 int32_t expectedEnd
= (i
>= groupEnds
.size()? -1 : groupEnds
.elementAti(i
));
349 if (matcher
->end(i
, status
) != expectedEnd
) {
350 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
351 line
, i
, expectedEnd
, matcher
->end(i
, status
));
353 // Error on end position; keep going; real error is probably yet to come as group
354 // end positions work from end of the input data towards the front.
// The tags name more groups than the pattern actually has.
357 if ( matcher
->groupCount()+1 < groupStarts
.size()) {
358 errln("Error at line %d: Expected %d capture groups, found %d.",
359 line
, groupStarts
.size()-1, matcher
->groupCount());
// Echo the pattern, flags and raw input of the test case.
365 errln("\"%s\" %s \"%s\"", (const char *)CharString(pattern
, 0),
366 (const char *)CharString(flags
, 0),
367 (const char *)CharString(inputString
, 0));
368 // callerPattern->dump();
373 delete callerPattern
;
383 //---------------------------------------------------------------------------
385 // REGEX_ERR Macro + invocation function to simplify writing
386 // regex tests for incorrect patterns.
389 // REGEX_ERR("pattern", expected error line, column, expected status);
391 //---------------------------------------------------------------------------
// REGEX_ERR forwards its arguments to regex_err() and appends the invoking
// source line so that failures can be traced back to the test case.
392 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
// Compiles `pat` (no flags) and checks how compilation ends.
//   pat            - pattern source to compile.
//   errLine/errCol - expected UParseError line / offset; compared only when
//                    compilation ends with a status other than U_ZERO_ERROR.
//   expectedStatus - status the compile is expected to produce.
//   line           - source line of the invoking test, used in failure messages.
394 void RegexTest::regex_err(const char *pat
, int32_t errLine
, int32_t errCol
,
395 UErrorCode expectedStatus
, int line
) {
396 UnicodeString
pattern(pat
);
398 UErrorCode status
= U_ZERO_ERROR
;
400 RegexPattern
*callerPattern
= NULL
;
403 // Compile the caller's pattern
405 UnicodeString
patString(pat
);
406 callerPattern
= RegexPattern::compile(patString
, 0, pe
, status
);
// The resulting status must be exactly the expected one.
407 if (status
!= expectedStatus
) {
408 errln("Line %d: unexpected error %s compiling pattern.", line
, u_errorName(status
));
// For any non-zero status, also verify the reported error position.
410 if (status
!= U_ZERO_ERROR
) {
411 if (pe
.line
!= errLine
|| pe
.offset
!= errCol
) {
412 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
413 line
, errLine
, errCol
, pe
.line
, pe
.offset
);
418 delete callerPattern
;
423 //---------------------------------------------------------------------------
425 // Basic Check for basic functionality of regex pattern matching.
426 // Avoid the use of REGEX_FIND test macro, which has
427 // substantial dependencies on basic Regex functionality.
429 //---------------------------------------------------------------------------
// Basic sanity tests. Each REGEX_TESTLM(pattern, input, looking, match) case
// states the expected results of lookingAt() and matches() for that pattern
// and input; the input is unescaped, the pattern is not.
430 void RegexTest::Basic() {
434 // Debug - slide failing test cases early
438 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
// NOTE(review): the result of this compile() call is not stored; confirm
// whether this debug code is actually compiled in.
440 UErrorCode status
= U_ZERO_ERROR
;
441 RegexPattern::compile("^(?:a?b?)*$", 0, pe
, status
);
442 // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
443 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
450 // Pattern with parentheses
452 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE
, FALSE
);
453 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE
, TRUE
);
454 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE
, FALSE
);
// Patterns with a * quantifier.
459 REGEX_TESTLM("st(abc)*ring", "string", TRUE
, TRUE
);
460 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE
, TRUE
);
461 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE
, TRUE
);
462 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE
, FALSE
);
463 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE
, FALSE
);
465 REGEX_TESTLM("a*", "", TRUE
, TRUE
);
466 REGEX_TESTLM("a*", "b", TRUE
, FALSE
);
// Patterns using '.'.
472 REGEX_TESTLM(".", "abc", TRUE
, FALSE
);
473 REGEX_TESTLM("...", "abc", TRUE
, TRUE
);
474 REGEX_TESTLM("....", "abc", FALSE
, FALSE
);
475 REGEX_TESTLM(".*", "abcxyz123", TRUE
, TRUE
);
476 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE
, FALSE
);
477 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE
, TRUE
);
478 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE
, TRUE
);
479 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE
, FALSE
);
482 // Patterns with * applied to chars at end of literal string
484 REGEX_TESTLM("abc*", "ab", TRUE
, TRUE
);
485 REGEX_TESTLM("abc*", "abccccc", TRUE
, TRUE
);
488 // Supplemental chars match as single chars, not a pair of surrogates.
490 REGEX_TESTLM(".", "\\U00011000", TRUE
, TRUE
);
491 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE
, TRUE
);
492 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE
, FALSE
);
496 // UnicodeSets in the pattern
498 REGEX_TESTLM("[1-6]", "1", TRUE
, TRUE
);
499 REGEX_TESTLM("[1-6]", "3", TRUE
, TRUE
);
500 REGEX_TESTLM("[1-6]", "7", FALSE
, FALSE
);
501 REGEX_TESTLM("a[1-6]", "a3", TRUE
, TRUE
);
// Same pattern and input as the previous case.
502 REGEX_TESTLM("a[1-6]", "a3", TRUE
, TRUE
);
503 REGEX_TESTLM("a[1-6]b", "a3b", TRUE
, TRUE
);
505 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE
, TRUE
);
506 REGEX_TESTLM("a[0-9]*b", "abc", TRUE
, FALSE
);
507 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE
, TRUE
);
508 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE
, FALSE
); // note that * matches 0 occurrences.
509 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE
, TRUE
);
512 // OR operator in patterns
514 REGEX_TESTLM("(a|b)", "a", TRUE
, TRUE
);
515 REGEX_TESTLM("(a|b)", "b", TRUE
, TRUE
);
516 REGEX_TESTLM("(a|b)", "c", FALSE
, FALSE
);
517 REGEX_TESTLM("a|b", "b", TRUE
, TRUE
);
519 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE
, TRUE
);
520 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE
, FALSE
);
521 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE
, TRUE
);
522 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE
, TRUE
);
523 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE
, TRUE
);
524 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE
, FALSE
);
// Patterns with a + quantifier.
529 REGEX_TESTLM("ab+", "abbc", TRUE
, FALSE
);
530 REGEX_TESTLM("ab+c", "ac", FALSE
, FALSE
);
531 REGEX_TESTLM("b+", "", FALSE
, FALSE
);
532 REGEX_TESTLM("(abc|def)+", "defabc", TRUE
, TRUE
);
533 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE
, FALSE
);
534 REGEX_TESTLM(".+y", "zippity dooy", TRUE
, TRUE
);
// Patterns with a ? quantifier.
539 REGEX_TESTLM("ab?", "ab", TRUE
, TRUE
);
540 REGEX_TESTLM("ab?", "a", TRUE
, TRUE
);
541 REGEX_TESTLM("ab?", "ac", TRUE
, FALSE
);
542 REGEX_TESTLM("ab?", "abb", TRUE
, FALSE
);
543 REGEX_TESTLM("a(b|c)?d", "abd", TRUE
, TRUE
);
544 REGEX_TESTLM("a(b|c)?d", "acd", TRUE
, TRUE
);
545 REGEX_TESTLM("a(b|c)?d", "ad", TRUE
, TRUE
);
546 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE
, FALSE
);
547 REGEX_TESTLM("a(b|c)?d", "ab", FALSE
, FALSE
);
550 // Escape sequences that become single literal chars, handled internally
551 // by ICU's Unescape.
554 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
555 REGEX_TESTLM("\\a", "\\u0007", TRUE
, TRUE
); // BEL
556 REGEX_TESTLM("\\cL", "\\u000c", TRUE
, TRUE
); // Control-L
557 REGEX_TESTLM("\\e", "\\u001b", TRUE
, TRUE
); // Escape
558 REGEX_TESTLM("\\f", "\\u000c", TRUE
, TRUE
); // Form Feed
559 REGEX_TESTLM("\\n", "\\u000a", TRUE
, TRUE
); // new line
560 REGEX_TESTLM("\\r", "\\u000d", TRUE
, TRUE
); // CR
561 REGEX_TESTLM("\\t", "\\u0009", TRUE
, TRUE
); // Tab
562 REGEX_TESTLM("\\u1234", "\\u1234", TRUE
, TRUE
);
563 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE
, TRUE
);
565 REGEX_TESTLM(".*\\Ax", "xyz", TRUE
, FALSE
); // \A matches only at the beginning of input
566 REGEX_TESTLM(".*\\Ax", " xyz", FALSE
, FALSE
); // \A matches only at the beginning of input
568 // Escape of special chars in patterns
569 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE
, TRUE
);
575 //---------------------------------------------------------------------------
577 // API_Match Test that the API for class RegexMatcher
578 // is present and nominally working, but excluding functions
579 // implementing replace operations.
581 //---------------------------------------------------------------------------
// Exercises the non-replace RegexMatcher API: matcher creation and reset,
// reset(pos), matches(pos), lookingAt(pos), start()/end()/group()/groupCount(),
// find() and find(pos), \G handling, zero-length matches, and matchers that
// have no input string. Assertions use REGEX_ASSERT / REGEX_ASSERT_FAIL.
582 void RegexTest::API_Match() {
584 UErrorCode status
=U_ZERO_ERROR
;
588 // Debug - slide failing test cases early
597 // Simple pattern compilation
600 UnicodeString
re("abc");
602 pat2
= RegexPattern::compile(re
, flags
, pe
, status
);
605 UnicodeString inStr1
= "abcdef this is a test";
606 UnicodeString instr2
= "not abc";
607 UnicodeString empty
= "";
611 // Matcher creation and reset.
613 RegexMatcher
*m1
= pat2
->matcher(inStr1
, status
);
615 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
616 REGEX_ASSERT(m1
->input() == inStr1
);
618 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
619 REGEX_ASSERT(m1
->input() == instr2
);
621 REGEX_ASSERT(m1
->input() == inStr1
);
622 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
624 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
625 REGEX_ASSERT(m1
->input() == empty
);
// pattern() must hand back the very pattern object the matcher was created from.
626 REGEX_ASSERT(&m1
->pattern() == pat2
);
629 // reset(pos, status)
632 m1
->reset(4, status
);
634 REGEX_ASSERT(m1
->input() == inStr1
);
635 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
// A negative position is rejected with U_INDEX_OUTOFBOUNDS_ERROR.
637 m1
->reset(-1, status
);
638 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
639 status
= U_ZERO_ERROR
;
641 m1
->reset(0, status
);
643 status
= U_ZERO_ERROR
;
645 int32_t len
= m1
->input().length();
646 m1
->reset(len
-1, status
);
648 status
= U_ZERO_ERROR
;
// Resetting to a position equal to the input length is expected to fail.
650 m1
->reset(len
, status
);
651 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
652 status
= U_ZERO_ERROR
;
655 // match(pos, status)
658 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
660 REGEX_ASSERT(m1
->matches(3, status
) == FALSE
);
662 REGEX_ASSERT(m1
->matches(5, status
) == FALSE
);
663 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
664 REGEX_ASSERT(m1
->matches(-1, status
) == FALSE
);
665 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
667 // Match() at end of string should fail, but should not
669 status
= U_ZERO_ERROR
;
670 len
= m1
->input().length();
671 REGEX_ASSERT(m1
->matches(len
, status
) == FALSE
);
674 // Match beyond end of string should fail with an error.
675 status
= U_ZERO_ERROR
;
676 REGEX_ASSERT(m1
->matches(len
+1, status
) == FALSE
);
677 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
679 // Successful match at end of string.
681 status
= U_ZERO_ERROR
;
682 RegexMatcher
m("A?", 0, status
); // will match zero length string.
685 len
= inStr1
.length();
686 REGEX_ASSERT(m
.matches(len
, status
) == TRUE
);
689 REGEX_ASSERT(m
.matches(0, status
) == TRUE
);
695 // lookingAt(pos, status)
697 status
= U_ZERO_ERROR
;
698 m1
->reset(instr2
); // "not abc"
699 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
700 REGEX_ASSERT(m1
->lookingAt(5, status
) == FALSE
);
701 REGEX_ASSERT(m1
->lookingAt(3, status
) == FALSE
);
702 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
703 REGEX_ASSERT(m1
->lookingAt(-1, status
) == FALSE
);
704 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
705 status
= U_ZERO_ERROR
;
706 len
= m1
->input().length();
707 REGEX_ASSERT(m1
->lookingAt(len
, status
) == FALSE
);
709 REGEX_ASSERT(m1
->lookingAt(len
+1, status
) == FALSE
);
710 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
719 // RegexMatcher::start();
720 // RegexMatcher::end();
721 // RegexMatcher::groupCount();
726 UErrorCode status
=U_ZERO_ERROR
;
// Pattern with three capture groups, one of them nested.
728 UnicodeString
re("01(23(45)67)(.*)");
729 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
731 UnicodeString data
= "0123456789";
733 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
735 REGEX_ASSERT(matcher
->lookingAt(status
) == TRUE
);
// Expected start / end offsets for groups 0..3 within "0123456789".
736 int matchStarts
[] = {0, 2, 4, 8};
737 int matchEnds
[] = {10, 8, 6, 10};
739 for (i
=0; i
<4; i
++) {
740 int32_t actualStart
= matcher
->start(i
, status
);
742 if (actualStart
!= matchStarts
[i
]) {
743 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
744 __LINE__
, i
, matchStarts
[i
], actualStart
);
746 int32_t actualEnd
= matcher
->end(i
, status
);
748 if (actualEnd
!= matchEnds
[i
]) {
749 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
750 __LINE__
, i
, matchEnds
[i
], actualEnd
);
// The no-argument start()/end() must agree with group 0.
754 REGEX_ASSERT(matcher
->start(0, status
) == matcher
->start(status
));
755 REGEX_ASSERT(matcher
->end(0, status
) == matcher
->end(status
));
// Group numbers outside 0..3 are rejected.
757 REGEX_ASSERT_FAIL(matcher
->start(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
758 REGEX_ASSERT_FAIL(matcher
->start( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
760 REGEX_ASSERT_FAIL(matcher
->start( 0, status
), U_REGEX_INVALID_STATE
);
762 matcher
->lookingAt(status
);
763 REGEX_ASSERT(matcher
->group(status
) == "0123456789");
764 REGEX_ASSERT(matcher
->group(0, status
) == "0123456789");
765 REGEX_ASSERT(matcher
->group(1, status
) == "234567" );
766 REGEX_ASSERT(matcher
->group(2, status
) == "45" );
767 REGEX_ASSERT(matcher
->group(3, status
) == "89" );
769 REGEX_ASSERT_FAIL(matcher
->group(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
770 REGEX_ASSERT_FAIL(matcher
->group( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
772 REGEX_ASSERT_FAIL(matcher
->group( 0, status
), U_REGEX_INVALID_STATE
);
// find() and find(pos, status) over an input containing three matches.
785 UErrorCode status
=U_ZERO_ERROR
;
787 UnicodeString
re("abc");
788 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
790 UnicodeString data
= ".abc..abc...abc..";
791 // 012345678901234567
793 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
795 REGEX_ASSERT(matcher
->find());
796 REGEX_ASSERT(matcher
->start(status
) == 1);
797 REGEX_ASSERT(matcher
->find());
798 REGEX_ASSERT(matcher
->start(status
) == 6);
799 REGEX_ASSERT(matcher
->find());
800 REGEX_ASSERT(matcher
->start(status
) == 12);
801 REGEX_ASSERT(matcher
->find() == FALSE
);
802 REGEX_ASSERT(matcher
->find() == FALSE
);
805 REGEX_ASSERT(matcher
->find());
806 REGEX_ASSERT(matcher
->start(status
) == 1);
808 REGEX_ASSERT(matcher
->find(0, status
));
809 REGEX_ASSERT(matcher
->start(status
) == 1);
810 REGEX_ASSERT(matcher
->find(1, status
));
811 REGEX_ASSERT(matcher
->start(status
) == 1);
812 REGEX_ASSERT(matcher
->find(2, status
));
813 REGEX_ASSERT(matcher
->start(status
) == 6);
814 REGEX_ASSERT(matcher
->find(12, status
));
815 REGEX_ASSERT(matcher
->start(status
) == 12);
816 REGEX_ASSERT(matcher
->find(13, status
) == FALSE
);
817 REGEX_ASSERT(matcher
->find(16, status
) == FALSE
);
818 REGEX_ASSERT(matcher
->find(17, status
) == FALSE
);
819 REGEX_ASSERT_FAIL(matcher
->start(status
), U_REGEX_INVALID_STATE
);
821 status
= U_ZERO_ERROR
;
822 REGEX_ASSERT_FAIL(matcher
->find(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
823 status
= U_ZERO_ERROR
;
824 REGEX_ASSERT_FAIL(matcher
->find(18, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
826 REGEX_ASSERT(matcher
->groupCount() == 0);
834 // find, with \G in pattern (true if at the end of a previous match).
839 UErrorCode status
=U_ZERO_ERROR
;
841 UnicodeString
re(".*?(?:(\\Gabc)|(abc))");
842 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
844 UnicodeString data
= ".abcabc.abc..";
845 // 012345678901234567
847 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
849 REGEX_ASSERT(matcher
->find());
850 REGEX_ASSERT(matcher
->start(status
) == 0);
851 REGEX_ASSERT(matcher
->start(1, status
) == -1);
852 REGEX_ASSERT(matcher
->start(2, status
) == 1);
854 REGEX_ASSERT(matcher
->find());
855 REGEX_ASSERT(matcher
->start(status
) == 4);
856 REGEX_ASSERT(matcher
->start(1, status
) == 4);
857 REGEX_ASSERT(matcher
->start(2, status
) == -1);
865 // find with zero length matches, match position should bump ahead
870 UErrorCode status
=U_ZERO_ERROR
;
871 RegexMatcher
m("(?= ?)", 0, status
); // This pattern will produce zero-length matches anywhere,
872 // using an always-true look-ahead.
874 UnicodeString
s(" ");
877 if (m
.find() == FALSE
) {
880 REGEX_ASSERT(m
.start(status
) == i
);
881 REGEX_ASSERT(m
.end(status
) == i
);
885 // Check that the bump goes over surrogate pairs OK
886 s
= "\\U00010001\\U00010002\\U00010003\\U00010004";
890 if (m
.find() == FALSE
) {
893 REGEX_ASSERT(m
.start(status
) == i
);
894 REGEX_ASSERT(m
.end(status
) == i
);
899 // find() loop breaking test.
900 // with pattern of /.?/, should see a series of one char matches, then a single
901 // match of zero length at the end of the input string.
903 UErrorCode status
=U_ZERO_ERROR
;
904 RegexMatcher
m(".?", 0, status
);
906 UnicodeString
s(" ");
909 if (m
.find() == FALSE
) {
912 REGEX_ASSERT(m
.start(status
) == i
);
913 REGEX_ASSERT(m
.end(status
) == (i
<4 ? i
+1 : i
));
920 // Matchers with no input string behave as if they had an empty input string.
924 UErrorCode status
= U_ZERO_ERROR
;
925 RegexMatcher
m(".?", 0, status
);
927 REGEX_ASSERT(m
.find());
928 REGEX_ASSERT(m
.start(status
) == 0);
929 REGEX_ASSERT(m
.input() == "");
932 UErrorCode status
= U_ZERO_ERROR
;
933 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
934 RegexMatcher
*m
= p
->matcher(status
);
937 REGEX_ASSERT(m
->find() == FALSE
);
938 REGEX_ASSERT(m
->input() == "");
944 // Compilation error on reset with UChar *
945 // These were a hazard that people were stumbling over with runtime errors.
946 // Changed them to compiler errors by adding private methods that more closely
947 // matched the incorrect use of the functions.
// NOTE(review): the statements below are marked "should not compile", so this
// section is presumably excluded from the normal build - confirm.
951 UErrorCode status
= U_ZERO_ERROR
;
952 UChar ucharString
[20];
953 RegexMatcher
m(".", 0, status
);
954 m
.reset(ucharString
); // should not compile.
956 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
957 RegexMatcher
*m2
= p
->matcher(ucharString
, status
); // should not compile.
959 RegexMatcher
m3(".", ucharString
, 0, status
); // Should not compile
970 //---------------------------------------------------------------------------
972 // API_Replace API test for class RegexMatcher, testing the
973 // Replace family of functions.
975 //---------------------------------------------------------------------------
// Exercises the replace family of RegexMatcher functions: replaceFirst(),
// replaceAll(), capture-group references ($n) and escapes in the replacement
// text, and appendReplacement() / appendTail().
976 void RegexTest::API_Replace() {
982 UErrorCode status
=U_ZERO_ERROR
;
984 UnicodeString
re("abc");
985 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
987 UnicodeString data
= ".abc..abc...abc..";
988 // 012345678901234567
989 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
992 // Plain vanilla matches.
995 dest
= matcher
->replaceFirst("yz", status
);
997 REGEX_ASSERT(dest
== ".yz..abc...abc..");
999 dest
= matcher
->replaceAll("yz", status
);
1001 REGEX_ASSERT(dest
== ".yz..yz...yz..");
1004 // Plain vanilla non-matches.
1006 UnicodeString d2
= ".abx..abx...abx..";
1008 dest
= matcher
->replaceFirst("yz", status
);
1010 REGEX_ASSERT(dest
== ".abx..abx...abx..");
1012 dest
= matcher
->replaceAll("yz", status
);
1014 REGEX_ASSERT(dest
== ".abx..abx...abx..");
1017 // Empty source string
1019 UnicodeString d3
= "";
1021 dest
= matcher
->replaceFirst("yz", status
);
1023 REGEX_ASSERT(dest
== "");
1025 dest
= matcher
->replaceAll("yz", status
);
1027 REGEX_ASSERT(dest
== "");
1030 // Empty substitution string
1032 matcher
->reset(data
); // ".abc..abc...abc.."
1033 dest
= matcher
->replaceFirst("", status
);
1035 REGEX_ASSERT(dest
== "...abc...abc..");
1037 dest
= matcher
->replaceAll("", status
);
1039 REGEX_ASSERT(dest
== "........");
1042 // match whole string
1044 UnicodeString d4
= "abc";
1046 dest
= matcher
->replaceFirst("xyz", status
);
1048 REGEX_ASSERT(dest
== "xyz");
1050 dest
= matcher
->replaceAll("xyz", status
);
1052 REGEX_ASSERT(dest
== "xyz");
1055 // Capture Group, simple case
1057 UnicodeString
re2("a(..)");
1058 RegexPattern
*pat2
= RegexPattern::compile(re2
, flags
, pe
, status
);
1060 UnicodeString d5
= "abcdefg";
1061 RegexMatcher
*matcher2
= pat2
->matcher(d5
, status
);
1063 dest
= matcher2
->replaceFirst("$1$1", status
);
1065 REGEX_ASSERT(dest
== "bcbcdefg");
// A backslash-escaped "$" is copied literally; an unescaped $1 is substituted.
1067 dest
= matcher2
->replaceFirst("The value of \\$1 is $1.", status
);
1069 REGEX_ASSERT(dest
== "The value of $1 is bc.defg");
// A "$" that is not followed by a group number is copied through unchanged.
1071 dest
= matcher2
->replaceFirst("$ by itself, no group number $$$", status
);
1073 REGEX_ASSERT(dest
== "$ by itself, no group number $$$defg");
// The group number is written with the supplementary digit U+1D7CF; the
// expected result shows it being treated as a reference to group 1.
1075 UnicodeString replacement
= "Supplemental Digit 1 $\\U0001D7CF.";
1076 replacement
= replacement
.unescape();
1077 dest
= matcher2
->replaceFirst(replacement
, status
);
1079 REGEX_ASSERT(dest
== "Supplemental Digit 1 bc.defg");
1081 REGEX_ASSERT_FAIL(matcher2
->replaceFirst("bad capture group number $5...",status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1085 // Replacement String with \u hex escapes
1088 UnicodeString src
= "abc 1 abc 2 abc 3";
1089 UnicodeString substitute
= "--\\u0043--";
1090 matcher
->reset(src
);
1091 UnicodeString result
= matcher
->replaceAll(substitute
, status
);
1093 REGEX_ASSERT(result
== "--C-- 1 --C-- 2 --C-- 3");
// Same check with a \U escape for a supplementary code point.
1096 UnicodeString src
= "abc !";
1097 UnicodeString substitute
= "--\\U00010000--";
1098 matcher
->reset(src
);
1099 UnicodeString result
= matcher
->replaceAll(substitute
, status
);
1101 UnicodeString expected
= UnicodeString("--");
1102 expected
.append((UChar32
)0x10000);
1103 expected
.append("-- !");
1104 REGEX_ASSERT(result
== expected
);
1106 // TODO: need more thorough testing of capture substitutions.
// appendReplacement() / appendTail() bookkeeping.
1111 status
= U_ZERO_ERROR
;
1112 UnicodeString s
= "The matches start with ss and end with ee ss stuff ee fin";
1113 RegexMatcher
m("ss(.*?)ee", 0, status
);
1115 UnicodeString result
;
1117 // Multiple finds do NOT bump up the previous appendReplacement position.
1121 m
.appendReplacement(result
, "ooh", status
);
1123 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
1125 // After a reset into the interior of a string, appendReplacement still starts at beginning.
1126 status
= U_ZERO_ERROR
;
1128 m
.reset(10, status
);
1131 m
.appendReplacement(result
, "ooh", status
);
1133 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
1135 // find() at interior of string, appendReplacement still starts at beginning.
1136 status
= U_ZERO_ERROR
;
1141 m
.appendReplacement(result
, "ooh", status
);
1143 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
1145 m
.appendTail(result
);
1146 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh fin");
1157 //---------------------------------------------------------------------------
1159 // API_Pattern Test that the API for class RegexPattern is
1160 // present and nominally working.
1162 //---------------------------------------------------------------------------
1163 void RegexTest::API_Pattern() {
// Walks through the public RegexPattern API: default/copy construction and
// comparison, compile() with and without flags, clone(), the static matches()
// helper, split(), pattern(), and the class ID functions. Failures are
// reported through the REGEX_ASSERT macro.
1164 RegexPattern pata
; // Test default constructor to not crash.
// NOTE(review): patb's declaration is not among the lines shown here --
// presumably a second default-constructed pattern; confirm.
1167 REGEX_ASSERT(pata
== patb
);
1168 REGEX_ASSERT(pata
== pata
);
1170 UnicodeString
re1("abc[a-l][m-z]");
1171 UnicodeString
re2("def");
1172 UErrorCode status
= U_ZERO_ERROR
;
// Compile two different patterns, passing the flags argument explicitly as 0.
1175 RegexPattern
*pat1
= RegexPattern::compile(re1
, 0, pe
, status
);
1176 RegexPattern
*pat2
= RegexPattern::compile(re2
, 0, pe
, status
);
1178 REGEX_ASSERT(*pat1
== *pat1
);
1179 REGEX_ASSERT(*pat1
!= pata
);
// NOTE(review): expects patb to equal *pat1 -- presumably patb was assigned
// from *pat1 in a line not shown here; confirm.
1183 REGEX_ASSERT(patb
== *pat1
);
// Copy construction: the copy is expected to compare equal to its source.
1186 RegexPattern
patc(*pat1
);
1187 REGEX_ASSERT(patc
== *pat1
);
1188 REGEX_ASSERT(patb
== patc
);
// Note: this compares the two pointers themselves, not the pattern objects.
1189 REGEX_ASSERT(pat1
!= pat2
);
// NOTE(review): patb is now expected to equal *pat2 -- presumably reassigned
// from *pat2 in a line not shown here; confirm.
1191 REGEX_ASSERT(patb
!= patc
);
1192 REGEX_ASSERT(patb
== *pat2
);
1194 // Compile with no flags.
1195 RegexPattern
*pat1a
= RegexPattern::compile(re1
, pe
, status
);
1196 REGEX_ASSERT(*pat1a
== *pat1
);
1198 REGEX_ASSERT(pat1a
->flags() == 0);
1200 // Compile with different flags should be not equal
1201 RegexPattern
*pat1b
= RegexPattern::compile(re1
, UREGEX_CASE_INSENSITIVE
, pe
, status
);
1204 REGEX_ASSERT(*pat1b
!= *pat1a
);
1205 REGEX_ASSERT(pat1b
->flags() == UREGEX_CASE_INSENSITIVE
);
1206 REGEX_ASSERT(pat1a
->flags() == 0);
// clone() is expected to produce a pattern equal to its source.
1210 RegexPattern
*pat1c
= pat1
->clone();
1211 REGEX_ASSERT(*pat1c
== *pat1
);
1212 REGEX_ASSERT(*pat1c
!= *pat2
);
1221 // Verify that a matcher created from a cloned pattern works.
1225 UErrorCode status
= U_ZERO_ERROR
;
1226 RegexPattern
*pSource
= RegexPattern::compile("\\p{L}+", 0, status
);
1227 RegexPattern
*pClone
= pSource
->clone();
1229 RegexMatcher
*mFromClone
= pClone
->matcher(status
);
// The matcher built from the clone should find each run of letters in turn.
1231 UnicodeString s
= "Hello World";
1232 mFromClone
->reset(s
);
1233 REGEX_ASSERT(mFromClone
->find() == TRUE
);
1234 REGEX_ASSERT(mFromClone
->group(status
) == "Hello");
1235 REGEX_ASSERT(mFromClone
->find() == TRUE
);
1236 REGEX_ASSERT(mFromClone
->group(status
) == "World");
1237 REGEX_ASSERT(mFromClone
->find() == FALSE
);
1243 // matches convenience API
// The expectations below require the pattern to match the entire input:
// ".*u" is expected NOT to match "random input".
1245 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe
, status
) == TRUE
);
1247 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe
, status
) == FALSE
);
1249 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe
, status
) == TRUE
);
1251 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe
, status
) == TRUE
);
1253 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe
, status
) == FALSE
);
// With status already holding a failure code on entry, matches() is expected
// to return FALSE and to leave that status value untouched.
1255 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1256 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe
, status
) == FALSE
);
1257 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
// split() tests. NOTE(review): pat1 is reassigned several times from here on;
// the matching delete calls are not among the lines shown -- confirm that the
// earlier pattern objects are released.
1263 status
= U_ZERO_ERROR
;
1264 pat1
= RegexPattern::compile(" +", pe
, status
);
1266 UnicodeString fields
[10];
1269 n
= pat1
->split("Now is the time", fields
, 10, status
);
1272 REGEX_ASSERT(fields
[0]=="Now");
1273 REGEX_ASSERT(fields
[1]=="is");
1274 REGEX_ASSERT(fields
[2]=="the");
1275 REGEX_ASSERT(fields
[3]=="time");
1276 REGEX_ASSERT(fields
[4]=="");
// Output capacity of 2: the last available field is expected to receive the
// whole remainder of the input, and fields[2] must not be overwritten.
1278 n
= pat1
->split("Now is the time", fields
, 2, status
);
1281 REGEX_ASSERT(fields
[0]=="Now");
1282 REGEX_ASSERT(fields
[1]=="is the time");
1283 REGEX_ASSERT(fields
[2]=="the"); // left over from previous test
// Output capacity of 1: the single field is expected to hold the entire input.
// NOTE(review): fields[1]=="*" presumably relies on a sentinel assignment in a
// line not shown here; confirm.
1286 status
= U_ZERO_ERROR
;
1287 n
= pat1
->split("Now is the time", fields
, 1, status
);
1290 REGEX_ASSERT(fields
[0]=="Now is the time");
1291 REGEX_ASSERT(fields
[1]=="*");
1292 status
= U_ZERO_ERROR
;
// Input that begins and ends with the delimiter: an empty first field is expected.
1294 n
= pat1
->split(" Now is the time ", fields
, 10, status
);
1297 REGEX_ASSERT(fields
[0]=="");
1298 REGEX_ASSERT(fields
[1]=="Now");
1299 REGEX_ASSERT(fields
[2]=="is");
1300 REGEX_ASSERT(fields
[3]=="the");
1301 REGEX_ASSERT(fields
[4]=="time");
1302 REGEX_ASSERT(fields
[5]=="");
1304 n
= pat1
->split(" ", fields
, 10, status
);
1307 REGEX_ASSERT(fields
[0]=="");
// Empty input: fields[0] is expected to be left untouched.
// NOTE(review): "foo" is presumably a sentinel stored in a line not shown; confirm.
1310 n
= pat1
->split("", fields
, 10, status
);
1313 REGEX_ASSERT(fields
[0]=="foo");
1317 // split, with a pattern with (capture)
// The assertions below expect the text captured by the group in each
// delimiter to be emitted as its own output field, between the split pieces.
1318 pat1
= RegexPattern::compile("<(\\w*)>", pe
, status
);
1321 status
= U_ZERO_ERROR
;
1322 n
= pat1
->split("<a>Now is <b>the time<c>", fields
, 10, status
);
1325 REGEX_ASSERT(fields
[0]=="");
1326 REGEX_ASSERT(fields
[1]=="a");
1327 REGEX_ASSERT(fields
[2]=="Now is ");
1328 REGEX_ASSERT(fields
[3]=="b");
1329 REGEX_ASSERT(fields
[4]=="the time");
1330 REGEX_ASSERT(fields
[5]=="c");
1331 REGEX_ASSERT(fields
[6]=="");
1332 REGEX_ASSERT(status
==U_ZERO_ERROR
);
1334 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 10, status
);
1337 REGEX_ASSERT(fields
[0]==" ");
1338 REGEX_ASSERT(fields
[1]=="a");
1339 REGEX_ASSERT(fields
[2]=="Now is ");
1340 REGEX_ASSERT(fields
[3]=="b");
1341 REGEX_ASSERT(fields
[4]=="the time");
1342 REGEX_ASSERT(fields
[5]=="c");
1343 REGEX_ASSERT(fields
[6]=="");
1345 status
= U_ZERO_ERROR
;
// Output capacity of 6: fields[0..5] are filled and fields[6] is expected to
// stay untouched ("foo" presumably being a sentinel set in a line not shown).
1347 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 6, status
);
1350 REGEX_ASSERT(fields
[0]==" ");
1351 REGEX_ASSERT(fields
[1]=="a");
1352 REGEX_ASSERT(fields
[2]=="Now is ");
1353 REGEX_ASSERT(fields
[3]=="b");
1354 REGEX_ASSERT(fields
[4]=="the time");
1355 REGEX_ASSERT(fields
[5]=="c");
1356 REGEX_ASSERT(fields
[6]=="foo");
1358 status
= U_ZERO_ERROR
;
// Output capacity of 5: the last field is expected to keep the unsplit
// remainder, including the trailing "<c>" delimiter text.
1360 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 5, status
);
1363 REGEX_ASSERT(fields
[0]==" ");
1364 REGEX_ASSERT(fields
[1]=="a");
1365 REGEX_ASSERT(fields
[2]=="Now is ");
1366 REGEX_ASSERT(fields
[3]=="b");
1367 REGEX_ASSERT(fields
[4]=="the time<c>");
1368 REGEX_ASSERT(fields
[5]=="foo");
1370 status
= U_ZERO_ERROR
;
1372 n
= pat1
->split(" <a>Now is <b>the time", fields
, 5, status
);
1375 REGEX_ASSERT(fields
[0]==" ");
1376 REGEX_ASSERT(fields
[1]=="a");
1377 REGEX_ASSERT(fields
[2]=="Now is ");
1378 REGEX_ASSERT(fields
[3]=="b");
1379 REGEX_ASSERT(fields
[4]=="the time");
1380 REGEX_ASSERT(fields
[5]=="foo");
1382 status
= U_ZERO_ERROR
;
// Output capacity of 4: the second delimiter's capture ("b") is not emitted;
// the last field is expected to hold "the time<c>".
1383 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 4, status
);
1386 REGEX_ASSERT(fields
[0]==" ");
1387 REGEX_ASSERT(fields
[1]=="a");
1388 REGEX_ASSERT(fields
[2]=="Now is ");
1389 REGEX_ASSERT(fields
[3]=="the time<c>");
1390 status
= U_ZERO_ERROR
;
// A delimiter that is entirely one capture group: each delimiter character is
// expected to show up as a field of its own.
1393 pat1
= RegexPattern::compile("([-,])", pe
, status
);
1395 n
= pat1
->split("1-10,20", fields
, 10, status
);
1398 REGEX_ASSERT(fields
[0]=="1");
1399 REGEX_ASSERT(fields
[1]=="-");
1400 REGEX_ASSERT(fields
[2]=="10");
1401 REGEX_ASSERT(fields
[3]==",");
1402 REGEX_ASSERT(fields
[4]=="20");
1407 // RegexPattern::pattern()
// A default-constructed pattern is expected to report an empty pattern string;
// a compiled one is expected to return the original pattern text.
1409 pat1
= new RegexPattern();
1410 REGEX_ASSERT(pat1
->pattern() == "");
1413 pat1
= RegexPattern::compile("(Hello, world)*", pe
, status
);
1415 REGEX_ASSERT(pat1
->pattern() == "(Hello, world)*");
1420 // classID functions
// The dynamic class ID of each object must equal the static class ID of its
// own class, be non-NULL, and differ between RegexPattern and RegexMatcher.
1422 pat1
= RegexPattern::compile("(Hello, world)*", pe
, status
);
1424 REGEX_ASSERT(pat1
->getDynamicClassID() == RegexPattern::getStaticClassID());
1425 REGEX_ASSERT(pat1
->getDynamicClassID() != NULL
);
1426 UnicodeString
Hello("Hello, world.");
1427 RegexMatcher
*m
= pat1
->matcher(Hello
, status
);
1428 REGEX_ASSERT(pat1
->getDynamicClassID() != m
->getDynamicClassID());
1429 REGEX_ASSERT(m
->getDynamicClassID() == RegexMatcher::getStaticClassID());
1430 REGEX_ASSERT(m
->getDynamicClassID() != NULL
);
1436 //---------------------------------------------------------------------------
1438 // Extended A more thorough check for features of regex patterns
1439 // The test cases are in a separate data file,
1440 // source/tests/testdata/regextst.txt
1441 // A description of the test data format is included in that file.
1443 //---------------------------------------------------------------------------
1446 RegexTest::getPath(char buffer
[2048], const char *filename
) {
// Builds the path of a test data file in the caller-supplied buffer: the
// directory returned by IntlTest::getSourceTestData() is copied into buffer
// and filename is appended to it. An error is reported through errln() when
// the directory lookup fails.
// NOTE(review): strcpy/strcat perform no length check against the 2048-byte
// buffer -- this assumes directory + filename always fits; confirm.
1447 UErrorCode status
=U_ZERO_ERROR
;
1448 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
1449 if (U_FAILURE(status
)) {
// NOTE(review): the message names loadTestData() although the call that
// failed here is getSourceTestData().
1450 errln("ERROR: loadTestData() failed - %s", u_errorName(status
));
1454 strcpy(buffer
, testDataDirectory
);
1455 strcat(buffer
, filename
);
1459 void RegexTest::Extended() {
// Data-driven test: reads regextst.txt, splits every non-comment line into a
// quoted pattern, a flags field and a quoted match string, and hands the three
// pieces plus the line number to regex_find().
1461 const char *srcPath
;
1462 UErrorCode status
= U_ZERO_ERROR
;
1463 int32_t lineNum
= 0;
1466 // Open and read the test data file.
1468 srcPath
=getPath(tdd
, "regextst.txt");
1470 return; /* something went wrong, error already output */
1474 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, status
);
1475 if (U_FAILURE(status
)) {
1476 return; /* something went wrong, error already output */
1480 // Put the test data into a UnicodeString
1482 UnicodeString
testString(FALSE
, testData
, len
);
// quotedStuffMat: optional white space, an opening delimiter (' or " or /) in
//                 group 1, the shortest possible content in group 2, then the
//                 same delimiter again (back reference \1).
// commentMat:     optional white space followed by an optional '#' comment
//                 running to the end of the text.
// flagsMat:       optional white space, the flag characters in group 1, and
//                 whatever the second group matches in group 2; a non-empty
//                 group 2 is reported as a bad flag further down.
1484 RegexMatcher
quotedStuffMat("\\s*([\\'\\\"/])(.*?)\\1", 0, status
);
1485 RegexMatcher
commentMat ("\\s*(#.*)?$", 0, status
);
1486 RegexMatcher
flagsMat ("\\s*([ixsmdtGv2-9]*)([:letter:]*)", 0, status
);
// lineMat walks the file one line per match; group 1 is the line content
// without its terminating "\n" (and without an optional preceding "\r").
1488 RegexMatcher
lineMat("(.*?)\\r?\\n", testString
, 0, status
);
1489 UnicodeString testPattern
; // The pattern for test from the test file.
1490 UnicodeString testFlags
; // the flags for a test.
1491 UnicodeString matchString
; // The marked up string to be used as input
1496 // Loop over the test data file, once per line.
1498 while (lineMat
.find()) {
1500 if (U_FAILURE(status
)) {
1501 errln("line %d: ICU Error \"%s\"", lineNum
, u_errorName(status
));
// Clear any error so that one bad line does not affect the following lines.
1504 status
= U_ZERO_ERROR
;
1505 UnicodeString testLine
= lineMat
.group(1, status
);
1506 if (testLine
.length() == 0) {
1511 // Parse the test line. Skip blank and comment only lines.
1512 // Separate out the three main fields - pattern, flags, target.
1515 commentMat
.reset(testLine
);
1516 if (commentMat
.lookingAt(status
)) {
1517 // This line is a comment, or blank.
1522 // Pull out the pattern field, remove it from the test file line.
1524 quotedStuffMat
.reset(testLine
);
1525 if (quotedStuffMat
.lookingAt(status
)) {
1526 testPattern
= quotedStuffMat
.group(2, status
);
// Drop everything up to the end of the match so that the next field
// starts at the beginning of testLine.
1527 testLine
.remove(0, quotedStuffMat
.end(0, status
));
1529 errln("Bad pattern (missing quotes?) at test file line %d", lineNum
);
1535 // Pull out the flags from the test file line.
1537 flagsMat
.reset(testLine
);
1538 flagsMat
.lookingAt(status
); // Will always match, possibly an empty string.
1539 testFlags
= flagsMat
.group(1, status
);
1540 if (flagsMat
.group(2, status
).length() > 0) {
1541 errln("Bad Match flag at line %d. Scanning %c\n",
1542 lineNum
, flagsMat
.group(2, status
).charAt(0));
1545 testLine
.remove(0, flagsMat
.end(0, status
));
1548 // Pull out the match string, as a whole.
1549 // We'll process the <tags> later.
1551 quotedStuffMat
.reset(testLine
);
1552 if (quotedStuffMat
.lookingAt(status
)) {
1553 matchString
= quotedStuffMat
.group(2, status
);
1554 testLine
.remove(0, quotedStuffMat
.end(0, status
));
1556 errln("Bad match string at test file line %d", lineNum
);
1561 // The only thing left from the input line should be an optional trailing comment.
1563 commentMat
.reset(testLine
);
1564 if (commentMat
.lookingAt(status
) == FALSE
) {
1565 errln("Line %d: unexpected characters at end of test line.", lineNum
);
// All three fields parsed successfully: run the actual test for this line.
1572 regex_find(testPattern
, testFlags
, matchString
, lineNum
);
1581 //---------------------------------------------------------------------------
1583 // Errors Check for error handling in patterns.
1585 //---------------------------------------------------------------------------
1586 void RegexTest::Errors() {
// Feeds deliberately malformed patterns to the regex compiler and checks the
// reported error.
// NOTE(review): REGEX_ERR is defined outside the lines shown here; its
// arguments appear to be (pattern, expected error line, expected error column,
// expected status code) -- confirm against the macro definition.
1587 // \escape sequences that aren't implemented yet.
1588 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
1590 // Missing close parentheses
1591 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN
);
1592 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN
);
1593 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN
);
1595 // Extra close paren
1596 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN
);
1597 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN
);
1598 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN
);
1600 // Look-ahead, Look-behind
1601 // TODO: add tests for unbounded length look-behinds.
1602 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX
); // illegal construct
1604 // Attempt to use non-default flags
// Compiling with this flag combination is expected to fail with
// U_REGEX_UNIMPLEMENTED. NOTE(review): the tail of the flags expression is
// not among the lines shown here.
1607 UErrorCode status
= U_ZERO_ERROR
;
1608 int32_t flags
= UREGEX_CANON_EQ
|
1609 UREGEX_COMMENTS
| UREGEX_DOTALL
|
1611 RegexPattern
*pat1
= RegexPattern::compile(".*", flags
, pe
, status
);
1612 REGEX_ASSERT(status
== U_REGEX_UNIMPLEMENTED
);
1617 // Quantifiers are allowed only after something that can be quantified.
1618 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX
);
// The pattern below contains a new-line, so the error is expected on line 2.
1619 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX
);
1620 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX
);
1622 // Mal-formed {min,max} quantifiers
1623 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL
);
1624 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN
);
1625 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL
);
1626 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL
);
1627 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL
);
1628 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG
);
1629 REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG
); // Overflows int during scan
1630 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG
); // Overflows regex binary format
1631 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG
);
1634 // UnicodeSet containing a string
1635 REGEX_ERR("abc[{def}]xyz", 1, 10, U_REGEX_SET_CONTAINS_STRING
);
1640 //-------------------------------------------------------------------------------
1642 // Read a text data file, convert it to UChars, and return the data
1643 // in one big UChar * buffer, which the caller must delete.
1645 //--------------------------------------------------------------------------------
1646 UChar
*RegexTest::ReadAndConvertFile(const char *fileName
, int &ulen
, UErrorCode
&status
) {
// Reads the whole of fileName in binary mode, looks for a Unicode signature
// (BOM) at its start, and converts the bytes to UChars with an ICU converter.
//   fileName  path of the file to read.
//   ulen      receives the length reported by ucnv_toUChars().
//   status    ICU error code; set to U_FILE_ACCESS_ERROR when the file
//             cannot be opened.
// The resulting buffer is allocated with new[].
1647 UChar
*retPtr
= NULL
;
1648 char *fileBuf
= NULL
;
1649 UConverter
* conv
= NULL
;
1653 if (U_FAILURE(status
)) {
1660 f
= fopen(fileName
, "rb");
1662 errln("Error opening test data file %s\n", fileName
);
1663 status
= U_FILE_ACCESS_ERROR
;
// Determine the file size by seeking to the end, then rewind and read the
// complete file into a freshly allocated byte buffer.
1672 fseek( f
, 0, SEEK_END
);
1673 fileSize
= ftell(f
);
1674 fileBuf
= new char[fileSize
];
1675 fseek(f
, 0, SEEK_SET
);
1676 amt_read
= fread(fileBuf
, 1, fileSize
, f
);
// A short read or an empty file is treated as an error.
1677 if (amt_read
!= fileSize
|| fileSize
<= 0) {
1678 errln("Error reading test data file.");
1679 goto cleanUpAndReturn
;
1683 // Look for a Unicode Signature (BOM) on the data just read
1685 int32_t signatureLength
;
1686 const char * fileBufC
;
1687 const char* encoding
;
1690 encoding
= ucnv_detectUnicodeSignature(
1691 fileBuf
, fileSize
, &signatureLength
, &status
);
// When a signature was recognized, skip over it: advance the data pointer
// and shrink the byte count by the signature length.
// NOTE(review): fileBufC is presumably initialized from fileBuf in a line
// not shown here; confirm.
1692 if(encoding
!=NULL
){
1693 fileBufC
+= signatureLength
;
1694 fileSize
-= signatureLength
;
1698 // Open a converter to take the rule file to UTF-16
// NOTE(review): encoding is NULL here when no signature was detected --
// presumably ucnv_open() then falls back to the default converter; confirm.
1700 conv
= ucnv_open(encoding
, &status
);
1701 if (U_FAILURE(status
)) {
1702 goto cleanUpAndReturn
;
1706 // Convert the rules to UChar.
1707 // Preflight first to determine required buffer size.
1709 ulen
= ucnv_toUChars(conv
,
1715 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
1716 // Buffer Overflow is expected from the preflight operation.
1717 status
= U_ZERO_ERROR
;
// One element more than the preflighted length is allocated --
// presumably room for a terminating NUL; confirm.
1719 retPtr
= new UChar
[ulen
+1];
1732 if (U_FAILURE(status
)) {
1733 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
1742 //-------------------------------------------------------------------------------
1744 // PerlTests - Run Perl's regular expression tests
1745 // The input file for this test is re_tests, the standard regular
1746 // expression test data distributed with the Perl source code.
1748 // Here is Perl's description of the test data file:
1750 // # The tests are in a separate file 't/op/re_tests'.
1751 // # Each line in that file is a separate test.
1752 // # There are five columns, separated by tabs.
1754 // # Column 1 contains the pattern, optionally enclosed in C<''>.
1755 // # Modifiers can be put after the closing C<'>.
1757 // # Column 2 contains the string to be matched.
1759 // # Column 3 contains the expected result:
1760 // # y expect a match
1761 // # n expect no match
1762 // # c expect an error
1763 // # B test exposes a known bug in Perl, should be skipped
1764 // # b test exposes a known bug in Perl, should be skipped if noamp
1766 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
1768 // # Column 4 contains a string, usually C<$&>.
1770 // # Column 5 contains the expected result of double-quote
1771 // # interpolating that string after the match, or start of error message.
1773 // # Column 6, if present, contains a reason why the test is skipped.
1774 // # This is printed with "skipped", for harness to pick up.
1776 // # \n in the tests are interpolated, as are variables of the form ${\w+}.
1778 // # If you want to add a regular expression test that can't be expressed
1779 // # in this format, don't add it here: put it in op/pat.t instead.
1781 // For ICU, if field 3 contains an 'i', the test will be skipped.
1782 // The test exposes some known incompatibility between ICU and Perl regexps.
1783 // (The i is in addition to whatever was there before.)
1785 //-------------------------------------------------------------------------------
1786 void RegexTest::PerlTests() {
// Runs the Perl regular expression test data (re_tests.txt) against ICU.
// Each line is split into tab-separated fields, the pattern is compiled with
// the requested flags, matched against the test string, and the Perl
// expression in the fourth field is evaluated by hand and compared with the
// expected result in the fifth field.
1788 const char *srcPath
;
1789 UErrorCode status
= U_ZERO_ERROR
;
1793 // Open and read the test data file.
1795 srcPath
=getPath(tdd
, "re_tests.txt");
1797 return; /* something went wrong, error already output */
1801 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, status
);
1802 if (U_FAILURE(status
)) {
1803 return; /* something went wrong, error already output */
1807 // Put the test data into a UnicodeString
1809 UnicodeString
testDataString(FALSE
, testData
, len
);
1812 // Regex to break the input file into lines, and strip the new lines.
1813 // One line per match, capture group one is the desired data.
1815 RegexPattern
* linePat
= RegexPattern::compile("(.+?)[\\r\\n]+", 0, pe
, status
);
1816 RegexMatcher
* lineMat
= linePat
->matcher(testDataString
, status
);
1819 // Regex to split a test file line into fields.
1820 // There are six fields, separated by tabs.
1822 RegexPattern
* fieldPat
= RegexPattern::compile("\\t", 0, pe
, status
);
1825 // Regex to identify test patterns with flag settings, and to separate them.
1826 // Test patterns with flags look like 'pattern'i
1827 // Test patterns without flags are not quoted: pattern
1828 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
1830 RegexPattern
*flagPat
= RegexPattern::compile("('?)(.*)\\1(.*)", 0, pe
, status
);
1831 RegexMatcher
* flagMat
= flagPat
->matcher(status
);
1834 // The Perl tests reference several perl-isms, which are evaluated/substituted
1835 // in the test data. Not being perl, this must be done explicitly. Here
1836 // are string constants and REs for these constructs.
1838 UnicodeString
nulnulSrc("${nulnul}");
1839 UnicodeString
nulnul("\\u0000\\u0000");
1840 nulnul
= nulnul
.unescape();
1842 UnicodeString
ffffSrc("${ffff}");
1843 UnicodeString
ffff("\\uffff");
1844 ffff
= ffff
.unescape();
1846 // regexp for $-[0], $+[2], etc.
1847 RegexPattern
*groupsPat
= RegexPattern::compile("\\$([+\\-])\\[(\\d+)\\]", 0, pe
, status
);
1848 RegexMatcher
*groupsMat
= groupsPat
->matcher(status
);
1850 // regexp for $0, $1, $2, etc.
1851 RegexPattern
*cgPat
= RegexPattern::compile("\\$(\\d+)", 0, pe
, status
);
1852 RegexMatcher
*cgMat
= cgPat
->matcher(status
);
1856 // Main Loop for the Perl Tests, runs once per line from the
1859 int32_t lineNum
= 0;
1860 int32_t skippedUnimplementedCount
= 0;
1861 while (lineMat
->find()) {
1865 // Get a line, break it into its fields, do the Perl
1866 // variable substitutions.
1868 UnicodeString line
= lineMat
->group(1, status
);
// NOTE(review): the array has room for 7 fields although the format
// description speaks of six -- presumably a spare slot; confirm.
1869 UnicodeString fields
[7];
1870 fieldPat
->split(line
, fields
, 7, status
);
1872 flagMat
->reset(fields
[0]);
1873 flagMat
->matches(status
);
1874 UnicodeString pattern
= flagMat
->group(2, status
);
1875 pattern
.findAndReplace("${bang}", "!");
// In the pattern, ${nulnul} is replaced by the escape-sequence text itself
// rather than by the unescaped nulnul string used for the match data below.
1876 pattern
.findAndReplace(nulnulSrc
, "\\u0000\\u0000");
1877 pattern
.findAndReplace(ffffSrc
, ffff
);
1880 // Identify patterns that include match flag settings,
1881 // split off the flags, remove the extra quotes.
1883 UnicodeString flagStr
= flagMat
->group(3, status
);
1884 if (U_FAILURE(status
)) {
// NOTE(review): the message names ucnv_toUChars, but the status checked
// here was produced by the regex calls above.
1885 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
1889 const UChar UChar_c
= 0x63; // Char constants for the flag letters.
1890 const UChar UChar_i
= 0x69; // (Damn the lack of Unicode support in C)
1891 const UChar UChar_m
= 0x6d;
1892 const UChar UChar_x
= 0x78;
1893 const UChar UChar_y
= 0x79;
// Translate the Perl flag letters i, m and x into the matching ICU flag bits.
1894 if (flagStr
.indexOf(UChar_i
) != -1) {
1895 flags
|= UREGEX_CASE_INSENSITIVE
;
1897 if (flagStr
.indexOf(UChar_m
) != -1) {
1898 flags
|= UREGEX_MULTILINE
;
1900 if (flagStr
.indexOf(UChar_x
) != -1) {
1901 flags
|= UREGEX_COMMENTS
;
1905 // Compile the test pattern.
1907 status
= U_ZERO_ERROR
;
1908 RegexPattern
*testPat
= RegexPattern::compile(pattern
, flags
, pe
, status
);
1909 if (status
== U_REGEX_UNIMPLEMENTED
) {
1911 // Test of a feature that is planned for ICU, but not yet implemented.
1913 skippedUnimplementedCount
++;
1915 status
= U_ZERO_ERROR
;
1919 if (U_FAILURE(status
)) {
1920 // Some tests are supposed to generate errors.
1921 // Only report an error for tests that are supposed to succeed.
1922 if (fields
[2].indexOf(UChar_c
) == -1 && // Compilation is not supposed to fail AND
1923 fields
[2].indexOf(UChar_i
) == -1) // it's not an accepted ICU incompatibility
1925 errln("line %d: ICU Error \"%s\"\n", lineNum
, u_errorName(status
));
1927 status
= U_ZERO_ERROR
;
1932 if (fields
[2].indexOf(UChar_i
) >= 0) {
1933 // ICU should skip this test.
1938 if (fields
[2].indexOf(UChar_c
) >= 0) {
1939 // This pattern should have caused a compilation error, but didn't.
1940 errln("line %d: Expected a pattern compile error, got success.", lineNum
);
1946 // replace the Perl variables that appear in some of the
1947 // match data strings.
1949 UnicodeString matchString
= fields
[1];
1950 matchString
.findAndReplace(nulnulSrc
, nulnul
);
1951 matchString
.findAndReplace(ffffSrc
, ffff
);
1953 // Replace any \n in the match string with an actual new-line char.
1954 // Don't do full unescape, as this unescapes more than Perl does, which
1955 // causes other spurious failures in the tests.
1956 matchString
.findAndReplace("\\n", "\n");
1961 // Run the test, check for expected match/don't match result.
1963 RegexMatcher
*testMat
= testPat
->matcher(matchString
, status
);
1964 UBool found
= testMat
->find();
1965 UBool expected
= FALSE
;
1966 if (fields
[2].indexOf(UChar_y
) >=0) {
1969 if (expected
!= found
) {
1970 errln("line %d: Expected %smatch, got %smatch",
1971 lineNum
, expected
?"":"no ", found
?"":"no " );
1976 // Interpret the Perl expression from the fourth field of the data file,
1977 // building up an ICU string from the results of the ICU match.
1978 // The Perl expression will contain references to the results of
1979 // a regex match, including the matched string, capture group strings,
1980 // group starting and ending indices, etc.
1982 UnicodeString resultString
;
1983 UnicodeString perlExpr
= fields
[3];
1984 groupsMat
->reset(perlExpr
);
1985 cgMat
->reset(perlExpr
);
// Consume perlExpr from the front, one recognized construct per iteration,
// appending its evaluated value to resultString.
1987 while (perlExpr
.length() > 0) {
// $& : the text of the entire match.
1988 if (perlExpr
.startsWith("$&")) {
1989 resultString
.append(testMat
->group(status
));
1990 perlExpr
.remove(0, 2);
// $+[n] / $-[n] : end / start position of capture group n.
1993 else if (groupsMat
->lookingAt(status
)) {
1995 UnicodeString digitString
= groupsMat
->group(2, status
);
1997 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
1998 UnicodeString plusOrMinus
= groupsMat
->group(1, status
);
1999 int32_t matchPosition
;
2000 if (plusOrMinus
.compare("+") == 0) {
2001 matchPosition
= testMat
->end(groupNum
, status
);
2003 matchPosition
= testMat
->start(groupNum
, status
);
// A position of -1 contributes nothing to the result string.
2005 if (matchPosition
!= -1) {
2006 ICU_Utility::appendNumber(resultString
, matchPosition
);
2008 perlExpr
.remove(0, groupsMat
->end(status
));
// $n : the text of capture group n.
2011 else if (cgMat
->lookingAt(status
)) {
2013 UnicodeString digitString
= cgMat
->group(1, status
);
2015 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
2016 if (U_SUCCESS(status
)) {
2017 resultString
.append(testMat
->group(groupNum
, status
));
// Any error raised by the group() call above is discarded here.
2018 status
= U_ZERO_ERROR
;
2020 perlExpr
.remove(0, cgMat
->end(status
));
// @- : the start positions of group 0 and every capture group.
2023 else if (perlExpr
.startsWith("@-")) {
2025 for (i
=0; i
<=testMat
->groupCount(); i
++) {
2027 resultString
.append(" ");
2029 ICU_Utility::appendNumber(resultString
, testMat
->start(i
, status
));
2031 perlExpr
.remove(0, 2);
// @+ : the end positions of group 0 and every capture group.
2034 else if (perlExpr
.startsWith("@+")) {
2036 for (i
=0; i
<=testMat
->groupCount(); i
++) {
2038 resultString
.append(" ");
2040 ICU_Utility::appendNumber(resultString
, testMat
->end(i
, status
));
2042 perlExpr
.remove(0, 2);
2045 else if (perlExpr
.startsWith("\\")) { // \Escape. Take following char as a literal.
2046 // or as an escaped sequence (e.g. \n)
2047 if (perlExpr
.length() > 1) {
2048 perlExpr
.remove(0, 1); // Remove the '\', but only if not last char.
2050 UChar c
= perlExpr
.charAt(0);
2052 case 'n': c
= '\n'; break;
2053 // add any other escape sequences that show up in the test expected results.
2055 resultString
.append(c
);
2056 perlExpr
.remove(0, 1);
2060 // Any characters from the perl expression that we don't explicitly
2061 // recognize before here are assumed to be literals and copied
2062 // as-is to the expected results.
2063 resultString
.append(perlExpr
.charAt(0));
2064 perlExpr
.remove(0, 1);
2067 if (U_FAILURE(status
)) {
2068 errln("Line %d: ICU Error \"%s\"", lineNum
, u_errorName(status
));
2074 // Expected Results Compare
// The expected text receives the same substitutions as the match string
// (${nulnul}, ${ffff} and the two-character sequence \n) before comparing.
2076 UnicodeString
expectedS(fields
[4]);
2077 expectedS
.findAndReplace(nulnulSrc
, nulnul
);
2078 expectedS
.findAndReplace(ffffSrc
, ffff
);
2079 expectedS
.findAndReplace("\\n", "\n");
2082 if (expectedS
.compare(resultString
) != 0) {
2083 errln("Line %d: Incorrect perl expression results. Expected \"%s\"; got \"%s\"",
2084 lineNum
, (const char *)CharString(expectedS
, 0),
2085 (const char *)CharString(resultString
, 0));
2093 // All done. Clean up allocated stuff.
2111 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount
);
2117 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */