1 /********************************************************************
3 * Copyright (c) 2002-2003, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
10 // ICU Regular Expressions test, part of intltest.
13 #include "unicode/utypes.h"
14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
16 #include "unicode/uchar.h"
17 #include "unicode/ucnv.h"
26 //---------------------------------------------------------------------------
28 // Test class boilerplate
30 //---------------------------------------------------------------------------
// Constructor of the RegexTest test class. Only the signature is visible in this excerpt.
31 RegexTest::RegexTest()
// Destructor of the RegexTest test class. Only the signature is visible in this excerpt.
36 RegexTest::~RegexTest()
// Test dispatcher. For each index handled here (cases 0-6 are visible) it stores the
// test's display name in `name`; for the cases whose call is visible (API_Match,
// API_Replace, API_Pattern, PerlTests) it also runs the test when `exec` is set.
// The fourth parameter is unused (its name is commented out).
42 void RegexTest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* /*par*/ )
// Announce the suite only when tests are really being executed.
44 if (exec
) logln("TestSuite RegexTest: ");
// NOTE(review): the `if (exec) ...` line for this case is not visible in this excerpt.
47 case 0: name
= "Basic";
50 case 1: name
= "API_Match";
51 if (exec
) API_Match();
53 case 2: name
= "API_Replace";
54 if (exec
) API_Replace();
56 case 3: name
= "API_Pattern";
57 if (exec
) API_Pattern();
// NOTE(review): the `if (exec) ...` lines for cases 4 and 5 are not visible in this excerpt.
59 case 4: name
= "Extended";
62 case 5: name
= "Errors";
65 case 6: name
= "PerlTests";
66 if (exec
) PerlTests();
71 break; //needed to end loop
76 //---------------------------------------------------------------------------
78 // Error Checking / Reporting macros used in all of the tests.
80 //---------------------------------------------------------------------------
// If `status` (a variable expected in the calling scope) holds a failure code, report it
// with the current source line and the ICU error name, then return from the enclosing function.
81 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {errln("RegexTest failure at line %d. status=%s\n", \
82 __LINE__, u_errorName(status)); return;}}
// Report a failure tagged with the current source line when `expr` compares equal to FALSE.
// Execution continues afterwards; this macro does not return.
84 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("RegexTest failure at line %d.\n", __LINE__);};}
// Negative test helper: declares a fresh local `status` (hiding any outer variable of that
// name inside the macro's braces), evaluates `expr` -- which is expected to use `status` --
// and reports a failure when the resulting status differs from `errcode`.
86 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
87 if (status!=errcode) {errln("RegexTest failure at line %d. Expected status=%s, got %s\n", \
88 __LINE__, u_errorName(errcode), u_errorName(status));};}
// Variant of REGEX_CHECK_STATUS for helpers that receive the caller's line number:
// reports both this source line and `line` when `status` holds a failure code.
// NOTE(review): unlike REGEX_CHECK_STATUS this does not return, and it prints the status
// as a number (%d) rather than through u_errorName() -- confirm both are intended.
90 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
91 "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }}
// Variant of REGEX_ASSERT for helpers that receive the caller's line number: reports this
// source line and `line` when `expr` compares equal to FALSE, then returns from the
// enclosing function.
// NOTE(review): the `return` bypasses any cleanup label in the calling function
// (regex_find uses `goto cleanupAndReturn` elsewhere) -- confirm nothing is leaked.
93 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
94 errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
98 //---------------------------------------------------------------------------
100 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests
101 // for the LookingAt() and Match() functions.
104 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
106 // The expected results are UBool - TRUE or FALSE.
107 // The input text is unescaped. The pattern is not.
110 //---------------------------------------------------------------------------
// Forwards to doRegexLMTest(), appending the invoking source line so failures can be
// traced back to the test case. The expansion already ends with a semicolon.
112 #define REGEX_TESTLM(pat, text, looking, match) doRegexLMTest(pat, text, looking, match, __LINE__);
// Runs one lookingAt()/matches() check.
//   pat     - regular expression source; used as-is (not unescaped).
//   text    - input text; unescaped before matching.
//   looking - expected result of lookingAt().
//   match   - expected result of matches().
//   line    - source line of the invoking test case, echoed in every error message.
// Declared to return UBool; the return statements are not visible in this excerpt.
114 UBool
RegexTest::doRegexLMTest(const char *pat
, const char *text
, UBool looking
, UBool match
, int line
) {
// NOTE(review): `pattern` is not referenced again in the visible lines; `patString`
// below is what gets compiled.
115 const UnicodeString
pattern(pat
);
116 const UnicodeString
inputText(text
);
117 UErrorCode status
= U_ZERO_ERROR
;
119 RegexPattern
*REPattern
= NULL
;
120 RegexMatcher
*REMatcher
= NULL
;
// Compile the pattern with no option flags. `pe` receives parse-error details; its
// declaration is not visible in this excerpt.
123 UnicodeString
patString(pat
);
124 REPattern
= RegexPattern::compile(patString
, 0, pe
, status
);
125 if (U_FAILURE(status
)) {
126 errln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s\n",
127 line
, u_errorName(status
));
// Debugging aid: dumps the compiled pattern for the test case invoked from source line 376.
130 if (line
==376) { REPattern
->dump();}
// Create the matcher over the unescaped input text.
132 UnicodeString
inputString(inputText
);
133 UnicodeString unEscapedInput
= inputString
.unescape();
134 REMatcher
= REPattern
->matcher(unEscapedInput
, status
);
135 if (U_FAILURE(status
)) {
136 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
137 line
, u_errorName(status
));
// Check lookingAt() against the expected value. The declaration of `actualmatch` is not
// visible in this excerpt.
142 actualmatch
= REMatcher
->lookingAt(status
);
143 if (U_FAILURE(status
)) {
144 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
145 line
, u_errorName(status
));
148 if (actualmatch
!= looking
) {
149 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line
);
// Clear any error left by lookingAt(), then check matches() against the expected value.
153 status
= U_ZERO_ERROR
;
154 actualmatch
= REMatcher
->matches(status
);
155 if (U_FAILURE(status
)) {
156 errln("RegexTest failure in matches() at line %d. Status = %s\n",
157 line
, u_errorName(status
));
160 if (actualmatch
!= match
) {
161 errln("RegexTest: wrong return from matches() at line %d.\n", line
);
// NOTE(review): `retVal` is declared and assigned on lines not visible in this excerpt,
// as is the body of this branch.
165 if (retVal
== FALSE
) {
177 //---------------------------------------------------------------------------
179 // regex_find(pattern, inputString, lineNumber)
181 // function to simplify writing regex tests.
183 // The input text is unescaped. The pattern is not.
184 // The input text is marked with the expected match positions
185 // <0>text <1> more text </1> </0>
186 // The <n> </n> tags are removed before trying the match.
187 // The tags mark the start and end of the match and of any capture groups.
190 //---------------------------------------------------------------------------
193 // Set a value into a UVector at position specified by a decimal number in
194 // a UnicodeString. This is a utility function needed by the actual test function (regex_find).
// Stores `val` in `vec` at the position named by the decimal digits in `index`,
// first padding `vec` with -1 entries until that position exists.
//   vec   - vector to update (grown as needed).
//   val   - value to store.
//   index - position, written as a string of decimal digits.
196 static void set(UVector
&vec
, int val
, UnicodeString index
) {
197 UErrorCode status
=U_ZERO_ERROR
;
// Walk the digits of `index`. `idx` is declared and updated on lines not visible in this
// excerpt; presumably it accumulates the decimal value of the digits -- TODO confirm.
199 for (int i
=0; i
<index
.length(); i
++) {
200 int d
=u_charDigitValue(index
.charAt(i
));
// Pad with -1 until slot `idx` exists.
204 while (vec
.size()<idx
+1) {vec
.addElement(-1, status
);}
205 vec
.setElementAt(val
, idx
);
// Runs a single find() test described by a tagged input string.
//   pattern     - the regular expression under test.
//   flags       - option letters. The visible code handles: i, x, s, m (mapped to UREGEX_*
//                 option bits), d (dump the compiled pattern), t (trace the find) and
//                 G (check match / no-match only, skip capture-group positions).
//   inputString - input text marked up with <n> ... </n> tags giving the expected start and
//                 end of group n; it is unescaped and the tags are stripped before matching.
// A `line` value is used in every error message; its parameter declaration is not visible
// in this excerpt. Failures are reported through errln().
208 void RegexTest::regex_find(const UnicodeString
&pattern
,
209 const UnicodeString
&flags
,
210 const UnicodeString
&inputString
,
212 UnicodeString unEscapedInput
;
213 UnicodeString deTaggedInput
;
215 UErrorCode status
= U_ZERO_ERROR
;
217 RegexPattern
*parsePat
= NULL
;
218 RegexMatcher
*parseMatcher
= NULL
;
219 RegexPattern
*callerPattern
= NULL
;
220 RegexMatcher
*matcher
= NULL
;
// Expected start / end offsets per group number, filled in from the tags.
221 UVector
groupStarts(status
);
222 UVector
groupEnds(status
);
224 UBool failed
= FALSE
;
227 // Compile the caller's pattern
// Translate flag letters into option bits. The declaration of `bflags` is not visible
// in this excerpt.
230 if (flags
.indexOf((UChar
)0x69) >= 0) { // 'i' flag
231 bflags
|= UREGEX_CASE_INSENSITIVE
;
233 if (flags
.indexOf((UChar
)0x78) >= 0) { // 'x' flag
234 bflags
|= UREGEX_COMMENTS
;
236 if (flags
.indexOf((UChar
)0x73) >= 0) { // 's' flag
237 bflags
|= UREGEX_DOTALL
;
239 if (flags
.indexOf((UChar
)0x6d) >= 0) { // 'm' flag
240 bflags
|= UREGEX_MULTILINE
;
244 callerPattern
= RegexPattern::compile(pattern
, bflags
, pe
, status
);
// Any status other than U_ZERO_ERROR (warnings included) is treated as a compile failure.
245 if (status
!= U_ZERO_ERROR
) {
246 errln("Line %d: error %s compiling pattern.", line
, u_errorName(status
));
247 goto cleanupAndReturn
;
// 'd' flag: dump the compiled pattern.
250 if (flags
.indexOf((UChar
)'d') >= 0) {
251 callerPattern
->dump();
255 // Find the tags in the input data, remove them, and record the group boundary
// The tag pattern captures an optional '/' (group 1) and the group number (group 2).
258 parsePat
= RegexPattern::compile("<(/?)([0-9]+)>", 0, pe
, status
);
259 REGEX_CHECK_STATUS_L(line
);
261 unEscapedInput
= inputString
.unescape();
262 parseMatcher
= parsePat
->matcher(unEscapedInput
, status
);
263 REGEX_CHECK_STATUS_L(line
);
264 while(parseMatcher
->find()) {
// Replacing each tag with "" copies the preceding text into deTaggedInput, so its length
// at this point is the offset the tag marks in the de-tagged text.
265 parseMatcher
->appendReplacement(deTaggedInput
, "", status
);
267 UnicodeString groupNum
= parseMatcher
->group(2, status
);
// A leading '/' marks a closing tag (group end); otherwise it is an opening tag (group start).
268 if (parseMatcher
->group(1, status
) == "/") {
270 set(groupEnds
, deTaggedInput
.length(), groupNum
);
272 set(groupStarts
, deTaggedInput
.length(), groupNum
);
275 parseMatcher
->appendTail(deTaggedInput
);
// NOTE(review): REGEX_ASSERT_L returns directly on failure, which skips cleanupAndReturn.
276 REGEX_ASSERT_L(groupStarts
.size() == groupEnds
.size(), line
);
280 // Do a find on the de-tagged input using the caller's pattern
282 matcher
= callerPattern
->matcher(deTaggedInput
, status
);
283 REGEX_CHECK_STATUS_L(line
);
// 't' flag: enable tracing for the find() below; tracing is switched off again right after.
284 if (flags
.indexOf((UChar
)'t') >= 0) {
285 matcher
->setTrace(TRUE
);
// The declaration of `isMatch` is not visible in this excerpt.
288 isMatch
= matcher
->find();
289 matcher
->setTrace(FALSE
);
292 // Match up the groups from the find() with the groups from the tags
295 // number of tags should match number of groups from find operation.
296 // matcher->groupCount does not include group 0, the entire match, hence the +1.
297 // G option in test means that capture group data is not available in the
298 // expected results, so the check needs to be suppressed.
// Tags were present (a match is expected) but find() failed.
299 if (isMatch
== FALSE
&& groupStarts
.size() != 0) {
300 errln("Error at line %d: Match expected, but none found.\n", line
);
302 goto cleanupAndReturn
;
305 if (flags
.indexOf((UChar
)0x47 /*G*/) >= 0) {
306 // Only check for match / no match. Don't check capture groups.
307 if (isMatch
&& groupStarts
.size() == 0) {
308 errln("Error at line %d: No match expected, but one found.\n", line
);
311 goto cleanupAndReturn
;
// Compare every group reported by the matcher with the tagged expectation; groups with
// no tag are expected to report -1. The declaration of `i` is not visible in this excerpt.
315 for (i
=0; i
<=matcher
->groupCount(); i
++) {
316 int32_t expectedStart
= (i
>= groupStarts
.size()? -1 : groupStarts
.elementAti(i
));
317 if (matcher
->start(i
, status
) != expectedStart
) {
318 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
319 line
, i
, expectedStart
, matcher
->start(i
, status
));
321 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
323 int32_t expectedEnd
= (i
>= groupEnds
.size()? -1 : groupEnds
.elementAti(i
));
324 if (matcher
->end(i
, status
) != expectedEnd
) {
325 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
326 line
, i
, expectedEnd
, matcher
->end(i
, status
));
328 // Error on end position; keep going; real error is probably yet to come as group
329 // end positions work from end of the input data towards the front.
// More groups were tagged in the input than the pattern defines.
332 if ( matcher
->groupCount()+1 < groupStarts
.size()) {
333 errln("Error at line %d: Expected %d capture groups, found %d.",
334 line
, groupStarts
.size()-1, matcher
->groupCount());
// Echo the pattern, flags and tagged input of the test case. The condition guarding this
// line is not visible in this excerpt (presumably it tests `failed` -- TODO confirm).
340 errln("\"%s\" %s \"%s\"", (const char *)CharString(pattern
),
341 (const char *)CharString(flags
),
342 (const char *)CharString(inputString
));
343 // callerPattern->dump();
// Cleanup. Only this delete is visible; the other heap objects are presumably released
// on neighbouring lines not shown here -- TODO confirm.
348 delete callerPattern
;
358 //---------------------------------------------------------------------------
360 // REGEX_ERR Macro + invocation function to simplify writing
361 // regex tests for incorrect patterns
364 // REGEX_ERR("pattern", expected error line, column, expected status);
366 //---------------------------------------------------------------------------
// Forwards to regex_err(), appending the invoking source line so failures can be traced
// back to the test case. The expansion already ends with a semicolon.
367 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
// Compiles a pattern that is expected to fail and checks how the failure is reported.
//   pat            - pattern source to compile.
//   errLine        - expected UParseError line.
//   errCol         - expected UParseError offset.
//   expectedStatus - status code compile() is expected to produce.
//   line           - source line of the invoking test case, echoed in error messages.
369 void RegexTest::regex_err(const char *pat
, int32_t errLine
, int32_t errCol
,
370 UErrorCode expectedStatus
, int line
) {
// NOTE(review): `pattern` is not referenced again in the visible lines; `patString`
// below is what gets compiled.
371 UnicodeString
pattern(pat
);
373 UErrorCode status
= U_ZERO_ERROR
;
375 RegexPattern
*callerPattern
= NULL
;
378 // Compile the caller's pattern
// `pe` receives the parse-error details; its declaration is not visible in this excerpt.
380 UnicodeString
patString(pat
);
381 callerPattern
= RegexPattern::compile(patString
, 0, pe
, status
);
382 if (status
!= expectedStatus
) {
383 errln("Line %d: unexpected error %s compiling pattern.", line
, u_errorName(status
));
// When compile() reported something, also verify the reported error position.
385 if (status
!= U_ZERO_ERROR
) {
386 if (pe
.line
!= errLine
|| pe
.offset
!= errCol
) {
387 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
388 line
, errLine
, errCol
, pe
.line
, pe
.offset
);
393 delete callerPattern
;
398 //---------------------------------------------------------------------------
400 // Basic Check for basic functionality of regex pattern matching.
401 // Avoid the use of REGEX_FIND test macro, which has
402 // substantial dependencies on basic Regex functionality.
404 //---------------------------------------------------------------------------
// Smoke test of basic pattern matching. Each REGEX_TESTLM line gives a pattern, an input
// text, the expected lookingAt() result and the expected matches() result.
405 void RegexTest::Basic() {
409 // Debug - slide failing test cases early
413 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
// Compiles one pattern directly; the returned pattern is not stored in the visible code.
// `pe` is declared on a line not visible in this excerpt.
415 UErrorCode status
= U_ZERO_ERROR
;
416 RegexPattern::compile("^(?:a?b?)*$", 0, pe
, status
);
417 // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
418 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
425 // Pattern with parentheses
427 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE
, FALSE
);
428 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE
, TRUE
);
429 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE
, FALSE
);
// Zero-or-more (*) applied to a group and to a single character.
434 REGEX_TESTLM("st(abc)*ring", "string", TRUE
, TRUE
);
435 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE
, TRUE
);
436 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE
, TRUE
);
437 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE
, FALSE
);
438 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE
, FALSE
);
440 REGEX_TESTLM("a*", "", TRUE
, TRUE
);
441 REGEX_TESTLM("a*", "b", TRUE
, FALSE
);
// Dot (.) and .* patterns.
447 REGEX_TESTLM(".", "abc", TRUE
, FALSE
);
448 REGEX_TESTLM("...", "abc", TRUE
, TRUE
);
449 REGEX_TESTLM("....", "abc", FALSE
, FALSE
);
450 REGEX_TESTLM(".*", "abcxyz123", TRUE
, TRUE
);
451 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE
, FALSE
);
452 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE
, TRUE
);
453 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE
, TRUE
);
454 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE
, FALSE
);
457 // Patterns with * applied to chars at end of literal string
459 REGEX_TESTLM("abc*", "ab", TRUE
, TRUE
);
460 REGEX_TESTLM("abc*", "abccccc", TRUE
, TRUE
);
463 // Supplemental chars match as single chars, not a pair of surrogates.
465 REGEX_TESTLM(".", "\\U00011000", TRUE
, TRUE
);
466 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE
, TRUE
);
467 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE
, FALSE
);
471 // UnicodeSets in the pattern
473 REGEX_TESTLM("[1-6]", "1", TRUE
, TRUE
);
474 REGEX_TESTLM("[1-6]", "3", TRUE
, TRUE
);
475 REGEX_TESTLM("[1-6]", "7", FALSE
, FALSE
);
476 REGEX_TESTLM("a[1-6]", "a3", TRUE
, TRUE
);
477 REGEX_TESTLM("a[1-6]", "a3", TRUE
, TRUE
);
478 REGEX_TESTLM("a[1-6]b", "a3b", TRUE
, TRUE
);
480 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE
, TRUE
);
481 REGEX_TESTLM("a[0-9]*b", "abc", TRUE
, FALSE
);
482 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE
, TRUE
);
483 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE
, FALSE
); // note that * matches 0 occurrences.
484 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE
, TRUE
);
487 // OR operator in patterns
489 REGEX_TESTLM("(a|b)", "a", TRUE
, TRUE
);
490 REGEX_TESTLM("(a|b)", "b", TRUE
, TRUE
);
491 REGEX_TESTLM("(a|b)", "c", FALSE
, FALSE
);
492 REGEX_TESTLM("a|b", "b", TRUE
, TRUE
);
494 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE
, TRUE
);
495 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE
, FALSE
);
496 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE
, TRUE
);
497 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE
, TRUE
);
498 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE
, TRUE
);
499 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE
, FALSE
);
// One-or-more (+).
504 REGEX_TESTLM("ab+", "abbc", TRUE
, FALSE
);
505 REGEX_TESTLM("ab+c", "ac", FALSE
, FALSE
);
506 REGEX_TESTLM("b+", "", FALSE
, FALSE
);
507 REGEX_TESTLM("(abc|def)+", "defabc", TRUE
, TRUE
);
508 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE
, FALSE
);
509 REGEX_TESTLM(".+y", "zippity dooy", TRUE
, TRUE
);
// Optional (?).
514 REGEX_TESTLM("ab?", "ab", TRUE
, TRUE
);
515 REGEX_TESTLM("ab?", "a", TRUE
, TRUE
);
516 REGEX_TESTLM("ab?", "ac", TRUE
, FALSE
);
517 REGEX_TESTLM("ab?", "abb", TRUE
, FALSE
);
518 REGEX_TESTLM("a(b|c)?d", "abd", TRUE
, TRUE
);
519 REGEX_TESTLM("a(b|c)?d", "acd", TRUE
, TRUE
);
520 REGEX_TESTLM("a(b|c)?d", "ad", TRUE
, TRUE
);
521 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE
, FALSE
);
522 REGEX_TESTLM("a(b|c)?d", "ab", FALSE
, FALSE
);
525 // Escape sequences that become single literal chars, handled internally
526 // by ICU's Unescape.
529 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
530 REGEX_TESTLM("\\a", "\\u0007", TRUE
, TRUE
); // BEL
531 REGEX_TESTLM("\\cL", "\\u000c", TRUE
, TRUE
); // Control-L
532 REGEX_TESTLM("\\e", "\\u001b", TRUE
, TRUE
); // Escape
533 REGEX_TESTLM("\\f", "\\u000c", TRUE
, TRUE
); // Form Feed
534 REGEX_TESTLM("\\n", "\\u000a", TRUE
, TRUE
); // new line
535 REGEX_TESTLM("\\r", "\\u000d", TRUE
, TRUE
); // CR
536 REGEX_TESTLM("\\t", "\\u0009", TRUE
, TRUE
); // Tab
537 REGEX_TESTLM("\\u1234", "\\u1234", TRUE
, TRUE
);
538 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE
, TRUE
);
540 REGEX_TESTLM(".*\\Ax", "xyz", TRUE
, FALSE
); // \A matches only at the beginning of input
541 REGEX_TESTLM(".*\\Ax", " xyz", FALSE
, FALSE
); // \A matches only at the beginning of input
543 // Escape of special chars in patterns
544 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE
, TRUE
);
550 //---------------------------------------------------------------------------
552 // API_Match Test that the API for class RegexMatcher
553 // is present and nominally working, but excluding functions
554 // implementing replace operations.
556 //---------------------------------------------------------------------------
// Exercises the non-replace API of RegexMatcher: matcher creation, input(), pattern(),
// lookingAt(), start()/end()/group() with and without a group number, groupCount(),
// find() with and without a start index, \G, and matchers created without input.
// `status` is re-declared several times below, so the body is presumably split into nested
// scopes whose braces are not visible in this excerpt. `flags`, `pe` and the loop index
// `i` are likewise declared on lines not shown.
557 void RegexTest::API_Match() {
559 UErrorCode status
=U_ZERO_ERROR
;
563 // Debug - slide failing test cases early
572 // Simple pattern compilation
575 UnicodeString
re("abc");
577 pat2
= RegexPattern::compile(re
, flags
, pe
, status
);
580 UnicodeString inStr1
= "abcdef this is a test";
581 UnicodeString instr2
= "not abc";
582 UnicodeString empty
= "";
586 // Matcher creation and reset.
// The expected input() value changes between the assertion pairs below; the calls that
// switch the matcher's input (presumably reset()) are not visible in this excerpt.
588 RegexMatcher
*m1
= pat2
->matcher(inStr1
, status
);
590 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
591 REGEX_ASSERT(m1
->input() == inStr1
);
593 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
594 REGEX_ASSERT(m1
->input() == instr2
);
596 REGEX_ASSERT(m1
->input() == inStr1
);
597 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
599 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
600 REGEX_ASSERT(m1
->input() == empty
);
// pattern() must hand back the very RegexPattern object that created the matcher.
601 REGEX_ASSERT(&m1
->pattern() == pat2
);
609 // RegexMatcher::start();
610 // RegexMatcher::end();
611 // RegexMatcher::groupCount();
616 UErrorCode status
=U_ZERO_ERROR
;
618 UnicodeString
re("01(23(45)67)(.*)");
619 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
621 UnicodeString data
= "0123456789";
623 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
625 REGEX_ASSERT(matcher
->lookingAt(status
) == TRUE
);
// Expected start / end offsets for group 0 (whole match) and capture groups 1-3.
626 int matchStarts
[] = {0, 2, 4, 8};
627 int matchEnds
[] = {10, 8, 6, 10};
629 for (i
=0; i
<4; i
++) {
630 int32_t actualStart
= matcher
->start(i
, status
);
632 if (actualStart
!= matchStarts
[i
]) {
633 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
634 __LINE__
, i
, matchStarts
[i
], actualStart
);
636 int32_t actualEnd
= matcher
->end(i
, status
);
638 if (actualEnd
!= matchEnds
[i
]) {
639 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
640 __LINE__
, i
, matchEnds
[i
], actualEnd
);
// The no-argument forms must agree with group 0.
644 REGEX_ASSERT(matcher
->start(0, status
) == matcher
->start(status
));
645 REGEX_ASSERT(matcher
->end(0, status
) == matcher
->end(status
));
// Group numbers outside 0..3 must be rejected.
647 REGEX_ASSERT_FAIL(matcher
->start(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
648 REGEX_ASSERT_FAIL(matcher
->start( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
// Expects U_REGEX_INVALID_STATE; presumably the matcher is reset on a line not visible
// here so that no match is current -- TODO confirm.
650 REGEX_ASSERT_FAIL(matcher
->start( 0, status
), U_REGEX_INVALID_STATE
);
// Re-run the match, then check the text captured by each group.
652 matcher
->lookingAt(status
);
653 REGEX_ASSERT(matcher
->group(status
) == "0123456789");
654 REGEX_ASSERT(matcher
->group(0, status
) == "0123456789");
655 REGEX_ASSERT(matcher
->group(1, status
) == "234567" );
656 REGEX_ASSERT(matcher
->group(2, status
) == "45" );
657 REGEX_ASSERT(matcher
->group(3, status
) == "89" );
659 REGEX_ASSERT_FAIL(matcher
->group(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
660 REGEX_ASSERT_FAIL(matcher
->group( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
// As above: presumably preceded by a reset not visible in this excerpt -- TODO confirm.
662 REGEX_ASSERT_FAIL(matcher
->group( 0, status
), U_REGEX_INVALID_STATE
);
// find(): successive matches of "abc" in the data string.
675 UErrorCode status
=U_ZERO_ERROR
;
677 UnicodeString
re("abc");
678 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
680 UnicodeString data
= ".abc..abc...abc..";
681 // 012345678901234567
683 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
685 REGEX_ASSERT(matcher
->find());
686 REGEX_ASSERT(matcher
->start(status
) == 1);
687 REGEX_ASSERT(matcher
->find());
688 REGEX_ASSERT(matcher
->start(status
) == 6);
689 REGEX_ASSERT(matcher
->find());
690 REGEX_ASSERT(matcher
->start(status
) == 12);
// Once the matches are exhausted, find() keeps returning FALSE.
691 REGEX_ASSERT(matcher
->find() == FALSE
);
692 REGEX_ASSERT(matcher
->find() == FALSE
);
// The first match is found again here; presumably the matcher is reset on a line not
// visible in this excerpt -- TODO confirm.
695 REGEX_ASSERT(matcher
->find());
696 REGEX_ASSERT(matcher
->start(status
) == 1);
// find(startIndex, status): search begins at the given offset.
698 REGEX_ASSERT(matcher
->find(0, status
));
699 REGEX_ASSERT(matcher
->start(status
) == 1);
700 REGEX_ASSERT(matcher
->find(1, status
));
701 REGEX_ASSERT(matcher
->start(status
) == 1);
702 REGEX_ASSERT(matcher
->find(2, status
));
703 REGEX_ASSERT(matcher
->start(status
) == 6);
704 REGEX_ASSERT(matcher
->find(12, status
));
705 REGEX_ASSERT(matcher
->start(status
) == 12);
706 REGEX_ASSERT(matcher
->find(13, status
) == FALSE
);
707 REGEX_ASSERT(matcher
->find(16, status
) == FALSE
);
// After a failed find there is no current match, so start() must report invalid state.
708 REGEX_ASSERT_FAIL(matcher
->start(status
), U_REGEX_INVALID_STATE
);
// Start indexes of -1 and 17 are expected to be rejected as out of bounds.
711 REGEX_ASSERT_FAIL(matcher
->find(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
712 REGEX_ASSERT_FAIL(matcher
->find(17, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
// "abc" has no capture groups.
714 REGEX_ASSERT(matcher
->groupCount() == 0);
722 // find, with \G in pattern (true if at the end of a previous match).
727 UErrorCode status
=U_ZERO_ERROR
;
// Group 1 can only participate via \G; group 2 is the plain alternative.
729 UnicodeString
re(".*?(?:(\\Gabc)|(abc))");
730 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
732 UnicodeString data
= ".abcabc.abc..";
733 // 012345678901234567
735 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
// First find: group 2 matched at offset 1, group 1 did not participate (-1).
737 REGEX_ASSERT(matcher
->find());
738 REGEX_ASSERT(matcher
->start(status
) == 0);
739 REGEX_ASSERT(matcher
->start(1, status
) == -1);
740 REGEX_ASSERT(matcher
->start(2, status
) == 1);
// Second find: starts at offset 4, the end of the previous match, so the \G branch (group 1) matches.
742 REGEX_ASSERT(matcher
->find());
743 REGEX_ASSERT(matcher
->start(status
) == 4);
744 REGEX_ASSERT(matcher
->start(1, status
) == 4);
745 REGEX_ASSERT(matcher
->start(2, status
) == -1);
753 // Matchers with no input string behave as if they had an empty input string.
// Matcher constructed directly from a pattern string, with no input supplied.
757 UErrorCode status
= U_ZERO_ERROR
;
758 RegexMatcher
m(".?", 0, status
);
760 REGEX_ASSERT(m
.find());
761 REGEX_ASSERT(m
.start(status
) == 0);
762 REGEX_ASSERT(m
.input() == "");
// Matcher obtained from a compiled pattern, with no input supplied.
765 UErrorCode status
= U_ZERO_ERROR
;
766 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
767 RegexMatcher
*m
= p
->matcher(status
);
770 REGEX_ASSERT(m
->find() == FALSE
);
771 REGEX_ASSERT(m
->input() == "");
783 //---------------------------------------------------------------------------
785 // API_Replace API test for class RegexMatcher, testing the
786 // Replace family of functions.
788 //---------------------------------------------------------------------------
// Exercises RegexMatcher::replaceFirst() and replaceAll(): plain matches, non-matching
// input, empty input, empty replacement text, whole-string matches, and capture-group
// references ($n) in the replacement text.
// `flags`, `pe` and `dest` are declared on lines not visible in this excerpt.
789 void RegexTest::API_Replace() {
795 UErrorCode status
=U_ZERO_ERROR
;
797 UnicodeString
re("abc");
798 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
800 UnicodeString data
= ".abc..abc...abc..";
801 // 012345678901234567
802 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
805 // Plain vanilla matches.
808 dest
= matcher
->replaceFirst("yz", status
);
810 REGEX_ASSERT(dest
== ".yz..abc...abc..");
812 dest
= matcher
->replaceAll("yz", status
);
814 REGEX_ASSERT(dest
== ".yz..yz...yz..");
817 // Plain vanilla non-matches.
// The expected results equal d2; the call that points the matcher at d2 (presumably
// reset(d2)) is not visible in this excerpt.
819 UnicodeString d2
= ".abx..abx...abx..";
821 dest
= matcher
->replaceFirst("yz", status
);
823 REGEX_ASSERT(dest
== ".abx..abx...abx..");
825 dest
= matcher
->replaceAll("yz", status
);
827 REGEX_ASSERT(dest
== ".abx..abx...abx..");
830 // Empty source string
// As above, the switch to d3 is not visible in this excerpt.
832 UnicodeString d3
= "";
834 dest
= matcher
->replaceFirst("yz", status
);
836 REGEX_ASSERT(dest
== "");
838 dest
= matcher
->replaceAll("yz", status
);
840 REGEX_ASSERT(dest
== "");
843 // Empty substitution string
845 matcher
->reset(data
); // ".abc..abc...abc.."
846 dest
= matcher
->replaceFirst("", status
);
848 REGEX_ASSERT(dest
== "...abc...abc..");
850 dest
= matcher
->replaceAll("", status
);
852 REGEX_ASSERT(dest
== "........");
855 // match whole string
// As above, the switch to d4 is not visible in this excerpt.
857 UnicodeString d4
= "abc";
859 dest
= matcher
->replaceFirst("xyz", status
);
861 REGEX_ASSERT(dest
== "xyz");
863 dest
= matcher
->replaceAll("xyz", status
);
865 REGEX_ASSERT(dest
== "xyz");
868 // Capture Group, simple case
870 UnicodeString
re2("a(..)");
871 RegexPattern
*pat2
= RegexPattern::compile(re2
, flags
, pe
, status
);
873 UnicodeString d5
= "abcdefg";
874 RegexMatcher
*matcher2
= pat2
->matcher(d5
, status
);
// $1 expands to the text captured by group 1 ("bc").
876 dest
= matcher2
->replaceFirst("$1$1", status
);
878 REGEX_ASSERT(dest
== "bcbcdefg");
// A backslash-escaped $ is copied literally; the unescaped $1 is still substituted.
880 dest
= matcher2
->replaceFirst("The value of \\$1 is $1.", status
);
882 REGEX_ASSERT(dest
== "The value of $1 is bc.defg");
// A $ that is not followed by a group number is copied literally.
884 dest
= matcher2
->replaceFirst("$ by itself, no group number $$$", status
);
886 REGEX_ASSERT(dest
== "$ by itself, no group number $$$defg");
// The group number is written with the supplementary character U+1D7CF; per the expected
// result below it is treated like the digit 1.
888 UnicodeString replacement
= "Supplemental Digit 1 $\\U0001D7CF.";
889 replacement
= replacement
.unescape();
890 dest
= matcher2
->replaceFirst(replacement
, status
);
892 REGEX_ASSERT(dest
== "Supplemental Digit 1 bc.defg");
// The pattern has only one capture group, so $5 must be rejected.
894 REGEX_ASSERT_FAIL(matcher2
->replaceFirst("bad capture group number $5...",status
), U_INDEX_OUTOFBOUNDS_ERROR
);
898 // TODO: need more thorough testing of capture substitutions.
908 //---------------------------------------------------------------------------
910 // API_Pattern Test that the API for class RegexPattern is
911 // present and nominally working.
913 //---------------------------------------------------------------------------
// Exercises the RegexPattern API: default construction, equality / inequality, copy
// construction, compile() with and without flags, clone(), the static matches()
// convenience function, split(), pattern(), and the class-ID functions.
// `patb`, `pe` and `n` are declared on lines not visible in this excerpt, as are the
// assignments that change `patb` between the equality checks and the statements that
// pre-fill `fields` with the "foo" / "*" sentinel values checked below.
914 void RegexTest::API_Pattern() {
915 RegexPattern pata
; // Test default constructor to not crash.
// Two default-constructed patterns compare equal, and a pattern equals itself.
918 REGEX_ASSERT(pata
== patb
);
919 REGEX_ASSERT(pata
== pata
);
921 UnicodeString
re1("abc[a-l][m-z]");
922 UnicodeString
re2("def");
923 UErrorCode status
= U_ZERO_ERROR
;
926 RegexPattern
*pat1
= RegexPattern::compile(re1
, 0, pe
, status
);
927 RegexPattern
*pat2
= RegexPattern::compile(re2
, 0, pe
, status
);
929 REGEX_ASSERT(*pat1
== *pat1
);
930 REGEX_ASSERT(*pat1
!= pata
);
// patb is expected to equal *pat1 here; presumably it was assigned from *pat1 on a line
// not visible in this excerpt -- TODO confirm.
934 REGEX_ASSERT(patb
== *pat1
);
// Copy construction yields an equal pattern.
937 RegexPattern
patc(*pat1
);
938 REGEX_ASSERT(patc
== *pat1
);
939 REGEX_ASSERT(patb
== patc
);
// NOTE(review): this compares the two pointers, not the pattern objects.
940 REGEX_ASSERT(pat1
!= pat2
);
// patb is expected to equal *pat2 here; presumably it was reassigned on a line not visible
// in this excerpt -- TODO confirm.
942 REGEX_ASSERT(patb
!= patc
);
943 REGEX_ASSERT(patb
== *pat2
);
945 // Compile with no flags.
946 RegexPattern
*pat1a
= RegexPattern::compile(re1
, pe
, status
);
947 REGEX_ASSERT(*pat1a
== *pat1
);
949 REGEX_ASSERT(pat1a
->flags() == 0);
951 // Compile with different flags should be not equal
952 RegexPattern
*pat1b
= RegexPattern::compile(re1
, UREGEX_CASE_INSENSITIVE
, pe
, status
);
955 REGEX_ASSERT(*pat1b
!= *pat1a
);
956 REGEX_ASSERT(pat1b
->flags() == UREGEX_CASE_INSENSITIVE
);
957 REGEX_ASSERT(pat1a
->flags() == 0);
// NOTE(review): the #if that pairs with this #endif is not visible in this excerpt, so it
// is unclear which of the lines above are compiled out.
959 #endif // add test back in when we actually support flag settings.
// clone() yields a pattern equal to its source and unequal to a different pattern.
962 RegexPattern
*pat1c
= pat1
->clone();
963 REGEX_ASSERT(*pat1c
== *pat1
);
964 REGEX_ASSERT(*pat1c
!= *pat2
);
967 // TODO: Actually do some matches with the cloned/copied/assigned patterns.
978 // matches convenience API
980 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe
, status
) == TRUE
);
982 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe
, status
) == FALSE
);
984 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe
, status
) == TRUE
);
986 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe
, status
) == TRUE
);
988 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe
, status
) == FALSE
);
// With a failure status already set on entry, matches() must return FALSE and leave the
// status untouched.
990 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
991 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe
, status
) == FALSE
);
992 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
// split(): the delimiter is one or more spaces.
998 status
= U_ZERO_ERROR
;
999 pat1
= RegexPattern::compile(" +", pe
, status
);
1001 UnicodeString fields
[10];
1004 n
= pat1
->split("Now is the time", fields
, 10, status
);
1007 REGEX_ASSERT(fields
[0]=="Now");
1008 REGEX_ASSERT(fields
[1]=="is");
1009 REGEX_ASSERT(fields
[2]=="the");
1010 REGEX_ASSERT(fields
[3]=="time");
1011 REGEX_ASSERT(fields
[4]=="");
// Capacity 2: the last permitted field receives the whole remainder of the input.
1013 n
= pat1
->split("Now is the time", fields
, 2, status
);
1016 REGEX_ASSERT(fields
[0]=="Now");
1017 REGEX_ASSERT(fields
[1]=="is the time");
1018 REGEX_ASSERT(fields
[2]=="the"); // left over from previous test
// Capacity 1: the single field receives the whole input. fields[1] is expected to still
// hold a "*" sentinel stored on a line not visible in this excerpt.
1021 status
= U_ZERO_ERROR
;
1022 n
= pat1
->split("Now is the time", fields
, 1, status
);
1025 REGEX_ASSERT(fields
[0]=="Now is the time");
1026 REGEX_ASSERT(fields
[1]=="*");
1027 status
= U_ZERO_ERROR
;
// A delimiter at the start of the input produces an empty first field.
1029 n
= pat1
->split(" Now is the time ", fields
, 10, status
);
1032 REGEX_ASSERT(fields
[0]=="");
1033 REGEX_ASSERT(fields
[1]=="Now");
1034 REGEX_ASSERT(fields
[2]=="is");
1035 REGEX_ASSERT(fields
[3]=="the");
1036 REGEX_ASSERT(fields
[4]=="time");
1037 REGEX_ASSERT(fields
[5]=="");
// Input consisting only of a delimiter.
1039 n
= pat1
->split(" ", fields
, 10, status
);
1042 REGEX_ASSERT(fields
[0]=="");
// Empty input: fields[0] is expected to keep a "foo" sentinel stored on a line not visible
// in this excerpt, i.e. split() does not write to it.
1045 n
= pat1
->split("", fields
, 10, status
);
1048 REGEX_ASSERT(fields
[0]=="foo");
1052 // split, with a pattern with (capture)
// Text captured by the delimiter's group is returned as a field of its own.
1053 pat1
= RegexPattern::compile("<(\\w*)>", pe
, status
);
1056 status
= U_ZERO_ERROR
;
1057 n
= pat1
->split("<a>Now is <b>the time<c>", fields
, 10, status
);
1060 REGEX_ASSERT(fields
[0]=="");
1061 REGEX_ASSERT(fields
[1]=="a");
1062 REGEX_ASSERT(fields
[2]=="Now is ");
1063 REGEX_ASSERT(fields
[3]=="b");
1064 REGEX_ASSERT(fields
[4]=="the time");
1065 REGEX_ASSERT(fields
[5]=="c");
1066 REGEX_ASSERT(fields
[6]=="");
1067 REGEX_ASSERT(status
==U_ZERO_ERROR
);
1069 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 10, status
);
1072 REGEX_ASSERT(fields
[0]==" ");
1073 REGEX_ASSERT(fields
[1]=="a");
1074 REGEX_ASSERT(fields
[2]=="Now is ");
1075 REGEX_ASSERT(fields
[3]=="b");
1076 REGEX_ASSERT(fields
[4]=="the time");
1077 REGEX_ASSERT(fields
[5]=="c");
1078 REGEX_ASSERT(fields
[6]=="");
// Capacity 6: exactly fills; fields[6] must keep its sentinel.
1080 status
= U_ZERO_ERROR
;
1082 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 6, status
);
1085 REGEX_ASSERT(fields
[0]==" ");
1086 REGEX_ASSERT(fields
[1]=="a");
1087 REGEX_ASSERT(fields
[2]=="Now is ");
1088 REGEX_ASSERT(fields
[3]=="b");
1089 REGEX_ASSERT(fields
[4]=="the time");
1090 REGEX_ASSERT(fields
[5]=="c");
1091 REGEX_ASSERT(fields
[6]=="foo");
// Capacity 5: the last field takes the unsplit remainder, trailing delimiter included.
1093 status
= U_ZERO_ERROR
;
1095 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 5, status
);
1098 REGEX_ASSERT(fields
[0]==" ");
1099 REGEX_ASSERT(fields
[1]=="a");
1100 REGEX_ASSERT(fields
[2]=="Now is ");
1101 REGEX_ASSERT(fields
[3]=="b");
1102 REGEX_ASSERT(fields
[4]=="the time<c>");
1103 REGEX_ASSERT(fields
[5]=="foo");
// Capacity 5 with no trailing delimiter in the input.
1105 status
= U_ZERO_ERROR
;
1107 n
= pat1
->split(" <a>Now is <b>the time", fields
, 5, status
);
1110 REGEX_ASSERT(fields
[0]==" ");
1111 REGEX_ASSERT(fields
[1]=="a");
1112 REGEX_ASSERT(fields
[2]=="Now is ");
1113 REGEX_ASSERT(fields
[3]=="b");
1114 REGEX_ASSERT(fields
[4]=="the time");
1115 REGEX_ASSERT(fields
[5]=="foo");
// Capacity 4: the capture text "b" no longer fits and is dropped; the last field takes
// the remainder.
1117 status
= U_ZERO_ERROR
;
1118 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 4, status
);
1121 REGEX_ASSERT(fields
[0]==" ");
1122 REGEX_ASSERT(fields
[1]=="a");
1123 REGEX_ASSERT(fields
[2]=="Now is ");
1124 REGEX_ASSERT(fields
[3]=="the time<c>");
1125 status
= U_ZERO_ERROR
;
// Delimiter is a captured '-' or ','; the delimiter characters come back as fields.
1128 pat1
= RegexPattern::compile("([-,])", pe
, status
);
1130 n
= pat1
->split("1-10,20", fields
, 10, status
);
1133 REGEX_ASSERT(fields
[0]=="1");
1134 REGEX_ASSERT(fields
[1]=="-");
1135 REGEX_ASSERT(fields
[2]=="10");
1136 REGEX_ASSERT(fields
[3]==",");
1137 REGEX_ASSERT(fields
[4]=="20");
1142 // RegexPattern::pattern()
// A default-constructed pattern reports an empty source string.
1144 pat1
= new RegexPattern();
1145 REGEX_ASSERT(pat1
->pattern() == "");
1148 pat1
= RegexPattern::compile("(Hello, world)*", pe
, status
);
1150 REGEX_ASSERT(pat1
->pattern() == "(Hello, world)*");
1155 // classID functions
// Dynamic and static class IDs must agree for each class, be non-NULL, and differ between
// RegexPattern and RegexMatcher.
1157 pat1
= RegexPattern::compile("(Hello, world)*", pe
, status
);
1159 REGEX_ASSERT(pat1
->getDynamicClassID() == RegexPattern::getStaticClassID());
1160 REGEX_ASSERT(pat1
->getDynamicClassID() != NULL
);
1161 RegexMatcher
*m
= pat1
->matcher("Hello, World", status
);
1162 REGEX_ASSERT(pat1
->getDynamicClassID() != m
->getDynamicClassID());
1163 REGEX_ASSERT(m
->getDynamicClassID() == RegexMatcher::getStaticClassID());
1164 REGEX_ASSERT(m
->getDynamicClassID() != NULL
);
1170 //---------------------------------------------------------------------------
1172 // Extended A more thorough check for features of regex patterns
1173 // The test cases are in a separate data file,
1174 // source/tests/testdata/regextst.txt
1175 // A description of the test data format is included in that file.
1177 //---------------------------------------------------------------------------
// Data-driven regex test.  Reads the regextst.txt test data file, walks it one
// line at a time, pulls a quoted pattern, a flags field and a quoted match
// string out of each line, and hands them, with the line number, to regex_find().
1178 void RegexTest::Extended() {
1179 UErrorCode status
= U_ZERO_ERROR
;
// Line number used in the diagnostics below and passed to regex_find().
1180 int32_t lineNum
= 0;
1183 // Open and read the test data file.
1185 const char *testDataDirectory
= loadTestData(status
);
1186 if (U_FAILURE(status
)) {
1187 errln("ERROR: could not open test data %s", u_errorName(status
));
// Derive the data file path from the test data directory: the first
// occurrence of <sep>out<sep>testdata (sep being '/' or a backslash) is
// replaced by <sep>regextst.txt, reusing the separator captured in group 1.
1190 UnicodeString
tdd(testDataDirectory
);
1191 RegexMatcher
m("([/\\\\])out[/\\\\]testdata", tdd
, 0, status
);
1192 if(U_SUCCESS(status
)) {
1193 tdd
= m
.replaceFirst("$1regextst.txt", status
);
1195 errln("Couldn't set up tests. Error %s", u_errorName(status
));
// Load the whole file as UChars; len receives the UChar count
// (ReadAndConvertFile's int &ulen parameter).
1200 UChar
*testData
= ReadAndConvertFile((const char *)CharString(tdd
), len
, status
);
1203 // Put the test data into a UnicodeString
// NOTE(review): presumably the (isTerminated, text, length) constructor that
// wraps the buffer without copying it - confirm against the UnicodeString docs.
1205 UnicodeString
testString(FALSE
, testData
, len
);
// Matchers used to take a test line apart:
//   quotedStuffMat - optional white space, an opening ' " or / delimiter, the
//                    shortest non-empty run of text (group 2), then the same
//                    delimiter again.
//   commentMat     - optional white space, an optional #-comment, end of input.
//   flagsMat       - optional white space, the accepted flag letters (group 1),
//                    then whatever [:letter:]* matches (group 2).
1207 RegexMatcher
quotedStuffMat("\\s*([\\'\\\"/])(.+?)\\1", 0, status
);
1208 RegexMatcher
commentMat ("\\s*(#.*)?$", 0, status
);
1209 RegexMatcher
flagsMat ("\\s*([ixsmdtG]*)([:letter:]*)", 0, status
);
// One match per line of the file; group 1 is the line text without its
// (optional CR) LF terminator.
1211 RegexMatcher
lineMat("(.*?)\\r?\\n", testString
, 0, status
);
1212 UnicodeString testPattern
; // The pattern for test from the test file.
1213 UnicodeString testFlags
; // the flags for a test.
1214 UnicodeString matchString
; // The marked up string to be used as input
1219 // Loop over the test data file, once per line.
1221 while (lineMat
.find()) {
// Report any ICU error still held in status at the top of the iteration.
1223 if (U_FAILURE(status
)) {
1224 errln("line %d: ICU Error \"%s\"", lineNum
, u_errorName(status
));
// Each line starts from a clean status.
1227 status
= U_ZERO_ERROR
;
1228 UnicodeString testLine
= lineMat
.group(1, status
);
// Empty line check.
1229 if (testLine
.length() == 0) {
1234 // Parse the test line. Skip blank and comment only lines.
1235 // Separate out the three main fields - pattern, flags, target.
1238 commentMat
.reset(testLine
);
1239 if (commentMat
.lookingAt(status
)) {
1240 // This line is a comment, or blank.
1245 // Pull out the pattern field, remove it from the test file line.
1247 quotedStuffMat
.reset(testLine
);
1248 if (quotedStuffMat
.lookingAt(status
)) {
1249 testPattern
= quotedStuffMat
.group(2, status
);
// Drop everything up to the end of the match (leading space, delimiters, pattern).
1250 testLine
.remove(0, quotedStuffMat
.end(0, status
));
1252 errln("Bad pattern (missing quotes?) at test file line %d", lineNum
);
1258 // Pull out the flags from the test file line.
1260 flagsMat
.reset(testLine
);
1261 flagsMat
.lookingAt(status
); // Will always match, possibly an empty string.
1262 testFlags
= flagsMat
.group(1, status
);
// Anything captured by group 2 is not an accepted flag letter; report its
// first character.
1263 if (flagsMat
.group(2, status
).length() > 0) {
1264 errln("Bad Match flag at line %d. Scanning %c\n",
1265 lineNum
, flagsMat
.group(2, status
).charAt(0));
// Strip the consumed flags field from the front of the line.
1268 testLine
.remove(0, flagsMat
.end(0, status
));
1271 // Pull out the match string, as a whole.
1272 // We'll process the <tags> later.
1274 quotedStuffMat
.reset(testLine
);
1275 if (quotedStuffMat
.lookingAt(status
)) {
1276 matchString
= quotedStuffMat
.group(2, status
);
1277 testLine
.remove(0, quotedStuffMat
.end(0, status
));
1279 errln("Bad match string at test file line %d", lineNum
);
1284 // The only thing left from the input line should be an optional trailing comment.
1286 commentMat
.reset(testLine
);
1287 if (commentMat
.lookingAt(status
) == FALSE
) {
1288 errln("Line %d: unexpected characters at end of test line.", lineNum
);
// Run the test described by the three fields just extracted.
1295 regex_find(testPattern
, testFlags
, matchString
, lineNum
);
1304 //---------------------------------------------------------------------------
1306 // Errors Check for error handling in patterns.
1308 //---------------------------------------------------------------------------
// Checks error reporting for ill-formed patterns.  Each REGEX_ERR line names a
// pattern, two integers and the expected UErrorCode.
// NOTE(review): the two integers are presumably the expected error line and
// offset within that line - confirm against the REGEX_ERR macro definition.
1309 void RegexTest::Errors() {
1310 // \escape sequences that aren't implemented yet.
1311 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
1313 // Missing close parentheses
1314 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN
);
1315 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN
);
1316 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN
);
1318 // Extra close paren
1319 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN
);
1320 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN
);
// (This last case is a run of unclosed open parens rather than extra closes.)
1321 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN
);
1323 // Look-ahead, Look-behind
1324 // TODO: add tests for unbounded length look-behinds.
1325 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX
); // illegal construct
1327 // Attempt to use non-default flags
// Compiling ".*" with this combination of flags is expected to leave
// U_REGEX_UNIMPLEMENTED in status.
1330 UErrorCode status
= U_ZERO_ERROR
;
1331 int32_t flags
= UREGEX_CANON_EQ
|
1332 UREGEX_COMMENTS
| UREGEX_DOTALL
|
1334 RegexPattern
*pat1
= RegexPattern::compile(".*", flags
, pe
, status
);
1335 REGEX_ASSERT(status
== U_REGEX_UNIMPLEMENTED
);
1340 // Quantifiers are allowed only after something that can be quantified.
1341 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX
);
// The pattern below contains a new-line, so the expected position is on line 2.
1342 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX
);
1343 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX
);
1345 // Mal-formed {min,max} quantifiers
1346 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL
);
1347 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN
);
1348 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL
);
1349 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL
);
1350 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL
);
1351 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG
);
1353 // UnicodeSet containing a string
1354 REGEX_ERR("abc[{def}]xyz", 1, 10, U_REGEX_SET_CONTAINS_STRING
);
1359 //-------------------------------------------------------------------------------
1361 // Read a text data file, convert it to UChars, and return the data
1362 // in one big UChar * buffer, which the caller must delete.
1364 //--------------------------------------------------------------------------------
// Reads fileName in binary mode, looks for a Unicode signature (BOM) at the
// start of the data, opens a converter for the detected encoding and converts
// the bytes to UChars.
//   fileName - path handed to fopen().
//   ulen     - receives the UChar count computed by the preflight conversion.
//   status   - ICU error code, checked on entry and updated by the ICU calls made here.
// The converted text is placed in a buffer allocated with new UChar[ulen+1].
1365 UChar
*RegexTest::ReadAndConvertFile(const char *fileName
, int &ulen
, UErrorCode
&status
) {
1366 UChar
*retPtr
= NULL
;
1367 char *fileBuf
= NULL
;
1368 UConverter
* conv
= NULL
;
// Honor an error already recorded by the caller.
1372 if (U_FAILURE(status
)) {
// Binary mode, so the bytes reach the converter unmodified.
1379 f
= fopen(fileName
, "rb");
1381 errln("Error opening test data file %s\n", fileName
);
1382 goto cleanUpAndReturn
;
// Determine the file size by seeking to the end, then read the whole file
// into a freshly allocated byte buffer.
// NOTE(review): fileBuf is allocated from the ftell() result before the
// fileSize <= 0 check below, so a failed ftell() (-1) reaches new char[]
// first - consider validating the size before allocating.
1390 fseek( f
, 0, SEEK_END
);
1391 fileSize
= ftell(f
);
1392 fileBuf
= new char[fileSize
];
1393 fseek(f
, 0, SEEK_SET
);
1394 amt_read
= fread(fileBuf
, 1, fileSize
, f
);
// A short read or an empty file is treated as a read error.
1395 if (amt_read
!= fileSize
|| fileSize
<= 0) {
1396 errln("Error reading test data file.");
1397 goto cleanUpAndReturn
;
1401 // Look for a Unicode Signature (BOM) on the data just read
1403 int32_t signatureLength
;
1404 const char * fileBufC
;
1405 const char* encoding
;
1408 encoding
= ucnv_detectUnicodeSignature(
1409 fileBuf
, fileSize
, &signatureLength
, &status
);
// When a signature was recognised, step past its bytes and shrink the byte
// count to match.
// NOTE(review): when none is recognised, encoding stays NULL and is passed
// unchanged to ucnv_open() below - presumably selecting the default
// converter; confirm against the ucnv_open documentation.
1410 if(encoding
!=NULL
){
1411 fileBufC
+= signatureLength
;
1412 fileSize
-= signatureLength
;
1416 // Open a converter to take the rule file to UTF-16
1418 conv
= ucnv_open(encoding
, &status
);
1419 if (U_FAILURE(status
)) {
1420 goto cleanUpAndReturn
;
1424 // Convert the rules to UChar.
1425 // Preflight first to determine required buffer size.
1427 ulen
= ucnv_toUChars(conv
,
1433 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
1434 // Buffer Overflow is expected from the preflight operation.
1435 status
= U_ZERO_ERROR
;
// One UChar more than the preflighted length is allocated.
1437 retPtr
= new UChar
[ulen
+1];
// NOTE(review): the message below is prefixed "ucnv_toUChars"; if this check
// is also reached from the earlier goto cleanUpAndReturn paths (e.g. a
// ucnv_open failure), the prefix would be misleading - confirm.
1450 if (U_FAILURE(status
)) {
1451 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
1460 //-------------------------------------------------------------------------------
1462 // PerlTests - Run Perl's regular expression tests
1463 // The input file for this test is re_tests, the standard regular
1464 // expression test data distributed with the Perl source code.
1466 // Here is Perl's description of the test data file:
1468 // # The tests are in a separate file 't/op/re_tests'.
1469 // # Each line in that file is a separate test.
1470 // # There are five columns, separated by tabs.
1472 // # Column 1 contains the pattern, optionally enclosed in C<''>.
1473 // # Modifiers can be put after the closing C<'>.
1475 // # Column 2 contains the string to be matched.
1477 // # Column 3 contains the expected result:
1478 // # y expect a match
1479 // # n expect no match
1480 // # c expect an error
1481 // # B test exposes a known bug in Perl, should be skipped
1482 // # b test exposes a known bug in Perl, should be skipped if noamp
1484 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
1486 // # Column 4 contains a string, usually C<$&>.
1488 // # Column 5 contains the expected result of double-quote
1489 // # interpolating that string after the match, or start of error message.
1491 // # Column 6, if present, contains a reason why the test is skipped.
1492 // # This is printed with "skipped", for harness to pick up.
1494 // # \n in the tests are interpolated, as are variables of the form ${\w+}.
1496 // # If you want to add a regular expression test that can't be expressed
1497 // # in this format, don't add it here: put it in op/pat.t instead.
1499 // For ICU, if field 3 contains an 'i', the test will be skipped.
1500 // The test exposes some known incompatibility between ICU and Perl regexps.
1501 // (The i is in addition to whatever was there before.)
1503 //-------------------------------------------------------------------------------
// Runs the Perl regular expression test data (re_tests.txt) against ICU.
// Each line of the file is split on tabs into fields: fields[0] is the pattern
// (optionally quoted, with trailing flag letters), fields[1] the string to
// match, fields[2] the expected-result letters, fields[3] a Perl expression to
// interpret against the match, and fields[4] the expected value of that
// expression.
1504 void RegexTest::PerlTests() {
1505 UErrorCode status
= U_ZERO_ERROR
;
1509 // Open and read the test data file.
1511 const char *testDataDirectory
= loadTestData(status
);
1512 if (U_FAILURE(status
)) {
1513 errln("ERROR: could not open test data %s", u_errorName(status
));
// Derive the data file path from the test data directory: the first
// occurrence of <sep>out<sep>testdata (sep being '/' or a backslash) is
// replaced by <sep>re_tests.txt, reusing the separator captured in group 1.
1516 UnicodeString
tdd(testDataDirectory
);
1517 RegexMatcher
m("([/\\\\])out[/\\\\]testdata", tdd
, 0, status
);
1518 if(U_SUCCESS(status
)) {
1519 tdd
= m
.replaceFirst("$1re_tests.txt", status
);
1521 errln("Couldn't set up tests. Error %s", u_errorName(status
));
// Load the whole file as UChars; len receives the UChar count
// (ReadAndConvertFile's int &ulen parameter).
1526 UChar
*testData
= ReadAndConvertFile((const char *)CharString(tdd
), len
, status
);
1529 // Put the test data into a UnicodeString
1531 UnicodeString
testDataString(FALSE
, testData
, len
);
1534 // Regex to break the input file into lines, and strip the new lines.
1535 // One line per match, capture group one is the desired data.
// Group 1 needs at least one character and the terminator is a run of CR/LF
// characters, so consecutive line ends are consumed as a single terminator.
1537 RegexPattern
* linePat
= RegexPattern::compile("(.+?)[\\r\\n]+", 0, pe
, status
);
1538 RegexMatcher
* lineMat
= linePat
->matcher(testDataString
, status
);
1541 // Regex to split a test file line into fields.
1542 // There are six fields, separated by tabs.
// (The fields array filled by split() in the loop below has 7 slots.)
1544 RegexPattern
* fieldPat
= RegexPattern::compile("\\t", 0, pe
, status
);
1547 // Regex to identify test patterns with flag settings, and to separate them.
1548 // Test patterns with flags look like 'pattern'i
1549 // Test patterns without flags are not quoted: pattern
1550 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
1552 RegexPattern
*flagPat
= RegexPattern::compile("('?)(.*)\\1(.*)", 0, pe
, status
);
1553 RegexMatcher
* flagMat
= flagPat
->matcher("", status
);
1556 // The Perl tests reference several perl-isms, which are evaluated/substituted
1557 // in the test data. Not being perl, this must be done explicitly. Here
1558 // are string constants and REs for these constructs.
// nulnul and ffff start out as escape text and are converted with unescape();
// nulnulSrc and ffffSrc are the ${...} markers searched for in the test data.
1560 UnicodeString
nulnulSrc("${nulnul}");
1561 UnicodeString
nulnul("\\u0000\\u0000");
1562 nulnul
= nulnul
.unescape();
1564 UnicodeString
ffffSrc("${ffff}");
1565 UnicodeString
ffff("\\uffff");
1566 ffff
= ffff
.unescape();
1568 // regexp for $-[0], $+[2], etc.
// Group 1 is the sign (+ or -), group 2 the digits inside the brackets.
1569 RegexPattern
*groupsPat
= RegexPattern::compile("\\$([+\\-])\\[(\\d+)\\]", 0, pe
, status
);
1570 RegexMatcher
*groupsMat
= groupsPat
->matcher("", status
);
1572 // regexp for $0, $1, $2, etc.
1573 RegexPattern
*cgPat
= RegexPattern::compile("\\$(\\d+)", 0, pe
, status
);
1574 RegexMatcher
*cgMat
= cgPat
->matcher("", status
);
1578 // Main Loop for the Perl Tests, runs once per line matched by lineMat.
1581 int32_t lineNum
= 0;
// Count of patterns whose compilation reported U_REGEX_UNIMPLEMENTED.
1582 int32_t skippedUnimplementedCount
= 0;
1583 while (lineMat
->find()) {
1587 // Get a line, break it into its fields, do the Perl
1588 // variable substitutions.
1590 UnicodeString line
= lineMat
->group(1, status
);
1591 UnicodeString fields
[7];
1592 fieldPat
->split(line
, fields
, 7, status
);
// Separate the pattern text (group 2) from any trailing flag letters (group 3).
1594 flagMat
->reset(fields
[0]);
1595 flagMat
->matches(status
);
1596 UnicodeString pattern
= flagMat
->group(2, status
);
// In the pattern, ${nulnul} is replaced by the escape text (a backslash-u
// sequence, not the unescaped characters), while ${ffff} is replaced by the
// unescaped ffff string.
1597 pattern
.findAndReplace("${bang}", "!");
1598 pattern
.findAndReplace(nulnulSrc
, "\\u0000\\u0000");
1599 pattern
.findAndReplace(ffffSrc
, ffff
);
1602 // Identify patterns that include match flag settings,
1603 // split off the flags, remove the extra quotes.
1605 UnicodeString flagStr
= flagMat
->group(3, status
);
// NOTE(review): the "ucnv_toUChars" prefix in the message below does not match
// the calls made since the last status check (split / matches / group); it
// looks copied from ReadAndConvertFile - confirm and reword.
1606 if (U_FAILURE(status
)) {
1607 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
1611 const UChar UChar_c
= 0x63; // Char constants for the flag letters.
1612 const UChar UChar_i
= 0x69; // (numeric code points, for lack of Unicode support in C)
1613 const UChar UChar_m
= 0x6d;
1614 const UChar UChar_x
= 0x78;
1615 const UChar UChar_y
= 0x79;
// Translate the Perl flag letters i, m and x into the matching UREGEX_ bits.
1616 if (flagStr
.indexOf(UChar_i
) != -1) {
1617 flags
|= UREGEX_CASE_INSENSITIVE
;
1619 if (flagStr
.indexOf(UChar_m
) != -1) {
1620 flags
|= UREGEX_MULTILINE
;
1622 if (flagStr
.indexOf(UChar_x
) != -1) {
1623 flags
|= UREGEX_COMMENTS
;
1627 // Compile the test pattern.
1629 status
= U_ZERO_ERROR
;
1630 RegexPattern
*testPat
= RegexPattern::compile(pattern
, flags
, pe
, status
);
1631 if (status
== U_REGEX_UNIMPLEMENTED
) {
1633 // Test of a feature that is planned for ICU, but not yet implemented.
1635 skippedUnimplementedCount
++;
1637 status
= U_ZERO_ERROR
;
1641 if (U_FAILURE(status
)) {
1642 // Some tests are supposed to generate errors.
1643 // Only report an error for tests that are supposed to succeed.
1644 if (fields
[2].indexOf(UChar_c
) == -1 && // Compilation is not supposed to fail AND
1645 fields
[2].indexOf(UChar_i
) == -1) // it's not an accepted ICU incompatibility
1647 errln("line %d: ICU Error \"%s\"\n", lineNum
, u_errorName(status
));
1649 status
= U_ZERO_ERROR
;
// An 'i' in the expected-result field marks an accepted ICU/Perl incompatibility.
1654 if (fields
[2].indexOf(UChar_i
) >= 0) {
1655 // ICU should skip this test.
1660 if (fields
[2].indexOf(UChar_c
) >= 0) {
1661 // This pattern should have caused a compilation error, but didn't.
1662 errln("line %d: Expected a pattern compile error, got success.", lineNum
);
1668 // replace the Perl variables that appear in some of the
1669 // match data strings.
// Here, unlike in the pattern, ${nulnul} becomes the unescaped nulnul string.
1671 UnicodeString matchString
= fields
[1];
1672 matchString
.findAndReplace(nulnulSrc
, nulnul
);
1673 matchString
.findAndReplace(ffffSrc
, ffff
);
1675 // Replace any \n in the match string with an actual new-line char.
1676 // Don't do full unescape, as this unescapes more than Perl does, which
1677 // causes other spurious failures in the tests.
1678 matchString
.findAndReplace("\\n", "\n");
1683 // Run the test, check for expected match/don't match result.
// NOTE(review): status is not checked between matcher() and find(); if
// matcher() can fail here, testMat may not be usable - confirm.
1685 RegexMatcher
*testMat
= testPat
->matcher(matchString
, status
);
1686 UBool found
= testMat
->find();
1687 UBool expected
= FALSE
;
// A 'y' in the expected-result field means a match is expected.
1688 if (fields
[2].indexOf(UChar_y
) >=0) {
1691 if (expected
!= found
) {
1692 errln("line %d: Expected %smatch, got %smatch",
1693 lineNum
, expected
?"":"no ", found
?"":"no " );
1698 // Interpret the Perl expression from the fourth field of the data file,
1699 // building up an ICU string from the results of the ICU match.
1700 // The Perl expression will contain references to the results of
1701 // a regex match, including the matched string, capture group strings,
1702 // group starting and ending indices, etc.
1704 UnicodeString resultString
;
1705 UnicodeString perlExpr
= fields
[3];
1706 groupsMat
->reset(perlExpr
);
1707 cgMat
->reset(perlExpr
);
// Consume perlExpr from the front, one recognised construct (or one literal
// character) per iteration, appending its value to resultString.
1709 while (perlExpr
.length() > 0) {
// $& - the whole matched text.
1710 if (perlExpr
.startsWith("$&")) {
1711 resultString
.append(testMat
->group(status
));
1712 perlExpr
.remove(0, 2);
// $-[n] / $+[n] - start / end position of capture group n.
1715 else if (groupsMat
->lookingAt(status
)) {
1717 UnicodeString digitString
= groupsMat
->group(2, status
);
1719 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
1720 UnicodeString plusOrMinus
= groupsMat
->group(1, status
);
1721 int32_t matchPosition
;
1722 if (plusOrMinus
.compare("+") == 0) {
1723 matchPosition
= testMat
->end(groupNum
, status
);
1725 matchPosition
= testMat
->start(groupNum
, status
);
// A position of -1 contributes nothing to the result string.
1727 if (matchPosition
!= -1) {
1728 ICU_Utility::appendNumber(resultString
, matchPosition
);
1730 perlExpr
.remove(0, groupsMat
->end(status
));
// $n - the text of capture group n.
1733 else if (cgMat
->lookingAt(status
)) {
1735 UnicodeString digitString
= cgMat
->group(1, status
);
1737 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
// status is cleared right after the group() call, so an error raised by
// fetching the group is not reported by the check at the end of the loop body.
1738 if (U_SUCCESS(status
)) {
1739 resultString
.append(testMat
->group(groupNum
, status
));
1740 status
= U_ZERO_ERROR
;
1742 perlExpr
.remove(0, cgMat
->end(status
));
// @- - the start positions of group 0 through groupCount(), space separated.
1745 else if (perlExpr
.startsWith("@-")) {
1747 for (i
=0; i
<=testMat
->groupCount(); i
++) {
1749 resultString
.append(" ");
1751 ICU_Utility::appendNumber(resultString
, testMat
->start(i
, status
));
1753 perlExpr
.remove(0, 2);
// @+ - the end positions of group 0 through groupCount(), space separated.
1756 else if (perlExpr
.startsWith("@+")) {
1758 for (i
=0; i
<=testMat
->groupCount(); i
++) {
1760 resultString
.append(" ");
1762 ICU_Utility::appendNumber(resultString
, testMat
->end(i
, status
));
1764 perlExpr
.remove(0, 2);
1767 else if (perlExpr
.startsWith("\\")) { // \Escape. Take following char as a literal.
1768 // or as an escaped sequence (e.g. \n)
1769 if (perlExpr
.length() > 1) {
1770 perlExpr
.remove(0, 1); // Remove the '\', but only if not last char.
1772 UChar c
= perlExpr
.charAt(0);
1774 case 'n': c
= '\n'; break;
1775 // add any other escape sequences that show up in the test expected results.
1777 resultString
.append(c
);
1778 perlExpr
.remove(0, 1);
1782 // Any characters from the perl expression that we don't explicitly
1783 // recognize before here are assumed to be literals and copied
1784 // as-is to the expected results.
1785 resultString
.append(perlExpr
.charAt(0));
1786 perlExpr
.remove(0, 1);
// Report any ICU error raised while interpreting this piece of the expression.
1789 if (U_FAILURE(status
)) {
1790 errln("Line %d: ICU Error \"%s\"", lineNum
, u_errorName(status
));
1796 // Expected Results Compare
// The expected text gets the same ${nulnul}, ${ffff} and \n substitutions as
// the match string before being compared with the interpreted result.
1798 UnicodeString
expectedS(fields
[4]);
1799 expectedS
.findAndReplace(nulnulSrc
, nulnul
);
1800 expectedS
.findAndReplace(ffffSrc
, ffff
);
1801 expectedS
.findAndReplace("\\n", "\n");
1804 if (expectedS
.compare(resultString
) != 0) {
1805 errln("Line %d: Incorrect perl expression results. Expected \"%s\"; got \"%s\"",
1806 lineNum
, (const char *)CharString(expectedS
),
1807 (const char *)CharString(resultString
));
1815 // All done. Clean up allocated stuff.
1833 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount
);
1839 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */