1 /********************************************************************
3 * Copyright (c) 2002-2008, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
10 // ICU Regular Expressions test, part of intltest.
14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
16 #include "unicode/regex.h"
17 #include "unicode/uchar.h"
18 #include "unicode/ucnv.h"
27 //---------------------------------------------------------------------------
29 // Test class boilerplate
31 //---------------------------------------------------------------------------
32 RegexTest::RegexTest()
37 RegexTest::~RegexTest()
43 void RegexTest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* /*par*/ )
45 if (exec
) logln("TestSuite RegexTest: ");
48 case 0: name
= "Basic";
51 case 1: name
= "API_Match";
52 if (exec
) API_Match();
54 case 2: name
= "API_Replace";
55 if (exec
) API_Replace();
57 case 3: name
= "API_Pattern";
58 if (exec
) API_Pattern();
60 case 4: name
= "Extended";
63 case 5: name
= "Errors";
66 case 6: name
= "PerlTests";
67 if (exec
) PerlTests();
69 case 7: name
= "Callbacks";
70 if (exec
) Callbacks();
74 break; //needed to end loop
79 //---------------------------------------------------------------------------
81 // Error Checking / Reporting macros used in all of the tests.
83 //---------------------------------------------------------------------------
// REGEX_CHECK_STATUS: if the in-scope variable `status` holds a failure code,
// report it (current source line plus the ICU error name) and return from the
// enclosing function. Requires a variable named `status` at the point of use.
84 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {errln("RegexTest failure at line %d. status=%s\n", \
85 __LINE__, u_errorName(status)); return;}}
// REGEX_ASSERT: report a failure at the current source line when (expr)
// compares equal to FALSE. Does not return, so the calling test keeps running.
87 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("RegexTest failure at line %d.\n", __LINE__);};}
// REGEX_ASSERT_FAIL: evaluate (expr) with a fresh `status` declared inside the
// macro's own braces (initialized to U_ZERO_ERROR), then report a failure unless
// that status equals errcode. Callers pass an expression that uses `status`,
// so the error code is written to this local rather than to an outer variable.
89 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
90 if (status!=errcode) {errln("RegexTest failure at line %d. Expected status=%s, got %s\n", \
91 __LINE__, u_errorName(errcode), u_errorName(status));};}
// REGEX_CHECK_STATUS_L: like REGEX_CHECK_STATUS, but also reports the
// caller-supplied (line). Prints the status numerically (%d) and does not return.
93 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
94 "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }}
// REGEX_ASSERT_L: like REGEX_ASSERT, but also reports the caller-supplied (line)
// and returns from the enclosing function on failure.
96 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
97 errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
101 //---------------------------------------------------------------------------
103 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests
104 // for the LookingAt() and Match() functions.
107 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
109 // The expected results are UBool - TRUE or FALSE.
110 // The input text is unescaped. The pattern is not.
113 //---------------------------------------------------------------------------
115 #define REGEX_TESTLM(pat, text, looking, match) doRegexLMTest(pat, text, looking, match, __LINE__);
117 UBool
RegexTest::doRegexLMTest(const char *pat
, const char *text
, UBool looking
, UBool match
, int32_t line
) {
118 const UnicodeString
pattern(pat
, -1, US_INV
);
119 const UnicodeString
inputText(text
, -1, US_INV
);
120 UErrorCode status
= U_ZERO_ERROR
;
122 RegexPattern
*REPattern
= NULL
;
123 RegexMatcher
*REMatcher
= NULL
;
126 UnicodeString
patString(pat
, -1, US_INV
);
127 REPattern
= RegexPattern::compile(patString
, 0, pe
, status
);
128 if (U_FAILURE(status
)) {
129 errln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s\n",
130 line
, u_errorName(status
));
133 if (line
==376) { RegexPatternDump(REPattern
);}
135 UnicodeString
inputString(inputText
);
136 UnicodeString unEscapedInput
= inputString
.unescape();
137 REMatcher
= REPattern
->matcher(unEscapedInput
, status
);
138 if (U_FAILURE(status
)) {
139 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
140 line
, u_errorName(status
));
145 actualmatch
= REMatcher
->lookingAt(status
);
146 if (U_FAILURE(status
)) {
147 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
148 line
, u_errorName(status
));
151 if (actualmatch
!= looking
) {
152 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line
);
156 status
= U_ZERO_ERROR
;
157 actualmatch
= REMatcher
->matches(status
);
158 if (U_FAILURE(status
)) {
159 errln("RegexTest failure in matches() at line %d. Status = %s\n",
160 line
, u_errorName(status
));
163 if (actualmatch
!= match
) {
164 errln("RegexTest: wrong return from matches() at line %d.\n", line
);
168 if (retVal
== FALSE
) {
169 RegexPatternDump(REPattern
);
181 //---------------------------------------------------------------------------
183 // REGEX_ERR Macro + invocation function to simplify writing
184 // regex tests for incorrect patterns
187 // REGEX_ERR("pattern", expected error line, column, expected status);
189 //---------------------------------------------------------------------------
190 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
192 void RegexTest::regex_err(const char *pat
, int32_t errLine
, int32_t errCol
,
193 UErrorCode expectedStatus
, int32_t line
) {
194 UnicodeString
pattern(pat
);
196 UErrorCode status
= U_ZERO_ERROR
;
198 RegexPattern
*callerPattern
= NULL
;
201 // Compile the caller's pattern
203 UnicodeString
patString(pat
);
204 callerPattern
= RegexPattern::compile(patString
, 0, pe
, status
);
205 if (status
!= expectedStatus
) {
206 errln("Line %d: unexpected error %s compiling pattern.", line
, u_errorName(status
));
208 if (status
!= U_ZERO_ERROR
) {
209 if (pe
.line
!= errLine
|| pe
.offset
!= errCol
) {
210 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
211 line
, errLine
, errCol
, pe
.line
, pe
.offset
);
216 delete callerPattern
;
221 //---------------------------------------------------------------------------
223 // Basic Check for basic functionality of regex pattern matching.
224 // Avoid the use of REGEX_FIND test macro, which has
225 // substantial dependencies on basic Regex functionality.
227 //---------------------------------------------------------------------------
228 void RegexTest::Basic() {
232 // Debug - slide failing test cases early
236 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
238 UErrorCode status
= U_ZERO_ERROR
;
239 RegexPattern::compile("^(?:a?b?)*$", 0, pe
, status
);
240 // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
241 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
248 // Pattern with parentheses
250 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE
, FALSE
);
251 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE
, TRUE
);
252 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE
, FALSE
);
257 REGEX_TESTLM("st(abc)*ring", "string", TRUE
, TRUE
);
258 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE
, TRUE
);
259 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE
, TRUE
);
260 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE
, FALSE
);
261 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE
, FALSE
);
263 REGEX_TESTLM("a*", "", TRUE
, TRUE
);
264 REGEX_TESTLM("a*", "b", TRUE
, FALSE
);
270 REGEX_TESTLM(".", "abc", TRUE
, FALSE
);
271 REGEX_TESTLM("...", "abc", TRUE
, TRUE
);
272 REGEX_TESTLM("....", "abc", FALSE
, FALSE
);
273 REGEX_TESTLM(".*", "abcxyz123", TRUE
, TRUE
);
274 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE
, FALSE
);
275 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE
, TRUE
);
276 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE
, TRUE
);
277 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE
, FALSE
);
280 // Patterns with * applied to chars at end of literal string
282 REGEX_TESTLM("abc*", "ab", TRUE
, TRUE
);
283 REGEX_TESTLM("abc*", "abccccc", TRUE
, TRUE
);
286 // Supplemental chars match as single chars, not a pair of surrogates.
288 REGEX_TESTLM(".", "\\U00011000", TRUE
, TRUE
);
289 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE
, TRUE
);
290 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE
, FALSE
);
294 // UnicodeSets in the pattern
296 REGEX_TESTLM("[1-6]", "1", TRUE
, TRUE
);
297 REGEX_TESTLM("[1-6]", "3", TRUE
, TRUE
);
298 REGEX_TESTLM("[1-6]", "7", FALSE
, FALSE
);
299 REGEX_TESTLM("a[1-6]", "a3", TRUE
, TRUE
);
300 REGEX_TESTLM("a[1-6]", "a3", TRUE
, TRUE
);
301 REGEX_TESTLM("a[1-6]b", "a3b", TRUE
, TRUE
);
303 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE
, TRUE
);
304 REGEX_TESTLM("a[0-9]*b", "abc", TRUE
, FALSE
);
305 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE
, TRUE
);
306 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE
, FALSE
); // note that * matches 0 occurrences.
307 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE
, TRUE
);
310 // OR operator in patterns
312 REGEX_TESTLM("(a|b)", "a", TRUE
, TRUE
);
313 REGEX_TESTLM("(a|b)", "b", TRUE
, TRUE
);
314 REGEX_TESTLM("(a|b)", "c", FALSE
, FALSE
);
315 REGEX_TESTLM("a|b", "b", TRUE
, TRUE
);
317 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE
, TRUE
);
318 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE
, FALSE
);
319 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE
, TRUE
);
320 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE
, TRUE
);
321 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE
, TRUE
);
322 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE
, FALSE
);
327 REGEX_TESTLM("ab+", "abbc", TRUE
, FALSE
);
328 REGEX_TESTLM("ab+c", "ac", FALSE
, FALSE
);
329 REGEX_TESTLM("b+", "", FALSE
, FALSE
);
330 REGEX_TESTLM("(abc|def)+", "defabc", TRUE
, TRUE
);
331 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE
, FALSE
);
332 REGEX_TESTLM(".+y", "zippity dooy", TRUE
, TRUE
);
337 REGEX_TESTLM("ab?", "ab", TRUE
, TRUE
);
338 REGEX_TESTLM("ab?", "a", TRUE
, TRUE
);
339 REGEX_TESTLM("ab?", "ac", TRUE
, FALSE
);
340 REGEX_TESTLM("ab?", "abb", TRUE
, FALSE
);
341 REGEX_TESTLM("a(b|c)?d", "abd", TRUE
, TRUE
);
342 REGEX_TESTLM("a(b|c)?d", "acd", TRUE
, TRUE
);
343 REGEX_TESTLM("a(b|c)?d", "ad", TRUE
, TRUE
);
344 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE
, FALSE
);
345 REGEX_TESTLM("a(b|c)?d", "ab", FALSE
, FALSE
);
348 // Escape sequences that become single literal chars, handled internally
349 // by ICU's Unescape.
352 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
353 REGEX_TESTLM("\\a", "\\u0007", TRUE
, TRUE
); // BEL
354 REGEX_TESTLM("\\cL", "\\u000c", TRUE
, TRUE
); // Control-L
355 REGEX_TESTLM("\\e", "\\u001b", TRUE
, TRUE
); // Escape
356 REGEX_TESTLM("\\f", "\\u000c", TRUE
, TRUE
); // Form Feed
357 REGEX_TESTLM("\\n", "\\u000a", TRUE
, TRUE
); // new line
358 REGEX_TESTLM("\\r", "\\u000d", TRUE
, TRUE
); // CR
359 REGEX_TESTLM("\\t", "\\u0009", TRUE
, TRUE
); // Tab
360 REGEX_TESTLM("\\u1234", "\\u1234", TRUE
, TRUE
);
361 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE
, TRUE
);
363 REGEX_TESTLM(".*\\Ax", "xyz", TRUE
, FALSE
); // \A matches only at the beginning of input
364 REGEX_TESTLM(".*\\Ax", " xyz", FALSE
, FALSE
); // \A matches only at the beginning of input
366 // Escape of special chars in patterns
367 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE
, TRUE
);
373 //---------------------------------------------------------------------------
375 // API_Match Test that the API for class RegexMatcher
376 // is present and nominally working, but excluding functions
377 // implementing replace operations.
379 //---------------------------------------------------------------------------
380 void RegexTest::API_Match() {
382 UErrorCode status
=U_ZERO_ERROR
;
386 // Debug - slide failing test cases early
395 // Simple pattern compilation
398 UnicodeString
re("abc");
400 pat2
= RegexPattern::compile(re
, flags
, pe
, status
);
403 UnicodeString inStr1
= "abcdef this is a test";
404 UnicodeString instr2
= "not abc";
405 UnicodeString empty
= "";
409 // Matcher creation and reset.
411 RegexMatcher
*m1
= pat2
->matcher(inStr1
, status
);
413 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
414 REGEX_ASSERT(m1
->input() == inStr1
);
416 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
417 REGEX_ASSERT(m1
->input() == instr2
);
419 REGEX_ASSERT(m1
->input() == inStr1
);
420 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
422 REGEX_ASSERT(m1
->lookingAt(status
) == FALSE
);
423 REGEX_ASSERT(m1
->input() == empty
);
424 REGEX_ASSERT(&m1
->pattern() == pat2
);
427 // reset(pos, status)
430 m1
->reset(4, status
);
432 REGEX_ASSERT(m1
->input() == inStr1
);
433 REGEX_ASSERT(m1
->lookingAt(status
) == TRUE
);
435 m1
->reset(-1, status
);
436 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
437 status
= U_ZERO_ERROR
;
439 m1
->reset(0, status
);
441 status
= U_ZERO_ERROR
;
443 int32_t len
= m1
->input().length();
444 m1
->reset(len
-1, status
);
446 status
= U_ZERO_ERROR
;
448 m1
->reset(len
, status
);
449 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
450 status
= U_ZERO_ERROR
;
453 // match(pos, status)
456 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
458 REGEX_ASSERT(m1
->matches(3, status
) == FALSE
);
460 REGEX_ASSERT(m1
->matches(5, status
) == FALSE
);
461 REGEX_ASSERT(m1
->matches(4, status
) == TRUE
);
462 REGEX_ASSERT(m1
->matches(-1, status
) == FALSE
);
463 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
465 // Match() at end of string should fail, but should not be an error.
467 status
= U_ZERO_ERROR
;
468 len
= m1
->input().length();
469 REGEX_ASSERT(m1
->matches(len
, status
) == FALSE
);
472 // Match beyond end of string should fail with an error.
473 status
= U_ZERO_ERROR
;
474 REGEX_ASSERT(m1
->matches(len
+1, status
) == FALSE
);
475 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
477 // Successful match at end of string.
479 status
= U_ZERO_ERROR
;
480 RegexMatcher
m("A?", 0, status
); // will match zero length string.
483 len
= inStr1
.length();
484 REGEX_ASSERT(m
.matches(len
, status
) == TRUE
);
487 REGEX_ASSERT(m
.matches(0, status
) == TRUE
);
493 // lookingAt(pos, status)
495 status
= U_ZERO_ERROR
;
496 m1
->reset(instr2
); // "not abc"
497 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
498 REGEX_ASSERT(m1
->lookingAt(5, status
) == FALSE
);
499 REGEX_ASSERT(m1
->lookingAt(3, status
) == FALSE
);
500 REGEX_ASSERT(m1
->lookingAt(4, status
) == TRUE
);
501 REGEX_ASSERT(m1
->lookingAt(-1, status
) == FALSE
);
502 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
503 status
= U_ZERO_ERROR
;
504 len
= m1
->input().length();
505 REGEX_ASSERT(m1
->lookingAt(len
, status
) == FALSE
);
507 REGEX_ASSERT(m1
->lookingAt(len
+1, status
) == FALSE
);
508 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
517 // RegexMatcher::start();
518 // RegexMatcher::end();
519 // RegexMatcher::groupCount();
524 UErrorCode status
=U_ZERO_ERROR
;
526 UnicodeString
re("01(23(45)67)(.*)");
527 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
529 UnicodeString data
= "0123456789";
531 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
533 REGEX_ASSERT(matcher
->lookingAt(status
) == TRUE
);
534 static const int32_t matchStarts
[] = {0, 2, 4, 8};
535 static const int32_t matchEnds
[] = {10, 8, 6, 10};
537 for (i
=0; i
<4; i
++) {
538 int32_t actualStart
= matcher
->start(i
, status
);
540 if (actualStart
!= matchStarts
[i
]) {
541 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
542 __LINE__
, i
, matchStarts
[i
], actualStart
);
544 int32_t actualEnd
= matcher
->end(i
, status
);
546 if (actualEnd
!= matchEnds
[i
]) {
547 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
548 __LINE__
, i
, matchEnds
[i
], actualEnd
);
552 REGEX_ASSERT(matcher
->start(0, status
) == matcher
->start(status
));
553 REGEX_ASSERT(matcher
->end(0, status
) == matcher
->end(status
));
555 REGEX_ASSERT_FAIL(matcher
->start(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
556 REGEX_ASSERT_FAIL(matcher
->start( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
558 REGEX_ASSERT_FAIL(matcher
->start( 0, status
), U_REGEX_INVALID_STATE
);
560 matcher
->lookingAt(status
);
561 REGEX_ASSERT(matcher
->group(status
) == "0123456789");
562 REGEX_ASSERT(matcher
->group(0, status
) == "0123456789");
563 REGEX_ASSERT(matcher
->group(1, status
) == "234567" );
564 REGEX_ASSERT(matcher
->group(2, status
) == "45" );
565 REGEX_ASSERT(matcher
->group(3, status
) == "89" );
567 REGEX_ASSERT_FAIL(matcher
->group(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
568 REGEX_ASSERT_FAIL(matcher
->group( 4, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
570 REGEX_ASSERT_FAIL(matcher
->group( 0, status
), U_REGEX_INVALID_STATE
);
583 UErrorCode status
=U_ZERO_ERROR
;
585 UnicodeString
re("abc");
586 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
588 UnicodeString data
= ".abc..abc...abc..";
589 // 012345678901234567
591 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
593 REGEX_ASSERT(matcher
->find());
594 REGEX_ASSERT(matcher
->start(status
) == 1);
595 REGEX_ASSERT(matcher
->find());
596 REGEX_ASSERT(matcher
->start(status
) == 6);
597 REGEX_ASSERT(matcher
->find());
598 REGEX_ASSERT(matcher
->start(status
) == 12);
599 REGEX_ASSERT(matcher
->find() == FALSE
);
600 REGEX_ASSERT(matcher
->find() == FALSE
);
603 REGEX_ASSERT(matcher
->find());
604 REGEX_ASSERT(matcher
->start(status
) == 1);
606 REGEX_ASSERT(matcher
->find(0, status
));
607 REGEX_ASSERT(matcher
->start(status
) == 1);
608 REGEX_ASSERT(matcher
->find(1, status
));
609 REGEX_ASSERT(matcher
->start(status
) == 1);
610 REGEX_ASSERT(matcher
->find(2, status
));
611 REGEX_ASSERT(matcher
->start(status
) == 6);
612 REGEX_ASSERT(matcher
->find(12, status
));
613 REGEX_ASSERT(matcher
->start(status
) == 12);
614 REGEX_ASSERT(matcher
->find(13, status
) == FALSE
);
615 REGEX_ASSERT(matcher
->find(16, status
) == FALSE
);
616 REGEX_ASSERT(matcher
->find(17, status
) == FALSE
);
617 REGEX_ASSERT_FAIL(matcher
->start(status
), U_REGEX_INVALID_STATE
);
619 status
= U_ZERO_ERROR
;
620 REGEX_ASSERT_FAIL(matcher
->find(-1, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
621 status
= U_ZERO_ERROR
;
622 REGEX_ASSERT_FAIL(matcher
->find(18, status
), U_INDEX_OUTOFBOUNDS_ERROR
);
624 REGEX_ASSERT(matcher
->groupCount() == 0);
632 // find, with \G in pattern (true if at the end of a previous match).
637 UErrorCode status
=U_ZERO_ERROR
;
639 UnicodeString
re(".*?(?:(\\Gabc)|(abc))", -1, US_INV
);
640 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
642 UnicodeString data
= ".abcabc.abc..";
643 // 012345678901234567
645 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
647 REGEX_ASSERT(matcher
->find());
648 REGEX_ASSERT(matcher
->start(status
) == 0);
649 REGEX_ASSERT(matcher
->start(1, status
) == -1);
650 REGEX_ASSERT(matcher
->start(2, status
) == 1);
652 REGEX_ASSERT(matcher
->find());
653 REGEX_ASSERT(matcher
->start(status
) == 4);
654 REGEX_ASSERT(matcher
->start(1, status
) == 4);
655 REGEX_ASSERT(matcher
->start(2, status
) == -1);
663 // find with zero length matches, match position should bump ahead
668 UErrorCode status
=U_ZERO_ERROR
;
669 RegexMatcher
m("(?= ?)", 0, status
); // This pattern will produce zero-length matches anywhere,
670 // using an always-true look-ahead.
672 UnicodeString
s(" ");
675 if (m
.find() == FALSE
) {
678 REGEX_ASSERT(m
.start(status
) == i
);
679 REGEX_ASSERT(m
.end(status
) == i
);
683 // Check that the bump goes over surrogate pairs OK
684 s
= UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
688 if (m
.find() == FALSE
) {
691 REGEX_ASSERT(m
.start(status
) == i
);
692 REGEX_ASSERT(m
.end(status
) == i
);
697 // find() loop breaking test.
698 // with pattern of /.?/, should see a series of one char matches, then a single
699 // match of zero length at the end of the input string.
701 UErrorCode status
=U_ZERO_ERROR
;
702 RegexMatcher
m(".?", 0, status
);
704 UnicodeString
s(" ");
707 if (m
.find() == FALSE
) {
710 REGEX_ASSERT(m
.start(status
) == i
);
711 REGEX_ASSERT(m
.end(status
) == (i
<4 ? i
+1 : i
));
718 // Matchers with no input string behave as if they had an empty input string.
722 UErrorCode status
= U_ZERO_ERROR
;
723 RegexMatcher
m(".?", 0, status
);
725 REGEX_ASSERT(m
.find());
726 REGEX_ASSERT(m
.start(status
) == 0);
727 REGEX_ASSERT(m
.input() == "");
730 UErrorCode status
= U_ZERO_ERROR
;
731 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
732 RegexMatcher
*m
= p
->matcher(status
);
735 REGEX_ASSERT(m
->find() == FALSE
);
736 REGEX_ASSERT(m
->input() == "");
745 UErrorCode status
= U_ZERO_ERROR
;
746 UnicodeString
testString("This is test data");
747 RegexMatcher
m(".*", testString
, 0, status
);
749 REGEX_ASSERT(m
.regionStart() == 0);
750 REGEX_ASSERT(m
.regionEnd() == testString
.length());
751 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
752 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
754 m
.region(2,4, status
);
756 REGEX_ASSERT(m
.matches(status
));
757 REGEX_ASSERT(m
.start(status
)==2);
758 REGEX_ASSERT(m
.end(status
)==4);
762 REGEX_ASSERT(m
.regionStart() == 0);
763 REGEX_ASSERT(m
.regionEnd() == testString
.length());
765 UnicodeString
shorterString("short");
766 m
.reset(shorterString
);
767 REGEX_ASSERT(m
.regionStart() == 0);
768 REGEX_ASSERT(m
.regionEnd() == shorterString
.length());
770 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
771 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(FALSE
));
772 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
773 REGEX_ASSERT(&m
== &m
.reset());
774 REGEX_ASSERT(m
.hasAnchoringBounds() == FALSE
);
776 REGEX_ASSERT(&m
== &m
.useAnchoringBounds(TRUE
));
777 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
778 REGEX_ASSERT(&m
== &m
.reset());
779 REGEX_ASSERT(m
.hasAnchoringBounds() == TRUE
);
781 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
782 REGEX_ASSERT(&m
== &m
.useTransparentBounds(TRUE
));
783 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
784 REGEX_ASSERT(&m
== &m
.reset());
785 REGEX_ASSERT(m
.hasTransparentBounds() == TRUE
);
787 REGEX_ASSERT(&m
== &m
.useTransparentBounds(FALSE
));
788 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
789 REGEX_ASSERT(&m
== &m
.reset());
790 REGEX_ASSERT(m
.hasTransparentBounds() == FALSE
);
795 // hitEnd() and requireEnd()
798 UErrorCode status
= U_ZERO_ERROR
;
799 UnicodeString
testString("aabb");
800 RegexMatcher
m1(".*", testString
, 0, status
);
801 REGEX_ASSERT(m1
.lookingAt(status
) == TRUE
);
802 REGEX_ASSERT(m1
.hitEnd() == TRUE
);
803 REGEX_ASSERT(m1
.requireEnd() == FALSE
);
806 status
= U_ZERO_ERROR
;
807 RegexMatcher
m2("a*", testString
, 0, status
);
808 REGEX_ASSERT(m2
.lookingAt(status
) == TRUE
);
809 REGEX_ASSERT(m2
.hitEnd() == FALSE
);
810 REGEX_ASSERT(m2
.requireEnd() == FALSE
);
813 status
= U_ZERO_ERROR
;
814 RegexMatcher
m3(".*$", testString
, 0, status
);
815 REGEX_ASSERT(m3
.lookingAt(status
) == TRUE
);
816 REGEX_ASSERT(m3
.hitEnd() == TRUE
);
817 REGEX_ASSERT(m3
.requireEnd() == TRUE
);
823 // Compilation error on reset with UChar *
824 // These were a hazard that people were stumbling over with runtime errors.
825 // Changed them to compiler errors by adding private methods that more closely
826 // matched the incorrect use of the functions.
830 UErrorCode status
= U_ZERO_ERROR
;
831 UChar ucharString
[20];
832 RegexMatcher
m(".", 0, status
);
833 m
.reset(ucharString
); // should not compile.
835 RegexPattern
*p
= RegexPattern::compile(".", 0, status
);
836 RegexMatcher
*m2
= p
->matcher(ucharString
, status
); // should not compile.
838 RegexMatcher
m3(".", ucharString
, 0, status
); // Should not compile
844 // Note: These tests will need to be changed when the regexp engine is
845 // able to detect and cut short the exponential time behavior on
846 // this type of match.
849 UErrorCode status
= U_ZERO_ERROR
;
850 // Enough 'a's in the string to cause the match to time out.
851 // (Each additional 'a' doubles the time)
852 UnicodeString
testString("aaaaaaaaaaaaaaaaaaaaa");
853 RegexMatcher
matcher("(a+)+b", testString
, 0, status
);
855 REGEX_ASSERT(matcher
.getTimeLimit() == 0);
856 matcher
.setTimeLimit(100, status
);
857 REGEX_ASSERT(matcher
.getTimeLimit() == 100);
858 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
859 REGEX_ASSERT(status
== U_REGEX_TIME_OUT
);
862 UErrorCode status
= U_ZERO_ERROR
;
863 // Few enough 'a's to slip in under the time limit.
864 UnicodeString
testString("aaaaaaaaaaaaaaaaaa");
865 RegexMatcher
matcher("(a+)+b", testString
, 0, status
);
867 matcher
.setTimeLimit(100, status
);
868 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
876 UErrorCode status
= U_ZERO_ERROR
;
877 UnicodeString
testString(600000, 0x41, 600000); // Length 600,000, filled with 'A'
879 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
880 // of the '+', and makes the stack frames larger.
881 RegexMatcher
matcher("(A)+A$", testString
, 0, status
);
883 // With the default stack, this match should fail to run
884 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
885 REGEX_ASSERT(status
== U_REGEX_STACK_OVERFLOW
);
887 // With unlimited stack, it should run
888 status
= U_ZERO_ERROR
;
889 matcher
.setStackLimit(0, status
);
891 REGEX_ASSERT(matcher
.lookingAt(status
) == TRUE
);
893 REGEX_ASSERT(matcher
.getStackLimit() == 0);
895 // With a limited stack, the match should fail
896 status
= U_ZERO_ERROR
;
897 matcher
.setStackLimit(10000, status
);
898 REGEX_ASSERT(matcher
.lookingAt(status
) == FALSE
);
899 REGEX_ASSERT(status
== U_REGEX_STACK_OVERFLOW
);
900 REGEX_ASSERT(matcher
.getStackLimit() == 10000);
903 // A pattern that doesn't save state should work with
904 // a minimal sized stack
906 UErrorCode status
= U_ZERO_ERROR
;
907 UnicodeString testString
= "abc";
908 RegexMatcher
matcher("abc", testString
, 0, status
);
910 matcher
.setStackLimit(30, status
);
912 REGEX_ASSERT(matcher
.matches(status
) == TRUE
);
914 REGEX_ASSERT(matcher
.getStackLimit() == 30);
916 // Negative stack sizes should fail
917 status
= U_ZERO_ERROR
;
918 matcher
.setStackLimit(1000, status
);
920 matcher
.setStackLimit(-1, status
);
921 REGEX_ASSERT(status
== U_ILLEGAL_ARGUMENT_ERROR
);
922 REGEX_ASSERT(matcher
.getStackLimit() == 1000);
933 //---------------------------------------------------------------------------
935 // API_Replace API test for class RegexMatcher, testing the
936 // Replace family of functions.
938 //---------------------------------------------------------------------------
939 void RegexTest::API_Replace() {
945 UErrorCode status
=U_ZERO_ERROR
;
947 UnicodeString
re("abc");
948 RegexPattern
*pat
= RegexPattern::compile(re
, flags
, pe
, status
);
950 UnicodeString data
= ".abc..abc...abc..";
951 // 012345678901234567
952 RegexMatcher
*matcher
= pat
->matcher(data
, status
);
955 // Plain vanilla matches.
958 dest
= matcher
->replaceFirst("yz", status
);
960 REGEX_ASSERT(dest
== ".yz..abc...abc..");
962 dest
= matcher
->replaceAll("yz", status
);
964 REGEX_ASSERT(dest
== ".yz..yz...yz..");
967 // Plain vanilla non-matches.
969 UnicodeString d2
= ".abx..abx...abx..";
971 dest
= matcher
->replaceFirst("yz", status
);
973 REGEX_ASSERT(dest
== ".abx..abx...abx..");
975 dest
= matcher
->replaceAll("yz", status
);
977 REGEX_ASSERT(dest
== ".abx..abx...abx..");
980 // Empty source string
982 UnicodeString d3
= "";
984 dest
= matcher
->replaceFirst("yz", status
);
986 REGEX_ASSERT(dest
== "");
988 dest
= matcher
->replaceAll("yz", status
);
990 REGEX_ASSERT(dest
== "");
993 // Empty substitution string
995 matcher
->reset(data
); // ".abc..abc...abc.."
996 dest
= matcher
->replaceFirst("", status
);
998 REGEX_ASSERT(dest
== "...abc...abc..");
1000 dest
= matcher
->replaceAll("", status
);
1002 REGEX_ASSERT(dest
== "........");
1005 // match whole string
1007 UnicodeString d4
= "abc";
1009 dest
= matcher
->replaceFirst("xyz", status
);
1011 REGEX_ASSERT(dest
== "xyz");
1013 dest
= matcher
->replaceAll("xyz", status
);
1015 REGEX_ASSERT(dest
== "xyz");
1018 // Capture Group, simple case
1020 UnicodeString
re2("a(..)");
1021 RegexPattern
*pat2
= RegexPattern::compile(re2
, flags
, pe
, status
);
1023 UnicodeString d5
= "abcdefg";
1024 RegexMatcher
*matcher2
= pat2
->matcher(d5
, status
);
1026 dest
= matcher2
->replaceFirst("$1$1", status
);
1028 REGEX_ASSERT(dest
== "bcbcdefg");
1030 dest
= matcher2
->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status
);
1032 REGEX_ASSERT(dest
== "The value of $1 is bc.defg");
1034 dest
= matcher2
->replaceFirst("$ by itself, no group number $$$", status
);
1036 REGEX_ASSERT(dest
== "$ by itself, no group number $$$defg");
1038 UnicodeString replacement
= UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1039 replacement
= replacement
.unescape();
1040 dest
= matcher2
->replaceFirst(replacement
, status
);
1042 REGEX_ASSERT(dest
== "Supplemental Digit 1 bc.defg");
1044 REGEX_ASSERT_FAIL(matcher2
->replaceFirst("bad capture group number $5...",status
), U_INDEX_OUTOFBOUNDS_ERROR
);
1048 // Replacement String with \u hex escapes
1051 UnicodeString src
= "abc 1 abc 2 abc 3";
1052 UnicodeString substitute
= UNICODE_STRING_SIMPLE("--\\u0043--");
1053 matcher
->reset(src
);
1054 UnicodeString result
= matcher
->replaceAll(substitute
, status
);
1056 REGEX_ASSERT(result
== "--C-- 1 --C-- 2 --C-- 3");
1059 UnicodeString src
= "abc !";
1060 UnicodeString substitute
= UNICODE_STRING_SIMPLE("--\\U00010000--");
1061 matcher
->reset(src
);
1062 UnicodeString result
= matcher
->replaceAll(substitute
, status
);
1064 UnicodeString expected
= UnicodeString("--");
1065 expected
.append((UChar32
)0x10000);
1066 expected
.append("-- !");
1067 REGEX_ASSERT(result
== expected
);
1069 // TODO: need more thorough testing of capture substitutions.
1074 status
= U_ZERO_ERROR
;
1075 UnicodeString s
= "The matches start with ss and end with ee ss stuff ee fin";
1076 RegexMatcher
m("ss(.*?)ee", 0, status
);
1078 UnicodeString result
;
1080 // Multiple finds do NOT bump up the previous appendReplacement position.
1084 m
.appendReplacement(result
, "ooh", status
);
1086 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
1088 // After a reset into the interior of a string, appendReplacement still starts at beginning.
1089 status
= U_ZERO_ERROR
;
1091 m
.reset(10, status
);
1094 m
.appendReplacement(result
, "ooh", status
);
1096 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
1098 // find() at interior of string, appendReplacement still starts at beginning.
1099 status
= U_ZERO_ERROR
;
1104 m
.appendReplacement(result
, "ooh", status
);
1106 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh");
1108 m
.appendTail(result
);
1109 REGEX_ASSERT(result
== "The matches start with ss and end with ee ooh fin");
1120 //---------------------------------------------------------------------------
1122 // API_Pattern Test that the API for class RegexPattern is
1123 // present and nominally working.
1125 //---------------------------------------------------------------------------
// API_Pattern: exercises the public API of RegexPattern. The checks below cover
// operator== / operator!=, copy construction, compile() with and without flags,
// flags(), clone(), a matcher created from a cloned pattern, the static matches()
// convenience function, split() with plain and capturing patterns, pattern(),
// and the dynamic/static class ID functions.
// Failures are reported through the REGEX_ASSERT macro.
1126 void RegexTest::API_Pattern() {
1127 RegexPattern pata
; // Test default constructor to not crash.
1130 REGEX_ASSERT(pata
== patb
);
1131 REGEX_ASSERT(pata
== pata
);
1133 UnicodeString
re1("abc[a-l][m-z]");
1134 UnicodeString
re2("def");
1135 UErrorCode status
= U_ZERO_ERROR
;
1138 RegexPattern
*pat1
= RegexPattern::compile(re1
, 0, pe
, status
);
1139 RegexPattern
*pat2
= RegexPattern::compile(re2
, 0, pe
, status
);
1141 REGEX_ASSERT(*pat1
== *pat1
);
1142 REGEX_ASSERT(*pat1
!= pata
);
1146 REGEX_ASSERT(patb
== *pat1
);
1149 RegexPattern
patc(*pat1
);
1150 REGEX_ASSERT(patc
== *pat1
);
1151 REGEX_ASSERT(patb
== patc
);
// Note: pat1 and pat2 are pointers, so this compares the pointer values,
// not the pattern objects they point to.
1152 REGEX_ASSERT(pat1
!= pat2
);
1154 REGEX_ASSERT(patb
!= patc
);
1155 REGEX_ASSERT(patb
== *pat2
);
1157 // Compile with no flags.
1158 RegexPattern
*pat1a
= RegexPattern::compile(re1
, pe
, status
);
1159 REGEX_ASSERT(*pat1a
== *pat1
);
1161 REGEX_ASSERT(pat1a
->flags() == 0);
1163 // Compile with different flags should be not equal
1164 RegexPattern
*pat1b
= RegexPattern::compile(re1
, UREGEX_CASE_INSENSITIVE
, pe
, status
);
1167 REGEX_ASSERT(*pat1b
!= *pat1a
);
1168 REGEX_ASSERT(pat1b
->flags() == UREGEX_CASE_INSENSITIVE
);
1169 REGEX_ASSERT(pat1a
->flags() == 0);
// A clone must compare equal to its source and unequal to a different pattern.
1173 RegexPattern
*pat1c
= pat1
->clone();
1174 REGEX_ASSERT(*pat1c
== *pat1
);
1175 REGEX_ASSERT(*pat1c
!= *pat2
);
1184 // Verify that a matcher created from a cloned pattern works.
1188 UErrorCode status
= U_ZERO_ERROR
;
1189 RegexPattern
*pSource
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status
);
1190 RegexPattern
*pClone
= pSource
->clone();
1192 RegexMatcher
*mFromClone
= pClone
->matcher(status
);
1194 UnicodeString s
= "Hello World";
1195 mFromClone
->reset(s
);
1196 REGEX_ASSERT(mFromClone
->find() == TRUE
);
1197 REGEX_ASSERT(mFromClone
->group(status
) == "Hello");
1198 REGEX_ASSERT(mFromClone
->find() == TRUE
);
1199 REGEX_ASSERT(mFromClone
->group(status
) == "World");
1200 REGEX_ASSERT(mFromClone
->find() == FALSE
);
1206 // matches convenience API
1208 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe
, status
) == TRUE
);
1210 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe
, status
) == FALSE
);
1212 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe
, status
) == TRUE
);
1214 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe
, status
) == TRUE
);
1216 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe
, status
) == FALSE
);
// With a failure code already in status on entry, matches() is expected to
// return FALSE and to leave the incoming status value untouched.
1218 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1219 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe
, status
) == FALSE
);
1220 REGEX_ASSERT(status
== U_INDEX_OUTOFBOUNDS_ERROR
);
// split() tests. Each call passes the destination array and its capacity;
// the asserts check the resulting field contents, including entries beyond
// the requested capacity that must keep their previous values.
1226 status
= U_ZERO_ERROR
;
1227 pat1
= RegexPattern::compile(" +", pe
, status
);
1229 UnicodeString fields
[10];
1232 n
= pat1
->split("Now is the time", fields
, 10, status
);
1235 REGEX_ASSERT(fields
[0]=="Now");
1236 REGEX_ASSERT(fields
[1]=="is");
1237 REGEX_ASSERT(fields
[2]=="the");
1238 REGEX_ASSERT(fields
[3]=="time");
1239 REGEX_ASSERT(fields
[4]=="");
1241 n
= pat1
->split("Now is the time", fields
, 2, status
);
1244 REGEX_ASSERT(fields
[0]=="Now");
1245 REGEX_ASSERT(fields
[1]=="is the time");
1246 REGEX_ASSERT(fields
[2]=="the"); // left over from previous test
1249 status
= U_ZERO_ERROR
;
1250 n
= pat1
->split("Now is the time", fields
, 1, status
);
1253 REGEX_ASSERT(fields
[0]=="Now is the time");
1254 REGEX_ASSERT(fields
[1]=="*");
1255 status
= U_ZERO_ERROR
;
1257 n
= pat1
->split(" Now is the time ", fields
, 10, status
);
1260 REGEX_ASSERT(fields
[0]=="");
1261 REGEX_ASSERT(fields
[1]=="Now");
1262 REGEX_ASSERT(fields
[2]=="is");
1263 REGEX_ASSERT(fields
[3]=="the");
1264 REGEX_ASSERT(fields
[4]=="time");
1265 REGEX_ASSERT(fields
[5]=="");
1267 n
= pat1
->split(" ", fields
, 10, status
);
1270 REGEX_ASSERT(fields
[0]=="");
1273 n
= pat1
->split("", fields
, 10, status
);
1276 REGEX_ASSERT(fields
[0]=="foo");
1280 // split, with a pattern with (capture)
1281 pat1
= RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe
, status
);
1284 status
= U_ZERO_ERROR
;
1285 n
= pat1
->split("<a>Now is <b>the time<c>", fields
, 10, status
);
1288 REGEX_ASSERT(fields
[0]=="");
1289 REGEX_ASSERT(fields
[1]=="a");
1290 REGEX_ASSERT(fields
[2]=="Now is ");
1291 REGEX_ASSERT(fields
[3]=="b");
1292 REGEX_ASSERT(fields
[4]=="the time");
1293 REGEX_ASSERT(fields
[5]=="c");
1294 REGEX_ASSERT(fields
[6]=="");
1295 REGEX_ASSERT(status
==U_ZERO_ERROR
);
1297 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 10, status
);
1300 REGEX_ASSERT(fields
[0]==" ");
1301 REGEX_ASSERT(fields
[1]=="a");
1302 REGEX_ASSERT(fields
[2]=="Now is ");
1303 REGEX_ASSERT(fields
[3]=="b");
1304 REGEX_ASSERT(fields
[4]=="the time");
1305 REGEX_ASSERT(fields
[5]=="c");
1306 REGEX_ASSERT(fields
[6]=="");
1308 status
= U_ZERO_ERROR
;
1310 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 6, status
);
1313 REGEX_ASSERT(fields
[0]==" ");
1314 REGEX_ASSERT(fields
[1]=="a");
1315 REGEX_ASSERT(fields
[2]=="Now is ");
1316 REGEX_ASSERT(fields
[3]=="b");
1317 REGEX_ASSERT(fields
[4]=="the time");
1318 REGEX_ASSERT(fields
[5]=="c");
1319 REGEX_ASSERT(fields
[6]=="foo");
1321 status
= U_ZERO_ERROR
;
1323 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 5, status
);
1326 REGEX_ASSERT(fields
[0]==" ");
1327 REGEX_ASSERT(fields
[1]=="a");
1328 REGEX_ASSERT(fields
[2]=="Now is ");
1329 REGEX_ASSERT(fields
[3]=="b");
1330 REGEX_ASSERT(fields
[4]=="the time<c>");
1331 REGEX_ASSERT(fields
[5]=="foo");
1333 status
= U_ZERO_ERROR
;
1335 n
= pat1
->split(" <a>Now is <b>the time", fields
, 5, status
);
1338 REGEX_ASSERT(fields
[0]==" ");
1339 REGEX_ASSERT(fields
[1]=="a");
1340 REGEX_ASSERT(fields
[2]=="Now is ");
1341 REGEX_ASSERT(fields
[3]=="b");
1342 REGEX_ASSERT(fields
[4]=="the time");
1343 REGEX_ASSERT(fields
[5]=="foo");
1345 status
= U_ZERO_ERROR
;
1346 n
= pat1
->split(" <a>Now is <b>the time<c>", fields
, 4, status
);
1349 REGEX_ASSERT(fields
[0]==" ");
1350 REGEX_ASSERT(fields
[1]=="a");
1351 REGEX_ASSERT(fields
[2]=="Now is ");
1352 REGEX_ASSERT(fields
[3]=="the time<c>");
1353 status
= U_ZERO_ERROR
;
// The separator itself is captured, so the separators show up as fields.
1356 pat1
= RegexPattern::compile("([-,])", pe
, status
);
1358 n
= pat1
->split("1-10,20", fields
, 10, status
);
1361 REGEX_ASSERT(fields
[0]=="1");
1362 REGEX_ASSERT(fields
[1]=="-");
1363 REGEX_ASSERT(fields
[2]=="10");
1364 REGEX_ASSERT(fields
[3]==",");
1365 REGEX_ASSERT(fields
[4]=="20");
1370 // RegexPattern::pattern()
1372 pat1
= new RegexPattern();
1373 REGEX_ASSERT(pat1
->pattern() == "");
1376 pat1
= RegexPattern::compile("(Hello, world)*", pe
, status
);
1378 REGEX_ASSERT(pat1
->pattern() == "(Hello, world)*");
1383 // classID functions
1385 pat1
= RegexPattern::compile("(Hello, world)*", pe
, status
);
1387 REGEX_ASSERT(pat1
->getDynamicClassID() == RegexPattern::getStaticClassID());
1388 REGEX_ASSERT(pat1
->getDynamicClassID() != NULL
);
1389 UnicodeString
Hello("Hello, world.");
1390 RegexMatcher
*m
= pat1
->matcher(Hello
, status
);
1391 REGEX_ASSERT(pat1
->getDynamicClassID() != m
->getDynamicClassID());
1392 REGEX_ASSERT(m
->getDynamicClassID() == RegexMatcher::getStaticClassID());
1393 REGEX_ASSERT(m
->getDynamicClassID() != NULL
);
1399 //---------------------------------------------------------------------------
1401 // Extended A more thorough check for features of regex patterns
1402 // The test cases are in a separate data file,
1403 // source/tests/testdata/regextst.txt
1404 // A description of the test data format is included in that file.
1406 //---------------------------------------------------------------------------
// getPath: builds the path of a test data file in the caller-supplied buffer.
//   buffer    receives the directory returned by IntlTest::getSourceTestData()
//             followed by filename.
//   filename  name of the data file, appended to the directory as-is.
// If getSourceTestData() fails, an error is reported with errln().
// NOTE(review): strcpy/strcat perform no bounds checking; the combined path is
// presumably assumed to fit in the 2048-char buffer - confirm.
1409 RegexTest::getPath(char buffer
[2048], const char *filename
) {
1410 UErrorCode status
=U_ZERO_ERROR
;
1411 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
1412 if (U_FAILURE(status
)) {
1413 errln("ERROR: loadTestData() failed - %s", u_errorName(status
));
1417 strcpy(buffer
, testDataDirectory
);
1418 strcat(buffer
, filename
);
// Extended: data driven test. Reads regextst.txt (converted from UTF-8),
// walks it one line at a time, splits each non-comment line into a quoted
// pattern, a run of flag letters and a quoted match string, and hands those
// three fields plus the line number to regex_find().
1422 void RegexTest::Extended() {
1424 const char *srcPath
;
1425 UErrorCode status
= U_ZERO_ERROR
;
1426 int32_t lineNum
= 0;
1429 // Open and read the test data file.
1431 srcPath
=getPath(tdd
, "regextst.txt");
1433 return; /* something went wrong, error already output */
1437 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, "utf-8", status
);
1438 if (U_FAILURE(status
)) {
1439 return; /* something went wrong, error already output */
1443 // Put the test data into a UnicodeString
1445 UnicodeString
testString(FALSE
, testData
, len
);
// Matchers used to take a test line apart:
//   quotedStuffMat - a field enclosed in matching ', " or / delimiters
//   commentMat     - optional white space and an optional # comment up to end of line
//   flagsMat       - the run of flag letters, plus any unexpected letters after it
//   lineMat        - one line of the test file, without its line ending
1447 RegexMatcher
quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status
);
1448 RegexMatcher
commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status
);
1449 RegexMatcher
flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMvabtyYzZ2-9]*)([:letter:]*)"), 0, status
);
1451 RegexMatcher
lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString
, 0, status
);
1452 UnicodeString testPattern
; // The pattern for test from the test file.
1453 UnicodeString testFlags
; // the flags for a test.
1454 UnicodeString matchString
; // The marked up string to be used as input
1456 if (U_FAILURE(status
)){
1457 dataerrln("Construct RegexMatcher() error.");
1463 // Loop over the test data file, once per line.
1465 while (lineMat
.find()) {
1467 if (U_FAILURE(status
)) {
1468 errln("line %d: ICU Error \"%s\"", lineNum
, u_errorName(status
));
1471 status
= U_ZERO_ERROR
;
1472 UnicodeString testLine
= lineMat
.group(1, status
);
1473 if (testLine
.length() == 0) {
1478 // Parse the test line. Skip blank and comment only lines.
1479 // Separate out the three main fields - pattern, flags, target.
1482 commentMat
.reset(testLine
);
1483 if (commentMat
.lookingAt(status
)) {
1484 // This line is a comment, or blank.
1489 // Pull out the pattern field, remove it from the test file line.
1491 quotedStuffMat
.reset(testLine
);
1492 if (quotedStuffMat
.lookingAt(status
)) {
1493 testPattern
= quotedStuffMat
.group(2, status
);
1494 testLine
.remove(0, quotedStuffMat
.end(0, status
));
1496 errln("Bad pattern (missing quotes?) at test file line %d", lineNum
);
1502 // Pull out the flags from the test file line.
1504 flagsMat
.reset(testLine
);
1505 flagsMat
.lookingAt(status
); // Will always match, possibly an empty string.
1506 testFlags
= flagsMat
.group(1, status
);
// Group 2 is non-empty only when a letter outside the known flag set follows.
1507 if (flagsMat
.group(2, status
).length() > 0) {
1508 errln("Bad Match flag at line %d. Scanning %c\n",
1509 lineNum
, flagsMat
.group(2, status
).charAt(0));
1512 testLine
.remove(0, flagsMat
.end(0, status
));
1515 // Pull out the match string, as a whole.
1516 // We'll process the <tags> later.
1518 quotedStuffMat
.reset(testLine
);
1519 if (quotedStuffMat
.lookingAt(status
)) {
1520 matchString
= quotedStuffMat
.group(2, status
);
1521 testLine
.remove(0, quotedStuffMat
.end(0, status
));
1523 errln("Bad match string at test file line %d", lineNum
);
1528 // The only thing left from the input line should be an optional trailing comment.
1530 commentMat
.reset(testLine
);
1531 if (commentMat
.lookingAt(status
) == FALSE
) {
1532 errln("Line %d: unexpected characters at end of test line.", lineNum
);
// Run the test described by this line.
1539 regex_find(testPattern
, testFlags
, matchString
, lineNum
);
1548 //---------------------------------------------------------------------------
1550 // regex_find(pattern, flags, inputString, lineNumber)
1552 // Function to run a single test from the Extended (data driven) tests.
1553 // See file test/testdata/regextst.txt for a description of the
1554 // pattern and inputString fields, and the allowed flags.
1555 // lineNumber is the source line in regextst.txt of the test.
1557 //---------------------------------------------------------------------------
1560 // Set a value into a UVector at position specified by a decimal number in
1561 // a UnicodeString. This is a utility function needed by the actual test function,
// set: stores val in vec at the position named by the decimal digits in index.
//   vec    vector to update; padded with -1 entries until it is long enough
//          to hold the target position.
//   val    value to store.
//   index  string of decimal digits; each character is converted with
//          u_charDigitValue().
1563 static void set(UVector
&vec
, int32_t val
, UnicodeString index
) {
1564 UErrorCode status
=U_ZERO_ERROR
;
1566 for (int32_t i
=0; i
<index
.length(); i
++) {
1567 int32_t d
=u_charDigitValue(index
.charAt(i
));
// Grow the vector so that position idx exists, then store the value there.
1571 while (vec
.size()<idx
+1) {vec
.addElement(-1, status
);}
1572 vec
.setElementAt(val
, idx
);
// regex_find: runs one data driven test.
//   pattern      the regular expression to compile.
//   flags        single-letter test flags; some select UREGEX_* compile options,
//                others select which match function to call or which matcher
//                properties to verify (see the individual checks below).
//   inputString  the input text, still containing escapes and <n> / </n> / <r> / </r>
//                tags that mark the expected group boundaries and the match region.
// The value `line` used in the error messages identifies the test data line.
// Problems are reported with errln(); on most failures control jumps to the
// cleanupAndReturn label.
1575 void RegexTest::regex_find(const UnicodeString
&pattern
,
1576 const UnicodeString
&flags
,
1577 const UnicodeString
&inputString
,
1579 UnicodeString unEscapedInput
;
1580 UnicodeString deTaggedInput
;
1582 UErrorCode status
= U_ZERO_ERROR
;
1584 RegexPattern
*parsePat
= NULL
;
1585 RegexMatcher
*parseMatcher
= NULL
;
1586 RegexPattern
*callerPattern
= NULL
;
1587 RegexMatcher
*matcher
= NULL
;
1588 UVector
groupStarts(status
);
1589 UVector
groupEnds(status
);
1590 UBool isMatch
= FALSE
;
1591 UBool failed
= FALSE
;
1594 UBool useMatchesFunc
= FALSE
;
1595 UBool useLookingAtFunc
= FALSE
;
1596 int32_t regionStart
= -1;
1597 int32_t regionEnd
= -1;
1600 // Compile the caller's pattern
// Each flag letter below turns on the corresponding UREGEX_* compile option.
1602 uint32_t bflags
= 0;
1603 if (flags
.indexOf((UChar
)0x69) >= 0) { // 'i' flag
1604 bflags
|= UREGEX_CASE_INSENSITIVE
;
1606 if (flags
.indexOf((UChar
)0x78) >= 0) { // 'x' flag
1607 bflags
|= UREGEX_COMMENTS
;
1609 if (flags
.indexOf((UChar
)0x73) >= 0) { // 's' flag
1610 bflags
|= UREGEX_DOTALL
;
1612 if (flags
.indexOf((UChar
)0x6d) >= 0) { // 'm' flag
1613 bflags
|= UREGEX_MULTILINE
;
1616 if (flags
.indexOf((UChar
)0x65) >= 0) { // 'e' flag
1617 bflags
|= UREGEX_ERROR_ON_UNKNOWN_ESCAPES
;
1619 if (flags
.indexOf((UChar
)0x44) >= 0) { // 'D' flag
1620 bflags
|= UREGEX_UNIX_LINES
;
1624 callerPattern
= RegexPattern::compile(pattern
, bflags
, pe
, status
);
1625 if (status
!= U_ZERO_ERROR
) {
1626 #if UCONFIG_NO_BREAK_ITERATION==1
1627 // 'v' test flag means that the test pattern should not compile if ICU was configured
1628 // to not include break iteration. RBBI is needed for Unicode word boundaries.
1629 if (flags
.indexOf((UChar
)0x76) >= 0 /*'v'*/ && status
== U_UNSUPPORTED_ERROR
) {
1630 goto cleanupAndReturn
;
1633 if (flags
.indexOf((UChar
)0x45) >= 0) { // flags contain 'E'
1634 // Expected pattern compilation error.
1635 if (flags
.indexOf((UChar
)0x64) >= 0) { // flags contain 'd'
1636 logln("Pattern Compile returns \"%s\"", u_errorName(status
));
1638 goto cleanupAndReturn
;
1640 // Unexpected pattern compilation error.
1641 errln("Line %d: error %s compiling pattern.", line
, u_errorName(status
));
1642 goto cleanupAndReturn
;
1646 if (flags
.indexOf((UChar
)0x64) >= 0) { // 'd' flag
1647 RegexPatternDump(callerPattern
);
1650 if (flags
.indexOf((UChar
)0x45) >= 0) { // 'E' flag
1651 errln("Expected, but did not get, a pattern compilation error.");
1652 goto cleanupAndReturn
;
1657 // Number of times find() should be called on the test string, default to 1
1660 for (i
=2; i
<=9; i
++) {
1661 if (flags
.indexOf((UChar
)(0x30 + i
)) >= 0) { // digit flag
1662 if (numFinds
!= 1) {
1663 errln("Line %d: more than one digit flag. Scanning %d.", line
, i
);
1664 goto cleanupAndReturn
;
1670 // 'M' flag. Use matches() instead of find()
1671 if (flags
.indexOf((UChar
)0x4d) >= 0) {
1672 useMatchesFunc
= TRUE
;
// 'L' flag. Use lookingAt() instead of find()
1674 if (flags
.indexOf((UChar
)0x4c) >= 0) {
1675 useLookingAtFunc
= TRUE
;
1679 // Find the tags in the input data, remove them, and record the group boundary
1682 parsePat
= RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe
, status
);
1683 REGEX_CHECK_STATUS_L(line
);
1685 unEscapedInput
= inputString
.unescape();
1686 parseMatcher
= parsePat
->matcher(unEscapedInput
, status
);
1687 REGEX_CHECK_STATUS_L(line
);
// Each tag is replaced by nothing, so deTaggedInput.length() at that moment
// is the position the tag marks within the tag-free input.
1688 while(parseMatcher
->find()) {
1689 parseMatcher
->appendReplacement(deTaggedInput
, "", status
);
1691 UnicodeString groupNum
= parseMatcher
->group(2, status
);
1692 if (groupNum
== "r") {
1693 // <r> or </r>, a region specification within the string
1694 if (parseMatcher
->group(1, status
) == "/") {
1695 regionEnd
= deTaggedInput
.length();
1697 regionStart
= deTaggedInput
.length();
1700 // <digits> or </digits>, a group match boundary tag.
1701 if (parseMatcher
->group(1, status
) == "/") {
1702 set(groupEnds
, deTaggedInput
.length(), groupNum
);
1704 set(groupStarts
, deTaggedInput
.length(), groupNum
);
1708 parseMatcher
->appendTail(deTaggedInput
);
1709 REGEX_ASSERT_L(groupStarts
.size() == groupEnds
.size(), line
);
// If either region tag was seen, both must be present and in order.
1710 if ((regionStart
>=0 || regionEnd
>=0) && (regionStart
<0 || regionStart
>regionEnd
)) {
1711 errln("mismatched <r> tags");
1713 goto cleanupAndReturn
;
1718 // Configure the matcher according to the flags specified with this test.
1720 matcher
= callerPattern
->matcher(deTaggedInput
, status
);
1721 REGEX_CHECK_STATUS_L(line
);
1722 if (flags
.indexOf((UChar
)0x74) >= 0) { // 't' trace flag
1723 matcher
->setTrace(TRUE
);
1725 if (regionStart
>=0) {
1726 matcher
->region(regionStart
, regionEnd
, status
);
1727 REGEX_CHECK_STATUS_L(line
);
1729 if (flags
.indexOf((UChar
)0x61) >= 0) { // 'a' anchoring bounds flag
1730 matcher
->useAnchoringBounds(FALSE
);
1732 if (flags
.indexOf((UChar
)0x62) >= 0) { // 'b' transparent bounds flag
1733 matcher
->useTransparentBounds(TRUE
);
1739 // Do a find on the de-tagged input using the caller's pattern
1740 // TODO: error on count>1 and not find().
1741 // error on both matches() and lookingAt().
1743 for (i
=0; i
<numFinds
; i
++) {
1744 if (useMatchesFunc
) {
1745 isMatch
= matcher
->matches(status
);
1746 } else if (useLookingAtFunc
) {
1747 isMatch
= matcher
->lookingAt(status
);
1749 isMatch
= matcher
->find();
1752 matcher
->setTrace(FALSE
);
1755 // Match up the groups from the find() with the groups from the tags
1758 // number of tags should match number of groups from find operation.
1759 // matcher->groupCount does not include group 0, the entire match, hence the +1.
1760 // G option in test means that capture group data is not available in the
1761 // expected results, so the check needs to be suppressed.
1762 if (isMatch
== FALSE
&& groupStarts
.size() != 0) {
1763 errln("Error at line %d: Match expected, but none found.\n", line
);
1765 goto cleanupAndReturn
;
1768 if (flags
.indexOf((UChar
)0x47 /*G*/) >= 0) {
1769 // Only check for match / no match. Don't check capture groups.
1770 if (isMatch
&& groupStarts
.size() == 0) {
1771 errln("Error at line %d: No match expected, but one found.\n", line
);
1774 goto cleanupAndReturn
;
// Compare every group's start and end with the positions recorded from the
// tags; a group without a tag is expected to report -1.
1777 for (i
=0; i
<=matcher
->groupCount(); i
++) {
1778 int32_t expectedStart
= (i
>= groupStarts
.size()? -1 : groupStarts
.elementAti(i
));
1779 if (matcher
->start(i
, status
) != expectedStart
) {
1780 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
1781 line
, i
, expectedStart
, matcher
->start(i
, status
));
1783 goto cleanupAndReturn
; // Good chance of subsequent bogus errors. Stop now.
1785 int32_t expectedEnd
= (i
>= groupEnds
.size()? -1 : groupEnds
.elementAti(i
));
1786 if (matcher
->end(i
, status
) != expectedEnd
) {
1787 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
1788 line
, i
, expectedEnd
, matcher
->end(i
, status
));
1790 // Error on end position; keep going; real error is probably yet to come as group
1791 // end positions work from end of the input data towards the front.
1794 if ( matcher
->groupCount()+1 < groupStarts
.size()) {
1795 errln("Error at line %d: Expected %d capture groups, found %d.",
1796 line
, groupStarts
.size()-1, matcher
->groupCount());
1800 if ((flags
.indexOf((UChar
)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
1801 matcher
->requireEnd() == TRUE
) {
1802 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line
);
1805 if ((flags
.indexOf((UChar
)0x79) >= 0) && // 'y' flag: RequireEnd() == true
1806 matcher
->requireEnd() == FALSE
) {
1807 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line
);
1810 if ((flags
.indexOf((UChar
)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
1811 matcher
->hitEnd() == TRUE
) {
1812 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line
);
1815 if ((flags
.indexOf((UChar
)0x7A) >= 0) && // 'z' flag: hitEnd() == true
1816 matcher
->hitEnd() == FALSE
) {
1817 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line
);
// Echo the test's pattern, flags and input string.
1824 errln((UnicodeString
)"\""+pattern
+(UnicodeString
)"\" "
1825 +flags
+(UnicodeString
)" \""+inputString
+(UnicodeString
)"\"");
1826 // callerPattern->dump();
// Release the heap objects created by this function.
1828 delete parseMatcher
;
1831 delete callerPattern
;
1837 //---------------------------------------------------------------------------
1839 // Errors Check for error handling in patterns.
1841 //---------------------------------------------------------------------------
// Errors: feeds malformed patterns to the regex compiler and checks the
// reported error code.
// NOTE(review): the two integers passed to REGEX_ERR are presumably the expected
// error line and column within the pattern - confirm against the macro definition.
1842 void RegexTest::Errors() {
1843 // \escape sequences that aren't implemented yet.
1844 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
1846 // Missing close parentheses
1847 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN
);
1848 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN
);
1849 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN
);
1851 // Extra close paren
1852 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN
);
1853 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN
);
1854 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN
);
1856 // Look-ahead, Look-behind
1857 // TODO: add tests for unbounded length look-behinds.
1858 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX
); // illegal construct
1860 // Attempt to use non-default flags
// Compiling with this flag combination is expected to report U_REGEX_UNIMPLEMENTED.
1863 UErrorCode status
= U_ZERO_ERROR
;
1864 int32_t flags
= UREGEX_CANON_EQ
|
1865 UREGEX_COMMENTS
| UREGEX_DOTALL
|
1867 RegexPattern
*pat1
= RegexPattern::compile(".*", flags
, pe
, status
);
1868 REGEX_ASSERT(status
== U_REGEX_UNIMPLEMENTED
);
1873 // Quantifiers are allowed only after something that can be quantified.
1874 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX
);
1875 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX
);
1876 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX
);
1878 // Mal-formed {min,max} quantifiers
1879 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL
);
1880 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN
);
1881 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL
);
1882 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL
);
1883 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL
);
1884 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG
);
1885 REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG
); // Overflows int during scan
1886 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG
); // Overflows regex binary format
1887 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG
);
// A quantifier at the very start of the pattern has nothing to apply to.
1890 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX
);
1895 //-------------------------------------------------------------------------------
1897 // Read a text data file, convert it to UChars, and return the data
1898 // in one big UChar * buffer, which the caller must delete.
1900 //--------------------------------------------------------------------------------
// ReadAndConvertFile: reads a whole file into memory and converts it to UChars.
//   fileName     path of the file, opened with fopen() in "rb" mode.
//   ulen         receives the length reported by ucnv_toUChars().
//   defEncoding  converter name used when the data carries no Unicode signature.
//   status       ICU error code; set to U_FILE_ACCESS_ERROR if the file cannot
//                be opened.
// The conversion result is stored in a buffer allocated with new UChar[ulen+1].
1901 UChar
*RegexTest::ReadAndConvertFile(const char *fileName
, int32_t &ulen
,
1902 const char *defEncoding
, UErrorCode
&status
) {
1903 UChar
*retPtr
= NULL
;
1904 char *fileBuf
= NULL
;
1905 UConverter
* conv
= NULL
;
1909 if (U_FAILURE(status
)) {
1916 f
= fopen(fileName
, "rb");
1918 dataerrln("[DATA] Error opening test data file %s\n", fileName
);
1919 status
= U_FILE_ACCESS_ERROR
;
// Determine the file size by seeking to the end, then read the whole file.
1928 fseek( f
, 0, SEEK_END
);
1929 fileSize
= ftell(f
);
1930 fileBuf
= new char[fileSize
];
1931 fseek(f
, 0, SEEK_SET
);
1932 amt_read
= fread(fileBuf
, 1, fileSize
, f
);
// A short read or an empty file is treated as an error.
1933 if (amt_read
!= fileSize
|| fileSize
<= 0) {
1934 errln("Error reading test data file.");
1935 goto cleanUpAndReturn
;
1939 // Look for a Unicode Signature (BOM) on the data just read
1941 int32_t signatureLength
;
1942 const char * fileBufC
;
1943 const char* encoding
;
1946 encoding
= ucnv_detectUnicodeSignature(
1947 fileBuf
, fileSize
, &signatureLength
, &status
);
// Signature found: skip over it. Otherwise fall back to the caller's default
// encoding, and complain if that default is utf-8.
1948 if(encoding
!=NULL
){
1949 fileBufC
+= signatureLength
;
1950 fileSize
-= signatureLength
;
1952 encoding
= defEncoding
;
1953 if (strcmp(encoding
, "utf-8") == 0) {
1954 errln("file %s is missing its BOM", fileName
);
1959 // Open a converter to take the rule file to UTF-16
1961 conv
= ucnv_open(encoding
, &status
);
1962 if (U_FAILURE(status
)) {
1963 goto cleanUpAndReturn
;
1967 // Convert the rules to UChar.
1968 // Preflight first to determine required buffer size.
1970 ulen
= ucnv_toUChars(conv
,
1976 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
1977 // Buffer Overflow is expected from the preflight operation.
1978 status
= U_ZERO_ERROR
;
1980 retPtr
= new UChar
[ulen
+1];
1993 if (U_FAILURE(status
)) {
1994 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
2003 //-------------------------------------------------------------------------------
2005 // PerlTests - Run Perl's regular expression tests
2006 // The input file for this test is re_tests, the standard regular
2007 // expression test data distributed with the Perl source code.
2009 // Here is Perl's description of the test data file:
2011 // # The tests are in a separate file 't/op/re_tests'.
2012 // # Each line in that file is a separate test.
2013 // # There are five columns, separated by tabs.
2015 // # Column 1 contains the pattern, optionally enclosed in C<''>.
2016 // # Modifiers can be put after the closing C<'>.
2018 // # Column 2 contains the string to be matched.
2020 // # Column 3 contains the expected result:
2021 // # y expect a match
2022 // # n expect no match
2023 // # c expect an error
2024 // # B test exposes a known bug in Perl, should be skipped
2025 // # b test exposes a known bug in Perl, should be skipped if noamp
2027 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
2029 // # Column 4 contains a string, usually C<$&>.
2031 // # Column 5 contains the expected result of double-quote
2032 // # interpolating that string after the match, or start of error message.
2034 // # Column 6, if present, contains a reason why the test is skipped.
2035 // # This is printed with "skipped", for harness to pick up.
2037 // # \n in the tests are interpolated, as are variables of the form ${\w+}.
2039 // # If you want to add a regular expression test that can't be expressed
2040 // # in this format, don't add it here: put it in op/pat.t instead.
2042 // For ICU, if field 3 contains an 'i', the test will be skipped.
2043 // The test exposes some known incompatibility between ICU and Perl regexps.
2044 // (The i is in addition to whatever was there before.)
2046 //-------------------------------------------------------------------------------
2047 void RegexTest::PerlTests() {
2049 const char *srcPath
;
2050 UErrorCode status
= U_ZERO_ERROR
;
2054 // Open and read the test data file.
2056 srcPath
=getPath(tdd
, "re_tests.txt");
2058 return; /* something went wrong, error already output */
2062 UChar
*testData
= ReadAndConvertFile(srcPath
, len
, "iso-8859-1", status
);
2063 if (U_FAILURE(status
)) {
2064 return; /* something went wrong, error already output */
2068 // Put the test data into a UnicodeString
2070 UnicodeString
testDataString(FALSE
, testData
, len
);
2073 // Regex to break the input file into lines, and strip the new lines.
2074 // One line per match, capture group one is the desired data.
2076 RegexPattern
* linePat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe
, status
);
2077 if (U_FAILURE(status
)) {
2078 dataerrln("RegexPattern::compile() error");
2081 RegexMatcher
* lineMat
= linePat
->matcher(testDataString
, status
);
2084 // Regex to split a test file line into fields.
2085 // There are six fields, separated by tabs.
2087 RegexPattern
* fieldPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe
, status
);
2090 // Regex to identify test patterns with flag settings, and to separate them.
2091 // Test patterns with flags look like 'pattern'i
2092 // Test patterns without flags are not quoted: pattern
2093 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
2095 RegexPattern
*flagPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe
, status
);
2096 RegexMatcher
* flagMat
= flagPat
->matcher(status
);
2099 // The Perl tests reference several perl-isms, which are evaluated/substituted
2100 // in the test data. Not being perl, this must be done explicitly. Here
2101 // are string constants and REs for these constructs.
2103 UnicodeString
nulnulSrc("${nulnul}");
2104 UnicodeString
nulnul("\\u0000\\u0000", -1, US_INV
);
2105 nulnul
= nulnul
.unescape();
2107 UnicodeString
ffffSrc("${ffff}");
2108 UnicodeString
ffff("\\uffff", -1, US_INV
);
2109 ffff
= ffff
.unescape();
2111 // regexp for $-[0], $+[2], etc.
2112 RegexPattern
*groupsPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe
, status
);
2113 RegexMatcher
*groupsMat
= groupsPat
->matcher(status
);
2115 // regexp for $0, $1, $2, etc.
2116 RegexPattern
*cgPat
= RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe
, status
);
2117 RegexMatcher
*cgMat
= cgPat
->matcher(status
);
2121 // Main Loop for the Perl Tests, runs once per line from the
2124 int32_t lineNum
= 0;
2125 int32_t skippedUnimplementedCount
= 0;
2126 while (lineMat
->find()) {
2130 // Get a line, break it into its fields, do the Perl
2131 // variable substitutions.
2133 UnicodeString line
= lineMat
->group(1, status
);
2134 UnicodeString fields
[7];
2135 fieldPat
->split(line
, fields
, 7, status
);
2137 flagMat
->reset(fields
[0]);
2138 flagMat
->matches(status
);
2139 UnicodeString pattern
= flagMat
->group(2, status
);
2140 pattern
.findAndReplace("${bang}", "!");
2141 pattern
.findAndReplace(nulnulSrc
, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
2142 pattern
.findAndReplace(ffffSrc
, ffff
);
2145 // Identify patterns that include match flag settings,
2146 // split off the flags, remove the extra quotes.
2148 UnicodeString flagStr
= flagMat
->group(3, status
);
2149 if (U_FAILURE(status
)) {
2150 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
2154 const UChar UChar_c
= 0x63; // Char constants for the flag letters.
2155 const UChar UChar_i
= 0x69; // (Damn the lack of Unicode support in C)
2156 const UChar UChar_m
= 0x6d;
2157 const UChar UChar_x
= 0x78;
2158 const UChar UChar_y
= 0x79;
2159 if (flagStr
.indexOf(UChar_i
) != -1) {
2160 flags
|= UREGEX_CASE_INSENSITIVE
;
2162 if (flagStr
.indexOf(UChar_m
) != -1) {
2163 flags
|= UREGEX_MULTILINE
;
2165 if (flagStr
.indexOf(UChar_x
) != -1) {
2166 flags
|= UREGEX_COMMENTS
;
2170 // Compile the test pattern.
2172 status
= U_ZERO_ERROR
;
2173 RegexPattern
*testPat
= RegexPattern::compile(pattern
, flags
, pe
, status
);
2174 if (status
== U_REGEX_UNIMPLEMENTED
) {
2176 // Test of a feature that is planned for ICU, but not yet implemented.
2178 skippedUnimplementedCount
++;
2180 status
= U_ZERO_ERROR
;
2184 if (U_FAILURE(status
)) {
2185 // Some tests are supposed to generate errors.
2186 // Only report an error for tests that are supposed to succeed.
2187 if (fields
[2].indexOf(UChar_c
) == -1 && // Compilation is not supposed to fail AND
2188 fields
[2].indexOf(UChar_i
) == -1) // it's not an accepted ICU incompatibility
2190 errln("line %d: ICU Error \"%s\"\n", lineNum
, u_errorName(status
));
2192 status
= U_ZERO_ERROR
;
2197 if (fields
[2].indexOf(UChar_i
) >= 0) {
2198 // ICU should skip this test.
2203 if (fields
[2].indexOf(UChar_c
) >= 0) {
2204 // This pattern should have caused a compilation error, but didn't/
2205 errln("line %d: Expected a pattern compile error, got success.", lineNum
);
2211 // replace the Perl variables that appear in some of the
2212 // match data strings.
2214 UnicodeString matchString
= fields
[1];
2215 matchString
.findAndReplace(nulnulSrc
, nulnul
);
2216 matchString
.findAndReplace(ffffSrc
, ffff
);
2218 // Replace any \n in the match string with an actual new-line char.
2219 // Don't do full unescape, as this unescapes more than Perl does, which
2220 // causes other spurious failures in the tests.
2221 matchString
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
2226 // Run the test, check for expected match/don't match result.
2228 RegexMatcher
*testMat
= testPat
->matcher(matchString
, status
);
2229 UBool found
= testMat
->find();
2230 UBool expected
= FALSE
;
2231 if (fields
[2].indexOf(UChar_y
) >=0) {
2234 if (expected
!= found
) {
2235 errln("line %d: Expected %smatch, got %smatch",
2236 lineNum
, expected
?"":"no ", found
?"":"no " );
2240 // Don't try to check expected results if there is no match.
2241 // (Some have stuff in the expected fields)
2249 // Interpret the Perl expression from the fourth field of the data file,
2250 // building up an ICU string from the results of the ICU match.
2251 // The Perl expression will contain references to the results of
2252 // a regex match, including the matched string, capture group strings,
2253 // group starting and ending indices, etc.
2255 UnicodeString resultString
;
2256 UnicodeString perlExpr
= fields
[3];
2257 groupsMat
->reset(perlExpr
);
2258 cgMat
->reset(perlExpr
);
2260 while (perlExpr
.length() > 0) {
2261 if (perlExpr
.startsWith("$&")) {
2262 resultString
.append(testMat
->group(status
));
2263 perlExpr
.remove(0, 2);
2266 else if (groupsMat
->lookingAt(status
)) {
2268 UnicodeString digitString
= groupsMat
->group(2, status
);
2270 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
2271 UnicodeString plusOrMinus
= groupsMat
->group(1, status
);
2272 int32_t matchPosition
;
2273 if (plusOrMinus
.compare("+") == 0) {
2274 matchPosition
= testMat
->end(groupNum
, status
);
2276 matchPosition
= testMat
->start(groupNum
, status
);
2278 if (matchPosition
!= -1) {
2279 ICU_Utility::appendNumber(resultString
, matchPosition
);
2281 perlExpr
.remove(0, groupsMat
->end(status
));
2284 else if (cgMat
->lookingAt(status
)) {
2286 UnicodeString digitString
= cgMat
->group(1, status
);
2288 int32_t groupNum
= ICU_Utility::parseNumber(digitString
, t
, 10);
2289 if (U_SUCCESS(status
)) {
2290 resultString
.append(testMat
->group(groupNum
, status
));
2291 status
= U_ZERO_ERROR
;
2293 perlExpr
.remove(0, cgMat
->end(status
));
2296 else if (perlExpr
.startsWith("@-")) {
2298 for (i
=0; i
<=testMat
->groupCount(); i
++) {
2300 resultString
.append(" ");
2302 ICU_Utility::appendNumber(resultString
, testMat
->start(i
, status
));
2304 perlExpr
.remove(0, 2);
2307 else if (perlExpr
.startsWith("@+")) {
2309 for (i
=0; i
<=testMat
->groupCount(); i
++) {
2311 resultString
.append(" ");
2313 ICU_Utility::appendNumber(resultString
, testMat
->end(i
, status
));
2315 perlExpr
.remove(0, 2);
2318 else if (perlExpr
.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
2319 // or as an escaped sequence (e.g. \n)
2320 if (perlExpr
.length() > 1) {
2321 perlExpr
.remove(0, 1); // Remove the '\', but only if not last char.
2323 UChar c
= perlExpr
.charAt(0);
2325 case 'n': c
= '\n'; break;
2326 // add any other escape sequences that show up in the test expected results.
2328 resultString
.append(c
);
2329 perlExpr
.remove(0, 1);
2333 // Any characters from the perl expression that we don't explicitly
2334 // recognize before here are assumed to be literals and copied
2335 // as-is to the expected results.
2336 resultString
.append(perlExpr
.charAt(0));
2337 perlExpr
.remove(0, 1);
2340 if (U_FAILURE(status
)) {
2341 errln("Line %d: ICU Error \"%s\"", lineNum
, u_errorName(status
));
2347 // Expected Results Compare
2349 UnicodeString
expectedS(fields
[4]);
2350 expectedS
.findAndReplace(nulnulSrc
, nulnul
);
2351 expectedS
.findAndReplace(ffffSrc
, ffff
);
2352 expectedS
.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
2355 if (expectedS
.compare(resultString
) != 0) {
2356 err("Line %d: Incorrect perl expression results.", lineNum
);
2357 errln((UnicodeString
)"Expected \""+expectedS
+(UnicodeString
)"\"; got \""+resultString
+(UnicodeString
)"\"");
2365 // All done. Clean up allocated stuff.
2383 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount
);
2389 // Callbacks() Test the callback function.
2390 // When set, callbacks occur periodically during matching operations,
2391 // giving the application code the ability to abort the operation
2392 // before its normal completion.
// Bookkeeping record for the match-callback test. testCallBackFn receives a
// pointer to one of these through its context argument and casts it back.
// Code elsewhere in this file uses the members test, maxCalls, numCalls and
// lastSteps; their declarations are not visible in this listing.
2395 struct callBackContext
{
// reset(max): store max as the new maxCalls and zero both numCalls and
// lastSteps. The test member is not touched here.
2400 void reset(int32_t max
) {maxCalls
=max
; numCalls
=0; lastSteps
=0;};
// testCallBackFn   Match callback used by the Callbacks() test.
//
//    context   opaque pointer; cast below to callBackContext *.
//    steps     step count passed in by the caller of the callback.
//
//    Returns TRUE while info->numCalls is still below info->maxCalls,
//    FALSE otherwise.
2404 static UBool U_CALLCONV
2405 testCallBackFn(const void *context
, int32_t steps
) {
// Recover the test's bookkeeping record from the opaque context pointer.
2406 callBackContext
*info
= (callBackContext
*)context
;
// Each call is expected to carry a step count exactly one greater than the
// previous call's; anything else is reported through the owning test object.
2407 if (info
->lastSteps
+1 != steps
) {
2408 info
->test
->errln("incorrect steps in callback. Expected %d, got %d\n", info
->lastSteps
+1, steps
);
// Remember this step count for the check on the next call.
2410 info
->lastSteps
= steps
;
// NOTE(review): no update of numCalls is visible in this listing between the
// assignment above and the comparison below -- confirm against the full file.
2412 return (info
->numCalls
< info
->maxCalls
);
// RegexTest::Callbacks
//
//    Checks RegexMatcher::getMatchCallback() and setMatchCallback(), then runs
//    three matches (short, medium, long) and checks cbInfo.numCalls and the
//    status after each. Failures are reported through REGEX_ASSERT.
//
//    NOTE(review): this listing re-declares returnedContext, returnedFn, status
//    and matcher further down, so the two halves presumably sit in separate
//    brace-delimited scopes that are not visible here -- confirm against the
//    full file.
2416 void RegexTest::Callbacks() {
2418 // Getter returns NULLs if no callback has been set
2420 // The variables that the getter will fill in.
2421 // Init to non-null values so that the action of the getter can be seen.
2422 const void *returnedContext
= &returnedContext
;
2423 URegexMatchCallback
*returnedFn
= &testCallBackFn
;
2425 UErrorCode status
= U_ZERO_ERROR
;
2426 RegexMatcher
matcher("x", 0, status
);
// Query a matcher on which setMatchCallback() has not been called.
2428 matcher
.getMatchCallback(returnedFn
, returnedContext
, status
);
// Both out-parameters must have been overwritten with NULL.
2430 REGEX_ASSERT(returnedFn
== NULL
);
2431 REGEX_ASSERT(returnedContext
== NULL
);
// Second part: install a callback and check that it is reported back and
// invoked. cbInfo starts with this test object first, followed by three zeros.
2436 callBackContext cbInfo
= {this, 0, 0, 0};
2437 const void *returnedContext
;
2438 URegexMatchCallback
*returnedFn
;
2439 UErrorCode status
= U_ZERO_ERROR
;
2440 RegexMatcher
matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status
); // A pattern that can run long.
// Install testCallBackFn with &cbInfo as its context pointer.
2442 matcher
.setMatchCallback(testCallBackFn
, &cbInfo
, status
);
// The getter must hand back exactly the function and context just set.
2444 matcher
.getMatchCallback(returnedFn
, returnedContext
, status
);
2446 REGEX_ASSERT(returnedFn
== testCallBackFn
);
2447 REGEX_ASSERT(returnedContext
== &cbInfo
);
2449 // A short-running match shouldn't invoke the callback
2450 status
= U_ZERO_ERROR
;
// NOTE(review): the statements that attach s to the matcher and that set
// cbInfo's call limit are not visible in this listing for any of the three
// cases below -- confirm against the full file.
2452 UnicodeString s
= "xxx";
2454 REGEX_ASSERT(matcher
.matches(status
));
2456 REGEX_ASSERT(cbInfo
.numCalls
== 0);
2458 // A medium-length match that runs long enough to invoke the
2459 // callback, but not so long that the callback aborts it.
2460 status
= U_ZERO_ERROR
;
2462 s
= "aaaaaaaaaaaaaaaaaaab";
// The input ends in 'b', so matches() is expected to return FALSE here.
2464 REGEX_ASSERT(matcher
.matches(status
)==FALSE
);
2466 REGEX_ASSERT(cbInfo
.numCalls
> 0);
2468 // A longer running match that the callback function will abort.
2469 status
= U_ZERO_ERROR
;
2471 s
= "aaaaaaaaaaaaaaaaaaaaaaab";
// The match must fail, with status reporting U_REGEX_STOPPED_BY_CALLER, and
// the callback must have been called exactly 4 times.
2473 REGEX_ASSERT(matcher
.matches(status
)==FALSE
);
2474 REGEX_ASSERT(status
== U_REGEX_STOPPED_BY_CALLER
);
2475 REGEX_ASSERT(cbInfo
.numCalls
== 4);
2481 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */