]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/regextst.cpp
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / test / intltest / regextst.cpp
1 /********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 2002-2003, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6
7 //
8 // regextst.cpp
9 //
10 // ICU Regular Expressions test, part of intltest.
11 //
12
13 #include "unicode/utypes.h"
14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
15
16 #include "unicode/uchar.h"
17 #include "unicode/ucnv.h"
18 #include "intltest.h"
19 #include "regextst.h"
20 #include "uvector.h"
21 #include "stdlib.h"
22 #include "charstr.h"
23 #include "util.h"
24
25
26 //---------------------------------------------------------------------------
27 //
28 // Test class boilerplate
29 //
30 //---------------------------------------------------------------------------
31 RegexTest::RegexTest()
32 {
33 };
34
35
36 RegexTest::~RegexTest()
37 {
38 };
39
40
41
42 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
43 {
44 if (exec) logln("TestSuite RegexTest: ");
45 switch (index) {
46
47 case 0: name = "Basic";
48 if (exec) Basic();
49 break;
50 case 1: name = "API_Match";
51 if (exec) API_Match();
52 break;
53 case 2: name = "API_Replace";
54 if (exec) API_Replace();
55 break;
56 case 3: name = "API_Pattern";
57 if (exec) API_Pattern();
58 break;
59 case 4: name = "Extended";
60 if (exec) Extended();
61 break;
62 case 5: name = "Errors";
63 if (exec) Errors();
64 break;
65 case 6: name = "PerlTests";
66 if (exec) PerlTests();
67 break;
68
69
70 default: name = "";
71 break; //needed to end loop
72 }
73 }
74
75
76 //---------------------------------------------------------------------------
77 //
78 // Error Checking / Reporting macros used in all of the tests.
79 //
80 //---------------------------------------------------------------------------
// REGEX_CHECK_STATUS
//     Requires a UErrorCode named `status` in the invoking scope. If it holds
//     a failure, reports it (with the name of the error and the line of the
//     macro invocation) and RETURNS from the enclosing function, which must
//     therefore return void.
#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {errln("RegexTest failure at line %d. status=%s\n", \
    __LINE__, u_errorName(status)); return;}}

// REGEX_ASSERT(expr)
//     Reports a failure at the invoking line when `expr` is false.
//     Does not return; the test keeps running.
#define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("RegexTest failure at line %d.\n", __LINE__);};}

// REGEX_ASSERT_FAIL(expr, errcode)
//     Evaluates `expr` with a fresh, macro-local UErrorCode named `status`
//     (it hides any `status` of the invoking scope for the duration of the
//     macro) and reports a failure unless `expr` left exactly `errcode` in it.
//     Does not return.
#define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
    if (status!=errcode) {errln("RegexTest failure at line %d. Expected status=%s, got %s\n", \
    __LINE__, u_errorName(errcode), u_errorName(status));};}

// REGEX_CHECK_STATUS_L(line)
//     Like REGEX_CHECK_STATUS, for use inside helper functions: `line` is the
//     line of the test that called the helper and is reported alongside the
//     helper's own line. Unlike REGEX_CHECK_STATUS this macro does NOT return,
//     so the invoking code must test `status` itself before relying on any
//     object produced by the failed call.
//     The status is reported by name, consistent with the other macros.
#define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
    "RegexTest failure at line %d, from %d. status=%s\n",__LINE__, (line), u_errorName(status)); }}

// REGEX_ASSERT_L(expr, line)
//     Like REGEX_ASSERT, for use inside helper functions; reports both the
//     helper's line and the calling test's `line`. On failure it RETURNS from
//     the enclosing (void) function.
#define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
    errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
95
96
97
98 //---------------------------------------------------------------------------
99 //
100 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests
101 // for the LookingAt() and Match() functions.
102 //
103 // usage:
104 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
105 //
106 // The expected results are UBool - TRUE or FALSE.
107 // The input text is unescaped. The pattern is not.
108 //
109 //
110 //---------------------------------------------------------------------------
111
112 #define REGEX_TESTLM(pat, text, looking, match) doRegexLMTest(pat, text, looking, match, __LINE__);
113
114 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int line) {
115 const UnicodeString pattern(pat);
116 const UnicodeString inputText(text);
117 UErrorCode status = U_ZERO_ERROR;
118 UParseError pe;
119 RegexPattern *REPattern = NULL;
120 RegexMatcher *REMatcher = NULL;
121 UBool retVal = TRUE;
122
123 UnicodeString patString(pat);
124 REPattern = RegexPattern::compile(patString, 0, pe, status);
125 if (U_FAILURE(status)) {
126 errln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s\n",
127 line, u_errorName(status));
128 return FALSE;
129 }
130 if (line==376) { REPattern->dump();}
131
132 UnicodeString inputString(inputText);
133 UnicodeString unEscapedInput = inputString.unescape();
134 REMatcher = REPattern->matcher(unEscapedInput, status);
135 if (U_FAILURE(status)) {
136 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
137 line, u_errorName(status));
138 return FALSE;
139 }
140
141 UBool actualmatch;
142 actualmatch = REMatcher->lookingAt(status);
143 if (U_FAILURE(status)) {
144 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
145 line, u_errorName(status));
146 retVal = FALSE;
147 }
148 if (actualmatch != looking) {
149 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
150 retVal = FALSE;
151 }
152
153 status = U_ZERO_ERROR;
154 actualmatch = REMatcher->matches(status);
155 if (U_FAILURE(status)) {
156 errln("RegexTest failure in matches() at line %d. Status = %s\n",
157 line, u_errorName(status));
158 retVal = FALSE;
159 }
160 if (actualmatch != match) {
161 errln("RegexTest: wrong return from matches() at line %d.\n", line);
162 retVal = FALSE;
163 }
164
165 if (retVal == FALSE) {
166 REPattern->dump();
167 }
168
169 delete REPattern;
170 delete REMatcher;
171 return retVal;
172 }
173
174
175
176
177 //---------------------------------------------------------------------------
178 //
179 // regex_find(pattern, inputString, lineNumber)
180 //
181 // Function to simplify writing regex tests.
182 //
183 // The input text is unescaped. The pattern is not.
184 // The input text is marked with the expected match positions
185 // <0>text <1> more text </1> </0>
186 // The <n> </n> tags are removed before trying the match.
187 // The tags mark the start and end of the match and of any capture groups.
188 //
189 //
190 //---------------------------------------------------------------------------
191
192
193 // Set a value into a UVector at position specified by a decimal number in
194 // a UnicodeString. This is a utility function needed by the actual test function,
195 // which follows.
196 static void set(UVector &vec, int val, UnicodeString index) {
197 UErrorCode status=U_ZERO_ERROR;
198 int idx = 0;
199 for (int i=0; i<index.length(); i++) {
200 int d=u_charDigitValue(index.charAt(i));
201 if (d<0) {return;}
202 idx = idx*10 + d;
203 }
204 while (vec.size()<idx+1) {vec.addElement(-1, status);}
205 vec.setElementAt(val, idx);
206 }
207
208 void RegexTest::regex_find(const UnicodeString &pattern,
209 const UnicodeString &flags,
210 const UnicodeString &inputString,
211 int line) {
212 UnicodeString unEscapedInput;
213 UnicodeString deTaggedInput;
214
215 UErrorCode status = U_ZERO_ERROR;
216 UParseError pe;
217 RegexPattern *parsePat = NULL;
218 RegexMatcher *parseMatcher = NULL;
219 RegexPattern *callerPattern = NULL;
220 RegexMatcher *matcher = NULL;
221 UVector groupStarts(status);
222 UVector groupEnds(status);
223 UBool isMatch;
224 UBool failed = FALSE;
225
226 //
227 // Compile the caller's pattern
228 //
229 uint32_t bflags = 0;
230 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag
231 bflags |= UREGEX_CASE_INSENSITIVE;
232 }
233 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag
234 bflags |= UREGEX_COMMENTS;
235 }
236 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag
237 bflags |= UREGEX_DOTALL;
238 }
239 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag
240 bflags |= UREGEX_MULTILINE;
241 }
242
243
244 callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
245 if (status != U_ZERO_ERROR) {
246 errln("Line %d: error %s compiling pattern.", line, u_errorName(status));
247 goto cleanupAndReturn;
248 }
249
250 if (flags.indexOf((UChar)'d') >= 0) {
251 callerPattern->dump();
252 }
253
254 //
255 // Find the tags in the input data, remove them, and record the group boundary
256 // positions.
257 //
258 parsePat = RegexPattern::compile("<(/?)([0-9]+)>", 0, pe, status);
259 REGEX_CHECK_STATUS_L(line);
260
261 unEscapedInput = inputString.unescape();
262 parseMatcher = parsePat->matcher(unEscapedInput, status);
263 REGEX_CHECK_STATUS_L(line);
264 while(parseMatcher->find()) {
265 parseMatcher->appendReplacement(deTaggedInput, "", status);
266 REGEX_CHECK_STATUS;
267 UnicodeString groupNum = parseMatcher->group(2, status);
268 if (parseMatcher->group(1, status) == "/") {
269 // close tag
270 set(groupEnds, deTaggedInput.length(), groupNum);
271 } else {
272 set(groupStarts, deTaggedInput.length(), groupNum);
273 }
274 }
275 parseMatcher->appendTail(deTaggedInput);
276 REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
277
278
279 //
280 // Do a find on the de-tagged input using the caller's pattern
281 //
282 matcher = callerPattern->matcher(deTaggedInput, status);
283 REGEX_CHECK_STATUS_L(line);
284 if (flags.indexOf((UChar)'t') >= 0) {
285 matcher->setTrace(TRUE);
286 }
287
288 isMatch = matcher->find();
289 matcher->setTrace(FALSE);
290
291 //
292 // Match up the groups from the find() with the groups from the tags
293 //
294
295 // number of tags should match number of groups from find operation.
296 // matcher->groupCount does not include group 0, the entire match, hence the +1.
297 // G option in test means that capture group data is not available in the
298 // expected results, so the check needs to be suppressed.
299 if (isMatch == FALSE && groupStarts.size() != 0) {
300 errln("Error at line %d: Match expected, but none found.\n", line);
301 failed = TRUE;
302 goto cleanupAndReturn;
303 }
304
305 if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
306 // Only check for match / no match. Don't check capture groups.
307 if (isMatch && groupStarts.size() == 0) {
308 errln("Error at line %d: No match expected, but one found.\n", line);
309 failed = TRUE;
310 }
311 goto cleanupAndReturn;
312 }
313
314 int i;
315 for (i=0; i<=matcher->groupCount(); i++) {
316 int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
317 if (matcher->start(i, status) != expectedStart) {
318 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
319 line, i, expectedStart, matcher->start(i, status));
320 failed = TRUE;
321 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
322 }
323 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
324 if (matcher->end(i, status) != expectedEnd) {
325 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
326 line, i, expectedEnd, matcher->end(i, status));
327 failed = TRUE;
328 // Error on end position; keep going; real error is probably yet to come as group
329 // end positions work from end of the input data towards the front.
330 }
331 }
332 if ( matcher->groupCount()+1 < groupStarts.size()) {
333 errln("Error at line %d: Expected %d capture groups, found %d.",
334 line, groupStarts.size()-1, matcher->groupCount());
335 failed = TRUE;
336 }
337
338 cleanupAndReturn:
339 if (failed) {
340 errln("\"%s\" %s \"%s\"", (const char *)CharString(pattern),
341 (const char *)CharString(flags),
342 (const char *)CharString(inputString));
343 // callerPattern->dump();
344 }
345 delete parseMatcher;
346 delete parsePat;
347 delete matcher;
348 delete callerPattern;
349 }
350
351
352
353
354
355
356
357
358 //---------------------------------------------------------------------------
359 //
360 // REGEX_ERR Macro + invocation function to simplify writing
361 // regex tests for incorrect patterns
362 //
363 // usage:
364 // REGEX_ERR("pattern", expected error line, column, expected status);
365 //
366 //---------------------------------------------------------------------------
367 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
368
369 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
370 UErrorCode expectedStatus, int line) {
371 UnicodeString pattern(pat);
372
373 UErrorCode status = U_ZERO_ERROR;
374 UParseError pe;
375 RegexPattern *callerPattern = NULL;
376
377 //
378 // Compile the caller's pattern
379 //
380 UnicodeString patString(pat);
381 callerPattern = RegexPattern::compile(patString, 0, pe, status);
382 if (status != expectedStatus) {
383 errln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
384 } else {
385 if (status != U_ZERO_ERROR) {
386 if (pe.line != errLine || pe.offset != errCol) {
387 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
388 line, errLine, errCol, pe.line, pe.offset);
389 }
390 }
391 }
392
393 delete callerPattern;
394 }
395
396
397
398 //---------------------------------------------------------------------------
399 //
400 // Basic Check for basic functionality of regex pattern matching.
401 // Avoid the use of REGEX_FIND test macro, which has
402 // substantial dependencies on basic Regex functionality.
403 //
404 //---------------------------------------------------------------------------
// Each REGEX_TESTLM line below is one self-contained test case:
//     REGEX_TESTLM(pattern, input, expected lookingAt() result, expected matches() result)
// The input is unescaped before matching; the pattern is used as written.
// Failures are reported with the source line of the REGEX_TESTLM invocation.
405 void RegexTest::Basic() {
406
407
408 //
409 // Debug - slide failing test cases early
410 //
// Disabled (#if 0) scratch area. If enabled it would exit(1) right after the
// block, so none of the tests below would run.
411 #if 0
412 {
413 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
414 UParseError pe;
415 UErrorCode status = U_ZERO_ERROR;
416 RegexPattern::compile("^(?:a?b?)*$", 0, pe, status);
417 // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
418 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
419 }
420 exit(1);
421 #endif
422
423
424 //
425 // Pattern with parentheses
426 //
427 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
428 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
429 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);
430
431 //
432 // Patterns with *
433 //
434 REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
435 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
436 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
437 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
438 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
439
440 REGEX_TESTLM("a*", "", TRUE, TRUE);
441 REGEX_TESTLM("a*", "b", TRUE, FALSE);
442
443
444 //
445 // Patterns with "."
446 //
447 REGEX_TESTLM(".", "abc", TRUE, FALSE);
448 REGEX_TESTLM("...", "abc", TRUE, TRUE);
449 REGEX_TESTLM("....", "abc", FALSE, FALSE);
450 REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
451 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
452 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
453 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
454 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
455
456 //
457 // Patterns with * applied to chars at end of literal string
458 //
459 REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
460 REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
461
462 //
463 // Supplemental chars match as single chars, not a pair of surrogates.
464 //
465 REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
466 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
467 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
468
469
470 //
471 // UnicodeSets in the pattern
472 //
473 REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
474 REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
475 REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
476 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
// NOTE(review): the next case is an exact repeat of the previous one.
477 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
478 REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
479
480 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
481 REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
482 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
483 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurrences.
484 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);
485
486 //
487 // OR operator in patterns
488 //
489 REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
490 REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
491 REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
492 REGEX_TESTLM("a|b", "b", TRUE, TRUE);
493
494 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
495 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
496 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
497 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
498 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
499 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
500
501 //
502 // +
503 //
504 REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
505 REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
506 REGEX_TESTLM("b+", "", FALSE, FALSE);
507 REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
508 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
509 REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
510
511 //
512 // ?
513 //
514 REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
515 REGEX_TESTLM("ab?", "a", TRUE, TRUE);
516 REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
517 REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
518 REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
519 REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
520 REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
521 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
522 REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
523
524 //
525 // Escape sequences that become single literal chars, handled internally
526 // by ICU's Unescape.
527 //
// In these cases the pattern's escape is interpreted by the regular expression
// compiler, while the \u escape in the input is resolved by the test helper's
// unescape() call before matching.
528
529 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
530 REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
531 REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
532 REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
533 REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
534 REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
535 REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
536 REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
537 REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
538 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
539
540 REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
541 REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input
542
543 // Escape of special chars in patterns
544 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
545
546
547 };
548
549
550 //---------------------------------------------------------------------------
551 //
552 // API_Match Test that the API for class RegexMatcher
553 // is present and nominally working, but excluding functions
554 // implementing replace operations.
555 //
556 //---------------------------------------------------------------------------
// The test is a sequence of independent brace-delimited sections. Most of them
// declare their own flags / pe / status, which hide the function-level
// variables of the same name for the duration of the section.
// REGEX_CHECK_STATUS returns from the function on a failing status, so objects
// allocated earlier in the section are not deleted on that path.
557 void RegexTest::API_Match() {
558 UParseError pe;
559 UErrorCode status=U_ZERO_ERROR;
560 int32_t flags = 0;
561
562 //
563 // Debug - slide failing test cases early
564 //
// Disabled (#if 0) scratch area; if enabled, the function would return right
// after it without running the tests below.
565 #if 0
566 {
567 }
568 return;
569 #endif
570
571 //
572 // Simple pattern compilation
573 //
574 {
575 UnicodeString re("abc");
576 RegexPattern *pat2;
577 pat2 = RegexPattern::compile(re, flags, pe, status);
578 REGEX_CHECK_STATUS;
579
580 UnicodeString inStr1 = "abcdef this is a test";
581 UnicodeString instr2 = "not abc";
582 UnicodeString empty = "";
583
584
585 //
586 // Matcher creation and reset.
587 //
// After each reset(newInput) the matcher must report the new input and
// lookingAt() must give the result appropriate for that input.
588 RegexMatcher *m1 = pat2->matcher(inStr1, status);
589 REGEX_CHECK_STATUS;
590 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
591 REGEX_ASSERT(m1->input() == inStr1);
592 m1->reset(instr2);
593 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
594 REGEX_ASSERT(m1->input() == instr2);
595 m1->reset(inStr1);
596 REGEX_ASSERT(m1->input() == inStr1);
597 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
598 m1->reset(empty);
599 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
600 REGEX_ASSERT(m1->input() == empty);
// pattern() must refer to the very RegexPattern object that created the matcher.
601 REGEX_ASSERT(&m1->pattern() == pat2);
602 delete m1;
603 delete pat2;
604 }
605
606
607 //
608 // Capture Group.
609 // RegexMatcher::start();
610 // RegexMatcher::end();
611 // RegexMatcher::groupCount();
612 //
613 {
614 int32_t flags=0;
615 UParseError pe;
616 UErrorCode status=U_ZERO_ERROR;
617
618 UnicodeString re("01(23(45)67)(.*)");
619 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
620 REGEX_CHECK_STATUS;
621 UnicodeString data = "0123456789";
622
623 RegexMatcher *matcher = pat->matcher(data, status);
624 REGEX_CHECK_STATUS;
625 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
// Expected start / end offsets, indexed by group number (0 is the whole match).
626 int matchStarts[] = {0, 2, 4, 8};
627 int matchEnds[] = {10, 8, 6, 10};
628 int i;
629 for (i=0; i<4; i++) {
630 int32_t actualStart = matcher->start(i, status);
631 REGEX_CHECK_STATUS;
632 if (actualStart != matchStarts[i]) {
633 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
634 __LINE__, i, matchStarts[i], actualStart);
635 }
636 int32_t actualEnd = matcher->end(i, status);
637 REGEX_CHECK_STATUS;
638 if (actualEnd != matchEnds[i]) {
639 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
640 __LINE__, i, matchEnds[i], actualEnd);
641 }
642 }
643
// The no-argument forms must agree with group 0.
644 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
645 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
646
// Group numbers outside 0..3 are rejected; after reset() there is no current
// match, so even group 0 is an invalid-state error. REGEX_ASSERT_FAIL uses
// its own local status, leaving this section's status untouched.
647 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
648 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
649 matcher->reset();
650 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
651
// Same checks for the text of the groups.
652 matcher->lookingAt(status);
653 REGEX_ASSERT(matcher->group(status) == "0123456789");
654 REGEX_ASSERT(matcher->group(0, status) == "0123456789");
655 REGEX_ASSERT(matcher->group(1, status) == "234567" );
656 REGEX_ASSERT(matcher->group(2, status) == "45" );
657 REGEX_ASSERT(matcher->group(3, status) == "89" );
658 REGEX_CHECK_STATUS;
659 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
660 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
661 matcher->reset();
662 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
663
664 delete matcher;
665 delete pat;
666
667 }
668
669 //
670 // find
671 //
672 {
673 int32_t flags=0;
674 UParseError pe;
675 UErrorCode status=U_ZERO_ERROR;
676
677 UnicodeString re("abc");
678 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
679 REGEX_CHECK_STATUS;
// "abc" occurs in the data at offsets 1, 6 and 12; the data is 17 chars long.
680 UnicodeString data = ".abc..abc...abc..";
681 // 012345678901234567
682
683 RegexMatcher *matcher = pat->matcher(data, status);
684 REGEX_CHECK_STATUS;
// Successive find() calls walk through the three occurrences, then keep failing.
685 REGEX_ASSERT(matcher->find());
686 REGEX_ASSERT(matcher->start(status) == 1);
687 REGEX_ASSERT(matcher->find());
688 REGEX_ASSERT(matcher->start(status) == 6);
689 REGEX_ASSERT(matcher->find());
690 REGEX_ASSERT(matcher->start(status) == 12);
691 REGEX_ASSERT(matcher->find() == FALSE);
692 REGEX_ASSERT(matcher->find() == FALSE);
693
694 matcher->reset();
695 REGEX_ASSERT(matcher->find());
696 REGEX_ASSERT(matcher->start(status) == 1);
697
// find(startIndex, status): the search begins at the given offset.
698 REGEX_ASSERT(matcher->find(0, status));
699 REGEX_ASSERT(matcher->start(status) == 1);
700 REGEX_ASSERT(matcher->find(1, status));
701 REGEX_ASSERT(matcher->start(status) == 1);
702 REGEX_ASSERT(matcher->find(2, status));
703 REGEX_ASSERT(matcher->start(status) == 6);
704 REGEX_ASSERT(matcher->find(12, status));
705 REGEX_ASSERT(matcher->start(status) == 12);
706 REGEX_ASSERT(matcher->find(13, status) == FALSE);
707 REGEX_ASSERT(matcher->find(16, status) == FALSE);
// After a failed find there is no current match to query.
708 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
709 REGEX_CHECK_STATUS;
710
// Start offsets below 0 or above 16 are rejected for this 17-char input.
711 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
712 REGEX_ASSERT_FAIL(matcher->find(17, status), U_INDEX_OUTOFBOUNDS_ERROR);
713
714 REGEX_ASSERT(matcher->groupCount() == 0);
715
716 delete matcher;
717 delete pat;
718 }
719
720
721 //
722 // find, with \G in pattern (true if at the end of a previous match).
723 //
724 {
725 int32_t flags=0;
726 UParseError pe;
727 UErrorCode status=U_ZERO_ERROR;
728
729 UnicodeString re(".*?(?:(\\Gabc)|(abc))");
730 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
731 REGEX_CHECK_STATUS;
732 UnicodeString data = ".abcabc.abc..";
733 // 012345678901234567
734
735 RegexMatcher *matcher = pat->matcher(data, status);
736 REGEX_CHECK_STATUS;
// First find: the "abc" at offset 1 is expected in group 2 (the alternative
// without \G); group 1 did not participate, so its start is -1.
737 REGEX_ASSERT(matcher->find());
738 REGEX_ASSERT(matcher->start(status) == 0);
739 REGEX_ASSERT(matcher->start(1, status) == -1);
740 REGEX_ASSERT(matcher->start(2, status) == 1);
741
// Second find: starts at offset 4, where the first match ended, so the \G
// alternative (group 1) is the one expected to match.
742 REGEX_ASSERT(matcher->find());
743 REGEX_ASSERT(matcher->start(status) == 4);
744 REGEX_ASSERT(matcher->start(1, status) == 4);
745 REGEX_ASSERT(matcher->start(2, status) == -1);
746 REGEX_CHECK_STATUS;
747
748 delete matcher;
749 delete pat;
750 }
751
752 //
753 // Matchers with no input string behave as if they had an empty input string.
754 //
755
// Case 1: matcher constructed directly from a pattern string, no input given.
756 {
757 UErrorCode status = U_ZERO_ERROR;
758 RegexMatcher m(".?", 0, status);
759 REGEX_CHECK_STATUS;
760 REGEX_ASSERT(m.find());
761 REGEX_ASSERT(m.start(status) == 0);
762 REGEX_ASSERT(m.input() == "");
763 }
// Case 2: matcher obtained from a compiled pattern without supplying input.
// NOTE(review): status is only checked after both calls, i.e. after p has
// already been dereferenced.
764 {
765 UErrorCode status = U_ZERO_ERROR;
766 RegexPattern *p = RegexPattern::compile(".", 0, status);
767 RegexMatcher *m = p->matcher(status);
768 REGEX_CHECK_STATUS;
769
770 REGEX_ASSERT(m->find() == FALSE);
771 REGEX_ASSERT(m->input() == "");
772 delete m;
773 delete p;
774 }
775
776 }
777
778
779
780
781
782
783 //---------------------------------------------------------------------------
784 //
785 // API_Replace API test for class RegexMatcher, testing the
786 // Replace family of functions.
787 //
788 //---------------------------------------------------------------------------
789 void RegexTest::API_Replace() {
790 //
791 // Replace
792 //
793 int32_t flags=0;
794 UParseError pe;
795 UErrorCode status=U_ZERO_ERROR;
796
797 UnicodeString re("abc");
798 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
799 REGEX_CHECK_STATUS;
800 UnicodeString data = ".abc..abc...abc..";
801 // 012345678901234567
802 RegexMatcher *matcher = pat->matcher(data, status);
803
804 //
805 // Plain vanilla matches.
806 //
807 UnicodeString dest;
808 dest = matcher->replaceFirst("yz", status);
809 REGEX_CHECK_STATUS;
810 REGEX_ASSERT(dest == ".yz..abc...abc..");
811
812 dest = matcher->replaceAll("yz", status);
813 REGEX_CHECK_STATUS;
814 REGEX_ASSERT(dest == ".yz..yz...yz..");
815
816 //
817 // Plain vanilla non-matches.
818 //
819 UnicodeString d2 = ".abx..abx...abx..";
820 matcher->reset(d2);
821 dest = matcher->replaceFirst("yz", status);
822 REGEX_CHECK_STATUS;
823 REGEX_ASSERT(dest == ".abx..abx...abx..");
824
825 dest = matcher->replaceAll("yz", status);
826 REGEX_CHECK_STATUS;
827 REGEX_ASSERT(dest == ".abx..abx...abx..");
828
829 //
830 // Empty source string
831 //
832 UnicodeString d3 = "";
833 matcher->reset(d3);
834 dest = matcher->replaceFirst("yz", status);
835 REGEX_CHECK_STATUS;
836 REGEX_ASSERT(dest == "");
837
838 dest = matcher->replaceAll("yz", status);
839 REGEX_CHECK_STATUS;
840 REGEX_ASSERT(dest == "");
841
842 //
843 // Empty substitution string
844 //
845 matcher->reset(data); // ".abc..abc...abc.."
846 dest = matcher->replaceFirst("", status);
847 REGEX_CHECK_STATUS;
848 REGEX_ASSERT(dest == "...abc...abc..");
849
850 dest = matcher->replaceAll("", status);
851 REGEX_CHECK_STATUS;
852 REGEX_ASSERT(dest == "........");
853
854 //
855 // match whole string
856 //
857 UnicodeString d4 = "abc";
858 matcher->reset(d4);
859 dest = matcher->replaceFirst("xyz", status);
860 REGEX_CHECK_STATUS;
861 REGEX_ASSERT(dest == "xyz");
862
863 dest = matcher->replaceAll("xyz", status);
864 REGEX_CHECK_STATUS;
865 REGEX_ASSERT(dest == "xyz");
866
867 //
868 // Capture Group, simple case
869 //
870 UnicodeString re2("a(..)");
871 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
872 REGEX_CHECK_STATUS;
873 UnicodeString d5 = "abcdefg";
874 RegexMatcher *matcher2 = pat2->matcher(d5, status);
875 REGEX_CHECK_STATUS;
876 dest = matcher2->replaceFirst("$1$1", status);
877 REGEX_CHECK_STATUS;
878 REGEX_ASSERT(dest == "bcbcdefg");
879
880 dest = matcher2->replaceFirst("The value of \\$1 is $1.", status);
881 REGEX_CHECK_STATUS;
882 REGEX_ASSERT(dest == "The value of $1 is bc.defg");
883
884 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
885 REGEX_CHECK_STATUS;
886 REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
887
888 UnicodeString replacement = "Supplemental Digit 1 $\\U0001D7CF.";
889 replacement = replacement.unescape();
890 dest = matcher2->replaceFirst(replacement, status);
891 REGEX_CHECK_STATUS;
892 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
893
894 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
895
896
897
898 // TODO: need more through testing of capture substitutions.
899
900
901 delete matcher2;
902 delete pat2;
903 delete matcher;
904 delete pat;
905 }
906
907
908 //---------------------------------------------------------------------------
909 //
910 // API_Pattern Test that the API for class RegexPattern is
911 // present and nominally working.
912 //
913 //---------------------------------------------------------------------------
914 void RegexTest::API_Pattern() {
915 RegexPattern pata; // Test default constructor to not crash.
916 RegexPattern patb;
917
918 REGEX_ASSERT(pata == patb);
919 REGEX_ASSERT(pata == pata);
920
921 UnicodeString re1("abc[a-l][m-z]");
922 UnicodeString re2("def");
923 UErrorCode status = U_ZERO_ERROR;
924 UParseError pe;
925
926 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status);
927 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status);
928 REGEX_CHECK_STATUS;
929 REGEX_ASSERT(*pat1 == *pat1);
930 REGEX_ASSERT(*pat1 != pata);
931
932 // Assign
933 patb = *pat1;
934 REGEX_ASSERT(patb == *pat1);
935
936 // Copy Construct
937 RegexPattern patc(*pat1);
938 REGEX_ASSERT(patc == *pat1);
939 REGEX_ASSERT(patb == patc);
940 REGEX_ASSERT(pat1 != pat2);
941 patb = *pat2;
942 REGEX_ASSERT(patb != patc);
943 REGEX_ASSERT(patb == *pat2);
944
945 // Compile with no flags.
946 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status);
947 REGEX_ASSERT(*pat1a == *pat1);
948
949 REGEX_ASSERT(pat1a->flags() == 0);
950 #if 0
951 // Compile with different flags should be not equal
952 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
953 REGEX_CHECK_STATUS;
954
955 REGEX_ASSERT(*pat1b != *pat1a);
956 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
957 REGEX_ASSERT(pat1a->flags() == 0);
958 delete pat1b;
959 #endif // add test back in when we actually support flag settings.
960
961 // clone
962 RegexPattern *pat1c = pat1->clone();
963 REGEX_ASSERT(*pat1c == *pat1);
964 REGEX_ASSERT(*pat1c != *pat2);
965
966
967 // TODO: Actually do some matches with the cloned/copied/assigned patterns.
968
969
970
971 delete pat1c;
972 delete pat1a;
973 delete pat1;
974 delete pat2;
975
976
977 //
978 // matches convenience API
979 //
980 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
981 REGEX_CHECK_STATUS;
982 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
983 REGEX_CHECK_STATUS;
984 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
985 REGEX_CHECK_STATUS;
986 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
987 REGEX_CHECK_STATUS;
988 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
989 REGEX_CHECK_STATUS;
990 status = U_INDEX_OUTOFBOUNDS_ERROR;
991 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
992 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
993
994
995 //
996 // Split()
997 //
998 status = U_ZERO_ERROR;
999 pat1 = RegexPattern::compile(" +", pe, status);
1000 REGEX_CHECK_STATUS;
1001 UnicodeString fields[10];
1002
1003 int32_t n;
1004 n = pat1->split("Now is the time", fields, 10, status);
1005 REGEX_CHECK_STATUS;
1006 REGEX_ASSERT(n==4);
1007 REGEX_ASSERT(fields[0]=="Now");
1008 REGEX_ASSERT(fields[1]=="is");
1009 REGEX_ASSERT(fields[2]=="the");
1010 REGEX_ASSERT(fields[3]=="time");
1011 REGEX_ASSERT(fields[4]=="");
1012
1013 n = pat1->split("Now is the time", fields, 2, status);
1014 REGEX_CHECK_STATUS;
1015 REGEX_ASSERT(n==2);
1016 REGEX_ASSERT(fields[0]=="Now");
1017 REGEX_ASSERT(fields[1]=="is the time");
1018 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
1019
1020 fields[1] = "*";
1021 status = U_ZERO_ERROR;
1022 n = pat1->split("Now is the time", fields, 1, status);
1023 REGEX_CHECK_STATUS;
1024 REGEX_ASSERT(n==1);
1025 REGEX_ASSERT(fields[0]=="Now is the time");
1026 REGEX_ASSERT(fields[1]=="*");
1027 status = U_ZERO_ERROR;
1028
1029 n = pat1->split(" Now is the time ", fields, 10, status);
1030 REGEX_CHECK_STATUS;
1031 REGEX_ASSERT(n==5);
1032 REGEX_ASSERT(fields[0]=="");
1033 REGEX_ASSERT(fields[1]=="Now");
1034 REGEX_ASSERT(fields[2]=="is");
1035 REGEX_ASSERT(fields[3]=="the");
1036 REGEX_ASSERT(fields[4]=="time");
1037 REGEX_ASSERT(fields[5]=="");
1038
1039 n = pat1->split(" ", fields, 10, status);
1040 REGEX_CHECK_STATUS;
1041 REGEX_ASSERT(n==1);
1042 REGEX_ASSERT(fields[0]=="");
1043
1044 fields[0] = "foo";
1045 n = pat1->split("", fields, 10, status);
1046 REGEX_CHECK_STATUS;
1047 REGEX_ASSERT(n==0);
1048 REGEX_ASSERT(fields[0]=="foo");
1049
1050 delete pat1;
1051
1052 // split, with a pattern with (capture)
1053 pat1 = RegexPattern::compile("<(\\w*)>", pe, status);
1054 REGEX_CHECK_STATUS;
1055
1056 status = U_ZERO_ERROR;
1057 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1058 REGEX_CHECK_STATUS;
1059 REGEX_ASSERT(n==6);
1060 REGEX_ASSERT(fields[0]=="");
1061 REGEX_ASSERT(fields[1]=="a");
1062 REGEX_ASSERT(fields[2]=="Now is ");
1063 REGEX_ASSERT(fields[3]=="b");
1064 REGEX_ASSERT(fields[4]=="the time");
1065 REGEX_ASSERT(fields[5]=="c");
1066 REGEX_ASSERT(fields[6]=="");
1067 REGEX_ASSERT(status==U_ZERO_ERROR);
1068
1069 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
1070 REGEX_CHECK_STATUS;
1071 REGEX_ASSERT(n==6);
1072 REGEX_ASSERT(fields[0]==" ");
1073 REGEX_ASSERT(fields[1]=="a");
1074 REGEX_ASSERT(fields[2]=="Now is ");
1075 REGEX_ASSERT(fields[3]=="b");
1076 REGEX_ASSERT(fields[4]=="the time");
1077 REGEX_ASSERT(fields[5]=="c");
1078 REGEX_ASSERT(fields[6]=="");
1079
1080 status = U_ZERO_ERROR;
1081 fields[6] = "foo";
1082 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);
1083 REGEX_CHECK_STATUS;
1084 REGEX_ASSERT(n==6);
1085 REGEX_ASSERT(fields[0]==" ");
1086 REGEX_ASSERT(fields[1]=="a");
1087 REGEX_ASSERT(fields[2]=="Now is ");
1088 REGEX_ASSERT(fields[3]=="b");
1089 REGEX_ASSERT(fields[4]=="the time");
1090 REGEX_ASSERT(fields[5]=="c");
1091 REGEX_ASSERT(fields[6]=="foo");
1092
1093 status = U_ZERO_ERROR;
1094 fields[5] = "foo";
1095 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
1096 REGEX_CHECK_STATUS;
1097 REGEX_ASSERT(n==5);
1098 REGEX_ASSERT(fields[0]==" ");
1099 REGEX_ASSERT(fields[1]=="a");
1100 REGEX_ASSERT(fields[2]=="Now is ");
1101 REGEX_ASSERT(fields[3]=="b");
1102 REGEX_ASSERT(fields[4]=="the time<c>");
1103 REGEX_ASSERT(fields[5]=="foo");
1104
1105 status = U_ZERO_ERROR;
1106 fields[5] = "foo";
1107 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
1108 REGEX_CHECK_STATUS;
1109 REGEX_ASSERT(n==5);
1110 REGEX_ASSERT(fields[0]==" ");
1111 REGEX_ASSERT(fields[1]=="a");
1112 REGEX_ASSERT(fields[2]=="Now is ");
1113 REGEX_ASSERT(fields[3]=="b");
1114 REGEX_ASSERT(fields[4]=="the time");
1115 REGEX_ASSERT(fields[5]=="foo");
1116
1117 status = U_ZERO_ERROR;
1118 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
1119 REGEX_CHECK_STATUS;
1120 REGEX_ASSERT(n==4);
1121 REGEX_ASSERT(fields[0]==" ");
1122 REGEX_ASSERT(fields[1]=="a");
1123 REGEX_ASSERT(fields[2]=="Now is ");
1124 REGEX_ASSERT(fields[3]=="the time<c>");
1125 status = U_ZERO_ERROR;
1126 delete pat1;
1127
1128 pat1 = RegexPattern::compile("([-,])", pe, status);
1129 REGEX_CHECK_STATUS;
1130 n = pat1->split("1-10,20", fields, 10, status);
1131 REGEX_CHECK_STATUS;
1132 REGEX_ASSERT(n==5);
1133 REGEX_ASSERT(fields[0]=="1");
1134 REGEX_ASSERT(fields[1]=="-");
1135 REGEX_ASSERT(fields[2]=="10");
1136 REGEX_ASSERT(fields[3]==",");
1137 REGEX_ASSERT(fields[4]=="20");
1138 delete pat1;
1139
1140
1141 //
1142 // RegexPattern::pattern()
1143 //
1144 pat1 = new RegexPattern();
1145 REGEX_ASSERT(pat1->pattern() == "");
1146 delete pat1;
1147
1148 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1149 REGEX_CHECK_STATUS;
1150 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1151 delete pat1;
1152
1153
1154 //
1155 // classID functions
1156 //
1157 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1158 REGEX_CHECK_STATUS;
1159 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1160 REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1161 RegexMatcher *m = pat1->matcher("Hello, World", status);
1162 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1163 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1164 REGEX_ASSERT(m->getDynamicClassID() != NULL);
1165 delete m;
1166 delete pat1;
1167
1168 }
1169
1170 //---------------------------------------------------------------------------
1171 //
1172 // Extended A more thorough check for features of regex patterns
1173 // The test cases are in a separate data file,
1174 // source/tests/testdata/regextst.txt
1175 // A description of the test data format is included in that file.
1176 //
1177 //---------------------------------------------------------------------------
1178 void RegexTest::Extended() {
1179 UErrorCode status = U_ZERO_ERROR;
1180 int32_t lineNum = 0;
1181
1182 //
1183 // Open and read the test data file.
1184 //
1185 const char *testDataDirectory = loadTestData(status);
1186 if (U_FAILURE(status)) {
1187 errln("ERROR: could not open test data %s", u_errorName(status));
1188 return;
1189 }
1190 UnicodeString tdd(testDataDirectory);
1191 RegexMatcher m("([/\\\\])out[/\\\\]testdata", tdd, 0, status);
1192 if(U_SUCCESS(status)) {
1193 tdd = m.replaceFirst("$1regextst.txt", status);
1194 } else {
1195 errln("Couldn't set up tests. Error %s", u_errorName(status));
1196 return;
1197 }
1198
1199 int len;
1200 UChar *testData = ReadAndConvertFile((const char *)CharString(tdd), len, status);
1201
1202 //
1203 // Put the test data into a UnicodeString
1204 //
1205 UnicodeString testString(FALSE, testData, len);
1206
1207 RegexMatcher quotedStuffMat("\\s*([\\'\\\"/])(.+?)\\1", 0, status);
1208 RegexMatcher commentMat ("\\s*(#.*)?$", 0, status);
1209 RegexMatcher flagsMat ("\\s*([ixsmdtG]*)([:letter:]*)", 0, status);
1210
1211 RegexMatcher lineMat("(.*?)\\r?\\n", testString, 0, status);
1212 UnicodeString testPattern; // The pattern for test from the test file.
1213 UnicodeString testFlags; // the flags for a test.
1214 UnicodeString matchString; // The marked up string to be used as input
1215
1216
1217
1218 //
1219 // Loop over the test data file, once per line.
1220 //
1221 while (lineMat.find()) {
1222 lineNum++;
1223 if (U_FAILURE(status)) {
1224 errln("line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
1225 }
1226
1227 status = U_ZERO_ERROR;
1228 UnicodeString testLine = lineMat.group(1, status);
1229 if (testLine.length() == 0) {
1230 continue;
1231 }
1232
1233 //
1234 // Parse the test line. Skip blank and comment only lines.
1235 // Separate out the three main fields - pattern, flags, target.
1236 //
1237
1238 commentMat.reset(testLine);
1239 if (commentMat.lookingAt(status)) {
1240 // This line is a comment, or blank.
1241 continue;
1242 }
1243
1244 //
1245 // Pull out the pattern field, remove it from the test file line.
1246 //
1247 quotedStuffMat.reset(testLine);
1248 if (quotedStuffMat.lookingAt(status)) {
1249 testPattern = quotedStuffMat.group(2, status);
1250 testLine.remove(0, quotedStuffMat.end(0, status));
1251 } else {
1252 errln("Bad pattern (missing quotes?) at test file line %d", lineNum);
1253 continue;
1254 }
1255
1256
1257 //
1258 // Pull out the flags from the test file line.
1259 //
1260 flagsMat.reset(testLine);
1261 flagsMat.lookingAt(status); // Will always match, possibly an empty string.
1262 testFlags = flagsMat.group(1, status);
1263 if (flagsMat.group(2, status).length() > 0) {
1264 errln("Bad Match flag at line %d. Scanning %c\n",
1265 lineNum, flagsMat.group(2, status).charAt(0));
1266 continue;
1267 }
1268 testLine.remove(0, flagsMat.end(0, status));
1269
1270 //
1271 // Pull out the match string, as a whole.
1272 // We'll process the <tags> later.
1273 //
1274 quotedStuffMat.reset(testLine);
1275 if (quotedStuffMat.lookingAt(status)) {
1276 matchString = quotedStuffMat.group(2, status);
1277 testLine.remove(0, quotedStuffMat.end(0, status));
1278 } else {
1279 errln("Bad match string at test file line %d", lineNum);
1280 continue;
1281 }
1282
1283 //
1284 // The only thing left from the input line should be an optional trailing comment.
1285 //
1286 commentMat.reset(testLine);
1287 if (commentMat.lookingAt(status) == FALSE) {
1288 errln("Line %d: unexpected characters at end of test line.", lineNum);
1289 continue;
1290 }
1291
1292 //
1293 // Run the test
1294 //
1295 regex_find(testPattern, testFlags, matchString, lineNum);
1296 }
1297
1298 delete [] testData;
1299
1300 }
1301
1302
1303
1304 //---------------------------------------------------------------------------
1305 //
1306 // Errors Check for error handling in patterns.
1307 //
1308 //---------------------------------------------------------------------------
1309 void RegexTest::Errors() {
1310 // \escape sequences that aren't implemented yet.
1311 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
1312
1313 // Missing close parentheses
1314 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
1315 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
1316 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
1317
1318 // Extra close paren
1319 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
1320 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
1321 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
1322
1323 // Look-ahead, Look-behind
1324 // TODO: add tests for unbounded length look-behinds.
1325 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
1326
1327 // Attempt to use non-default flags
1328 {
1329 UParseError pe;
1330 UErrorCode status = U_ZERO_ERROR;
1331 int32_t flags = UREGEX_CANON_EQ |
1332 UREGEX_COMMENTS | UREGEX_DOTALL |
1333 UREGEX_MULTILINE;
1334 RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
1335 REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
1336 delete pat1;
1337 }
1338
1339
1340 // Quantifiers are allowed only after something that can be quantified.
1341 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
1342 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
1343 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
1344
1345 // Mal-formed {min,max} quantifiers
1346 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
1347 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
1348 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
1349 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
1350 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
1351 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
1352
1353 // UnicodeSet containing a string
1354 REGEX_ERR("abc[{def}]xyz", 1, 10, U_REGEX_SET_CONTAINS_STRING);
1355
1356 }
1357
1358
1359 //-------------------------------------------------------------------------------
1360 //
1361 // Read a text data file, convert it to UChars, and return the data
1362 // in one big UChar * buffer, which the caller must delete.
1363 //
1364 //--------------------------------------------------------------------------------
1365 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int &ulen, UErrorCode &status) {
1366 UChar *retPtr = NULL;
1367 char *fileBuf = NULL;
1368 UConverter* conv = NULL;
1369 FILE *f = NULL;
1370
1371 ulen = 0;
1372 if (U_FAILURE(status)) {
1373 return retPtr;
1374 }
1375
1376 //
1377 // Open the file.
1378 //
1379 f = fopen(fileName, "rb");
1380 if (f == 0) {
1381 errln("Error opening test data file %s\n", fileName);
1382 goto cleanUpAndReturn;
1383 }
1384 //
1385 // Read it in
1386 //
1387 int fileSize;
1388 int amt_read;
1389
1390 fseek( f, 0, SEEK_END);
1391 fileSize = ftell(f);
1392 fileBuf = new char[fileSize];
1393 fseek(f, 0, SEEK_SET);
1394 amt_read = fread(fileBuf, 1, fileSize, f);
1395 if (amt_read != fileSize || fileSize <= 0) {
1396 errln("Error reading test data file.");
1397 goto cleanUpAndReturn;
1398 }
1399
1400 //
1401 // Look for a Unicode Signature (BOM) on the data just read
1402 //
1403 int32_t signatureLength;
1404 const char * fileBufC;
1405 const char* encoding;
1406
1407 fileBufC = fileBuf;
1408 encoding = ucnv_detectUnicodeSignature(
1409 fileBuf, fileSize, &signatureLength, &status);
1410 if(encoding!=NULL ){
1411 fileBufC += signatureLength;
1412 fileSize -= signatureLength;
1413 }
1414
1415 //
1416 // Open a converter to take the rule file to UTF-16
1417 //
1418 conv = ucnv_open(encoding, &status);
1419 if (U_FAILURE(status)) {
1420 goto cleanUpAndReturn;
1421 }
1422
1423 //
1424 // Convert the rules to UChar.
1425 // Preflight first to determine required buffer size.
1426 //
1427 ulen = ucnv_toUChars(conv,
1428 NULL, // dest,
1429 0, // destCapacity,
1430 fileBufC,
1431 fileSize,
1432 &status);
1433 if (status == U_BUFFER_OVERFLOW_ERROR) {
1434 // Buffer Overflow is expected from the preflight operation.
1435 status = U_ZERO_ERROR;
1436
1437 retPtr = new UChar[ulen+1];
1438 ucnv_toUChars(conv,
1439 retPtr, // dest,
1440 ulen+1,
1441 fileBufC,
1442 fileSize,
1443 &status);
1444 }
1445
1446 cleanUpAndReturn:
1447 fclose(f);
1448 delete[] fileBuf;
1449 ucnv_close(conv);
1450 if (U_FAILURE(status)) {
1451 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1452 delete retPtr;
1453 retPtr = 0;
1454 ulen = 0;
1455 };
1456 return retPtr;
1457 }
1458
1459
1460 //-------------------------------------------------------------------------------
1461 //
1462 // PerlTests - Run Perl's regular expression tests
1463 // The input file for this test is re_tests, the standard regular
1464 // expression test data distributed with the Perl source code.
1465 //
1466 // Here is Perl's description of the test data file:
1467 //
1468 // # The tests are in a separate file 't/op/re_tests'.
1469 // # Each line in that file is a separate test.
1470 // # There are five columns, separated by tabs.
1471 // #
1472 // # Column 1 contains the pattern, optionally enclosed in C<''>.
1473 // # Modifiers can be put after the closing C<'>.
1474 // #
1475 // # Column 2 contains the string to be matched.
1476 // #
1477 // # Column 3 contains the expected result:
1478 // # y expect a match
1479 // # n expect no match
1480 // # c expect an error
1481 // # B test exposes a known bug in Perl, should be skipped
1482 // # b test exposes a known bug in Perl, should be skipped if noamp
1483 // #
1484 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
1485 // #
1486 // # Column 4 contains a string, usually C<$&>.
1487 // #
1488 // # Column 5 contains the expected result of double-quote
1489 // # interpolating that string after the match, or start of error message.
1490 // #
1491 // # Column 6, if present, contains a reason why the test is skipped.
1492 // # This is printed with "skipped", for harness to pick up.
1493 // #
1494 // # \n in the tests are interpolated, as are variables of the form ${\w+}.
1495 // #
1496 // # If you want to add a regular expression test that can't be expressed
1497 // # in this format, don't add it here: put it in op/pat.t instead.
1498 //
1499 // For ICU, if field 3 contains an 'i', the test will be skipped.
1500 // Such a test exposes some known incompatibility between ICU and Perl regexps.
1501 // (The i is in addition to whatever was there before.)
1502 //
1503 //-------------------------------------------------------------------------------
1504 void RegexTest::PerlTests() {
1505 UErrorCode status = U_ZERO_ERROR;
1506 UParseError pe;
1507
1508 //
1509 // Open and read the test data file.
1510 //
1511 const char *testDataDirectory = loadTestData(status);
1512 if (U_FAILURE(status)) {
1513 errln("ERROR: could not open test data %s", u_errorName(status));
1514 return;
1515 }
1516 UnicodeString tdd(testDataDirectory);
1517 RegexMatcher m("([/\\\\])out[/\\\\]testdata", tdd, 0, status);
1518 if(U_SUCCESS(status)) {
1519 tdd = m.replaceFirst("$1re_tests.txt", status);
1520 } else {
1521 errln("Couldn't set up tests. Error %s", u_errorName(status));
1522 return;
1523 }
1524
1525 int len;
1526 UChar *testData = ReadAndConvertFile((const char *)CharString(tdd), len, status);
1527
1528 //
1529 // Put the test data into a UnicodeString
1530 //
1531 UnicodeString testDataString(FALSE, testData, len);
1532
1533 //
1534 // Regex to break the input file into lines, and strip the new lines.
1535 // One line per match, capture group one is the desired data.
1536 //
1537 RegexPattern* linePat = RegexPattern::compile("(.+?)[\\r\\n]+", 0, pe, status);
1538 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
1539
1540 //
1541 // Regex to split a test file line into fields.
1542 // There are six fields, separated by tabs.
1543 //
1544 RegexPattern* fieldPat = RegexPattern::compile("\\t", 0, pe, status);
1545
1546 //
1547 // Regex to identify test patterns with flag settings, and to separate them.
1548 // Test patterns with flags look like 'pattern'i
1549 // Test patterns without flags are not quoted: pattern
1550 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
1551 //
1552 RegexPattern *flagPat = RegexPattern::compile("('?)(.*)\\1(.*)", 0, pe, status);
1553 RegexMatcher* flagMat = flagPat->matcher("", status);
1554
1555 //
1556 // The Perl tests reference several perl-isms, which are evaluated/substituted
1557 // in the test data. Not being perl, this must be done explicitly. Here
1558 // are string constants and REs for these constructs.
1559 //
1560 UnicodeString nulnulSrc("${nulnul}");
1561 UnicodeString nulnul("\\u0000\\u0000");
1562 nulnul = nulnul.unescape();
1563
1564 UnicodeString ffffSrc("${ffff}");
1565 UnicodeString ffff("\\uffff");
1566 ffff = ffff.unescape();
1567
1568 // regexp for $-[0], $+[2], etc.
1569 RegexPattern *groupsPat = RegexPattern::compile("\\$([+\\-])\\[(\\d+)\\]", 0, pe, status);
1570 RegexMatcher *groupsMat = groupsPat->matcher("", status);
1571
1572 // regexp for $0, $1, $2, etc.
1573 RegexPattern *cgPat = RegexPattern::compile("\\$(\\d+)", 0, pe, status);
1574 RegexMatcher *cgMat = cgPat->matcher("", status);
1575
1576
1577 //
1578 // Main Loop for the Perl Tests, runs once per line from the
1579 // test data file.
1580 //
1581 int32_t lineNum = 0;
1582 int32_t skippedUnimplementedCount = 0;
1583 while (lineMat->find()) {
1584 lineNum++;
1585
1586 //
1587 // Get a line, break it into its fields, do the Perl
1588 // variable substitutions.
1589 //
1590 UnicodeString line = lineMat->group(1, status);
1591 UnicodeString fields[7];
1592 fieldPat->split(line, fields, 7, status);
1593
1594 flagMat->reset(fields[0]);
1595 flagMat->matches(status);
1596 UnicodeString pattern = flagMat->group(2, status);
1597 pattern.findAndReplace("${bang}", "!");
1598 pattern.findAndReplace(nulnulSrc, "\\u0000\\u0000");
1599 pattern.findAndReplace(ffffSrc, ffff);
1600
1601 //
1602 // Identify patterns that include match flag settings,
1603 // split off the flags, remove the extra quotes.
1604 //
1605 UnicodeString flagStr = flagMat->group(3, status);
1606 if (U_FAILURE(status)) {
1607 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1608 return;
1609 }
1610 int32_t flags = 0;
1611 const UChar UChar_c = 0x63; // Char constants for the flag letters.
1612 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
1613 const UChar UChar_m = 0x6d;
1614 const UChar UChar_x = 0x78;
1615 const UChar UChar_y = 0x79;
1616 if (flagStr.indexOf(UChar_i) != -1) {
1617 flags |= UREGEX_CASE_INSENSITIVE;
1618 }
1619 if (flagStr.indexOf(UChar_m) != -1) {
1620 flags |= UREGEX_MULTILINE;
1621 }
1622 if (flagStr.indexOf(UChar_x) != -1) {
1623 flags |= UREGEX_COMMENTS;
1624 }
1625
1626 //
1627 // Compile the test pattern.
1628 //
1629 status = U_ZERO_ERROR;
1630 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
1631 if (status == U_REGEX_UNIMPLEMENTED) {
1632 //
1633 // Test of a feature that is planned for ICU, but not yet implemented.
1634 // skip the test.
1635 skippedUnimplementedCount++;
1636 delete testPat;
1637 status = U_ZERO_ERROR;
1638 continue;
1639 }
1640
1641 if (U_FAILURE(status)) {
1642 // Some tests are supposed to generate errors.
1643 // Only report an error for tests that are supposed to succeed.
1644 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
1645 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
1646 {
1647 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
1648 }
1649 status = U_ZERO_ERROR;
1650 delete testPat;
1651 continue;
1652 }
1653
1654 if (fields[2].indexOf(UChar_i) >= 0) {
1655 // ICU should skip this test.
1656 delete testPat;
1657 continue;
1658 }
1659
1660 if (fields[2].indexOf(UChar_c) >= 0) {
1661 // This pattern should have caused a compilation error, but didn't/
1662 errln("line %d: Expected a pattern compile error, got success.", lineNum);
1663 delete testPat;
1664 continue;
1665 }
1666
1667 //
1668 // replace the Perl variables that appear in some of the
1669 // match data strings.
1670 //
1671 UnicodeString matchString = fields[1];
1672 matchString.findAndReplace(nulnulSrc, nulnul);
1673 matchString.findAndReplace(ffffSrc, ffff);
1674
1675 // Replace any \n in the match string with an actual new-line char.
1676 // Don't do full unescape, as this unescapes more than Perl does, which
1677 // causes other spurious failures in the tests.
1678 matchString.findAndReplace("\\n", "\n");
1679
1680
1681
1682 //
1683 // Run the test, check for expected match/don't match result.
1684 //
1685 RegexMatcher *testMat = testPat->matcher(matchString, status);
1686 UBool found = testMat->find();
1687 UBool expected = FALSE;
1688 if (fields[2].indexOf(UChar_y) >=0) {
1689 expected = TRUE;
1690 }
1691 if (expected != found) {
1692 errln("line %d: Expected %smatch, got %smatch",
1693 lineNum, expected?"":"no ", found?"":"no " );
1694 continue;
1695 }
1696
1697 //
1698 // Interpret the Perl expression from the fourth field of the data file,
1699 // building up an ICU string from the results of the ICU match.
1700 // The Perl expression will contain references to the results of
1701 // a regex match, including the matched string, capture group strings,
1702 // group starting and ending indicies, etc.
1703 //
1704 UnicodeString resultString;
1705 UnicodeString perlExpr = fields[3];
1706 groupsMat->reset(perlExpr);
1707 cgMat->reset(perlExpr);
1708
1709 while (perlExpr.length() > 0) {
1710 if (perlExpr.startsWith("$&")) {
1711 resultString.append(testMat->group(status));
1712 perlExpr.remove(0, 2);
1713 }
1714
1715 else if (groupsMat->lookingAt(status)) {
1716 // $-[0] $+[2] etc.
1717 UnicodeString digitString = groupsMat->group(2, status);
1718 int32_t t = 0;
1719 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
1720 UnicodeString plusOrMinus = groupsMat->group(1, status);
1721 int32_t matchPosition;
1722 if (plusOrMinus.compare("+") == 0) {
1723 matchPosition = testMat->end(groupNum, status);
1724 } else {
1725 matchPosition = testMat->start(groupNum, status);
1726 }
1727 if (matchPosition != -1) {
1728 ICU_Utility::appendNumber(resultString, matchPosition);
1729 }
1730 perlExpr.remove(0, groupsMat->end(status));
1731 }
1732
1733 else if (cgMat->lookingAt(status)) {
1734 // $1, $2, $3, etc.
1735 UnicodeString digitString = cgMat->group(1, status);
1736 int32_t t = 0;
1737 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
1738 if (U_SUCCESS(status)) {
1739 resultString.append(testMat->group(groupNum, status));
1740 status = U_ZERO_ERROR;
1741 }
1742 perlExpr.remove(0, cgMat->end(status));
1743 }
1744
1745 else if (perlExpr.startsWith("@-")) {
1746 int i;
1747 for (i=0; i<=testMat->groupCount(); i++) {
1748 if (i>0) {
1749 resultString.append(" ");
1750 }
1751 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
1752 }
1753 perlExpr.remove(0, 2);
1754 }
1755
1756 else if (perlExpr.startsWith("@+")) {
1757 int i;
1758 for (i=0; i<=testMat->groupCount(); i++) {
1759 if (i>0) {
1760 resultString.append(" ");
1761 }
1762 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
1763 }
1764 perlExpr.remove(0, 2);
1765 }
1766
1767 else if (perlExpr.startsWith("\\")) { // \Escape. Take following char as a literal.
1768 // or as an escaped sequence (e.g. \n)
1769 if (perlExpr.length() > 1) {
1770 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
1771 }
1772 UChar c = perlExpr.charAt(0);
1773 switch (c) {
1774 case 'n': c = '\n'; break;
1775 // add any other escape sequences that show up in the test expected results.
1776 }
1777 resultString.append(c);
1778 perlExpr.remove(0, 1);
1779 }
1780
1781 else {
1782 // Any characters from the perl expression that we don't explicitly
1783 // recognize before here are assumed to be literals and copied
1784 // as-is to the expected results.
1785 resultString.append(perlExpr.charAt(0));
1786 perlExpr.remove(0, 1);
1787 }
1788
1789 if (U_FAILURE(status)) {
1790 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
1791 break;
1792 }
1793 }
1794
1795 //
1796 // Expected Results Compare
1797 //
1798 UnicodeString expectedS(fields[4]);
1799 expectedS.findAndReplace(nulnulSrc, nulnul);
1800 expectedS.findAndReplace(ffffSrc, ffff);
1801 expectedS.findAndReplace("\\n", "\n");
1802
1803
1804 if (expectedS.compare(resultString) != 0) {
1805 errln("Line %d: Incorrect perl expression results. Expected \"%s\"; got \"%s\"",
1806 lineNum, (const char *)CharString(expectedS),
1807 (const char *)CharString(resultString));
1808 }
1809
1810 delete testMat;
1811 delete testPat;
1812 }
1813
1814 //
1815 // All done. Clean up allocated stuff.
1816 //
1817 delete cgMat;
1818 delete cgPat;
1819
1820 delete groupsMat;
1821 delete groupsPat;
1822
1823 delete flagMat;
1824 delete flagPat;
1825
1826 delete lineMat;
1827 delete linePat;
1828
1829 delete fieldPat;
1830 delete [] testData;
1831
1832
1833 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
1834
1835 }
1836
1837
1838
1839 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
1840