]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/regextst.cpp
ICU-8.11.2.tar.gz
[apple/icu.git] / icuSources / test / intltest / regextst.cpp
1 /********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 2002-2005, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6
7 //
8 // regextst.cpp
9 //
10 // ICU Regular Expressions test, part of intltest.
11 //
12
13 #include "intltest.h"
14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
15
16 #include "unicode/regex.h"
17 #include "unicode/uchar.h"
18 #include "unicode/ucnv.h"
19 #include "regextst.h"
20 #include "uvector.h"
21 #include "util.h"
22 #include <stdlib.h>
23 #include <string.h>
24 #include <stdio.h>
25
26
27 //---------------------------------------------------------------------------
28 //
29 // Test class boilerplate
30 //
31 //---------------------------------------------------------------------------
32 RegexTest::RegexTest()
33 {
34 }
35
36
37 RegexTest::~RegexTest()
38 {
39 }
40
41
42
43 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
44 {
45 if (exec) logln("TestSuite RegexTest: ");
46 switch (index) {
47
48 case 0: name = "Basic";
49 if (exec) Basic();
50 break;
51 case 1: name = "API_Match";
52 if (exec) API_Match();
53 break;
54 case 2: name = "API_Replace";
55 if (exec) API_Replace();
56 break;
57 case 3: name = "API_Pattern";
58 if (exec) API_Pattern();
59 break;
60 case 4: name = "Extended";
61 if (exec) Extended();
62 break;
63 case 5: name = "Errors";
64 if (exec) Errors();
65 break;
66 case 6: name = "PerlTests";
67 if (exec) PerlTests();
68 break;
69
70
71 default: name = "";
72 break; //needed to end loop
73 }
74 }
75
76
77 //---------------------------------------------------------------------------
78 //
79 // Error Checking / Reporting macros used in all of the tests.
80 //
81 //---------------------------------------------------------------------------
// REGEX_CHECK_STATUS
//   If the variable `status` in the enclosing scope holds a failure code,
//   report it (with the source line of the check) and return from the
//   enclosing function.
#define REGEX_CHECK_STATUS { \
    if (U_FAILURE(status)) { \
        errln("RegexTest failure at line %d. status=%s\n", __LINE__, u_errorName(status)); \
        return; \
    } \
}

// REGEX_ASSERT(expr)
//   Report a failure when expr evaluates to FALSE.  Execution continues.
#define REGEX_ASSERT(expr) { \
    if ((expr)==FALSE) { \
        errln("RegexTest failure at line %d.\n", __LINE__); \
    } \
}

// REGEX_ASSERT_FAIL(expr, errcode)
//   Evaluate expr with a fresh local `status` (which hides any `status` in the
//   enclosing scope for the duration of expr) and report a failure unless that
//   local status ends up equal to errcode.  Execution continues.
#define REGEX_ASSERT_FAIL(expr, errcode) { \
    UErrorCode status=U_ZERO_ERROR; \
    (expr); \
    if (status!=errcode) { \
        errln("RegexTest failure at line %d. Expected status=%s, got %s\n", __LINE__, u_errorName(errcode), u_errorName(status)); \
    } \
}

// REGEX_CHECK_STATUS_L(line)
//   Like REGEX_CHECK_STATUS, but also reports the caller-supplied line number,
//   prints the numeric status value, and does NOT return.
#define REGEX_CHECK_STATUS_L(line) { \
    if (U_FAILURE(status)) { \
        errln("RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); \
    } \
}

// REGEX_ASSERT_L(expr, line)
//   Like REGEX_ASSERT, but also reports the caller-supplied line number and
//   returns from the enclosing function on failure.
#define REGEX_ASSERT_L(expr, line) { \
    if ((expr)==FALSE) { \
        errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); \
        return; \
    } \
}
96
97
98
99 //---------------------------------------------------------------------------
100 //
101 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests
102 // for the LookingAt() and Match() functions.
103 //
104 // usage:
105 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
106 //
107 // The expected results are UBool - TRUE or FALSE.
108 // The input text is unescaped. The pattern is not.
109 //
110 //
111 //---------------------------------------------------------------------------
112
113 #define REGEX_TESTLM(pat, text, looking, match) doRegexLMTest(pat, text, looking, match, __LINE__);
114
115 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int line) {
116 const UnicodeString pattern(pat);
117 const UnicodeString inputText(text);
118 UErrorCode status = U_ZERO_ERROR;
119 UParseError pe;
120 RegexPattern *REPattern = NULL;
121 RegexMatcher *REMatcher = NULL;
122 UBool retVal = TRUE;
123
124 UnicodeString patString(pat);
125 REPattern = RegexPattern::compile(patString, 0, pe, status);
126 if (U_FAILURE(status)) {
127 errln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s\n",
128 line, u_errorName(status));
129 return FALSE;
130 }
131 if (line==376) { RegexPatternDump(REPattern);}
132
133 UnicodeString inputString(inputText);
134 UnicodeString unEscapedInput = inputString.unescape();
135 REMatcher = REPattern->matcher(unEscapedInput, status);
136 if (U_FAILURE(status)) {
137 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
138 line, u_errorName(status));
139 return FALSE;
140 }
141
142 UBool actualmatch;
143 actualmatch = REMatcher->lookingAt(status);
144 if (U_FAILURE(status)) {
145 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
146 line, u_errorName(status));
147 retVal = FALSE;
148 }
149 if (actualmatch != looking) {
150 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
151 retVal = FALSE;
152 }
153
154 status = U_ZERO_ERROR;
155 actualmatch = REMatcher->matches(status);
156 if (U_FAILURE(status)) {
157 errln("RegexTest failure in matches() at line %d. Status = %s\n",
158 line, u_errorName(status));
159 retVal = FALSE;
160 }
161 if (actualmatch != match) {
162 errln("RegexTest: wrong return from matches() at line %d.\n", line);
163 retVal = FALSE;
164 }
165
166 if (retVal == FALSE) {
167 RegexPatternDump(REPattern);
168 }
169
170 delete REPattern;
171 delete REMatcher;
172 return retVal;
173 }
174
175
176
177
178 //---------------------------------------------------------------------------
179 //
180 // regex_find(pattern, inputString, lineNumber)
181 //
182 // function to simplify writing regex tests.
183 //
184 // The input text is unescaped. The pattern is not.
185 // The input text is marked with the expected match positions
186 // <0>text <1> more text </1> </0>
187 // The <n> </n> tags are removed before trying the match.
188 // The tags mark the start and end of the match and of any capture groups.
189 //
190 //
191 //---------------------------------------------------------------------------
192
193
194 // Set a value into a UVector at position specified by a decimal number in
195 // a UnicodeString. This is a utility function needed by the actual test function,
196 // which follows.
197 static void set(UVector &vec, int val, UnicodeString index) {
198 UErrorCode status=U_ZERO_ERROR;
199 int idx = 0;
200 for (int i=0; i<index.length(); i++) {
201 int d=u_charDigitValue(index.charAt(i));
202 if (d<0) {return;}
203 idx = idx*10 + d;
204 }
205 while (vec.size()<idx+1) {vec.addElement(-1, status);}
206 vec.setElementAt(val, idx);
207 }
208
209 void RegexTest::regex_find(const UnicodeString &pattern,
210 const UnicodeString &flags,
211 const UnicodeString &inputString,
212 int line) {
213 UnicodeString unEscapedInput;
214 UnicodeString deTaggedInput;
215
216 UErrorCode status = U_ZERO_ERROR;
217 UParseError pe;
218 RegexPattern *parsePat = NULL;
219 RegexMatcher *parseMatcher = NULL;
220 RegexPattern *callerPattern = NULL;
221 RegexMatcher *matcher = NULL;
222 UVector groupStarts(status);
223 UVector groupEnds(status);
224 UBool isMatch = FALSE;
225 UBool failed = FALSE;
226 int numFinds;
227 int i;
228
229 //
230 // Compile the caller's pattern
231 //
232 uint32_t bflags = 0;
233 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag
234 bflags |= UREGEX_CASE_INSENSITIVE;
235 }
236 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag
237 bflags |= UREGEX_COMMENTS;
238 }
239 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag
240 bflags |= UREGEX_DOTALL;
241 }
242 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag
243 bflags |= UREGEX_MULTILINE;
244 }
245
246
247 callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
248 if (status != U_ZERO_ERROR) {
249 #if UCONFIG_NO_BREAK_ITERATION==1
250 // 'v' test flag means that the test pattern should not compile if ICU was configured
251 // to not include break iteration. RBBI is needed for Unicode word boundaries.
252 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
253 goto cleanupAndReturn;
254 }
255 #endif
256 errln("Line %d: error %s compiling pattern.", line, u_errorName(status));
257 goto cleanupAndReturn;
258 }
259
260 if (flags.indexOf((UChar)'d') >= 0) {
261 RegexPatternDump(callerPattern);
262 }
263
264 //
265 // Number of times find() should be called on the test string, default to 1
266 //
267 numFinds = 1;
268 for (i=2; i<=9; i++) {
269 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag
270 if (numFinds != 1) {
271 errln("Line %d: more than one digit flag. Scanning %d.", line, i);
272 goto cleanupAndReturn;
273 }
274 numFinds = i;
275 }
276 }
277
278 //
279 // Find the tags in the input data, remove them, and record the group boundary
280 // positions.
281 //
282 parsePat = RegexPattern::compile("<(/?)([0-9]+)>", 0, pe, status);
283 REGEX_CHECK_STATUS_L(line);
284
285 unEscapedInput = inputString.unescape();
286 parseMatcher = parsePat->matcher(unEscapedInput, status);
287 REGEX_CHECK_STATUS_L(line);
288 while(parseMatcher->find()) {
289 parseMatcher->appendReplacement(deTaggedInput, "", status);
290 REGEX_CHECK_STATUS;
291 UnicodeString groupNum = parseMatcher->group(2, status);
292 if (parseMatcher->group(1, status) == "/") {
293 // close tag
294 set(groupEnds, deTaggedInput.length(), groupNum);
295 } else {
296 set(groupStarts, deTaggedInput.length(), groupNum);
297 }
298 }
299 parseMatcher->appendTail(deTaggedInput);
300 REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
301
302
303 //
304 // Do a find on the de-tagged input using the caller's pattern
305 //
306 matcher = callerPattern->matcher(deTaggedInput, status);
307 REGEX_CHECK_STATUS_L(line);
308 if (flags.indexOf((UChar)'t') >= 0) {
309 matcher->setTrace(TRUE);
310 }
311
312 for (i=0; i<numFinds; i++) {
313 isMatch = matcher->find();
314 }
315 matcher->setTrace(FALSE);
316
317 //
318 // Match up the groups from the find() with the groups from the tags
319 //
320
321 // number of tags should match number of groups from find operation.
322 // matcher->groupCount does not include group 0, the entire match, hence the +1.
323 // G option in test means that capture group data is not available in the
324 // expected results, so the check needs to be suppressed.
325 if (isMatch == FALSE && groupStarts.size() != 0) {
326 errln("Error at line %d: Match expected, but none found.\n", line);
327 failed = TRUE;
328 goto cleanupAndReturn;
329 }
330
331 if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
332 // Only check for match / no match. Don't check capture groups.
333 if (isMatch && groupStarts.size() == 0) {
334 errln("Error at line %d: No match expected, but one found.\n", line);
335 failed = TRUE;
336 }
337 goto cleanupAndReturn;
338 }
339
340 for (i=0; i<=matcher->groupCount(); i++) {
341 int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
342 if (matcher->start(i, status) != expectedStart) {
343 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
344 line, i, expectedStart, matcher->start(i, status));
345 failed = TRUE;
346 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
347 }
348 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
349 if (matcher->end(i, status) != expectedEnd) {
350 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
351 line, i, expectedEnd, matcher->end(i, status));
352 failed = TRUE;
353 // Error on end position; keep going; real error is probably yet to come as group
354 // end positions work from end of the input data towards the front.
355 }
356 }
357 if ( matcher->groupCount()+1 < groupStarts.size()) {
358 errln("Error at line %d: Expected %d capture groups, found %d.",
359 line, groupStarts.size()-1, matcher->groupCount());
360 failed = TRUE;
361 }
362
363 cleanupAndReturn:
364 if (failed) {
365 errln((UnicodeString)"\""+pattern+(UnicodeString)"\" "
366 +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");
367 // callerPattern->dump();
368 }
369 delete parseMatcher;
370 delete parsePat;
371 delete matcher;
372 delete callerPattern;
373 }
374
375
376
377
378
379
380
381
382 //---------------------------------------------------------------------------
383 //
384 // REGEX_ERR Macro + invocation function to simplify writing
385 // regex tests for incorrect patterns
386 //
387 // usage:
388 // REGEX_ERR("pattern", expected error line, column, expected status);
389 //
390 //---------------------------------------------------------------------------
391 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
392
393 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
394 UErrorCode expectedStatus, int line) {
395 UnicodeString pattern(pat);
396
397 UErrorCode status = U_ZERO_ERROR;
398 UParseError pe;
399 RegexPattern *callerPattern = NULL;
400
401 //
402 // Compile the caller's pattern
403 //
404 UnicodeString patString(pat);
405 callerPattern = RegexPattern::compile(patString, 0, pe, status);
406 if (status != expectedStatus) {
407 errln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
408 } else {
409 if (status != U_ZERO_ERROR) {
410 if (pe.line != errLine || pe.offset != errCol) {
411 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
412 line, errLine, errCol, pe.line, pe.offset);
413 }
414 }
415 }
416
417 delete callerPattern;
418 }
419
420
421
422 //---------------------------------------------------------------------------
423 //
424 // Basic Check for basic functionality of regex pattern matching.
425 // Avoid the use of REGEX_FIND test macro, which has
426 // substantial dependencies on basic Regex functionality.
427 //
428 //---------------------------------------------------------------------------
429 void RegexTest::Basic() {
430
431
432 //
433 // Debug - slide failing test cases early
434 //
435 #if 0
436 {
437 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
438 UParseError pe;
439 UErrorCode status = U_ZERO_ERROR;
440 RegexPattern::compile("^(?:a?b?)*$", 0, pe, status);
441 // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
442 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
443 }
444 exit(1);
445 #endif
446
447
448 //
449 // Pattern with parentheses
450 //
451 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
452 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
453 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);
454
455 //
456 // Patterns with *
457 //
458 REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
459 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
460 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
461 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
462 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
463
464 REGEX_TESTLM("a*", "", TRUE, TRUE);
465 REGEX_TESTLM("a*", "b", TRUE, FALSE);
466
467
468 //
469 // Patterns with "."
470 //
471 REGEX_TESTLM(".", "abc", TRUE, FALSE);
472 REGEX_TESTLM("...", "abc", TRUE, TRUE);
473 REGEX_TESTLM("....", "abc", FALSE, FALSE);
474 REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
475 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
476 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
477 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
478 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
479
480 //
481 // Patterns with * applied to chars at end of literal string
482 //
483 REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
484 REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
485
486 //
487 // Supplemental chars match as single chars, not a pair of surrogates.
488 //
489 REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
490 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
491 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
492
493
494 //
495 // UnicodeSets in the pattern
496 //
497 REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
498 REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
499 REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
500 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
501 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
502 REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
503
504 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
505 REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
506 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
507 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurences.
508 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);
509
510 //
511 // OR operator in patterns
512 //
513 REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
514 REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
515 REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
516 REGEX_TESTLM("a|b", "b", TRUE, TRUE);
517
518 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
519 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
520 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
521 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
522 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
523 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
524
525 //
526 // +
527 //
528 REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
529 REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
530 REGEX_TESTLM("b+", "", FALSE, FALSE);
531 REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
532 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
533 REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
534
535 //
536 // ?
537 //
538 REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
539 REGEX_TESTLM("ab?", "a", TRUE, TRUE);
540 REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
541 REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
542 REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
543 REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
544 REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
545 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
546 REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
547
548 //
549 // Escape sequences that become single literal chars, handled internally
550 // by ICU's Unescape.
551 //
552
553 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
554 REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
555 REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
556 REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
557 REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
558 REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
559 REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
560 REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
561 REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
562 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
563
564 REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
565 REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input
566
567 // Escape of special chars in patterns
568 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
569
570
571 }
572
573
574 //---------------------------------------------------------------------------
575 //
576 // API_Match Test that the API for class RegexMatcher
577 // is present and nominally working, but excluding functions
578 // implementing replace operations.
579 //
580 //---------------------------------------------------------------------------
581 void RegexTest::API_Match() {
582 UParseError pe;
583 UErrorCode status=U_ZERO_ERROR;
584 int32_t flags = 0;
585
586 //
587 // Debug - slide failing test cases early
588 //
589 #if 0
590 {
591 }
592 return;
593 #endif
594
595 //
596 // Simple pattern compilation
597 //
598 {
599 UnicodeString re("abc");
600 RegexPattern *pat2;
601 pat2 = RegexPattern::compile(re, flags, pe, status);
602 REGEX_CHECK_STATUS;
603
604 UnicodeString inStr1 = "abcdef this is a test";
605 UnicodeString instr2 = "not abc";
606 UnicodeString empty = "";
607
608
609 //
610 // Matcher creation and reset.
611 //
612 RegexMatcher *m1 = pat2->matcher(inStr1, status);
613 REGEX_CHECK_STATUS;
614 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
615 REGEX_ASSERT(m1->input() == inStr1);
616 m1->reset(instr2);
617 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
618 REGEX_ASSERT(m1->input() == instr2);
619 m1->reset(inStr1);
620 REGEX_ASSERT(m1->input() == inStr1);
621 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
622 m1->reset(empty);
623 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
624 REGEX_ASSERT(m1->input() == empty);
625 REGEX_ASSERT(&m1->pattern() == pat2);
626
627 //
628 // reset(pos, status)
629 //
630 m1->reset(inStr1);
631 m1->reset(4, status);
632 REGEX_CHECK_STATUS;
633 REGEX_ASSERT(m1->input() == inStr1);
634 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
635
636 m1->reset(-1, status);
637 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
638 status = U_ZERO_ERROR;
639
640 m1->reset(0, status);
641 REGEX_CHECK_STATUS;
642 status = U_ZERO_ERROR;
643
644 int32_t len = m1->input().length();
645 m1->reset(len-1, status);
646 REGEX_CHECK_STATUS;
647 status = U_ZERO_ERROR;
648
649 m1->reset(len, status);
650 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
651 status = U_ZERO_ERROR;
652
653 //
654 // match(pos, status)
655 //
656 m1->reset(instr2);
657 REGEX_ASSERT(m1->matches(4, status) == TRUE);
658 m1->reset();
659 REGEX_ASSERT(m1->matches(3, status) == FALSE);
660 m1->reset();
661 REGEX_ASSERT(m1->matches(5, status) == FALSE);
662 REGEX_ASSERT(m1->matches(4, status) == TRUE);
663 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
664 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
665
666 // Match() at end of string should fail, but should not
667 // be an error.
668 status = U_ZERO_ERROR;
669 len = m1->input().length();
670 REGEX_ASSERT(m1->matches(len, status) == FALSE);
671 REGEX_CHECK_STATUS;
672
673 // Match beyond end of string should fail with an error.
674 status = U_ZERO_ERROR;
675 REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
676 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
677
678 // Successful match at end of string.
679 {
680 status = U_ZERO_ERROR;
681 RegexMatcher m("A?", 0, status); // will match zero length string.
682 REGEX_CHECK_STATUS;
683 m.reset(inStr1);
684 len = inStr1.length();
685 REGEX_ASSERT(m.matches(len, status) == TRUE);
686 REGEX_CHECK_STATUS;
687 m.reset(empty);
688 REGEX_ASSERT(m.matches(0, status) == TRUE);
689 REGEX_CHECK_STATUS;
690 }
691
692
693 //
694 // lookingAt(pos, status)
695 //
696 status = U_ZERO_ERROR;
697 m1->reset(instr2); // "not abc"
698 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
699 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
700 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
701 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
702 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
703 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
704 status = U_ZERO_ERROR;
705 len = m1->input().length();
706 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
707 REGEX_CHECK_STATUS;
708 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
709 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
710
711 delete m1;
712 delete pat2;
713 }
714
715
716 //
717 // Capture Group.
718 // RegexMatcher::start();
719 // RegexMatcher::end();
720 // RegexMatcher::groupCount();
721 //
722 {
723 int32_t flags=0;
724 UParseError pe;
725 UErrorCode status=U_ZERO_ERROR;
726
727 UnicodeString re("01(23(45)67)(.*)");
728 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
729 REGEX_CHECK_STATUS;
730 UnicodeString data = "0123456789";
731
732 RegexMatcher *matcher = pat->matcher(data, status);
733 REGEX_CHECK_STATUS;
734 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
735 int matchStarts[] = {0, 2, 4, 8};
736 int matchEnds[] = {10, 8, 6, 10};
737 int i;
738 for (i=0; i<4; i++) {
739 int32_t actualStart = matcher->start(i, status);
740 REGEX_CHECK_STATUS;
741 if (actualStart != matchStarts[i]) {
742 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
743 __LINE__, i, matchStarts[i], actualStart);
744 }
745 int32_t actualEnd = matcher->end(i, status);
746 REGEX_CHECK_STATUS;
747 if (actualEnd != matchEnds[i]) {
748 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
749 __LINE__, i, matchEnds[i], actualEnd);
750 }
751 }
752
753 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
754 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
755
756 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
757 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
758 matcher->reset();
759 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
760
761 matcher->lookingAt(status);
762 REGEX_ASSERT(matcher->group(status) == "0123456789");
763 REGEX_ASSERT(matcher->group(0, status) == "0123456789");
764 REGEX_ASSERT(matcher->group(1, status) == "234567" );
765 REGEX_ASSERT(matcher->group(2, status) == "45" );
766 REGEX_ASSERT(matcher->group(3, status) == "89" );
767 REGEX_CHECK_STATUS;
768 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
769 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
770 matcher->reset();
771 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
772
773 delete matcher;
774 delete pat;
775
776 }
777
778 //
779 // find
780 //
781 {
782 int32_t flags=0;
783 UParseError pe;
784 UErrorCode status=U_ZERO_ERROR;
785
786 UnicodeString re("abc");
787 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
788 REGEX_CHECK_STATUS;
789 UnicodeString data = ".abc..abc...abc..";
790 // 012345678901234567
791
792 RegexMatcher *matcher = pat->matcher(data, status);
793 REGEX_CHECK_STATUS;
794 REGEX_ASSERT(matcher->find());
795 REGEX_ASSERT(matcher->start(status) == 1);
796 REGEX_ASSERT(matcher->find());
797 REGEX_ASSERT(matcher->start(status) == 6);
798 REGEX_ASSERT(matcher->find());
799 REGEX_ASSERT(matcher->start(status) == 12);
800 REGEX_ASSERT(matcher->find() == FALSE);
801 REGEX_ASSERT(matcher->find() == FALSE);
802
803 matcher->reset();
804 REGEX_ASSERT(matcher->find());
805 REGEX_ASSERT(matcher->start(status) == 1);
806
807 REGEX_ASSERT(matcher->find(0, status));
808 REGEX_ASSERT(matcher->start(status) == 1);
809 REGEX_ASSERT(matcher->find(1, status));
810 REGEX_ASSERT(matcher->start(status) == 1);
811 REGEX_ASSERT(matcher->find(2, status));
812 REGEX_ASSERT(matcher->start(status) == 6);
813 REGEX_ASSERT(matcher->find(12, status));
814 REGEX_ASSERT(matcher->start(status) == 12);
815 REGEX_ASSERT(matcher->find(13, status) == FALSE);
816 REGEX_ASSERT(matcher->find(16, status) == FALSE);
817 REGEX_ASSERT(matcher->find(17, status) == FALSE);
818 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
819
820 status = U_ZERO_ERROR;
821 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
822 status = U_ZERO_ERROR;
823 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
824
825 REGEX_ASSERT(matcher->groupCount() == 0);
826
827 delete matcher;
828 delete pat;
829 }
830
831
832 //
833 // find, with \G in pattern (true if at the end of a previous match).
834 //
835 {
836 int32_t flags=0;
837 UParseError pe;
838 UErrorCode status=U_ZERO_ERROR;
839
840 UnicodeString re(".*?(?:(\\Gabc)|(abc))");
841 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
842 REGEX_CHECK_STATUS;
843 UnicodeString data = ".abcabc.abc..";
844 // 012345678901234567
845
846 RegexMatcher *matcher = pat->matcher(data, status);
847 REGEX_CHECK_STATUS;
848 REGEX_ASSERT(matcher->find());
849 REGEX_ASSERT(matcher->start(status) == 0);
850 REGEX_ASSERT(matcher->start(1, status) == -1);
851 REGEX_ASSERT(matcher->start(2, status) == 1);
852
853 REGEX_ASSERT(matcher->find());
854 REGEX_ASSERT(matcher->start(status) == 4);
855 REGEX_ASSERT(matcher->start(1, status) == 4);
856 REGEX_ASSERT(matcher->start(2, status) == -1);
857 REGEX_CHECK_STATUS;
858
859 delete matcher;
860 delete pat;
861 }
862
863 //
864 // find with zero length matches, match position should bump ahead
865 // to prevent loops.
866 //
867 {
868 int i;
869 UErrorCode status=U_ZERO_ERROR;
870 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
871 // using an always-true look-ahead.
872 REGEX_CHECK_STATUS;
873 UnicodeString s(" ");
874 m.reset(s);
875 for (i=0; ; i++) {
876 if (m.find() == FALSE) {
877 break;
878 }
879 REGEX_ASSERT(m.start(status) == i);
880 REGEX_ASSERT(m.end(status) == i);
881 }
882 REGEX_ASSERT(i==5);
883
884 // Check that the bump goes over surrogate pairs OK
885 s = "\\U00010001\\U00010002\\U00010003\\U00010004";
886 s = s.unescape();
887 m.reset(s);
888 for (i=0; ; i+=2) {
889 if (m.find() == FALSE) {
890 break;
891 }
892 REGEX_ASSERT(m.start(status) == i);
893 REGEX_ASSERT(m.end(status) == i);
894 }
895 REGEX_ASSERT(i==10);
896 }
897 {
898 // find() loop breaking test.
899 // with pattern of /.?/, should see a series of one char matches, then a single
900 // match of zero length at the end of the input string.
901 int i;
902 UErrorCode status=U_ZERO_ERROR;
903 RegexMatcher m(".?", 0, status);
904 REGEX_CHECK_STATUS;
905 UnicodeString s(" ");
906 m.reset(s);
907 for (i=0; ; i++) {
908 if (m.find() == FALSE) {
909 break;
910 }
911 REGEX_ASSERT(m.start(status) == i);
912 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
913 }
914 REGEX_ASSERT(i==5);
915 }
916
917
918 //
919 // Matchers with no input string behave as if they had an empty input string.
920 //
921
922 {
923 UErrorCode status = U_ZERO_ERROR;
924 RegexMatcher m(".?", 0, status);
925 REGEX_CHECK_STATUS;
926 REGEX_ASSERT(m.find());
927 REGEX_ASSERT(m.start(status) == 0);
928 REGEX_ASSERT(m.input() == "");
929 }
930 {
931 UErrorCode status = U_ZERO_ERROR;
932 RegexPattern *p = RegexPattern::compile(".", 0, status);
933 RegexMatcher *m = p->matcher(status);
934 REGEX_CHECK_STATUS;
935
936 REGEX_ASSERT(m->find() == FALSE);
937 REGEX_ASSERT(m->input() == "");
938 delete m;
939 delete p;
940 }
941
942 //
943 // Compilation error on reset with UChar *
944 // These were a hazard that people were stumbling over with runtime errors.
945 // Changed them to compiler errors by adding private methods that more closely
946 // matched the incorrect use of the functions.
947 //
948 #if 0
949 {
950 UErrorCode status = U_ZERO_ERROR;
951 UChar ucharString[20];
952 RegexMatcher m(".", 0, status);
953 m.reset(ucharString); // should not compile.
954
955 RegexPattern *p = RegexPattern::compile(".", 0, status);
956 RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile.
957
958 RegexMatcher m3(".", ucharString, 0, status); // Should not compile
959 }
960 #endif
961
962 }
963
964
965
966
967
968
969 //---------------------------------------------------------------------------
970 //
971 // API_Replace API test for class RegexMatcher, testing the
972 // Replace family of functions.
973 //
974 //---------------------------------------------------------------------------
975 void RegexTest::API_Replace() {
976 //
977 // Replace
978 //
979 int32_t flags=0;
980 UParseError pe;
981 UErrorCode status=U_ZERO_ERROR;
982
983 UnicodeString re("abc");
984 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
985 REGEX_CHECK_STATUS;
986 UnicodeString data = ".abc..abc...abc..";
987 // 012345678901234567
988 RegexMatcher *matcher = pat->matcher(data, status);
989
990 //
991 // Plain vanilla matches.
992 //
993 UnicodeString dest;
994 dest = matcher->replaceFirst("yz", status);
995 REGEX_CHECK_STATUS;
996 REGEX_ASSERT(dest == ".yz..abc...abc..");
997
998 dest = matcher->replaceAll("yz", status);
999 REGEX_CHECK_STATUS;
1000 REGEX_ASSERT(dest == ".yz..yz...yz..");
1001
1002 //
1003 // Plain vanilla non-matches.
1004 //
1005 UnicodeString d2 = ".abx..abx...abx..";
1006 matcher->reset(d2);
1007 dest = matcher->replaceFirst("yz", status);
1008 REGEX_CHECK_STATUS;
1009 REGEX_ASSERT(dest == ".abx..abx...abx..");
1010
1011 dest = matcher->replaceAll("yz", status);
1012 REGEX_CHECK_STATUS;
1013 REGEX_ASSERT(dest == ".abx..abx...abx..");
1014
1015 //
1016 // Empty source string
1017 //
1018 UnicodeString d3 = "";
1019 matcher->reset(d3);
1020 dest = matcher->replaceFirst("yz", status);
1021 REGEX_CHECK_STATUS;
1022 REGEX_ASSERT(dest == "");
1023
1024 dest = matcher->replaceAll("yz", status);
1025 REGEX_CHECK_STATUS;
1026 REGEX_ASSERT(dest == "");
1027
1028 //
1029 // Empty substitution string
1030 //
1031 matcher->reset(data); // ".abc..abc...abc.."
1032 dest = matcher->replaceFirst("", status);
1033 REGEX_CHECK_STATUS;
1034 REGEX_ASSERT(dest == "...abc...abc..");
1035
1036 dest = matcher->replaceAll("", status);
1037 REGEX_CHECK_STATUS;
1038 REGEX_ASSERT(dest == "........");
1039
1040 //
1041 // match whole string
1042 //
1043 UnicodeString d4 = "abc";
1044 matcher->reset(d4);
1045 dest = matcher->replaceFirst("xyz", status);
1046 REGEX_CHECK_STATUS;
1047 REGEX_ASSERT(dest == "xyz");
1048
1049 dest = matcher->replaceAll("xyz", status);
1050 REGEX_CHECK_STATUS;
1051 REGEX_ASSERT(dest == "xyz");
1052
1053 //
1054 // Capture Group, simple case
1055 //
1056 UnicodeString re2("a(..)");
1057 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1058 REGEX_CHECK_STATUS;
1059 UnicodeString d5 = "abcdefg";
1060 RegexMatcher *matcher2 = pat2->matcher(d5, status);
1061 REGEX_CHECK_STATUS;
1062 dest = matcher2->replaceFirst("$1$1", status);
1063 REGEX_CHECK_STATUS;
1064 REGEX_ASSERT(dest == "bcbcdefg");
1065
1066 dest = matcher2->replaceFirst("The value of \\$1 is $1.", status);
1067 REGEX_CHECK_STATUS;
1068 REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1069
1070 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1071 REGEX_CHECK_STATUS;
1072 REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
1073
1074 UnicodeString replacement = "Supplemental Digit 1 $\\U0001D7CF.";
1075 replacement = replacement.unescape();
1076 dest = matcher2->replaceFirst(replacement, status);
1077 REGEX_CHECK_STATUS;
1078 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1079
1080 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1081
1082
1083 //
1084 // Replacement String with \u hex escapes
1085 //
1086 {
1087 UnicodeString src = "abc 1 abc 2 abc 3";
1088 UnicodeString substitute = "--\\u0043--";
1089 matcher->reset(src);
1090 UnicodeString result = matcher->replaceAll(substitute, status);
1091 REGEX_CHECK_STATUS;
1092 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1093 }
1094 {
1095 UnicodeString src = "abc !";
1096 UnicodeString substitute = "--\\U00010000--";
1097 matcher->reset(src);
1098 UnicodeString result = matcher->replaceAll(substitute, status);
1099 REGEX_CHECK_STATUS;
1100 UnicodeString expected = UnicodeString("--");
1101 expected.append((UChar32)0x10000);
1102 expected.append("-- !");
1103 REGEX_ASSERT(result == expected);
1104 }
1105 // TODO: need more through testing of capture substitutions.
1106
1107 // Bug 4057
1108 //
1109 {
1110 status = U_ZERO_ERROR;
1111 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1112 RegexMatcher m("ss(.*?)ee", 0, status);
1113 REGEX_CHECK_STATUS;
1114 UnicodeString result;
1115
1116 // Multiple finds do NOT bump up the previous appendReplacement postion.
1117 m.reset(s);
1118 m.find();
1119 m.find();
1120 m.appendReplacement(result, "ooh", status);
1121 REGEX_CHECK_STATUS;
1122 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1123
1124 // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1125 status = U_ZERO_ERROR;
1126 result.truncate(0);
1127 m.reset(10, status);
1128 m.find();
1129 m.find();
1130 m.appendReplacement(result, "ooh", status);
1131 REGEX_CHECK_STATUS;
1132 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1133
1134 // find() at interior of string, appendReplacemnt still starts at beginning.
1135 status = U_ZERO_ERROR;
1136 result.truncate(0);
1137 m.reset();
1138 m.find(10, status);
1139 m.find();
1140 m.appendReplacement(result, "ooh", status);
1141 REGEX_CHECK_STATUS;
1142 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1143
1144 m.appendTail(result);
1145 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1146
1147 }
1148
1149 delete matcher2;
1150 delete pat2;
1151 delete matcher;
1152 delete pat;
1153 }
1154
1155
1156 //---------------------------------------------------------------------------
1157 //
1158 // API_Pattern Test that the API for class RegexPattern is
1159 // present and nominally working.
1160 //
1161 //---------------------------------------------------------------------------
1162 void RegexTest::API_Pattern() {
1163 RegexPattern pata; // Test default constructor to not crash.
1164 RegexPattern patb;
1165
1166 REGEX_ASSERT(pata == patb);
1167 REGEX_ASSERT(pata == pata);
1168
1169 UnicodeString re1("abc[a-l][m-z]");
1170 UnicodeString re2("def");
1171 UErrorCode status = U_ZERO_ERROR;
1172 UParseError pe;
1173
1174 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status);
1175 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status);
1176 REGEX_CHECK_STATUS;
1177 REGEX_ASSERT(*pat1 == *pat1);
1178 REGEX_ASSERT(*pat1 != pata);
1179
1180 // Assign
1181 patb = *pat1;
1182 REGEX_ASSERT(patb == *pat1);
1183
1184 // Copy Construct
1185 RegexPattern patc(*pat1);
1186 REGEX_ASSERT(patc == *pat1);
1187 REGEX_ASSERT(patb == patc);
1188 REGEX_ASSERT(pat1 != pat2);
1189 patb = *pat2;
1190 REGEX_ASSERT(patb != patc);
1191 REGEX_ASSERT(patb == *pat2);
1192
1193 // Compile with no flags.
1194 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status);
1195 REGEX_ASSERT(*pat1a == *pat1);
1196
1197 REGEX_ASSERT(pat1a->flags() == 0);
1198
1199 // Compile with different flags should be not equal
1200 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1201 REGEX_CHECK_STATUS;
1202
1203 REGEX_ASSERT(*pat1b != *pat1a);
1204 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1205 REGEX_ASSERT(pat1a->flags() == 0);
1206 delete pat1b;
1207
1208 // clone
1209 RegexPattern *pat1c = pat1->clone();
1210 REGEX_ASSERT(*pat1c == *pat1);
1211 REGEX_ASSERT(*pat1c != *pat2);
1212
1213 delete pat1c;
1214 delete pat1a;
1215 delete pat1;
1216 delete pat2;
1217
1218
1219 //
1220 // Verify that a matcher created from a cloned pattern works.
1221 // (Jitterbug 3423)
1222 //
1223 {
1224 UErrorCode status = U_ZERO_ERROR;
1225 RegexPattern *pSource = RegexPattern::compile("\\p{L}+", 0, status);
1226 RegexPattern *pClone = pSource->clone();
1227 delete pSource;
1228 RegexMatcher *mFromClone = pClone->matcher(status);
1229 REGEX_CHECK_STATUS;
1230 UnicodeString s = "Hello World";
1231 mFromClone->reset(s);
1232 REGEX_ASSERT(mFromClone->find() == TRUE);
1233 REGEX_ASSERT(mFromClone->group(status) == "Hello");
1234 REGEX_ASSERT(mFromClone->find() == TRUE);
1235 REGEX_ASSERT(mFromClone->group(status) == "World");
1236 REGEX_ASSERT(mFromClone->find() == FALSE);
1237 delete mFromClone;
1238 delete pClone;
1239 }
1240
1241 //
1242 // matches convenience API
1243 //
1244 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1245 REGEX_CHECK_STATUS;
1246 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1247 REGEX_CHECK_STATUS;
1248 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1249 REGEX_CHECK_STATUS;
1250 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1251 REGEX_CHECK_STATUS;
1252 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1253 REGEX_CHECK_STATUS;
1254 status = U_INDEX_OUTOFBOUNDS_ERROR;
1255 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1256 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1257
1258
1259 //
1260 // Split()
1261 //
1262 status = U_ZERO_ERROR;
1263 pat1 = RegexPattern::compile(" +", pe, status);
1264 REGEX_CHECK_STATUS;
1265 UnicodeString fields[10];
1266
1267 int32_t n;
1268 n = pat1->split("Now is the time", fields, 10, status);
1269 REGEX_CHECK_STATUS;
1270 REGEX_ASSERT(n==4);
1271 REGEX_ASSERT(fields[0]=="Now");
1272 REGEX_ASSERT(fields[1]=="is");
1273 REGEX_ASSERT(fields[2]=="the");
1274 REGEX_ASSERT(fields[3]=="time");
1275 REGEX_ASSERT(fields[4]=="");
1276
1277 n = pat1->split("Now is the time", fields, 2, status);
1278 REGEX_CHECK_STATUS;
1279 REGEX_ASSERT(n==2);
1280 REGEX_ASSERT(fields[0]=="Now");
1281 REGEX_ASSERT(fields[1]=="is the time");
1282 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
1283
1284 fields[1] = "*";
1285 status = U_ZERO_ERROR;
1286 n = pat1->split("Now is the time", fields, 1, status);
1287 REGEX_CHECK_STATUS;
1288 REGEX_ASSERT(n==1);
1289 REGEX_ASSERT(fields[0]=="Now is the time");
1290 REGEX_ASSERT(fields[1]=="*");
1291 status = U_ZERO_ERROR;
1292
1293 n = pat1->split(" Now is the time ", fields, 10, status);
1294 REGEX_CHECK_STATUS;
1295 REGEX_ASSERT(n==5);
1296 REGEX_ASSERT(fields[0]=="");
1297 REGEX_ASSERT(fields[1]=="Now");
1298 REGEX_ASSERT(fields[2]=="is");
1299 REGEX_ASSERT(fields[3]=="the");
1300 REGEX_ASSERT(fields[4]=="time");
1301 REGEX_ASSERT(fields[5]=="");
1302
1303 n = pat1->split(" ", fields, 10, status);
1304 REGEX_CHECK_STATUS;
1305 REGEX_ASSERT(n==1);
1306 REGEX_ASSERT(fields[0]=="");
1307
1308 fields[0] = "foo";
1309 n = pat1->split("", fields, 10, status);
1310 REGEX_CHECK_STATUS;
1311 REGEX_ASSERT(n==0);
1312 REGEX_ASSERT(fields[0]=="foo");
1313
1314 delete pat1;
1315
1316 // split, with a pattern with (capture)
1317 pat1 = RegexPattern::compile("<(\\w*)>", pe, status);
1318 REGEX_CHECK_STATUS;
1319
1320 status = U_ZERO_ERROR;
1321 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1322 REGEX_CHECK_STATUS;
1323 REGEX_ASSERT(n==6);
1324 REGEX_ASSERT(fields[0]=="");
1325 REGEX_ASSERT(fields[1]=="a");
1326 REGEX_ASSERT(fields[2]=="Now is ");
1327 REGEX_ASSERT(fields[3]=="b");
1328 REGEX_ASSERT(fields[4]=="the time");
1329 REGEX_ASSERT(fields[5]=="c");
1330 REGEX_ASSERT(fields[6]=="");
1331 REGEX_ASSERT(status==U_ZERO_ERROR);
1332
1333 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
1334 REGEX_CHECK_STATUS;
1335 REGEX_ASSERT(n==6);
1336 REGEX_ASSERT(fields[0]==" ");
1337 REGEX_ASSERT(fields[1]=="a");
1338 REGEX_ASSERT(fields[2]=="Now is ");
1339 REGEX_ASSERT(fields[3]=="b");
1340 REGEX_ASSERT(fields[4]=="the time");
1341 REGEX_ASSERT(fields[5]=="c");
1342 REGEX_ASSERT(fields[6]=="");
1343
1344 status = U_ZERO_ERROR;
1345 fields[6] = "foo";
1346 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);
1347 REGEX_CHECK_STATUS;
1348 REGEX_ASSERT(n==6);
1349 REGEX_ASSERT(fields[0]==" ");
1350 REGEX_ASSERT(fields[1]=="a");
1351 REGEX_ASSERT(fields[2]=="Now is ");
1352 REGEX_ASSERT(fields[3]=="b");
1353 REGEX_ASSERT(fields[4]=="the time");
1354 REGEX_ASSERT(fields[5]=="c");
1355 REGEX_ASSERT(fields[6]=="foo");
1356
1357 status = U_ZERO_ERROR;
1358 fields[5] = "foo";
1359 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
1360 REGEX_CHECK_STATUS;
1361 REGEX_ASSERT(n==5);
1362 REGEX_ASSERT(fields[0]==" ");
1363 REGEX_ASSERT(fields[1]=="a");
1364 REGEX_ASSERT(fields[2]=="Now is ");
1365 REGEX_ASSERT(fields[3]=="b");
1366 REGEX_ASSERT(fields[4]=="the time<c>");
1367 REGEX_ASSERT(fields[5]=="foo");
1368
1369 status = U_ZERO_ERROR;
1370 fields[5] = "foo";
1371 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
1372 REGEX_CHECK_STATUS;
1373 REGEX_ASSERT(n==5);
1374 REGEX_ASSERT(fields[0]==" ");
1375 REGEX_ASSERT(fields[1]=="a");
1376 REGEX_ASSERT(fields[2]=="Now is ");
1377 REGEX_ASSERT(fields[3]=="b");
1378 REGEX_ASSERT(fields[4]=="the time");
1379 REGEX_ASSERT(fields[5]=="foo");
1380
1381 status = U_ZERO_ERROR;
1382 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
1383 REGEX_CHECK_STATUS;
1384 REGEX_ASSERT(n==4);
1385 REGEX_ASSERT(fields[0]==" ");
1386 REGEX_ASSERT(fields[1]=="a");
1387 REGEX_ASSERT(fields[2]=="Now is ");
1388 REGEX_ASSERT(fields[3]=="the time<c>");
1389 status = U_ZERO_ERROR;
1390 delete pat1;
1391
1392 pat1 = RegexPattern::compile("([-,])", pe, status);
1393 REGEX_CHECK_STATUS;
1394 n = pat1->split("1-10,20", fields, 10, status);
1395 REGEX_CHECK_STATUS;
1396 REGEX_ASSERT(n==5);
1397 REGEX_ASSERT(fields[0]=="1");
1398 REGEX_ASSERT(fields[1]=="-");
1399 REGEX_ASSERT(fields[2]=="10");
1400 REGEX_ASSERT(fields[3]==",");
1401 REGEX_ASSERT(fields[4]=="20");
1402 delete pat1;
1403
1404
1405 //
1406 // RegexPattern::pattern()
1407 //
1408 pat1 = new RegexPattern();
1409 REGEX_ASSERT(pat1->pattern() == "");
1410 delete pat1;
1411
1412 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1413 REGEX_CHECK_STATUS;
1414 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1415 delete pat1;
1416
1417
1418 //
1419 // classID functions
1420 //
1421 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1422 REGEX_CHECK_STATUS;
1423 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1424 REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1425 UnicodeString Hello("Hello, world.");
1426 RegexMatcher *m = pat1->matcher(Hello, status);
1427 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1428 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1429 REGEX_ASSERT(m->getDynamicClassID() != NULL);
1430 delete m;
1431 delete pat1;
1432
1433 }
1434
1435 //---------------------------------------------------------------------------
1436 //
1437 // Extended A more thorough check for features of regex patterns
1438 // The test cases are in a separate data file,
1439 // source/tests/testdata/regextst.txt
1440 // A description of the test data format is included in that file.
1441 //
1442 //---------------------------------------------------------------------------
1443
1444 const char *
1445 RegexTest::getPath(char buffer[2048], const char *filename) {
1446 UErrorCode status=U_ZERO_ERROR;
1447 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1448 if (U_FAILURE(status)) {
1449 errln("ERROR: loadTestData() failed - %s", u_errorName(status));
1450 return NULL;
1451 }
1452
1453 strcpy(buffer, testDataDirectory);
1454 strcat(buffer, filename);
1455 return buffer;
1456 }
1457
1458 void RegexTest::Extended() {
1459 char tdd[2048];
1460 const char *srcPath;
1461 UErrorCode status = U_ZERO_ERROR;
1462 int32_t lineNum = 0;
1463
1464 //
1465 // Open and read the test data file.
1466 //
1467 srcPath=getPath(tdd, "regextst.txt");
1468 if(srcPath==NULL) {
1469 return; /* something went wrong, error already output */
1470 }
1471
1472 int len;
1473 UChar *testData = ReadAndConvertFile(srcPath, len, status);
1474 if (U_FAILURE(status)) {
1475 return; /* something went wrong, error already output */
1476 }
1477
1478 //
1479 // Put the test data into a UnicodeString
1480 //
1481 UnicodeString testString(FALSE, testData, len);
1482
1483 RegexMatcher quotedStuffMat("\\s*([\\'\\\"/])(.*?)\\1", 0, status);
1484 RegexMatcher commentMat ("\\s*(#.*)?$", 0, status);
1485 RegexMatcher flagsMat ("\\s*([ixsmdtGv2-9]*)([:letter:]*)", 0, status);
1486
1487 RegexMatcher lineMat("(.*?)\\r?\\n", testString, 0, status);
1488 UnicodeString testPattern; // The pattern for test from the test file.
1489 UnicodeString testFlags; // the flags for a test.
1490 UnicodeString matchString; // The marked up string to be used as input
1491
1492 if (U_FAILURE(status)){
1493 dataerrln("Construct RegexMatcher() error.");
1494 delete [] testData;
1495 return;
1496 }
1497
1498 //
1499 // Loop over the test data file, once per line.
1500 //
1501 while (lineMat.find()) {
1502 lineNum++;
1503 if (U_FAILURE(status)) {
1504 errln("line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
1505 }
1506
1507 status = U_ZERO_ERROR;
1508 UnicodeString testLine = lineMat.group(1, status);
1509 if (testLine.length() == 0) {
1510 continue;
1511 }
1512
1513 //
1514 // Parse the test line. Skip blank and comment only lines.
1515 // Separate out the three main fields - pattern, flags, target.
1516 //
1517
1518 commentMat.reset(testLine);
1519 if (commentMat.lookingAt(status)) {
1520 // This line is a comment, or blank.
1521 continue;
1522 }
1523
1524 //
1525 // Pull out the pattern field, remove it from the test file line.
1526 //
1527 quotedStuffMat.reset(testLine);
1528 if (quotedStuffMat.lookingAt(status)) {
1529 testPattern = quotedStuffMat.group(2, status);
1530 testLine.remove(0, quotedStuffMat.end(0, status));
1531 } else {
1532 errln("Bad pattern (missing quotes?) at test file line %d", lineNum);
1533 continue;
1534 }
1535
1536
1537 //
1538 // Pull out the flags from the test file line.
1539 //
1540 flagsMat.reset(testLine);
1541 flagsMat.lookingAt(status); // Will always match, possibly an empty string.
1542 testFlags = flagsMat.group(1, status);
1543 if (flagsMat.group(2, status).length() > 0) {
1544 errln("Bad Match flag at line %d. Scanning %c\n",
1545 lineNum, flagsMat.group(2, status).charAt(0));
1546 continue;
1547 }
1548 testLine.remove(0, flagsMat.end(0, status));
1549
1550 //
1551 // Pull out the match string, as a whole.
1552 // We'll process the <tags> later.
1553 //
1554 quotedStuffMat.reset(testLine);
1555 if (quotedStuffMat.lookingAt(status)) {
1556 matchString = quotedStuffMat.group(2, status);
1557 testLine.remove(0, quotedStuffMat.end(0, status));
1558 } else {
1559 errln("Bad match string at test file line %d", lineNum);
1560 continue;
1561 }
1562
1563 //
1564 // The only thing left from the input line should be an optional trailing comment.
1565 //
1566 commentMat.reset(testLine);
1567 if (commentMat.lookingAt(status) == FALSE) {
1568 errln("Line %d: unexpected characters at end of test line.", lineNum);
1569 continue;
1570 }
1571
1572 //
1573 // Run the test
1574 //
1575 regex_find(testPattern, testFlags, matchString, lineNum);
1576 }
1577
1578 delete [] testData;
1579
1580 }
1581
1582
1583
1584 //---------------------------------------------------------------------------
1585 //
1586 // Errors Check for error handling in patterns.
1587 //
1588 //---------------------------------------------------------------------------
1589 void RegexTest::Errors() {
1590 // \escape sequences that aren't implemented yet.
1591 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
1592
1593 // Missing close parentheses
1594 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
1595 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
1596 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
1597
1598 // Extra close paren
1599 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
1600 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
1601 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
1602
1603 // Look-ahead, Look-behind
1604 // TODO: add tests for unbounded length look-behinds.
1605 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
1606
1607 // Attempt to use non-default flags
1608 {
1609 UParseError pe;
1610 UErrorCode status = U_ZERO_ERROR;
1611 int32_t flags = UREGEX_CANON_EQ |
1612 UREGEX_COMMENTS | UREGEX_DOTALL |
1613 UREGEX_MULTILINE;
1614 RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
1615 REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
1616 delete pat1;
1617 }
1618
1619
1620 // Quantifiers are allowed only after something that can be quantified.
1621 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
1622 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
1623 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
1624
1625 // Mal-formed {min,max} quantifiers
1626 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
1627 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
1628 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
1629 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
1630 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
1631 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
1632 REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan
1633 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format
1634 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
1635
1636
1637 // UnicodeSet containing a string
1638 REGEX_ERR("abc[{def}]xyz", 1, 10, U_REGEX_SET_CONTAINS_STRING);
1639
1640 }
1641
1642
1643 //-------------------------------------------------------------------------------
1644 //
1645 // Read a text data file, convert it to UChars, and return the data
1646 // in one big UChar * buffer, which the caller must delete.
1647 //
1648 //--------------------------------------------------------------------------------
1649 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int &ulen, UErrorCode &status) {
1650 UChar *retPtr = NULL;
1651 char *fileBuf = NULL;
1652 UConverter* conv = NULL;
1653 FILE *f = NULL;
1654
1655 ulen = 0;
1656 if (U_FAILURE(status)) {
1657 return retPtr;
1658 }
1659
1660 //
1661 // Open the file.
1662 //
1663 f = fopen(fileName, "rb");
1664 if (f == 0) {
1665 errln("Error opening test data file %s\n", fileName);
1666 status = U_FILE_ACCESS_ERROR;
1667 return NULL;
1668 }
1669 //
1670 // Read it in
1671 //
1672 int fileSize;
1673 int amt_read;
1674
1675 fseek( f, 0, SEEK_END);
1676 fileSize = ftell(f);
1677 fileBuf = new char[fileSize];
1678 fseek(f, 0, SEEK_SET);
1679 amt_read = fread(fileBuf, 1, fileSize, f);
1680 if (amt_read != fileSize || fileSize <= 0) {
1681 errln("Error reading test data file.");
1682 goto cleanUpAndReturn;
1683 }
1684
1685 //
1686 // Look for a Unicode Signature (BOM) on the data just read
1687 //
1688 int32_t signatureLength;
1689 const char * fileBufC;
1690 const char* encoding;
1691
1692 fileBufC = fileBuf;
1693 encoding = ucnv_detectUnicodeSignature(
1694 fileBuf, fileSize, &signatureLength, &status);
1695 if(encoding!=NULL ){
1696 fileBufC += signatureLength;
1697 fileSize -= signatureLength;
1698 }
1699
1700 //
1701 // Open a converter to take the rule file to UTF-16
1702 //
1703 conv = ucnv_open(encoding, &status);
1704 if (U_FAILURE(status)) {
1705 goto cleanUpAndReturn;
1706 }
1707
1708 //
1709 // Convert the rules to UChar.
1710 // Preflight first to determine required buffer size.
1711 //
1712 ulen = ucnv_toUChars(conv,
1713 NULL, // dest,
1714 0, // destCapacity,
1715 fileBufC,
1716 fileSize,
1717 &status);
1718 if (status == U_BUFFER_OVERFLOW_ERROR) {
1719 // Buffer Overflow is expected from the preflight operation.
1720 status = U_ZERO_ERROR;
1721
1722 retPtr = new UChar[ulen+1];
1723 ucnv_toUChars(conv,
1724 retPtr, // dest,
1725 ulen+1,
1726 fileBufC,
1727 fileSize,
1728 &status);
1729 }
1730
1731 cleanUpAndReturn:
1732 fclose(f);
1733 delete[] fileBuf;
1734 ucnv_close(conv);
1735 if (U_FAILURE(status)) {
1736 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1737 delete retPtr;
1738 retPtr = 0;
1739 ulen = 0;
1740 };
1741 return retPtr;
1742 }
1743
1744
1745 //-------------------------------------------------------------------------------
1746 //
1747 // PerlTests - Run Perl's regular expression tests
1748 // The input file for this test is re_tests, the standard regular
1749 // expression test data distributed with the Perl source code.
1750 //
1751 // Here is Perl's description of the test data file:
1752 //
1753 // # The tests are in a separate file 't/op/re_tests'.
1754 // # Each line in that file is a separate test.
1755 // # There are five columns, separated by tabs.
1756 // #
1757 // # Column 1 contains the pattern, optionally enclosed in C<''>.
1758 // # Modifiers can be put after the closing C<'>.
1759 // #
1760 // # Column 2 contains the string to be matched.
1761 // #
1762 // # Column 3 contains the expected result:
1763 // # y expect a match
1764 // # n expect no match
1765 // # c expect an error
1766 // # B test exposes a known bug in Perl, should be skipped
1767 // # b test exposes a known bug in Perl, should be skipped if noamp
1768 // #
1769 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
1770 // #
1771 // # Column 4 contains a string, usually C<$&>.
1772 // #
1773 // # Column 5 contains the expected result of double-quote
1774 // # interpolating that string after the match, or start of error message.
1775 // #
1776 // # Column 6, if present, contains a reason why the test is skipped.
1777 // # This is printed with "skipped", for harness to pick up.
1778 // #
1779 // # \n in the tests are interpolated, as are variables of the form ${\w+}.
1780 // #
1781 // # If you want to add a regular expression test that can't be expressed
1782 // # in this format, don't add it here: put it in op/pat.t instead.
1783 //
1784 // For ICU, if field 3 contains an 'i', the test will be skipped.
1785 // The test exposes is some known incompatibility between ICU and Perl regexps.
1786 // (The i is in addition to whatever was there before.)
1787 //
1788 //-------------------------------------------------------------------------------
1789 void RegexTest::PerlTests() {
1790 char tdd[2048];
1791 const char *srcPath;
1792 UErrorCode status = U_ZERO_ERROR;
1793 UParseError pe;
1794
1795 //
1796 // Open and read the test data file.
1797 //
1798 srcPath=getPath(tdd, "re_tests.txt");
1799 if(srcPath==NULL) {
1800 return; /* something went wrong, error already output */
1801 }
1802
1803 int len;
1804 UChar *testData = ReadAndConvertFile(srcPath, len, status);
1805 if (U_FAILURE(status)) {
1806 return; /* something went wrong, error already output */
1807 }
1808
1809 //
1810 // Put the test data into a UnicodeString
1811 //
1812 UnicodeString testDataString(FALSE, testData, len);
1813
1814 //
1815 // Regex to break the input file into lines, and strip the new lines.
1816 // One line per match, capture group one is the desired data.
1817 //
1818 RegexPattern* linePat = RegexPattern::compile("(.+?)[\\r\\n]+", 0, pe, status);
1819 if (U_FAILURE(status)) {
1820 dataerrln("RegexPattern::compile() error");
1821 return;
1822 }
1823 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
1824
1825 //
1826 // Regex to split a test file line into fields.
1827 // There are six fields, separated by tabs.
1828 //
1829 RegexPattern* fieldPat = RegexPattern::compile("\\t", 0, pe, status);
1830
1831 //
1832 // Regex to identify test patterns with flag settings, and to separate them.
1833 // Test patterns with flags look like 'pattern'i
1834 // Test patterns without flags are not quoted: pattern
1835 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
1836 //
1837 RegexPattern *flagPat = RegexPattern::compile("('?)(.*)\\1(.*)", 0, pe, status);
1838 RegexMatcher* flagMat = flagPat->matcher(status);
1839
1840 //
1841 // The Perl tests reference several perl-isms, which are evaluated/substituted
1842 // in the test data. Not being perl, this must be done explicitly. Here
1843 // are string constants and REs for these constructs.
1844 //
1845 UnicodeString nulnulSrc("${nulnul}");
1846 UnicodeString nulnul("\\u0000\\u0000");
1847 nulnul = nulnul.unescape();
1848
1849 UnicodeString ffffSrc("${ffff}");
1850 UnicodeString ffff("\\uffff");
1851 ffff = ffff.unescape();
1852
1853 // regexp for $-[0], $+[2], etc.
1854 RegexPattern *groupsPat = RegexPattern::compile("\\$([+\\-])\\[(\\d+)\\]", 0, pe, status);
1855 RegexMatcher *groupsMat = groupsPat->matcher(status);
1856
1857 // regexp for $0, $1, $2, etc.
1858 RegexPattern *cgPat = RegexPattern::compile("\\$(\\d+)", 0, pe, status);
1859 RegexMatcher *cgMat = cgPat->matcher(status);
1860
1861
1862 //
1863 // Main Loop for the Perl Tests, runs once per line from the
1864 // test data file.
1865 //
1866 int32_t lineNum = 0;
1867 int32_t skippedUnimplementedCount = 0;
1868 while (lineMat->find()) {
1869 lineNum++;
1870
1871 //
1872 // Get a line, break it into its fields, do the Perl
1873 // variable substitutions.
1874 //
1875 UnicodeString line = lineMat->group(1, status);
1876 UnicodeString fields[7];
1877 fieldPat->split(line, fields, 7, status);
1878
1879 flagMat->reset(fields[0]);
1880 flagMat->matches(status);
1881 UnicodeString pattern = flagMat->group(2, status);
1882 pattern.findAndReplace("${bang}", "!");
1883 pattern.findAndReplace(nulnulSrc, "\\u0000\\u0000");
1884 pattern.findAndReplace(ffffSrc, ffff);
1885
1886 //
1887 // Identify patterns that include match flag settings,
1888 // split off the flags, remove the extra quotes.
1889 //
1890 UnicodeString flagStr = flagMat->group(3, status);
1891 if (U_FAILURE(status)) {
1892 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1893 return;
1894 }
1895 int32_t flags = 0;
1896 const UChar UChar_c = 0x63; // Char constants for the flag letters.
1897 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
1898 const UChar UChar_m = 0x6d;
1899 const UChar UChar_x = 0x78;
1900 const UChar UChar_y = 0x79;
1901 if (flagStr.indexOf(UChar_i) != -1) {
1902 flags |= UREGEX_CASE_INSENSITIVE;
1903 }
1904 if (flagStr.indexOf(UChar_m) != -1) {
1905 flags |= UREGEX_MULTILINE;
1906 }
1907 if (flagStr.indexOf(UChar_x) != -1) {
1908 flags |= UREGEX_COMMENTS;
1909 }
1910
1911 //
1912 // Compile the test pattern.
1913 //
1914 status = U_ZERO_ERROR;
1915 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
1916 if (status == U_REGEX_UNIMPLEMENTED) {
1917 //
1918 // Test of a feature that is planned for ICU, but not yet implemented.
1919 // skip the test.
1920 skippedUnimplementedCount++;
1921 delete testPat;
1922 status = U_ZERO_ERROR;
1923 continue;
1924 }
1925
1926 if (U_FAILURE(status)) {
1927 // Some tests are supposed to generate errors.
1928 // Only report an error for tests that are supposed to succeed.
1929 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
1930 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
1931 {
1932 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
1933 }
1934 status = U_ZERO_ERROR;
1935 delete testPat;
1936 continue;
1937 }
1938
1939 if (fields[2].indexOf(UChar_i) >= 0) {
1940 // ICU should skip this test.
1941 delete testPat;
1942 continue;
1943 }
1944
1945 if (fields[2].indexOf(UChar_c) >= 0) {
1946 // This pattern should have caused a compilation error, but didn't/
1947 errln("line %d: Expected a pattern compile error, got success.", lineNum);
1948 delete testPat;
1949 continue;
1950 }
1951
1952 //
1953 // replace the Perl variables that appear in some of the
1954 // match data strings.
1955 //
1956 UnicodeString matchString = fields[1];
1957 matchString.findAndReplace(nulnulSrc, nulnul);
1958 matchString.findAndReplace(ffffSrc, ffff);
1959
1960 // Replace any \n in the match string with an actual new-line char.
1961 // Don't do full unescape, as this unescapes more than Perl does, which
1962 // causes other spurious failures in the tests.
1963 matchString.findAndReplace("\\n", "\n");
1964
1965
1966
1967 //
1968 // Run the test, check for expected match/don't match result.
1969 //
1970 RegexMatcher *testMat = testPat->matcher(matchString, status);
1971 UBool found = testMat->find();
1972 UBool expected = FALSE;
1973 if (fields[2].indexOf(UChar_y) >=0) {
1974 expected = TRUE;
1975 }
1976 if (expected != found) {
1977 errln("line %d: Expected %smatch, got %smatch",
1978 lineNum, expected?"":"no ", found?"":"no " );
1979 continue;
1980 }
1981
1982 //
1983 // Interpret the Perl expression from the fourth field of the data file,
1984 // building up an ICU string from the results of the ICU match.
1985 // The Perl expression will contain references to the results of
1986 // a regex match, including the matched string, capture group strings,
1987 // group starting and ending indicies, etc.
1988 //
1989 UnicodeString resultString;
1990 UnicodeString perlExpr = fields[3];
1991 groupsMat->reset(perlExpr);
1992 cgMat->reset(perlExpr);
1993
1994 while (perlExpr.length() > 0) {
1995 if (perlExpr.startsWith("$&")) {
1996 resultString.append(testMat->group(status));
1997 perlExpr.remove(0, 2);
1998 }
1999
2000 else if (groupsMat->lookingAt(status)) {
2001 // $-[0] $+[2] etc.
2002 UnicodeString digitString = groupsMat->group(2, status);
2003 int32_t t = 0;
2004 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
2005 UnicodeString plusOrMinus = groupsMat->group(1, status);
2006 int32_t matchPosition;
2007 if (plusOrMinus.compare("+") == 0) {
2008 matchPosition = testMat->end(groupNum, status);
2009 } else {
2010 matchPosition = testMat->start(groupNum, status);
2011 }
2012 if (matchPosition != -1) {
2013 ICU_Utility::appendNumber(resultString, matchPosition);
2014 }
2015 perlExpr.remove(0, groupsMat->end(status));
2016 }
2017
2018 else if (cgMat->lookingAt(status)) {
2019 // $1, $2, $3, etc.
2020 UnicodeString digitString = cgMat->group(1, status);
2021 int32_t t = 0;
2022 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
2023 if (U_SUCCESS(status)) {
2024 resultString.append(testMat->group(groupNum, status));
2025 status = U_ZERO_ERROR;
2026 }
2027 perlExpr.remove(0, cgMat->end(status));
2028 }
2029
2030 else if (perlExpr.startsWith("@-")) {
2031 int i;
2032 for (i=0; i<=testMat->groupCount(); i++) {
2033 if (i>0) {
2034 resultString.append(" ");
2035 }
2036 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
2037 }
2038 perlExpr.remove(0, 2);
2039 }
2040
2041 else if (perlExpr.startsWith("@+")) {
2042 int i;
2043 for (i=0; i<=testMat->groupCount(); i++) {
2044 if (i>0) {
2045 resultString.append(" ");
2046 }
2047 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
2048 }
2049 perlExpr.remove(0, 2);
2050 }
2051
2052 else if (perlExpr.startsWith("\\")) { // \Escape. Take following char as a literal.
2053 // or as an escaped sequence (e.g. \n)
2054 if (perlExpr.length() > 1) {
2055 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
2056 }
2057 UChar c = perlExpr.charAt(0);
2058 switch (c) {
2059 case 'n': c = '\n'; break;
2060 // add any other escape sequences that show up in the test expected results.
2061 }
2062 resultString.append(c);
2063 perlExpr.remove(0, 1);
2064 }
2065
2066 else {
2067 // Any characters from the perl expression that we don't explicitly
2068 // recognize before here are assumed to be literals and copied
2069 // as-is to the expected results.
2070 resultString.append(perlExpr.charAt(0));
2071 perlExpr.remove(0, 1);
2072 }
2073
2074 if (U_FAILURE(status)) {
2075 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
2076 break;
2077 }
2078 }
2079
2080 //
2081 // Expected Results Compare
2082 //
2083 UnicodeString expectedS(fields[4]);
2084 expectedS.findAndReplace(nulnulSrc, nulnul);
2085 expectedS.findAndReplace(ffffSrc, ffff);
2086 expectedS.findAndReplace("\\n", "\n");
2087
2088
2089 if (expectedS.compare(resultString) != 0) {
2090 err("Line %d: Incorrect perl expression results.", lineNum);
2091 errln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
2092 }
2093
2094 delete testMat;
2095 delete testPat;
2096 }
2097
2098 //
2099 // All done. Clean up allocated stuff.
2100 //
2101 delete cgMat;
2102 delete cgPat;
2103
2104 delete groupsMat;
2105 delete groupsPat;
2106
2107 delete flagMat;
2108 delete flagPat;
2109
2110 delete lineMat;
2111 delete linePat;
2112
2113 delete fieldPat;
2114 delete [] testData;
2115
2116
2117 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
2118
2119 }
2120
2121
2122
2123 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
2124