]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/intltest/regextst.cpp
ICU-400.42.tar.gz
[apple/icu.git] / icuSources / test / intltest / regextst.cpp
CommitLineData
b75a7d8f
A
1/********************************************************************
2 * COPYRIGHT:
46f4442e 3 * Copyright (c) 2002-2008, International Business Machines Corporation and
b75a7d8f
A
4 * others. All Rights Reserved.
5 ********************************************************************/
6
7//
8// regextst.cpp
9//
10// ICU Regular Expressions test, part of intltest.
11//
12
374ca955 13#include "intltest.h"
b75a7d8f
A
14#if !UCONFIG_NO_REGULAR_EXPRESSIONS
15
374ca955 16#include "unicode/regex.h"
b75a7d8f
A
17#include "unicode/uchar.h"
18#include "unicode/ucnv.h"
b75a7d8f
A
19#include "regextst.h"
20#include "uvector.h"
b75a7d8f 21#include "util.h"
374ca955 22#include <stdlib.h>
73c04bcf 23#include <string.h>
374ca955 24#include <stdio.h>
b75a7d8f
A
25
26
27//---------------------------------------------------------------------------
28//
29// Test class boilerplate
30//
31//---------------------------------------------------------------------------
374ca955 32RegexTest::RegexTest()
b75a7d8f 33{
73c04bcf 34}
b75a7d8f
A
35
36
37RegexTest::~RegexTest()
38{
73c04bcf 39}
b75a7d8f
A
40
41
42
43void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
44{
45 if (exec) logln("TestSuite RegexTest: ");
46 switch (index) {
47
48 case 0: name = "Basic";
374ca955 49 if (exec) Basic();
b75a7d8f
A
50 break;
51 case 1: name = "API_Match";
374ca955 52 if (exec) API_Match();
b75a7d8f
A
53 break;
54 case 2: name = "API_Replace";
374ca955 55 if (exec) API_Replace();
b75a7d8f
A
56 break;
57 case 3: name = "API_Pattern";
374ca955 58 if (exec) API_Pattern();
b75a7d8f
A
59 break;
60 case 4: name = "Extended";
374ca955 61 if (exec) Extended();
b75a7d8f
A
62 break;
63 case 5: name = "Errors";
374ca955 64 if (exec) Errors();
b75a7d8f
A
65 break;
66 case 6: name = "PerlTests";
67 if (exec) PerlTests();
68 break;
46f4442e
A
69 case 7: name = "Callbacks";
70 if (exec) Callbacks();
71 break;
b75a7d8f 72
374ca955 73 default: name = "";
b75a7d8f
A
74 break; //needed to end loop
75 }
76}
77
78
79//---------------------------------------------------------------------------
80//
81// Error Checking / Reporting macros used in all of the tests.
82//
83//---------------------------------------------------------------------------
// REGEX_CHECK_STATUS
//    If the in-scope 'status' holds a failure, report it by name together with the
//    current line and return from the enclosing (void) function.
#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {errln("RegexTest failure at line %d. status=%s\n", \
    __LINE__, u_errorName(status)); return;}}

// REGEX_ASSERT(expr)
//    Report a failure at the current line when expr is false.  Does NOT return;
//    the test keeps running.
#define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("RegexTest failure at line %d.\n", __LINE__);}}

// REGEX_ASSERT_FAIL(expr, errcode)
//    Evaluate expr against a fresh local 'status' (shadowing any outer one) and
//    report a failure unless that status ends up equal to errcode.
#define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
    if (status!=errcode) {errln("RegexTest failure at line %d. Expected status=%s, got %s\n", \
    __LINE__, u_errorName(errcode), u_errorName(status));}}

// REGEX_CHECK_STATUS_L(line)
//    Like REGEX_CHECK_STATUS, but also reports the caller-supplied line and does
//    not return.  The status is printed by name, matching the other macros.
#define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
    "RegexTest failure at line %d, from %d. status=%s\n",__LINE__, (line), u_errorName(status)); }}

// REGEX_ASSERT_L(expr, line)
//    Like REGEX_ASSERT, but also reports the caller-supplied line and returns from
//    the enclosing (void) function on failure.
#define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
    errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
98
99
100
101//---------------------------------------------------------------------------
102//
103// REGEX_TESTLM Macro + invocation function to simplify writing quick tests
104// for the LookingAt() and Match() functions.
105//
106// usage:
107// REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
108//
109// The expected results are UBool - TRUE or FALSE.
110// The input text is unescaped. The pattern is not.
374ca955 111//
b75a7d8f
A
112//
113//---------------------------------------------------------------------------
114
115#define REGEX_TESTLM(pat, text, looking, match) doRegexLMTest(pat, text, looking, match, __LINE__);
116
46f4442e
A
117UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
118 const UnicodeString pattern(pat, -1, US_INV);
119 const UnicodeString inputText(text, -1, US_INV);
b75a7d8f
A
120 UErrorCode status = U_ZERO_ERROR;
121 UParseError pe;
122 RegexPattern *REPattern = NULL;
123 RegexMatcher *REMatcher = NULL;
124 UBool retVal = TRUE;
125
46f4442e 126 UnicodeString patString(pat, -1, US_INV);
b75a7d8f
A
127 REPattern = RegexPattern::compile(patString, 0, pe, status);
128 if (U_FAILURE(status)) {
129 errln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s\n",
130 line, u_errorName(status));
131 return FALSE;
132 }
374ca955 133 if (line==376) { RegexPatternDump(REPattern);}
b75a7d8f
A
134
135 UnicodeString inputString(inputText);
136 UnicodeString unEscapedInput = inputString.unescape();
137 REMatcher = REPattern->matcher(unEscapedInput, status);
138 if (U_FAILURE(status)) {
139 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
140 line, u_errorName(status));
141 return FALSE;
142 }
374ca955 143
b75a7d8f
A
144 UBool actualmatch;
145 actualmatch = REMatcher->lookingAt(status);
146 if (U_FAILURE(status)) {
147 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
148 line, u_errorName(status));
149 retVal = FALSE;
150 }
151 if (actualmatch != looking) {
152 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
153 retVal = FALSE;
154 }
155
156 status = U_ZERO_ERROR;
157 actualmatch = REMatcher->matches(status);
158 if (U_FAILURE(status)) {
159 errln("RegexTest failure in matches() at line %d. Status = %s\n",
160 line, u_errorName(status));
161 retVal = FALSE;
162 }
163 if (actualmatch != match) {
164 errln("RegexTest: wrong return from matches() at line %d.\n", line);
165 retVal = FALSE;
166 }
167
168 if (retVal == FALSE) {
374ca955 169 RegexPatternDump(REPattern);
b75a7d8f
A
170 }
171
172 delete REPattern;
173 delete REMatcher;
174 return retVal;
175}
374ca955 176
b75a7d8f
A
177
178
179
b75a7d8f
A
180
181//---------------------------------------------------------------------------
182//
183// REGEX_ERR Macro + invocation function to simplify writing tests
184// regex tests for incorrect patterns
185//
186// usage:
187// REGEX_ERR("pattern", expected error line, column, expected status);
188//
189//---------------------------------------------------------------------------
190#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
191
192void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
46f4442e 193 UErrorCode expectedStatus, int32_t line) {
b75a7d8f
A
194 UnicodeString pattern(pat);
195
196 UErrorCode status = U_ZERO_ERROR;
197 UParseError pe;
198 RegexPattern *callerPattern = NULL;
199
200 //
201 // Compile the caller's pattern
202 //
203 UnicodeString patString(pat);
204 callerPattern = RegexPattern::compile(patString, 0, pe, status);
205 if (status != expectedStatus) {
206 errln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
207 } else {
208 if (status != U_ZERO_ERROR) {
209 if (pe.line != errLine || pe.offset != errCol) {
210 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
211 line, errLine, errCol, pe.line, pe.offset);
212 }
213 }
214 }
215
216 delete callerPattern;
217}
218
219
220
221//---------------------------------------------------------------------------
222//
223// Basic Check for basic functionality of regex pattern matching.
224// Avoid the use of REGEX_FIND test macro, which has
225// substantial dependencies on basic Regex functionality.
226//
227//---------------------------------------------------------------------------
228void RegexTest::Basic() {
229
230
231//
232// Debug - slide failing test cases early
233//
234#if 0
235 {
236 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
237 UParseError pe;
238 UErrorCode status = U_ZERO_ERROR;
239 RegexPattern::compile("^(?:a?b?)*$", 0, pe, status);
240 // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
241 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
242 }
243 exit(1);
244#endif
245
246
247 //
248 // Pattern with parentheses
249 //
250 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
251 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
252 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);
253
254 //
255 // Patterns with *
256 //
257 REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
258 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
259 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
260 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
261 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
262
263 REGEX_TESTLM("a*", "", TRUE, TRUE);
264 REGEX_TESTLM("a*", "b", TRUE, FALSE);
265
266
267 //
268 // Patterns with "."
269 //
270 REGEX_TESTLM(".", "abc", TRUE, FALSE);
271 REGEX_TESTLM("...", "abc", TRUE, TRUE);
272 REGEX_TESTLM("....", "abc", FALSE, FALSE);
273 REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
274 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
275 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
276 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
277 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
278
279 //
280 // Patterns with * applied to chars at end of literal string
281 //
282 REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
283 REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
284
285 //
286 // Supplemental chars match as single chars, not a pair of surrogates.
287 //
288 REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
289 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
290 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
291
292
293 //
294 // UnicodeSets in the pattern
295 //
296 REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
297 REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
298 REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
299 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
300 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
301 REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
302
303 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
304 REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
305 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
306 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurences.
307 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);
308
309 //
310 // OR operator in patterns
311 //
312 REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
313 REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
314 REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
315 REGEX_TESTLM("a|b", "b", TRUE, TRUE);
316
317 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
318 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
319 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
320 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
321 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
322 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
323
324 //
325 // +
326 //
327 REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
328 REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
329 REGEX_TESTLM("b+", "", FALSE, FALSE);
330 REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
331 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
332 REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
333
334 //
335 // ?
336 //
337 REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
338 REGEX_TESTLM("ab?", "a", TRUE, TRUE);
339 REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
340 REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
341 REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
342 REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
343 REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
344 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
345 REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
346
347 //
348 // Escape sequences that become single literal chars, handled internally
349 // by ICU's Unescape.
350 //
374ca955 351
b75a7d8f
A
352 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
353 REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
374ca955
A
354 REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
355 REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
b75a7d8f
A
356 REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
357 REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
358 REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
359 REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
374ca955
A
360 REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
361 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
b75a7d8f
A
362
363 REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
364 REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input
365
366 // Escape of special chars in patterns
374ca955 367 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
b75a7d8f
A
368
369
73c04bcf 370}
b75a7d8f
A
371
372
373//---------------------------------------------------------------------------
374//
374ca955 375// API_Match Test that the API for class RegexMatcher
b75a7d8f
A
376// is present and nominally working, but excluding functions
377// implementing replace operations.
378//
379//---------------------------------------------------------------------------
380void RegexTest::API_Match() {
381 UParseError pe;
382 UErrorCode status=U_ZERO_ERROR;
383 int32_t flags = 0;
384
385 //
386 // Debug - slide failing test cases early
387 //
388#if 0
389 {
390 }
391 return;
392#endif
393
394 //
395 // Simple pattern compilation
396 //
397 {
398 UnicodeString re("abc");
399 RegexPattern *pat2;
400 pat2 = RegexPattern::compile(re, flags, pe, status);
401 REGEX_CHECK_STATUS;
374ca955 402
b75a7d8f
A
403 UnicodeString inStr1 = "abcdef this is a test";
404 UnicodeString instr2 = "not abc";
405 UnicodeString empty = "";
374ca955
A
406
407
b75a7d8f
A
408 //
409 // Matcher creation and reset.
410 //
411 RegexMatcher *m1 = pat2->matcher(inStr1, status);
412 REGEX_CHECK_STATUS;
374ca955 413 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
b75a7d8f
A
414 REGEX_ASSERT(m1->input() == inStr1);
415 m1->reset(instr2);
416 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
417 REGEX_ASSERT(m1->input() == instr2);
418 m1->reset(inStr1);
419 REGEX_ASSERT(m1->input() == inStr1);
420 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
421 m1->reset(empty);
422 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
423 REGEX_ASSERT(m1->input() == empty);
424 REGEX_ASSERT(&m1->pattern() == pat2);
374ca955
A
425
426 //
427 // reset(pos, status)
428 //
429 m1->reset(inStr1);
430 m1->reset(4, status);
431 REGEX_CHECK_STATUS;
432 REGEX_ASSERT(m1->input() == inStr1);
433 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
434
435 m1->reset(-1, status);
436 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
437 status = U_ZERO_ERROR;
438
439 m1->reset(0, status);
440 REGEX_CHECK_STATUS;
441 status = U_ZERO_ERROR;
442
443 int32_t len = m1->input().length();
444 m1->reset(len-1, status);
445 REGEX_CHECK_STATUS;
446 status = U_ZERO_ERROR;
447
448 m1->reset(len, status);
449 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
450 status = U_ZERO_ERROR;
451
452 //
453 // match(pos, status)
454 //
455 m1->reset(instr2);
456 REGEX_ASSERT(m1->matches(4, status) == TRUE);
457 m1->reset();
458 REGEX_ASSERT(m1->matches(3, status) == FALSE);
459 m1->reset();
460 REGEX_ASSERT(m1->matches(5, status) == FALSE);
461 REGEX_ASSERT(m1->matches(4, status) == TRUE);
462 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
463 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
464
465 // Match() at end of string should fail, but should not
466 // be an error.
467 status = U_ZERO_ERROR;
468 len = m1->input().length();
469 REGEX_ASSERT(m1->matches(len, status) == FALSE);
470 REGEX_CHECK_STATUS;
471
472 // Match beyond end of string should fail with an error.
473 status = U_ZERO_ERROR;
474 REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
475 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
476
477 // Successful match at end of string.
478 {
479 status = U_ZERO_ERROR;
480 RegexMatcher m("A?", 0, status); // will match zero length string.
481 REGEX_CHECK_STATUS;
482 m.reset(inStr1);
483 len = inStr1.length();
484 REGEX_ASSERT(m.matches(len, status) == TRUE);
485 REGEX_CHECK_STATUS;
486 m.reset(empty);
487 REGEX_ASSERT(m.matches(0, status) == TRUE);
488 REGEX_CHECK_STATUS;
489 }
490
491
492 //
493 // lookingAt(pos, status)
494 //
495 status = U_ZERO_ERROR;
496 m1->reset(instr2); // "not abc"
497 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
498 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
499 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
500 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
501 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
502 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
503 status = U_ZERO_ERROR;
504 len = m1->input().length();
505 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
506 REGEX_CHECK_STATUS;
507 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
508 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
509
b75a7d8f
A
510 delete m1;
511 delete pat2;
512 }
513
514
515 //
374ca955 516 // Capture Group.
b75a7d8f
A
517 // RegexMatcher::start();
518 // RegexMatcher::end();
519 // RegexMatcher::groupCount();
520 //
521 {
522 int32_t flags=0;
523 UParseError pe;
524 UErrorCode status=U_ZERO_ERROR;
525
526 UnicodeString re("01(23(45)67)(.*)");
527 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
528 REGEX_CHECK_STATUS;
529 UnicodeString data = "0123456789";
374ca955 530
b75a7d8f
A
531 RegexMatcher *matcher = pat->matcher(data, status);
532 REGEX_CHECK_STATUS;
374ca955 533 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
46f4442e
A
534 static const int32_t matchStarts[] = {0, 2, 4, 8};
535 static const int32_t matchEnds[] = {10, 8, 6, 10};
536 int32_t i;
b75a7d8f
A
537 for (i=0; i<4; i++) {
538 int32_t actualStart = matcher->start(i, status);
539 REGEX_CHECK_STATUS;
540 if (actualStart != matchStarts[i]) {
541 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
542 __LINE__, i, matchStarts[i], actualStart);
543 }
544 int32_t actualEnd = matcher->end(i, status);
545 REGEX_CHECK_STATUS;
546 if (actualEnd != matchEnds[i]) {
547 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
548 __LINE__, i, matchEnds[i], actualEnd);
549 }
550 }
551
552 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
553 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
554
555 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
556 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
557 matcher->reset();
558 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
559
560 matcher->lookingAt(status);
561 REGEX_ASSERT(matcher->group(status) == "0123456789");
562 REGEX_ASSERT(matcher->group(0, status) == "0123456789");
563 REGEX_ASSERT(matcher->group(1, status) == "234567" );
564 REGEX_ASSERT(matcher->group(2, status) == "45" );
565 REGEX_ASSERT(matcher->group(3, status) == "89" );
566 REGEX_CHECK_STATUS;
567 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
568 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
569 matcher->reset();
570 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
571
572 delete matcher;
573 delete pat;
574
575 }
576
577 //
578 // find
579 //
580 {
581 int32_t flags=0;
582 UParseError pe;
583 UErrorCode status=U_ZERO_ERROR;
584
585 UnicodeString re("abc");
586 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
587 REGEX_CHECK_STATUS;
588 UnicodeString data = ".abc..abc...abc..";
589 // 012345678901234567
374ca955 590
b75a7d8f
A
591 RegexMatcher *matcher = pat->matcher(data, status);
592 REGEX_CHECK_STATUS;
593 REGEX_ASSERT(matcher->find());
594 REGEX_ASSERT(matcher->start(status) == 1);
595 REGEX_ASSERT(matcher->find());
596 REGEX_ASSERT(matcher->start(status) == 6);
597 REGEX_ASSERT(matcher->find());
598 REGEX_ASSERT(matcher->start(status) == 12);
599 REGEX_ASSERT(matcher->find() == FALSE);
600 REGEX_ASSERT(matcher->find() == FALSE);
601
602 matcher->reset();
603 REGEX_ASSERT(matcher->find());
604 REGEX_ASSERT(matcher->start(status) == 1);
605
606 REGEX_ASSERT(matcher->find(0, status));
607 REGEX_ASSERT(matcher->start(status) == 1);
608 REGEX_ASSERT(matcher->find(1, status));
609 REGEX_ASSERT(matcher->start(status) == 1);
610 REGEX_ASSERT(matcher->find(2, status));
611 REGEX_ASSERT(matcher->start(status) == 6);
612 REGEX_ASSERT(matcher->find(12, status));
613 REGEX_ASSERT(matcher->start(status) == 12);
614 REGEX_ASSERT(matcher->find(13, status) == FALSE);
615 REGEX_ASSERT(matcher->find(16, status) == FALSE);
374ca955 616 REGEX_ASSERT(matcher->find(17, status) == FALSE);
b75a7d8f 617 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
b75a7d8f 618
374ca955 619 status = U_ZERO_ERROR;
b75a7d8f 620 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
374ca955
A
621 status = U_ZERO_ERROR;
622 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
b75a7d8f
A
623
624 REGEX_ASSERT(matcher->groupCount() == 0);
625
626 delete matcher;
627 delete pat;
628 }
629
630
631 //
632 // find, with \G in pattern (true if at the end of a previous match).
633 //
634 {
635 int32_t flags=0;
636 UParseError pe;
637 UErrorCode status=U_ZERO_ERROR;
638
46f4442e 639 UnicodeString re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
b75a7d8f
A
640 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
641 REGEX_CHECK_STATUS;
642 UnicodeString data = ".abcabc.abc..";
643 // 012345678901234567
374ca955 644
b75a7d8f
A
645 RegexMatcher *matcher = pat->matcher(data, status);
646 REGEX_CHECK_STATUS;
647 REGEX_ASSERT(matcher->find());
648 REGEX_ASSERT(matcher->start(status) == 0);
374ca955 649 REGEX_ASSERT(matcher->start(1, status) == -1);
b75a7d8f
A
650 REGEX_ASSERT(matcher->start(2, status) == 1);
651
652 REGEX_ASSERT(matcher->find());
653 REGEX_ASSERT(matcher->start(status) == 4);
374ca955 654 REGEX_ASSERT(matcher->start(1, status) == 4);
b75a7d8f
A
655 REGEX_ASSERT(matcher->start(2, status) == -1);
656 REGEX_CHECK_STATUS;
657
658 delete matcher;
659 delete pat;
660 }
661
374ca955
A
662 //
663 // find with zero length matches, match position should bump ahead
664 // to prevent loops.
665 //
666 {
46f4442e 667 int32_t i;
374ca955
A
668 UErrorCode status=U_ZERO_ERROR;
669 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
670 // using an always-true look-ahead.
671 REGEX_CHECK_STATUS;
672 UnicodeString s(" ");
673 m.reset(s);
674 for (i=0; ; i++) {
675 if (m.find() == FALSE) {
676 break;
677 }
678 REGEX_ASSERT(m.start(status) == i);
679 REGEX_ASSERT(m.end(status) == i);
680 }
681 REGEX_ASSERT(i==5);
682
683 // Check that the bump goes over surrogate pairs OK
46f4442e 684 s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
374ca955
A
685 s = s.unescape();
686 m.reset(s);
687 for (i=0; ; i+=2) {
688 if (m.find() == FALSE) {
689 break;
690 }
691 REGEX_ASSERT(m.start(status) == i);
692 REGEX_ASSERT(m.end(status) == i);
693 }
694 REGEX_ASSERT(i==10);
695 }
696 {
697 // find() loop breaking test.
698 // with pattern of /.?/, should see a series of one char matches, then a single
699 // match of zero length at the end of the input string.
46f4442e 700 int32_t i;
374ca955
A
701 UErrorCode status=U_ZERO_ERROR;
702 RegexMatcher m(".?", 0, status);
703 REGEX_CHECK_STATUS;
704 UnicodeString s(" ");
705 m.reset(s);
706 for (i=0; ; i++) {
707 if (m.find() == FALSE) {
708 break;
709 }
710 REGEX_ASSERT(m.start(status) == i);
711 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
712 }
713 REGEX_ASSERT(i==5);
714 }
715
716
b75a7d8f
A
717 //
718 // Matchers with no input string behave as if they had an empty input string.
719 //
720
721 {
722 UErrorCode status = U_ZERO_ERROR;
723 RegexMatcher m(".?", 0, status);
724 REGEX_CHECK_STATUS;
725 REGEX_ASSERT(m.find());
726 REGEX_ASSERT(m.start(status) == 0);
727 REGEX_ASSERT(m.input() == "");
728 }
729 {
730 UErrorCode status = U_ZERO_ERROR;
731 RegexPattern *p = RegexPattern::compile(".", 0, status);
732 RegexMatcher *m = p->matcher(status);
733 REGEX_CHECK_STATUS;
374ca955 734
b75a7d8f
A
735 REGEX_ASSERT(m->find() == FALSE);
736 REGEX_ASSERT(m->input() == "");
737 delete m;
738 delete p;
739 }
46f4442e
A
740
741 //
742 // Regions
743 //
744 {
745 UErrorCode status = U_ZERO_ERROR;
746 UnicodeString testString("This is test data");
747 RegexMatcher m(".*", testString, 0, status);
748 REGEX_CHECK_STATUS;
749 REGEX_ASSERT(m.regionStart() == 0);
750 REGEX_ASSERT(m.regionEnd() == testString.length());
751 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
752 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
753
754 m.region(2,4, status);
755 REGEX_CHECK_STATUS;
756 REGEX_ASSERT(m.matches(status));
757 REGEX_ASSERT(m.start(status)==2);
758 REGEX_ASSERT(m.end(status)==4);
759 REGEX_CHECK_STATUS;
760
761 m.reset();
762 REGEX_ASSERT(m.regionStart() == 0);
763 REGEX_ASSERT(m.regionEnd() == testString.length());
764
765 UnicodeString shorterString("short");
766 m.reset(shorterString);
767 REGEX_ASSERT(m.regionStart() == 0);
768 REGEX_ASSERT(m.regionEnd() == shorterString.length());
769
770 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
771 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
772 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
773 REGEX_ASSERT(&m == &m.reset());
774 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
775
776 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
777 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
778 REGEX_ASSERT(&m == &m.reset());
779 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
780
781 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
782 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
783 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
784 REGEX_ASSERT(&m == &m.reset());
785 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
786
787 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
788 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
789 REGEX_ASSERT(&m == &m.reset());
790 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
791
792 }
793
794 //
795 // hitEnd() and requireEnd()
796 //
797 {
798 UErrorCode status = U_ZERO_ERROR;
799 UnicodeString testString("aabb");
800 RegexMatcher m1(".*", testString, 0, status);
801 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
802 REGEX_ASSERT(m1.hitEnd() == TRUE);
803 REGEX_ASSERT(m1.requireEnd() == FALSE);
804 REGEX_CHECK_STATUS;
805
806 status = U_ZERO_ERROR;
807 RegexMatcher m2("a*", testString, 0, status);
808 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
809 REGEX_ASSERT(m2.hitEnd() == FALSE);
810 REGEX_ASSERT(m2.requireEnd() == FALSE);
811 REGEX_CHECK_STATUS;
812
813 status = U_ZERO_ERROR;
814 RegexMatcher m3(".*$", testString, 0, status);
815 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
816 REGEX_ASSERT(m3.hitEnd() == TRUE);
817 REGEX_ASSERT(m3.requireEnd() == TRUE);
818 REGEX_CHECK_STATUS;
819 }
820
b75a7d8f 821
374ca955
A
822 //
823 // Compilation error on reset with UChar *
824 // These were a hazard that people were stumbling over with runtime errors.
825 // Changed them to compiler errors by adding private methods that more closely
826 // matched the incorrect use of the functions.
827 //
828#if 0
829 {
830 UErrorCode status = U_ZERO_ERROR;
831 UChar ucharString[20];
832 RegexMatcher m(".", 0, status);
833 m.reset(ucharString); // should not compile.
834
835 RegexPattern *p = RegexPattern::compile(".", 0, status);
836 RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile.
837
838 RegexMatcher m3(".", ucharString, 0, status); // Should not compile
839 }
840#endif
841
46f4442e
A
842 //
843 // Time Outs.
844 // Note: These tests will need to be changed when the regexp engine is
845 // able to detect and cut short the exponential time behavior on
846 // this type of match.
847 //
848 {
849 UErrorCode status = U_ZERO_ERROR;
850 // Enough 'a's in the string to cause the match to time out.
851 // (Each on additonal 'a' doubles the time)
852 UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
853 RegexMatcher matcher("(a+)+b", testString, 0, status);
854 REGEX_CHECK_STATUS;
855 REGEX_ASSERT(matcher.getTimeLimit() == 0);
856 matcher.setTimeLimit(100, status);
857 REGEX_ASSERT(matcher.getTimeLimit() == 100);
858 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
859 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
860 }
861 {
862 UErrorCode status = U_ZERO_ERROR;
863 // Few enough 'a's to slip in under the time limit.
864 UnicodeString testString("aaaaaaaaaaaaaaaaaa");
865 RegexMatcher matcher("(a+)+b", testString, 0, status);
866 REGEX_CHECK_STATUS;
867 matcher.setTimeLimit(100, status);
868 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
869 REGEX_CHECK_STATUS;
870 }
871
872 //
873 // Stack Limits
874 //
875 {
876 UErrorCode status = U_ZERO_ERROR;
877 UnicodeString testString(600000, 0x41, 600000); // Length 600,000, filled with 'A'
878
879 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
880 // of the '+', and makes the stack frames larger.
881 RegexMatcher matcher("(A)+A$", testString, 0, status);
882
883 // With the default stack, this match should fail to run
884 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
885 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
886
887 // With unlimited stack, it should run
888 status = U_ZERO_ERROR;
889 matcher.setStackLimit(0, status);
890 REGEX_CHECK_STATUS;
891 REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
892 REGEX_CHECK_STATUS;
893 REGEX_ASSERT(matcher.getStackLimit() == 0);
894
895 // With a limited stack, it the match should fail
896 status = U_ZERO_ERROR;
897 matcher.setStackLimit(10000, status);
898 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
899 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
900 REGEX_ASSERT(matcher.getStackLimit() == 10000);
901 }
902
903 // A pattern that doesn't save state should work with
904 // a minimal sized stack
905 {
906 UErrorCode status = U_ZERO_ERROR;
907 UnicodeString testString = "abc";
908 RegexMatcher matcher("abc", testString, 0, status);
909 REGEX_CHECK_STATUS;
910 matcher.setStackLimit(30, status);
911 REGEX_CHECK_STATUS;
912 REGEX_ASSERT(matcher.matches(status) == TRUE);
913 REGEX_CHECK_STATUS;
914 REGEX_ASSERT(matcher.getStackLimit() == 30);
915
916 // Negative stack sizes should fail
917 status = U_ZERO_ERROR;
918 matcher.setStackLimit(1000, status);
919 REGEX_CHECK_STATUS;
920 matcher.setStackLimit(-1, status);
921 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
922 REGEX_ASSERT(matcher.getStackLimit() == 1000);
923 }
924
925
b75a7d8f
A
926}
927
928
929
930
931
932
933//---------------------------------------------------------------------------
934//
374ca955 935// API_Replace API test for class RegexMatcher, testing the
b75a7d8f
A
936// Replace family of functions.
937//
938//---------------------------------------------------------------------------
939void RegexTest::API_Replace() {
940 //
941 // Replace
942 //
943 int32_t flags=0;
944 UParseError pe;
945 UErrorCode status=U_ZERO_ERROR;
374ca955 946
b75a7d8f
A
947 UnicodeString re("abc");
948 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
949 REGEX_CHECK_STATUS;
950 UnicodeString data = ".abc..abc...abc..";
951 // 012345678901234567
952 RegexMatcher *matcher = pat->matcher(data, status);
374ca955 953
b75a7d8f
A
954 //
955 // Plain vanilla matches.
956 //
957 UnicodeString dest;
958 dest = matcher->replaceFirst("yz", status);
959 REGEX_CHECK_STATUS;
960 REGEX_ASSERT(dest == ".yz..abc...abc..");
374ca955 961
b75a7d8f
A
962 dest = matcher->replaceAll("yz", status);
963 REGEX_CHECK_STATUS;
964 REGEX_ASSERT(dest == ".yz..yz...yz..");
374ca955 965
b75a7d8f
A
966 //
967 // Plain vanilla non-matches.
968 //
969 UnicodeString d2 = ".abx..abx...abx..";
970 matcher->reset(d2);
971 dest = matcher->replaceFirst("yz", status);
972 REGEX_CHECK_STATUS;
973 REGEX_ASSERT(dest == ".abx..abx...abx..");
374ca955 974
b75a7d8f
A
975 dest = matcher->replaceAll("yz", status);
976 REGEX_CHECK_STATUS;
977 REGEX_ASSERT(dest == ".abx..abx...abx..");
374ca955 978
b75a7d8f
A
979 //
980 // Empty source string
981 //
982 UnicodeString d3 = "";
983 matcher->reset(d3);
984 dest = matcher->replaceFirst("yz", status);
985 REGEX_CHECK_STATUS;
986 REGEX_ASSERT(dest == "");
374ca955 987
b75a7d8f
A
988 dest = matcher->replaceAll("yz", status);
989 REGEX_CHECK_STATUS;
990 REGEX_ASSERT(dest == "");
374ca955 991
b75a7d8f
A
992 //
993 // Empty substitution string
994 //
995 matcher->reset(data); // ".abc..abc...abc.."
996 dest = matcher->replaceFirst("", status);
997 REGEX_CHECK_STATUS;
998 REGEX_ASSERT(dest == "...abc...abc..");
374ca955 999
b75a7d8f
A
1000 dest = matcher->replaceAll("", status);
1001 REGEX_CHECK_STATUS;
1002 REGEX_ASSERT(dest == "........");
374ca955 1003
b75a7d8f
A
1004 //
1005 // match whole string
1006 //
1007 UnicodeString d4 = "abc";
374ca955 1008 matcher->reset(d4);
b75a7d8f
A
1009 dest = matcher->replaceFirst("xyz", status);
1010 REGEX_CHECK_STATUS;
1011 REGEX_ASSERT(dest == "xyz");
374ca955 1012
b75a7d8f
A
1013 dest = matcher->replaceAll("xyz", status);
1014 REGEX_CHECK_STATUS;
1015 REGEX_ASSERT(dest == "xyz");
374ca955 1016
b75a7d8f
A
1017 //
1018 // Capture Group, simple case
1019 //
1020 UnicodeString re2("a(..)");
1021 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1022 REGEX_CHECK_STATUS;
1023 UnicodeString d5 = "abcdefg";
1024 RegexMatcher *matcher2 = pat2->matcher(d5, status);
1025 REGEX_CHECK_STATUS;
1026 dest = matcher2->replaceFirst("$1$1", status);
1027 REGEX_CHECK_STATUS;
1028 REGEX_ASSERT(dest == "bcbcdefg");
1029
46f4442e 1030 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
b75a7d8f
A
1031 REGEX_CHECK_STATUS;
1032 REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1033
1034 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1035 REGEX_CHECK_STATUS;
1036 REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
1037
46f4442e 1038 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
b75a7d8f
A
1039 replacement = replacement.unescape();
1040 dest = matcher2->replaceFirst(replacement, status);
1041 REGEX_CHECK_STATUS;
1042 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
374ca955 1043
b75a7d8f 1044 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
374ca955
A
1045
1046
1047 //
1048 // Replacement String with \u hex escapes
1049 //
1050 {
1051 UnicodeString src = "abc 1 abc 2 abc 3";
46f4442e 1052 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
374ca955
A
1053 matcher->reset(src);
1054 UnicodeString result = matcher->replaceAll(substitute, status);
1055 REGEX_CHECK_STATUS;
1056 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1057 }
1058 {
1059 UnicodeString src = "abc !";
46f4442e 1060 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
374ca955
A
1061 matcher->reset(src);
1062 UnicodeString result = matcher->replaceAll(substitute, status);
1063 REGEX_CHECK_STATUS;
1064 UnicodeString expected = UnicodeString("--");
1065 expected.append((UChar32)0x10000);
1066 expected.append("-- !");
1067 REGEX_ASSERT(result == expected);
1068 }
b75a7d8f 1069 // TODO: need more through testing of capture substitutions.
374ca955
A
1070
1071 // Bug 4057
1072 //
1073 {
1074 status = U_ZERO_ERROR;
1075 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1076 RegexMatcher m("ss(.*?)ee", 0, status);
1077 REGEX_CHECK_STATUS;
1078 UnicodeString result;
1079
1080 // Multiple finds do NOT bump up the previous appendReplacement postion.
1081 m.reset(s);
1082 m.find();
1083 m.find();
1084 m.appendReplacement(result, "ooh", status);
1085 REGEX_CHECK_STATUS;
1086 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1087
1088 // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1089 status = U_ZERO_ERROR;
1090 result.truncate(0);
1091 m.reset(10, status);
1092 m.find();
1093 m.find();
1094 m.appendReplacement(result, "ooh", status);
1095 REGEX_CHECK_STATUS;
1096 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1097
1098 // find() at interior of string, appendReplacemnt still starts at beginning.
1099 status = U_ZERO_ERROR;
1100 result.truncate(0);
1101 m.reset();
1102 m.find(10, status);
1103 m.find();
1104 m.appendReplacement(result, "ooh", status);
1105 REGEX_CHECK_STATUS;
1106 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1107
1108 m.appendTail(result);
1109 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1110
1111 }
1112
b75a7d8f
A
1113 delete matcher2;
1114 delete pat2;
1115 delete matcher;
1116 delete pat;
1117}
1118
1119
1120//---------------------------------------------------------------------------
1121//
1122// API_Pattern Test that the API for class RegexPattern is
1123// present and nominally working.
1124//
1125//---------------------------------------------------------------------------
1126void RegexTest::API_Pattern() {
1127 RegexPattern pata; // Test default constructor to not crash.
1128 RegexPattern patb;
1129
1130 REGEX_ASSERT(pata == patb);
1131 REGEX_ASSERT(pata == pata);
1132
1133 UnicodeString re1("abc[a-l][m-z]");
1134 UnicodeString re2("def");
1135 UErrorCode status = U_ZERO_ERROR;
1136 UParseError pe;
1137
1138 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status);
1139 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status);
1140 REGEX_CHECK_STATUS;
1141 REGEX_ASSERT(*pat1 == *pat1);
1142 REGEX_ASSERT(*pat1 != pata);
1143
1144 // Assign
1145 patb = *pat1;
1146 REGEX_ASSERT(patb == *pat1);
1147
1148 // Copy Construct
1149 RegexPattern patc(*pat1);
1150 REGEX_ASSERT(patc == *pat1);
1151 REGEX_ASSERT(patb == patc);
1152 REGEX_ASSERT(pat1 != pat2);
1153 patb = *pat2;
1154 REGEX_ASSERT(patb != patc);
1155 REGEX_ASSERT(patb == *pat2);
1156
1157 // Compile with no flags.
1158 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status);
1159 REGEX_ASSERT(*pat1a == *pat1);
1160
1161 REGEX_ASSERT(pat1a->flags() == 0);
374ca955 1162
b75a7d8f
A
1163 // Compile with different flags should be not equal
1164 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1165 REGEX_CHECK_STATUS;
1166
1167 REGEX_ASSERT(*pat1b != *pat1a);
1168 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1169 REGEX_ASSERT(pat1a->flags() == 0);
1170 delete pat1b;
b75a7d8f
A
1171
1172 // clone
1173 RegexPattern *pat1c = pat1->clone();
1174 REGEX_ASSERT(*pat1c == *pat1);
1175 REGEX_ASSERT(*pat1c != *pat2);
1176
b75a7d8f
A
1177 delete pat1c;
1178 delete pat1a;
1179 delete pat1;
1180 delete pat2;
1181
1182
374ca955
A
1183 //
1184 // Verify that a matcher created from a cloned pattern works.
1185 // (Jitterbug 3423)
1186 //
1187 {
1188 UErrorCode status = U_ZERO_ERROR;
46f4442e 1189 RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
374ca955
A
1190 RegexPattern *pClone = pSource->clone();
1191 delete pSource;
1192 RegexMatcher *mFromClone = pClone->matcher(status);
1193 REGEX_CHECK_STATUS;
1194 UnicodeString s = "Hello World";
1195 mFromClone->reset(s);
1196 REGEX_ASSERT(mFromClone->find() == TRUE);
1197 REGEX_ASSERT(mFromClone->group(status) == "Hello");
1198 REGEX_ASSERT(mFromClone->find() == TRUE);
1199 REGEX_ASSERT(mFromClone->group(status) == "World");
1200 REGEX_ASSERT(mFromClone->find() == FALSE);
1201 delete mFromClone;
1202 delete pClone;
1203 }
1204
b75a7d8f
A
1205 //
1206 // matches convenience API
1207 //
1208 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1209 REGEX_CHECK_STATUS;
1210 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1211 REGEX_CHECK_STATUS;
1212 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1213 REGEX_CHECK_STATUS;
1214 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1215 REGEX_CHECK_STATUS;
1216 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1217 REGEX_CHECK_STATUS;
1218 status = U_INDEX_OUTOFBOUNDS_ERROR;
1219 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1220 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1221
1222
1223 //
1224 // Split()
1225 //
1226 status = U_ZERO_ERROR;
1227 pat1 = RegexPattern::compile(" +", pe, status);
1228 REGEX_CHECK_STATUS;
1229 UnicodeString fields[10];
1230
1231 int32_t n;
1232 n = pat1->split("Now is the time", fields, 10, status);
1233 REGEX_CHECK_STATUS;
1234 REGEX_ASSERT(n==4);
1235 REGEX_ASSERT(fields[0]=="Now");
1236 REGEX_ASSERT(fields[1]=="is");
1237 REGEX_ASSERT(fields[2]=="the");
1238 REGEX_ASSERT(fields[3]=="time");
1239 REGEX_ASSERT(fields[4]=="");
1240
1241 n = pat1->split("Now is the time", fields, 2, status);
1242 REGEX_CHECK_STATUS;
1243 REGEX_ASSERT(n==2);
1244 REGEX_ASSERT(fields[0]=="Now");
1245 REGEX_ASSERT(fields[1]=="is the time");
1246 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
1247
1248 fields[1] = "*";
1249 status = U_ZERO_ERROR;
1250 n = pat1->split("Now is the time", fields, 1, status);
1251 REGEX_CHECK_STATUS;
1252 REGEX_ASSERT(n==1);
1253 REGEX_ASSERT(fields[0]=="Now is the time");
1254 REGEX_ASSERT(fields[1]=="*");
1255 status = U_ZERO_ERROR;
1256
1257 n = pat1->split(" Now is the time ", fields, 10, status);
1258 REGEX_CHECK_STATUS;
1259 REGEX_ASSERT(n==5);
1260 REGEX_ASSERT(fields[0]=="");
1261 REGEX_ASSERT(fields[1]=="Now");
1262 REGEX_ASSERT(fields[2]=="is");
1263 REGEX_ASSERT(fields[3]=="the");
1264 REGEX_ASSERT(fields[4]=="time");
1265 REGEX_ASSERT(fields[5]=="");
1266
1267 n = pat1->split(" ", fields, 10, status);
1268 REGEX_CHECK_STATUS;
1269 REGEX_ASSERT(n==1);
1270 REGEX_ASSERT(fields[0]=="");
1271
1272 fields[0] = "foo";
1273 n = pat1->split("", fields, 10, status);
1274 REGEX_CHECK_STATUS;
1275 REGEX_ASSERT(n==0);
1276 REGEX_ASSERT(fields[0]=="foo");
1277
1278 delete pat1;
1279
1280 // split, with a pattern with (capture)
46f4442e 1281 pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status);
b75a7d8f
A
1282 REGEX_CHECK_STATUS;
1283
1284 status = U_ZERO_ERROR;
1285 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1286 REGEX_CHECK_STATUS;
1287 REGEX_ASSERT(n==6);
1288 REGEX_ASSERT(fields[0]=="");
1289 REGEX_ASSERT(fields[1]=="a");
1290 REGEX_ASSERT(fields[2]=="Now is ");
1291 REGEX_ASSERT(fields[3]=="b");
1292 REGEX_ASSERT(fields[4]=="the time");
1293 REGEX_ASSERT(fields[5]=="c");
1294 REGEX_ASSERT(fields[6]=="");
1295 REGEX_ASSERT(status==U_ZERO_ERROR);
1296
1297 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
1298 REGEX_CHECK_STATUS;
1299 REGEX_ASSERT(n==6);
1300 REGEX_ASSERT(fields[0]==" ");
1301 REGEX_ASSERT(fields[1]=="a");
1302 REGEX_ASSERT(fields[2]=="Now is ");
1303 REGEX_ASSERT(fields[3]=="b");
1304 REGEX_ASSERT(fields[4]=="the time");
1305 REGEX_ASSERT(fields[5]=="c");
1306 REGEX_ASSERT(fields[6]=="");
1307
1308 status = U_ZERO_ERROR;
1309 fields[6] = "foo";
1310 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);
1311 REGEX_CHECK_STATUS;
1312 REGEX_ASSERT(n==6);
1313 REGEX_ASSERT(fields[0]==" ");
1314 REGEX_ASSERT(fields[1]=="a");
1315 REGEX_ASSERT(fields[2]=="Now is ");
1316 REGEX_ASSERT(fields[3]=="b");
1317 REGEX_ASSERT(fields[4]=="the time");
1318 REGEX_ASSERT(fields[5]=="c");
1319 REGEX_ASSERT(fields[6]=="foo");
1320
1321 status = U_ZERO_ERROR;
1322 fields[5] = "foo";
1323 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
1324 REGEX_CHECK_STATUS;
1325 REGEX_ASSERT(n==5);
1326 REGEX_ASSERT(fields[0]==" ");
1327 REGEX_ASSERT(fields[1]=="a");
1328 REGEX_ASSERT(fields[2]=="Now is ");
1329 REGEX_ASSERT(fields[3]=="b");
1330 REGEX_ASSERT(fields[4]=="the time<c>");
1331 REGEX_ASSERT(fields[5]=="foo");
1332
1333 status = U_ZERO_ERROR;
1334 fields[5] = "foo";
1335 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
1336 REGEX_CHECK_STATUS;
1337 REGEX_ASSERT(n==5);
1338 REGEX_ASSERT(fields[0]==" ");
1339 REGEX_ASSERT(fields[1]=="a");
1340 REGEX_ASSERT(fields[2]=="Now is ");
1341 REGEX_ASSERT(fields[3]=="b");
1342 REGEX_ASSERT(fields[4]=="the time");
1343 REGEX_ASSERT(fields[5]=="foo");
1344
1345 status = U_ZERO_ERROR;
1346 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
1347 REGEX_CHECK_STATUS;
1348 REGEX_ASSERT(n==4);
1349 REGEX_ASSERT(fields[0]==" ");
1350 REGEX_ASSERT(fields[1]=="a");
1351 REGEX_ASSERT(fields[2]=="Now is ");
1352 REGEX_ASSERT(fields[3]=="the time<c>");
1353 status = U_ZERO_ERROR;
1354 delete pat1;
1355
1356 pat1 = RegexPattern::compile("([-,])", pe, status);
1357 REGEX_CHECK_STATUS;
1358 n = pat1->split("1-10,20", fields, 10, status);
1359 REGEX_CHECK_STATUS;
1360 REGEX_ASSERT(n==5);
1361 REGEX_ASSERT(fields[0]=="1");
1362 REGEX_ASSERT(fields[1]=="-");
1363 REGEX_ASSERT(fields[2]=="10");
1364 REGEX_ASSERT(fields[3]==",");
1365 REGEX_ASSERT(fields[4]=="20");
1366 delete pat1;
1367
1368
1369 //
1370 // RegexPattern::pattern()
1371 //
1372 pat1 = new RegexPattern();
1373 REGEX_ASSERT(pat1->pattern() == "");
1374 delete pat1;
1375
1376 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1377 REGEX_CHECK_STATUS;
1378 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1379 delete pat1;
1380
1381
1382 //
1383 // classID functions
1384 //
1385 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1386 REGEX_CHECK_STATUS;
1387 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1388 REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
374ca955
A
1389 UnicodeString Hello("Hello, world.");
1390 RegexMatcher *m = pat1->matcher(Hello, status);
b75a7d8f
A
1391 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1392 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1393 REGEX_ASSERT(m->getDynamicClassID() != NULL);
1394 delete m;
1395 delete pat1;
1396
1397}
1398
1399//---------------------------------------------------------------------------
1400//
1401// Extended A more thorough check for features of regex patterns
1402// The test cases are in a separate data file,
1403 // source/test/testdata/regextst.txt
1404// A description of the test data format is included in that file.
1405//
1406//---------------------------------------------------------------------------
374ca955
A
1407
1408const char *
1409RegexTest::getPath(char buffer[2048], const char *filename) {
1410 UErrorCode status=U_ZERO_ERROR;
1411 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1412 if (U_FAILURE(status)) {
1413 errln("ERROR: loadTestData() failed - %s", u_errorName(status));
1414 return NULL;
1415 }
1416
1417 strcpy(buffer, testDataDirectory);
1418 strcat(buffer, filename);
1419 return buffer;
1420}
1421
b75a7d8f 1422void RegexTest::Extended() {
374ca955
A
1423 char tdd[2048];
1424 const char *srcPath;
b75a7d8f
A
1425 UErrorCode status = U_ZERO_ERROR;
1426 int32_t lineNum = 0;
1427
1428 //
1429 // Open and read the test data file.
1430 //
374ca955
A
1431 srcPath=getPath(tdd, "regextst.txt");
1432 if(srcPath==NULL) {
1433 return; /* something went wrong, error already output */
b75a7d8f
A
1434 }
1435
46f4442e
A
1436 int32_t len;
1437 UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
374ca955
A
1438 if (U_FAILURE(status)) {
1439 return; /* something went wrong, error already output */
1440 }
b75a7d8f
A
1441
1442 //
1443 // Put the test data into a UnicodeString
1444 //
1445 UnicodeString testString(FALSE, testData, len);
1446
46f4442e
A
1447 RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
1448 RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
1449 RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
b75a7d8f 1450
46f4442e 1451 RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
b75a7d8f
A
1452 UnicodeString testPattern; // The pattern for test from the test file.
1453 UnicodeString testFlags; // the flags for a test.
1454 UnicodeString matchString; // The marked up string to be used as input
1455
73c04bcf
A
1456 if (U_FAILURE(status)){
1457 dataerrln("Construct RegexMatcher() error.");
1458 delete [] testData;
1459 return;
1460 }
b75a7d8f
A
1461
1462 //
1463 // Loop over the test data file, once per line.
1464 //
1465 while (lineMat.find()) {
1466 lineNum++;
1467 if (U_FAILURE(status)) {
1468 errln("line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
1469 }
1470
1471 status = U_ZERO_ERROR;
1472 UnicodeString testLine = lineMat.group(1, status);
1473 if (testLine.length() == 0) {
1474 continue;
1475 }
1476
1477 //
1478 // Parse the test line. Skip blank and comment only lines.
1479 // Separate out the three main fields - pattern, flags, target.
1480 //
1481
1482 commentMat.reset(testLine);
1483 if (commentMat.lookingAt(status)) {
1484 // This line is a comment, or blank.
1485 continue;
1486 }
1487
1488 //
1489 // Pull out the pattern field, remove it from the test file line.
1490 //
1491 quotedStuffMat.reset(testLine);
1492 if (quotedStuffMat.lookingAt(status)) {
1493 testPattern = quotedStuffMat.group(2, status);
1494 testLine.remove(0, quotedStuffMat.end(0, status));
1495 } else {
1496 errln("Bad pattern (missing quotes?) at test file line %d", lineNum);
1497 continue;
1498 }
1499
1500
1501 //
1502 // Pull out the flags from the test file line.
1503 //
1504 flagsMat.reset(testLine);
1505 flagsMat.lookingAt(status); // Will always match, possibly an empty string.
1506 testFlags = flagsMat.group(1, status);
1507 if (flagsMat.group(2, status).length() > 0) {
1508 errln("Bad Match flag at line %d. Scanning %c\n",
1509 lineNum, flagsMat.group(2, status).charAt(0));
1510 continue;
1511 }
1512 testLine.remove(0, flagsMat.end(0, status));
1513
1514 //
1515 // Pull out the match string, as a whole.
1516 // We'll process the <tags> later.
1517 //
1518 quotedStuffMat.reset(testLine);
1519 if (quotedStuffMat.lookingAt(status)) {
1520 matchString = quotedStuffMat.group(2, status);
1521 testLine.remove(0, quotedStuffMat.end(0, status));
1522 } else {
1523 errln("Bad match string at test file line %d", lineNum);
1524 continue;
1525 }
1526
1527 //
1528 // The only thing left from the input line should be an optional trailing comment.
1529 //
1530 commentMat.reset(testLine);
1531 if (commentMat.lookingAt(status) == FALSE) {
1532 errln("Line %d: unexpected characters at end of test line.", lineNum);
1533 continue;
1534 }
1535
1536 //
1537 // Run the test
1538 //
1539 regex_find(testPattern, testFlags, matchString, lineNum);
1540 }
1541
1542 delete [] testData;
1543
1544}
1545
1546
1547
46f4442e
A
1548//---------------------------------------------------------------------------
1549//
1550// regex_find(pattern, flags, inputString, lineNumber)
1551//
1552// Function to run a single test from the Extended (data driven) tests.
1553// See file test/testdata/regextst.txt for a description of the
1554// pattern and inputString fields, and the allowed flags.
1555// lineNumber is the source line in regextst.txt of the test.
1556//
1557//---------------------------------------------------------------------------
1558
1559
1560// Set a value into a UVector at position specified by a decimal number in
1561// a UnicodeString. This is a utility function needed by the actual test function,
1562// which follows.
1563static void set(UVector &vec, int32_t val, UnicodeString index) {
1564 UErrorCode status=U_ZERO_ERROR;
1565 int32_t idx = 0;
1566 for (int32_t i=0; i<index.length(); i++) {
1567 int32_t d=u_charDigitValue(index.charAt(i));
1568 if (d<0) {return;}
1569 idx = idx*10 + d;
1570 }
1571 while (vec.size()<idx+1) {vec.addElement(-1, status);}
1572 vec.setElementAt(val, idx);
1573}
1574
1575void RegexTest::regex_find(const UnicodeString &pattern,
1576 const UnicodeString &flags,
1577 const UnicodeString &inputString,
1578 int32_t line) {
1579 UnicodeString unEscapedInput;
1580 UnicodeString deTaggedInput;
1581
1582 UErrorCode status = U_ZERO_ERROR;
1583 UParseError pe;
1584 RegexPattern *parsePat = NULL;
1585 RegexMatcher *parseMatcher = NULL;
1586 RegexPattern *callerPattern = NULL;
1587 RegexMatcher *matcher = NULL;
1588 UVector groupStarts(status);
1589 UVector groupEnds(status);
1590 UBool isMatch = FALSE;
1591 UBool failed = FALSE;
1592 int32_t numFinds;
1593 int32_t i;
1594 UBool useMatchesFunc = FALSE;
1595 UBool useLookingAtFunc = FALSE;
1596 int32_t regionStart = -1;
1597 int32_t regionEnd = -1;
1598
1599 //
1600 // Compile the caller's pattern
1601 //
1602 uint32_t bflags = 0;
1603 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag
1604 bflags |= UREGEX_CASE_INSENSITIVE;
1605 }
1606 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag
1607 bflags |= UREGEX_COMMENTS;
1608 }
1609 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag
1610 bflags |= UREGEX_DOTALL;
1611 }
1612 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag
1613 bflags |= UREGEX_MULTILINE;
1614 }
1615
1616 if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
1617 bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
1618 }
1619 if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
1620 bflags |= UREGEX_UNIX_LINES;
1621 }
1622
1623
1624 callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
1625 if (status != U_ZERO_ERROR) {
1626 #if UCONFIG_NO_BREAK_ITERATION==1
1627 // 'v' test flag means that the test pattern should not compile if ICU was configured
1628 // to not include break iteration. RBBI is needed for Unicode word boundaries.
1629 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
1630 goto cleanupAndReturn;
1631 }
1632 #endif
1633 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
1634 // Expected pattern compilation error.
1635 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
1636 logln("Pattern Compile returns \"%s\"", u_errorName(status));
1637 }
1638 goto cleanupAndReturn;
1639 } else {
1640 // Unexpected pattern compilation error.
1641 errln("Line %d: error %s compiling pattern.", line, u_errorName(status));
1642 goto cleanupAndReturn;
1643 }
1644 }
1645
1646 if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag
1647 RegexPatternDump(callerPattern);
1648 }
1649
1650 if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag
1651 errln("Expected, but did not get, a pattern compilation error.");
1652 goto cleanupAndReturn;
1653 }
1654
1655
1656 //
1657 // Number of times find() should be called on the test string, default to 1
1658 //
1659 numFinds = 1;
1660 for (i=2; i<=9; i++) {
1661 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag
1662 if (numFinds != 1) {
1663 errln("Line %d: more than one digit flag. Scanning %d.", line, i);
1664 goto cleanupAndReturn;
1665 }
1666 numFinds = i;
1667 }
1668 }
1669
1670 // 'M' flag. Use matches() instead of find()
1671 if (flags.indexOf((UChar)0x4d) >= 0) {
1672 useMatchesFunc = TRUE;
1673 }
1674 if (flags.indexOf((UChar)0x4c) >= 0) {
1675 useLookingAtFunc = TRUE;
1676 }
1677
1678 //
1679 // Find the tags in the input data, remove them, and record the group boundary
1680 // positions.
1681 //
1682 parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
1683 REGEX_CHECK_STATUS_L(line);
1684
1685 unEscapedInput = inputString.unescape();
1686 parseMatcher = parsePat->matcher(unEscapedInput, status);
1687 REGEX_CHECK_STATUS_L(line);
1688 while(parseMatcher->find()) {
1689 parseMatcher->appendReplacement(deTaggedInput, "", status);
1690 REGEX_CHECK_STATUS;
1691 UnicodeString groupNum = parseMatcher->group(2, status);
1692 if (groupNum == "r") {
1693 // <r> or </r>, a region specification within the string
1694 if (parseMatcher->group(1, status) == "/") {
1695 regionEnd = deTaggedInput.length();
1696 } else {
1697 regionStart = deTaggedInput.length();
1698 }
1699 } else {
1700 // <digits> or </digits>, a group match boundary tag.
1701 if (parseMatcher->group(1, status) == "/") {
1702 set(groupEnds, deTaggedInput.length(), groupNum);
1703 } else {
1704 set(groupStarts, deTaggedInput.length(), groupNum);
1705 }
1706 }
1707 }
1708 parseMatcher->appendTail(deTaggedInput);
1709 REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
1710 if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
1711 errln("mismatched <r> tags");
1712 failed = TRUE;
1713 goto cleanupAndReturn;
1714 }
1715
1716
1717 //
1718 // Configure the matcher according to the flags specified with this test.
1719 //
1720 matcher = callerPattern->matcher(deTaggedInput, status);
1721 REGEX_CHECK_STATUS_L(line);
1722 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
1723 matcher->setTrace(TRUE);
1724 }
1725 if (regionStart>=0) {
1726 matcher->region(regionStart, regionEnd, status);
1727 REGEX_CHECK_STATUS_L(line);
1728 }
1729 if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag
1730 matcher->useAnchoringBounds(FALSE);
1731 }
1732 if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag
1733 matcher->useTransparentBounds(TRUE);
1734 }
1735
1736
1737
1738 //
1739 // Do a find on the de-tagged input using the caller's pattern
1740 // TODO: error on count>1 and not find().
1741 // error on both matches() and lookingAt().
1742 //
1743 for (i=0; i<numFinds; i++) {
1744 if (useMatchesFunc) {
1745 isMatch = matcher->matches(status);
1746 } else if (useLookingAtFunc) {
1747 isMatch = matcher->lookingAt(status);
1748 } else {
1749 isMatch = matcher->find();
1750 }
1751 }
1752 matcher->setTrace(FALSE);
1753
1754 //
1755 // Match up the groups from the find() with the groups from the tags
1756 //
1757
1758 // number of tags should match number of groups from find operation.
1759 // matcher->groupCount does not include group 0, the entire match, hence the +1.
1760 // G option in test means that capture group data is not available in the
1761 // expected results, so the check needs to be suppressed.
1762 if (isMatch == FALSE && groupStarts.size() != 0) {
1763 errln("Error at line %d: Match expected, but none found.\n", line);
1764 failed = TRUE;
1765 goto cleanupAndReturn;
1766 }
1767
1768 if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
1769 // Only check for match / no match. Don't check capture groups.
1770 if (isMatch && groupStarts.size() == 0) {
1771 errln("Error at line %d: No match expected, but one found.\n", line);
1772 failed = TRUE;
1773 }
1774 goto cleanupAndReturn;
1775 }
1776
1777 for (i=0; i<=matcher->groupCount(); i++) {
1778 int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
1779 if (matcher->start(i, status) != expectedStart) {
1780 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
1781 line, i, expectedStart, matcher->start(i, status));
1782 failed = TRUE;
1783 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
1784 }
1785 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
1786 if (matcher->end(i, status) != expectedEnd) {
1787 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
1788 line, i, expectedEnd, matcher->end(i, status));
1789 failed = TRUE;
1790 // Error on end position; keep going; real error is probably yet to come as group
1791 // end positions work from end of the input data towards the front.
1792 }
1793 }
1794 if ( matcher->groupCount()+1 < groupStarts.size()) {
1795 errln("Error at line %d: Expected %d capture groups, found %d.",
1796 line, groupStarts.size()-1, matcher->groupCount());
1797 failed = TRUE;
1798 }
1799
1800 if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
1801 matcher->requireEnd() == TRUE) {
1802 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line);
1803 failed = TRUE;
1804 }
1805 if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true
1806 matcher->requireEnd() == FALSE) {
1807 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line);
1808 failed = TRUE;
1809 }
1810 if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
1811 matcher->hitEnd() == TRUE) {
1812 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line);
1813 failed = TRUE;
1814 }
1815 if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
1816 matcher->hitEnd() == FALSE) {
1817 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line);
1818 failed = TRUE;
1819 }
1820
1821
1822cleanupAndReturn:
1823 if (failed) {
1824 errln((UnicodeString)"\""+pattern+(UnicodeString)"\" "
1825 +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");
1826 // callerPattern->dump();
1827 }
1828 delete parseMatcher;
1829 delete parsePat;
1830 delete matcher;
1831 delete callerPattern;
1832}
1833
1834
1835
1836
b75a7d8f
A
1837//---------------------------------------------------------------------------
1838//
1839// Errors Check for error handling in patterns.
1840//
1841//---------------------------------------------------------------------------
1842void RegexTest::Errors() {
1843 // \escape sequences that aren't implemented yet.
1844 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
1845
1846 // Missing close parentheses
1847 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
1848 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
1849 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
1850
1851 // Extra close paren
1852 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
1853 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
1854 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
1855
1856 // Look-ahead, Look-behind
1857 // TODO: add tests for unbounded length look-behinds.
1858 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
1859
374ca955 1860 // Attempt to use non-default flags
b75a7d8f
A
1861 {
1862 UParseError pe;
1863 UErrorCode status = U_ZERO_ERROR;
1864 int32_t flags = UREGEX_CANON_EQ |
1865 UREGEX_COMMENTS | UREGEX_DOTALL |
1866 UREGEX_MULTILINE;
1867 RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
1868 REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
1869 delete pat1;
1870 }
1871
1872
1873 // Quantifiers are allowed only after something that can be quantified.
1874 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
1875 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
1876 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
1877
1878 // Mal-formed {min,max} quantifiers
1879 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
1880 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
1881 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
1882 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
1883 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
1884 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
374ca955
A
1885 REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan
1886 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format
1887 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
1888
46f4442e
A
1889 // Ticket 5389
1890 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
b75a7d8f
A
1891
1892}
1893
1894
1895//-------------------------------------------------------------------------------
1896//
1897// Read a text data file, convert it to UChars, and return the data
1898// in one big UChar * buffer, which the caller must delete.
1899//
1900//--------------------------------------------------------------------------------
46f4442e
A
1901UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
1902 const char *defEncoding, UErrorCode &status) {
b75a7d8f
A
1903 UChar *retPtr = NULL;
1904 char *fileBuf = NULL;
1905 UConverter* conv = NULL;
1906 FILE *f = NULL;
374ca955 1907
b75a7d8f
A
1908 ulen = 0;
1909 if (U_FAILURE(status)) {
1910 return retPtr;
1911 }
374ca955 1912
b75a7d8f
A
1913 //
1914 // Open the file.
1915 //
1916 f = fopen(fileName, "rb");
1917 if (f == 0) {
46f4442e 1918 dataerrln("[DATA] Error opening test data file %s\n", fileName);
374ca955
A
1919 status = U_FILE_ACCESS_ERROR;
1920 return NULL;
b75a7d8f
A
1921 }
1922 //
1923 // Read it in
1924 //
46f4442e
A
1925 int32_t fileSize;
1926 int32_t amt_read;
374ca955 1927
b75a7d8f
A
1928 fseek( f, 0, SEEK_END);
1929 fileSize = ftell(f);
1930 fileBuf = new char[fileSize];
1931 fseek(f, 0, SEEK_SET);
1932 amt_read = fread(fileBuf, 1, fileSize, f);
1933 if (amt_read != fileSize || fileSize <= 0) {
1934 errln("Error reading test data file.");
1935 goto cleanUpAndReturn;
1936 }
374ca955 1937
b75a7d8f
A
1938 //
1939 // Look for a Unicode Signature (BOM) on the data just read
1940 //
1941 int32_t signatureLength;
1942 const char * fileBufC;
1943 const char* encoding;
374ca955 1944
b75a7d8f
A
1945 fileBufC = fileBuf;
1946 encoding = ucnv_detectUnicodeSignature(
1947 fileBuf, fileSize, &signatureLength, &status);
1948 if(encoding!=NULL ){
1949 fileBufC += signatureLength;
1950 fileSize -= signatureLength;
46f4442e
A
1951 } else {
1952 encoding = defEncoding;
1953 if (strcmp(encoding, "utf-8") == 0) {
1954 errln("file %s is missing its BOM", fileName);
1955 }
b75a7d8f 1956 }
374ca955 1957
b75a7d8f
A
1958 //
1959 // Open a converter to take the rule file to UTF-16
1960 //
1961 conv = ucnv_open(encoding, &status);
1962 if (U_FAILURE(status)) {
1963 goto cleanUpAndReturn;
1964 }
374ca955 1965
b75a7d8f
A
1966 //
1967 // Convert the rules to UChar.
1968 // Preflight first to determine required buffer size.
1969 //
1970 ulen = ucnv_toUChars(conv,
1971 NULL, // dest,
1972 0, // destCapacity,
1973 fileBufC,
1974 fileSize,
1975 &status);
1976 if (status == U_BUFFER_OVERFLOW_ERROR) {
1977 // Buffer Overflow is expected from the preflight operation.
1978 status = U_ZERO_ERROR;
374ca955 1979
b75a7d8f
A
1980 retPtr = new UChar[ulen+1];
1981 ucnv_toUChars(conv,
1982 retPtr, // dest,
1983 ulen+1,
1984 fileBufC,
1985 fileSize,
1986 &status);
1987 }
1988
1989cleanUpAndReturn:
1990 fclose(f);
1991 delete[] fileBuf;
1992 ucnv_close(conv);
1993 if (U_FAILURE(status)) {
1994 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1995 delete retPtr;
1996 retPtr = 0;
1997 ulen = 0;
1998 };
1999 return retPtr;
2000}
2001
2002
2003//-------------------------------------------------------------------------------
2004//
2005// PerlTests - Run Perl's regular expression tests
2006// The input file for this test is re_tests, the standard regular
2007// expression test data distributed with the Perl source code.
2008//
2009// Here is Perl's description of the test data file:
2010//
2011// # The tests are in a separate file 't/op/re_tests'.
2012// # Each line in that file is a separate test.
2013// # There are five columns, separated by tabs.
2014// #
2015// # Column 1 contains the pattern, optionally enclosed in C<''>.
2016// # Modifiers can be put after the closing C<'>.
2017// #
2018// # Column 2 contains the string to be matched.
2019// #
2020// # Column 3 contains the expected result:
374ca955
A
2021// # y expect a match
2022// # n expect no match
2023// # c expect an error
2024// # B test exposes a known bug in Perl, should be skipped
2025// # b test exposes a known bug in Perl, should be skipped if noamp
b75a7d8f
A
2026// #
2027// # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
2028// #
2029// # Column 4 contains a string, usually C<$&>.
2030// #
2031// # Column 5 contains the expected result of double-quote
2032// # interpolating that string after the match, or start of error message.
2033// #
2034// # Column 6, if present, contains a reason why the test is skipped.
2035// # This is printed with "skipped", for harness to pick up.
2036// #
2037// # \n in the tests are interpolated, as are variables of the form ${\w+}.
2038// #
2039// # If you want to add a regular expression test that can't be expressed
2040// # in this format, don't add it here: put it in op/pat.t instead.
2041//
2042// For ICU, if field 3 contains an 'i', the test will be skipped.
2043 // The test exposes some known incompatibility between ICU and Perl regexps.
2044// (The i is in addition to whatever was there before.)
2045//
2046//-------------------------------------------------------------------------------
2047void RegexTest::PerlTests() {
374ca955
A
2048 char tdd[2048];
2049 const char *srcPath;
b75a7d8f
A
2050 UErrorCode status = U_ZERO_ERROR;
2051 UParseError pe;
2052
2053 //
2054 // Open and read the test data file.
2055 //
374ca955
A
2056 srcPath=getPath(tdd, "re_tests.txt");
2057 if(srcPath==NULL) {
2058 return; /* something went wrong, error already output */
b75a7d8f
A
2059 }
2060
46f4442e
A
2061 int32_t len;
2062 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
374ca955
A
2063 if (U_FAILURE(status)) {
2064 return; /* something went wrong, error already output */
2065 }
b75a7d8f
A
2066
2067 //
2068 // Put the test data into a UnicodeString
2069 //
2070 UnicodeString testDataString(FALSE, testData, len);
2071
2072 //
2073 // Regex to break the input file into lines, and strip the new lines.
2074 // One line per match, capture group one is the desired data.
2075 //
46f4442e 2076 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
73c04bcf
A
2077 if (U_FAILURE(status)) {
2078 dataerrln("RegexPattern::compile() error");
2079 return;
2080 }
b75a7d8f
A
2081 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
2082
2083 //
2084 // Regex to split a test file line into fields.
2085 // There are six fields, separated by tabs.
2086 //
46f4442e 2087 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
b75a7d8f
A
2088
2089 //
2090 // Regex to identify test patterns with flag settings, and to separate them.
2091 // Test patterns with flags look like 'pattern'i
2092 // Test patterns without flags are not quoted: pattern
2093 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
2094 //
46f4442e 2095 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
374ca955 2096 RegexMatcher* flagMat = flagPat->matcher(status);
b75a7d8f
A
2097
2098 //
2099 // The Perl tests reference several perl-isms, which are evaluated/substituted
2100 // in the test data. Not being perl, this must be done explicitly. Here
2101 // are string constants and REs for these constructs.
2102 //
2103 UnicodeString nulnulSrc("${nulnul}");
46f4442e 2104 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
b75a7d8f
A
2105 nulnul = nulnul.unescape();
2106
2107 UnicodeString ffffSrc("${ffff}");
46f4442e 2108 UnicodeString ffff("\\uffff", -1, US_INV);
b75a7d8f
A
2109 ffff = ffff.unescape();
2110
2111 // regexp for $-[0], $+[2], etc.
46f4442e 2112 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
374ca955
A
2113 RegexMatcher *groupsMat = groupsPat->matcher(status);
2114
b75a7d8f 2115 // regexp for $0, $1, $2, etc.
46f4442e 2116 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
374ca955 2117 RegexMatcher *cgMat = cgPat->matcher(status);
b75a7d8f
A
2118
2119
2120 //
2121 // Main Loop for the Perl Tests, runs once per line from the
2122 // test data file.
2123 //
2124 int32_t lineNum = 0;
2125 int32_t skippedUnimplementedCount = 0;
2126 while (lineMat->find()) {
2127 lineNum++;
2128
2129 //
2130 // Get a line, break it into its fields, do the Perl
2131 // variable substitutions.
2132 //
2133 UnicodeString line = lineMat->group(1, status);
2134 UnicodeString fields[7];
2135 fieldPat->split(line, fields, 7, status);
2136
2137 flagMat->reset(fields[0]);
2138 flagMat->matches(status);
2139 UnicodeString pattern = flagMat->group(2, status);
2140 pattern.findAndReplace("${bang}", "!");
46f4442e 2141 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
b75a7d8f
A
2142 pattern.findAndReplace(ffffSrc, ffff);
2143
2144 //
2145 // Identify patterns that include match flag settings,
2146 // split off the flags, remove the extra quotes.
2147 //
2148 UnicodeString flagStr = flagMat->group(3, status);
2149 if (U_FAILURE(status)) {
2150 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
2151 return;
2152 }
2153 int32_t flags = 0;
2154 const UChar UChar_c = 0x63; // Char constants for the flag letters.
2155 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
2156 const UChar UChar_m = 0x6d;
2157 const UChar UChar_x = 0x78;
2158 const UChar UChar_y = 0x79;
2159 if (flagStr.indexOf(UChar_i) != -1) {
2160 flags |= UREGEX_CASE_INSENSITIVE;
2161 }
2162 if (flagStr.indexOf(UChar_m) != -1) {
2163 flags |= UREGEX_MULTILINE;
2164 }
2165 if (flagStr.indexOf(UChar_x) != -1) {
2166 flags |= UREGEX_COMMENTS;
2167 }
2168
2169 //
2170 // Compile the test pattern.
2171 //
2172 status = U_ZERO_ERROR;
2173 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
2174 if (status == U_REGEX_UNIMPLEMENTED) {
2175 //
2176 // Test of a feature that is planned for ICU, but not yet implemented.
2177 // skip the test.
2178 skippedUnimplementedCount++;
2179 delete testPat;
2180 status = U_ZERO_ERROR;
2181 continue;
2182 }
2183
2184 if (U_FAILURE(status)) {
2185 // Some tests are supposed to generate errors.
2186 // Only report an error for tests that are supposed to succeed.
2187 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
2188 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
2189 {
2190 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
2191 }
2192 status = U_ZERO_ERROR;
2193 delete testPat;
2194 continue;
2195 }
2196
2197 if (fields[2].indexOf(UChar_i) >= 0) {
2198 // ICU should skip this test.
2199 delete testPat;
2200 continue;
2201 }
2202
2203 if (fields[2].indexOf(UChar_c) >= 0) {
2204 // This pattern should have caused a compilation error, but didn't/
2205 errln("line %d: Expected a pattern compile error, got success.", lineNum);
2206 delete testPat;
2207 continue;
2208 }
2209
2210 //
2211 // replace the Perl variables that appear in some of the
374ca955 2212 // match data strings.
b75a7d8f
A
2213 //
2214 UnicodeString matchString = fields[1];
2215 matchString.findAndReplace(nulnulSrc, nulnul);
2216 matchString.findAndReplace(ffffSrc, ffff);
2217
2218 // Replace any \n in the match string with an actual new-line char.
2219 // Don't do full unescape, as this unescapes more than Perl does, which
2220 // causes other spurious failures in the tests.
46f4442e 2221 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
374ca955 2222
b75a7d8f
A
2223
2224
2225 //
2226 // Run the test, check for expected match/don't match result.
2227 //
2228 RegexMatcher *testMat = testPat->matcher(matchString, status);
2229 UBool found = testMat->find();
2230 UBool expected = FALSE;
2231 if (fields[2].indexOf(UChar_y) >=0) {
2232 expected = TRUE;
2233 }
2234 if (expected != found) {
374ca955 2235 errln("line %d: Expected %smatch, got %smatch",
b75a7d8f
A
2236 lineNum, expected?"":"no ", found?"":"no " );
2237 continue;
2238 }
46f4442e
A
2239
2240 // Don't try to check expected results if there is no match.
2241 // (Some have stuff in the expected fields)
2242 if (!found) {
2243 delete testMat;
2244 delete testPat;
2245 continue;
2246 }
b75a7d8f
A
2247
2248 //
2249 // Interpret the Perl expression from the fourth field of the data file,
2250 // building up an ICU string from the results of the ICU match.
374ca955 2251 // The Perl expression will contain references to the results of
b75a7d8f
A
2252 // a regex match, including the matched string, capture group strings,
2253 // group starting and ending indicies, etc.
2254 //
2255 UnicodeString resultString;
2256 UnicodeString perlExpr = fields[3];
2257 groupsMat->reset(perlExpr);
2258 cgMat->reset(perlExpr);
2259
2260 while (perlExpr.length() > 0) {
2261 if (perlExpr.startsWith("$&")) {
2262 resultString.append(testMat->group(status));
2263 perlExpr.remove(0, 2);
2264 }
2265
2266 else if (groupsMat->lookingAt(status)) {
2267 // $-[0] $+[2] etc.
2268 UnicodeString digitString = groupsMat->group(2, status);
2269 int32_t t = 0;
2270 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
2271 UnicodeString plusOrMinus = groupsMat->group(1, status);
2272 int32_t matchPosition;
2273 if (plusOrMinus.compare("+") == 0) {
2274 matchPosition = testMat->end(groupNum, status);
2275 } else {
2276 matchPosition = testMat->start(groupNum, status);
2277 }
2278 if (matchPosition != -1) {
2279 ICU_Utility::appendNumber(resultString, matchPosition);
2280 }
2281 perlExpr.remove(0, groupsMat->end(status));
2282 }
2283
2284 else if (cgMat->lookingAt(status)) {
2285 // $1, $2, $3, etc.
2286 UnicodeString digitString = cgMat->group(1, status);
2287 int32_t t = 0;
2288 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
2289 if (U_SUCCESS(status)) {
2290 resultString.append(testMat->group(groupNum, status));
2291 status = U_ZERO_ERROR;
2292 }
2293 perlExpr.remove(0, cgMat->end(status));
2294 }
2295
2296 else if (perlExpr.startsWith("@-")) {
46f4442e 2297 int32_t i;
b75a7d8f
A
2298 for (i=0; i<=testMat->groupCount(); i++) {
2299 if (i>0) {
2300 resultString.append(" ");
2301 }
2302 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
2303 }
2304 perlExpr.remove(0, 2);
2305 }
2306
2307 else if (perlExpr.startsWith("@+")) {
46f4442e 2308 int32_t i;
b75a7d8f
A
2309 for (i=0; i<=testMat->groupCount(); i++) {
2310 if (i>0) {
2311 resultString.append(" ");
2312 }
2313 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
2314 }
2315 perlExpr.remove(0, 2);
2316 }
2317
46f4442e 2318 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
b75a7d8f
A
2319 // or as an escaped sequence (e.g. \n)
2320 if (perlExpr.length() > 1) {
2321 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
2322 }
2323 UChar c = perlExpr.charAt(0);
2324 switch (c) {
2325 case 'n': c = '\n'; break;
2326 // add any other escape sequences that show up in the test expected results.
2327 }
374ca955 2328 resultString.append(c);
b75a7d8f
A
2329 perlExpr.remove(0, 1);
2330 }
2331
2332 else {
2333 // Any characters from the perl expression that we don't explicitly
2334 // recognize before here are assumed to be literals and copied
2335 // as-is to the expected results.
2336 resultString.append(perlExpr.charAt(0));
2337 perlExpr.remove(0, 1);
2338 }
2339
2340 if (U_FAILURE(status)) {
2341 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
2342 break;
2343 }
2344 }
374ca955 2345
b75a7d8f
A
2346 //
2347 // Expected Results Compare
2348 //
2349 UnicodeString expectedS(fields[4]);
2350 expectedS.findAndReplace(nulnulSrc, nulnul);
2351 expectedS.findAndReplace(ffffSrc, ffff);
46f4442e 2352 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
b75a7d8f
A
2353
2354
2355 if (expectedS.compare(resultString) != 0) {
73c04bcf
A
2356 err("Line %d: Incorrect perl expression results.", lineNum);
2357 errln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
b75a7d8f
A
2358 }
2359
2360 delete testMat;
2361 delete testPat;
2362 }
2363
2364 //
2365 // All done. Clean up allocated stuff.
2366 //
2367 delete cgMat;
2368 delete cgPat;
374ca955 2369
b75a7d8f
A
2370 delete groupsMat;
2371 delete groupsPat;
374ca955 2372
b75a7d8f
A
2373 delete flagMat;
2374 delete flagPat;
2375
2376 delete lineMat;
2377 delete linePat;
374ca955 2378
b75a7d8f
A
2379 delete fieldPat;
2380 delete [] testData;
374ca955 2381
b75a7d8f
A
2382
2383 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
2384
2385}
2386
2387
46f4442e
A
2388//
2389// Callbacks() Test the callback function.
2390// When set, callbacks occur periodically during matching operations,
2391// giving the application code the ability to abort the operation
2392 // before its normal completion.
2393//
2394
2395struct callBackContext {
2396 RegexTest *test;
2397 int32_t maxCalls;
2398 int32_t numCalls;
2399 int32_t lastSteps;
2400 void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
2401};
2402
2403U_CDECL_BEGIN
2404static UBool U_CALLCONV
2405testCallBackFn(const void *context, int32_t steps) {
2406 callBackContext *info = (callBackContext *)context;
2407 if (info->lastSteps+1 != steps) {
2408 info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps);
2409 }
2410 info->lastSteps = steps;
2411 info->numCalls++;
2412 return (info->numCalls < info->maxCalls);
2413}
2414U_CDECL_END
2415
2416void RegexTest::Callbacks() {
2417 {
2418 // Getter returns NULLs if no callback has been set
2419
2420 // The variables that the getter will fill in.
2421 // Init to non-null values so that the action of the getter can be seen.
2422 const void *returnedContext = &returnedContext;
2423 URegexMatchCallback *returnedFn = &testCallBackFn;
2424
2425 UErrorCode status = U_ZERO_ERROR;
2426 RegexMatcher matcher("x", 0, status);
2427 REGEX_CHECK_STATUS;
2428 matcher.getMatchCallback(returnedFn, returnedContext, status);
2429 REGEX_CHECK_STATUS;
2430 REGEX_ASSERT(returnedFn == NULL);
2431 REGEX_ASSERT(returnedContext == NULL);
2432 }
2433
2434 {
2435 // Set and Get work
2436 callBackContext cbInfo = {this, 0, 0, 0};
2437 const void *returnedContext;
2438 URegexMatchCallback *returnedFn;
2439 UErrorCode status = U_ZERO_ERROR;
2440 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.
2441 REGEX_CHECK_STATUS;
2442 matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
2443 REGEX_CHECK_STATUS;
2444 matcher.getMatchCallback(returnedFn, returnedContext, status);
2445 REGEX_CHECK_STATUS;
2446 REGEX_ASSERT(returnedFn == testCallBackFn);
2447 REGEX_ASSERT(returnedContext == &cbInfo);
2448
2449 // A short-running match shouldn't invoke the callback
2450 status = U_ZERO_ERROR;
2451 cbInfo.reset(1);
2452 UnicodeString s = "xxx";
2453 matcher.reset(s);
2454 REGEX_ASSERT(matcher.matches(status));
2455 REGEX_CHECK_STATUS;
2456 REGEX_ASSERT(cbInfo.numCalls == 0);
2457
2458 // A medium-length match that runs long enough to invoke the
2459 // callback, but not so long that the callback aborts it.
2460 status = U_ZERO_ERROR;
2461 cbInfo.reset(4);
2462 s = "aaaaaaaaaaaaaaaaaaab";
2463 matcher.reset(s);
2464 REGEX_ASSERT(matcher.matches(status)==FALSE);
2465 REGEX_CHECK_STATUS;
2466 REGEX_ASSERT(cbInfo.numCalls > 0);
2467
2468 // A longer running match that the callback function will abort.
2469 status = U_ZERO_ERROR;
2470 cbInfo.reset(4);
2471 s = "aaaaaaaaaaaaaaaaaaaaaaab";
2472 matcher.reset(s);
2473 REGEX_ASSERT(matcher.matches(status)==FALSE);
2474 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
2475 REGEX_ASSERT(cbInfo.numCalls == 4);
2476 }
2477
2478
2479}
b75a7d8f
A
2480
2481#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
2482